NLP (doc2vec from scratch) and Clustering: Categorizing News Articles by Their Text Content
Deep, customized cleaning of news article text, vectorization with NLP (doc2vec), and clustering (Birch) to surface the articles' topics.
In this example, I use NLP (Doc2Vec) and a clustering algorithm to try to group news articles by topic.
This kind of categorization can be done in many ways: with supervised methods (a labeled dataset), with clustering, or with dedicated topic-modeling algorithms such as LDA.
I chose the Doc2Vec algorithm because I consider it a solid text-vectorization technique that is relatively simple to use.
I will work through the steps below.
As usual, the first step is to load the required libraries:
# Import the required libraries
import numpy as np
import pandas as pd
# The dictionary data comes from json files
import json
pd.options.mode.chained_assignment = None
# Reading from disk
from io import StringIO
# Text preprocessing and cleaning
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltkstop = stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
snow = SnowballStemmer(language='english')
# Modelling
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score
import warnings
# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
Next, I read the data and prepare the dictionary files. The data originally comes from public datasets on Kaggle (lists of countries, names, currencies, and so on).
# This is the dataset of articles to process
maindataset = pd.read_csv("articles1.csv")
maindataset2 = pd.read_csv("articles2.csv")
maindataset = pd.concat([maindataset,maindataset2], ignore_index=True)
# This is a list of countries. Country names in the articles will be replaced with "xcountryx"
countries = pd.read_json("countries.json")
countries["country"] = countries["country"].str.lower()
countries = pd.DataFrame(countries["country"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
countries.columns = ['word']
countries["replacement"] = "xcountryx"
# This is a list of provinces. It also contains several alternative names and country fields, which are added to the dictionary as well
provincies = pd.read_csv("countries_provincies.csv")
provincies1 = provincies[["name"]]
provincies1["name"] = provincies1["name"].str.lower()
provincies1 = pd.DataFrame(provincies1["name"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
provincies1.columns = ['word']
provincies1["replacement"] = "xprovincex"
provincies2 = provincies[["name_alt"]]
provincies2["name_alt"] = provincies2["name_alt"].str.lower()
provincies2 = pd.DataFrame(provincies2["name_alt"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
provincies2.columns = ['word']
provincies2["replacement"] = "xprovincex"
provincies3 = provincies[["type_en"]]
provincies3["type_en"] = provincies3["type_en"].str.lower()
provincies3 = pd.DataFrame(provincies3["type_en"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
provincies3.columns = ['word']
provincies3["replacement"] = "xsubdivisionx"
provincies4 = provincies[["admin"]]
provincies4["admin"] = provincies4["admin"].str.lower()
provincies4 = pd.DataFrame(provincies4["admin"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
provincies4.columns = ['word']
provincies4["replacement"] = "xcountryx"
provincies5 = provincies[["geonunit"]]
provincies5["geonunit"] = provincies5["geonunit"].str.lower()
provincies5 = pd.DataFrame(provincies5["geonunit"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
provincies5.columns = ['word']
provincies5["replacement"] = "xcountryx"
provincies6 = provincies[["gn_name"]]
provincies6["gn_name"] = provincies6["gn_name"].str.lower()
provincies6 = pd.DataFrame(provincies6["gn_name"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
provincies6.columns = ['word']
provincies6["replacement"] = "xcountryx"
provincies = pd.concat([provincies1,provincies2,provincies3,provincies4,provincies5,provincies6], axis=0, ignore_index=True)
# List of currencies
currencies = pd.read_json("country-by-currency-name.json")
currencies1 = currencies[["country"]]
currencies1["country"] = currencies1["country"].str.lower()
currencies1 = pd.DataFrame(currencies1["country"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
currencies1.columns = ['word']
currencies1["replacement"] = "xcountryx"
currencies2 = currencies[["currency_name"]]
currencies2["currency_name"] = currencies2["currency_name"].str.lower()
currencies2 = pd.DataFrame(currencies2["currency_name"].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
currencies2.columns = ['word']
currencies2["replacement"] = "xcurrencyx"
currencies = pd.concat([currencies1,currencies2], axis=0, ignore_index=True)
# First names
firstnames = pd.read_csv("interall.csv", header=None)
firstnames = firstnames[firstnames[1] >= 10000]
firstnames = firstnames[[0]]
firstnames[0] = firstnames[0].str.lower()
firstnames = pd.DataFrame(firstnames[0].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
firstnames.columns = ['word']
firstnames["replacement"] = "xfirstnamex"
# Last names
lastnames = pd.read_csv("intersurnames.csv", header=None)
lastnames = lastnames[lastnames[1] >= 10000]
lastnames = lastnames[[0]]
lastnames[0] = lastnames[0].str.lower()
lastnames = pd.DataFrame(lastnames[0].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('_',' ').replace(',',' ').replace(':',' ').split(" ")).explode())
lastnames.columns = ['word']
lastnames["replacement"] = "xlastnamex"
# Temporal terms: dates, days of the week, etc.
temporaldata = pd.read_csv("temporal.csv")
# The full dictionary
dictionary = pd.concat([lastnames,temporaldata,firstnames,currencies,provincies,countries], axis=0, ignore_index=True)
dictionary = dictionary.groupby(["word"]).first().reset_index(drop=False)
dictionary = dictionary.dropna()
This is a preview of the raw dataset:
maindataset

The functions below take care of:
- replacing words using the dictionary built above
- removing punctuation, double spaces, and so on
def replace_words(tt, lookp_dict):
    # Replace every word that has an entry in the lookup dictionary with its tag
    temp = tt.split()
    res = []
    for wrd in temp:
        res.append(lookp_dict.get(wrd, wrd))
    res = ' '.join(res)
    return res

def preprepare(eingang):
    # Lowercase, strip punctuation and special characters, collapse whitespace
    ausgang = eingang.lower()
    ausgang = ausgang.replace(u'\xa0', u' ')
    ausgang = ausgang.replace('\n', ' ').replace('\r', ' ')
    for ch in ['|', 'ï', '»', '¿', '"', "'", '?', '!', ',', ';', '.', '(', ')',
               '{', '}', '[', ']', '~', '@', '#', '$', '%', '^', '&', '*',
               '<', '>', '/', '\\', '`', '+', '=', '_', '-', ':']:
        ausgang = ausgang.replace(ch, ' ')
    ausgang = re.sub('[^a-zA-Z]', ' ', ausgang)  # keep letters only
    ausgang = re.sub(' +', ' ', ausgang)         # collapse repeated spaces
    return ausgang
Clean the dictionary data:
dictionary["word"] = dictionary["word"].apply(lambda x: preprepare(x))
dictionary = dictionary[dictionary["word"] != " "]
dictionary = dictionary[dictionary["word"] != ""]
dictionary = {row['word']: row['replacement'] for index, row in dictionary.iterrows()}
Next, prepare the text to be vectorized: create a new column that concatenates the title (repeated four times) with the article content. This is the text that will be turned into vectors. I do this to give the title more weight than the article body itself.
Then I remove stopwords and replace the dictionary words.
maindataset["NLPtext"] = maindataset["title"] + maindataset["title"] + maindataset["content"] + maindataset["title"] + maindataset["title"]
maindataset["NLPtext"] = maindataset["NLPtext"].str.lower()
maindataset["NLPtext"] = maindataset["NLPtext"].apply(lambda x: preprepare(str(x)))
maindataset["NLPtext"] = maindataset["NLPtext"].apply(lambda x: ' '.join([word for word in x.split() if word not in (nltkstop)]))
maindataset["NLPtext"] = maindataset["NLPtext"].apply(lambda x: replace_words(str(x), dictionary))
The last text-preparation step is stemming. I stem here because I am training the model from scratch.
Whether to stem depends on the model being used. With pretrained models such as BERT, stemming is not recommended, because the stemmed words will no longer match the model's vocabulary.
def steming(sentence):
    words = word_tokenize(sentence)
    stems = [snow.stem(whole) for whole in words]
    oup = ' '.join(stems)
    return oup

maindataset["NLPtext"] = maindataset["NLPtext"].apply(lambda x: steming(x))
maindataset['lentitle'] = maindataset["title"].apply(lambda x: len(str(x).split(' ')))
maindataset['lendesc'] = maindataset["content"].apply(lambda x: len(str(x).split(' ')))
maindataset['lentext'] = maindataset["NLPtext"].apply(lambda x: len(str(x).split(' ')))
maindataset = maindataset[maindataset['NLPtext'].notna()]
maindataset = maindataset[maindataset['lentitle']>=4]
maindataset = maindataset[maindataset['lendesc']>=4]
maindataset = maindataset[maindataset['lentext']>=4]
maindataset = maindataset.reset_index(drop=False)
maindataset
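To make the effect of the whole cleaning chain concrete, here is a quick check on a made-up sentence. The inline mini-dictionary is only for illustration; the real pipeline uses the full dictionary built above.
# Sanity check of the cleaning chain on an invented sentence
sample = "John Smith visited Paris, France on Monday and spent 500 euros!"
mini_dict = {"john": "xfirstnamex", "smith": "xlastnamex", "france": "xcountryx", "euros": "xcurrencyx"}  # illustrative only
cleaned = preprepare(sample)                                           # lowercase, letters only
cleaned = ' '.join([w for w in cleaned.split() if w not in nltkstop])  # drop stopwords
cleaned = replace_words(cleaned, mini_dict)                            # swap dictionary words for tags
print(steming(cleaned))                                                # stem whatever is left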

Finally, it is time to train the doc2vec model.
# Shuffle the dataset
trainset = maindataset.sample(frac=1).reset_index(drop=True)
# Exclude texts that are too short
trainset = trainset[(trainset['NLPtext'].str.len() >= 5)]
# Keep only the text column
trainset = trainset[["NLPtext"]]
# Tokenize and build the training corpus
tagged_data = []
for index, row in trainset.iterrows():
    part = TaggedDocument(words=word_tokenize(row[0]), tags=[str(index)])
    tagged_data.append(part)
# Define the model
model = Doc2Vec(vector_size=250, min_count=3, epochs=20, dm=1)
model.build_vocab(tagged_data)
# Train and save
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
model.save("d2v.model")
print("Model Saved")
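Once saved, the model can be reloaded later without retraining. A minimal sketch (the example text is invented) of loading it back and inferring a vector for a new, already-cleaned document:
# Reload the trained model from disk and infer a vector for a new piece of cleaned text
model = Doc2Vec.load("d2v.model")
new_text = "xcountryx announces new tax on xcurrencyx transfers"  # invented, already-cleaned text
new_vector = model.infer_vector(word_tokenize(new_text))
print(new_vector.shape)  # (250,) because vector_size=250 above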
To keep the data volume and runtime manageable, I will filter the dataset down to a single news source.
maindataset.groupby('publication').count()['index']

maindatasetF = maindataset[maindataset["publication"]=="Guardian"]
Now I vectorize the text of the selected publication.
a = []
for index, row in maindatasetF.iterrows():
    nlptext = row['NLPtext']
    ids = row['index']
    vector = model.infer_vector(word_tokenize(nlptext))
    vector = pd.DataFrame(vector).T
    vector.index = [ids]
    a.append(vector)
textvectors = pd.concat(a)
textvectors

Standardize the embeddings and apply PCA (dimensionality reduction):
def properscaler(simio):
    scaler = StandardScaler()
    resultsWordstrans = scaler.fit_transform(simio)
    resultsWordstrans = pd.DataFrame(resultsWordstrans)
    resultsWordstrans.index = simio.index
    resultsWordstrans.columns = simio.columns
    return resultsWordstrans

datasetR = properscaler(textvectors)

def varred(simio):
    scaler = PCA(n_components=0.8, svd_solver='full')
    resultsWordstrans = simio.copy()
    resultsWordstrans = scaler.fit_transform(resultsWordstrans)
    resultsWordstrans = pd.DataFrame(resultsWordstrans)
    resultsWordstrans.index = simio.index
    resultsWordstrans.columns = resultsWordstrans.columns.astype(str)
    return resultsWordstrans

datasetR = varred(datasetR)
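A quick way to see how much PCA reduced the representation (n_components=0.8 keeps as many components as are needed to explain 80% of the variance):
# Dimensionality before and after scaling + PCA
print(textvectors.shape, "->", datasetR.shape)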
The first exercise I want to try is a similarity search: finding articles similar to a given example.
# Find and print the original search item by its index
index = 95133
texttofind = maindatasetF[maindatasetF["index"]==index]["title"]
print(str(texttofind))
id = index
print(str(id))
cat = maindatasetF[maindatasetF["index"]==index]["publication"]
print(str(cat))
embdfind = datasetR[datasetR.index==id]
# Compute Euclidean distances and extract the articles closest to the given example
distances = pairwise_distances(X=embdfind, Y=datasetR, metric='euclidean')
distances = pd.DataFrame(distances).T
distances.index = datasetR.index
distances = distances.sort_values(0)
distances = distances.reset_index(drop=False)
distances = pd.merge(distances, maindatasetF[["index","title","publication","content"]], left_on=["index"], right_on=["index"])
pd.options.display.max_colwidth = 100
distances.head(100)[['index',0,'publication','title']]
We can see that the retrieved texts make sense; they are similar in nature to the example provided.
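If this lookup will be repeated for other articles, it can be wrapped in a small helper. This is just a convenience sketch reusing the objects defined above; the function name is my own:
def most_similar_articles(article_index, top_n=10):
    # Return the top_n articles closest (Euclidean distance) to the article with the given index
    emb = datasetR[datasetR.index == article_index]
    dist = pairwise_distances(X=emb, Y=datasetR, metric='euclidean')
    dist = pd.DataFrame(dist).T
    dist.index = datasetR.index
    dist = dist.sort_values(0).reset_index(drop=False)
    return pd.merge(dist, maindatasetF[["index", "title", "publication"]], on="index").head(top_n)

most_similar_articles(95133, top_n=10)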

For clustering, the first step is to find the right number of clusters. We want to maximize the silhouette and Calinski-Harabasz scores while keeping a sensible number of clusters (not so few that they are hard to interpret, and not so many that they become overly granular).
# Loop over different numbers of clusters
a = []
X = datasetR.to_numpy(dtype='float')
for ncl in np.arange(2, int(20), 1):
    clusterer = Birch(n_clusters=int(ncl))
    # Keep warning messages from cluttering the output
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cluster_labels2 = clusterer.fit_predict(X)
    silhouette_avg2 = silhouette_score(X, cluster_labels2)
    calinski2 = calinski_harabasz_score(X, cluster_labels2)
    row = pd.DataFrame({"ncl": [ncl],
                        "silBirch": [silhouette_avg2], "c_hBirch": [calinski2],
                        })
    a.append(row)
scores = pd.concat(a, ignore_index=True)
# Plot the results
plt.style.use('bmh')
fig, [ax_sil, ax_ch] = plt.subplots(1, 2, figsize=(15, 7))
ax_sil.plot(scores["ncl"], scores["silBirch"], 'b-')
ax_ch.plot(scores["ncl"], scores["c_hBirch"], 'b-')
ax_sil.set_title("Silhouette curve")
ax_ch.set_title("Calinski-Harabasz curve")
ax_sil.set_xlabel('number of clusters')
ax_sil.set_ylabel('average silhouette score')
ax_ch.set_xlabel('number of clusters')
ax_ch.set_ylabel('calinski_harabasz score')
plt.show()

I chose 5 clusters and ran the algorithm.
ncl_birch = 5
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    clusterer2 = Birch(n_clusters=int(ncl_birch))
    cluster_labels2 = clusterer2.fit_predict(X)
n_clusters2 = max(cluster_labels2)
silhouette_avg2 = silhouette_score(X, cluster_labels2)
sample_silhouette_values2 = silhouette_samples(X, cluster_labels2)
finalDF = datasetR.copy()
finalDF["cluster"] = cluster_labels2
finalDF["silhouette"] = sample_silhouette_values2
# Plot the silhouette scores
fig, ax2 = plt.subplots()
ax2.set_xlim([-0.1, 1])
ax2.set_ylim([0, len(X) + (n_clusters2 + 1) * 10])
y_lower = 10
for i in range(min(cluster_labels2), max(cluster_labels2) + 1):
    ith_cluster_silhouette_values = sample_silhouette_values2[cluster_labels2 == i]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(i) / n_clusters2)
    ax2.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)
    ax2.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10
ax2.set_title("Silhouette plot for Birch")
ax2.set_xlabel("silhouette coefficient values")
ax2.set_ylabel("cluster label")
ax2.axvline(x=silhouette_avg2, color="red", linestyle="--")
ax2.set_yticks([])
ax2.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

These results tell me that cluster 4 probably shows less cohesion than the rest, while clusters 3 and 1 are well defined. Here is a sample of the results.
showDF = finalDF.sort_values(['cluster','silhouette'], ascending=[False,False]).groupby('cluster').head(3)
showDF = pd.merge(showDF[['cluster','silhouette']], maindatasetF[["index",'title']], left_index=True, right_on=["index"])
showDF

I can see that cluster 4 is technology-related news, cluster 3 is war/international events, cluster 2 is entertainment, cluster 1 is sports, and cluster 0, as usual, can be treated as "other".
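As a final convenience, the cluster ids can be mapped back onto the articles. The topic names in topic_map are my own labels, based purely on the manual inspection above:
# Hypothetical topic labels derived from inspecting the clusters above
topic_map = {0: "other", 1: "sports", 2: "entertainment", 3: "war / international", 4: "technology"}
labeled = pd.merge(finalDF[["cluster", "silhouette"]], maindatasetF[["index", "title"]], left_index=True, right_on=["index"])
labeled["topic"] = labeled["cluster"].map(topic_map)
labeled["topic"].value_counts()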


