import pandas as pd
import numpy as np
import re  # regular expressions
import jieba.posseg as psg  # jieba segmentation with part-of-speech tagging
# Data preprocessing
data = pd.read_csv('./data/reviews.csv')  # load the data
reviews = data[['content', 'content_type']].drop_duplicates()  # drop duplicate reviews
content = reviews['content']  # work on the review text column separately
str_info = re.compile('[0-9a-zA-Z]|京东|美的|电热水器|热水器')  # pattern that strips digits, letters and the product keywords
content = content.apply(lambda x: str_info.sub('', x))  # apply the pattern to every review
The difference between jieba.posseg and jieba: jieba.posseg also tags the part of speech of each token.
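A quick side-by-side illustration of the two APIs (standalone snippet, not part of the pipeline; the sample sentence is arbitrary):
# Standalone illustration: jieba.cut returns plain tokens,
# jieba.posseg.cut returns (word, POS-flag) pairs.
import jieba
sample = '美的电热水器加热很快'
print(list(jieba.cut(sample)))                      # plain tokens
print([(x.word, x.flag) for x in psg.cut(sample)])  # tokens with part-of-speech tags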
worker = lambda s:[(x.word, x.flag) for x in psg.cut(s)]
seg_word = content.apply(worker)  # tokenise each review and tag parts of speech
n_word = seg_word.apply(lambda x: len(x))  # number of tokens in each review after segmentation
n_content = [[x + 1] * y for x, y in zip(list(seg_word.index), list(n_word))]
index_content = sum(n_content, [])  # review id for every token: tokens with the same id come from the same review
seg_word = sum(seg_word, [])  # flatten the nested list one level into a single list
word = [x[0] for x in seg_word]  # the tokens themselves
nature = [x[1] for x in seg_word]  # their part-of-speech tags; flattening preserves the original order
content_type = [[x] * y for x, y in zip(list(reviews['content_type']), list(n_word))]  # sentiment label of the whole review (token-level sentiment is derived later)
content_type = sum(content_type, [])  # flatten the nested list into a single list
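The sum(nested_list, []) idiom used above simply concatenates the inner lists; a toy illustration:
# Toy illustration of the flattening idiom (not part of the pipeline):
assert sum([[1, 1], [2], [3, 3, 3]], []) == [1, 1, 2, 3, 3, 3]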
# Build the result data frame
result = pd.DataFrame({
    "index_content": index_content,
    "word": word,
    "nature": nature,
    "content_type": content_type
})
result = result[result['nature'] != 'x']  # drop punctuation (POS tag 'x' in the jieba tag set)
stop = open('./data/stoplist.txt', 'r', encoding='UTF-8')  # load the stop-word list
stop_word = stop.readlines()
stop.close()
stop_word = [x.replace('\n', '') for x in stop_word]  # strip the trailing newline from each stop word
word = list(set(word) - set(stop_word))  # words that survive stop-word filtering
result = result[result['word'].isin(word)]  # remove stop words
n_word = list(result.groupby(by=['index_content'])['index_content'].count())  # number of remaining tokens per review
index_word = [list(np.arange(0, y)) for y in n_word]
index_word = sum(index_word, [])
result['index_word'] = index_word  # position of each token within its review
# Keep only the reviews that contain at least one noun (POS tags containing 'n')
ind = result[['n' in x for x in result['nature']]]['index_content'].unique()
result = result[[x in ind for x in result['index_content']]]
The word-cloud code below is a template that can be reused directly.
# Plot the word cloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Word frequency statistics (only the noun-bearing reviews remain at this point)
frequencies = result.groupby(by = ['word'])['word'].count()
frequencies = frequencies.sort_values(ascending = False)
print(frequencies)
backgroud_Image = plt.imread('./data/pl.jpg')
wordcloud = WordCloud(max_words=100,
                      background_color='white',
                      mask=backgroud_Image)
# Note: Chinese text may not render correctly with WordCloud's default font. Either replace the
# library's default font file, or pass a suitable Chinese font explicitly via the font_path
# argument (see the sketch after the plot below).
my_wordcloud = wordcloud.fit_words(frequencies)
plt.imshow(my_wordcloud)
plt.axis('off')
plt.show()
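A minimal sketch of the font_path approach mentioned above; './data/simhei.ttf' is an assumed path, so substitute any Chinese font installed on your system:
# Hedged sketch: pass an explicit Chinese font so the word cloud renders Chinese correctly.
# './data/simhei.ttf' is an assumed path -- point font_path at a Chinese font you actually have.
wordcloud_cn = WordCloud(max_words=100,
                         background_color='white',
                         mask=backgroud_Image,
                         font_path='./data/simhei.ttf')
plt.imshow(wordcloud_cn.fit_words(frequencies))
plt.axis('off')
plt.show()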
result.to_csv("./word.csv", index = False, encoding = 'utf-8')
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
word = pd.read_csv("./word.csv")
# Load the positive and negative evaluation and sentiment word lists
pos_comment = pd.read_csv("./data/正面评价词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_comment = pd.read_csv("./data/负面评价词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
pos_emotion = pd.read_csv("./data/正面情感词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_emotion = pd.read_csv("./data/负面情感词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')  # four lexicon files, one word per line
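Newer pandas versions may reject sep="\n"; if that happens, a plain-text read produces the same one-column DataFrames (load_wordlist is a helper name I am introducing here, not part of the original code):
# Hedged alternative: read each lexicon as plain text, one word per line.
def load_wordlist(path):
    with open(path, encoding='utf-8') as f:
        return pd.DataFrame([line.strip() for line in f if line.strip()])
# e.g. pos_comment = load_wordlist('./data/正面评价词语(中文).txt')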
# Merge the sentiment words with the evaluation words
positive = set(pos_comment.iloc[:, 0]) | set(pos_emotion.iloc[:, 0])
negative = set(neg_comment.iloc[:, 0]) | set(neg_emotion.iloc[:, 0])
intersection = positive & negative  # words that appear in both the positive and the negative lists
positive = list(positive - intersection)
negative = list(negative - intersection)
positive = pd.DataFrame({"word": positive,
                         "weight": [1] * len(positive)})
negative = pd.DataFrame({"word": negative,
                         "weight": [-1] * len(negative)})  # weight +1 for positive words, -1 for negative words
posneg = pd.concat([positive, negative])  # combined sentiment lexicon (DataFrame.append was removed in pandas 2.0)
# Join the segmentation results with the sentiment lexicon to locate sentiment words
data_posneg = posneg.merge(word, left_on='word', right_on='word',
                           how='right')
data_posneg = data_posneg.sort_values(by=['index_content', 'index_word'])  # sort by review first, then by token position within the review
data_posneg['amend_weight'] = data_posneg['weight']  # new column: sentiment value after negation-word correction
data_posneg['id'] = np.arange(0, len(data_posneg))
only_inclination = data_posneg.dropna()  # keep only the tokens that carry a sentiment value
only_inclination.index = np.arange(0, len(only_inclination))
index = only_inclination['id']
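The amend_weight column is meant to hold the sentiment value after negation-word correction, but the correction itself is not included in this post. A minimal sketch of one possible correction (the not_words list and the two-token look-back window are my own assumptions, not part of the original code):
# Hedged sketch of negation handling: if an odd number of negation words appears in the two
# tokens immediately before a sentiment word (within the same review), flip its sign.
# 'not_words' is an assumed, hand-picked list -- adjust it to your data.
not_words = ['不', '没', '没有', '无', '非', '别', '未']
for i in only_inclination.index:
    content_id = only_inclination.loc[i, 'index_content']
    word_pos = only_inclination.loc[i, 'index_word']
    review = data_posneg[data_posneg['index_content'] == content_id]
    preceding = review[review['index_word'].isin([word_pos - 1, word_pos - 2])]['word']
    if sum(w in not_words for w in preceding) % 2 == 1:
        only_inclination.loc[i, 'amend_weight'] = -only_inclination.loc[i, 'weight']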
# Sentiment score of each review
emotional_value = only_inclination.groupby(['index_content'],
                                           as_index=False)['amend_weight'].sum()
# Drop reviews whose total score is 0
emotional_value = emotional_value[emotional_value['amend_weight'] != 0]  # positive and negative words may cancel each other out
# Label reviews with a score > 0 as pos and those with a score < 0 as neg
emotional_value['a_type'] = ''
emotional_value.loc[emotional_value['amend_weight'] > 0, 'a_type'] = 'pos'
emotional_value.loc[emotional_value['amend_weight'] < 0, 'a_type'] = 'neg'
# Inspect the sentiment analysis result
result = emotional_value.merge(word,
                               left_on='index_content',
                               right_on='index_content',
                               how='left')
result = result[['index_content', 'content_type', 'a_type']].drop_duplicates()  # keep a single row per review (every token of a review carries the same label)
print(result)
confusion_matrix = pd.crosstab(result['content_type'], result['a_type'],
                               margins=True)  # confusion matrix of original labels vs. predicted labels
print((confusion_matrix.iat[0, 0] + confusion_matrix.iat[1, 1]) / confusion_matrix.iat[2, 2])  # accuracy (diagonal over the 'All' total), about 89%
# Extract the positive and negative reviews
ind_pos = list(emotional_value[emotional_value['a_type'] == 'pos']['index_content'])
ind_neg = list(emotional_value[emotional_value['a_type'] == 'neg']['index_content'])
posdata = word[[i in ind_pos for i in word['index_content']]]
negdata = word[[i in ind_neg for i in word['index_content']]]
# Plot the word clouds
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Word cloud of positive sentiment words
freq_pos = posdata.groupby(by=['word'])['word'].count()
freq_pos = freq_pos.sort_values(ascending=False)
backgroud_Image = plt.imread('./data/pl.jpg')
wordcloud = WordCloud(
    max_words=100,
    background_color='white',
    mask=backgroud_Image)
pos_wordcloud = wordcloud.fit_words(freq_pos)
plt.imshow(pos_wordcloud)
plt.axis('off')
plt.show()
# Word cloud of negative sentiment words
freq_neg = negdata.groupby(by=['word'])['word'].count()
freq_neg = freq_neg.sort_values(ascending=False)
neg_wordcloud = wordcloud.fit_words(freq_neg)
plt.imshow(neg_wordcloud)
plt.axis('off')
plt.show()
# Write the results out (all tokens of the positive / negative reviews)
posdata.to_csv("posdata.csv", index=False, encoding='utf-8')
negdata.to_csv("negdata.csv", index=False, encoding='utf-8')
The LDA model, also known as a three-layer Bayesian probabilistic model, has a document (d), topic (z) and word (w) layer and models text effectively by adding probabilistic information. LDA is unsupervised: you only need to supply the training documents and it learns the various probabilities automatically, with no manual labelling, which saves a great deal of time and effort. The model also generalises well and is not prone to overfitting.
from gensim import corpora, models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import itertools
import warnings
warnings.filterwarnings('ignore')
# Load the data
pos_data = pd.read_csv('./posdata.csv')
neg_data = pd.read_csv('./negdata.csv')
# Build the dictionaries
pos_dict = corpora.Dictionary([[i] for i in pos_data['word']])
neg_dict = corpora.Dictionary([[i] for i in neg_data['word']])
# Build the corpora
pos_corpus = [pos_dict.doc2bow(j) for j in [[i] for i in pos_data['word']]]
neg_corpus = [neg_dict.doc2bow(j) for j in [[i] for i in neg_data['word']]]
# doc2bow() turns each sample into a sparse vector: it counts the occurrences of each distinct
# word, maps the word to its dictionary id, and returns (id, count) pairs.
# For example, (0, 1) means the word with id 0 occurs once in the first document.
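A standalone illustration of what Dictionary and doc2bow produce (toy tokens, not from the dataset; the exact ids may differ):
# Toy example: doc2bow maps each token to (token_id, count) pairs.
toy_docs = [['不错', '信赖'], ['不错', '物流', '物流']]
toy_dict = corpora.Dictionary(toy_docs)
print([toy_dict.doc2bow(doc) for doc in toy_docs])
# e.g. [[(0, 1), (1, 1)], [(0, 1), (2, 2)]] -- the pair (2, 2) means token id 2 occurs twice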
# Cosine similarity between two vectors
def cos(vector1, vector2):
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    else:
        return dot_product / ((normA * normB) ** 0.5)
# Topic-number search; if the optimal number of topics is already known, the search below can be skipped
def lda_k(x_corpus, x_dict):
    # Initialise the list of mean cosine similarities
    mean_similarity = []
    mean_similarity.append(1)
    # Train a model for each candidate topic number and compare its topics
    for i in np.arange(2, 11):  # candidate topic numbers 2 to 10
        lda = models.LdaModel(x_corpus, num_topics=i, id2word=x_dict)  # train the LDA model
        for j in np.arange(i):
            term = lda.show_topics(num_words=50)  # top 50 words of every topic
        # Extract the words of each topic
        top_word = []
        for k in np.arange(i):
            top_word.append([''.join(re.findall('"(.*)"', i)) \
                             for i in term[k][1].split('+')])  # keep only the words; the regex strips the weights
        # Sample of show_topics output:
        # [(0, '0.027*"不错" + 0.020*"东西" + 0.020*"信赖" + 0.017*"物流" + 0.016*"购物" + 0.016*"太" + 0.014*"品牌" + 0.014*"家里" + 0.014*"电话" + 0.013*"装"')]
        # Build word-frequency vectors
        word = sum(top_word, [])  # all topic words
        unique_word = set(word)   # deduplicated vocabulary
        # One row per topic, one column per word of the vocabulary
        mat = []
        for j in np.arange(i):
            top_w = top_word[j]
            mat.append(tuple([top_w.count(k) for k in unique_word]))
        p = list(itertools.permutations(list(np.arange(i)), 2))
        l = len(p)
        top_similarity = [0]
        for w in np.arange(l):
            vector1 = mat[p[w][0]]
            vector2 = mat[p[w][1]]
            top_similarity.append(cos(vector1, vector2))
        # Mean cosine similarity between topics for this topic number
        mean_similarity.append(sum(top_similarity) / l)
    return mean_similarity
pos_k = lda_k(pos_corpus,pos_dict)
neg_k = lda_k(neg_corpus,neg_dict)
from matplotlib.font_manager import FontProperties
font = FontProperties(size=14)
# Make matplotlib render Chinese characters correctly
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(10,8))
ax1 = fig.add_subplot(211)
ax1.plot(pos_k)
ax1.set_xlabel('正面评论LDA主题数寻优', fontproperties=font)
ax2 = fig.add_subplot(212)
ax2.plot(neg_k)
ax2.set_xlabel('负面评论LDA主题数寻优', fontproperties=font)
plt.show()  # the optimal topic number can be read off these plots
The higher the cosine value, the more similar two topics are. The plots show that the mean similarity reaches its minimum at 3 topics, i.e. the topics overlap least when there are 3 of them, so 3 is the optimal topic number. We therefore build both the positive and the negative model with 3 topics.
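If you prefer to pick the topic number programmatically rather than reading it off the plot, a small convenience sketch (position 0 of the list returned by lda_k corresponds to 1 topic):
# Convenience sketch: index of the minimum mean similarity plus one gives the topic number.
best_pos_k = int(np.argmin(pos_k)) + 1
best_neg_k = int(np.argmin(neg_k)) + 1
print(best_pos_k, best_neg_k)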
pos_lda = models.LdaModel(pos_corpus, num_topics = 3, id2word = pos_dict)
neg_lda = models.LdaModel(neg_corpus, num_topics = 3, id2word = neg_dict)
print(pos_lda.print_topics(num_words=10))
print(neg_lda.print_topics(num_words=10))
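To inspect the document–topic layer described earlier, the per-document topic distribution can also be queried; an illustrative line using gensim's get_document_topics:
# Illustrative: (topic id, probability) pairs for the first entry in the positive corpus.
print(pos_lda.get_document_topics(pos_corpus[0]))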
Analyse the output for yourselves; if you need the data, grab it from my repository. This is my first blog post, thanks for the bookmarks and likes!!