Perplexity calculation and topic-word extraction. Three files are needed: the unsegmented dataset, a user dictionary, and a stopword list. In file paths, prefer / over \.
import numpy as np
import pandas as pd
import re
import jieba
import jieba.posseg as psg
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
data = pd.read_table("数据分析师测试.txt")  # first line is the header; the text column is named "content"
dic_file = "dict.txt"   # user dictionary with part-of-speech tags, e.g. 牛马 n, 摸鱼 v
stop_file = "stopword.txt"   # stopword list, one word per line
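For reference, here is a minimal sketch of what the two auxiliary files might look like; the entries are illustrative only (taken from the example comments above), not from the original dataset:
# Illustrative only: write tiny example versions of dict.txt and stopword.txt.
# Real files should contain your own domain vocabulary and stopwords.
with open("dict.txt", "w", encoding="utf-8") as f:
    f.write("牛马 n\n摸鱼 v\n")      # one "word pos" entry per line, the format jieba.load_userdict expects
with open("stopword.txt", "w", encoding="utf-8") as f:
    f.write("的\n了\n以及\n")        # one stopword per line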
def chinese_word_cut(mytext):
    jieba.load_userdict(dic_file)
    jieba.initialize()
    # Load the stopword list (one word per line)
    try:
        stopword_list = open(stop_file, encoding='utf-8')
    except OSError:
        stopword_list = []
        print("error in stop_file")
    stop_list = []
    flag_list = ['n', 'nz', 'vn']  # keep only common nouns, proper nouns and verbal nouns
    for line in stopword_list:
        stop_list.append(line.strip())
    word_list = []
    # Segment the text with jieba, keeping part-of-speech tags
    seg_list = psg.cut(mytext)
    for seg_word in seg_list:
        word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)  # keep Chinese characters only
        # word = seg_word.word  # for English text, comment out the line above and enable this one
        find = 0
        for stop_word in stop_list:
            if stop_word == word or len(word) < 2:  # stopword or too short
                find = 1
                break
        if find == 0 and seg_word.flag in flag_list:
            word_list.append(word)
    return " ".join(word_list)
data["content_cutted"] = data.content.apply(chinese_word_cut)
def print_top_words(model, feature_names, n_top_words):
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword
n_features = 1000  # keep the 1000 most frequent candidate feature words
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(data.content_cutted)
n_topics = 8
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
                                # doc_topic_prior=0.1,
                                # topic_word_prior=0.01,
                                random_state=0)
lda.fit(tf)
n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names_out()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)
topics = lda.transform(tf)
topic = []
for t in topics:
    topic.append("Topic #" + str(list(t).index(np.max(t))))
data['概率最大的主题序号'] = topic        # index of the most probable topic for each document
data['每个主题对应概率'] = list(topics)   # per-topic probability distribution for each document
data.to_excel("data_topic.xlsx", index=False)
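If you would rather have one Excel column per topic instead of a probability list packed into a single cell, the transform output can be spread out with pandas; a minimal sketch (the column names and output file name below are my own choices):
# Optional: one column of probabilities per topic, easier to filter in Excel
topic_prob = pd.DataFrame(topics, columns=["Topic #" + str(i) for i in range(n_topics)])
pd.concat([data, topic_prob], axis=1).to_excel("data_topic_wide.xlsx", index=False)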
plexs = []
scores = []
n_max_topics = 16
for i in range(1, n_max_topics):
    print(i)
    lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50, random_state=0)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))
    scores.append(lda.score(tf))
n_t = 15  # right end of the plotted range; must not exceed n_max_topics - 1
x = list(range(1, n_t + 1))
plt.plot(x, plexs[0:n_t])
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.savefig('困惑度' + '.png')
print("perplexity plot saved")
plt.show()
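The loop above also collects lda.score(tf) (an approximate log-likelihood) in scores but never plots it; a minimal sketch for that second curve, if you want it (the output file name is my own choice):
# Optional: plot the log-likelihood collected in `scores` alongside perplexity
plt.figure()
plt.plot(x, scores[0:n_t])
plt.xlabel("number of topics")
plt.ylabel("approximate log-likelihood")
plt.savefig('score.png')  # hypothetical output name
plt.show()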
Text segmentation + word cloud. The word cloud can be used to spot meaningless words worth adding to the stopword list.
# Warning: make sure the input files are saved in UTF-8!
# Text segmentation
import jieba as jb
# import docx
import re
# stopWordsFile: path to the stopword list; fileName is the raw text; writeFile is the new file that stores the preprocessed text
# Load the stopwords and store them as a list
def loadStopWords(fileName):
    with open(fileName, encoding='utf-8', errors='ignore') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    custom_stopwords_list.extend(['二'])  # extra custom stopword(s)
    return custom_stopwords_list
# Segment the text, drop stopwords, and return a list of tokens
def wordsCut(words, stopWordsFile):
    result = jb.cut(words)
    newWords = []
    stopWords = loadStopWords(stopWordsFile)
    for s in result:
        if s not in stopWords:
            newWords.append(s)
    return newWords
# Drop empty or whitespace-only tokens
def not_empty(s):
    return s and s.strip()
# Segment every line of the sample file and write the result to a new file
def fileCut(fileName, writeFile, stopWordsFile):
    dataMat = []
    fr = open(fileName, encoding='utf-8', errors='ignore')    # try gbk / gb18030 if UTF-8 fails
    frW = open(writeFile, 'w', encoding='utf-8', errors='ignore')
    for line in fr.readlines():           # read the file line by line
        curLine = line.strip()            # strip surrounding whitespace
        curLine1 = curLine.upper()        # convert English letters to upper case
        cutWords = wordsCut(curLine1, stopWordsFile)  # segment and drop stopwords, one list per line
        cutWords = list(filter(not_empty, cutWords))
        for i in range(len(cutWords)):
            if re.match(r'^\d.*', cutWords[i]):       # drop tokens that start with a digit
                cutWords[i] = ''
            else:
                frW.write(cutWords[i])
                frW.write(' ')
        frW.write('\n')
        dataMat.append(cutWords)
    fr.close()
    frW.close()
# stopWordsFile: path to the stopword list; fileName is the raw text; writeFile is the new file that stores the preprocessed text
fileName = r'D:/Project/database/数据分析师测试.txt'
writeFile = r'D:/Project/database/数据分析师测试已分词.txt'
stopWordsFile = r'D:/Project/stopword.txt'
fileCut(fileName, writeFile, stopWordsFile)
# Full word-cloud code
import numpy as np
import wordcloud as wc
import matplotlib.pyplot as plt
from PIL import Image
import jieba
# 1. Open the file and read the text into one string
with open("D:/Project/岗位职责已分词.txt", mode="r", encoding='utf-8') as fp:
    text = fp.read()
# 2. Load the mask image that shapes the word cloud
mask = np.array(Image.open('D:/Project/小猫咪3.png'))
# 3. Specify the stopwords
stopwords = ['的','了','熟悉','负责','产品','供应商','新','工程师','岁','关键字','岗位职责','以上学历','专业','年龄','年','熟练','流程','质量','需求','客户','系统','具备','软件','管理','开发','公司','相关','项目','数据','设计','经验','技术','工作','合作','就是','上班','地址','查看','地图','优先','AND','职能','类别','任职']
# 4. Create the WordCloud object and set its basic options (the stopwords are passed in here)
word_cloud = wc.WordCloud(mask=mask,
                          font_path="C:/Windows/Fonts/SIMHEI.ttf",
                          stopwords=stopwords,
                          background_color='white')
# 5. Call generate() on the text to build the word cloud
word_cloud.generate(text)
# 6. Show and save the word cloud
plt.imshow(word_cloud)
plt.show()
word_cloud.to_file('小猫咪' + '.png')
print('小猫咪 word cloud saved successfully')
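Since the point of the word cloud (per the note above) is to spot meaningless high-frequency words, a plain frequency count over the segmented file can serve the same purpose; a minimal sketch, assuming the same input file as the word cloud:
# Optional: list the most frequent tokens as candidates for the stopword list
from collections import Counter
with open("D:/Project/岗位职责已分词.txt", encoding='utf-8') as fp:
    counts = Counter(fp.read().split())
print(counts.most_common(50))  # inspect the top 50 words and add the useless ones to the stopwords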
LDA visualization. Note that the dataset here is the already-segmented file. When the code finishes it serves a local link; open that link to see the interactive LDA visualization page. num_topics is the number of topics.
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import codecs
import pyLDAvis.gensim  # in newer pyLDAvis releases this module is named pyLDAvis.gensim_models
if __name__ == '__main__':
    doc1 = codecs.open('D:/Project/database/华东已分词.txt', mode='r', encoding='utf-8')
    lines = doc1.readlines()
    all_set = []
    listword = ['数据', '数据分析', '工作', '年', '强', '良好', '相关', '企业', '公司', '要求', '优先', '各类',
                '良好', '具备', '熟练', '熟悉', ' - ', '类', '提供', '优先', '具有', '进行', '要求', '能力', '分析', '负责',
                '经验', '任职', '完成', '专业', '活动', '问题']
    for line in lines:
        line = line.replace(' \n', '')
        line = line.replace('\r\n', '')
        line = line.split()
        all_set.append([w for w in line if w not in listword])
    # print(all_set)  # nested list, one inner list per line: [['xx','xx'], ['xx','xx'], ...]
    # Build the training corpus and visualize it
    dictionary = Dictionary(all_set)
    corpus = [dictionary.doc2bow(text) for text in all_set]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4)
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    pyLDAvis.show(vis_data, open_browser=False)
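If serving a local page is inconvenient, the same visualization can be written to a standalone HTML file instead; a minimal sketch to append at the end of the __main__ block (the output file name is my own choice):
    # Alternative: save the visualization as a self-contained HTML page instead of serving it
    pyLDAvis.save_html(vis_data, 'lda_vis.html')  # open the file in any browser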