LDA topic model implementation in code, with the perplexity verified to be positive in practice

Computing perplexity and extracting the topic words requires three files: the raw (unsegmented) dataset, a user dictionary, and a stopword list. Paths are best written with / rather than \.
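For reference, here is a minimal sketch of what the three input files could look like. The contents are invented examples for illustration only; the dictionary follows jieba's load_userdict convention of one word per line with an optional POS tag.

# Illustration only: create tiny sample versions of the three required files.
# The example contents are invented; replace them with your real data.
with open("数据分析师测试.txt", "w", encoding="utf-8") as f:
    f.write("content\n")                        # first line: the column header expected below
    f.write("负责业务数据的清洗、分析与可视化\n")    # one raw, unsegmented document per line
    f.write("熟练使用SQL完成数据提取\n")

with open("dict.txt", "w", encoding="utf-8") as f:
    f.write("牛马 n\n摸鱼 v\n")                  # jieba user dictionary: "word POS", one entry per line

with open("stopword.txt", "w", encoding="utf-8") as f:
    f.write("的\n了\n以及\n")                    # stopword list: one word per line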

import numpy as np
import pandas as pd
import re
import jieba
import jieba.posseg as psg
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
data = pd.read_table("数据分析师测试.txt")  # the first line of the file is the column header "content"
dic_file = "dict.txt"  # jieba user dictionary with POS tags, e.g. 牛马 n, 摸鱼 v
stop_file = "stopword.txt"  # stopword list, one word per line


def chinese_word_cut(mytext):
    jieba.load_userdict(dic_file)
    jieba.initialize()
    # load the stopword list; fall back to an empty list if the file cannot be read
    stop_list = []
    try:
        with open(stop_file, encoding='utf-8') as stopword_file:
            stop_list = [line.strip() for line in stopword_file]
    except OSError:
        print("error in stop_file")

    flag_list = ['n', 'nz', 'vn']  # keep only nouns, other proper nouns and verbal nouns
    word_list = []
    # POS-tagged segmentation with jieba
    seg_list = psg.cut(mytext)
    for seg_word in seg_list:
        # keep Chinese characters only
        word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)
        # word = seg_word.word  # to analyse English text, enable this line and comment out the line above
        if len(word) < 2 or word in stop_list:  # skip short words and stopwords
            continue
        if seg_word.flag in flag_list:
            word_list.append(word)
    return " ".join(word_list)
data["content_cutted"] = data.content.apply(chinese_word_cut)
def print_top_words(model, feature_names, n_top_words):
    # print the n_top_words highest-weighted words of every topic and collect them
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword
n_features = 1000  # keep at most 1000 feature words
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(data.content_cutted)
n_topics = 8
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
#                                 doc_topic_prior=0.1,
#                                 topic_word_prior=0.01,
                               random_state=0)
lda.fit(tf)
n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names_out()

topic_word = print_top_words(lda, tf_feature_names, n_top_words)

topics=lda.transform(tf)
topic = []
for t in topics:
    topic.append("Topic #" + str(np.argmax(t)))  # most probable topic for this document
data['概率最大的主题序号'] = topic        # column: index of the most probable topic
data['每个主题对应概率'] = list(topics)   # column: full topic-probability vector
data.to_excel("data_topic.xlsx",index=False)

plexs = []
scores = []
n_max_topics = 16
for i in range(1,n_max_topics):
    print(i)
    lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50,random_state=0)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))
    scores.append(lda.score(tf))  # approximate log-likelihood (collected but not plotted below)
n_t = 15  # right end of the plotted range; must not exceed n_max_topics - 1
x=list(range(1,n_t+1))
plt.plot(x,plexs[0:n_t])
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.savefig('困惑度' + '.png')
print("图片保存成功")
plt.show()
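pyLDAvis is imported at the top but never used above. As a sketch of how the fitted sklearn model could be inspected with it, assuming a pyLDAvis release that still ships the pyLDAvis.sklearn module (newer releases renamed it to pyLDAvis.lda_model):

# Sketch: interactive visualization of the sklearn LDA model with pyLDAvis.
lda8 = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                 learning_method='batch',
                                 learning_offset=50, random_state=0)
lda8.fit(tf)                                   # refit, since `lda` was overwritten by the loop above
panel = pyLDAvis.sklearn.prepare(lda8, tf, tf_vectorizer)
pyLDAvis.save_html(panel, 'lda_sklearn.html')  # open this HTML file in a browser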

The code is adapted from the original video uploader's tutorial.

Text segmentation + word cloud code. The word cloud makes it easy to spot meaningless high-frequency words, which can then be added to the stopword list (see the sketch after the word cloud code below).

# Note: the input files must be saved as UTF-8!
# text segmentation
import jieba as jb
# import docx
import re

# stopWordsFile: path to the stopword list; fileName: the text to process; writeFile: the output file for the preprocessed text
# load the stopwords and return them as a list
def loadStopWords(fileName):
    with open(fileName, encoding='utf-8', errors='ignore') as f:
        stopwords = f.read()
    custom_stopwords_list = stopwords.split('\n')
    custom_stopwords_list.extend(['二'])  # extra stopwords can be appended here
    return custom_stopwords_list


# segment the text, remove stopwords, and return the remaining tokens as a list
def wordsCut(words, stopWordsFile):
    result = jb.cut(words)
    newWords = []
    stopWords = loadStopWords(stopWordsFile)
    for s in result:
        if s not in stopWords:
            newWords.append(s)
    return newWords


# keep only non-empty, non-whitespace tokens
def not_empty(s):
    return s and s.strip()


# segment the sample file line by line and write the result to a new file
def fileCut(fileName, writeFile, stopWordsFile):
    dataMat = []
    fr = open(fileName, encoding='utf-8', errors='ignore')  # use gbk / gb18030 here if the file is not UTF-8
    frW = open(writeFile, 'w', encoding='utf-8', errors='ignore')
    for line in fr.readlines():  # read the file line by line
        curLine = line.strip()  # strip leading and trailing whitespace
        curLine1 = curLine.upper()  # upper-case any English letters
        cutWords = wordsCut(curLine1, stopWordsFile)  # segment and remove stopwords, returns a list for this line
        cutWords = list(filter(not_empty, cutWords))
        for i in range(len(cutWords)):
            if re.match(r'^\d.*', cutWords[i]):  # drop tokens that start with a digit
                cutWords[i] = ''
            else:
                frW.write(cutWords[i])
                frW.write(' ')
        frW.write('\n')
        dataMat.append(cutWords)
    fr.close()
    frW.close()
    return dataMat


# stopWordsFile: path to the stopword list; fileName: the text to process; writeFile: the output file for the preprocessed text
fileName = r'D:/Project/database/数据分析师测试.txt'
writeFile = r'D:/Project/database/数据分析师测试已分词.txt'
stopWordsFile = r'D:/Project/stopword.txt'
fileCut(fileName, writeFile, stopWordsFile)
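A quick sanity check on the segmented output (a sketch using the paths defined above):

# Sketch: print the first few segmented lines to confirm the preprocessing worked.
with open(writeFile, encoding='utf-8') as f:
    for _ in range(3):
        print(f.readline().strip())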
# full word cloud code
import numpy as np
import wordcloud as wc
import matplotlib.pyplot as plt
from PIL import Image
import jieba
# 1. Open the file and read the segmented text into a single string
with open("D:/Project/岗位职责已分词.txt", mode="r", encoding='utf-8') as fp:
    text = fp.read()
# 2. Load the mask image that defines the shape of the word cloud
mask = np.array(Image.open('D:/Project/小猫咪3.png'))
# 3. Specify stopwords to exclude from the cloud
stopwords=['的','了','熟悉','负责','产品','供应商','新','工程师','岁','关键字','岗位职责','以上学历','专业','年龄','年','熟练','流程','质量','需求','客户','系统','具备','软件','管理','开发','公司','相关','项目','数据','设计','经验','技术','工作','合作','就是','上班','地址','查看','地图','优先','AND','职能','类别','任职']
# 4. Create the WordCloud object and set its basic options
word_cloud = wc.WordCloud(mask=mask,
                          font_path="C:/Windows/Fonts/SIMHEI.ttf",  # a Chinese font is required
                          stopwords=stopwords,                      # the stopwords listed above are dropped
                          background_color='white')
# 5. Generate the word cloud from the loaded text
word_cloud.generate(text)
# 6. Display and save the word cloud
plt.imshow(word_cloud)
plt.show()
word_cloud.to_file('小猫咪' + '.png')
print("word cloud saved as 小猫咪.png")
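Words that the cloud reveals as meaningless can then be appended to the stopword file so that the next segmentation run filters them out. A minimal sketch (the path and the example words below are placeholders):

# Sketch: append words judged meaningless from the word cloud to the stopword list.
extra_stopwords = ['以上', '以及']  # placeholder examples
with open('D:/Project/stopword.txt', 'a', encoding='utf-8') as f:
    for w in extra_stopwords:
        f.write(w + '\n')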

LDA visualization with gensim. Note that the dataset used here is the already-segmented text. When the code finishes it prints a local link; open it to see the interactive pyLDAvis page. num_topics is the number of topics to fit.

from gensim.corpora import Dictionary
from gensim.models import LdaModel
import codecs
import pyLDAvis.gensim  # in pyLDAvis >= 3.0 this module is named pyLDAvis.gensim_models

if __name__ == '__main__':
    doc1 = codecs.open('D:/Project/database/华东已分词.txt', mode='r', encoding='utf-8')
    lines = doc1.readlines()
    doc1.close()
    all_set = []

    listword = ['数据', '数据分析', '工作', '年', '强', '良好', '相关', '企业', '公司', '要求', '优先', '各类'
            ,'良好','具备','熟练','熟悉',' - ','类','提供','优先','具有','进行' ,'要求','能力','分析','负责'
            ,'经验','任职','完成','专业','活动','问题']
    for line in lines:
        line = line.replace(' \n', '')
        line = line.replace('\r\n', '')
        line = line.split()
        all_set.append([w for w in line if w not in listword])
    # print(all_set)  # nested list, one inner list of tokens per line: [['xx','xx'], ['xx','xx'], ...]
    # build the training corpus and visualize it
    dictionary = Dictionary(all_set)
    corpus = [dictionary.doc2bow(text) for text in all_set]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4)
    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    pyLDAvis.show(vis_data, open_browser=False)
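If keeping a local server running is inconvenient, the same visualization can instead be written to a standalone HTML file and opened directly in a browser. A minimal sketch (the output file name is an arbitrary choice), used in place of the pyLDAvis.show call:

    pyLDAvis.save_html(vis_data, 'lda_gensim.html')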
