LDA model: a Python code example

# -*- coding: utf-8 -*-
import re
import warnings
import jieba
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

warnings.filterwarnings('ignore')  # suppress warnings to keep the output readable
PATH = "data_lda.csv"

# Stop words
stop_words_path = "stop_words.txt"
stop_words = []

# Path to the reserved-word (user dictionary) file.
# File format: one word per line, as "word [frequency] [POS tag]"
# (frequency and POS tag are optional).
reserved_words_path = "reserved_words.txt"
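# A hypothetical reserved_words.txt might look like this (illustrative entries only):
#   主题模型
#   困惑度 10 n
#   一致性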

with open(stop_words_path, encoding='utf-8') as f:
    for line in f:
        stop_words.append(line.strip())


# Data cleaning; adapt it to your own needs.
def processing(text):
    text = re.sub("【.+?】", "", text)  # remove 【xx】 blocks (their content is usually not written by the user)
    text = re.sub("\n", "", text)  # remove newlines
    text = re.sub(r'[\W]', "", text)  # remove punctuation and other non-word characters
    text = re.sub(r'[\d]', "", text)  # remove digits
    return text
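# Example: processing("【编辑注】这篇文章,写得好!123") returns "这篇文章写得好",
# i.e. the 【...】 block, the punctuation and the digits are all stripped.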


# Chinese word segmentation for a single sentence
def seg_depart(sentence):
    jieba.load_userdict(reserved_words_path)  # load the user dictionary (loading it once, outside this function, would be more efficient)
    sentence_depart = jieba.cut(sentence.strip())
    out_str = ''  # segmented words joined by spaces
    for word in sentence_depart:
        if word in stop_words:
            continue
        out_str += word
        out_str += " "
    return out_str
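# Example (the exact segmentation depends on jieba's dictionaries and on stop_words.txt):
# seg_depart("论文的实验设计比较合理") might return something like "论文 实验 设计 比较 合理 ".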


def get_data_set(path):
    data = pd.read_csv(path)
    data_set = []  # list holding the raw documents
    print("There are {} rows of data in total".format(len(data["审稿意见"])))
    for i_ in data["审稿意见"]:  # "审稿意见" = the review-comments column
        data_set.append(i_.strip())
    return data_set
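# Assumed input format: data_lda.csv contains a text column named "审稿意见" (review comments),
# one document per row; change the column name to match your own data.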


def get_fen_ci_data(data):
    output = []
    for line in data:
        line = processing(line)
        line_seg = seg_depart(line)
        output.append(line_seg.split())
    print("Word segmentation finished!")
    return output


"""
一般我们可以用指标来评估模型好坏,也可以用这些指标来确定最优主题数。
一般用来评价LDA主题模型的指标有困惑度(perplexity)和主题一致性(coherence),
困惑度越低或者一致性越高说明模型越好。一些研究表明perplexity并不是一个好的指标,
所以一般我用coherence来评价模型并选择最优主题
"""


# Compute perplexity
def perplexity(topics_num):
    print("\n####### number of topics is {} #######\n".format(topics_num))
    lda_model = LdaModel(corpus, num_topics=topics_num, id2word=dictionary, passes=30)
    print(lda_model.print_topics(num_topics=topics_num, num_words=15))
    log_perp = lda_model.log_perplexity(corpus)
    print(log_perp)
    return log_perp
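# Note: log_perplexity() returns a per-word likelihood bound rather than the perplexity itself;
# under gensim's convention the actual perplexity (lower is better) can be recovered as
# 2 ** (-lda_model.log_perplexity(corpus)).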


# Compute coherence
def coherence(topics_num):
    print("\n####### number of topics is {} #######\n".format(topics_num))
    lda_model = LdaModel(corpus, num_topics=topics_num, id2word=dictionary, passes=30, random_state=1)
    print(lda_model.print_topics(num_topics=topics_num, num_words=10))
    lda_cm = CoherenceModel(model=lda_model, texts=fen_ci_data, dictionary=dictionary, coherence='c_v')
    coherence_score = lda_cm.get_coherence()
    print(coherence_score)
    return coherence_score
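# If the tokenized texts were not available, a corpus-based measure could be used instead,
# e.g. (a minimal sketch): CoherenceModel(model=lda_model, corpus=corpus,
# dictionary=dictionary, coherence='u_mass').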


# Print the LDA model result
def show_lda_result(data, topics_num, words_num):
    print("\n============== number of topics: {}  words per topic: {} ==============".format(topics_num, words_num))
    dictionary_ = corpora.Dictionary(data)  # build the dictionary
    corpus_ = [dictionary_.doc2bow(text) for text in data]  # bag-of-words: (token id, count) pairs per document
    lda_model = LdaModel(corpus_, num_topics=topics_num, id2word=dictionary_, passes=30, random_state=1)
    out_put = lda_model.print_topics(num_topics=topics_num, num_words=words_num)  # top words of each topic
    for i_ in out_put:
        print(i_)
    print("\n\n")


if __name__ == "__main__":
    print("Hello world!")
    print("当前停用词为: ", stop_words)

    # Load the data
    input_data = get_data_set(PATH)
    print("\n============ loading data ==========\n")
    for i in input_data[:5]:
        print(i)
        print("\n######################\n")

    # Tokenize the data
    fen_ci_data = get_fen_ci_data(input_data)
    print("\n============ segmentation results ==========\n")
    for i in fen_ci_data[:5]:
        print(i)
        print("\n######################\n")

    print("\n============ LDA模型 ==========\n")
    dictionary = corpora.Dictionary(fen_ci_data)  # 构建词典
    corpus = [dictionary.doc2bow(text) for text in fen_ci_data]  # 表示为第几个单词出现了几次
    for i in corpus[:5]:
        print(i)
        print("\n######################\n")

    num_words = 15  # number of words printed per topic
    num_topics = 5  # number of topics
    show_lda_result(fen_ci_data, num_topics, num_words)

    num_topics = 13  # number of topics
    show_lda_result(fen_ci_data, num_topics, num_words)

    # Plot coherence for topic numbers from 1 to 14
    x = range(1, 15)
    # z = [perplexity(i) for i in x]  # use this instead if you prefer perplexity
    y = [coherence(i) for i in x]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # only needed if the labels contain Chinese characters
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.plot(x, y)
    plt.xlabel('number of topics')
    plt.ylabel('coherence')
    plt.title('coherence vs. number of topics')
    plt.show()
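
    # The coherence scores computed above can also be used directly to pick the number of
    # topics; a minimal sketch, assuming the candidate with the highest coherence is preferred:
    best_num_topics = x[y.index(max(y))]
    print("Best number of topics by coherence: {}".format(best_num_topics))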
