LDA (Part 2): Text Clustering

I. Approach: cluster the documents with KMeans

II. Pipeline:

1. Segment the given corpus with jieba to obtain a tokenized corpus;

2. Build the dictionary, then corpus_tfidf, and finally corpus_lda (a toy sketch of these objects follows below);

3. Run KMeans clustering; pred is the list of cluster labels for the corpus:

pred = kmean.predict(tfidf_vec)
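
To make step 2 concrete, here is a minimal sketch (not part of the original script) of the gensim objects involved, run on a made-up three-document corpus; the documents and the num_topics value are purely illustrative:

from gensim import corpora, models

docs = [["cat", "dog", "fish"], ["dog", "bone"], ["fish", "water", "cat"]]  # toy tokenized corpus
dic = corpora.Dictionary(docs)                 # maps each word to an integer id
corpus = [dic.doc2bow(text) for text in docs]  # each doc as sparse (word_id, count) pairs
tfidf = models.TfidfModel(corpus)              # fit IDF statistics on the corpus
corpus_tfidf = tfidf[corpus]                   # each doc as sparse (word_id, tfidf_weight) pairs
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
corpus_lda = lda[corpus_tfidf]                 # each doc as sparse (topic_id, weight) pairs
for doc in corpus_lda:
    print(doc)                                 # e.g. [(0, 0.7...), (1, 0.2...)]

These (topic_id, weight) lists are what post_lda below writes to post_vec_lda.txt and, after conversion with corpus2csc, feeds into KMeans.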
#!/usr/bin/python
# -*- coding:utf8 -*-

import os
import time
import re
import jieba.analyse

# Generate the tokenized file: one "article_id<TAB>keyword list" line per article.
# post_data.txt is expected to contain one article per line with three tab-separated
# fields; field 0 is the article id and field 2 is the article text.
def post_cut(url):
    fr = open(url + "/post_data.txt", encoding='utf-8')
    fo = open(url + "/post_key.txt", "w+", encoding='utf-8')
    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 3 and term[2] != "":
            key_list = jieba.analyse.extract_tags(term[2], 30)  # top-30 keywords extracted by jieba
            ustr = term[0] + "\t"  # keep the article id for later use
            for i in key_list:
                ustr += i + " "
            fo.write(ustr + "\n")
    fr.close()
    fo.close()


def post_tfidf(url):
    from sklearn.feature_extraction.text import HashingVectorizer
    fr = open(url + "/post_key.txt", encoding='utf-8')
    id_list = []
    data_list = []
    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 2:
            id_list.append(term[0])
            data_list.append(term[1])

    # Hash the keyword strings into a fixed-size feature space; alternate_sign=False
    # keeps the values non-negative (it replaces the deprecated non_negative=True
    # argument of older scikit-learn releases). Note that despite the variable name,
    # these are hashed term frequencies, not true tf-idf weights.
    hv = HashingVectorizer(n_features=10000, alternate_sign=False)
    post_tfidf = hv.fit_transform(data_list)  # feature matrix of shape [n_samples, n_features]
    print('Size of fea_train:' + repr(post_tfidf.shape))
    print(post_tfidf.nnz)
    post_cluster(url, id_list, post_tfidf)


def post_cluster(url, id, tfidf_vec):
    from sklearn.cluster import KMeans
    kmean = KMeans(n_clusters=300)
    print("kmeans")
    kmean.fit(tfidf_vec)
    # transform() returns each document's distance to all n_clusters centres
    dist = kmean.transform(tfidf_vec)

    count1 = 0
    count2 = 0
    pred_str = []

    for item in dist:
        count1 += 1
        vec = ""
        for tmp in item:
            vec += str(tmp)[0:7] + "\t"  # truncate each distance to 7 characters
        pred_str.append(vec)

    print(len(pred_str))
    print(len(id))

    pred = kmean.predict(tfidf_vec)  # cluster label for each document
    fo = open(url + "/cluster.txt", "w+")
    for i in range(len(pred)):
        count2 += 1
        fo.write(id[i] + "\t" + str(pred[i]) + "\n")
    fo.close()
    print("%d+%d" % (count1, count2))


def post_lda(url, cluster):
    from gensim import corpora, models, matutils
    count = 0
    fr = open(url + "/post_key.txt", encoding='utf-8')
    fo2 = open(url + "/post_vec_lda.txt", "w+", encoding='utf-8')
    id_list = []
    data_list = []

    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 2:
            count += 1
            id_list.append(term[0])
            word = term[1].strip().split()
            data_list.append(word)
    print("building dictionary, corpus and tfidf matrix")
    dic = corpora.Dictionary(data_list)  # build the dictionary
    corpus = [dic.doc2bow(text) for text in data_list]  # sparse bag-of-words vector for each text
    tfidf = models.TfidfModel(corpus)  # fit tfidf statistics
    corpus_tfidf = tfidf[corpus]  # tfidf vector for each text, as a sparse corpus
    # build the LDA vector for each text
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=200)
    corpus_lda = lda[corpus_tfidf]  # LDA vector for each text: sparse (topic_id, weight) pairs

    num = 0
    for doc in corpus_lda:
        wstr = ""
        for i in range(len(doc)):    # each doc is a list of (topic_id, weight) pairs
            item = doc[i]
            wstr += str(item[0]) + "," + str(item[1])[0:7] + "/"  # content to store: "topic_id,weight/"
        fo2.write(id_list[num] + "\t" + wstr[0:-1] + "\n")   # num indexes id_list to recover the article id
        num += 1
    fr.close()
    fo2.close()
    print(num)  # number of documents

    if cluster:
        lda_csc_matrix = matutils.corpus2csc(corpus_lda).transpose()  # gensim sparse matrix to scipy sparse matrix
        post_cluster(url, id_list, lda_csc_matrix)


if __name__ == "__main__":
    url = "path"  # directory containing post_data.txt
    time1 = time.time()
    post_cut(url)
    post_tfidf(url)
    lda_cluster = True
    post_lda(url, lda_cluster)

    print(time.time() - time1)
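
Not part of the original script: once post_cluster has written cluster.txt (one "article_id<TAB>cluster_label" line per document), a quick way to inspect the result is to group article ids by cluster, for example:

from collections import defaultdict

url = "path"  # same directory as above
clusters = defaultdict(list)
with open(url + "/cluster.txt", encoding='utf-8') as f:
    for line in f:
        doc_id, label = line.strip().split("\t")
        clusters[label].append(doc_id)

# print clusters from largest to smallest, with a few sample article ids each
for label, ids in sorted(clusters.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(label, len(ids), ids[:5])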

 
