# (三)文本挖掘——Word2vec  (Text mining part 3: Word2vec)

# @Time : 2021/3/27 14:45
# @Author : chao

#代码参考自:https://blog.csdn.net/weixin_45314989/article/details/104390725?utm_medium=distribute.pc_relevant.none-task-blog-baidujs_title-0&spm=1001.2101.3001.4242

#采用word2vec对分词后的文件进行训练,将每个词语映射到词向量空间
import logging
import multiprocessing
import os
import sys
from collections import Counter

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
#word2vec获取词向量
from pandas import np
from sklearn.metrics import silhouette_score

#word2vec
def wordsCluster():
    """Train a skip-gram Word2Vec model on the segmented corpus and save
    both the binary model and the plain-text word vectors."""
    program = os.path.basename(sys.argv[0])  # name of the running script, used as the logger name
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # inp: input corpus (one segmented, stopword-filtered document per line)
    # out_model: gensim model file; out_vector: text-format word vectors
    inp = r"C:\Users\代码\去除停用词并分词\去除停用词并分词结果\zong_fengci_tingyongci2.txt"
    out_model = r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.model'
    # BUG FIX: this path previously read "C:\User\..." (missing the "s"), so
    # the vector file was written where changeTxtToCsv() could not find it.
    out_vector = r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.txt'

    # Train the skip-gram model (sg=1).
    # NOTE(review): these are gensim < 4.0 parameter names; in gensim >= 4.0
    # `size` became `vector_size` and `iter` became `epochs`.
    model = Word2Vec(LineSentence(inp), size=100, window=3, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=10, sg=1)
    # save the full model (can be loaded and re-trained later)
    model.save(out_model)
    # save word vectors in the plain-text word2vec format
    model.wv.save_word2vec_format(out_vector, binary=False)
    print("word2vec成功!!")

#convert the word-vector txt file to csv
def changeTxtToCsv():
    """Convert the text-format word-vector file to CSV.

    The first line of the word2vec text format is a "<vocab> <dim>" header;
    it is blanked out in place and then skipped when writing the CSV rows.
    """
    out_vector = r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.txt'
    # read all lines, then blank the header line
    with open(out_vector, "r", encoding='utf-8') as f:
        lines = f.readlines()
    lines[0] = '\n'

    # rewrite the file with the header blanked
    with open(out_vector, "w", encoding='utf-8') as f:
        f.writelines(lines)

    import csv
    with open(r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.csv',
              'w', newline='') as csvfile:  ## this csv stores the word vectors
        writer = csv.writer(csvfile)
        # BUG FIX: the vector file was previously opened without a context
        # manager and never closed (file-handle leak).
        with open(out_vector, encoding='utf-8') as data:
            next(data)  # skip the (now blank) header line
            for each_line in data:
                # whitespace-split: first token is the word, rest the vector
                writer.writerow(each_line.split())

    print("转换为csv文件成功!")

#reduce the 100-dim vectors to 2-D with PCA
def jiangwei():
    """Reduce the word vectors to 2-D with PCA and return
    [reduced vectors of the high-frequency nouns, the noun list]."""
    import numpy as np
    from sklearn.decomposition import PCA

    vectors = []
    words = []
    with open(r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.csv',
              'r') as fd:
        # BUG FIX: the original `if line == "": continue` re-tested the same
        # line without reading a new one, looping forever if ever triggered;
        # iterate the file directly and skip blank lines after stripping.
        for line in fd:
            line = line.strip()
            if not line:
                continue
            fields = line.split(",")
            words.append(fields[0])     # first column is the word itself
            vectors.append(fields[1:])  # remaining columns are the vector

    X = np.array(vectors)  # input matrix, 100 dims per word (comment said 300; training used size=100)
    pca = PCA(n_components=2)  # reduce to 2 dims
    # fit_transform both fits and transforms; the extra pca.fit(X) call in
    # the original was redundant work.
    newX = pca.fit_transform(X)

    # map each word to its reduced 2-D vector
    # (renamed from `dict`, which shadowed the builtin)
    word_to_vec = {word: newX[i] for i, word in enumerate(words)}

    # load the high-frequency nouns whose reduced vectors we want
    mingci_list = []
    with open(r'C:\Users\代码\词性标注\名词提取\名词提取数据\高频名词(过滤后).txt',
              'r', encoding='ANSI') as mf:
        for i in mf.readlines():
            mingci_list.append(i.strip('\n'))

    # pull the reduced vectors of the nouns out of the full mapping
    mingci_jiangwei_list = [word_to_vec[w] for w in mingci_list]

    print("降维成功!!")
    return [mingci_jiangwei_list, mingci_list]

#cluster the noun vectors with k-means and report the groups
def k_means(mingci_jiangwei_list, mingci_list, num=4):
    """Cluster the reduced noun vectors with k-means and print each cluster.

    :param mingci_jiangwei_list: 2-D vectors, one per noun
    :param mingci_list: nouns, aligned index-for-index with the vectors
    :param num: number of clusters (previously hard-coded to 4)
    """
    from sklearn.cluster import KMeans
    import numpy as np
    from matplotlib import pyplot as plt

    X = np.array(mingci_jiangwei_list)
    kmeans = KMeans(n_clusters=num, random_state=0).fit(X)

    print(str(num) + "个中心词的坐标:")
    print(kmeans.cluster_centers_)

    # One word-list per cluster instead of seven hand-written variables.
    # This removes the mismatch between num=4 and the original's 7 lists
    # (lists 5-7 were dead code) and works for any cluster count.
    clusters = [[] for _ in range(num)]
    for label, word in zip(kmeans.labels_, mingci_list):
        clusters[label].append(word)

    for cluster in clusters:
        # guard: the original indexed list1[0] etc. and would raise
        # IndexError on an empty cluster
        if cluster:
            print("与关键词" + cluster[0] + "相关的词有:", end='')
            print(cluster)

    ## visualize the data with a scatter plot
    f1 = [vec[0] for vec in mingci_jiangwei_list]
    f2 = [vec[1] for vec in mingci_jiangwei_list]
    plt.scatter(f1, f2, c='blue', s=6)
    plt.show()


#elbow method
def sse(mingci_jiangwei_list):
    """Elbow method: plot the mean distortion (distance to nearest cluster
    center) for k = 1..9 so the best cluster count can be read off at the
    "elbow" of the curve."""
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np
    import matplotlib.pyplot as plt

    X = np.array(mingci_jiangwei_list)

    # NOTE(review): x1/x2 are the first two ROWS of the data (two noun
    # vectors), not the coordinate columns — confirm this preview plot is
    # what was intended (X[:, 0] / X[:, 1] would plot all points).
    x1 = mingci_jiangwei_list[0]
    x2 = mingci_jiangwei_list[1]

    plt.plot()
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    plt.title('Dataset')
    plt.scatter(x1, x2)
    plt.show()

    # create new plot for the elbow curve
    plt.plot()

    # k means determine k: mean distance to the nearest center, per k
    distortions = []
    K = range(1, 10)
    for k in K:
        # BUG FIX: the original fitted the model twice per k
        # (KMeans(...).fit(X) followed by another kmeanModel.fit(X)).
        kmeanModel = KMeans(n_clusters=k).fit(X)
        distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])

    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

#silhouette coefficient
def cs(mingci_jiangwei_list):
    """Plot cluster assignments and the silhouette score for several
    candidate cluster counts, then plot score vs. cluster count."""
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    from sklearn import metrics
    import numpy as np
    X = np.array(mingci_jiangwei_list)
    # NOTE(review): x1/x2 are the first two ROWS of the data, so x1[i] below
    # only works while i < len(x1); X[i, 0] / X[i, 1] is likely what was
    # meant — confirm against the data shape.
    x1 = np.array(mingci_jiangwei_list[0])
    x2 = np.array(mingci_jiangwei_list[1])

    plt.xlim([0,10])
    plt.ylim([0,10])
    plt.title('Instances')
    plt.scatter(x1,x2)

    colors = ['b','g','r','c','m','y','k','b']
    markers = ['o','s','D','v','^','p','*','+']

    clusters=[2,3,4,5,8]
    subplot_counter = 1
    sc_scores = []

    for t in clusters:
        subplot_counter += 1
        plt.subplot(3,2,subplot_counter)
        kmeans_model = KMeans(n_clusters=t).fit(X)

        for i,l in enumerate(kmeans_model.labels_):
            plt.plot(x1[i], x2[i], color = colors[l], marker = markers[l], ls = 'None')
            plt.xlim([0,10])
            plt.ylim([0,10])
        # BUG FIX: silhouette_score has no `markers` parameter; the keyword
        # is `metric` — the original raised TypeError on the first iteration.
        sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
        sc_scores.append(sc_score)
        plt.title('K=%s,silhouette coefficient=%0.03f'%(t,sc_score))
        plt.figure()

    plt.plot(clusters, sc_scores,'*-')
    plt.xlabel('Numbers of clusters')
    plt.ylabel('Silhouette Coefficient score')
    plt.show()

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets, metrics


def km_sse_cs(mingci_jiangwei_list):
    """
    Evaluate KMeans clustering quality two ways.

    1. Within-cluster sum of squared errors (SSE, "elbow method"): plot SSE
       against the number of clusters; the point where the decrease rate
       suddenly flattens (the elbow) is taken as the best k. After fitting,
       KMeans exposes the SSE via its built-in `inertia_` attribute.
    2. Silhouette Coefficient: combines cluster cohesion and separation.
       The mean score lies in [-1, 1]; larger is better, and a negative
       value suggests points may have been assigned to the wrong cluster.
    :param mingci_jiangwei_list: 2-D vectors, one per noun
    :return: None (shows a matplotlib figure)
    """
    # BUG FIX: use numpy directly instead of relying on the module-level
    # `from pandas import np` alias, which was deprecated in pandas 1.0 and
    # removed in pandas 2.0.
    import numpy as np

    data = np.array(mingci_jiangwei_list)

    # SSE for each candidate cluster count
    sse_list = []
    # silhouette score for each candidate cluster count
    silhouettes = []
    # try cluster counts 2..14
    for i in range(2, 15):
        model = KMeans(n_clusters=i)
        model.fit(data)
        # inertia_ holds the within-cluster SSE
        sse_list.append(model.inertia_)
        silhouette = metrics.silhouette_score(data, model.labels_, metric='euclidean')
        silhouettes.append(silhouette)

    # plot the SSE (elbow) curve
    plt.subplot(211)
    plt.title('KMeans 簇内误方差')
    plt.plot(range(2, 15), sse_list, marker='*')
    plt.xlabel('簇数量')
    plt.ylabel('簇内误方差(SSE)')
    # plot the silhouette-score curve
    plt.subplot(212)
    plt.title('KMeans 轮廓系数')
    plt.plot(range(2, 15), silhouettes, marker='o')
    plt.xlabel('簇数量')
    plt.ylabel('轮廓系数')

    plt.tight_layout()
    plt.show()



if __name__ == '__main__':
    # NOTE(review): mpl and pd appear unused below — presumably leftovers.
    import matplotlib as mpl
    import numpy as np
    import pandas as pd
    # Pipeline: train word2vec, convert the vectors to CSV, PCA-reduce.
    wordsCluster()
    changeTxtToCsv()
    result = jiangwei()
    # result[0]: 2-D vectors of the high-frequency nouns
    # result[1]: the nouns themselves, aligned with result[0]
    mingci_jiangwei_list = result[0]
    mingci_list = result[1]
    # NOTE(review): k_means / sse / cs / km_sse_cs are defined but never
    # called here — presumably run manually; confirm that is intended.

# 你可能感兴趣的:(Python)  (blog-scrape footer, kept as a comment)