基于word2vec和k-means的词聚类

import re
from sklearn.cluster import KMeans
from sklearn.externals import joblib
import numpy
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn import metrics
import numpy as np
from collections import Counter
from operator import itemgetter

def map_label(true_labels, pred_labels):
    """Map each predicted cluster id to the most frequent true label inside it.

    Used so that accuracy can be computed between cluster assignments and
    ground-truth labels, since k-means cluster ids are arbitrary.

    Parameters:
        true_labels: sequence of ground-truth labels.
        pred_labels: sequence of predicted cluster ids (same length).
    Returns:
        list of predicted labels translated into true-label space.
    """
    label_pair = list(zip(pred_labels, true_labels))
    count = tuple(Counter(label_pair).items())
    mapping = dict()

    # Iterate over the cluster ids that actually occur in the predictions.
    # The original looped over range(len(np.unique(true_labels))), which
    # (a) crashed with TypeError on an empty cluster because of
    # max(..., default=0)[0], and (b) raised KeyError below whenever a
    # predicted id was >= the number of unique true labels.
    for label in np.unique(pred_labels):
        tuples = [tup for tup in count if tup[0][0] == label]
        # tuples is never empty here: `label` comes from pred_labels itself.
        likely_tuple = max(tuples, key=itemgetter(1))[0]
        mapping[likely_tuple[0]] = likely_tuple[1]

    pred_labels_mapped = [mapping[x] for x in pred_labels]
    return pred_labels_mapped

def cluster_quality(true_labels, pred_labels, show=True):
    """Compute clustering quality metrics against ground-truth labels.

    Computes homogeneity, completeness, V-measure, NMI, adjusted Rand score
    and (via map_label) accuracy.  When `show` is True, the metrics are also
    written to 'data/word2vec_result.txt'.

    Parameters:
        true_labels: sequence of ground-truth labels.
        pred_labels: sequence of predicted cluster ids.
        show: if True, write a plain-text report to data/word2vec_result.txt.
    Returns:
        dict with keys homogeneity, completeness, vmeasure, nmi, rand, accuracy.
    """
    h, c, v = metrics.homogeneity_completeness_v_measure(true_labels, pred_labels)
    nmi = metrics.normalized_mutual_info_score(true_labels, pred_labels)
    rand = metrics.adjusted_rand_score(true_labels, pred_labels)
    pred_labels_mapped = map_label(true_labels, pred_labels)
    acc = metrics.accuracy_score(true_labels, pred_labels_mapped)
    if show:
        # Open the report file only when we actually write it (the original
        # truncated it even with show=False) and close it via `with`
        # (the original leaked the handle).
        with open('data/word2vec_result.txt', 'w', encoding='utf-8') as r:
            r.write("Homogeneity: %0.3f" % h)
            r.write('\n')
            r.write("Completeness: %0.3f" % c)
            r.write('\n')
            r.write("V-measure: %0.3f" % v)
            r.write('\n')
            r.write("NMI: %0.3f" % nmi)
            r.write('\n')
            r.write("Rand score: %0.3f" % rand)
            r.write('\n')
            r.write("Accuracy: %0.3f" % acc)
    return dict(
        homogeneity=h,
        completeness=c,
        vmeasure=v,
        nmi=nmi,
        rand=rand,
        accuracy=acc,
    )

def wordsCluster(text, vectorSize, classCount):
    """Train word2vec on a text file and cluster the word vectors with k-means,
    then report clustering quality against data/short_label.txt.

    Parameters:
        text: local path of the input text file (one token per line).
        vectorSize: word-vector dimensionality.
        classCount: k, the number of clusters.
    """

    # Collect the unique words, preserving first-seen order.
    name = []
    with open(text, 'r', encoding='utf-8') as data:
        for line in data:
            line = line.replace('\n', '')
            if line not in name:
                name.append(line)

    # Ground-truth labels, used to compute accuracy.
    true_labels = []
    with open('data/short_label.txt', 'r', encoding='utf-8') as labels:
        for label in labels:
            true_labels.append(label.replace('\n', ''))

    # Vectorize with word2vec (gensim 3.x API: `size` kwarg).
    model = Word2Vec(LineSentence(text), size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)
    # All keywords known to the model.
    keys = model.wv.vocab.keys()

    # Word vector for each vocabulary entry.
    wordvector = [model[key] for key in keys]

    # Cluster.  BUG FIX: the original read clf.labels_ without ever fitting
    # the model, which raises AttributeError; fit_predict both fits and
    # returns the assignments.
    clf = KMeans(n_clusters=classCount)
    pred = clf.fit_predict(wordvector)
    cluster_quality(true_labels, pred)


# Run only when executed as a script, not on import.
if __name__ == '__main__':
    wordsCluster('data/short_text.txt', 300, 21)

若不计算准确率,只输出聚类结果,如下所示

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.cluster import KMeans

def wordsCluster(text, vectorSize, classCount):
    """Train word2vec on a text file, cluster the word vectors with k-means,
    and print the member words of each cluster (no accuracy computation).

    Parameters:
        text: local path of the input text file (one token per line).
        vectorSize: word-vector dimensionality.
        classCount: k, the number of clusters.
    """

    # Vectorize with word2vec (gensim 3.x API: `size` kwarg).
    model = Word2Vec(LineSentence(text), size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)

    # BUG FIX: cluster assignment j corresponds to vocab-key order, not to
    # the order words appear in the file, so members must be looked up in
    # this list rather than a separately-built word list.  (This also drops
    # the original's dead read of data/short_label.txt.)
    words = list(model.wv.vocab.keys())

    # Word vector for each vocabulary entry.
    wordvector = [model[word] for word in words]

    # Cluster and print the words of each cluster.
    clf = KMeans(n_clusters=classCount)
    assignments = clf.fit_predict(wordvector)
    # BUG FIX: iterate over classCount clusters, not a hard-coded 21.
    for i in range(classCount):
        label_i = [words[j] for j, c in enumerate(assignments) if c == i]
        print('label_' + str(i) + ':' + str(label_i))

# Run only when executed as a script, not on import.
if __name__ == '__main__':
    wordsCluster('data/short_text.txt', 300, 21)

 

你可能感兴趣的:(基于word2vec和k-means的词聚类)