# bert2vec + kmeans

from bert_serving.client import BertClient
from sklearn.cluster import KMeans

# ivy_nie
# Module-level client connected to a running bert-serving server;
# used below to encode each text line into a BERT embedding.
bc = BertClient()
def wordsCluster(text, vectorSize, classCount):
    """Cluster text lines into `classCount` groups with KMeans over BERT vectors.

    Each line of 'doc.txt' is encoded to a vector through the module-level
    BertClient `bc`, the vectors are clustered with KMeans, and every
    cluster's members are printed and appended to 'result.txt'.

    Args:
        text: local path of the input text file (one entry per line);
            used to build the list of unique entry names.
        vectorSize: requested word-vector size. Currently unused — the
            BERT service fixes the embedding dimension; kept for
            backward compatibility with existing callers.
        classCount: k, the number of clusters for KMeans.
    """
    # Collect the unique lines of the input file, preserving first-seen order.
    name = []
    with open(text, 'r', encoding='utf-8') as data:
        for line in data:
            line = line.replace('\n', '')
            if line not in name:
                name.append(line)

    # Encode each line into its BERT embedding.
    # NOTE(review): the original reads the hard-coded 'doc.txt' here, not
    # `text` — presumably the two files hold the same lines; confirm with
    # the caller. If they differ, `name[j]` below can go out of range.
    wordvector = []
    with open('doc.txt', 'r', encoding='utf-8') as ff:
        for line in ff:
            key = bc.encode([line])[0]
            wordvector.append(key)

    # Cluster the vectors; s[j] is the cluster id assigned to line j.
    clf = KMeans(n_clusters=classCount)
    s = clf.fit_predict(wordvector)

    # Write each cluster's member names to result.txt.
    with open('result.txt', 'w', encoding='utf-8') as fff:
        # Iterate over the actual number of clusters (the original
        # hard-coded 500 here regardless of classCount).
        for i in range(classCount):
            label_i = [name[j] for j in range(len(s)) if s[j] == i]
            print('label_' + str(i) + ':' + str(label_i))
            fff.write('label_' + str(i) + ':' + str(label_i) + '\n')

# Script entry point: cluster ./wan.txt into 500 groups.
# (Guarded so importing this module does not trigger the clustering run.)
if __name__ == '__main__':
    wordsCluster('./wan.txt', 300, 500)

# You may also be interested in: (algorithms)