A detailed explanation of a Python text-similarity algorithm

1. Locate the documents to be compared
2. Read the contents of each document
3. Tokenize the documents into the format needed for the later steps
4. Count the frequency of each word
5. Filter out low-frequency words
6. Build a dictionary from the corpus
7. Load the query document to compare against
8. Convert the query document into a sparse bag-of-words vector with doc2bow (a toy sketch of this step follows the list)
9. Convert the reference texts the same way to obtain the corpus
10. Run the corpus through TfidfModel to obtain TF-IDF weights
11. Get the feature count from token2id
12. Build a sparse-matrix similarity index
13. Query the index for the final similarity scores
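
Before the full script, here is a minimal sketch of steps 6, 8, and 9 using made-up English tokens (toy_texts and its contents are purely illustrative):

from gensim import corpora

toy_texts = [['cat', 'sat', 'mat'], ['dog', 'sat', 'log']]
toy_dict = corpora.Dictionary(toy_texts)        # step 6: map each token to an integer id
print(toy_dict.token2id)                        # e.g. {'cat': 0, 'mat': 1, 'sat': 2, ...}
print(toy_dict.doc2bow(['cat', 'cat', 'dog']))  # steps 8-9: sparse (token_id, count) pairs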


from gensim import corpora, models, similarities
import jieba
from collections import defaultdict
dog = "C:/Users/xt/PycharmProjects/similar/dog.txt"  # 1.读取文档
cat = "C:/Users/xt/PycharmProjects/similar/cat.txt"
d1 = open(dog).read()  # 2.对要计算的多篇文档进行访问
c1 = open(cat).read()
data1 = jieba.cut(d1)  # 3. Tokenize the documents into the format needed later
data2 = jieba.cut(c1)
data11 = ' '.join(data1)  # jieba.cut returns a generator; join the tokens with spaces
data21 = ' '.join(data2)
doc = [data11, data21]
print('doc:')
print(doc)
texts = [do.split() for do in doc]  # split each document back into a list of tokens
print('texts:')
print(texts)
freq = defaultdict(int)  # 4. Count the frequency of each word across both documents
for text in texts:
    for token in text:
        freq[token] += 1
print('freq:')
print(freq)
texts = [[token for token in text if freq[token] > 1] for text in texts]  # 5. Filter out low-frequency words
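# With freq counted over both documents, this keeps words that either repeat
# within one document or appear in both; everything seen only once is dropped.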
print('texts:')
print(texts)
diction = corpora.Dictionary(texts)  # 6. Build a dictionary from the corpus
print('diction:')
print(diction)
diction.save('C:/Users/xt/PycharmProjects/similar/dict.dict')
doc3 = "C:/Users/xt/PycharmProjects/similar/animal.txt"  # 7.加载要对比的文档
d3 = open(doc3).read()
data3 = jieba.cut(d3)
data31 = ""
for item in data3:
    data31 += item+' '
new_doc = data31
print('new_doc:')
print(new_doc)
new_vec = diction.doc2bow(new_doc.split())  # 8. Convert the query document into a sparse bag-of-words vector
print('new_vec:')
print(new_vec)
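# new_vec is a list of (token_id, count) pairs; tokens missing from the
# dictionary (e.g. words that occur only in animal.txt) are silently dropped.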
corpus = [diction.doc2bow(text) for text in texts]  # 9. Convert the reference texts the same way to obtain the corpus
print('corpus:')
print(corpus)
tf_idf = models.TfidfModel(corpus)  # 10. Train a TF-IDF model on the corpus
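# By default, gensim's TfidfModel weights each term with idf = log2(num_docs / doc_freq)
# and L2-normalizes the resulting document vectors.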
featureNum = len(diction.token2id)  # 11. The feature count is the number of entries in token2id
index = similarities.SparseMatrixSimilarity(tf_idf[corpus], num_features=featureNum)  # 12. Build a sparse-matrix similarity index
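# SparseMatrixSimilarity stores the TF-IDF corpus as a sparse matrix and
# computes the cosine similarity between a query vector and every document.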
print('index:')
print(index)
sim = index[tf_idf[new_vec]]  # 13. Query the index for the final similarity scores
print('sim:')
print(sim)
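
sim is a NumPy array with one cosine-similarity score per corpus document, in corpus order (dog.txt first, cat.txt second); scores close to 1.0 mean the query is very similar to that document. If a ranked listing is more convenient, a small post-processing step like the following will do (the output format here is just an illustration):

ranked = sorted(enumerate(sim), key=lambda pair: pair[1], reverse=True)
for doc_id, score in ranked:
    print('document %d: similarity = %.4f' % (doc_id, score))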
