Python自然语言处理——计算文本相似度


from gensim import corpora,models,similarities
import jieba
from collections import defaultdict
import urllib.request

# Compute TF-IDF cosine similarity between a query document (doc 3) and two
# reference documents (doc 1, doc 2), all tokenized with jieba.
# NOTE(review): paths, file:// URLs and the GBK encoding are machine-specific
# (local Windows setup) -- confirm before running elsewhere.
jieba.load_userdict("C:/Users/yyq/Desktop/毕业论文/词典.txt")


def _fetch_tokens(url):
    """Fetch a GBK-encoded document from *url* and return its jieba tokens.

    Whitespace-only tokens are dropped, which matches the behavior of the
    original join-on-spaces / ``split()`` round trip.
    """
    raw = urllib.request.urlopen(url).read().decode("gbk", "ignore")
    return [tok for tok in jieba.cut(raw) if tok.strip()]


# The two reference documents the query will be compared against.
texts = [
    _fetch_tokens("file:///C:/php/WWW/%E6%96%87%E6%A1%A31.html"),
    _fetch_tokens("file:///C:/php/WWW/%E6%96%87%E6%A1%A3%202.html"),
]

# Map each distinct token to an integer id shared by all documents.
dictionary = corpora.Dictionary(texts)

# Query document, converted to a bag-of-words vector over the same dictionary.
new_doc_tokens = _fetch_tokens("file:///C:/php/WWW/%E6%96%87%E6%A1%A33.html")
new_vec = dictionary.doc2bow(new_doc_tokens)

# Bag-of-words corpus for the reference documents; serialized for reuse.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize("C:/Users/yyq/Desktop/毕业论文/corpus.txt", corpus)

# TF-IDF weighting, then cosine similarity of the query against each reference.
tfidf = models.TfidfModel(corpus)
feature_num = len(dictionary.token2id)
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_num)
sim = index[tfidf[new_vec]]
print(sim)

(此处原为程序运行结果的截图,展示 print(sim) 的输出)
结论:第三个文本与第一个、第二个文本的相似度分别为 0.007 和 0.03。

你可能感兴趣的:(python自然语言处理,python机器学习)