Getting Started with NLP (1): Bag-of-Words Model and Sentence Similarity

Reference:
NLP入门(一)词袋模型及句子相似度 (Getting Started with NLP (1): Bag-of-Words Model and Sentence Similarity)

Build the bag of words and obtain the vector representation of each sentence; a hand-written similarity computation is sketched after the code.

# coding:utf-8
from nltk import word_tokenize

# NLTK may raise a LookupError here if the tokenizer data is missing.
# If so, run the following once in a Python console:
# import nltk
# nltk.download('punkt')

sent1 = "I love sky, I love sea."  # tokenization is case-sensitive
sent2 = "I like running, I love reading."

# Tokenize every sentence and collect the vocabulary (the bag of words)
sentences = [sent1, sent2]
all_text = []
for sentence in sentences:
    all_text.extend(word_tokenize(sentence))
corpus = set(all_text)

# Map each word in the vocabulary to an integer index
corpus_dict = {word: i for i, word in enumerate(corpus)}
print(corpus_dict)
# Note: corpus is a set, so iteration order (and hence the indices) can vary
# between runs; one possible mapping:
# {'love': 0, 'sea': 1, 'reading': 2, ',': 3, 'I': 4, 'running': 5, 'like': 6, '.': 7, 'sky': 8}

# Build the vector representation of each sentence:
# for every vocabulary word, a tuple of (its index in corpus_dict,
# the number of times it occurs in the sentence)
sent1_list = word_tokenize(sent1)
sent2_list = word_tokenize(sent2)
sent1_vec = []
sent2_vec = []
for key in corpus_dict.keys():
    sent1_vec.append((corpus_dict[key], sent1_list.count(key)))
    sent2_vec.append((corpus_dict[key], sent2_list.count(key)))

print(sent1_list)
#['I', 'love', 'sky', ',', 'I', 'love', 'sea', '.']
print(sent1_vec)
#[(0, 2), (1, 1), (2, 0), (3, 1), (4, 2), (5, 0), (6, 0), (7, 1), (8, 1)]
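
With both sentences encoded as (index, count) pairs over the same corpus_dict, the hand-written similarity step is just the cosine of the two count vectors: their dot product divided by the product of their norms. A minimal sketch follows, continuing from sent1_vec and sent2_vec above; the helper name cosine_similarity is my own choice, not part of any library.

import math

def cosine_similarity(vec1, vec2):
    # vec1 and vec2 are lists of (word index, count) pairs built over the
    # same corpus_dict, so their entries line up position by position.
    counts1 = [count for _, count in vec1]
    counts2 = [count for _, count in vec2]
    dot = sum(c1 * c2 for c1, c2 in zip(counts1, counts2))
    norm1 = math.sqrt(sum(c * c for c in counts1))
    norm2 = math.sqrt(sum(c * c for c in counts2))
    if norm1 == 0 or norm2 == 0:
        return 0.0  # an all-zero vector has no direction
    return dot / (norm1 * norm2)

print("Hand-written cosine similarity: %.4f" % cosine_similarity(sent1_vec, sent2_vec))
# 0.7303 for the two sentences above (dot product 8, norms sqrt(12) and sqrt(10))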


Alternatively, you can call a library that does all of this, including the similarity computation:

sent1 = "I love sky, I love sea."
sent2 = "I like running, I love reading."

from nltk import word_tokenize
sents = [sent1, sent2]
texts = [word_tokenize(sent) for sent in sents]
print(texts)

from gensim import corpora
from gensim.similarities import Similarity

# Build the dictionary (token -> integer id)
dictionary = corpora.Dictionary(texts)

# Convert each tokenized sentence into a bag-of-words vector with doc2bow
corpus = [dictionary.doc2bow(text) for text in texts]
similarity = Similarity('-Similarity-index', corpus, num_features=len(dictionary))
print(similarity)
# Compute the similarity of a query sentence against the indexed corpus
new_sentence = sent1
test_corpus_1 = dictionary.doc2bow(word_tokenize(new_sentence))

cosine_sim = similarity[test_corpus_1][1]  # score against the second document, sent2
print("Cosine similarity of the two sentences computed with gensim: %.4f" % cosine_sim)
