# 文本相似度对比 (text similarity comparison)

import numpy as np
from scipy.linalg import norm
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def jaccard_similarity(s1, s2):
    """Return the count-weighted Jaccard similarity of two strings.

    Both strings are tokenised by splitting on whitespace and converted to
    term-count vectors over their joint vocabulary. The score is
    ``sum(min(counts)) / sum(max(counts))``, so repeated tokens contribute
    according to how often they occur.

    Args:
        s1: First text, tokens separated by whitespace.
        s2: Second text, tokens separated by whitespace.

    Returns:
        A float in ``[0, 1]``. Two inputs that both contain no token score
        ``1.0`` (nothing distinguishes them); if exactly one input contains
        no token the score is ``0.0``.
    """
    has_tokens_1 = bool(s1.split())
    has_tokens_2 = bool(s2.split())
    if not (has_tokens_1 and has_tokens_2):
        # Two token-less inputs would make CountVectorizer raise
        # "empty vocabulary"; handle the degenerate cases up front instead.
        return 1.0 if has_tokens_1 == has_tokens_2 else 0.0

    # token_pattern=None tells scikit-learn the custom tokenizer is
    # intentional and silences its "token_pattern will not be used" warning.
    vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)
    counts = vectorizer.fit_transform([s1, s2]).toarray()
    overlap = np.sum(np.min(counts, axis=0))
    union = np.sum(np.max(counts, axis=0))
    return float(overlap / union)


def cosine_similarity_tf(s1, s2):
    """Return the cosine similarity of two strings' term-count vectors.

    Both strings are tokenised by splitting on whitespace and converted to
    raw term-frequency vectors over their joint vocabulary.

    Args:
        s1: First text, tokens separated by whitespace.
        s2: Second text, tokens separated by whitespace.

    Returns:
        The cosine of the angle between the two count vectors as a float,
        or ``0.0`` when either input contains no token.
    """
    if not s1.split() or not s2.split():
        # A token-less input yields a zero vector (0/0 -> nan), and two of
        # them make CountVectorizer raise "empty vocabulary".
        return 0.0

    # token_pattern=None tells scikit-learn the custom tokenizer is
    # intentional and silences its "token_pattern will not be used" warning.
    vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None)
    vec_1, vec_2 = vectorizer.fit_transform([s1, s2]).toarray()
    return float(np.dot(vec_1, vec_2) / (norm(vec_1) * norm(vec_2)))


def cosine_similarity_tfidf(s1, s2):
    """Return the cosine similarity of two strings' TF-IDF vectors.

    Both strings are tokenised by splitting on whitespace. The TF-IDF
    weights are fitted on just these two strings, so the inverse document
    frequencies reflect only this pair.

    Args:
        s1: First text, tokens separated by whitespace.
        s2: Second text, tokens separated by whitespace.

    Returns:
        The cosine of the angle between the two TF-IDF vectors as a float,
        or ``0.0`` when either input contains no token.
    """
    if not s1.split() or not s2.split():
        # A token-less input yields a zero vector (0/0 -> nan), and two of
        # them make TfidfVectorizer raise "empty vocabulary".
        return 0.0

    # token_pattern=None tells scikit-learn the custom tokenizer is
    # intentional and silences its "token_pattern will not be used" warning.
    vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
    vec_1, vec_2 = vectorizer.fit_transform([s1, s2]).toarray()
    # The explicit normalisation is kept so the result is a true cosine
    # regardless of how the vectorizer scales its rows.
    return float(np.dot(vec_1, vec_2) / (norm(vec_1) * norm(vec_2)))


# Demo: score one pair of space-separated strings with each similarity
# measure, printing one result per line.
params = [
    '新 疆 生 产 建 设 兵 团 阿 克 苏 垦 区 人 民 检 察 院',
    '新 疆 产 建 设 兵 团 阿 克 苏 垦 区 ',
]
for _metric in (
    jaccard_similarity,
    cosine_similarity_tf,
    cosine_similarity_tfidf,
):
    print(_metric(*params))

 

# 你可能感兴趣的:(文本相似度对比)