句子相似度计算的几种方法

杰卡德(Jaccard)相似度、余弦相似度、带 TF-IDF 的余弦相似度

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/07/22 上午12:23
import numpy as np
from scipy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def jaccard_similarity(s1, s2):
    """
    Compute the Jaccard similarity of two whitespace-tokenised sentences.

    Both sentences are converted to token-count vectors over a shared
    vocabulary. The score is the sum of the element-wise minima divided by
    the sum of the element-wise maxima, so repeated tokens are counted.

    :param s1: first sentence, tokens separated by whitespace
    :param s2: second sentence, tokens separated by whitespace
    :return: the similarity score; 0.0 when either sentence has no tokens
    """
    # Two token-less sentences would make CountVectorizer fail with an
    # "empty vocabulary" ValueError. A single token-less sentence already
    # scores 0 (every element-wise minimum is 0), so one guard covers both.
    if not s1.split() or not s2.split():
        return 0.0
    # token_pattern=None: the pattern is never used once a tokenizer is
    # supplied, and keeping the default makes recent scikit-learn versions
    # emit a warning about the unused parameter.
    vectorizer = CountVectorizer(
        tokenizer=lambda s: s.split(),
        token_pattern=None,
    )
    vectors = vectorizer.fit_transform([s1, s2]).toarray()
    numerator = np.sum(np.min(vectors, axis=0))
    denominator = np.sum(np.max(vectors, axis=0))
    # Multiplying by 1.0 forces true division on Python 2 as well.
    return 1.0 * numerator / denominator


def cosine_similarity_tf(s1, s2):
    """
    Compute the cosine similarity of two sentences using raw term counts.

    Both sentences are converted to token-count vectors over a shared
    vocabulary; the score is their dot product divided by the product of
    their Euclidean norms.

    :param s1: first sentence, tokens separated by whitespace
    :param s2: second sentence, tokens separated by whitespace
    :return: the similarity score; 0.0 when either sentence has no tokens
    """
    # A token-less sentence produces an all-zero count vector whose norm is
    # 0, so the division below would evaluate 0/0 and return nan. Two
    # token-less sentences would make CountVectorizer raise an
    # "empty vocabulary" ValueError instead.
    if not s1.split() or not s2.split():
        return 0.0
    # token_pattern=None: the pattern is never used once a tokenizer is
    # supplied, and keeping the default makes recent scikit-learn versions
    # emit a warning about the unused parameter.
    vectorizer = CountVectorizer(
        tokenizer=lambda s: s.split(),
        token_pattern=None,
    )
    first, second = vectorizer.fit_transform([s1, s2]).toarray()
    return np.dot(first, second) / (norm(first) * norm(second))


def cosine_similarity_tfidf(s1, s2):
    """
    Compute the cosine similarity of two sentences using TF-IDF weights.

    Both sentences are converted to TF-IDF vectors fitted on just these two
    sentences; the score is their dot product divided by the product of
    their Euclidean norms.

    :param s1: first sentence, tokens separated by whitespace
    :param s2: second sentence, tokens separated by whitespace
    :return: the similarity score; 0.0 when either sentence has no tokens
    """
    # A token-less sentence produces an all-zero vector whose norm is 0, so
    # the division below would evaluate 0/0 and return nan. Two token-less
    # sentences would make TfidfVectorizer raise an "empty vocabulary"
    # ValueError instead.
    if not s1.split() or not s2.split():
        return 0.0
    # token_pattern=None: the pattern is never used once a tokenizer is
    # supplied, and keeping the default makes recent scikit-learn versions
    # emit a warning about the unused parameter.
    vectorizer = TfidfVectorizer(
        tokenizer=lambda s: s.split(),
        token_pattern=None,
    )
    first, second = vectorizer.fit_transform([s1, s2]).toarray()
    # NOTE(review): TfidfVectorizer's default norm='l2' should already yield
    # unit-length rows, which would make this division a no-op safeguard --
    # confirm against the installed scikit-learn version.
    return np.dot(first, second) / (norm(first) * norm(second))


# Demo: score one sample sentence pair with each of the three measures.
params = ['a b b c c c', 'b c c d d d']
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(jaccard_similarity(*params))
print(cosine_similarity_tf(*params))
print(cosine_similarity_tfidf(*params))


你可能感兴趣的:(python)