Computing the cosine similarity of two documents (tf-idf)

# -*- coding:utf-8 -*-

"""
@author: Linlifang
"""

import os
import jieba
import sys
import re
import string
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Python 2 only: force UTF-8 as the default string encoding so Chinese text round-trips through file I/O
reload(sys)
sys.setdefaultencoding('utf-8')

'''
First, read the documents from a folder and segment them with jieba, saving the segmentation results to files.
Then use sklearn to compute the tf-idf weights of every document and save them to files as well.
Finally, pick any two of those txt files and compute their cosine similarity.
'''
def getFileList(path):
    # List all files under `path`, skipping hidden files (names starting with '.')
    filelist = []
    for f in os.listdir(path):
        if not f.startswith('.'):
            filelist.append(f)
    return filelist, path


def segment(filename, path, segPath):
    # Read the raw document
    f = open(path + "/" + filename, 'r')
    file_list = f.read()
    f.close()
    # Create the folder for the segmented documents if it does not exist yet
    if not os.path.exists(segPath):
        os.mkdir(segPath)
    # Segment the document with jieba (precise mode)
    seg_list = jieba.cut(file_list, cut_all=False)
    result = []
    for seg in seg_list:
        # Collapse whitespace and newlines inside the token
        seg = ''.join(seg.split())
        # Drop tokens that contain ASCII letters, digits or underscores
        r = re.search(r'\w+', seg)
        if seg != '' and seg not in (u'=', u'[', u']', u'(', u')') and not r:
            result.append(seg)
    finalresult = []
    # Load the stop-word list into a set, one entry per whitespace-separated token
    stopword = set(open('stopworda.txt').read().split())
    for word in result:
        # Remove stop words
        if word in stopword:
            continue
        # Keep only tokens made up entirely of Chinese characters
        if all(u'\u4e00' <= ch <= u'\u9fa5' for ch in word):
            finalresult.append(word)

    # Save the segmented result to disk, tokens separated by spaces
    f = open(segPath + "/" + filename + "-seg.txt", "w+")
    f.write(' '.join(finalresult))
    f.close()
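
# For reference, what jieba.cut returns on a small input (the classic example from jieba's docs;
# not part of this program's data):
#   >>> list(jieba.cut(u'我来到北京清华大学', cut_all=False))
#   [u'我', u'来到', u'北京', u'清华大学']
# cut_all=False is jieba's precise mode; cut_all=True would instead return every possible word split.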


# Read the segmented documents and compute their tf-idf weights
def Tfidf(filelist, sFilePath, path):
    corpus = []
    # Read every segmented document back as one whitespace-separated string
    for ff in filelist:
        fname = path + "/" + ff
        f = open(fname + "-seg.txt", 'r')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # the full vocabulary (every term in the corpus)
    weight = tfidf.toarray()
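    # A sketch of what these weights are, assuming sklearn's defaults (smooth_idf=True, norm='l2'):
    #   idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
    #   tf-idf(t, d) = tf(t, d) * idf(t), after which every document row is L2-normalized
    # so weight[i][j] is the normalized tf-idf weight of term j (word[j]) in document i.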

    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)

    for i in range(len(weight)):
        print u'writing the tf-idf weights of document', i, u'into', sFilePath + '/' + string.zfill(i, 2) + ".txt"
        f = open(sFilePath + "/" + string.zfill(i, 2) + ".txt", 'w+')
        # One line per term: "<term>  <tf-idf weight>"
        for j in range(len(word)):
            f.write(word[j] + "  " + str(weight[i][j]) + " " + "\n")
        f.close()

def coutcos(file1, file2):
    # Cosine similarity between two documents, read from their saved tf-idf files:
    #   cos(A, B) = (A . B) / (||A|| * ||B||)
    # Every line of the input files has the form "<term>  <weight>", as written by Tfidf()
    cipin1 = open(file1).readlines()
    cipin2 = open(file2).readlines()
    list1 = []
    list2 = []
    for x in cipin1:
        # Split on whitespace; field 1 is the tf-idf weight
        y = x.split()
        list1.append(y[1])
    for x in cipin2:
        y = x.split()
        list2.append(y[1])
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(list1, list2):
        a = float(a)
        b = float(b)
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    else:
        return dot_product / ((normA * normB) ** 0.5)

if __name__ == "__main__":
    # Folder where the per-document tf-idf results are saved
    sFilePath = "C:/Users/llfang1/PycharmProjects/untitled2/corpus/tfidffile"
    # Folder where the segmented documents are saved
    segPath = 'C:/Users/llfang1/PycharmProjects/untitled2/corpus/segfile'
    (allfile, path) = getFileList('C:/Users/llfang1/PycharmProjects/untitled2/corpus/allkeyword')
    for ff in allfile:
        print "Using jieba on " + ff
        segment(ff, path, segPath)
    Tfidf(allfile, sFilePath, segPath)
    # Pick any two of the saved tf-idf files and compute their cosine similarity
    file1 = sFilePath + "/" + "04.txt"
    file2 = sFilePath + "/" + "05.txt"
    similar = coutcos(file1, file2)
    print similar
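
As a sanity check, the same value can be recomputed directly from the two saved tf-idf files with numpy. This is a minimal sketch and not part of the original program: it assumes numpy is installed, that the files follow the "<term>  <weight>" line format written by Tfidf(), and that file1 and file2 are the two paths used above.

# Sanity-check sketch: recompute the cosine similarity of two saved tf-idf files with numpy
import numpy as np

def cosine_from_files(file1, file2):
    # Field 1 of every line is the tf-idf weight written by Tfidf()
    v1 = np.array([float(line.split()[1]) for line in open(file1)])
    v2 = np.array([float(line.split()[1]) for line in open(file2)])
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / denom if denom else None

print cosine_from_files(file1, file2)  # should agree with coutcos(file1, file2) above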
Note: this program was adapted from a fellow developer's code, with modifications and some additions.
 
 
