Python: Chinese word segmentation and similarity computation

# -*- coding: utf-8 -*-
# * @author haoming
# * @date 2016/11/08

import os
os.chdir(u"G:\project")
import MySQLdb
import pandas as pd
import re
import codecs 
import jieba
import jieba.analyse
import logging
from gensim import corpora, models, similarities
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer

## Extract keywords from trade_filter1 and save them as a user dict for later segmentation
def write_userdict(): 
    sql=('select distinct trade_filter1 from clue limit 5')
    data = pd.read_sql(sql,conn)
    dataset = " ".join(data['trade_filter1'].tolist())  # concatenate the distinct values into one text
    tags = jieba.analyse.extract_tags(dataset)  # top keywords; pass topK=... for more than the default 20
    f = codecs.open('dict.txt', 'wb', 'utf-8')
    f.write('\n'.join(tags))  # jieba.load_userdict expects one entry per line
    f.close() 
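
For reference, jieba's user dictionary is a plain-text file with one entry per line, optionally followed by a frequency and a POS tag. A minimal sketch with made-up entries (not values from the clue table):

import jieba
# dict.txt, one entry per line, e.g.:
#   技术咨询 3 n
#   批发零售
jieba.load_userdict('dict.txt')                   # register the custom entries
print '/'.join(jieba.cut(u'技术咨询与批发零售'))  # custom entries now segment as whole words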



## Select reg_no and bus_scope, then segment each bus_scope
def word_cut():
    sql=('select reg_no, bus_scope from clue where length(bus_scope)>=6 limit 5')  #trade_filter1,  
    data0 = pd.read_sql(sql,conn) 
    data = data0.set_index(['reg_no'])
    print data
    doc0 = data['bus_scope'].iloc[0]  # first company's business scope
    print doc0
    doc1 = data['bus_scope'].iloc[1]  # second company's
    print doc1
    jieba.load_userdict("dict.txt")
    # read the stopword file into a list (decode so entries compare equal to jieba's unicode output)
    stop = [line.strip().decode('utf-8') for line in open('stopword.txt').readlines()]

    r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'  # alphanumerics plus ASCII/Chinese punctuation
    r2 = u'\s+'  # whitespace
    L = [doc0, doc1]
    L2 = []
    for item in L:
        seg_list = jieba.cut(re.sub(r2, "", re.sub(r1, "", item)), cut_all=True)  # strip noise, then full-mode cut
        word = ' '.join(seg_list)
        #word = ' '.join([w for w in word.split() if w not in stop])  # optionally remove stopwords
        L2.append(word)

    return L2
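
To sanity-check the segmentation step on its own, a quick sketch (the sample sentence is made up, not data from the clue table):

import jieba
seg = jieba.cut(u'销售计算机软件及辅助设备', cut_all=True)  # full mode, as in word_cut above
print ' '.join(seg)  # full mode emits every dictionary word it can find, including overlaps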

def calculate_tfidf(word_seg):  # sklearn alternative: hash the tokens, then apply tf-idf weighting
    hv = HashingVectorizer(n_features=1000)
    ##vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    word_vc = hv.fit_transform(word_seg)  # word_seg: list of space-separated documents
    tfidf = transformer.fit_transform(word_vc)
    return tfidf
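
The sklearn path above stops at the tf-idf matrix; one way to turn it into a similarity matrix (a sketch, not part of the original script) is sklearn's cosine_similarity:

from sklearn.metrics.pairwise import cosine_similarity
tfidf = calculate_tfidf(word_cut())     # 2 documents -> 2 x 1000 sparse tf-idf matrix
sim_matrix = cosine_similarity(tfidf)   # n*n matrix of pairwise cosine similarities
print sim_matrix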


## Compute the similarity between companies from the segmented words, yielding an n*n matrix (gensim version)
def calculate_sim(wordset1, wordset2):  # tf-idf + cosine similarity via gensim
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    corp = [doc.split() for doc in wordset1]  # gensim expects each document as a list of tokens

    print corp
    dictionary = corpora.Dictionary(corp)
    corpus = [dictionary.doc2bow(text) for text in corp]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    vec_bow = dictionary.doc2bow(wordset2.split())  # the query document, tokenized the same way
    vec_tfidf = tfidf[vec_bow]

    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]  # cosine similarity of the query against every document in the corpus
    similarity = list(sims)


    sim_file = open('similarity.txt', 'w')  # one similarity score per line
    for i in similarity:
        sim_file.write(str(i)+'\n')
    sim_file.close()
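
A toy invocation (two made-up documents in word_cut's space-separated format, not data from the clue table):

docs = [u'软件 开发 销售', u'软件 咨询 服务']
calculate_sim(docs, docs[1])  # writes the similarity of doc 1 to each document into similarity.txt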



if __name__ == "__main__":
    conn = MySQLdb.connect(host='10.0.0.2', user='root', passwd='root', db='clue', port=3306, charset='utf8')  # used as a global by the functions above
    #write_userdict()
    result = word_cut()
    print result[0]
    #calculate_sim(result, result[1])  # compare the second company against both documents




