# -*- coding: utf-8 -*-
# Extract high-frequency words from articles.
# * @author haoming
# * @date 2016/11/08

import MySQLdb
import pandas as pd
import numpy as np
import re
import codecs 
import jieba
import jieba.analyse
import logging
from gensim import corpora, models, similarities
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.externals import joblib
import time
from numpy import random
from sklearn import preprocessing
import xlwt
import csv
import os
# Raw string required: in "F:\project\frequence" the "\f" sequence was the
# form-feed escape, so chdir targeted a corrupt, nonexistent path.
os.chdir(r"F:\project\frequence")


## 选取reg_no, bus_scope,根据 bus_scope 切词,添加停用词,自定义词典 停用词(日期) 括号里,分情况
## 选取reg_no, bus_scope,根据 bus_scope 切词,添加停用词,自定义词典 停用词(日期) 括号里,分情况
def word_cut(newdata):
    """Segment every text field of one DB row with jieba and return the
    surviving tokens.

    Cleaning pipeline per field: strip bracketed spans (r0), strip
    punctuation/digits/Latin letters (r1), strip whitespace (r2), cut with
    jieba, then drop stopwords and single-character tokens.

    :param newdata: iterable of unicode strings (e.g. one fetched DB row).
    :return: list of unicode tokens with len >= 2 that are not stopwords.
    """
    # codecs.open decodes stopword.txt as UTF-8 and the context manager
    # closes it (the old readlines() call leaked the file handle).
    with codecs.open('stopword.txt', 'r', 'utf-8') as f:
        stoplist = set(line.strip() for line in f)    # 添加停用词
    jieba.load_userdict('userdict.txt')    # 添加自定义词典
    # r1: Latin letters, digits, and ASCII/CJK punctuation.
    r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    # r2: any whitespace run.
    r2 = u'\\s+'
    # r0: content inside (), () or {} pairs.  NOTE(review): the second
    # alternative looks like it was meant to be full-width （） and got
    # mangled to ASCII parens somewhere -- confirm against the original.
    r0 = r'\(.*?\)|\(.*?\)|\{.*?\}'
    words = []
    for field in newdata:
        # Clean the raw field directly.  The old code first joined the
        # characters with "\n", which stopped r0 from ever matching a
        # multi-character bracketed span ('.' does not match newlines),
        # and it also rebound `words` on every iteration so only the last
        # field of the row was ever returned.
        cleaned = re.sub(r2, "", re.sub(r1, "", re.sub(r0, "", field)))
        seg_list = jieba.cut(cleaned, cut_all=False)
        # 去除停用词 and single-character tokens.
        words.extend(w for w in seg_list if w not in stoplist and len(w) >= 2)
    return words


def get_high_frequence_word(num):  
    sql = ('select bus_scope from bj_predict limit 10') ## 764066条数据

    data = pd.read_sql(sql,conn)      

    words = []
    cur = conn.cursor()
    cur.execute(sql)
    rows=cur.fetchall()
    for row in rows:
        cuted_word = word_cut(row)
        #print cuted_word
        words.append(cuted_word)

    word_seg = [item for w in words for item in w]

    texts = []
    texts.append(word_seg)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print corpus,'\n\n'
    text_list = sorted(corpus[0], key=lambda x: x[1], reverse = True)
    print text_list[0:num],'\n\n'
    #print(dictionary.token2id)#词:id

    '''
    csvfile = open('frequence.csv', 'wb')
    csvfile.write(codecs.BOM_UTF8) #防止乱码
    writer = csv.writer(csvfile)
    writer.writerow(['单词', '词频'])
    for i in range(num):
        word = dictionary[(text_list[i][0])]
        writer.writerows(word)
        writer.writecows(text_list[i][1])
    csvfile.close()
    '''

    book = xlwt.Workbook(encoding = 'utf-8',style_compression=0)
    sheet = book.add_sheet('sheet1',cell_overwrite_ok=True)

    for i in range(num):
        #print dictionary[(text_list[i][0])],
        print text_list[i][1]
        word = dictionary[(text_list[i][0])]
        print word,
        sheet.write(i,0,word)
        sheet.write(i,1,text_list[i][1])

    book.save('frequence.xls')



if __name__ == "__main__":
    start = time.clock()
    conn = MySQLdb.connect(host='10.0.0.2',user='root',passwd='root',db='clue',port=3306,charset='utf8')
    num = 50
    get_high_frequence_word(num)

    end = time.clock()
    print end-start

# You may also be interested in: (Python, MySQL)