# -*- coding: UTF-8 -*-
#!/usr/bin/env python
from collections import Counter
import collections
import jieba.analyse
import jieba
import time
import re
import sys
#stopwords = {}.fromkeys(['的', '包括', '等', '是'])
# Load the stop-word list, one word per line. The file is opened in a
# ``with`` block so the handle is closed once the list has been read.
# The result stays a dict (keys only) so later ``in`` tests work as before.
with open("stopwords.txt") as stopwords_file:
    stopwords = dict.fromkeys(line.strip() for line in stopwords_file)

bill_path = r'article_nohtml.txt'
bill_result_path = r'result.txt'

# Read the whole article into memory.
with open(bill_path, 'r') as fr:
    all_the_text = fr.read()

# Drop double quotes, commas and full stops before segmentation.
# A raw string is used because "\." is an invalid escape sequence in a
# normal string literal and triggers a warning on current Python versions.
all_the_text = re.sub(r'"|,|\.', "", all_the_text)

# Segment the text with jieba and count how often each token occurs.
data = jieba.cut(all_the_text)
data = dict(Counter(data))
def sort_by_count(d):
    """Return the items of ``d`` as an OrderedDict sorted by descending value.

    Args:
        d: A mapping of key -> value (token -> count in this script).

    Returns:
        A ``collections.OrderedDict`` whose iteration order runs from the
        largest value to the smallest. Items with equal values keep the
        relative order they had in ``d``.
    """
    # ``reverse=True`` instead of negating the value: the ordering for
    # numeric counts is the same (the sort is stable either way), but it
    # also works for values that cannot be negated.
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)
# Order the tokens from most to least frequent.
data = sort_by_count(data)

# Write one "token:count" line for every token that is not a stop word.
with open(bill_result_path, 'w') as fw:
    for k, v in data.items():
        # Encode only when the token is not already a native ``str``.
        # On Python 3 an unconditional encode produces ``bytes``, which
        # never compare equal to the ``str`` stop words (so nothing gets
        # filtered) and would be written out as "b'...'".
        if not isinstance(k, str):
            k = k.encode('utf-8')
        if k in stopwords:
            continue
        fw.write('%s:%d\n' % (k, v))