"""
author:魏振东
date:2019.12.18
func:统计词频 词性标注 excel文件操作
"""
import jieba.posseg as psg
from collections import Counter
import xlwt
def cixing(filenamer, filenamerw):
    """Write a part-of-speech tagged word list for a text file.

    The text in ``filenamer`` is read as UTF-8 (undecodable bytes are
    ignored) and segmented with ``psg.cut``. Every token whose word is not
    exactly one character long is emitted as ``word:flag`` followed by a
    newline, and the result is written to ``filenamerw``.

    Args:
        filenamer: Path of the input text file.
        filenamerw: Path of the output text file; truncated if it exists.
    """
    with open(filenamer, 'r', encoding='utf-8', errors='ignore') as fr:
        article = fr.read()

    # NOTE: every item already ends in "\n", so joining with " " leaves a
    # single leading space on each line after the first. That layout is kept
    # as-is so existing consumers of the output file see the same format.
    result = " ".join(
        "{0}:{1}\n".format(w, t) for w, t in psg.cut(article) if len(w) != 1
    )

    # Write UTF-8 explicitly: relying on the platform default encoding can
    # raise UnicodeEncodeError (or produce a differently-encoded file) on
    # systems whose locale is not UTF-8.
    with open(filenamerw, 'w', encoding='utf-8') as r:
        r.write(result)
def cipin(filenamer, filenamerw):
    """Write a word-frequency report for a text file.

    The text in ``filenamer`` is read as UTF-8 (undecodable bytes are
    ignored) and segmented with ``psg.cut``. Words that are exactly one
    character long are skipped; the remaining words are counted and written
    to ``filenamerw`` one per line, most frequent first. Words with equal
    counts keep the order in which they first appeared in the text.

    Args:
        filenamer: Path of the input text file.
        filenamerw: Path of the output text file; truncated if it exists.
    """
    with open(filenamer, 'r', encoding='utf-8', errors='ignore') as fr:
        article = fr.read()

    # The POS flag is not needed for counting, only the word itself.
    words = (w for w, _flag in psg.cut(article) if len(w) != 1)
    ranked = Counter(words).most_common()

    # Write UTF-8 explicitly: the report contains Chinese text and the
    # platform default encoding is not guaranteed to be able to encode it.
    with open(filenamerw, 'w', encoding='utf-8') as r:
        r.writelines(
            '{0} 出现{1}次\n'.format(word, times) for word, times in ranked
        )
# Tokens that the segmenter tags as person names ('nr') but that tongji()
# must not count as people.
_TONGJI_EXCLUDES = frozenset({
    '将军', '却说', '令人', '赶来', '徐州', '不见', '下马', '喊声', '因此', '未知',
    '大败', '百姓', '大事', '一军', '之后', '接应', '起兵', '成都', '原来', '江东',
    '正是', '忽然', '大叫', '上马', '天子', '一面', '太守', '不如', '忽报', '后人',
    '背后', '先主', '此人', '城中', '然后', '大军', '何不', '先生', '何故', '夫人',
    '先锋', '二人', '不可', '如何', '荆州', '不能', '如此', '主公', '军士', '商议',
    '引兵', '次日', '大喜', '魏兵', '军马', '于是', '东吴', '今日', '左右', '天下',
    '不敢', '陛下', '人马', '不知', '都督', '汉中', '一人', '众将', '后主', '只见',
    '蜀兵', '马军', '黄巾', '立功', '白发', '大吉', '红旗', '士卒', '钱粮', '于汉',
    '郎舅', '龙凤', '古之', '白虎', '古人云', '尔乃', '马飞报', '轩昂', '史官', '侍臣',
    '列阵', '玉玺', '车驾', '老夫', '伏兵', '都尉', '侍中', '西凉', '安民', '张曰',
    '文武', '白旗', '祖宗', '寻思',
})

# Alternate names and mis-segmented tokens mapped to the name they are
# counted under.
_TONGJI_ALIASES = {
    '孔明': '诸葛亮', '孔明曰': '诸葛亮', '卧龙先生': '诸葛亮',
    '云长': '关羽', '关公曰': '关羽', '关公': '关羽',
    '玄德': '刘备', '玄德曰': '刘备', '玄德甚': '刘备', '玄德遂': '刘备',
    '玄德兵': '刘备', '玄德领': '刘备', '玄德同': '刘备', '刘豫州': '刘备',
    '刘玄德': '刘备',
    '孟德': '曹操', '丞相': '曹操', '曹贼': '曹操', '阿瞒': '曹操',
    '曹丞相': '曹操', '曹将军': '曹操',
    '高祖': '刘邦',
    '光武': '刘秀',
    '桓帝': '刘志',
    '灵帝': '刘宏',
    '公瑾': '周瑜',
    '伯符': '孙策',
    '吕奉先': '吕布', '布乃': '吕布', '布大怒': '吕布', '吕布之': '吕布',
    '赵子龙': '赵云', '子龙': '赵云',
    '卓大喜': '董卓', '卓大怒': '董卓',
}


def _tongji_person_name(word):
    """Return the name an 'nr' token is counted under, or None to drop it."""
    # Aliases are resolved before the length filter; otherwise the
    # four-character alias '卧龙先生' would be discarded by the `>= 4` rule
    # and could never be mapped.
    alias = _TONGJI_ALIASES.get(word)
    if alias is not None:
        return alias
    if len(word) == 1 or len(word) >= 4 or word in _TONGJI_EXCLUDES:
        return None
    return word


def _tongji_write_sheet(workbook, sheet_name, header, counts):
    """Add a two-column sheet listing `counts`, highest count first."""
    sheet = workbook.add_sheet(sheet_name)
    sheet.write(0, 0, label=header)
    sheet.write(0, 1, label='出现次数')
    # Row 0 holds the header, so data rows start at 1.
    for row, (name, times) in enumerate(counts.most_common(), start=1):
        sheet.write(row, 0, label=name)
        sheet.write(row, 1, label=times)


def tongji(filenamer, filenamerw):
    """Count person and place names in a text file and save them to Excel.

    The text in ``filenamer`` is read as UTF-8 (undecodable bytes are
    ignored) and segmented with ``psg.cut``. Tokens flagged ``'nr'`` are
    counted as person names after alias resolution and filtering; tokens
    flagged ``'ns'`` are counted as place names unless they are exactly one
    character or four or more characters long. The workbook saved to
    ``filenamerw`` has a sheet ``'ns'`` and a sheet ``'nr'``, each sorted by
    descending count. A sheet contains only its header row when no matching
    tokens were found.

    Args:
        filenamer: Path of the input text file.
        filenamerw: Path passed to ``workbook.save`` for the output workbook.
    """
    with open(filenamer, 'r', encoding='utf-8', errors='ignore') as fr:
        article = fr.read()

    # Counters start empty, so a text without any 'nr' or 'ns' token yields
    # an empty sheet instead of a KeyError.
    places = Counter()
    people = Counter()
    for word, flag in psg.cut(article):
        if flag == 'nr':
            name = _tongji_person_name(word)
            if name is not None:
                people[name] += 1
        elif flag == 'ns':
            if len(word) != 1 and len(word) < 4:
                places[word] += 1

    workbook = xlwt.Workbook(encoding='utf-8')
    _tongji_write_sheet(workbook, 'ns', '地点', places)
    _tongji_write_sheet(workbook, 'nr', '人名', people)
    workbook.save(filenamerw)
if __name__ == '__main__':
    # Script entry point: POS-tag the source text and write the tagged
    # word list next to it.
    source_path, tagged_path = '171182.txt', '词性标注.txt'
    cixing(source_path, tagged_path)