1、都挺好小说链接
链接: https://pan.baidu.com/s/14VsZ12Rwn_kWlm_daD4ChQ 提取码: sik9
2、python
# -*- coding: utf-8 -*-
#!/usr/bin/env python3
import jieba.analyse
from pyecharts.charts import Funnel
import gensim
import logging
import re
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
# Mapping from each character's canonical name to the aliases (nicknames,
# family titles, honorifics) used for them in the novel text.  Keys are the
# canonical names used throughout the analysis; values are the literal
# strings to look for / normalize.
names={
'明玉':['明玉','苏明玉','苏总'],
'朱丽':['朱丽','丽丽'],
'明哲':['明哲','苏明哲','大哥'],
'明成':['明成','苏明成','二哥'],
'苏大强':['苏大强','爸','公公'],
'吴非':['吴非'],'苏家':['苏家'],
'宝宝':['宝宝'],'老蒙':['老蒙','蒙总'],
'小蒙':['小蒙'],'天冬':['天冬','石天冬','石大哥'],
'柳青':['柳青'],'蔡根花':['蔡根花','小蔡'],'温伟光':['温伟光','温总'],
'苏母':['苏母']
}
# Load the novel and the stop-word list: one stripped line per list entry.
# Iterating the file object directly avoids the pointless intermediate list
# created by f.readlines().
with open('都挺好.txt', 'r', encoding='utf-8') as f:
    content = [line.strip() for line in f]
with open('stop_word.txt', 'r', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]
def re_str(keys, data):
    """Normalize character mentions: replace every alias with its canonical name.

    keys -- iterable of canonical names (keys of the module-level ``names``)
    data -- text to normalize
    Returns the normalized text.
    """
    for key in keys:
        for alias in names[key]:
            # Aliases are plain literal strings, so str.replace is the safe
            # equivalent of re.sub (no unescaped regex metacharacter risk).
            data = data.replace(alias, key)
    return data
def df_count():
    """Create the empty (all-zero) square co-occurrence DataFrame.

    Index and columns are the canonical character names from ``names``;
    every cell starts at integer 0.
    """
    people = list(names)
    # A scalar first argument fills every cell with that value (int zeros),
    # replacing the manual nested list-of-zeros construction.
    return pd.DataFrame(0, index=people, columns=people)
def find_people_count(num=15):
    """Return the ``num`` most frequently mentioned canonical names.

    Counts occurrences of each canonical name in the joined novel text and
    returns [name, count] pairs, most frequent first.
    """
    full_text = ''.join(content)
    counts = [[person, full_text.count(person)] for person in names]
    counts.sort(key=lambda pair: pair[1], reverse=True)
    return counts[:num]
def tf_idf():
    """Extract the top-50 keywords of the whole novel via TF-IDF.

    Returns (keyword, weight) pairs as produced by jieba.
    """
    full_text = ''.join(content)
    return jieba.analyse.extract_tags(full_text, topK=50, withWeight=True)
def chart_show(tags):
    """Render the (keyword, weight) tags as a pyecharts funnel chart (HTML)."""
    funnel = Funnel()
    # Funnel.add returns the chart itself, so this mirrors the original chain.
    funnel.add("", [list(pair) for pair in tags])
    funnel.render()
def word2vecTest():
    """Tokenize the novel into per-line token lists for Word2Vec training.

    Registers the TF-IDF keywords and the canonical character names with
    jieba so they are segmented as whole words, then cuts each line and
    drops stop words.

    Returns a list of token lists, one per line of the novel.
    """
    # Protect important keywords and names from being split by jieba.
    # (The TF-IDF weight is not needed here, hence the throwaway name.)
    for word, _weight in tf_idf():
        jieba.add_word(word)
    for name in names:
        jieba.add_word(name)

    sentences = []
    for line in content:
        tokens = jieba.cut(line, cut_all=False)
        # Filter stop words in one pass; no need to materialize the cut
        # generator into a list first.
        sentences.append([tok for tok in tokens if tok not in stop_words])
    return sentences
def excerice():
    """Train a Word2Vec model on the tokenized novel and save it to disk."""
    corpus = word2vecTest()
    model = gensim.models.Word2Vec(
        sentences=corpus,
        vector_size=100,
        window=5,
        min_count=4,
        workers=3,
    )
    model.save('qye.model')
def loadModel(nameq):
    """Print the 10 most similar words for each query word.

    nameq -- iterable of words to query against the saved Word2Vec model.
    """
    model = gensim.models.Word2Vec.load('qye.model')
    for name in nameq:
        print("==============" + name + "==============")
        # Ask most_similar for exactly 10 results instead of slicing its
        # (already 10-item default) return value.
        for similar in model.wv.most_similar(positive=[name], topn=10):
            print(similar)
def not_match(data):
    """Return the word in ``data`` that least belongs with the others,
    according to the trained Word2Vec model saved as 'qye.model'.
    """
    w2v = gensim.models.Word2Vec.load('qye.model')
    return w2v.wv.doesnt_match(data)
def relation():
    """Build the character co-occurrence matrix, line by line.

    Two characters co-occur when both are mentioned on the same line of the
    novel.  Returns a tuple ``(last_people, matrix)`` where ``matrix`` is
    the co-occurrence DataFrame and ``last_people`` is the list of
    characters found on the last processed line (kept for backward
    compatibility with the original return shape).
    """
    for name in names:
        jieba.add_word(name)
    p_count = df_count()
    # Initialized up front so the return does not raise NameError on an
    # empty novel (the original left it unbound in that case).
    people_on_line = []
    for line in content:
        seg_list = jieba.cut(line, cut_all=False)
        # Map each non-stop-word token to a canonical character name,
        # keeping each character at most once per line.
        people_on_line = []
        for seg in seg_list:
            if seg in stop_words:
                continue
            person = getPersion(seg)
            if person != '无' and person not in people_on_line:
                people_on_line.append(person)
        # Count every ordered pair; the matrix stays symmetric and the
        # diagonal counts lines on which a character appears at all,
        # matching the original behavior.
        for p1 in people_on_line:
            for p2 in people_on_line:
                # .loc[row, col] is a single-step label assignment; the
                # original chained .loc[p1][p2] can silently write to a
                # copy in modern pandas.
                p_count.loc[p1, p2] += 1
    return (people_on_line, p_count)
def getPersion(line):
    """Return the canonical name of the first character mentioned in ``line``.

    A character matches if their canonical name or any alias occurs as a
    substring.  Dict insertion order decides ties.  Returns '无' ("none")
    when no character matches.
    """
    for key, aliases in names.items():
        # The canonical key itself counts as a mention too; a plain
        # substring test replaces the per-call unescaped regex the original
        # built just to check presence.
        if any(alias in line for alias in [key] + aliases):
            return key
    return '无'
def img():
    """Flatten the co-occurrence matrix into an edge list and draw the graph."""
    # Only the matrix is needed; the first tuple element is a by-product.
    _, matrix = relation()
    people = matrix.index.array
    n = len(names)  # replaces the original manual counting loop
    edges = []
    # The matrix is symmetric, so only the upper triangle (diagonal
    # excluded) is turned into edges.
    for i in range(n):
        for j in range(i + 1, n):
            edges.append([people[i], people[j], matrix.iloc[i, j]])
    painting(pd.DataFrame(edges, columns=['Source', 'Target', 'Weight']))
def painting(pcount):
    """Draw the character closeness graph.

    pcount -- DataFrame with 'Source', 'Target', 'Weight' columns.
    Edges with zero weight (characters that never co-occur) are skipped.
    """
    g = nx.Graph()
    # Unpack rows directly instead of index-based access through an
    # intermediate filtered list.
    for source, target, weight in pcount.values.tolist():
        if weight != 0:
            g.add_edge(source, target, weight=weight)
    nx.draw(g, with_labels=True, font_size=12, node_size=1000, node_color='g')
    plt.show()
if __name__ == '__main__':
    # Build the character co-occurrence matrix and draw the relationship
    # graph.  (Model training / similarity queries are available via
    # excerice(), loadModel() and not_match().)
    img()