1. 获取小说文本 读取文件
# Load the full text of the novel.
# The context manager guarantees the handle is closed even if read() raises,
# which the manual open()/close() pair did not.
with open("prepare\\红楼梦_曹雪芹.txt", encoding="utf-8") as fn:
    string_data = fn.read()  # read the whole file into memory
2. 对文本进行处理
# Text preprocessing: remove tabs, newlines and a few ASCII punctuation marks
# (. - : ; ) ( ? ") from the raw text.
# The pattern is a raw string so the regex escapes are not first interpreted as
# Python string escapes; the previous non-raw literal relied on invalid escape
# sequences such as '\.' and '\)'. The character class matches exactly the same
# single characters as the original alternation.
pattern = re.compile(r'[\t\n.\-:;)(?"]')
txt = pattern.sub('', string_data)  # drop every character the pattern matches
print('预处理完毕')
# Stop-word file loader
def stopwordslist(filepath):
    """Read a UTF-8 text file and return its lines as a list of stop words.

    Args:
        filepath: Path of the stop-word file; each line is treated as one entry.

    Returns:
        A list holding every line of the file with surrounding whitespace
        stripped, in file order (a blank line yields an empty string).
    """
    # Open through a context manager so the handle is always closed; the
    # original opened the file inside the comprehension and never closed it.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
# Load the stop words; getWordTimes1 skips any token found in this list.
# NOTE(review): this is an absolute, machine-specific path, whereas the novel
# text is opened through a relative "prepare\\..." path - confirm which is intended.
stopwords = stopwordslist('D:\\Python studybag\\prepare\\tingyong.txt')
# Words that must never be counted as a person's name.
# Entries are compared by exact string equality, so they must not carry stray
# whitespace: the former ' 亦', ' 皆' and ' 但' could never match a token.
# The duplicated '咧' and '罢' literals were also removed.
excludes = {
    '之', '其', '方', '即', '因', '仍', '故', '尚', '乃', '呀', '吗', '咧', '罢', '啊', '了', '么',
    '或', '亦', '于', '皆', '的', '着', '一', '不', '把', '让', '向', '往', '是', '在', '别',
    '好', '可', '便', '就', '但', '越', '再', '更', '比', '很', '偏',
    '那里', '如今', '一个', '我们', '你们', '起来', '姑娘', '这里', '二人', '说道',
    '知道', '如何', '今日', '什么', '于是', '还有', '出来', '他们', '众人', '奶奶',
    '自己', '一面', '太太', '只见', '怎么', '两个', '没有', '不是', '不知', '这个',
    '听见', '这样', '进来', '告诉', '东西', '咱们', '就是', '如此', '回来', '大家',
    '只是', '老爷', '只得', '丫头', '这些', '不敢', '出去', '所以', '不过', '姐姐',
    '的话', '不好', '鸳鸯', '一时', '过来', '不能', '心里', '银子', '答应', '几个',
}
3. 词频
# Key -> occurrence-count tables, filled in by the counting passes.
counts1 = dict()  # frequency per "word_flag" key (word plus part-of-speech tag)
counts2 = dict()  # frequency per person name
# Build the word / part-of-speech frequency table
def getWordTimes1():
    """Count every non-stop-word token of ``txt`` into ``counts1``.

    The text is segmented with ``pseg.cut``. Each kept token is counted under
    the key ``"<word>_<flag>"``, i.e. the word joined to its part-of-speech
    flag with an underscore. Tokens whose word is ``None`` or appears in
    ``stopwords`` are skipped.

    Returns:
        None. Counts are accumulated in the module-level ``counts1`` dict.
    """
    # Build the set once: set membership is O(1), whereas testing against the
    # stop-word list would rescan the whole list for every token.
    stopword_set = set(stopwords)
    for w in pseg.cut(txt):
        if w.word is None or w.word in stopword_set:
            continue
        real_word = w.word + '_' + w.flag
        counts1[real_word] = counts1.get(real_word, 0) + 1
# Run the counting pass, then rank the (key, count) pairs by count, highest first.
getWordTimes1()
items1 = sorted(counts1.items(), key=lambda pair: pair[1], reverse=True)
# Export the ranked word / part-of-speech counts to a text file
def wordFreq1(filepath, topn1, items=None):
    """Write the first ``topn1`` ranked entries to ``filepath`` as ``key:count`` lines.

    Args:
        filepath: Destination file; it is opened for writing as UTF-8.
        topn1: Maximum number of entries to write.
        items: Optional sequence of ``(key, count)`` pairs to export. When
            omitted, the module-level ``items1`` is used.

    Returns:
        None. When fewer than ``topn1`` entries exist, all of them are written.
    """
    if items is None:
        items = items1
    with codecs.open(filepath, "w", "utf-8") as f:
        # Slicing stops at the end of the sequence, so a short list no longer
        # raises IndexError the way indexing items1[i] over range(topn1) did.
        for word, count in items[:max(topn1, 0)]:
            f.write("{}:{}\n".format(word, count))
# Generate the word / part-of-speech frequency file
wordFreq1("output\\红楼梦词频词性.txt", 300)
# Read the exported "key:count" lines back into a dict
dic1 = {}
keys1 = []  # remembers the order in which the keys were read
# The context manager closes the file even if a malformed line raises below.
with open('output\\红楼梦词频词性.txt', 'r', encoding='utf-8') as fr1:
    for line in fr1:
        # Split on the LAST colon so a colon inside the key cannot shift the count.
        v1 = line.strip().rsplit(':', 1)
        dic1[v1[0]] = v1[1]
        keys1.append(v1[0])
list_name1 = list(dic1.keys())  # the word_flag keys, in file order
list_name_times1 = list(dic1.values())  # matching counts (still strings, as read from the file), used as chart data
def create_wordproperties():
    """Render a bar chart of the first ``keshihuaTop`` word_flag entries to HTML.

    The x axis takes its labels from ``list_name1`` and the single series takes
    its values from ``list_name_times1``.

    Returns:
        None. The chart is written to an HTML file under ``output``.
    """
    bar1 = Bar()
    bar1.add_xaxis(list_name1[0:keshihuaTop])
    # The series is cut to the same length as the x axis; previously the whole
    # list was passed, giving far more values than axis labels.
    bar1.add_yaxis("词语出现次数", list_name_times1[0:keshihuaTop])
    bar1.set_global_opts(title_opts=opts.TitleOpts(title="词频词性可视化图", subtitle="词频词性top10"),
                         xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}))
    bar1.set_series_opts(label_opts=opts.LabelOpts(position="top"))
    # Write the HTML file. The path is relative, like the frequency file this
    # chart is built from; the former leading "\\" pointed at the drive root.
    bar1.render("output\\红楼梦词频词性可视化图.html")
4. 对人名进行分析
# Canonical character name -> the spellings / nicknames that are folded into it.
_ALIAS_GROUPS = {
    '贾宝玉': ('宝哥哥', '宝玉曰', '宝二爷', '绛洞花主', '怡红公子', '宝兄弟', '混世魔王', '宝玉'),
    '林黛玉': ('黛玉', '颦儿', '潇湘妃子', '林姑娘', '林妹妹', '黛玉曰', '颦颦'),
    # Was '林宝钗', which is not a character; these are all names for 薛宝钗.
    '薛宝钗': ('宝钗', '宝钗曰', '宝丫头', '宝姐姐', '薛大姑娘'),
    '王熙凤': ('熙凤', '熙凤曰', '琏二奶奶', '凤辣子', '凤哥儿', '凤丫头', '凤姐', '凤姐儿', '琏二嫂子'),
    '贾母': ('贾母', '贾母曰', '史太君', '老祖宗', '老太太', '老神仙'),
    '史湘云': ('湘云', '湘云曰', '枕霞旧友', '史大姑娘', '云妹妹'),
    # Was '贾迎春', a different person; these are all names for 薛姨妈.
    '薛姨妈': ('姨妈', '姨妈曰', '薛夫人', '薛王氏', '姨太太'),
    '贾探春': ('探春', '探春曰', '玫瑰花', '蕉下客'),
    '贾珍': ('贾珍', '贾珍曰', '珍老爷', '大爷', '大哥哥'),
    '贾琏': ('贾琏', '贾琏曰', '琏二爷', '二爷'),
    '袭人': ('袭人', '袭人曰', '蕊珠', '花珍珠'),
    '平儿': ('平儿', '平儿曰', '小平', '平姑娘', '平姐姐'),
}
# Flattened alias -> canonical-name lookup used while counting.
_NAME_ALIASES = {
    alias: canonical
    for canonical, aliases in _ALIAS_GROUPS.items()
    for alias in aliases
}


# Count how often each person is mentioned
def getWordTimes2():
    """Count person-name tokens of ``txt`` into ``counts2``.

    The text is segmented with ``pseg.cut``. A token is counted only when its
    flag is ``'nr'`` (person name), its word has at least two characters and
    the word is not in ``excludes``. Known aliases are folded into one
    canonical name through ``_NAME_ALIASES``; any other name is counted as is.

    Returns:
        None. Counts are accumulated in the module-level ``counts2`` dict.
    """
    for w in pseg.cut(txt):
        # Skip anything that is not tagged as a name, is too short, or is excluded.
        if w.flag != 'nr' or len(w.word) < 2 or w.word in excludes:
            continue
        # Merge the different ways of referring to the same person.
        real_word = _NAME_ALIASES.get(w.word, w.word)
        counts2[real_word] = counts2.get(real_word, 0) + 1
# Run the name-counting pass, then rank the (name, count) pairs by count, highest first.
getWordTimes2()
items2 = sorted(counts2.items(), key=lambda pair: pair[1], reverse=True)
# Export the ranked person-name counts to a text file
def wordFreq2(filepath, topn, items=None):
    """Write the first ``topn`` ranked entries to ``filepath`` as ``name:count`` lines.

    Args:
        filepath: Destination file; it is opened for writing as UTF-8.
        topn: Maximum number of entries to write.
        items: Optional sequence of ``(name, count)`` pairs to export. When
            omitted, the module-level ``items2`` is used.

    Returns:
        None. When fewer than ``topn`` entries exist, all of them are written.
    """
    if items is None:
        items = items2
    with codecs.open(filepath, "w", "utf-8") as f:
        # Slicing stops at the end of the sequence, so a short list no longer
        # raises IndexError the way indexing items2[i] over range(topn) did.
        for word, count in items[:max(topn, 0)]:
            f.write("{}:{}\n".format(word, count))
# Generate the person-name frequency file
wordFreq2("D:\\Python studybag\\output\\红楼梦词频_人名.txt", 300)
# Read the exported "name:count" lines back into a dict
dic = {}
keys = []  # remembers the order in which the names were read
# The context manager closes the file even if a malformed line raises below.
with open('D:\\Python studybag\\output\\红楼梦词频_人名.txt', 'r', encoding='utf-8') as fr:
    for line in fr:
        # Split on the LAST colon so a colon inside the name cannot shift the count.
        v = line.strip().rsplit(':', 1)
        dic[v[0]] = v[1]
        keys.append(v[0])
# Print the leading name/count pairs
print("人物出现次数TOP", mainTop)
print(list(dic.items())[:mainTop])
# Chart data
# Name list (used by the relationship graph and the pyecharts appearance chart)
list_name = list(dic.keys())  # person names, in file order
list_name_times = list(dic.values())  # matching counts (still strings, as read from the file)
# Visualise how often each person appears
def creat_people_view():
    """Render a bar chart of the first ``keshihuaTop`` person-name counts to HTML.

    The x axis takes its labels from ``list_name`` and the single series takes
    its values from ``list_name_times``.

    Returns:
        None. The chart is written to an HTML file.
    """
    bar = Bar()
    bar.add_xaxis(list_name[0:keshihuaTop])
    # The series is cut to the same length as the x axis; previously the whole
    # list was passed, giving far more values than axis labels.
    bar.add_yaxis("人物出场次数", list_name_times[0:keshihuaTop])
    bar.set_global_opts(title_opts=opts.TitleOpts(title="人物出场次数可视化图", subtitle="红楼梦TOP10"),
                        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}))
    bar.set_series_opts(label_opts=opts.LabelOpts(position="top"))
    # Write the HTML file
    bar.render("D:\\Python studybag\\output\\红楼梦人物出场次数可视化图.html")
# Build the word cloud with pyecharts
def creat_wordcloud_pyecharts():
    """Render the person-name counts held in ``dic`` as a word cloud HTML file."""
    name_count_pairs = list(dic.items())
    # Each step keeps the object returned by the previous call, exactly as the
    # original fluent chain did.
    cloud = WordCloud().add(
        series_name="人物次数",
        data_pair=name_count_pairs,
        word_size_range=[20, 100],
        textstyle_opts=opts.TextStyleOpts(font_family="cursive"),
    )
    cloud = cloud.set_global_opts(title_opts=opts.TitleOpts(title="红楼梦词云"))
    cloud.render("D:\\Python studybag\\output\\红楼梦词云_人名.html")
# Colour generation: one colour is needed for each of the first peopleTop names.
colorNum = len(list_name[:peopleTop])
def randomcolor():
    """Return a random colour as a ``#`` followed by six uppercase hex digits.

    Returns:
        A 7-character string such as ``"#3FA0C2"``.
    """
    # Full hexadecimal alphabet. The previous digit table left out '0', and its
    # randint(0, 14) bound was hard-coded to that table's length.
    hex_digits = "0123456789ABCDEF"
    return "#" + "".join(random.choice(hex_digits) for _ in range(6))
def color_list():
    """Return a list of ``colorNum`` colours, each produced by ``randomcolor()``."""
    return [randomcolor() for _ in range(colorNum)]