2019-8-7
文中内容参考自博客:爱情大数据 | 你的专属微信聊天记录统计
- 前言:
打算在七夕那天给女朋友准备一份小礼物:
把我们在一起之后的所有微信聊天记录生成一份词云。
chat_file = 'chat.csv'
myGirl = 'wxid_XXXXXXX'
''' 读取原数据 '''
# Load the exported chat log once. Only the columns at positions 6-8 are
# read; the code below accesses them by the names 'talker', 'createTime'
# and 'content'.
chat = pd.read_csv(chat_file, sep=',', usecols=[6, 7, 8])

chat_time = []     # createTime // 1000 for every message sent by myGirl
chat_content = []  # message text, index-aligned with chat_time
chat_all = []      # [timestamp, text] pairs
# Visit every row. Iterating over range(len(chat) - 1) would never look at
# the final row, silently dropping the most recent message.
for talker, created, c in zip(chat['talker'], chat['createTime'],
                              chat['content']):
    if talker != myGirl:
        continue
    # presumably createTime is in milliseconds and this yields seconds;
    # confirm against the export format
    t = created // 1000
    chat_time.append(t)
    chat_content.append(c)
    chat_all.append([t, c])

chat_all = sorted(chat_all, key=itemgetter(0))  # order by timestamp

# Keep only runs of characters in the range U+4E00-U+9FA5 (Chinese text).
# Non-string cells are skipped for the join only (pandas yields float NaN
# for an empty cell), so a blank message cannot raise TypeError while
# chat_time and chat_content stay index-aligned.
result = re.compile(r"[\u4e00-\u9fa5]+").findall(
    " ".join(text for text in chat_content if isinstance(text, str)))
chat_record = " ".join(result)
''' 转换时间格式 '''
def to_hour(t):
    """Convert a Unix timestamp to a fractional hour of the local day.

    Args:
        t: Seconds since the epoch, as accepted by ``time.localtime``.

    Returns:
        The local hour plus the minute expressed as a fraction of an hour,
        rounded to two decimals (13:30 gives 13.5). Seconds are ignored.
    """
    local = time.localtime(t)
    return round(local.tm_hour + local.tm_min / 60, 2)
# Fractional local hour (see to_hour) of every collected message.
hour_set = [to_hour(i) for i in chat_time]
import seaborn as sns
from matplotlib.font_manager import *  # supplies FontProperties; needed to display Chinese text on the figure
myfont = FontProperties(fname=r'C:\Windows\Fonts\MSYH.TTC',size=22)  # font used for the title
myfont2 = FontProperties(fname=r'C:\Windows\Fonts\MSYH.TTC',size=18)  # font used for the axis labels
sns.set_style('darkgrid')  # dark background with grid lines
# Distribution plot of the message hours; the second positional argument of
# distplot is the number of bins (24).
sns.distplot(hour_set, 24, color='lightcoral')
plt.xticks(np.arange(0, 25, 1.0), fontsize=15)  # one tick per hour, 0 through 24
plt.yticks(fontsize=15)
plt.title('聊天时间分布', fontproperties=myfont)
plt.xlabel('时间段', fontproperties=myfont2)
# NOTE(review): the y label repeats the title text - confirm this is intended.
plt.ylabel('聊天时间分布', fontproperties=myfont2)
# Resize the current figure to 15 x 8 inches and save it before showing it.
fig = plt.gcf()
fig.set_size_inches(15,8)
fig.savefig('chat_time.png',dpi=100)
plt.show()
option = {
title: {
text: '聊天时间段',
textStyle: {
color: '#000',
fontSize: 20
}
},
//toolbox配置项可以在网页上直接生成下载图片的按钮而不用截图
toolbox: {
show: true,
feature: {
saveAsImage: {
show:true,
excludeComponents :['toolbox'],
pixelRatio: 2
}
}
},
tooltip: {},
radar: {
// shape: 'circle',
name: {
textStyle: {
color: '#fff',
backgroundColor: '#999',
borderRadius: 3,
padding: [3, 5]
}
},
indicator: [
//max值也需要修改,可以用最大的数值向上取整当最大数
{ name: '凌晨2点至6点', max: 400},
{ name: '6点至10点次', max: 400},
{ name: '10点至14点', max: 400},
{ name: '14点至18点', max: 400},
{ name: '18点至22点', max: 400},
{ name: '22点至次日凌晨2点', max: 400}
]
},
series: [{
name: '聊天时间段',
type: 'radar',
// areaStyle: {normal: {}},
data : [
{
value : [63, 141, 250, 213, 263, 390], //替换数据即可
name : '聊天时段'
}
]
}]
};
import imageio
import wordcloud
import jieba
import jieba.analyse as analyse

# scipy.misc.imread was removed in SciPy 1.2, so an unconditional import
# aborts the script on current installs. Keep the name available either way
# by falling back to imageio.imread, the replacement SciPy recommends.
try:
    from scipy.misc import imread
except ImportError:
    imread = imageio.imread

# ------- Tokenise the text --------
mask = imageio.imread("shape.jpg")  # image handed to WordCloud as its mask
ls = jieba.lcut(chat_record)  # jieba's default (accurate) segmentation mode
# Placeholder entries: replace them with the words you want to drop, judging
# from the picture that gets generated.
words = ["过滤词汇", "根据实际生成的图片将一些词汇过滤掉"]
# Blank out every filtered token in place.
filtered = set(words)
ls[:] = [" " if token in filtered else token for token in ls]
txt = "/".join(ls)  # one string, tokens separated by "/"

# ------- Render the word cloud --------
w = wordcloud.WordCloud(font_path="msyh.ttc",
                        width=1024, height=800, background_color="white",
                        mask=mask)
w.generate(txt)
w.to_file("honey.png")
import imageio
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
from operator import itemgetter
import wordcloud
import jieba
from matplotlib.font_manager import * # 如果想在图上显示中文,需导入这个包
chat_file = 'chat.csv'
myGirl = 'wxid_xxxxxxxxxxxxx'
''' 读取原数据 '''
# Load the exported chat log. Only the columns at positions 6-8 are read;
# the code below accesses them by the names 'talker', 'createTime' and
# 'content'.
chat = pd.read_csv(chat_file, sep=',', usecols=[6, 7, 8])

chat_time = []     # createTime // 1000 for every message sent by myGirl
chat_content = []  # message text, index-aligned with chat_time
chat_all = []      # [timestamp, text] pairs
# Visit every row. Iterating over range(len(chat) - 1) would never look at
# the final row, silently dropping the most recent message.
for talker, created, c in zip(chat['talker'], chat['createTime'],
                              chat['content']):
    if talker != myGirl:
        continue
    # presumably createTime is in milliseconds and this yields seconds;
    # confirm against the export format
    t = created // 1000
    chat_time.append(t)
    chat_content.append(c)
    chat_all.append([t, c])

chat_all = sorted(chat_all, key=itemgetter(0))  # order by timestamp

# Keep only runs of characters in the range U+4E00-U+9FA5 (Chinese text).
# Non-string cells are skipped for the join only (pandas yields float NaN
# for an empty cell), so a blank message cannot raise TypeError while
# chat_time and chat_content stay index-aligned.
result = re.compile(r"[\u4e00-\u9fa5]+").findall(
    " ".join(text for text in chat_content if isinstance(text, str)))
chat_record = " ".join(result)
''' 转换时间格式 '''
def to_hour(t):
    """Convert a Unix timestamp to a fractional hour of the local day.

    Args:
        t: Seconds since the epoch, as accepted by ``time.localtime``.

    Returns:
        The local hour plus the minute expressed as a fraction of an hour,
        rounded to two decimals (13:30 gives 13.5). Seconds are ignored.
    """
    local = time.localtime(t)
    return round(local.tm_hour + local.tm_min / 60, 2)
# Fractional local hour (see to_hour) of every collected message.
hour_set = [to_hour(i) for i in chat_time]
'''消息所处时间段统计'''
# Plot how the messages are distributed over the hours of the day and save
# the figure as chat_time.png.
print('\n.......................\n开始画图\n.......................')
myfont = FontProperties(fname=r'C:\Windows\Fonts\MSYH.TTC', size=22)  # font used for the title
myfont2 = FontProperties(fname=r'C:\Windows\Fonts\MSYH.TTC', size=18)  # font used for the axis labels
sns.set_style('darkgrid')  # dark background with grid lines
# Distribution plot of the message hours; the second positional argument of
# distplot is the number of bins (24).
sns.distplot(hour_set, 24, color='lightcoral')
plt.xticks(np.arange(0, 25, 1.0), fontsize=15)  # one tick per hour, 0 through 24
plt.yticks(fontsize=15)
plt.title('聊天时间分布', fontproperties=myfont)
plt.xlabel('时间段', fontproperties=myfont2)
# NOTE(review): the y label repeats the title text - confirm this is intended.
plt.ylabel('聊天时间分布', fontproperties=myfont2)
# Resize the current figure to 15 x 8 inches and save it before showing it.
fig = plt.gcf()
fig.set_size_inches(15, 8)
fig.savefig('chat_time.png', dpi=100)
plt.show()
print('\n.......................\n画图结束\n.......................')
''' 聊天时段分布 '''
print('\n.......................\n开始聊天时段统计\n.......................')
labels = ['凌晨2点至6点', '6点至10点', '10点至14点',
          '14点至18点', '18点至22点', '22点至次日凌晨2点']
# One counter per label. The first five slots are the half-open four-hour
# windows starting at 2, 6, 10, 14 and 18; every other hour (22:00 up to
# 02:00) falls into the last slot.
time_slice = [0] * 6
# [timestamp, text] of every message that falls into the 02:00-06:00 slot.
deep_night = []
slot_starts = (2, 6, 10, 14, 18)
for position, hour in enumerate(hour_set):
    slot = next(
        (index for index, start in enumerate(slot_starts)
         if start <= hour < start + 4),
        5,
    )
    time_slice[slot] += 1
    if slot == 0:
        deep_night.append([chat_time[position], chat_content[position]])
# Map each label to its counter, in label order.
time_distribution = dict(zip(labels, time_slice))
print(time_distribution)
''' 聊天记录词云生成 '''
print('\n..........\n开始生成词云\n............\n')
# Image handed to WordCloud as its mask.
mask = imageio.imread("heart.png")
# Segment the extracted Chinese text into tokens.
ls = jieba.lcut(chat_record)
# Filler words that should not appear in the picture.
words = ["什么", "这么", "那个", "不是", "这个", "然后", "现在"]
# Blank out the filler words in place, keeping every other token untouched.
ls[:] = [" " if token in words else token for token in ls]
txt = "/".join(ls)
cloud_settings = {
    "font_path": "msyh.ttc",
    "width": 1024,
    "height": 800,
    "background_color": "white",
    "mask": mask,
}
w = wordcloud.WordCloud(**cloud_settings)
w.generate(txt)
w.to_file("chat.png")
print('\n..........\n消息记录词云图生成完毕\n............\n')