import re
from collections import Counter
from datetime import datetime, timedelta

import jieba
import matplotlib.pyplot as plt
import numpy as np
from snownlp import SnowNLP
from wordcloud import WordCloud
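
# Pipeline: parse an exported chat log, score message sentiment with SnowNLP,
# measure gaps between consecutive messages, count word frequencies with
# jieba, and render the results as a word cloud.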
def parse_chat_records(file_path):
    """Parse an exported chat log into (timestamp, user, message) tuples,
    keeping only messages from 2023 onwards."""
    with open(file_path, encoding='utf-8') as f:
        content = f.read()
    all_messages = re.findall(
        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (.*?)\n(.*?)\n', content)
    messages = []
    for date_time, user, message in all_messages:
        if int(date_time[:4]) >= 2023:
            messages.append((date_time, user, message))
    return messages
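
# Assumed two-line record layout matched by the regex above, e.g.:
#   2023-05-01 09:30:00 Alice
#   早上好!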
def sentiment_analysis(messages):
    """Score each non-empty message with SnowNLP.

    SnowNLP returns a float in [0, 1]; values near 1 read as positive."""
    sentiment_scores = []
    for _, _, message in messages:
        if message.strip():
            sentiment_scores.append(SnowNLP(message).sentiments)
    return sentiment_scores
def time_differences(messages):
    """Seconds between consecutive messages, skipping gaps over 8 hours
    (treated as a break between conversations)."""
    diffs = []
    for i in range(1, len(messages)):
        time1 = datetime.strptime(messages[i - 1][0], '%Y-%m-%d %H:%M:%S')
        time2 = datetime.strptime(messages[i][0], '%Y-%m-%d %H:%M:%S')
        delta = time2 - time1
        # total_seconds() instead of .seconds, which drops the days component
        # and silently misreports negative (out-of-order) deltas.
        if timedelta(0) <= delta <= timedelta(hours=8):
            diffs.append(delta.total_seconds())
    return diffs
# Generic filler words excluded on top of any user-supplied stopwords;
# '图片' (image) and '表情' (sticker) cover the placeholder text that chat
# exports typically insert in place of media.
FILLER_WORDS = {'图片', '表情', '这些', '那些', '就是', '那个', '之前', '一个', '现在'}

def word_frequency(messages, stopwords_path=None):
    """Tokenise messages with jieba and count words, most frequent first."""
    stopwords = set()
    if stopwords_path:
        with open(stopwords_path, encoding='utf-8') as f:
            stopwords = {line.strip() for line in f}
    words = []
    for _, _, message in messages:
        for word in jieba.cut(message):
            # Keep multi-character tokens that survive both filters.
            if len(word) > 1 and word not in stopwords and word not in FILLER_WORDS:
                words.append(word)
    return Counter(words).most_common()
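
# Assumed usage with an optional stopword file (one word per line);
# 'stopwords.txt' is a hypothetical path, not part of the original script:
#   word_frequency(messages, stopwords_path='stopwords.txt')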
def generate_wordcloud(words, file_path):
    """Render (word, count) pairs as a word cloud image saved to file_path."""
    if not words:
        print('Not enough words to generate a word cloud.')
        return
    # simhei.ttf must be available so Chinese characters render correctly.
    wc = WordCloud(font_path='simhei.ttf', background_color='white',
                   width=1920, height=1080)
    wc.generate_from_frequencies(dict(words))
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(file_path, bbox_inches='tight')
    plt.close()  # release the figure so later plots start clean
def generate_pie_chart(labels, values, file_path):
    """Save a labelled pie chart with one-decimal percentage annotations."""
    plt.figure()
    plt.pie(values, labels=labels, autopct='%1.1f%%')
    plt.savefig(file_path)
    plt.close()
def analyze_chat_records(file_path):
    """Run the full pipeline and print summary statistics."""
    messages = parse_chat_records(file_path)
    sentiment_scores = sentiment_analysis(messages)
    time_diffs = time_differences(messages)
    words = word_frequency(messages)
    generate_wordcloud(words, 'wordcloud.png')
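    # generate_pie_chart is defined above but never called; as an illustrative
    # sketch, bucket the sentiment scores into three bands and plot their
    # share. The 0.4/0.6 thresholds are assumptions, not from the original.
    counts = [
        sum(s > 0.6 for s in sentiment_scores),          # positive
        sum(0.4 <= s <= 0.6 for s in sentiment_scores),  # neutral
        sum(s < 0.4 for s in sentiment_scores),          # negative
    ]
    generate_pie_chart(['positive', 'neutral', 'negative'], counts,
                       'sentiment_pie.png')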
    # Guard empty inputs so np.mean does not return NaN with a warning.
    if sentiment_scores:
        print('Sentiment analysis (mean score):', np.mean(sentiment_scores))
    if time_diffs:
        print('Mean gap between consecutive messages (seconds):', np.mean(time_diffs))
    print('Top 10 words by frequency:', words[:10])
if __name__ == '__main__':
    analyze_chat_records('test.txt')