# 1. Merge data files (合并数据文件)
import os
import os.path #文件夹遍历函数
# Concatenate every regular file found directly under ./raw_data into
# result.txt (created or truncated in the current directory). A newline is
# written after each file so the contents of adjacent files stay separated.
files = os.listdir('./raw_data')  # entry names only, in arbitrary order
# Context managers guarantee that the output handle and every input handle
# are closed, even when reading one of the inputs raises.
with open('result.txt', 'w') as f:
    for file in files:
        filepath = os.path.join('./raw_data', file)
        if not os.path.isfile(filepath):
            # listdir() also returns sub-directories; open() would fail on them.
            continue
        with open(filepath) as source:
            # A text file iterates line by line, so this copies it verbatim.
            f.writelines(source)
        f.write('\n')
# 2. Word-frequency statistics (词频统计)
import re
import jieba
from collections import Counter
import csv
# Read the merged corpus into memory.
# NOTE(review): the merge step earlier in this file writes 'result.txt',
# while this step reads 'all_data.txt' as GBK — confirm the file is
# renamed/re-encoded in between.
with open('all_data.txt', encoding="gbk") as corpus_file:
    content = corpus_file.read()
# Data cleaning: delete every run of non-word characters (punctuation,
# symbols, spaces and line breaks). `\W` already matches newlines and spaces,
# so a single pass gives the same result as removing newlines, turning
# symbols into spaces and then removing the spaces.
content = re.sub(r'\W+', '', content)
# Word segmentation: jieba.cut() yields the tokens lazily; materialise them.
seg_list = list(jieba.cut(content))
# Stop-word removal: stopwords.txt holds one stop word per line (UTF-8).
# (Per the original author's note, the first entry is a single space —
# not verifiable from this file.)
with open('stopwords.txt', encoding="utf-8") as stopword_file:
    stopwords = stopword_file.read().split('\n')  # list, one entry per line
# A set makes each membership test O(1) instead of scanning the whole list
# once per token.
stopword_set = set(stopwords)
final_content = [seg for seg in seg_list if seg not in stopword_set]
# Word-frequency count over the filtered tokens.
counting_words = Counter(final_content)
# most_common(50) returns up to 50 (word, count) pairs already ordered from
# the most frequent to the least, so no additional sort is required.
common_words = counting_words.most_common(50)
# Write the frequencies to a CSV file. newline='' lets the csv writer control
# line endings itself (otherwise blank rows can appear on Windows).
with open('word_excel.csv', 'w', encoding='utf-8', newline='') as csvfile:
    write = csv.writer(csvfile)  # writer object used to emit each row
    write.writerow(['词组', '词频'])  # header row: word, frequency
    write.writerows(common_words)  # one row per (word, count) pair