This is based on source code I found on GitHub and then modified myself; I'm recording it here.
The script segments the documents shown below and counts word frequencies, then saves the generated frequency tables (CSV files that open in Excel) and the segmented text into a result folder.
The text files to be segmented:
The files generated at the end:
Batch file processing function:
It mainly uses the os module to name the newly generated files so that a whole folder can be processed in one pass (a note on the file-name handling follows the function below).
def word_frequency_analysis(path):
    files = os.listdir(path)  # files is a list of all file names under path
    result_dir = os.path.abspath(os.path.join(path, 'result'))  # path of the result folder
    csv_all = os.path.abspath(os.path.join(result_dir, 'csv_all.csv'))  # not used in this version
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)  # create the result folder if it does not exist yet
    for filename in files:
        if not fnmatch.fnmatch(filename, '*.txt'):
            continue  # only process .txt files
        txt_path = os.path.join(path, filename)
        txt_content = open(txt_path, 'r', encoding='utf-8').read()
        field_name = filename[:-4] + '年'  # e.g. '2014.txt' -> '2014年', '2015.txt' -> '2015年'
        header_filed.append(field_name)  # header_filed is a module-level list (see the complete code)
        filename_fulltext = filename[:-4] + '_all.txt'
        filename_counter = filename[:-4] + '_tj.csv'
        # filename_key = filename[:-4] + '_hy_tj.csv'
        txt_to_all = os.path.join(result_dir, filename_fulltext)
        txt_to_counter = os.path.join(result_dir, filename_counter)
        # txt_to_key = os.path.join(result_dir, filename_key)
        text_cutted = jiebaCutText(txt_content)
        text_cleared = clearText(text_cutted)
        countwords(text_cleared, txt_to_counter)
        newfile = open(txt_to_all, 'w', encoding='utf-8')
        newfile.write(text_cleared)
        newfile.close()
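Note on the file names: the filename[:-4] slicing above works because every input name ends in exactly '.txt'; if that assumption ever changes, os.path.splitext is the safer way to strip the extension. A minimal sketch (the file name is hypothetical):
import os

filename = '2014.txt'  # hypothetical input file name
stem = os.path.splitext(filename)[0]        # '2014', works for any extension length
print(stem + '_all.txt', stem + '_tj.csv')  # -> 2014_all.txt 2014_tj.csv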
Word segmentation function:
Segmentation is done with the third-party jieba library.
def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=False)  # precise mode
    liststr = '/'.join(seg_list)
    return liststr  # the result still contains punctuation
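A quick illustration of what jiebaCutText returns, using a made-up sample sentence (the exact segmentation depends on the jieba version and dictionary):
import jieba

print('/'.join(jieba.cut('我来到北京清华大学', cut_all=False)))
# typically prints: 我/来到/北京/清华大学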
Removing punctuation and single-character words:
Only words that meet the criteria (longer than one character and containing Chinese) are kept in the list.
def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        word = myword.strip()
        if len(word) > 1 and contain_zh(word):  # drop single characters and non-Chinese tokens
            mywordlist.append(word)
    return '/'.join(mywordlist)
Checking whether a word contains Chinese characters:
Uses the re module to test whether a string contains Chinese characters.
def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # one or more CJK Unified Ideographs (U+4E00-U+9FA5)
    match = zh.search(word)
    return match
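With contain_zh and clearText defined as above (and re imported), a minimal check of the filtering on a hypothetical segmented string:
cutted = '我/来到/北京/,/清华大学/。'  # pretend output of jiebaCutText
print(clearText(cutted))
# expected: 来到/北京/清华大学 -- single characters and punctuation are dropped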
Word frequency counting function:
Uses a dict and the collections module.
def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1
    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))  # sort by count, ascending
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:  # utf-8-sig adds a BOM so Excel shows Chinese correctly; newline='' avoids blank rows
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())
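Since this section mentions the collections module, note that the hand-rolled counting loop is equivalent to collections.Counter; a small sketch with a hypothetical cleaned string:
from collections import Counter, OrderedDict

text_cleared = '来到/北京/清华大学/北京'       # hypothetical output of clearText
count_dict = Counter(text_cleared.split('/'))  # same counts as the loop above
print(OrderedDict(sorted(count_dict.items(), key=lambda x: x[1])))
# OrderedDict([('来到', 1), ('清华大学', 1), ('北京', 2)])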
The complete code:
import csv
import fnmatch
import os
import re
from collections import OrderedDict
import jieba
header_filed = []  # collects one column header per input file, e.g. '2014年'

def word_frequency_analysis(path):
    files = os.listdir(path)  # files is a list of all file names under path
    result_dir = os.path.abspath(os.path.join(path, 'result'))  # path of the result folder
    csv_all = os.path.abspath(os.path.join(result_dir, 'csv_all.csv'))  # not used in this version
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)  # create the result folder if it does not exist yet
    for filename in files:
        if not fnmatch.fnmatch(filename, '*.txt'):
            continue  # only process .txt files
        txt_path = os.path.join(path, filename)
        txt_content = open(txt_path, 'r', encoding='utf-8').read()
        field_name = filename[:-4] + '年'  # e.g. '2014.txt' -> '2014年', '2015.txt' -> '2015年'
        header_filed.append(field_name)
        filename_fulltext = filename[:-4] + '_all.txt'
        filename_counter = filename[:-4] + '_tj.csv'
        # filename_key = filename[:-4] + '_hy_tj.csv'
        txt_to_all = os.path.join(result_dir, filename_fulltext)
        txt_to_counter = os.path.join(result_dir, filename_counter)
        # txt_to_key = os.path.join(result_dir, filename_key)
        text_cutted = jiebaCutText(txt_content)
        text_cleared = clearText(text_cutted)
        countwords(text_cleared, txt_to_counter)
        newfile = open(txt_to_all, 'w', encoding='utf-8')
        newfile.write(text_cleared)
        newfile.close()

def jiebaCutText(text):
    seg_list = jieba.cut(text, cut_all=False)  # precise mode
    liststr = '/'.join(seg_list)
    return liststr  # the result still contains punctuation

def clearText(text):
    mywordlist = []
    for myword in text.split('/'):
        word = myword.strip()
        if len(word) > 1 and contain_zh(word):  # drop single characters and non-Chinese tokens
            mywordlist.append(word)
    return '/'.join(mywordlist)

def contain_zh(word):
    zh = re.compile(u'[\u4e00-\u9fa5]+')  # one or more CJK Unified Ideographs (U+4E00-U+9FA5)
    match = zh.search(word)
    return match

def countwords(text, counter_file):
    count_dict = dict()
    for item in text.split('/'):
        if item in count_dict:
            count_dict[item] += 1
        else:
            count_dict[item] = 1
    d_sorted_by_value = OrderedDict(sorted(count_dict.items(), key=lambda x: x[1]))  # sort by count, ascending
    with open(counter_file, 'w', encoding='utf-8-sig', newline='') as f:  # utf-8-sig adds a BOM so Excel shows Chinese correctly; newline='' avoids blank rows
        w = csv.writer(f)
        w.writerows(d_sorted_by_value.items())

if __name__ == '__main__':
    path = 'E:/Programe/PySeg/jieba-wordcloud-demo-master/基础数据/韶关(分年度)'
    word_frequency_analysis(path)
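To sanity-check one of the generated files, the CSV can be read back with the same csv module; the path below is hypothetical and depends on your input files:
import csv

with open('result/2014_tj.csv', 'r', encoding='utf-8-sig', newline='') as f:
    for word, count in csv.reader(f):
        print(word, count)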