System: Windows 7 (32-bit)
Segmentation library: PyNLPIR
IDE: PyCharm
Purpose: implement the complete multi-stage text preprocessing pipeline, covering word segmentation, stopword filtering, term frequency (TF) counting, feature selection, and text representation, and export the result to the .arff format that WEKA can read.
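Before the full script, here is a minimal sketch of the core PyNLPIR calls it relies on: pynlpir.open() loads the NLPIR licence and data files, and by default pynlpir.segment() returns (word, part-of-speech) tuples, which is why the script below keeps only item[0].
# Minimal PyNLPIR usage sketch (Python 2, same as the full script below)
import pynlpir

pynlpir.open()                                # load the NLPIR licence and data files
for word, pos in pynlpir.segment(u'今天天气不错'):
    print (u'%s / %s' % (word, pos))          # each item is a (word, POS) tuple
pynlpir.close()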
Straight to the full script:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
功能:PyNLPIR文本预处理
过程:文本分词,过滤停用词,词频统计,特征选择,文本表示
时间:2016年8月25日10:52:43
"""
import pynlpir
import codecs
import math
pynlpir.open()
# Word segmentation
typelist = [u"财经", u"IT", u"健康", u"体育", u"旅游", u"教育", u"招聘", u"文化", u"军事"]
typetxt = codecs.open('C:\\Users\\Administrator\\Desktop\\txttype.txt', 'a', encoding='utf-8')
wordseg_result = codecs.open('C:\\Users\\Administrator\\Desktop\\wordseg_result.txt', 'a', encoding='utf-8')
allresult = []
for j in range(1, 10):              # 9 category folders
    for i in range(10, 510):        # 500 texts per category
        typetxt.write(typelist[j-1] + "\n")
        s = u""
        singletext_result = []
        print (u'Segmenting text %s in folder %s .....' % (i, j))
        f = codecs.open('C:\\Users\\Administrator\\Desktop\\textmining_experiment2\\Word Segment\\traintxt500\\%d\\%d.txt' % (j, i), 'r', "gb18030")
        for line in f:
            s += line.strip()       # codecs already decodes to unicode; no re-encoding needed
        f.close()
        for item in pynlpir.segment(s):
            singletext_result.append(item[0])   # keep the word, drop the POS tag
        allresult.append(singletext_result)
typetxt.close()
print (u'Category labels extracted! Results written to txttype.txt on the desktop!')
# Optionally print the results directly:
#for singletext_result in allresult:
#    for item in singletext_result:
#        print item
# Write all results to one txt file, one document per line
for singletext_result in allresult:
    for item in singletext_result:
        wordseg_result.write(item + '\t')
    wordseg_result.write('\n')
wordseg_result.close()
print (u'Segmentation done! Results written to wordseg_result.txt on the desktop!' + '\n')
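# Note: pynlpir.segment(s, pos_tagging=False) returns a plain list of words,
# which would make the item[0] indexing above unnecessary.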
# Stopword filtering
stopwords = []
delstopwords_alltxt = []
st = codecs.open('C:\\Users\\Administrator\\PycharmProjects\\Newtextmining\\File\\stopwords.txt', 'r', encoding='utf-8')
delstopwords_result = codecs.open('C:\\Users\\Administrator\\Desktop\\delstopwords_result.txt', 'a', encoding='utf-8')
for line in st:
    stopwords.append(line.strip())
st.close()
stopwords = set(stopwords)          # set membership is O(1) instead of O(n)
print (u'Filtering stopwords......')
for singletext_result in allresult:
    delstopwords_singletxt = []
    for word in singletext_result:
        word = word.strip()
        if word not in stopwords:
            # keep only words made up entirely of CJK characters
            if word and all(u'\u4e00' <= ch <= u'\u9fa5' for ch in word):
                delstopwords_singletxt.append(word)
    delstopwords_alltxt.append(delstopwords_singletxt)
for delstopwords_singletxt in delstopwords_alltxt:
    for everyword in delstopwords_singletxt:
        delstopwords_result.write(everyword + '\t')
    delstopwords_result.write('\n')
delstopwords_result.close()
print (u'Stopword filtering done! Results written to delstopwords_result.txt on the desktop!' + '\n')
# Absolute term frequency (TF) counting
getTF_result = codecs.open('C:\\Users\\Administrator\\Desktop\\getTF_result.txt', 'a', encoding='utf-8')
print (u'Counting TF......')
for delstopwords_singletxt in delstopwords_alltxt:
    getTF_singletxt_dic = {}
    for everyword in delstopwords_singletxt:
        everyword = everyword.strip()
        if everyword in getTF_singletxt_dic:
            getTF_singletxt_dic[everyword] += 1
        else:
            getTF_singletxt_dic[everyword] = 1
    # sort by count, largest first
    getTF_singletxt = sorted(getTF_singletxt_dic.items(), key=lambda d: d[1], reverse=True)
    for a, b in getTF_singletxt:
        getTF_result.write(a + '\t' + str(b) + '\t')
    getTF_result.write('\n')
getTF_result.close()
print (u'TF counting done! Results written to getTF_result.txt on the desktop!' + '\n')
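# getTF_result.txt layout: one document per line, alternating word<TAB>count pairs,
# sorted by count in descending order -- so the second field of each line is the
# highest raw TF in that document, which the TF-IDF step below uses for normalization.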
# Feature selection
# Document frequency (DF) across all categories
alltext = []
wordlist = []
delstopwords_result = codecs.open('C:\\Users\\Administrator\\Desktop\\delstopwords_result.txt', 'r', encoding='utf-8')
for line in delstopwords_result:
    words = line.strip('\n').split('\t')
    alltext.append(set(w for w in words if w))   # one token set per document, for exact DF lookups
    for word in words:
        if word:
            wordlist.append(word)
delstopwords_result.close()
print (u'Total tokens in the corpus: %d' % len(wordlist))
print (u'Number of documents: %d' % len(alltext))
print (u'Vocabulary size: %d' % len(set(wordlist)))
print ('\n' + u'Computing DF over all categories......')
word_df = []
for word in set(wordlist):
    count = 0
    for words in alltext:
        if word in words:           # exact token membership (alltext holds sets)
            count += 1
    word_df.append([word, str(count)])   # stored as [word, DF]
# output, sorted by DF from largest to smallest
word_df.sort(key=lambda x: int(x[1]), reverse=True)
b = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result.txt', 'w', encoding='utf-8')
for item in word_df:
    for word in item:
        b.write(word + '\t')
    b.write('\n')
b.close()
# keep only words longer than one character
b = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result.txt', 'r', encoding='utf-8')
c = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'w', encoding='utf-8')
for line in b:
    line = line.split('\t')
    if len(line[0]) > 1:
        c.write(line[0] + '\t' + line[1] + '\n')
b.close()
c.close()
print (u'All-category DF counting done! Results written to DF_allclass_result1.txt on the desktop!' + '\n')
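# DF (document frequency) of a term = the number of documents containing it at
# least once; terms that appear in almost every document or almost none are poor
# class discriminators, which is what the information-gain step below quantifies.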
# Feature selection
# Per-category DF
print (u'Computing per-category DF......')
word_df2 = []
d = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'r', encoding='utf-8')
for line in d:
    line = line.split()
    word = line[0]
    # the 4500 documents are stored in category order, 500 per category
    counts = []
    for k in range(9):
        count = 0
        for words in alltext[k*500:(k+1)*500]:
            if word in words:
                count += 1
        counts.append(str(count))
    word_df2.append([word] + counts)    # stored as [word, DF_1, ..., DF_9]
d.close()
# output
e = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_singleclass_result.txt', 'w', encoding='utf-8')
for item in word_df2:
    for term in item:
        e.write(term + '\t')
    e.write('\n')
e.close()
print (u'Per-category DF counting done! Results written to DF_singleclass_result.txt on the desktop!' + '\n')
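# Information gain of a term t over classes C:
#   IG(t) = H(C) - [ P(t) * H(C|t) + P(~t) * H(C|~t) ]
# where H(C) = -sum_c P(c) * log2 P(c). With 9 balanced classes of 500 documents
# each, H(C) = log2(9). P(t) is the fraction of documents containing t, and the
# conditional entropies use the per-category DF counts computed above.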
# Information gain (IG) for each candidate feature
print (u'Computing information gain......')
IG = []
N = float(len(alltext))             # 4500 documents
Entropy = math.log(9, 2)            # H(C) for 9 equally likely classes
g = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'r', encoding='utf-8')
h = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_singleclass_result.txt', 'r', encoding='utf-8')
for line in g:
    line = line.split()
    word = line[0]
    df = float(line[1])
    # the per-category DF file was written in the same word order, one line per word
    class_df = [float(x) for x in h.readline().split()[1:10]]
    PT = df / N                     # P(t): probability a document contains the word
    PT_1 = 1.0 - PT
    # H(C|t): class entropy among documents that contain the word
    E1 = 0.0
    for n in class_df:
        if n > 0:
            p = n / df
            E1 -= p * math.log(p, 2)
    # H(C|~t): class entropy among documents that do not contain the word
    E2 = 0.0
    for n in class_df:
        m = 500.0 - n
        if m > 0 and N - df > 0:
            p = m / (N - df)
            E2 -= p * math.log(p, 2)
    exEntropy = PT * E1 + PT_1 * E2
    IG_value = Entropy - exEntropy
    IG.append([word, str(IG_value)])    # stored as [word, IG_value]
g.close()
h.close()
IG.sort(key=lambda x: float(x[1]), reverse=True)   # largest IG first
i = codecs.open('C:\\Users\\Administrator\\Desktop\\IG_value.txt', 'w', encoding='utf-8')
for item in IG:
    for word in item:
        i.write(word + '\t')
    i.write('\n')
i.close()
print (u'Information gain computed! Results written to IG_value.txt on the desktop!' + '\n')
print (u'Selecting feature words......')
j = codecs.open('C:\\Users\\Administrator\\Desktop\\IG_value.txt', 'r', encoding='utf-8')
k = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords.txt', 'w', encoding='utf-8')
for line in j:
    line = line.split()
    if float(line[1]) > 0:          # keep words with positive information gain
        k.write(line[0] + '\n')
k.close()
j.close()
print (u'Feature words selected! Results written to FeatureWords.txt on the desktop!' + '\n')
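# The cutoff above is a placeholder; a more selective choice would be to keep
# only the top-N words by IG, e.g. the first 1000 lines of IG_value.txt,
# since that file is already sorted in descending order.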
# Attach the all-category DF to each selected feature word
f1 = codecs.open('C:\\Users\\Administrator\\Desktop\\DF_allclass_result1.txt', 'r', encoding='utf-8')
f2 = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords.txt', 'r', encoding='utf-8')
FeatureWords_value = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords_value.txt', 'w', encoding='utf-8')
dic = {}
for line in f1:
    line = line.strip('\n').split('\t')
    dic[line[0]] = line[1]          # word -> DF
f1.close()
for word in f2:
    word = word.strip()
    if word in dic:
        FeatureWords_value.write(word + '\t' + dic[word] + '\n')
f2.close()
FeatureWords_value.close()
# Document vectorization
print (u'Vectorizing the documents......')
feture_word = []
feture_word_dic2 = {}
FeatureWords_value = codecs.open('C:\\Users\\Administrator\\Desktop\\FeatureWords_value.txt', 'r', encoding='utf-8')
for line in FeatureWords_value:
    line = line.split()
    IDF = math.log(4500 / float(line[1]), 10)   # IDF = log10(N / DF)
    feture_word.append(line[0])
    feture_word_dic2[line[0]] = IDF
FeatureWords_value.close()
getTF_result = codecs.open('C:\\Users\\Administrator\\Desktop\\getTF_result.txt', 'r', encoding='utf-8')
all = []
for line in getTF_result:
    single = line.strip('\n').split('\t')   # flat list: word1, count1, word2, count2, ...
    all.append(single)
getTF_result.close()
#for single in all:
#    print single
#    for word in single:
#        print word
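# TF-IDF weight of word w in document d:
#   tf_idf(w, d) = (tf(w, d) / max_tf(d)) * log10(N / DF(w))
# where max_tf(d) is the highest raw count in d (the first pair of each
# getTF_result.txt line, since the pairs are sorted by count).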
alltext_vector = []
for single in all:
    vector = []
    tmax = float(single[1]) if len(single) > 1 else 1.0   # highest raw TF in this document
    for word in feture_word:
        if word in single:
            inde = single.index(word)
            t = single[inde + 1]
        else:
            t = 0
        tf_idf = (float(t) / tmax) * feture_word_dic2[word]
        vector.append(tf_idf)
    alltext_vector.append(vector)
# Append the class label to each document vector (500 documents per category)
labels = ['Economy', 'IT', 'Health', 'PE', 'Travel', 'Education', 'Employment', 'Culture', 'Military']
for k, label in enumerate(labels):
    for vector in alltext_vector[k*500:(k+1)*500]:
        vector.append(label)
#for vector in alltext_vector:
#    print vector
#    for value in vector:
#        print value
data = codecs.open('C:\\Users\\Administrator\\Desktop\\data.arff', 'w', encoding='utf-8')
data.write(u'@relation sougoucorpus\n\n')
for everyword in feture_word:
    data.write(u'@attribute ' + everyword + u' numeric\n')
data.write(u'@attribute type {Economy,IT,Health,PE,Travel,Education,Employment,Culture,Military}\n\n@data\n')
for vector in alltext_vector:
    for value in vector[:-1]:
        data.write(str(value) + ',')
    data.write(vector[-1] + '\n')
data.close()
print (u'Vectorization done! Results written to data.arff on the desktop!' + '\n')
print (u'Text preprocessing finished!' + '\n')
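To sanity-check data.arff before loading it into the WEKA GUI, one option is to parse it back in Python. A minimal sketch, assuming SciPy is installed (scipy.io.arff ships with SciPy); if the Chinese attribute names trip the parser, quoting each name in its @attribute line is a possible workaround:
# Hypothetical ARFF sanity check, separate from the pipeline above
from scipy.io import arff

data, meta = arff.loadarff('C:\\Users\\Administrator\\Desktop\\data.arff')
print (len(data))            # expected: 4500 instances
print (meta.names()[-1])     # expected: 'type', the class attribute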