TF-IDF: http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
CHI: https://blog.csdn.net/hubin232/article/details/81272126 (fairly recent)
What sklearn computes is a tf-idf vector for each individual document; max_features simply sorts all terms by their overall frequency in descending order and keeps only the top max_features of them. Having also looked at https://github.com/chenfei0328/BayesProject beforehand, my thinking got stuck on building one big matrix over all documents and then reducing its dimensionality; only after a pointer from someone more experienced did I change the approach.
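For context, the sklearn route looks roughly like this; a minimal sketch with TfidfVectorizer, where the corpus and the max_features value are made up purely for illustration:

# Minimal sketch of the sklearn approach mentioned above (toy corpus, not the
# project's data). TfidfVectorizer builds one document-term matrix over the
# whole corpus; max_features keeps only the top-N terms by corpus frequency.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are pets",
]
vectorizer = TfidfVectorizer(max_features=5)   # keep only the 5 most frequent terms
X = vectorizer.fit_transform(corpus)           # one tf-idf row vector per document
print(vectorizer.get_feature_names_out())      # the retained vocabulary
print(X.toarray())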
The rough idea of our approach is what the code below walks through.
(According to the teacher, CHI*TF-IDF can also be used for dimensionality reduction, and other feature-selection methods such as IG work too.)
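For reference, the standard forms of the two weights (my own summary; the linked posts derive them properly). With N documents in total, df(t) the number of documents containing term t, and, for a term t and class c, the 2x2 counts A (docs in c containing t), B (docs outside c containing t), C (docs in c without t) and D (docs outside c without t):

tfidf(t, d) = tf(t, d) * log( N / (1 + df(t)) )

chi2(t, c) = N * (A*D - B*C)^2 / ( (A+C) * (B+D) * (A+B) * (C+D) )

(A+C), (B+D) and N are the same for every term within one class, so for ranking terms inside a class they can be dropped, leaving (A*D - B*C)^2 / ( (A+B) * (C+D) ); that is the form the CHI code below uses. The tf-idf code below is likewise computed per category: tf is the word's frequency inside the category and the document counts are taken within the category.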
While writing this the counts kept coming out wrong: every word in the same class ended up with identical numbers. It turned out I had not kept copying and referencing straight.
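A toy illustration of that pitfall (made-up dicts, not the project code): setdefault with a shared dict stores the same object under every key, so all entries change together, while .copy() gives each key its own dict.

# Toy illustration of the reference-vs-copy pitfall mentioned above.
template = {'tf': 0}

shared = {}
for w in ['a', 'a', 'b']:
    shared.setdefault(w, template)        # every key ends up pointing at the SAME dict
    shared[w]['tf'] += 1
print(shared)   # {'a': {'tf': 3}, 'b': {'tf': 3}} -- the counts bleed into each other

template = {'tf': 0}                      # reset before the correct version
fixed = {}
for w in ['a', 'a', 'b']:
    fixed.setdefault(w, template.copy())  # each key gets its own independent dict
    fixed[w]['tf'] += 1
print(fixed)    # {'a': {'tf': 2}, 'b': {'tf': 1}}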
I won't bother pasting the number of documents in each class here.
# Count term frequencies and document counts so that CHI can later take
# term frequency into account.
# wordDict[word][category] = {'tf': occurrences in that category,
#                             'idf': number of documents in that category containing the word}
# wordDict['total'][category] = total number of words in that category
import json
import os
import time
from collections import Counter

def count_words(cut_path):
    cate = os.listdir(cut_path)
    wordDict = {'total': {}}
    for i, category in enumerate(cate):
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), '>', '=' * 40 + '[' + category + ']' + '=' * 40)
        wordInfo = {'tf': 0, 'idf': 0}   # per-category counters; 'idf' is really the document frequency
        wordsLen = 0
        file_path = cut_path + category + '/'
        file_list = os.listdir(file_path)
        for j, file_name in enumerate(file_list):
            full_path = file_path + file_name
            with open(full_path, "r", encoding='utf-8') as f:
                content = f.read()
            words = content.split()
            wordsLen += len(words)
            wordCounter = Counter(words)
            for word, count in wordCounter.items():
                wordDict.setdefault(word, {})
                # .copy() matters here: otherwise every word/category shares one wordInfo dict
                wordDict[word].setdefault(category, wordInfo.copy())
                wordDict[word][category]['tf'] += count
                wordDict[word][category]['idf'] += 1   # once per document the word appears in
        wordDict['total'][category] = wordsLen
    return wordDict

# cut_path is the directory holding the segmented corpus, one sub-directory per
# category (the actual path isn't recorded in these notes).
wordDict = count_words(cut_path)
fname = 'C:/lyr/DM/feature_reduction/wordDict.json'
with open(fname, 'w') as fp:
    json.dump(wordDict, fp)
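After count_words the dict is shaped roughly like this (the category names and numbers are invented, purely to show the nesting):

# Example of the shape of wordDict (invented values, for illustration only):
example_wordDict = {
    'total': {'sports': 182340, 'finance': 201187},             # total words per category
    '比赛': {'sports':  {'tf': 389, 'idf': 201},                 # tf = occurrences, idf = docs containing it
             'finance': {'tf': 3,   'idf': 2}},
    '股票': {'finance': {'tf': 412, 'idf': 157}},
}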
The dictionary built above still seems to have some problems... so, just to get things running, I simply ignored the exceptions. The resulting length is only 1 smaller than before.
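gen_weightDict below relies on two things that aren't pasted in these notes: ddict, the wordDict loaded back from wordDict.json, and docLen, the number of documents per category plus an 'All' grand total. A minimal sketch of how they could be set up, assuming the same cut_path corpus directory as above:

# Assumed setup for gen_weightDict (my sketch, not the original snippet):
# ddict  - the wordDict saved above, loaded back from JSON
# docLen - documents per category, plus an 'All' grand total
import json
import os

with open('C:/lyr/DM/feature_reduction/wordDict.json', 'r') as fp:
    ddict = json.load(fp)

docLen = {category: len(os.listdir(cut_path + category + '/'))
          for category in os.listdir(cut_path)}
docLen['All'] = sum(docLen.values())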
from math import log

# weightDict[word][category] = {'chi': ..., 'tfidf': ...}; compute both in one pass.
weightDict = {}
featureDict = {'chi': 0, 'tfidf': 0}

def gen_weightDict():
    for (word, value) in ddict.items():
        if word != 'total':
            weightDict.setdefault(word, {})
            totalCount = 0   # A + B: total occurrences of the word over all categories (term frequency, per the note above)
            try:
                for (cate, times) in value.items():
                    totalCount += times['tf']
            except Exception as ex:
                print(word, value)
            try:
                for (cate, times) in value.items():
                    weightDict[word].setdefault(cate, featureDict.copy())
                    # scale by 100 so the values are neither too large nor too small
                    weightDict[word][cate]['tfidf'] = (times['tf'] / ddict['total'][cate]) * 100 * log(docLen[cate] / (1 + times['idf']))
                    # times['idf'] plays the role of A (documents in this category containing the word)
                    not_in_Class = totalCount - times['idf']       # B
                    not_has_word = docLen[cate] - times['idf']     # C
                    d = docLen['All'] - totalCount - not_has_word  # D
                    # simplified CHI: (A*D - B*C)^2 / ((A+B)*(C+D)); the /100 only rescales
                    chi = pow((times['idf'] * d - not_in_Class * not_has_word), 2) / (100 * totalCount * (d + not_has_word))
                    weightDict[word][cate]['chi'] = chi
            except Exception as ex:
                # swallow the odd broken entry so the whole thing can run
                pass

gen_weightDict()
fname = 'C:/lyr/DM/feature_reduction/weigthDict.json'
with open(fname, 'w') as fp:
    json.dump(weightDict, fp)
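gen_weightDict leaves the weights nested word-first (invented numbers, just to show the nesting that the next step flips around):

# Shape of weightDict after gen_weightDict (invented values, for illustration only):
example_weightDict = {
    '比赛': {'sports':  {'chi': 1523.7, 'tfidf': 0.84},
             'finance': {'chi': 2.1,    'tfidf': 0.003}},
    '股票': {'finance': {'chi': 1871.2, 'tfidf': 0.91}},
}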
Then I changed how the dict is organised; otherwise it could not be sorted per category (sketched below).
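Roughly what that reorganisation looks like; this is my reconstruction, since the original snippet isn't in these notes. It flips weightDict[word][cate] into new_weigthDict[cate][word] and, following the CHI*TF-IDF idea mentioned at the top, also fills in the 'both' score that gen_featureDict sorts on:

# Reconstruction (not the original snippet): regroup by category so each
# category's words can be sorted on their own, and add the combined score.
new_weigthDict = {}
for word, cates in weightDict.items():
    for cate, weight in cates.items():
        new_weigthDict.setdefault(cate, {})
        new_weigthDict[cate][word] = {
            'chi': weight['chi'],
            'tfidf': weight['tfidf'],
            'both': weight['chi'] * weight['tfidf'],   # the CHI*TF-IDF combination
        }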
I kept 8000 words per category.
resultDict = {'chi': {}, 'tfidf': {}, 'both': {}}   # keep all three kinds of result
maxCount = 8000

def gen_featureDict(maxCount):
    for cate, words in new_weigthDict.items():
        print(cate)
        # top maxCount words of this category under each weighting
        chiMix = dict(sorted(words.items(), key=lambda x: x[1]['chi'], reverse=True)[:maxCount])
        tfidfMix = dict(sorted(words.items(), key=lambda x: x[1]['tfidf'], reverse=True)[:maxCount])
        bothMix = dict(sorted(words.items(), key=lambda x: x[1]['both'], reverse=True)[:maxCount])
        resultDict['chi'].setdefault(cate, {})
        resultDict['tfidf'].setdefault(cate, {})
        resultDict['both'].setdefault(cate, {})
        chiSum, tfidfSum, bothSum = 0, 0, 0
        # I'd like to split the other data that got mixed in here out on its own
        for word, weight in chiMix.items():
            chiSum += weight['chi']   # it fits, but the numbers just feel huge
            resultDict['chi'][cate].setdefault(word, weight['chi'])
        for word, weight in tfidfMix.items():
            tfidfSum += weight['tfidf']
            resultDict['tfidf'][cate].setdefault(word, weight['tfidf'])
        for word, weight in bothMix.items():
            bothSum += weight['both']
            resultDict['both'][cate].setdefault(word, weight['both'])
        # normalise each category's weights so they land between 0 and 1,
        # otherwise multiplying the chi values later blows up
        for word, weight in chiMix.items():
            resultDict['chi'][cate][word] /= chiSum
        for word, weight in tfidfMix.items():
            resultDict['tfidf'][cate][word] /= tfidfSum
        for word, weight in bothMix.items():
            resultDict['both'][cate][word] /= bothSum

gen_featureDict(maxCount)
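Finally, I would expect the selected features to be dumped the same way as the earlier dicts; the filename below is my guess, not something recorded in the notes:

# Save the selected features alongside the other dicts (filename is only illustrative).
fname = 'C:/lyr/DM/feature_reduction/resultDict.json'
with open(fname, 'w') as fp:
    json.dump(resultDict, fp)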