任务:对10个战略新兴产业描述文档提取特征词,从而建立10个产业的特征,要求是10个产业特征词区分度和代表度越大越好。采用TF-IDF算法对文档提取特征词,一开始使用jieba自带tf-idf算法,结果不太理想,见下图,每一列为10个产业提取的特征词,红色是之间有重复的情况。
分析原因:jieba 自带的 TF-IDF 关键词提取中,tf 值虽由输入文本统计,但 idf 值取自 jieba 内置的通用语料 idf 词典,并非基于本任务的 10 个产业文档统计得到,因此对这批文档缺乏针对性,区分度不足。
自己编写TF-IDF算法,效果优于jieba,特征词的区分度和代表度都较好,见下图:
结论:针对自身任务,最好了解算法,自己编写算法实现。
附:TF-IDF算法python代码:
def tf_idf(self, corpus, topk, out_path='./indus_introduction/indus_tf_idf.csv'):
    """Extract the top-k TF-IDF keywords for every document in a corpus file.

    Each line of *corpus* is treated as one document of space-separated,
    pre-tokenized words.  Term frequency comes from :meth:`word_freq`
    (words occurring at least twice); idf is ``log(n_docs / doc_freq)``
    computed over this corpus only (the motivation for hand-rolling the
    algorithm instead of using jieba's dictionary-based idf).

    Parameters
    ----------
    corpus : str
        Path to the corpus file, one document per line.
    topk : int
        Number of highest-scoring keywords to keep per document.
    out_path : str, optional
        Destination CSV path (default matches the original hard-coded
        path); the parent directory is created if missing.

    Returns
    -------
    pandas.DataFrame
        Concatenated per-document top-k rows with columns
        ``['kw', 'tf', 'idf', 'tf_idf']`` (also written to *out_path*).
    """
    import os

    # Close the file deterministically (the original leaked the handle).
    # NOTE(review): encoding left at the platform default to preserve the
    # original behavior — confirm the corpus encoding (likely UTF-8).
    with open(corpus, 'r') as fh:
        docs = fh.readlines()
    n_docs = len(docs)

    # Per-document parallel lists of words and their tf values.
    doc_words, doc_tfs = [], []
    for doc in docs:
        wf = self.word_freq(doc, 2)  # keep words with raw count >= 2
        doc_words.append(list(wf['word']))
        doc_tfs.append(list(wf['tf']))

    # Sets give O(1) membership tests for document-frequency counting
    # (the original scanned every word list — O(docs * doc_len) per word).
    word_sets = [set(words) for words in doc_words]

    top_frames = []
    for words, tfs in zip(doc_words, doc_tfs):
        idfs, scores = [], []
        for word, tf in zip(words, tfs):
            # doc_freq >= 1 always: the word occurs in its own document,
            # so the division below cannot be by zero.
            doc_freq = sum(1 for s in word_sets if word in s)
            # round() before multiplying, exactly as the original did,
            # so numeric output is byte-identical.
            idf_val = round(np.log(float(n_docs) / float(doc_freq)), 5)
            idfs.append(idf_val)
            scores.append(round(tf * idf_val, 5))
        frame = pd.DataFrame({'kw': words, 'tf': tfs,
                              'idf': idfs, 'tf_idf': scores})
        frame = frame.sort_values(by='tf_idf', ascending=False)
        top_frames.append(frame.iloc[:topk, :])

    # Single concat instead of growing a DataFrame inside the loop.
    if top_frames:
        result = pd.concat(top_frames, axis=0, ignore_index=True)
    else:
        result = pd.DataFrame(columns=['kw', 'tf', 'idf', 'tf_idf'])

    # Ensure the output directory exists (the original crashed if it didn't).
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    result.to_csv(out_path)
    return result

def word_freq(self, sentence, freq):
    """Count space-separated tokens in *sentence* and attach tf values.

    Parameters
    ----------
    sentence : str
        Pre-tokenized text; tokens are separated by single spaces.
    freq : int
        Minimum raw count a token must reach to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``['word', 'freq', 'tf']`` sorted by ``freq`` descending.
        Note: ``tf`` is each count divided by the total count of the
        *retained* words only (tokens below *freq* are excluded from the
        denominator), matching the original implementation.
    """
    tokens = sentence.strip().split(' ')
    counts = pd.DataFrame({'word': tokens, 'freq': 1})
    counts = counts.groupby('word').sum()
    counts = counts.sort_values(by='freq', ascending=False)
    # reset_index() restores 'word' as the leading column — equivalent to
    # the original's manual tmp-DataFrame + concat dance.
    counts = counts.reset_index()
    # Drop rare words and the empty token produced by repeated spaces.
    counts = counts[(counts['freq'] >= freq) & (counts['word'] != '')]
    counts = counts.reset_index(drop=True)
    # Hoist the total out of the loop (original recomputed it per row: O(n^2)).
    total = float(counts['freq'].sum())
    tf = [round(float(c) / total, 5) for c in counts['freq']]
    counts = pd.concat([counts, pd.DataFrame({'tf': tf})], axis=1)
    return counts