Using jieba in Python to implement TF-IDF keyword extraction

Word segmentation:

import pandas as pd
import jieba
import jieba.analyse

# Data source: a tab-separated file with an id column and a content column
df_news = pd.read_table('C:/Users/Shirley/Desktop/python/article.txt', names=['id', 'content'], encoding='utf-8')
# Drop rows with missing values
df_news = df_news.dropna()

content = df_news.content.values.tolist()
content_S = []
for line in content:
    current_segment = jieba.lcut(line)  # segment one article into a list of words
    # keep only lines that yield more than one token and are not just a line break
    if len(current_segment) > 1 and current_segment != ['\r\n']:
        content_S.append(current_segment)

df_content = pd.DataFrame({'content_S': content_S})
print(df_content.head())
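
To get a feel for what jieba.lcut produces, it can first be run on a single sentence. The sentence below is only an illustration, not part of article.txt, and the exact segmentation may vary with the jieba version and dictionary:

import jieba

sample = '自然语言处理是人工智能的一个重要方向'  # illustrative sentence, not from the dataset
print(jieba.lcut(sample))
# typically prints something like ['自然语言', '处理', '是', '人工智能', '的', '一个', '重要', '方向']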

Stop-word removal:

stopwords = pd.read_csv("C:/Users/Shirley/Desktop/python/stopwords_3.txt", index_col=False, sep='\t', quoting=3, names=['stopwords'], encoding='utf-8')
stopwords.head()

def drop_stopwords(contents, stopwords):
    contents_clean = []   # segmented articles with stop words removed
    all_words = []        # flat list of all remaining words across the corpus
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words

contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopwords.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)
df_content = pd.DataFrame({'contents_clean': contents_clean})
print(df_content.head())
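
One optional tweak: `word in stopwords` scans a Python list, which is linear in the number of stop words. For a larger corpus, a set makes the lookup constant-time. A minimal sketch of that variant (stopword_set is just an assumed name; drop_stopwords works unchanged because `in` supports sets as well):

stopword_set = set(stopwords)  # assumed name; same contents as the list, faster membership tests
contents_clean, all_words = drop_stopwords(contents, stopword_set)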

TF-IDF keyword extraction:

index = 1
# join the cleaned word list of one article back into a single string
content_S_str = "".join(contents_clean[index])
# extract the top 5 keywords ranked by TF-IDF
print(" ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))
