# ===== 分词实现 (tokenisation with jieba) =====
import pandas as pd
import jieba
import jieba.analyse
# Data source: tab-separated file with an article id and its raw text.
df_news = pd.read_table('C:/Users/Shirley/Desktop/python/article.txt', names=['id', 'content'], encoding='utf-8')
# Some rows have missing values — drop them before tokenising.
df_news = df_news.dropna()
content = df_news.content.values.tolist()

content_S = []  # one token list per article
for line in content:
    current_segment = jieba.lcut(line)
    # BUG FIX: the original wrote `current_segment != '\r\n'`, comparing the
    # token *list* to a string — always True, so the check did nothing.
    # Compare the raw line instead so newline-only rows are skipped as intended.
    if len(current_segment) > 1 and line != '\r\n':
        content_S.append(current_segment)

df_content = pd.DataFrame({'content_S': content_S})
print(df_content.head())
# ===== 加入停用词 (apply the stopword list) =====
# Stopword list: one word per line, no header; quoting=3 (QUOTE_NONE) keeps
# literal quote characters from confusing the parser.
stopwords = pd.read_csv("C:/Users/Shirley/Desktop/python/stopwords_3.txt", index_col=False, sep='\t', quoting=3, names=['stopwords'], encoding='utf-8')
# Consistency fix: a bare `stopwords.head()` at module level displays nothing
# when run as a script; wrap it in print() like the other preview lines do.
print(stopwords.head())
def drop_stopwords(contents, stopwords):
    """Filter stopwords out of tokenised articles.

    Args:
        contents: list of token lists, one inner list per article.
        stopwords: iterable of stopwords to remove.

    Returns:
        (contents_clean, all_words): the per-article token lists with
        stopwords removed, and a flat list of every kept token (as str)
        in original order across all articles.
    """
    # Build the set once for O(1) membership tests; the original looked
    # each word up in a list, which is O(n) per word.
    stopword_set = set(stopwords)
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = [word for word in line if word not in stopword_set]
        contents_clean.append(line_clean)
        all_words.extend(str(word) for word in line_clean)
    return contents_clean, all_words
# Pull the token-list column and the stopword column out as plain Python
# lists, then strip the stopwords from every article.
contents = df_content["content_S"].tolist()
stopwords = stopwords["stopwords"].tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

# Rebuild the frame around the cleaned token lists and preview it.
df_content = pd.DataFrame({'contents_clean': contents_clean})
print(df_content.head())
# ===== tf-idf 关键词提取实现 (TF-IDF keyword extraction) =====
# Join one cleaned article back into a single string, then print its
# top-5 TF-IDF keywords as chosen by jieba.analyse.
index = 1
content_S_str = "".join(contents_clean[index])
keywords = jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)
print(" ".join(keywords))