# Dataset source (Sogou Labs news corpus): http://www.sogou.com/labs/resource/ca.php
import jieba
import pandas as pd
import re
import numpy as np
from wordcloud import WordCloud
from jieba import analyse
import warnings
warnings.filterwarnings("ignore")
from gensim import corpora, models, similarities
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
class NewClassify(object):
    """Helpers for preparing and labelling a Chinese news corpus.

    The class reads a raw news text file, tokenises it with jieba, removes
    stop words, and offers word-frequency, keyword-extraction, LDA and
    labelling utilities on top of the cleaned tokens.
    """

    # Category names, in the order in which labels are assigned to documents.
    _LABELS = ("汽车", "财经", "科技", "健康", "体育",
               "教育", "文化", "军事", "娱乐", "时尚")
    # Number of times the label cycle is repeated (10 labels x 20 = 200 docs).
    _LABEL_REPEATS = 20
    # Numeric id for every category name.
    _LABEL_TO_ID = {
        "汽车": 1, "财经": 2, "科技": 3, "健康": 4, "体育": 5,
        "教育": 6, "文化": 7, "军事": 8, "娱乐": 9, "时尚": 0,
        # Legacy spelling with a stray leading comma; kept so frames labelled
        # with the old spelling still map to the same id.
        ",军事": 8,
    }

    def __init__(self):
        """Set the dataset path and load the stop-word list from disk."""
        # Path of the raw dataset.
        self.data_set_path = "./dataset/news.txt"
        # Stop-word list, split on whitespace. The file is opened in a
        # context manager so the handle is closed deterministically.
        with open("./dataset/cease_words_table.txt", "r", encoding="utf-8") as fp:
            self.cease_words_table = fp.read().split()

    def pre_process_dataset(self):
        """Tokenise the raw news file and strip stop words.

        Returns:
            A tuple ``(contents_clean, all_words)``: ``contents_clean`` is a
            list with one token list per extracted news item, and
            ``all_words`` is the flat list of every kept token, in order.
        """
        with open(self.data_set_path, "r", encoding="utf-8") as fp:
            news_txt = fp.read()

        # NOTE(review): this pattern captures, per line, everything up to the
        # last space on that line. The original code used the same pattern
        # for both contents and titles, which suggests surrounding markup may
        # have been lost -- confirm against the dataset format.
        raw_contents = re.findall(r"(.*) ", news_txt)

        # A set makes the per-token stop-word test O(1) instead of O(n).
        stop_words = set(self.cease_words_table)

        contents_clean = []
        all_words = []
        for text in raw_contents:
            kept = [word for word in jieba.lcut(text) if word not in stop_words]
            contents_clean.append(kept)
            all_words.extend(kept)
        return contents_clean, all_words

    def statistic_word_frequency(self, de_all_words):
        """Count how often each word occurs.

        Args:
            de_all_words: DataFrame with an ``"all_words"`` column holding
                one word per row.

        Returns:
            DataFrame with columns ``"all_words"`` and ``"count"``, sorted by
            ``"count"`` in descending order.
        """
        # Use the argument (the original read an unrelated global name) and
        # GroupBy.size(), since dict-based renaming in .agg() was removed
        # from pandas.
        words_count = (
            de_all_words.groupby(by=["all_words"])["all_words"]
            .size()
            .reset_index(name="count")
        )
        return words_count.sort_values(by=["count"], ascending=False)

    def paint_word_cloud(self, words_count):
        """Draw a word cloud. Not implemented yet; currently does nothing."""
        pass

    def extract_abstract_words(self, df_content, news_index=10, top_k=5):
        """Print and return the top keywords of one news item.

        Args:
            df_content: DataFrame with a ``"contents_clean"`` column of token
                lists.
            news_index: Index label of the news item to analyse.
            top_k: Number of keywords to extract.

        Returns:
            The keyword list produced by ``jieba.analyse.extract_tags``.
        """
        news = df_content["contents_clean"][news_index]
        news_str = "".join(news)
        print(news_str)
        abstract_words = analyse.extract_tags(news_str, topK=top_k, withWeight=False)
        print(abstract_words)
        return abstract_words

    def LDA_model(self, contents_clean, num_topics=20, num_words=5):
        """Fit an LDA topic model and print its topics.

        Args:
            contents_clean: List of token lists, one per document.
            num_topics: Number of topics to fit and print.
            num_words: Number of words shown per printed topic.

        Returns:
            The fitted gensim ``LdaModel``.
        """
        # Map every distinct word to a unique integer id.
        dictionary = corpora.Dictionary(contents_clean)
        # Bag-of-words per document: (word id, occurrences in the document).
        corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
        lda = models.ldamodel.LdaModel(
            corpus=corpus, id2word=dictionary, num_topics=num_topics)
        for topic in lda.print_topics(num_topics=num_topics, num_words=num_words):
            print(topic)
        return lda

    def lable_news_process(self, contents_clean):
        """Attach a category label to every document.

        Labels are assigned by repeating the ten category names twenty times,
        so exactly 200 documents are expected.

        Args:
            contents_clean: List of token lists, one per document.

        Returns:
            DataFrame with columns ``"contents_clean"`` and ``"lable"``.

        Raises:
            ValueError: If the number of documents differs from the number of
                generated labels.
        """
        lables = list(self._LABELS) * self._LABEL_REPEATS
        if len(contents_clean) != len(lables):
            # pandas would raise a ValueError here too; fail early with a
            # message that names the actual mismatch.
            raise ValueError(
                "expected %d documents to label, got %d"
                % (len(lables), len(contents_clean)))
        return pd.DataFrame({"contents_clean": contents_clean, "lable": lables})

    def lable_map(self, df_train):
        """Replace category names in ``df_train["lable"]`` with numeric ids.

        The frame is modified in place and also returned. Names that are not
        in the mapping become NaN.
        """
        df_train["lable"] = df_train["lable"].map(self._LABEL_TO_ID)
        return df_train
def join_tokens(documents, labels):
    """Join each token list into a space-separated string.

    A document whose tokens cannot be joined is reported by printing its
    index and is skipped together with its label, so the returned texts and
    labels always stay aligned.

    Args:
        documents: Iterable of token lists.
        labels: Iterable of labels, parallel to ``documents``.

    Returns:
        A tuple ``(texts, kept_labels)`` of two equally long lists.
    """
    texts = []
    kept_labels = []
    for index, (tokens, label) in enumerate(zip(documents, labels)):
        try:
            text = " ".join(tokens)
        except TypeError:
            print(index)
            continue
        texts.append(text)
        kept_labels.append(label)
    return texts, kept_labels


if __name__ == '__main__':
    nc = NewClassify()
    # Cleaned token lists per news item, plus the flat list of kept words.
    contents_clean, all_words = nc.pre_process_dataset()
    # DataFrames consumed by the optional exploratory steps below.
    df_content = pd.DataFrame({"contents_clean": contents_clean})
    df_all_words = pd.DataFrame({"all_words": all_words})
    # Optional exploratory steps, disabled by default:
    #   words_count = nc.statistic_word_frequency(df_all_words)
    #   nc.extract_abstract_words(df_content)
    #   nc.LDA_model(contents_clean)

    # Label the news items, then map the label names to numeric ids.
    df_train = nc.lable_news_process(contents_clean)
    df_train = nc.lable_map(df_train)

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        df_train["contents_clean"].values, df_train["lable"].values)

    # Build bag-of-words features from the training documents.
    train_texts, train_labels = join_tokens(x_train, y_train)
    vec = CountVectorizer(analyzer="word", max_features=4000, lowercase=False)
    vec.fit(train_texts)

    # Train a multinomial naive Bayes classifier.
    classifier = MultinomialNB()
    classifier.fit(vec.transform(train_texts), train_labels)

    # Evaluate on the test set and print the accuracy.
    test_texts, test_labels = join_tokens(x_test, y_test)
    print(classifier.score(vec.transform(test_texts), test_labels))