News Classification Problem

Naive Bayes in Practice

  • 1 Code
  • 2 Output
    • 1. Word cloud
    • 2. Accuracy

1 Code

import pandas as pd

# Read the data
df_news = pd.read_table('./data/val.txt', names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
df_news = df_news.dropna()
# print(df_news.shape)

import jieba
content = df_news.content.values.tolist()   # use tolist() to turn the article column into a Python list
# print(content[0])

# Segment each article with jieba and keep the token lists in content_S
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    if len(current_segment) > 1 and current_segment != '\r\n':
        content_S.append(current_segment)
# print(content_S[1000])
# Wrap the segmented articles in a DataFrame
df_content = pd.DataFrame({'content_S': content_S})
# print(df_content.head())

# The raw token lists are noisy, so clean them up with stopword filtering
stopwords = pd.read_csv('stopwords.txt', index_col=False, sep='\t', quoting=3, names=['stopword'], encoding='utf-8')
# print(stopwords.head())


def drop_stopwords(contents, stopwords):
    """
    Filter stopwords out of the tokenized documents.

    :param contents: list of tokenized documents (each a list of words)
    :param stopwords: list of stopwords to remove
    :return: contents_clean, the documents with stopwords removed, and all_words,
             a flat list of every remaining word, used below for the word-cloud frequencies
    """
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))  # collect every kept word as a string for the word-cloud frequency count below
        contents_clean.append(line_clean)
    return contents_clean, all_words


contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_word = drop_stopwords(contents, stopwords)
df_all_words = pd.DataFrame({'all_words': all_word})
# print(df_all_words.head())
df_content = pd.DataFrame({'contents_clean': contents_clean})
# print(df_content.head())

# Count word frequencies
import numpy as np

# The dict form of agg ({'count': np.size}) is deprecated in newer pandas, so pass np.size directly
words_count = df_all_words.groupby(['all_words'])['all_words'].agg(np.size)
words_count = words_count.to_frame()
words_count.columns = ['count']
words_count = words_count.reset_index().sort_values(by=['count'], ascending=False)
# print(words_count.head())

# Visualize the word frequencies with a word cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib

matplotlib.rcParams['figure.figsize'] = (100, 50)
wordcloud = WordCloud(font_path='data/simhei.ttf', background_color='white', max_font_size=100)
word_freq = {x[0]: x[1] for x in words_count.head(500).values}
wordcloud = wordcloud.fit_words(word_freq)
plt.imshow(wordcloud)
plt.show()

# TF-IDF: keyword extraction
import jieba.analyse
index = 2000
# print(df_news['content'][index])
content_S_str = ''.join(content_S[index])
# Return the top 5 keywords for this article
# print('  '.join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))
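# Hedged aside (not in the original post): extract_tags can also return each keyword
# together with its TF-IDF weight when withWeight=True; the weights depend on the corpus.
for keyword, weight in jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=True):
    print(keyword, round(weight, 4))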

# LDA: 主题模型
from gensim import corpora, models, similarities
import gensim
# Build the word-to-id mapping, i.e. the bag-of-words dictionary
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
# print(lda.print_topic(1, topn=5))
# Print the top words of all 20 topics
# for topic in lda.print_topics(num_topics=20, num_words=5):
#     print(topic[1])
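# Hedged aside (not in the original post): get_document_topics returns the
# (topic_id, probability) pairs for a single document; results vary between runs.
print(lda.get_document_topics(corpus[0]))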

# Classification task: attach the original category label to each cleaned article
df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category']})
# print(df_train.tail())
# print(df_train.label.unique())
# Map the Chinese category names to integer labels
label_mapping = {"汽车": 1, "财经": 2, "科技": 3, "健康": 4, "体育": 5, "教育": 6, "文化": 7, "军事": 8, "娱乐": 9, "时尚": 0}
df_train['label'] = df_train['label'].map(label_mapping)
# print(df_train.head())

from sklearn.model_selection import train_test_split
# Split the data into a training set and a test set; train_test_split shuffles the inputs and splits them randomly
x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values)
# print(len(x_test))

# CountVectorizer expects each document as one space-separated string, so join the
# token lists back together; documents that cannot be joined are reported by index
words = []
for line_index in range(len(x_train)):
    try:
        words.append(' '.join(x_train[line_index]))
    except Exception:
        print(line_index)
print(words[0])

from sklearn.feature_extraction.text import CountVectorizer

# Keep at most 4000 features; lowercase=False leaves the jieba tokens exactly as produced
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
vec.fit(words)

from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes is a standard choice for word-count features
classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)

# Apply the same space-joining step to the test documents
test_words = []
for line_index in range(len(x_test)):
    try:
        test_words.append(' '.join(x_test[line_index]))
    except Exception:
        print(line_index)
print(test_words[0])

acc = classifier.score(vec.transform(test_words), y_test)  # the final classification accuracy comes out at roughly 0.83
print(acc)
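# Hedged sketch (not in the original post): classify one unseen piece of text with the
# trained pipeline: segment it, drop the stopwords, join with spaces, vectorize, predict.
sample_text = '这是一条待分类的新闻文本'  # illustrative placeholder text, not from the dataset
sample_tokens = [w for w in jieba.lcut(sample_text) if w not in stopwords]
sample_vec = vec.transform([' '.join(sample_tokens)])
pred_label = classifier.predict(sample_vec)[0]
# Invert label_mapping to recover the category name from the predicted integer
inverse_mapping = {v: k for k, v in label_mapping.items()}
print(inverse_mapping[pred_label])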

2 Output

1. Word cloud

(Word cloud image: the 500 most frequent words after stopword removal)

2. Accuracy

acc = 0.808
Dataset: https://pan.baidu.com/s/1I2jKsKEHVDvs89zEo27CVg
Stopwords: https://pan.baidu.com/s/15C0IeNTHAQ2g3odU4ze7BQ
