A Fake News Detection System Based on word2vec

Table of Contents

Preface

  • 1. Data Processing
  • 2. Model Training
  • 3. Hyperparameter Tuning
  • 4. Prediction

Preface

        At the time we did not notice any problems with this project. Later, our teacher pointed out that the goal of the detection task is poorly defined: some news is not actually fake, but overly extreme wording makes it easy to misclassify. A better-posed task would be to judge the nature of a news item instead, for example whether it endangers social stability or spreads harmful information.


1. Data Processing

import pandas as pd
import numpy as np
import re
from gensim.models import word2vec
import jieba
import pickle


# Filter stopwords out of a word list
def stopwords_filter(stopwords_list, seg_list):
    filter_words_list = []
    # Keep only the words that are not stopwords
    for word in seg_list:
        if word not in stopwords_list:
            filter_words_list.append(word)
    return filter_words_list


# Load the stopword list once (one stopword per line)
stopwords_list = pd.read_table('dataset/cn_stopwords.txt', header=None).iloc[:, 0].tolist()


# Segment a Chinese paragraph into a word list (with stopword filtering)
def sentence_seg(sentence):
    # Strip everything except Chinese characters
    pattern = re.compile("[^\u4e00-\u9fa5]+")
    sentence = pattern.sub('', sentence)
    return stopwords_filter(stopwords_list, jieba.cut(sentence))


# Preprocess a news csv into feature vectors and labels
def csv2vec(csv_path, is_train=True):
    df = pd.read_csv(csv_path)  # load the data

    # Data cleaning
    df.drop(axis=1, inplace=True, columns=["Unnamed: 0"])  # drop the index column
    df = df.replace(re.compile(r'\[.*?\]'), " ", regex=True)  # remove [xxx] tags
    df = df.replace(re.compile(r'@.*?:'), " ", regex=True)  # remove @xxx: mentions
    df = df.replace(r'\t', " ", regex=True)  # remove tab characters
    df = df.replace("网页链接", " ", regex=True)  # remove "web link" placeholders
    df['content'] = df['content'].str.strip()  # strip leading/trailing whitespace
    df = df.fillna(value=' ')   # fill missing values

    # Segment the content and the comments
    df['content'] = df['content'].apply(lambda x: ' '.join(sentence_seg(x)))
    df['comment_all'] = df['comment_all'].apply(lambda x: ' '.join(sentence_seg(x)))
    df = df.fillna(value=' ')  # fill missing values

    # Build the word-vector model from the news corpus
    content_seglist = [x.split(' ') for x in df['content']]
    comment_seglist = [x.split(' ') for x in df['comment_all']]
    wv_model = None
    wv_size = 50
    if is_train:
        wv_model = word2vec.Word2Vec(content_seglist + comment_seglist, vector_size=wv_size, min_count=1)
        with open('model/wv.model', 'wb') as outfile:
            pickle.dump(wv_model, outfile)  # save the word vectors
    else:
        with open('model/wv.model', 'rb') as infile:
            wv_model = pickle.load(infile)  # load the word vectors

    # Extract news feature vectors: each news item is represented by the mean
    # content word vector concatenated with the mean comment word vector,
    # i.e. a 2n-dimensional feature vector (n = wv_size)
    def mean_word_vec(seg_list):
        text_vec = np.zeros(shape=[wv_size], dtype='float32')
        count = 0  # number of in-vocabulary words
        for word in seg_list:
            if wv_model.wv.has_index_for(word):
                text_vec += wv_model.wv[word]   # accumulate word vectors
                count += 1
        return text_vec / count if count != 0 else text_vec

    feature = []
    for i in range(len(content_seglist)):
        feature_vec = np.concatenate((mean_word_vec(content_seglist[i]),
                                      mean_word_vec(comment_seglist[i])))
        feature.append(feature_vec.tolist())

    label = []
    if is_train:    # for the training set, also return the labels
        for x in df['label']:
            label.append(x)
    return {'X': np.array(feature), 'y': np.array(label)}


# Preprocessing
train_set = csv2vec('dataset/train.csv', is_train=True)
test_set = csv2vec('dataset/test.csv', is_train=False)
# Save the results
with open('dataset/train.pkl', 'wb') as file:
    pickle.dump(train_set, file)
with open('dataset/test.pkl', 'wb') as file:
    pickle.dump(test_set, file)
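
Before moving on to training, it helps to sanity-check the preprocessing output. A minimal check (my addition; it assumes the file paths used above):

import pickle
import numpy as np

# Each feature vector should be 2 * wv_size = 100 dimensional,
# one row per news item
with open('dataset/train.pkl', 'rb') as f:
    train_set = pickle.load(f)
print(train_set['X'].shape)       # expected: (n_samples, 100)
print(np.unique(train_set['y']))  # expected: the label values, e.g. [0 1]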

2. Model Training

import pickle
import joblib
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

clf = MLPClassifier(max_iter=500, hidden_layer_sizes=(100,), solver='adam', alpha=0.0002)
with open('dataset/train.pkl', 'rb') as file:
    # Training
    train_set = pickle.load(file)
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_set['X'])   # standardize (MLPs are sensitive to feature scale)
    joblib.dump(scaler, 'model/scaler.model')  # save the scaler for reuse at prediction time
    train_label = train_set['y']
    clf.fit(train_data, train_label)
    joblib.dump(clf, 'model/mlp.model')  # save the model
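
The script above saves the model without estimating its accuracy. A rough estimate via 5-fold cross-validation (my addition; a Pipeline re-fits the scaler inside each fold, which avoids leaking test-fold statistics into the standardization):

import pickle
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)
# Same hyperparameters as above, wrapped with per-fold standardization
pipe = make_pipeline(StandardScaler(),
                     MLPClassifier(max_iter=500, hidden_layer_sizes=(100,),
                                   solver='adam', alpha=0.0002))
scores = cross_val_score(pipe, train_set['X'], train_set['y'], cv=5)
print(scores.mean(), scores.std())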

3. Hyperparameter Tuning

import pickle
import joblib
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

clf = MLPClassifier()
mlp_clf_tuned_parameters = {"hidden_layer_sizes": [(100,), (100, 30), (100, 30, 30)],
                            "solver": ['adam', 'sgd', 'lbfgs'],
                            "max_iter": [20],   # kept small so the search finishes quickly
                            "alpha": np.linspace(0.0001, 0.0005, 5)
                            }   # MLP parameter grid
opt = GridSearchCV(clf, mlp_clf_tuned_parameters)     # automatic tuner
with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)
    scaler = StandardScaler()
    # Standardize (MLPs are sensitive to feature scale)
    train_data = scaler.fit_transform(train_set['X'])
    train_label = train_set['y']
    opt.fit(train_data, train_label)
    print(opt.get_params().keys())  # inspect the available parameter names
    print(opt.best_params_)
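
Note that max_iter was capped at 20 during the search, so the model GridSearchCV refits with the winning parameters is undertrained. A sketch of retraining with the full iteration budget (my addition; it reuses opt, train_data and train_label from the script above):

# Retrain the best configuration with a realistic iteration budget
best = opt.best_params_
final_clf = MLPClassifier(hidden_layer_sizes=best['hidden_layer_sizes'],
                          solver=best['solver'],
                          alpha=best['alpha'],
                          max_iter=500)
final_clf.fit(train_data, train_label)
joblib.dump(final_clf, 'model/mlp.model')  # overwrite the saved model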

4. Prediction

import joblib
import pickle

# Label meaning: 0 = real news, 1 = fake news, -1 = comment
clf = joblib.load('model/mlp.model')
scaler = joblib.load('model/scaler.model')
with open('dataset/test.pkl', 'rb') as infile:
    test_set = pickle.load(infile)
    pred = clf.predict(scaler.transform(test_set['X']))
    with open('mlp_pred.txt', 'w') as outfile:
        for x in pred:
            outfile.write(str(x) + '\n')
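
The test set carries no labels here, so accuracy cannot be computed at this stage; as a quick sanity check, the distribution of predicted classes in the output file can be inspected (my addition):

import collections

# Count how many test items were predicted real (0) vs fake (1)
with open('mlp_pred.txt') as f:
    pred = [line.strip() for line in f]
print(collections.Counter(pred))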
