Preface
At the time we did not notice any problems with the project. Later, our teacher pointed out that the goal of this kind of detection is not well defined: some news is not actually fake, yet its wording is so extreme that it is easily misclassified. A more meaningful task would be to judge the nature of a news item instead, for example whether it endangers social stability or spreads harmful information.
1. Data Processing
import pandas as pd
import numpy as np
import re
from gensim.models import word2vec
import jieba
import os
import pickle
# Filter stop words out of a list of segmented words
def stopwords_filter(stopwords_list, seg_list):
    filter_words_list = []
    # Stop-word filtering
    for word in seg_list:
        if word not in stopwords_list:
            filter_words_list.append(word)
    return filter_words_list
# Segment a Chinese paragraph into a word list (with stop-word filtering)
def sentence_seg(sentence):
    pattern = re.compile("[^\u4e00-\u9fa5]+")
    # Remove everything except Chinese characters
    sentence = pattern.sub('', sentence)
    # Note: the stop-word file is re-read on every call; caching it would speed this up
    return stopwords_filter(pd.read_table('dataset/cn_stopwords.txt', header=None).iloc[:, :].values,
                            jieba.cut(sentence))
# Preprocess a news CSV into feature vectors and labels
def csv2vec(csv_path, is_train=True):
    df = pd.read_csv(csv_path)  # load the data
    # Data cleaning
    df.drop(axis=1, inplace=True, columns=["Unnamed: 0"])  # drop the index column
    df = df.replace(re.compile(r'\[.*?\]'), " ", regex=True)  # remove [xxx]
    df = df.replace(re.compile(r'@.*?:'), " ", regex=True)  # remove @xxx
    df = df.replace("\t", " ", regex=False)  # remove escape characters
    df = df.replace("网页链接", " ", regex=False)  # remove "web link" placeholders
    df['content'] = df['content'].str.strip()  # strip leading/trailing spaces
    df = df.fillna(value=' ')  # fill missing values
    # Segment the news content and the comments
    df['content'] = df['content'].apply(lambda x: ' '.join(sentence_seg(x)))
    df['comment_all'] = df['comment_all'].apply(lambda x: ' '.join(sentence_seg(x)))
    df = df.fillna(value=' ')  # fill missing values
    # Build a word-vector model from the news corpus
    content_seglist = [x.split(' ') for x in df['content']]
    comment_seglist = [x.split(' ') for x in df['comment_all']]
    wv_model = None
    wv_size = 50
    if is_train:
        wv_model = word2vec.Word2Vec(content_seglist + comment_seglist, vector_size=wv_size, min_count=1)
        with open('model/wv.model', 'wb') as outfile:
            pickle.dump(wv_model, outfile)  # save the word-vector model
    else:
        with open('model/wv.model', 'rb') as infile:
            wv_model = pickle.load(infile)  # load the word-vector model
    # Extract the feature vector of each news item
    feature = []
    for i in range(len(content_seglist)):
        feature_vec = np.zeros(shape=[0], dtype='float32')  # 2n-dimensional feature vector
        text_vec = np.zeros(shape=[wv_size], dtype='float32')  # text vector (n-dim): average of the n-dim word vectors
        count = 0  # number of words
        for word in content_seglist[i]:
            if wv_model.wv.has_index_for(word):
                text_vec += wv_model.wv[word]  # accumulate word vectors
                count += 1
        if count != 0:
            feature_vec = np.concatenate((feature_vec, text_vec / count))
        else:
            feature_vec = np.concatenate((feature_vec, text_vec))
        text_vec = np.zeros(shape=[wv_size], dtype='float32')  # text vector (n-dim): average of the n-dim word vectors
        count = 0  # number of words
        for word in comment_seglist[i]:
            if wv_model.wv.has_index_for(word):
                text_vec += wv_model.wv[word]  # accumulate word vectors
                count += 1
        if count != 0:
            feature_vec = np.concatenate((feature_vec, text_vec / count))
        else:
            feature_vec = np.concatenate((feature_vec, text_vec))
        feature.append(feature_vec.tolist())
    label = []
    if is_train:  # for the training set, also return the labels
        for x in df['label']:
            label.append(x)
    return {'X': np.array(feature), 'y': np.array(label)}
# Preprocessing
train_set = csv2vec('dataset/train.csv', is_train=True)
test_set = csv2vec('dataset/test.csv', is_train=False)
# Save the results
with open('dataset/train.pkl', 'wb') as file:
    pickle.dump(train_set, file)
with open('dataset/test.pkl', 'wb') as file:
    pickle.dump(test_set, file)
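As a quick sanity check of the preprocessing output, the saved pickles can be reloaded and their shapes inspected. This is a minimal sketch, assuming the paths and the {'X', 'y'} dictionary layout produced above; it is not part of the original pipeline.

import pickle

with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)
with open('dataset/test.pkl', 'rb') as file:
    test_set = pickle.load(file)

# Each sample is a 2n-dim vector: 50-dim averaged content vector + 50-dim averaged comment vector
print(train_set['X'].shape)  # (n_train_samples, 100)
print(train_set['y'].shape)  # (n_train_samples,), 0 = real news, 1 = fake news
print(test_set['X'].shape)   # (n_test_samples, 100); the test set carries no labels

The next script trains an MLP classifier on these features.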
import pickle
import joblib
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

clf = MLPClassifier(max_iter=500, hidden_layer_sizes=(100,), solver='adam', alpha=0.0002)
with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)
# Training
scaler = StandardScaler()
train_data = scaler.fit_transform(train_set['X'])  # standardization (MLPs are sensitive to feature scale)
joblib.dump(scaler, 'model/scaler.model')
train_label = train_set['y']
clf.fit(train_data, train_label)
joblib.dump(clf, 'model/mlp.model')  # save the model
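The script above fits the classifier on the entire training set and saves it without reporting any accuracy. Below is a minimal hold-out evaluation sketch, assuming the same dataset/train.pkl produced earlier; the split and the metric are additions for illustration only.

import pickle
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)

# Hold out 20% of the labelled data for validation
X_train, X_val, y_train, y_val = train_test_split(
    train_set['X'], train_set['y'], test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)  # reuse the training-set statistics

clf = MLPClassifier(max_iter=500, hidden_layer_sizes=(100,), solver='adam', alpha=0.0002)
clf.fit(X_train, y_train)
print(accuracy_score(y_val, clf.predict(X_val)))

The next script searches over the MLP hyper-parameters with GridSearchCV.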
import pickle
import joblib
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

clf = MLPClassifier()
mlp_clf_tuned_parameters = {"hidden_layer_sizes": [(100,), (100, 30), (100, 30, 30)],
                            "solver": ['adam', 'sgd', 'lbfgs'],
                            "max_iter": [20],  # keep the iteration count small so the search finishes quickly
                            "alpha": np.linspace(0.0001, 0.0005, 5)
                            }  # search space for the MLP hyper-parameters
opt = GridSearchCV(clf, mlp_clf_tuned_parameters)  # automatic hyper-parameter search
with open('dataset/train.pkl', 'rb') as file:
    train_set = pickle.load(file)
scaler = StandardScaler()
# Standardization (MLPs are sensitive to feature scale)
train_data = scaler.fit_transform(train_set['X'])
train_label = train_set['y']
opt.fit(train_data, train_label)
print(opt.get_params().keys())
print(opt.best_params_)
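Because max_iter is capped at 20 during the search, the selected configuration is worth refitting with the full iteration budget before it is used for prediction. A short sketch, assuming the opt, train_data and train_label objects from the script above:

# Refit a final model with the best hyper-parameters found, but a larger max_iter
best_params = {k: v for k, v in opt.best_params_.items() if k != 'max_iter'}
best_clf = MLPClassifier(max_iter=500, **best_params)
best_clf.fit(train_data, train_label)
joblib.dump(best_clf, 'model/mlp.model')  # overwrites the model loaded by the prediction script below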
import joblib
import pickle
from sklearn.neural_network import MLPClassifier

# Label 0 = real news, 1 = fake news, -1 = comment
clf = joblib.load('model/mlp.model')
scaler = joblib.load('model/scaler.model')
with open('dataset/test.pkl', 'rb') as infile:
    test_set = pickle.load(infile)
pred = clf.predict(scaler.transform(test_set['X']))
with open('mlp_pred.txt', 'w') as outfile:
    for x in pred:
        outfile.write(str(x) + '\n')