Competition URL: https://www.biendata.com/competition/falsenews/rules/
Fake news text detection: text is the main carrier of news information, so analyzing news text helps identify fake news effectively. The concrete task: given the text of a news event, decide whether the event is real news or fake news.
Data fields:
id: the news ID; it differs across records and uniquely identifies one news item;
text: the text content of the news;
label: takes values in {0, 1}, where 0 means real news and 1 means fake news.
BERT-Base Chinese
1. Data loading:
import re, os, json, codecs, gc
import numpy as np
import pandas as pd
from random import choice
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.layers import *
from keras.callbacks import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'  # work around duplicate-OpenMP-runtime crashes
os.environ["CUDA_VISIBLE_DEVICES"] = "1"     # pin training to GPU 1
# %%
train_df = pd.read_csv('./data/train.csv',usecols=['text','label']).drop_duplicates()
test_df = pd.read_csv('./data/test_stage1.csv')
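A quick sanity check of what was loaded is useful before any cleaning; the printed numbers depend on the actual files:
print(train_df.shape, test_df.shape)
print(train_df['label'].value_counts())       # class balance between real (0) and fake (1)
print(train_df['text'].str.len().describe())  # text length distribution, which informs the choice of maxlen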
2. Define a text-cleaning function and clean the texts:
def stop_words(x):
    # Non-string values (e.g. NaN) are mapped to the empty string
    try:
        x = x.strip()
    except AttributeError:
        return ''
    x = re.sub(r'\?\?+', '', x)             # runs of question marks
    x = re.sub(r'@', '', x)
    x = re.sub(r'\d{8,11}', '', x)          # phone-number-like digit runs
    x = re.sub(r'\{IMG:.?.?.?\}', '', x)    # inline image placeholders
    x = re.sub(r'{IMG', '', x)
    x = re.sub(r'nan,', '', x)
    x = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x)  # URLs
    x = re.sub(r'<.*?>', '', x)             # HTML tags
    x = re.sub(r'&ldquo', '', x)            # stray HTML entities
    x = re.sub(r'//', '', x)
    x = re.sub(r'#', '', x)
    x = x.replace(",", ",")                 # full-width comma -> ASCII comma
    x = x.replace("\xa0", "")               # non-breaking space
    x = x.replace("\b", "")
    x = x.replace('"', "")
    x = re.sub(r'\t|\n|\x0b|\x1c|\x1d|\x1e', '', x)  # control characters
    x = re.sub(r' ', '', x)                 # drop all remaining spaces
    return x
train_df['text'] = train_df['text'].apply(stop_words)
test_df['text'] = test_df['text'].apply(stop_words)
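A quick check that the cleaning behaves as intended; the input string below is made up for illustration:
sample = ' {IMG:1}转发 http://t.cn/abc123 这是一条新闻?? '
print(stop_words(sample))  # -> '转发这是一条新闻'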
3. Set the BERT paths and the maximum sequence length (the sklearn and keras packages were already imported in step 1):
maxlen = 256  # texts are truncated to this many characters before encoding
config_path = './bert_base_zh/bert_config.json'
checkpoint_path = './bert_base_zh/bert_model.ckpt'
dict_path = './bert_base_zh/vocab.txt'
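These paths assume Google's BERT-Base Chinese release (chinese_L-12_H-768_A-12) was unzipped into ./bert_base_zh/. A small guard against path typos (a TensorFlow checkpoint is stored as several files, so testing for the .index file is enough):
for p in (config_path, checkpoint_path + '.index', dict_path):
    assert os.path.exists(p), 'missing BERT file: %s' % p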
4. Tokenization and padding:
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

class OurTokenizer(Tokenizer):
    # Character-level tokenizer: keeps every character as its own token,
    # maps whitespace to [unused1] and out-of-vocabulary characters to [UNK]
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')
            else:
                R.append('[UNK]')
        return R

tokenizer = OurTokenizer(token_dict)

def seq_padding(X, padding=0):
    # Right-pad every sequence in X with `padding` up to the batch maximum length
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])
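A small demonstration with illustrative inputs; keras_bert's tokenizer.encode returns token ids plus segment ids:
x1, x2 = tokenizer.encode(first='今天 天气')
print(len(x1), x2)  # length includes [CLS] and [SEP]; segment ids are all 0 for a single text
print(seq_padding([[1, 2, 3], [4, 5]]))
# -> [[1 2 3]
#     [4 5 0]]  the shorter sequence is right-padded with zeros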
5. Create a class that yields data in batches:
class data_generator:
    def __init__(self, data, batch_size=32, shuffle=True):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        # Infinite loop: fit_generator stops each epoch after steps_per_epoch batches
        while True:
            idxs = list(range(len(self.data)))
            if self.shuffle:
                np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]  # truncate to maxlen characters
                x1, x2 = tokenizer.encode(first=text)  # token ids, segment ids
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y[:, 0, :]
                    X1, X2, Y = [], [], []
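For reference, a sketch of how the generator is consumed; DATA_LIST, built in step 7, is an object array of (text, one-hot label) pairs:
gen = data_generator(DATA_LIST[:64], batch_size=32)
print(len(gen))  # 2 batches per epoch
[X1_b, X2_b], Y_b = next(iter(gen))
print(X1_b.shape, X2_b.shape, Y_b.shape)  # (32, seq_len) (32, seq_len) (32, 2)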
6. Load the BERT model and fine-tune it:
def build_bert(nclass):
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
    for l in bert_model.layers:
        l.trainable = True  # fine-tune all BERT layers

    x1_in = Input(shape=(None,))  # token ids
    x2_in = Input(shape=(None,))  # segment ids
    x = bert_model([x1_in, x2_in])
    x = Lambda(lambda x: x[:, 0])(x)  # take the [CLS] vector as the sentence representation
    p = Dense(nclass, activation='softmax')(x)

    model = Model([x1_in, x2_in], p)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(1e-5),  # small learning rate for fine-tuning
                  metrics=['accuracy'])
    model.summary()
    return model
7. Define the training function (despite the name run_cv, it uses a single random 75/25 holdout split rather than K-fold cross-validation):
from keras.utils import to_categorical

# Pack (text, one-hot label) pairs; the test labels are placeholders (all 0)
DATA_LIST = []
for data_row in train_df.iloc[:].itertuples():
    DATA_LIST.append((data_row.text, to_categorical(data_row.label, 2)))
DATA_LIST = np.array(DATA_LIST, dtype=object)

DATA_LIST_TEST = []
for data_row in test_df.iloc[:].itertuples():
    DATA_LIST_TEST.append((data_row.text, to_categorical(0, 2)))
DATA_LIST_TEST = np.array(DATA_LIST_TEST, dtype=object)

def run_cv(data, data_test):
    test_model_pred = np.zeros((len(data_test), 2))

    # Single random 75/25 train/validation split
    shuffle_indexes = np.random.permutation(len(data))
    test_ratio = 0.25
    test_size = int(len(data) * test_ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]
    X_train = data[train_indexes]
    X_valid = data[test_indexes]

    model = build_bert(2)
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=3)
    plateau = ReduceLROnPlateau(monitor='val_accuracy', verbose=1, mode='max', factor=0.5, patience=2)
    checkpoint = ModelCheckpoint('./model/bert2.hdf5', monitor='val_accuracy',
                                 verbose=2, save_best_only=True, mode='max', save_weights_only=True)

    train_D = data_generator(X_train, shuffle=True)
    valid_D = data_generator(X_valid, shuffle=True)
    test_D = data_generator(data_test, shuffle=False)

    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=7,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D),
        callbacks=[early_stopping, plateau, checkpoint],
    )
    test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=1)

    # Free GPU memory before returning
    del model
    gc.collect()
    K.clear_session()

    return test_model_pred
8. Train and write out the results:
test_model_pred = run_cv(DATA_LIST, DATA_LIST_TEST)
# %%
test_df['label'] = np.argmax(test_model_pred, 1)  # pick the class with the higher probability
# %%
test_df[['id', 'label']].to_csv('task_bert.csv', index=False)
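The sklearn metrics imported in step 1 are never actually called; as written, run_cv discards its validation predictions, so scoring the held-out split would require returning them as well. A sketch of the scoring itself, with hypothetical y_valid_true / y_valid_pred integer label arrays:
def report(y_true, y_pred):
    # y_true / y_pred: integer class labels (0 = real, 1 = fake)
    print('precision: %.4f' % precision_score(y_true, y_pred))
    print('recall:    %.4f' % recall_score(y_true, y_pred))
    print('f1:        %.4f' % f1_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))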
Owing to my own limited skill, my final ranking settled at 72/322, short of the top 50 needed to advance to the second round. A bit of a pity, but I'll keep working at it.