互联网虚假新闻检测挑战赛-比赛记录

比赛地址:https://www.biendata.com/competition/falsenews/rules/

任务描述

虚假新闻文本检测:文本是新闻信息的主要载体,对新闻文本的研究有助于虚假新闻的有效识别。具体任务为:给定一个新闻事件的文本,判定该事件属于真实新闻还是虚假新闻。

数据描述

  1. 虚假新闻文本检测任务中,训练集共包含38,471条新闻,其中包含真实新闻19,186条,虚假新闻19,285条。初赛测试集共4,000条,复赛测试集3,902条,真假新闻比例与训练集基本一致。

数据字段:

    id:新闻id,每条文本中id均不相同,唯一表征一条新闻;

    text: 新闻的文本内容;

    label: 取值为{0,1},0表示真实新闻,1表示虚假新闻。

bert模型下载

BERT-Base Chinese

代码实现

1、数据读取:

import re, os, json, codecs, gc
import numpy as np
import pandas as pd
from random import choice
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.layers import *
from keras.callbacks import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Process-wide settings: set KMP_DUPLICATE_LIB_OK and expose only CUDA device 1.
os.environ.update({
    'KMP_DUPLICATE_LIB_OK': 'TRUE',
    'CUDA_VISIBLE_DEVICES': '1',
})
# %%

# Training set: keep only the text/label columns, then drop duplicate rows.
train_df = pd.read_csv('./data/train.csv', usecols=['text', 'label'])
train_df = train_df.drop_duplicates()
# Stage-1 test set, loaded with all of its columns.
test_df = pd.read_csv('./data/test_stage1.csv')
#debunking = pd.read_csv('./debunking.csv',usecols=['text']).drop_duplicates()

2、定义文本筛选函数,清洗文本:

def stop_words(x):
    """Strip noise from one raw news text and return the cleaned string.

    Despite its name this function does not remove stop words; it deletes
    markup and noise tokens (repeated question marks, '@', long digit runs,
    ``{IMG:...}`` placeholders, URLs, HTML-like tags, whitespace and control
    characters, quotes, '#', '//').

    Args:
        x: The raw text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing cells) is treated as empty.

    Returns:
        The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(x, str):
        return ''

    x = x.strip()
    x = re.sub(r'\?\?+', '', x)              # runs of two or more '?'
    x = x.replace('@', '')
    x = re.sub(r'\d{8,11}', '', x)           # 8-11 digit runs
    x = re.sub(r'\{IMG:.?.?.?\}', '', x)     # complete image placeholders
    x = x.replace('{IMG', '')                # leftover placeholder openers
    x = x.replace('nan,', '')
    x = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x)
    x = re.sub(r'<.*?>', '', x)              # HTML-like tags
    # Newlines go before the literal removals below so that tokens split
    # across a line break (e.g. "/\n/") are still caught.
    x = x.replace('\n', '')
    x = x.replace(' ', '')
    x = x.replace('&ldquo', '')
    x = x.replace('//', '')
    x = x.replace('#', '')
    # NOTE(review): a `replace(",", ",")` step used to sit here; as written
    # it was a no-op (possibly a full-width comma lost in transcription).
    # Confirm the intent before re-adding a comma normalisation.
    x = x.replace('\xa0', '')                # non-breaking space
    x = x.replace('\b', '')                  # backspace control character
    x = x.replace('"', '')
    x = re.sub(r'[\t\n\x0b\x1c\x1d\x1e]', '', x)
    # Second pass: the deletions above can join fragments into new matches.
    x = re.sub(r'\?\?+', '', x)
    x = re.sub(r'\{IMG:.?.?.?\}', '', x)
    return x

# Clean the text column of both frames; stop_words maps non-string cells
# (e.g. missing values) to ''.
train_df['text'] = train_df['text'].apply(stop_words)
test_df['text'] = test_df['text'].apply(stop_words)

3、加载sklearn和keras相关包,设置好bert路径:

# ! -*- coding:utf-8 -*-
import re, os, json, codecs, gc
import numpy as np
import pandas as pd
from random import choice
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import tensorflow as tf
from keras.layers import *
from keras.callbacks import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

# Maximum number of characters kept from each text before it is tokenized
# (data_generator slices the text with [:maxlen]).
maxlen = 256
# Files of the pre-trained Chinese BERT model: architecture config,
# checkpoint weights and vocabulary.
config_path = './bert_base_zh/bert_config.json'
checkpoint_path = './bert_base_zh/bert_model.ckpt'
dict_path = './bert_base_zh/vocab.txt'

4、token+padding


# Vocabulary lookup: each stripped line of the vocab file maps to the size
# of the dictionary at the moment the line is read.
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)


class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        A character found in the vocabulary is kept as is; otherwise it
        becomes '[unused1]' when ``_is_space`` accepts it and '[UNK]' when
        it does not.
        """
        tokens = []
        for ch in text:
            if ch in self._token_dict:
                tokens.append(ch)
                continue
            tokens.append('[unused1]' if self._is_space(ch) else '[UNK]')
        return tokens


# Module-level tokenizer built from the loaded vocabulary; data_generator
# uses it to encode every text.
tokenizer = OurTokenizer(token_dict)


def seq_padding(X, padding=0):
    """Right-pad every sequence in ``X`` to the longest length and stack them.

    Args:
        X: Non-empty collection of sequences.
        padding: Value appended to sequences shorter than the longest one.

    Returns:
        A NumPy array built from the padded sequences.
    """
    target_len = max(len(seq) for seq in X)
    padded = []
    for seq in X:
        missing = target_len - len(seq)
        if missing > 0:
            seq = np.concatenate([seq, [padding] * missing])
        padded.append(seq)
    return np.array(padded)

5、创建按批次生成数据的类:

class data_generator:
    """Endless mini-batch generator over ``(text, label)`` samples.

    ``len()`` gives the number of batches needed for one pass over the data.
    """

    def __init__(self, data, batch_size=32, shuffle=True):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        # One extra step covers a trailing partial batch.
        full_batches, leftover = divmod(len(self.data), self.batch_size)
        self.steps = full_batches + 1 if leftover else full_batches

    def __len__(self):
        return self.steps

    def __iter__(self):
        """Yield ``([X1, X2], Y)`` batches forever, one pass after another."""
        while True:
            order = list(range(len(self.data)))
            if self.shuffle:
                np.random.shuffle(order)

            indices, segments, labels = [], [], []
            for position, sample_idx in enumerate(order, start=1):
                sample = self.data[sample_idx]
                # Truncate by characters before encoding.
                truncated = sample[0][:maxlen]
                x1, x2 = tokenizer.encode(first=truncated)
                indices.append(x1)
                segments.append(x2)
                labels.append([sample[1]])

                batch_full = len(indices) == self.batch_size
                pass_finished = position == len(order)
                if batch_full or pass_finished:
                    indices = seq_padding(indices)
                    segments = seq_padding(segments)
                    labels = seq_padding(labels)
                    # Each label was wrapped in a one-element list above;
                    # indexing axis 1 at 0 removes that wrapper again.
                    yield [indices, segments], labels[:, 0, :]
                    indices, segments, labels = [], [], []

6、加载bert模型,进行fine-tune:

def build_bert(nclass):
    """Build and compile a BERT classifier with an ``nclass``-way softmax head.

    The pre-trained model is loaded from ``config_path`` / ``checkpoint_path``
    with every layer marked trainable, so the whole network is fine-tuned.

    Args:
        nclass: Number of output classes of the softmax layer.

    Returns:
        The compiled Keras model, taking two variable-length inputs
        (presumably the token indices and segment ids produced by
        ``tokenizer.encode`` -- confirm against keras_bert).
    """
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

    for layer in bert_model.layers:
        layer.trainable = True

    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))

    x = bert_model([x1_in, x2_in])
    # Keep only position 0 along axis 1 (presumably the [CLS] vector).
    x = Lambda(lambda t: t[:, 0])(x)
    p = Dense(nclass, activation='softmax')(x)

    model = Model([x1_in, x2_in], p)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(1e-5),
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()
    return model

7、定义训练函数:

from keras.utils import to_categorical

# Training samples as (text, one-hot label) pairs in an object array.
DATA_LIST = np.array(
    [(row.text, to_categorical(row.label, 2)) for row in train_df.itertuples()],
    dtype=object,
)

# Test samples use the same layout; the label is a fixed placeholder
# (class 0) because the test set carries no labels.
DATA_LIST_TEST = np.array(
    [(row.text, to_categorical(0, 2)) for row in test_df.itertuples()],
    dtype=object,
)

def run_cv(data, data_test):
    """Fine-tune BERT on one random hold-out split and predict the test set.

    Despite the name, no k-fold cross-validation is performed: ``data`` is
    shuffled once, 25% is held out for validation and the remaining 75% is
    used for training.

    Args:
        data: Indexable array of ``(text, one-hot label)`` training samples.
        data_test: Array of test samples in the same layout.

    Returns:
        Array of shape ``(len(data_test), 2)`` with the predicted class
        probabilities for the test samples.
    """
    test_model_pred = np.zeros((len(data_test), 2))

    shuffle_indexes = np.random.permutation(len(data))
    test_ratio = 0.25
    test_size = int(len(data) * test_ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]
    X_train = data[train_indexes]
    X_valid = data[test_indexes]

    # Create the checkpoint directory up front; otherwise ModelCheckpoint
    # would only fail after the first full epoch has been trained.
    weights_path = './model/' + 'bert2.hdf5'
    os.makedirs(os.path.dirname(weights_path), exist_ok=True)

    model = build_bert(2)
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=3)
    plateau = ReduceLROnPlateau(monitor="val_accuracy", verbose=1, mode='max', factor=0.5, patience=2)
    checkpoint = ModelCheckpoint(weights_path, monitor='val_accuracy',
                                 verbose=2, save_best_only=True, mode='max', save_weights_only=True)
    train_D = data_generator(X_train, shuffle=True)
    valid_D = data_generator(X_valid, shuffle=True)
    # Test batches must keep their order so predictions line up with rows.
    test_D = data_generator(data_test, shuffle=False)
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=7,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D),
        callbacks=[early_stopping, plateau, checkpoint],
    )

    # Predict with the best-validation weights written by ModelCheckpoint
    # rather than whatever the last epoch left in memory.
    if os.path.exists(weights_path):
        model.load_weights(weights_path)

    test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=1)

    # Release the model and the backend session to free memory.
    del model
    gc.collect()
    K.clear_session()
    return test_model_pred

8、训练并输出结果:

# Train, then collect the class probabilities for the test set.
test_model_pred = run_cv(DATA_LIST, DATA_LIST_TEST)

# %%

# The predicted label is the class with the highest probability.
test_df['label'] = test_model_pred.argmax(axis=1)

# %%

# Write the id/label submission file without the index column.
submission = test_df[['id', 'label']]
submission.to_csv('task_bert.csv', index=None)

后记

由于自身实力不足,最后比赛排名定格在72/322,未能进入前50、晋级复赛,有些可惜,以后继续加油吧。

你可能感兴趣的:(深度学习,竞赛)