import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
import tensorflow as tf
import tensorflow.keras.backend as K
import os
from transformers import BertTokenizer, BertConfig, TFBertModel
print(tf.__version__)
# File paths and data loading
TRAIN_PATH = './data/train_dataset/'
TEST_PATH = './data/test_dataset/'
BERT_PATH = './bert_base_chinese/'
MAX_SEQUENCE_LENGTH = 140
input_categories = '微博中文内容'
output_categories = '情感倾向'
df_train = pd.read_csv(TRAIN_PATH+'nCoV_100k_train.labled.csv',engine ='python')
df_train = df_train[df_train[output_categories].isin(['-1','0','1'])]
df_test = pd.read_csv(TEST_PATH+'nCov_10k_test.csv',engine ='python')
df_sub = pd.read_csv(TEST_PATH+'submit_example.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)
tokenizer = BertTokenizer.from_pretrained(BERT_PATH+'bert-base-chinese-vocab.txt')
tokenizer.encode_plus("深度之眼",
                      add_special_tokens=True,
                      max_length=20,
                      truncation_strategy='longest_first')
# {'input_ids': [101, 3918, 2428, 722, 4706, 102],   # token id sequence ([CLS] ... [SEP])
#  'token_type_ids': [0, 0, 0, 0, 0, 0],             # segment sequence (all 0 for a single sentence)
#  'attention_mask': [1, 1, 1, 1, 1, 1]}             # attention mask (1 = real token, 0 = padding); embedded inside BERT
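# Sanity check (illustrative, not part of the original notebook): decoding the ids above
# should recover the tokens with the special markers added by add_special_tokens=True:
# tokenizer.convert_ids_to_tokens([101, 3918, 2428, 722, 4706, 102])
# -> ['[CLS]', '深', '度', '之', '眼', '[SEP]']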
Convert each Weibo text into the three sequences above as model input.
max_sequence_length is the fixed text length (set to 140).
def _convert_to_transformer_inputs(instance, tokenizer, max_sequence_length):
    """Converts a piece of text to ids, masks and segments for a transformer (including BERT).

    Returns input_ids, attention_mask and token_type_ids, each padded to max_sequence_length."""
    # Encode the text with the tokenizer; the result is a dict with the three sequences.
    inputs = tokenizer.encode_plus(instance,
                                   add_special_tokens=True,
                                   max_length=max_sequence_length,
                                   truncation_strategy='longest_first')
    # Pull the encoded sequences out of the dict.
    input_ids = inputs["input_ids"]
    input_masks = inputs["attention_mask"]
    input_segments = inputs["token_type_ids"]
    # Pad every sequence to the fixed length.
    padding_length = max_sequence_length - len(input_ids)
    padding_id = tokenizer.pad_token_id
    input_ids = input_ids + ([padding_id] * padding_length)
    input_masks = input_masks + ([0] * padding_length)
    input_segments = input_segments + ([0] * padding_length)
    return [input_ids, input_masks, input_segments]
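# Quick sanity check (illustrative sketch, not from the original notebook): after padding,
# all three sequences should have exactly MAX_SEQUENCE_LENGTH entries.
_ids, _masks, _segs = _convert_to_transformer_inputs('深度之眼', tokenizer, MAX_SEQUENCE_LENGTH)
assert len(_ids) == len(_masks) == len(_segs) == MAX_SEQUENCE_LENGTH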
# Build the input arrays for every row in the dataframe
def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    input_ids, input_masks, input_segments = [], [], []
    for instance in tqdm(df[columns]):
        ids, masks, segments = _convert_to_transformer_inputs(str(instance), tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)
    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]
# Convert both the training set and the test set into the three input sequences
inputs = compute_input_arrays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arrays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
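# Shape check (illustrative, not in the original): each of the three arrays has one row per sample
# and MAX_SEQUENCE_LENGTH columns.
print(inputs[0].shape)       # (len(df_train), MAX_SEQUENCE_LENGTH)
print(test_inputs[0].shape)  # (len(df_test), MAX_SEQUENCE_LENGTH)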
# Label conversion
# Shift the labels to start from 0: -1/0/1 becomes 0/1/2 by simply adding one
def compute_output_arrays(df, columns):
    return np.asarray(df[columns].astype(int) + 1)
outputs = compute_output_arrays(df_train, output_categories)
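# For example (illustrative): a sentiment column of ['-1', '0', '1'] maps to class ids [0, 1, 2],
# which is the 0-based format that to_categorical expects later.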
Build the BERT model
def create_model():
    # The three input sequences.
    input_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    input_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    input_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    # Load the pretrained BERT model.
    # output_hidden_states=True makes every transformer layer's output available.
    config = BertConfig.from_pretrained(BERT_PATH + 'bert-base-chinese-config.json', output_hidden_states=True)
    bert_model = TFBertModel.from_pretrained(BERT_PATH + 'bert-base-chinese-tf_model.h5', config=config)
    # BERT returns three outputs:
    # sequence_output: last transformer layer, shape (bs, 140, 768) - one 768-dim vector per token
    # pooler_output: pooled [CLS] representation, shape (bs, 768)
    # hidden_states: the hidden states of the 12 transformer layers (because output_hidden_states=True)
    sequence_output, pooler_output, hidden_states = bert_model(input_id, attention_mask=input_mask,
                                                               token_type_ids=input_atn)
    # (bs, 140, 768) -> (bs, 768)
    x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    x = tf.keras.layers.Dropout(0.15)(x)
    x = tf.keras.layers.Dense(3, activation='softmax')(x)
    # Define the model.
    model = tf.keras.models.Model(inputs=[input_id, input_mask, input_atn], outputs=x)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # Compile with the loss, the optimizer and the metrics.
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc', 'mae'])
    return model
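# Design note (added, not from the original): the head here average-pools sequence_output before the
# Dense(3, softmax) layer; a common alternative is to feed pooler_output (the [CLS] vector) directly
# into the Dense layer. create_model().summary() would show the GlobalAveragePooling1D -> Dropout(0.15)
# -> Dense(3) stack on top of the TFBertModel layer.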
gkf = StratifiedKFold(n_splits=5).split(X=df_train[input_categories].fillna('-1'), y=df_train[output_categories].fillna('-1'))
valid_preds = []
test_preds = []
# Simply split the data into folds and train one model per fold
for fold, (train_idx, valid_idx) in enumerate(gkf):
    train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
    train_outputs = to_categorical(outputs[train_idx])
    valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
    valid_outputs = to_categorical(outputs[valid_idx])
    K.clear_session()
    model = create_model()
    # Train the model for this fold.
    model.fit(train_inputs, train_outputs, validation_data=(valid_inputs, valid_outputs), epochs=2, batch_size=32)
    # model.save_weights(f'bert-{fold}.h5')
    valid_preds.append(model.predict(valid_inputs))
    test_preds.append(model.predict(test_inputs))
df_test.head()
# Prediction: average the per-fold predicted probabilities, then take the argmax as the class
sub = np.average(test_preds, axis=0)
sub = np.argmax(sub,axis=1)
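# Toy example of the fold averaging (illustrative, not from the original): with two folds predicting
# [0.2, 0.5, 0.3] and [0.4, 0.3, 0.3] for one sample, np.average([...], axis=0) gives [0.3, 0.4, 0.3]
# and argmax picks class 1.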
# df_sub['y'] = sub-1
# #df_sub['id'] = df_sub['id'].apply(lambda x: str(x))
# df_sub.to_csv('test_sub.csv',index=False, encoding='utf-8')
# The submission needs the 微博id column from the test set
df_sub = df_test[['微博id']].copy()
df_sub.head()
# Write the test predictions to the submission file
# Labels were shifted by +1 before training, so subtract 1 when writing them out
df_sub['y'] = sub-1
df_sub.columns=['id','y']
df_sub.head()
df_sub.to_csv('test_sub.csv',index=False, encoding='utf-8')
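# The resulting test_sub.csv has two columns: 'id' (the 微博id) and 'y', the predicted sentiment in {-1, 0, 1}.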