Q&A Summarization and Reasoning (NLP): Building a Baseline

For background on the project, see the previous blog post: https://blog.csdn.net/weixin_42813521/article/details/105893077
This post builds a first baseline. ^-^

1. Data Preprocessing

Link to the .py files: https://pan.baidu.com/s/1A6Yb9MQCUpiW2JCGQs2ncg (extraction code: d66s)

import pandas as pd
import numpy as np
from utils.data_loader import build_dataset, pad_proc, sentences_proc
from utils.config import *
from utils.multi_proc_utils import parallelize
from gensim.models.word2vec import LineSentence, Word2Vec
train_data_path = 'data/AutoMaster_TrainSet.csv'
test_data_path = 'data/AutoMaster_TestSet.csv'

1.1 Load the Data

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)
print('train data size {},test data size {}'.format(len(train_df), len(test_df)))

Output: train data size 82943, test data size 20000

1.2 Drop Rows with Missing Values

train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
print('train data size {},test data size {}'.format(len(train_df), len(test_df)))

Output: train data size 82871, test data size 20000

1.3 Multiprocessing for Batch Data Processing

train_df = parallelize(train_df, sentences_proc)
test_df = parallelize(test_df, sentences_proc)
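`parallelize` lives in `utils.multi_proc_utils`, which is not shown in this post; below is a minimal sketch of what such a helper typically looks like. Only the name and call signature come from the code above, the body is my assumption:

import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

def parallelize(df, func, n_cores=cpu_count()):
    # Split the DataFrame into n_cores roughly equal chunks
    df_split = np.array_split(df, n_cores)
    # Apply func to each chunk in its own process, then stitch the results back together
    with Pool(n_cores) as pool:
        df = pd.concat(pool.map(func, df_split))
    return df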

1.4 Merge the Train and Test Data

train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
print('train data size {},test data size {},merged_df data size {}'.format(len(train_df), len(test_df),len(merged_df)))

Output: train data size 82871, test data size 20000, merged_df data size 102871

1.5 Save the Processed Train and Test Sets

train_df = train_df.drop(['merged'], axis=1)
test_df = test_df.drop(['merged'], axis=1)
train_df.to_csv(train_seg_path, index=None, header=True)
test_df.to_csv(test_seg_path, index=None, header=True)
merged_df.to_csv(merger_seg_path, index=None, header=False)

2. Word Vectors

2.1 Pre-train Word Vectors

wv_model = Word2Vec(LineSentence(merger_seg_path),
                    size=300,              # embedding dimension
                    negative=5,            # negative sampling
                    workers=8,
                    iter=wv_train_epochs,  # number of epochs, from utils.config
                    window=3,
                    min_count=5)           # ignore words that appear fewer than 5 times
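This uses the gensim < 4.0 API; in gensim >= 4.0 the `size` and `iter` parameters are named `vector_size` and `epochs`. A quick sanity check on the trained vectors (the probe word '发动机' is my assumption, substitute any word that survived the min_count cutoff):

# Vocabulary size after the min_count=5 cutoff
print(len(wv_model.wv.index2word))
# Nearest neighbours of a sample token
print(wv_model.wv.most_similar('发动机', topn=5))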

2.2 Build the Vocabulary

# word -> index
vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
# index -> word
reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}
len(vocab)

Output: 33230

2.3 Get the Embedding Matrix

# One 300-dimensional row per vocabulary word, in index2word order
embedding_matrix = wv_model.wv.vectors
embedding_matrix.shape

Output: (33230, 300)

3. Build the Training Data

  1. Question and Dialogue can be treated as one long text and concatenated to build X.
  2. Report is the target to predict and becomes Y.
train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

3.1 Padding Function

def pad_proc(sentence, max_len, vocab):
    '''
    Pad/mark a sentence with <START> <END> <PAD> <UNK>
    '''
    # 0. Split into words on spaces
    words = sentence.strip().split(' ')
    # 1. Truncate to the allowed number of words
    words = words[:max_len]
    # 2. Replace out-of-vocabulary words with <UNK>
    sentence = [word if word in vocab else '<UNK>' for word in words]
    # 3. Add the <START> and <END> markers
    sentence = ['<START>'] + sentence + ['<END>']
    # 4. Pad with <PAD> so every sentence has length max_len + 2
    sentence = sentence + ['<PAD>'] * (max_len - len(words))
    return ' '.join(sentence)
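For example, assuming both words are in the vocab, a two-word sentence padded to max_len=4 comes out with length max_len + 2 (illustrative values):

# pad_proc('方向机 重', 4, vocab)
# -> '<START> 方向机 重 <END> <PAD> <PAD>'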

3.2 Choose an Appropriate max_len

max_len is usually chosen as the mean length plus two standard deviations, which covers the vast majority of samples under a rough normality assumption while avoiding padding everything to the longest outlier.

def get_max_len(data):
    """
    Compute a suitable maximum length for a column of texts
    :param data: the Series to analyse, e.g. train_df['Question']
    :return: the chosen maximum length
    """
    # Approximate the word count by counting spaces
    max_lens = data.apply(lambda x: x.count(' '))
    return int(np.mean(max_lens) + 2 * np.std(max_lens))
# Suitable maximum length for the input data
train_x_max_len = get_max_len(train_df['X'])
test_x_max_len = get_max_len(test_df['X'])

x_max_len = max(train_x_max_len, test_x_max_len)

# Suitable maximum length for the label data
train_y_max_len = get_max_len(train_df['Report'])

3.3 Apply the Padding

<START> - sentence start
<END> - sentence end
<PAD> - padding for short sentences
<UNK> - unknown word

# Pad the training inputs X
train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, x_max_len, vocab))
# Pad the training labels Y
train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))
# Pad the test inputs X
test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, x_max_len, vocab))

Save the intermediate results:

# Save intermediate results
train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
test_df['X'].to_csv(test_x_pad_path, index=None, header=False)

The special tokens we just added are not yet in the vocabulary or the word vectors, so the vocabulary and embedding matrix need to be updated.

3.4 Update the Vocabulary

print('start retrain w2v model')
wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
wv_model.train(LineSentence(train_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
print('1/3')
wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
wv_model.train(LineSentence(train_y_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
print('2/3')
wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
wv_model.train(LineSentence(test_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
# Save the word2vec model
wv_model.save(save_wv_model_path)

The updated vocab and embedding_matrix:

# Update the vocab
vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}
# Update the embedding matrix
embedding_matrix = wv_model.wv.vectors
embedding_matrix.shape

Output: (33234, 300)
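The vocabulary grew from 33230 to 33234, i.e. by exactly the four special tokens. A quick sketch to confirm they all received indices and embedding rows:

# Each special token should now map to its own index
for token in ['<START>', '<END>', '<PAD>', '<UNK>']:
    print(token, vocab[token])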

3.5 Convert Words to Indices

# Fall back to the <UNK> index for unknown words
unk_index = vocab['<UNK>']
def transform_data(sentence, vocab):
    # Split the string into words
    words = sentence.split(' ')
    # Map each word to its vocab index, unknown words to unk_index
    ids = [vocab[word] if word in vocab else unk_index for word in words]
    return ids

Convert the text into numeric data:

# Convert words to indices, e.g. [方向机 重 ...] -> [32800, 403, 986, 246, 231, ...]
train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
# Convert the lists of indices into matrices, e.g. [32800, 403, 986, ...] -> array([[32800, 403, 986, ...]])
train_data_X = np.array(train_ids_x.tolist())
train_data_Y = np.array(train_ids_y.tolist())
test_data_X = np.array(test_ids_x.tolist())
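Because every row was padded to a fixed length, np.array produces proper 2-D matrices rather than arrays of lists; a quick shape check (row counts follow from the outputs above, column counts are max_len + 2 for the <START>/<END> markers):

print(train_data_X.shape, train_data_Y.shape, test_data_X.shape)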

4. Build a Simple seq2seq Model

GRU cells are used as the recurrent units. The encoder is a bidirectional GRU whose final output is projected down, repeated once per output time step, and fed to a bidirectional GRU decoder; since the labels are integer indices rather than one-hot vectors, sparse_categorical_crossentropy is the appropriate loss.

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
def seq2seq(input_length, output_sequence_length, embedding_matrix, vocab_size):
    model = Sequential()
    # Frozen embedding layer initialised with the pre-trained word2vec matrix
    model.add(Embedding(input_dim=vocab_size, output_dim=300, weights=[embedding_matrix], trainable=False,
                        input_length=input_length))
    # Encoder: bidirectional GRU, compresses the input into a single vector
    model.add(Bidirectional(GRU(300, return_sequences=False)))
    model.add(Dense(300, activation="relu"))
    # Repeat the encoded vector once per output time step
    model.add(RepeatVector(output_sequence_length))
    # Decoder: bidirectional GRU, emits one hidden state per time step
    model.add(Bidirectional(GRU(300, return_sequences=True)))
    # Per-time-step softmax over the whole vocabulary
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    # Labels are integer indices, hence the sparse variant of cross-entropy
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(1e-3))
    model.summary()
    return model

4.1 Basic Parameter Settings

# Input length (x max_len + 2)
input_length = train_data_X.shape[1]
# Output length (y max_len + 2)
output_sequence_length = train_data_Y.shape[1]
# Vocabulary size
vocab_size = len(vocab)
# Embedding matrix
embedding_matrix = wv_model.wv.vectors

4.2 Build the Model

model = seq2seq(input_length, output_sequence_length, embedding_matrix, vocab_size)

4.3 Train the Model

model.fit(train_data_X, train_data_Y, batch_size=32, epochs=1, validation_split=0.2)

4.4 Save the Model

model.save('data/seq2seq_model.h5')
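To actually generate summaries with this baseline, one can take the argmax over the vocabulary at each time step and map the indices back through reverse_vocab; a minimal greedy-decoding sketch (batching and proper <END> handling left out):

# Predict id sequences for a few test samples and decode them to words
preds = model.predict(test_data_X[:5])  # shape: (5, output_sequence_length, vocab_size)
for ids in preds.argmax(axis=-1):       # greedy decoding per time step
    words = [reverse_vocab[i] for i in ids]
    # Drop the special tokens for readability
    print(' '.join(w for w in words if w not in ('<START>', '<END>', '<PAD>')))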
