1 Overview
This series on text classification will run to roughly ten posts, covering classification based on word2vec pretrained vectors as well as classification based on the latest pretrained models (ELMo, BERT, etc.).
The jupyter notebook code is in the textClassifier repository; the Python code is in text_classfier under NLP-Project.
2 Dataset
The dataset consists of IMDB movie reviews. There are three data files under the /data/rawData directory: unlabeledTrainData.tsv, labeledTrainData.tsv, and testData.tsv. Text classification requires labeled data (labeledTrainData). The data is preprocessed in the same way as in Text Classification in Practice (1): Pretraining Word Vectors with word2vec; the preprocessed file is /data/preprocess/labeledTrain.csv.
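As a quick sanity check, the preprocessed file can be inspected with pandas before training. This is a minimal sketch (not part of the original notebook) that assumes the file keeps the review and sentiment columns which the Dataset class below reads:

import pandas as pd

# Peek at the preprocessed data (path taken from the configuration used below)
df = pd.read_csv("../data/preProcess/labeledTrain.csv")
print(df.shape)                             # number of reviews, number of columns
print(df[["review", "sentiment"]].head())   # review text and its sentiment label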
3 Transformer Model Structure
The Transformer model comes from the paper Attention Is All You Need; a detailed introduction to the Transformer is given in a separate post. The structure of the model is shown in the figure below:
The Transformer consists of two parts, the Encoder and the Decoder. Only the Encoder is used for text classification; the Decoder is a generative model used mainly for natural language generation.
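At the heart of the Encoder is multi-head scaled dot-product attention. For reference, the formulation from Attention Is All You Need is:

\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V

\mathrm{MultiHead}(Q,K,V) = \mathrm{Concat}(\mathrm{head}_1,\dots,\mathrm{head}_h)\,W^{O},\qquad \mathrm{head}_i = \mathrm{Attention}(QW_i^{Q},\,KW_i^{K},\,VW_i^{V})

The _multiheadAttention method in section 7 follows this computation, applying the projections before splitting into heads.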
4 Parameter Configuration
import os
import csv
import time
import datetime
import random
import json
import warnings
from collections import Counter
from math import sqrt

import gensim
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

warnings.filterwarnings("ignore")

# Parameter configuration
class TrainingConfig(object):
    epoches = 10
    evaluateEvery = 100
    checkpointEvery = 100
    learningRate = 0.001


class ModelConfig(object):
    embeddingSize = 200

    filters = 128  # number of filters in the inner 1-D convolution; the outer convolution should have
                   # embeddingSize filters so that each layer's output dimension matches its input dimension

    numHeads = 8  # number of attention heads
    numBlocks = 1  # number of transformer blocks
    epsilon = 1e-8  # small constant added in the LayerNorm layer to avoid division by zero
    keepProp = 0.9  # dropout keep probability inside multi-head attention

    dropoutKeepProb = 0.5  # dropout keep probability of the fully connected layer
    l2RegLambda = 0.0


class Config(object):
    sequenceLength = 200  # roughly the mean of all sequence lengths
    batchSize = 128

    dataSource = "../data/preProcess/labeledTrain.csv"
    stopWordSource = "../data/english"

    numClasses = 1  # set to 1 for binary classification, to the number of classes for multi-class

    rate = 0.8  # proportion of data used for training

    training = TrainingConfig()
    model = ModelConfig()


# Instantiate the configuration object
config = Config()
5 Generating the Training Data
1) Load the data, split the sentences into word tokens, and remove low-frequency words and stop words.
2) Map the words to indices, build a word-to-index vocabulary table, and save it in json format so it can be used later for inference. (Note: some words may not appear in the pretrained word2vec vectors; such words are simply represented as UNK.)
3) Read the word vectors from the pretrained word2vec model and feed them into the model as initialization values.
4) Split the dataset into a training set and a test set.
# Data preprocessing class that generates the training and evaluation sets
class Dataset(object):
    def __init__(self, config):
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource

        self._sequenceLength = config.sequenceLength  # every input sequence is padded/truncated to this length
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate

        self._stopWordDict = {}

        self.trainReviews = []
        self.trainLabels = []

        self.evalReviews = []
        self.evalLabels = []

        self.wordEmbedding = None

        self.labelList = []

    def _readData(self, filePath):
        """Read the dataset from the csv file"""
        df = pd.read_csv(filePath)

        if self.config.numClasses == 1:
            labels = df["sentiment"].tolist()
        elif self.config.numClasses > 1:
            labels = df["rate"].tolist()

        review = df["review"].tolist()
        reviews = [line.strip().split() for line in review]

        return reviews, labels

    def _labelToIndex(self, labels, label2idx):
        """Convert the labels to index representation"""
        labelIds = [label2idx[label] for label in labels]
        return labelIds

    def _wordToIndex(self, reviews, word2idx):
        """Convert the words to indices"""
        reviewIds = [[word2idx.get(item, word2idx["UNK"]) for item in review] for review in reviews]
        return reviewIds

    def _genTrainEvalData(self, x, y, word2idx, rate):
        """Generate the training and evaluation sets"""
        reviews = []
        for review in x:
            if len(review) >= self._sequenceLength:
                reviews.append(review[:self._sequenceLength])
            else:
                reviews.append(review + [word2idx["PAD"]] * (self._sequenceLength - len(review)))

        trainIndex = int(len(x) * rate)

        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.array(y[:trainIndex], dtype="float32")

        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.array(y[trainIndex:], dtype="float32")

        return trainReviews, trainLabels, evalReviews, evalLabels

    def _genVocabulary(self, reviews, labels):
        """Generate the word embeddings and the word-to-index mapping; the full dataset can be used here"""
        allWords = [word for review in reviews for word in review]

        # Remove stop words
        subWords = [word for word in allWords if word not in self.stopWordDict]

        wordCount = Counter(subWords)  # count word frequencies
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)

        # Remove low-frequency words
        words = [item[0] for item in sortWordCount if item[1] >= 5]

        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding

        word2idx = dict(zip(vocab, list(range(len(vocab)))))

        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))

        # Save the word-to-index mapping as json so it can be loaded directly at inference time
        with open("../data/wordJson/word2idx.json", "w", encoding="utf-8") as f:
            json.dump(word2idx, f)

        with open("../data/wordJson/label2idx.json", "w", encoding="utf-8") as f:
            json.dump(label2idx, f)

        return word2idx, label2idx

    def _getWordEmbedding(self, words):
        """Look up the pretrained word2vec vectors for the words in our dataset"""
        wordVec = gensim.models.KeyedVectors.load_word2vec_format("../word2vec/word2Vec.bin", binary=True)
        vocab = []
        wordEmbedding = []

        # Add "PAD" and "UNK"
        vocab.append("PAD")
        vocab.append("UNK")
        wordEmbedding.append(np.zeros(self._embeddingSize))
        wordEmbedding.append(np.random.randn(self._embeddingSize))

        for word in words:
            try:
                vector = wordVec.wv[word]
                vocab.append(word)
                wordEmbedding.append(vector)
            except:
                print(word + " is not in the word2vec vocabulary")

        return vocab, np.array(wordEmbedding)

    def _readStopWord(self, stopWordPath):
        """Read the stop word list"""
        with open(stopWordPath, "r") as f:
            stopWords = f.read()
            stopWordList = stopWords.splitlines()
            # Store the stop words in a dict so lookups are fast
            self.stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))

    def dataGen(self):
        """Initialize the training and evaluation sets"""
        # Initialize the stop word dict
        self._readStopWord(self._stopWordSource)

        # Read the dataset
        reviews, labels = self._readData(self._dataSource)

        # Build the word-to-index mapping and the embedding matrix
        word2idx, label2idx = self._genVocabulary(reviews, labels)

        # Convert the labels and sentences to indices
        labelIds = self._labelToIndex(labels, label2idx)
        reviewIds = self._wordToIndex(reviews, word2idx)

        # Split into training and evaluation sets
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviewIds, labelIds, word2idx, self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels

        self.evalReviews = evalReviews
        self.evalLabels = evalLabels


data = Dataset(config)
data.dataGen()
6 Generating Batch Data
Batches are fed to the model through a generator; a generator avoids loading all of the data into memory at once.
# Output batch datasets
def nextBatch(x, y, batchSize):
    """Generate batches, yielding them from a generator"""
    perm = np.arange(len(x))
    np.random.shuffle(perm)
    x = x[perm]
    y = y[perm]

    numBatches = len(x) // batchSize

    for i in range(numBatches):
        start = i * batchSize
        end = start + batchSize
        batchX = np.array(x[start: end], dtype="int64")
        batchY = np.array(y[start: end], dtype="float32")

        yield batchX, batchY
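For reference, the training loop in section 9 consumes this generator one batch at a time, along these lines (sketch only):

# Sketch of how the generator is consumed; the full loop appears in section 9
for batchX, batchY in nextBatch(trainReviews, trainLabels, config.batchSize):
    pass  # run one training step on (batchX, batchY)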
7 The Transformer Model
Some practical notes on the Transformer model:
1) The fixed one-hot position embedding I chose here works better than the sine/cosine position embedding proposed in the paper (the formula is given right after this list). A possible reason is that the paper-style position embedding was passed in as trainable values, which increases model complexity and hurts performance on a small dataset (the IMDB training set has 20,000 examples).
2) The mask may not be needed: adding or removing it makes essentially no difference to the results. It may help on other tasks or datasets, but the paper does not insist on a mask in the encoder; masking is mostly used in the decoder.
3) The number of transformer layers can be tuned to the size of your dataset; on a small dataset one layer is usually enough.
4) Apply dropout to the sub-layers, mainly the multi-head attention layer. Since the feed-forward layer here is implemented with convolutions, skipping dropout there should be fine; if the feed-forward layer is implemented with fully connected layers, add dropout there as well.
5) On small datasets the Transformer is not necessarily better than Bi-LSTM + Attention; on IMDB it actually does worse.
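For reference, the sinusoidal position embedding from the paper, which is what the _positionEmbedding method below implements, is:

PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\mathrm{model}}}}\right),\qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\mathrm{model}}}}\right)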
# Generate the fixed (one-hot) position embedding
def fixedPositionEmbedding(batchSize, sequenceLen):
    embeddedPosition = []
    for batch in range(batchSize):
        x = []
        for step in range(sequenceLen):
            a = np.zeros(sequenceLen)
            a[step] = 1
            x.append(a)
        embeddedPosition.append(x)

    return np.array(embeddedPosition, dtype="float32")


# Model definition
class Transformer(object):
    """
    Transformer Encoder used for text classification
    """
    def __init__(self, config, wordEmbedding):

        # Model inputs
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")

        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")
        self.embeddedPosition = tf.placeholder(tf.float32, [None, config.sequenceLength, config.sequenceLength],
                                               name="embeddedPosition")

        self.config = config

        # l2 regularization loss
        l2Loss = tf.constant(0.0)

        # Embedding layer. The position vectors can be defined in two ways: one is to pass them in directly as fixed
        # one-hot vectors and concatenate them with the word vectors, which performs better on this dataset; the other
        # is to follow the method in the paper, which actually performs worse here, probably because it increases the
        # model complexity and does not do well on a small dataset.
        with tf.name_scope("embedding"):

            # Initialize the embedding matrix with the pretrained word vectors
            self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
            # Map the input word indices to word vectors, shape [batch_size, sequence_length, embedding_size]
            self.embedded = tf.nn.embedding_lookup(self.W, self.inputX)
            self.embeddedWords = tf.concat([self.embedded, self.embeddedPosition], -1)

        with tf.name_scope("transformer"):
            for i in range(config.model.numBlocks):
                with tf.name_scope("transformer-{}".format(i + 1)):

                    # shape [batch_size, sequence_length, embedding_size]
                    multiHeadAtt = self._multiheadAttention(rawKeys=self.inputX, queries=self.embeddedWords,
                                                            keys=self.embeddedWords)
                    # shape [batch_size, sequence_length, embedding_size]
                    self.embeddedWords = self._feedForward(multiHeadAtt,
                                                           [config.model.filters,
                                                            config.model.embeddingSize + config.sequenceLength])

            outputs = tf.reshape(self.embeddedWords,
                                 [-1, config.sequenceLength * (config.model.embeddingSize + config.sequenceLength)])

        outputSize = outputs.get_shape()[-1].value

        # The variant below follows the paper: add the sinusoidal position embedding to the word embedding
        # instead of concatenating a one-hot position vector.
        # with tf.name_scope("wordEmbedding"):
        #     self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
        #     self.wordEmbedded = tf.nn.embedding_lookup(self.W, self.inputX)

        # with tf.name_scope("positionEmbedding"):
        #     print(self.wordEmbedded)
        #     self.positionEmbedded = self._positionEmbedding()

        # self.embeddedWords = self.wordEmbedded + self.positionEmbedded

        # with tf.name_scope("transformer"):
        #     for i in range(config.model.numBlocks):
        #         with tf.name_scope("transformer-{}".format(i + 1)):
        #             # shape [batch_size, sequence_length, embedding_size]
        #             multiHeadAtt = self._multiheadAttention(rawKeys=self.wordEmbedded, queries=self.embeddedWords,
        #                                                     keys=self.embeddedWords)
        #             # shape [batch_size, sequence_length, embedding_size]
        #             self.embeddedWords = self._feedForward(multiHeadAtt,
        #                                                    [config.model.filters, config.model.embeddingSize])

        # outputs = tf.reshape(self.embeddedWords, [-1, config.sequenceLength * (config.model.embeddingSize)])

        # outputSize = outputs.get_shape()[-1].value

        with tf.name_scope("dropout"):
            outputs = tf.nn.dropout(outputs, keep_prob=self.dropoutKeepProb)

        # Output of the fully connected layer
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW",
                shape=[outputSize, config.numClasses],
                initializer=tf.contrib.layers.xavier_initializer())

            outputB = tf.Variable(tf.constant(0.1, shape=[config.numClasses]), name="outputB")
            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.nn.xw_plus_b(outputs, outputW, outputB, name="logits")

            if config.numClasses == 1:
                self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.float32, name="predictions")
            elif config.numClasses > 1:
                self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")

        # Compute the cross-entropy loss
        with tf.name_scope("loss"):

            if config.numClasses == 1:
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                                 labels=tf.cast(tf.reshape(self.inputY, [-1, 1]),
                                                                                dtype=tf.float32))
            elif config.numClasses > 1:
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.inputY)

            self.loss = tf.reduce_mean(losses) + config.model.l2RegLambda * l2Loss

    def _layerNormalization(self, inputs, scope="layerNorm"):
        # LayerNorm differs from batch normalization
        epsilon = self.config.model.epsilon

        inputsShape = inputs.get_shape()  # [batch_size, sequence_length, embedding_size]

        paramsShape = inputsShape[-1:]

        # LayerNorm computes the mean and variance over the last dimension of the input,
        # whereas batch normalization considers all dimensions.
        # mean and variance both have shape [batch_size, sequence_len, 1]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)

        beta = tf.Variable(tf.zeros(paramsShape))
        gamma = tf.Variable(tf.ones(paramsShape))

        normalized = (inputs - mean) / ((variance + epsilon) ** .5)

        outputs = gamma * normalized + beta

        return outputs

    def _multiheadAttention(self, rawKeys, queries, keys, numUnits=None, causality=False, scope="multiheadAttention"):
        # rawKeys is only used to compute the mask, because keys already have the position embedding added,
        # so padded positions in keys are no longer zero.

        numHeads = self.config.model.numHeads
        keepProp = self.config.model.keepProp

        if numUnits is None:  # if no value is passed in, use the last dimension of the input, i.e. the embedding size
            numUnits = queries.get_shape().as_list()[-1]

        # tf.layers.dense can apply a non-linear mapping to multi-dimensional tensors. When computing self-attention
        # the three inputs must go through this mapping; it corresponds to the per-head weight projection in the
        # paper's Multi-Head Attention, except that we project first and split afterwards, which is equivalent.
        # Q, K, V all have shape [batch_size, sequence_length, embedding_size]
        Q = tf.layers.dense(queries, numUnits, activation=tf.nn.relu)
        K = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)
        V = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)

        # Split along the last dimension into numHeads pieces and concatenate along the first dimension.
        # Q_, K_, V_ all have shape [batch_size * numHeads, sequence_length, embedding_size / numHeads]
        Q_ = tf.concat(tf.split(Q, numHeads, axis=-1), axis=0)
        K_ = tf.concat(tf.split(K, numHeads, axis=-1), axis=0)
        V_ = tf.concat(tf.split(V, numHeads, axis=-1), axis=0)

        # Dot product between queries and keys, shape [batch_size * numHeads, queries_len, key_len];
        # the last two dimensions are the query and key sequence lengths
        similary = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))

        # Scale the dot products by the square root of the vector length
        scaledSimilary = similary / (K_.get_shape().as_list()[-1] ** 0.5)

        # The input sequences contain padding tokens, which should contribute nothing to the final result. In
        # principle, when the padding inputs are all 0 the computed weights are 0 as well, but the transformer adds
        # position vectors, after which those values are no longer 0, so the padded positions have to be masked
        # before the position vectors are added. Padding also exists in the queries, but the model output only
        # depends on the inputs, and in self-attention queries = keys, so as long as one side is 0 the resulting
        # weight is 0.
        # For more on key masking see: https://github.com/Kyubyong/transformer/issues/3

        # Tile with tf.tile, shape [batch_size * numHeads, keys_len] where keys_len is the key sequence length
        keyMasks = tf.tile(rawKeys, [numHeads, 1])

        # Add a dimension and tile, shape [batch_size * numHeads, queries_len, keys_len]
        keyMasks = tf.tile(tf.expand_dims(keyMasks, 1), [1, tf.shape(queries)[1], 1])

        # tf.ones_like builds a tensor of ones with the same shape as scaledSimilary, then scale it to a very
        # large negative value
        paddings = tf.ones_like(scaledSimilary) * (-2 ** (32 + 1))

        # tf.where(condition, x, y): elements where condition is True are taken from x, the rest from y, so
        # condition, x and y must have the same shape. Positions where keyMasks is 0 are replaced with paddings.
        maskedSimilary = tf.where(tf.equal(keyMasks, 0), paddings,
                                  scaledSimilary)  # shape [batch_size * numHeads, queries_len, key_len]

        # Causal masking attends only to the left context and appears in the Transformer Decoder. For text
        # classification only the Transformer Encoder is needed; the Decoder is a generative model used mainly
        # for language generation.
        if causality:
            diagVals = tf.ones_like(maskedSimilary[0, :, :])  # [queries_len, keys_len]
            tril = tf.contrib.linalg.LinearOperatorTriL(diagVals).to_dense()  # [queries_len, keys_len]
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(maskedSimilary)[0], 1, 1])  # [batch_size * numHeads, queries_len, keys_len]

            paddings = tf.ones_like(masks) * (-2 ** (32 + 1))
            maskedSimilary = tf.where(tf.equal(masks, 0), paddings,
                                      maskedSimilary)  # [batch_size * numHeads, queries_len, keys_len]

        # Softmax over the attention scores, shape [batch_size * numHeads, queries_len, keys_len]
        weights = tf.nn.softmax(maskedSimilary)

        # Weighted sum, shape [batch_size * numHeads, sequence_length, embedding_size / numHeads]
        outputs = tf.matmul(weights, V_)

        # Recombine the multi-head output into the original shape [batch_size, sequence_length, embedding_size]
        outputs = tf.concat(tf.split(outputs, numHeads, axis=0), axis=2)

        outputs = tf.nn.dropout(outputs, keep_prob=keepProp)

        # Residual connection for each sub-layer, i.e. H(x) = F(x) + x
        outputs += queries

        # Layer normalization
        outputs = self._layerNormalization(outputs)
        return outputs

    def _feedForward(self, inputs, filters, scope="multiheadAttention"):
        # The feed-forward sub-layer is implemented with a convolutional network

        # Inner layer
        params = {"inputs": inputs, "filters": filters[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)

        # Outer layer
        params = {"inputs": outputs, "filters": filters[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}

        # A 1-D convolution is used here; the kernel is actually still two-dimensional, but only its height needs
        # to be specified because its width always matches the embedding size.
        # Shape [batch_size, sequence_length, embedding_size]
        outputs = tf.layers.conv1d(**params)

        # Residual connection
        outputs += inputs

        # Normalization
        outputs = self._layerNormalization(outputs)

        return outputs

    def _positionEmbedding(self, scope="positionEmbedding"):
        # Generate the position vectors as in the paper
        batchSize = self.config.batchSize
        sequenceLen = self.config.sequenceLength
        embeddingSize = self.config.model.embeddingSize

        # Generate the position indices and tile them over every sample in the batch
        positionIndex = tf.tile(tf.expand_dims(tf.range(sequenceLen), 0), [batchSize, 1])

        # Compute the first part of each position embedding
        positionEmbedding = np.array([[pos / np.power(10000, (i - i % 2) / embeddingSize) for i in range(embeddingSize)]
                                      for pos in range(sequenceLen)])

        # Wrap the even indices with sin and the odd indices with cos
        positionEmbedding[:, 0::2] = np.sin(positionEmbedding[:, 0::2])
        positionEmbedding[:, 1::2] = np.cos(positionEmbedding[:, 1::2])

        # Convert positionEmbedding to a tensor
        positionEmbedding_ = tf.cast(positionEmbedding, dtype=tf.float32)

        # Build the 3-D matrix [batchSize, sequenceLen, embeddingSize]
        positionEmbedded = tf.nn.embedding_lookup(positionEmbedding_, positionIndex)

        return positionEmbedded
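One detail worth noting: because the one-hot position embedding (of length sequenceLength) is concatenated with the word vector, the vectors flowing through each block have size embeddingSize + sequenceLength, which is why both the outer feed-forward filter count and the final reshape use that sum. A quick check under the default configuration (hiddenSize is just an illustrative name):

# Per-position vector size inside each Transformer block when the one-hot
# position embedding is concatenated to the word embedding
hiddenSize = config.model.embeddingSize + config.sequenceLength
print(hiddenSize)  # 200 + 200 = 400 with the defaults above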
8 Defining the Metric Functions
"""定义各类性能指标"""
def mean(item: list) ->float:"""计算列表中元素的平均值
:param item: 列表对象
:return:"""res= sum(item) / len(item) if len(item) > 0 else0returnresdefaccuracy(pred_y, true_y):"""计算二类和多类的准确率
:param pred_y: 预测结果
:param true_y: 真实结果
:return:"""
ifisinstance(pred_y[0], list):
pred_y= [item[0] for item inpred_y]
corr=0for i inrange(len(pred_y)):if pred_y[i] ==true_y[i]:
corr+= 1acc= corr / len(pred_y) if len(pred_y) > 0 else0returnaccdef binary_precision(pred_y, true_y, positive=1):"""二类的精确率计算
:param pred_y: 预测结果
:param true_y: 真实结果
:param positive: 正例的索引表示
:return:"""corr=0
pred_corr=0for i inrange(len(pred_y)):if pred_y[i] ==positive:
pred_corr+= 1
if pred_y[i] ==true_y[i]:
corr+= 1prec= corr / pred_corr if pred_corr > 0 else0returnprecdef binary_recall(pred_y, true_y, positive=1):"""二类的召回率
:param pred_y: 预测结果
:param true_y: 真实结果
:param positive: 正例的索引表示
:return:"""corr=0
true_corr=0for i inrange(len(pred_y)):if true_y[i] ==positive:
true_corr+= 1
if pred_y[i] ==true_y[i]:
corr+= 1rec= corr / true_corr if true_corr > 0 else0returnrecdef binary_f_beta(pred_y, true_y, beta=1.0, positive=1):"""二类的f beta值
:param pred_y: 预测结果
:param true_y: 真实结果
:param beta: beta值
:param positive: 正例的索引表示
:return:"""precision=binary_precision(pred_y, true_y, positive)
recall=binary_recall(pred_y, true_y, positive)try:
f_b= (1 + beta * beta) * precision * recall / (beta * beta * precision +recall)except:
f_b=0returnf_bdefmulti_precision(pred_y, true_y, labels):"""多类的精确率
:param pred_y: 预测结果
:param true_y: 真实结果
:param labels: 标签列表
:return:"""
ifisinstance(pred_y[0], list):
pred_y= [item[0] for item inpred_y]
precisions= [binary_precision(pred_y, true_y, label) for label inlabels]
prec=mean(precisions)returnprecdefmulti_recall(pred_y, true_y, labels):"""多类的召回率
:param pred_y: 预测结果
:param true_y: 真实结果
:param labels: 标签列表
:return:"""
ifisinstance(pred_y[0], list):
pred_y= [item[0] for item inpred_y]
recalls= [binary_recall(pred_y, true_y, label) for label inlabels]
rec=mean(recalls)returnrecdef multi_f_beta(pred_y, true_y, labels, beta=1.0):"""多类的f beta值
:param pred_y: 预测结果
:param true_y: 真实结果
:param labels: 标签列表
:param beta: beta值
:return:"""
ifisinstance(pred_y[0], list):
pred_y= [item[0] for item inpred_y]
f_betas= [binary_f_beta(pred_y, true_y, beta, label) for label inlabels]
f_beta=mean(f_betas)returnf_betadef get_binary_metrics(pred_y, true_y, f_beta=1.0):"""得到二分类的性能指标
:param pred_y:
:param true_y:
:param f_beta:
:return:"""acc=accuracy(pred_y, true_y)
recall=binary_recall(pred_y, true_y)
precision=binary_precision(pred_y, true_y)
f_beta=binary_f_beta(pred_y, true_y, f_beta)returnacc, recall, precision, f_betadef get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):"""得到多分类的性能指标
:param pred_y:
:param true_y:
:param labels:
:param f_beta:
:return:"""acc=accuracy(pred_y, true_y)
recall=multi_recall(pred_y, true_y, labels)
precision=multi_precision(pred_y, true_y, labels)
f_beta=multi_f_beta(pred_y, true_y, labels, f_beta)return acc, recall, precision, f_beta
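As a quick, illustrative sanity check (not part of the original notebook), the binary metrics behave as expected on a toy example:

# Toy example: 4 predictions against 4 labels
pred = [1, 0, 1, 1]
true = [1, 0, 0, 1]
acc, recall, precision, f_beta = get_binary_metrics(pred, true)
print(acc, recall, precision, f_beta)  # 0.75 1.0 0.666... 0.8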
9 Training the Model
During training we write tensorBoard summaries and save the model in two different ways.
# Train the model

# Training and evaluation sets
trainReviews = data.trainReviews
trainLabels = data.trainLabels
evalReviews = data.evalReviews
evalLabels = data.evalLabels

wordEmbedding = data.wordEmbedding
labelList = data.labelList

embeddedPosition = fixedPositionEmbedding(config.batchSize, config.sequenceLength)

# Define the computation graph
with tf.Graph().as_default():

    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9  # limit the fraction of GPU memory used

    sess = tf.Session(config=session_conf)

    # Define the session
    with sess.as_default():
        transformer = Transformer(config, wordEmbedding)

        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        # Define the optimizer with the configured learning rate
        optimizer = tf.train.AdamOptimizer(config.training.learningRate)
        # Compute the gradients, returning (gradient, variable) pairs
        gradsAndVars = optimizer.compute_gradients(transformer.loss)
        # Apply the gradients to the variables to build the training op
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # Summaries for tensorBoard
        gradSummaries = []
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = os.path.abspath(os.path.join(os.path.curdir, "summarys"))
        print("Writing to {}\n".format(outDir))

        lossSummary = tf.summary.scalar("loss", transformer.loss)
        summaryOp = tf.summary.merge_all()

        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.summary.FileWriter(trainSummaryDir, sess.graph)

        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.summary.FileWriter(evalSummaryDir, sess.graph)

        # Saver for checkpoints, keeping at most 5 of them
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # One way of saving the model: export it as a pb file
        savedModelPath = "../model/transformer/savedModel"
        if os.path.exists(savedModelPath):
            os.rmdir(savedModelPath)
        builder = tf.saved_model.builder.SavedModelBuilder(savedModelPath)

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def trainStep(batchX, batchY):
            """Run one training step"""
            feed_dict = {
                transformer.inputX: batchX,
                transformer.inputY: batchY,
                transformer.dropoutKeepProb: config.model.dropoutKeepProb,
                transformer.embeddedPosition: embeddedPosition
            }
            _, summary, step, loss, predictions = sess.run(
                [trainOp, summaryOp, globalStep, transformer.loss, transformer.predictions],
                feed_dict)

            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY,
                                                              labels=labelList)

            trainSummaryWriter.add_summary(summary, step)

            return loss, acc, prec, recall, f_beta

        def devStep(batchX, batchY):
            """Run one evaluation step"""
            feed_dict = {
                transformer.inputX: batchX,
                transformer.inputY: batchY,
                transformer.dropoutKeepProb: 1.0,
                transformer.embeddedPosition: embeddedPosition
            }
            summary, step, loss, predictions = sess.run(
                [summaryOp, globalStep, transformer.loss, transformer.predictions],
                feed_dict)

            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY,
                                                              labels=labelList)

            evalSummaryWriter.add_summary(summary, step)

            return loss, acc, prec, recall, f_beta

        for i in range(config.training.epoches):
            # Train the model
            print("start training model")
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                loss, acc, prec, recall, f_beta = trainStep(batchTrain[0], batchTrain[1])

                currentStep = tf.train.global_step(sess, globalStep)
                print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
                    currentStep, loss, acc, recall, prec, f_beta))

                if currentStep % config.training.evaluateEvery == 0:
                    print("\nEvaluation:")

                    losses = []
                    accs = []
                    f_betas = []
                    precisions = []
                    recalls = []

                    for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, precision, recall, f_beta = devStep(batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        f_betas.append(f_beta)
                        precisions.append(precision)
                        recalls.append(recall)

                    time_str = datetime.datetime.now().isoformat()
                    print("{}, step: {}, loss: {}, acc: {}, precision: {}, recall: {}, f_beta: {}".format(
                        time_str, currentStep, mean(losses), mean(accs), mean(precisions),
                        mean(recalls), mean(f_betas)))

                if currentStep % config.training.checkpointEvery == 0:
                    # The other way of saving the model: write checkpoint files
                    path = saver.save(sess, "../model/Transformer/model/my-model", global_step=currentStep)
                    print("Saved model checkpoint to {}\n".format(path))

        inputs = {"inputX": tf.saved_model.utils.build_tensor_info(transformer.inputX),
                  "keepProb": tf.saved_model.utils.build_tensor_info(transformer.dropoutKeepProb)}

        outputs = {"predictions": tf.saved_model.utils.build_tensor_info(transformer.predictions)}

        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs, outputs=outputs,
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

        legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING],
                                             signature_def_map={"predict": prediction_signature},
                                             legacy_init_op=legacy_init_op)

        builder.save()