Hierarchical Attention Network for Document Classification: Chinese Text Classification and Model Deployment

This Friday I experimented with adding attention to an RNN for text classification. The idea comes from the paper Hierarchical Attention Network for Document Classification. I spent a full day and a half over the weekend on this network, finally got the model trained in TensorFlow, and deployed it in Java. Here is the walkthrough:



First, a piece of Chinese text is treated as a doc, the doc is broken into sentences, and each sentence is segmented into words. The main logic here is how to split a doc into sentences: I first convert full-width characters to half-width and then split on punctuation. The code involved is:

    public final static String splitstr = "[;。.,:?!~、]";

    public static String getSplitStr(String text) {
        // qj2bj converts full-width characters to half-width before splitting
        String result = qj2bj(text).replaceAll("#", "").replaceAll("\n", "#")
                .replaceAll("\\s+", "#").replaceAll("\\-{2,}", "#")
                .replaceAll(splitstr, "#");
        return result;
    }
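The qj2bj helper (full-width to half-width conversion) is not shown in the post. For anyone who prefers to do this preprocessing on the Python side instead, here is a rough equivalent sketch; the function names are my own, and the punctuation set simply mirrors the Java splitstr above:

    import re

    # Same punctuation set as the Java splitstr constant above.
    SPLIT_PATTERN = re.compile(r"[;。.,:?!~、]")

    def qj2bj(text):
        """Convert full-width (quanjiao) characters to half-width (banjiao)."""
        chars = []
        for ch in text:
            code = ord(ch)
            if code == 0x3000:                 # full-width space
                code = 0x20
            elif 0xFF01 <= code <= 0xFF5E:     # full-width ASCII range
                code -= 0xFEE0
            chars.append(chr(code))
        return "".join(chars)

    def get_split_str(text):
        """Mirror of the Java getSplitStr: mark sentence boundaries with '#'."""
        text = qj2bj(text).replace("#", "")
        text = re.sub(r"\s+|-{2,}", "#", text)
        return SPLIT_PATTERN.sub("#", text)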


Sentences within a doc are separated by #. After word segmentation, the text is converted into lines like the following:

    1	我是 这边 工作人员 # 要是 有 什么 问题 可以 咨询 我 #


The first column is the label, followed by a tab (\t) and then the doc; this example doc contains two sentences. Since all of my texts are short, every doc is converted to a fixed 20×20 shape (at most 20 sentences, each with at most 20 words): longer docs are truncated and shorter ones are padded with 0. Once the sentences are segmented and the text is in this form, the rest is Python. The main files are:

    data_utils.py       # helper functions
    runha.py            # main training script
    test.py
    configuration.py    # hyperparameters
    model.py            # the HAN model
    tensorflow          # data directory

data_utils.py:

    import os
    import numpy as np
    import codecs
    import pickle


    def _read_vocab(filename):
        """Read the vocabulary list."""
        words = list(map(lambda line: line.strip(),
                         codecs.open(filename, 'r', encoding='utf-8').readlines()))
        word_to_id = dict(zip(words, range(len(words))))
        return words, word_to_id


    def _read_file(filename, word_to_id, num_classes=2, max_sent_in_doc=20, max_word_in_sent=20):
        data_x = []
        data_y = []
        with open(filename, "r") as f:
            for line in f:
                # Each doc is a fixed 20x20 matrix of word ids; padding and unknown words are both id 0.
                doc = np.zeros((max_sent_in_doc, max_word_in_sent), dtype=np.int32)
                doclabel = line.split("\t")
                if len(doclabel) > 1:
                    label = int(doclabel[0])
                    sents = doclabel[1].split("#")
                    for i, sent in enumerate(sents):
                        if i < max_sent_in_doc and sent != '':
                            for j, word in enumerate(sent.strip().split(" ")):
                                if j < max_word_in_sent and word != '':
                                    doc[i][j] = word_to_id.get(word, 0)
                    labels = [0] * num_classes
                    labels[label - 1] = 1
                    data_y.append(labels)
                    data_x.append(doc.tolist())
        # pickle.dump((data_x, data_y), open('tensorflow/business/business_data', 'wb'))
        return data_x, data_y


    def preocess_file(data_path, vocapath):
        """Return all of the data at once."""
        words, word_to_id = _read_vocab(vocapath)
        x_train, y_train = _read_file(data_path, word_to_id)
        return x_train, y_train, words


    def batch_iter(data, batch_size=64, num_epochs=5):
        """Generate batches of data."""
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
        for epoch in range(num_epochs):
            indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[indices]
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield shuffled_data[start_index:end_index]


    if __name__ == '__main__':
        path = "tensorflow/business/vocab.txt"
        words, word_to_id = _read_vocab(path)
        print(words[0:10])
        print(len(word_to_id))
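One thing worth noting about these helpers: batch_iter takes a single array, so the inputs and labels have to be zipped together before batching and unzipped again when building the feed dict. A minimal usage sketch, with placeholder file paths:

    import numpy as np
    from data_utils import preocess_file, batch_iter

    # Placeholder paths for the training data and vocabulary.
    x_train, y_train, words = preocess_file("tensorflow/business/train.txt",
                                            "tensorflow/business/vocab.txt")

    for batch in batch_iter(list(zip(x_train, y_train)), batch_size=64, num_epochs=5):
        x_batch, y_batch = zip(*batch)
        x_batch = np.array(list(x_batch))   # shape [batch, 20, 20] word ids
        y_batch = np.array(list(y_batch))   # shape [batch, 2] one-hot labels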

model.py:

    import tensorflow as tf
    from tensorflow.contrib import rnn
    from tensorflow.contrib import layers
    import time


    class HAN():
        def __init__(self, config):
            self.config = config
            self.max_sentence_num = self.config.max_sent_in_doc
            self.max_sentence_length = self.config.max_word_in_sent
            self.vocab_size = self.config.vocab_size
            self.num_classes = self.config.num_classes
            self.embedding_size = self.config.embedding_size
            self.hidden_size = self.config.hidden_dim

            # x has shape [batch_size, num_sentences, sentence_length (words)];
            # the batch size varies, so the first dimension is left as None.
            # y has shape [batch_size, num_classes].
            self.input_x = tf.placeholder(tf.int32, [None, self.max_sentence_num, self.max_sentence_length], name='input_x')
            self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

            # Build the model
            word_embedded = self.word2vec()
            sent_vec = self.sent2vec(word_embedded)
            doc_vec = self.doc2vec(sent_vec)
            out = self.classifer(doc_vec)
            self.rnnhamodel(out)

        def rnnhamodel(self, out):
            with tf.name_scope("score"):
                self.pred_y = tf.nn.softmax(out, name="pred_y")
                tf.add_to_collection('pred_network', self.pred_y)
            with tf.name_scope('loss'):
                self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,
                                                                                   logits=out,
                                                                                   name='loss'))
            with tf.name_scope('acc'):
                predict = tf.argmax(out, axis=1, name='predict')
                label = tf.argmax(self.input_y, axis=1, name='label')
                self.acc = tf.reduce_mean(tf.cast(tf.equal(predict, label), tf.float32))

            global_step = tf.Variable(0, trainable=False)
            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            # Gradient clipping, commonly used with RNNs to keep gradients from blowing up
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), self.config.grad_clip)
            grads_and_vars = tuple(zip(grads, tvars))
            self.train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        def word2vec(self):
            # Embedding layer
            with tf.name_scope("embedding"):
                embedding_mat = tf.Variable(tf.truncated_normal((self.vocab_size, self.embedding_size)))
                # shape: [batch_size, sent_in_doc, word_in_sent, embedding_size]
                word_embedded = tf.nn.embedding_lookup(embedding_mat, self.input_x)
            return word_embedded

        def sent2vec(self, word_embedded):
            with tf.name_scope("sent2vec"):
                # The GRU input tensor is [batch_size, max_time, ...]. When building sentence vectors,
                # max_time is the sentence length, so batch_size * sent_in_doc is treated as the batch size.
                # Each GRU cell then processes one word vector, and the attention layer fuses all word
                # vectors of a sentence into a single sentence vector.
                # shape: [batch_size*sent_in_doc, word_in_sent, embedding_size]
                word_embedded = tf.reshape(word_embedded, [-1, self.max_sentence_length, self.embedding_size])
                # shape: [batch_size*sent_in_doc, word_in_sent, hidden_size*2]
                word_encoded = self.BidirectionalGRUEncoder(word_embedded, name='word_encoder')
                # shape: [batch_size*sent_in_doc, hidden_size*2]
                sent_vec = self.AttentionLayer(word_encoded, name='word_attention')
            return sent_vec

        def doc2vec(self, sent_vec):
            # Same idea as sent2vec: build a document vector from all sentence vectors of the document.
            with tf.name_scope("doc2vec"):
                sent_vec = tf.reshape(sent_vec, [-1, self.max_sentence_num, self.hidden_size * 2])
                # shape: [batch_size, sent_in_doc, hidden_size*2]
                doc_encoded = self.BidirectionalGRUEncoder(sent_vec, name='sent_encoder')
                # shape: [batch_size, hidden_size*2]
                doc_vec = self.AttentionLayer(doc_encoded, name='sent_attention')
            return doc_vec

        def classifer(self, doc_vec):
            # Final output layer: a fully connected layer
            with tf.name_scope('doc_classification'):
                out = layers.fully_connected(inputs=doc_vec, num_outputs=self.num_classes, activation_fn=None)
            return out

        def BidirectionalGRUEncoder(self, inputs, name):
            # Bidirectional GRU encoder. It encodes the words of a sentence (or the sentence vectors of a
            # document) into outputs of size 2*hidden_size; the attention layer then combines them into a
            # single sentence/document vector via a weighted sum.
            # inputs shape: [batch_size, max_time, input_size]
            with tf.variable_scope(name):
                if self.config.isgru:
                    GRU_cell_fw = rnn.GRUCell(self.hidden_size)
                    GRU_cell_bw = rnn.GRUCell(self.hidden_size)
                else:
                    GRU_cell_fw = rnn.LSTMCell(self.hidden_size)
                    GRU_cell_bw = rnn.LSTMCell(self.hidden_size)
                # fw_outputs and bw_outputs both have shape [batch_size, max_time, hidden_size]
                ((fw_outputs, bw_outputs), (_, _)) = tf.nn.bidirectional_dynamic_rnn(cell_fw=GRU_cell_fw,
                                                                                     cell_bw=GRU_cell_bw,
                                                                                     inputs=inputs,
                                                                                     sequence_length=self.length(inputs),
                                                                                     dtype=tf.float32)
                # outputs shape: [batch_size, max_time, hidden_size*2]
                outputs = tf.concat((fw_outputs, bw_outputs), 2)
                return outputs

        def AttentionLayer(self, inputs, name):
            # inputs is the GRU output, shape [batch_size, max_time, encoder_size (hidden_size * 2)]
            with tf.variable_scope(name):
                # u_context is the context (importance) vector that scores how much each word/sentence
                # contributes to the sentence/document; with a bidirectional GRU its length is 2*hidden_size.
                u_context = tf.Variable(tf.truncated_normal([self.hidden_size * 2]), name='u_context')
                # A fully connected layer maps the GRU output to its hidden representation;
                # the output has shape [batch_size, max_time, hidden_size * 2]
                h = layers.fully_connected(inputs, self.hidden_size * 2, activation_fn=tf.nn.tanh)
                # shape: [batch_size, max_time, 1]
                alpha = tf.nn.softmax(tf.reduce_sum(tf.multiply(h, u_context), axis=2, keep_dims=True), dim=1)
                # before reduce_sum the shape is [batch_size, max_time, hidden_size*2], afterwards [batch_size, hidden_size*2]
                atten_output = tf.reduce_sum(tf.multiply(inputs, alpha), axis=1)
                atten_output = tf.nn.dropout(
                    atten_output, self.keep_prob,
                    name="dropout")
                return atten_output

        def length(self, sequences):
            # Actual length of each sequence, inferred from the non-zero timesteps
            used = tf.sign(tf.reduce_max(tf.abs(sequences), reduction_indices=2))
            seq_len = tf.reduce_sum(used, reduction_indices=1)
            return tf.cast(seq_len, tf.int32)
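For reference, AttentionLayer implements the word-level attention from the HAN paper (the sentence level is identical in form); in the code, h corresponds to u_it and u_context to the context vector u_w:

    u_{it} = \tanh(W_w h_{it} + b_w), \qquad
    \alpha_{it} = \frac{\exp(u_{it}^{\top} u_w)}{\sum_{t'} \exp(u_{it'}^{\top} u_w)}, \qquad
    s_i = \sum_{t} \alpha_{it} h_{it}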
configuration.py:

    class HAConfig:
        max_sent_in_doc = 20
        max_word_in_sent = 20
        embedding_size = 64      # word embedding dimension
        num_classes = 2          # number of classes
        vocab_size = 9000        # vocabulary size
        num_layers = 2           # number of hidden layers
        hidden_dim = 128         # hidden units per layer
        rnn = 'gru'              # 'lstm' or 'gru'
        keep_prob = 0.7          # dropout keep probability
        learning_rate = 1e-3     # learning rate
        grad_clip = 5            # gradient clipping threshold
        batch_size = 64          # batch size
        num_epochs = 15          # number of epochs
        print_per_batch = 100    # report every N batches
        l2_reg_lambda = 0.003
        isgru = True

runha.py:
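The post does not include the training script itself. Below is a minimal sketch of what it plausibly looks like, pieced together from the helpers above and the log output further down; the paths are placeholders, and the graph-freezing step that produces the "Converted ... to const ops" lines is sketched separately after the training log:

    import time
    import numpy as np
    import tensorflow as tf
    from data_utils import preocess_file, batch_iter
    from configuration import HAConfig
    from model import HAN

    config = HAConfig()
    # Placeholder paths.
    x_train, y_train, words = preocess_file("tensorflow/business/train.txt",
                                            "tensorflow/business/vocab.txt")
    config.vocab_size = len(words)

    model = HAN(config)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    start = time.time()
    step = 0
    for batch in batch_iter(list(zip(x_train, y_train)), config.batch_size, config.num_epochs):
        x_batch, y_batch = zip(*batch)
        feed = {model.input_x: np.array(list(x_batch)),
                model.input_y: np.array(list(y_batch)),
                model.keep_prob: config.keep_prob}
        _, loss, acc = sess.run([model.train_op, model.loss, model.acc], feed_dict=feed)
        step += 1
        if step % config.print_per_batch == 0:
            print("Iter: %d, Train Loss: %.2g, Train Acc: %.2f%%, Time: %ds"
                  % (step, loss, acc * 100, int(time.time() - start)))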



Results:

    2017-11-26 14:48:10.169322: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 3141 get requests, put_count=2789 evicted_count=1000 eviction_rate=0.358551 and unsatisfied allocation rate=0.437759
    2017-11-26 14:48:10.169371: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 256 to 281
    2017-11-26 14:48:11.145964: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 3278 get requests, put_count=3359 evicted_count=1000 eviction_rate=0.297708 and unsatisfied allocation rate=0.298353
    2017-11-26 14:48:11.146014: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 655 to 720
    Iter: 100, Train Loss: 0.12, Train Acc: 95.31%, Time: 0:00:12
    Iter: 200, Train Loss: 0.096, Train Acc: 95.31%, Time: 0:00:22
    Iter: 300, Train Loss: 0.11, Train Acc: 96.88%, Time: 0:00:33
    Iter: 400, Train Loss: 0.11, Train Acc: 96.88%, Time: 0:00:44
    Iter: 500, Train Loss: 0.16, Train Acc: 93.75%, Time: 0:00:55
    Converted 25 variables to const ops.
    attention model saved at step 500
    Iter: 600, Train Loss: 0.048, Train Acc: 96.88%, Time: 0:01:06
    Iter: 700, Train Loss: 0.092, Train Acc: 96.88%, Time: 0:01:16
    Iter: 800, Train Loss: 0.057, Train Acc: 98.44%, Time: 0:01:27
    Iter: 900, Train Loss: 0.047, Train Acc: 96.88%, Time: 0:01:38
    Iter: 1000, Train Loss: 0.056, Train Acc: 98.44%, Time: 0:01:48
    Converted 25 variables to const ops.
    attention model saved at step 1000
    Iter: 1100, Train Loss: 0.02, Train Acc: 98.44%, Time: 0:01:59
    Iter: 1200, Train Loss: 0.028, Train Acc: 98.44%, Time: 0:02:10
    Iter: 1300, Train Loss: 0.0041, Train Acc: 100.00%, Time: 0:02:21
    Iter: 1400, Train Loss: 0.042, Train Acc: 98.44%, Time: 0:02:31
    Iter: 1500, Train Loss: 0.0043, Train Acc: 100.00%, Time: 0:02:42
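The "Converted 25 variables to const ops." lines come from freezing the graph so that the Java side only needs a single .pb file. That code is not shown in the post; here is a minimal sketch of the usual TF 1.x approach, assuming it is called from the training session above (the output path is a placeholder):

    import tensorflow as tf
    from tensorflow.python.framework import graph_util

    def save_frozen_graph(sess, path="tensorflow/business/han_model.pb"):
        """Freeze the graph: variables become constants so only the .pb is needed for serving."""
        frozen = graph_util.convert_variables_to_constants(
            sess, sess.graph_def, output_node_names=["score/pred_y"])
        with tf.gfile.GFile(path, "wb") as f:
            f.write(frozen.SerializeToString())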




Calling the model from Java requires two methods, shown below:

    public static int[][][] gettexttoidBuinessByCutHAN(String text, Map<String, Integer> map) {
        int[][][] docs = new int[1][20][20];
        if (StringUtils.isBlank(text)) {
            return docs;
        }
        String docword = WordUtilHAN.getSegmentHANModelStr(text);
        if (StringUtils.isBlank(docword)) {
            return docs;
        }
        String[] sents = docword.split("#");
        for (int i = 0; i < sents.length && i < 20; i++) {
            if (StringUtils.isNotBlank(sents[i])) {
                String[] words = sents[i].trim().split(" ");
                for (int j = 0; j < words.length && j < 20; j++) {
                    if (StringUtils.isNotBlank(words[j])) {
                        if (map.containsKey(words[j])) {
                            docs[0][i][j] = map.get(words[j]);
                        }
                    }
                }
            }
        }
        return docs;
    }

    public static double getClassifyBusinessByHANModel(String text, Session sess, Map<String, Integer> map, Tensor keep_prob) {
        if (StringUtils.isBlank(text)) {
            return 0.0;
        }
        int[][][] arr = gettexttoidBuinessByCutHAN(text, map);
        Tensor input = Tensor.create(arr);
        Tensor result = sess.runner().feed("input_x", input).feed("keep_prob", keep_prob).fetch("score/pred_y").run()
                .get(0);
        long[] rshape = result.shape();
        int nlabels = (int) rshape[1];
        int batchSize = (int) rshape[0];
        float[][] logits = result.copyTo(new float[batchSize][nlabels]);
        if (nlabels > 1 && batchSize > 0) {
            return logits[0][0];
        }
        return 0.0;
    }


Start the service:

(Screenshot in the original post: the deployed service running after startup.)

