A typical deep learning text classifier breaks down into three modules: 1. data processing; 2. network construction; 3. overall orchestration (main).
Raw data: 体育\tNBA球星某某某绝杀了... and so on.
convert ↓
Target data (the form a neural network can actually accept): x = tf.placeholder([custom shape]), y = tf.placeholder([one-hot])
↓
体育 ---> label ---> [0,0,0,0,0,0,0,0,0,1]
NBA球星... ---> sentence (text) ---> [[embedding_dim], [embedding_dim], ...]
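To make the mapping concrete, here is a toy sketch; the ids and the category index below are invented purely for illustration (the real mappings come from build_vocab and read_category further down):

word_to_id = {'<PAD>': 0, 'N': 1, 'B': 2, 'A': 3, '球': 4, '星': 5}  # invented ids
cat_to_id = {'体育': 9}  # invented: suppose 体育 is class 9 of 10

text, label = 'NBA球星', '体育'
x = [word_to_id[ch] for ch in text]   # [1, 2, 3, 4, 5]
y = [0] * 10
y[cat_to_id[label]] = 1               # [0,0,0,0,0,0,0,0,0,1]

The embedding layer in the network later turns each id in x into an embedding_dim-dimensional vector, which is what [[embedding_dim], [embedding_dim], ...] above denotes.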
from collections import Counter  # convenient frequency counting
import numpy as np
import tensorflow.contrib.keras as kr
def open_file(filename, mode='r'):
    return open(filename, mode, encoding='utf-8', errors='ignore')

def read_file(filename):
    # Character level, no word segmentation: "体育\tNBA球星..." -> [N, B, A, 球, 星, ...], 体育
    contents, labels = [], []
    for line in open_file(filename):
        label, content = line.strip().split('\t')
        contents.append(list(content))
        labels.append(label)
    return contents, labels  # all texts and all labels
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    # Not every character will be used, so build a vocabulary that maps
    # characters to numbers the computer can work with. Keep only the top-K.
    data_train, _ = read_file(train_dir)  # labels are not needed here
    all_data = []
    for content in data_train:  # flatten every text into one character list
        all_data.extend(content)
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # The vocabulary is now ready: prepend <PAD> at the head and save it to vocab_dir
    words = ['<PAD>'] + list(words)
    open_file(vocab_dir, 'w').write('\n'.join(words) + '\n')
def read_vocab(vocab_dir):
    # Goal: load the vocabulary `words` and its lookup dict `word_to_id`
    with open_file(vocab_dir) as f:
        words = [line.strip() for line in f]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id  # <PAD> was prepended at the head, so its id is 0
def read_category():
    # The fixed list of category labels
    categories = ['体育', '财经', ...]  # truncated here; 10 classes in total
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id
def to_words(content, words):
    # Convert a list of ids back into text
    return ''.join(words[x] for x in content)
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    # The main external interface: file -> (padded id matrix, one-hot labels)
    contents, labels = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])  # one id list per text
        label_id.append(cat_to_id[labels[i]])  # one integer per text
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)  # pad/truncate every text to 600
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # one-hot encode the labels
    return x_pad, y_pad
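End to end, the data-processing module is then used like this (the file paths are placeholders I made up for illustration):

train_dir, vocab_dir = 'data/cnews.train.txt', 'data/cnews.vocab.txt'  # hypothetical paths

build_vocab(train_dir, vocab_dir, vocab_size=5000)  # run once to create the vocabulary file
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, max_length=600)
# x_train.shape == (num_samples, 600); y_train.shape == (num_samples, len(categories))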
def batch_iter(x, y, batch_size=64):
    # Return a generator over mini-batches; see the sketch right below
    pass
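A minimal sketch of that generator, assuming x and y are the NumPy arrays returned by process_file (the shuffle on each call is my assumption; the original left the body out):

def batch_iter(x, y, batch_size=64):
    # Shuffle once, then yield successive (x_batch, y_batch) slices;
    # a generator avoids holding every batch in memory at once.
    indices = np.random.permutation(len(x))
    x_shuffled, y_shuffled = x[indices], y[indices]
    for i in range(0, len(x), batch_size):
        yield x_shuffled[i:i + batch_size], y_shuffled[i:i + batch_size]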
import tensorflow as tf
class TCNNConfig(object):
    embedding_dim = 64      # dimension of each character embedding
    seq_length = 600        # every text is padded/truncated to this length
    num_classes = 10
    num_filters = 256       # number of convolution filters
    kernel_size = 5
    vocab_size = 5000
    hidden_dim = 128        # units in the fully connected layer
    dropout_keep_prob = 0.5
    learning_rate = 1e-3
    batch_size = 64
    num_epochs = 10
    print_per_batch = 100   # report metrics every 100 batches
    save_per_batch = 10     # save summaries every 10 batches
class TextCNN(object):
    def __init__(self, config):
        self.config = config
        # Placeholders to be fed at run time
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')  # character ids, not yet embedded
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')  # float, so it can feed the cross-entropy directly
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.cnn()
    def cnn(self):
        with tf.device('/gpu:0'):
            # [vocab_size, embedding_dim] lookup table, learned along with the model
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)  # [batch, 600, 64]
        with tf.name_scope('CNN'):
            # number of filters and kernel size come from the config
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size)
            # global max pooling over time: [batch, 596, 256] -> [batch, 256]
            gmp = tf.reduce_max(conv, axis=1, name='gmp')
        with tf.name_scope('score'):
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class id
        with tf.name_scope('optimize'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
        with tf.name_scope('acc'):
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
What remains is to combine the two parts above: the main module's control flow has to implement training and testing. I will flesh this out when I get the chance; for now, a rough sketch follows.
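A minimal sketch of that main module, assuming the data-processing functions and TextCNN defined above; the file paths, session options, and logging format are illustrative assumptions, not a finished design:

import os

# Hypothetical file paths, for illustration only
train_dir = 'data/cnews.train.txt'
vocab_dir = 'data/cnews.vocab.txt'

config = TCNNConfig()
if not os.path.exists(vocab_dir):  # build the vocabulary file once
    build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
config.num_classes = len(categories)
config.vocab_size = len(words)
model = TextCNN(config)

x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)

sess_conf = tf.ConfigProto(allow_soft_placement=True)  # fall back to CPU if /gpu:0 is absent
with tf.Session(config=sess_conf) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(config.num_epochs):
        for i, (x_batch, y_batch) in enumerate(batch_iter(x_train, y_train, config.batch_size)):
            feed_dict = {model.input_x: x_batch,
                         model.input_y: y_batch,
                         model.keep_prob: config.dropout_keep_prob}
            _, loss, acc = sess.run([model.optim, model.loss, model.acc], feed_dict=feed_dict)
            if i % config.print_per_batch == 0:
                print('epoch %d, batch %d: loss %.4f, acc %.4f' % (epoch, i, loss, acc))

Testing follows the same feed_dict pattern over the test split, but with keep_prob fixed at 1.0 so that dropout is disabled.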