# Text convolution (a CNN for positive/negative review classification)
import tensorflow as tf
import numpy as np
import re
from tensorflow.contrib import learn
# os.environ["TF_CPP_MIN_LOG_LEVEL"]='2'

tf.set_random_seed(777)  # set the random seed


def clean_str(string):
    """Remove invalid characters from the text and separate the words."""
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)  # keep only A-Za-z0-9(),!?'`
    string = re.sub(r"\'s", " \'s", string)                 # turn "'s" into " 's"
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)                 # collapse runs of 2+ spaces
    return string.strip().lower()                           # strip surrounding spaces and lower-case


learning_rate = 0.001        # learning rate
training_epochs = 10         # total training epochs
batch_size = 64              # samples per batch
dev_sample_percentage = 0.1  # fraction of the data held out as the dev set

positive_data_file = 'rt-polarity.pos'  # path to the positive-review text
negative_data_file = 'rt-polarity.neg'  # path to the negative-review text

positive_examples = list(open(positive_data_file, "r", encoding='gbk', errors='ignore').readlines())
print(positive_examples, '\n')  # read the positive-review file, one review per line (temporary check)
positive_examples = [s.strip() for s in positive_examples]  # strip surrounding spaces from every sentence
print(positive_examples[0])  # positive-review text (temporary check)
negative_examples = list(open(negative_data_file, "r", encoding='gbk', errors='ignore').readlines())
negative_examples = [s.strip() for s in negative_examples]  # strip surrounding spaces from every sentence
print(negative_examples[0])  # negative-review text (temporary check)

# Split into words
x_text = positive_examples + negative_examples  # concatenate all positive and negative sentences
x_text = [clean_str(sent) for sent in x_text]   # remove invalid characters and separate the words

# Build the labels y
positive_labels = [[0, 1] for _ in positive_examples]  # one-hot encoding, positive = [0, 1]
negative_labels = [[1, 0] for _ in negative_examples]  # one-hot encoding, negative = [1, 0]
y_data = np.concatenate([positive_labels, negative_labels], 0)  # concatenate all labels along axis=0
print(x_text[0])  # processed text (temporary check)
print(y_data)     # labels (temporary check)

# Build the vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])  # largest number of words in a sentence (56)
print('Max words in a sentence:', max_document_length)  # (temporary check)
# Re-encode every word as an integer id
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)  # pads/truncates every sentence to max_document_length ids
x = np.array(list(vocab_processor.fit_transform(x_text)))  # x.shape = (10662, 56)
print('Vocabulary size:', len(vocab_processor.vocabulary_))  # (temporary check) vocabulary size = 18759
print('Encoded sentences:', x)  # (temporary check) x.shape = (10662, 56); each position holds a word id, 0 where there is no word
# [[    1     2     3 ...     0     0     0]
#  [    1    31    32 ...     0     0     0]
#  [   57    58    59 ...     0     0     0]
#  ...
#  [   75    84  1949 ...     0     0     0]
#  [    1  2191  2690 ...     0     0     0]
#  [11513     3   147 ...     0     0     0]]
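# --- Minimal sketch, not part of the original pipeline: it only illustrates what
# clean_str and the fitted vocab_processor do to one toy sentence. The names
# demo_sentence and demo_ids are hypothetical and are used nowhere else below.
demo_sentence = clean_str("It's a great movie, isn't it?")
print('cleaned:', demo_sentence)  # lower-cased, space-separated tokens ("'s" and "n't" become separate tokens)
demo_ids = np.array(list(vocab_processor.transform([demo_sentence])))  # transform() reuses the fitted vocabulary
print('demo ids shape:', demo_ids.shape)  # (1, 56): one row, padded/truncated to max_document_length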
# Randomly shuffle the data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y_data)))
x_shuffled = x[shuffle_indices]
y_shuffled = y_data[shuffle_indices]

# Split into training and dev sets
dev_sample_index = -1 * int(dev_sample_percentage * float(len(y_data)))
x_train, x_dev = np.split(x_shuffled, [dev_sample_index])
y_train, y_dev = np.split(y_shuffled, [dev_sample_index])
total = x_train.shape[0]            # number of training samples
sequence_length = x_train.shape[1]  # maximum sentence length (56)
print('Training set:', x_train.shape, '(sentences, max words per sentence)')  # (temporary check)
print('Dev set:', x_dev.shape)  # (temporary check)

g_b = 0


def next_batch(size):
    """Hand-rolled next_batch: return the next batch of training data on every call."""
    global g_b
    xb = x_train[g_b:g_b + size]
    yb = y_train[g_b:g_b + size]
    g_b = g_b + size
    return xb, yb


# Directory for the TensorBoard summaries
TB_SUMMARY_DIR = 'textdir1'

# Placeholders
X = tf.placeholder(tf.int32, [None, max_document_length])  # word ids, shape (batch, 56)
Y = tf.placeholder(tf.float32, [None, 2])                   # one-hot labels
keep_prob = tf.placeholder(tf.float32)                      # dropout keep probability: 0.9 for training, 1.0 for evaluation

# Embedding layer
embedding_size = 8  # dimension of the word vectors
W = tf.Variable(tf.random_uniform([len(vocab_processor.vocabulary_), embedding_size], -1.0, 1.0))  # W.shape = (18759, 8)
embedded_chars = tf.nn.embedding_lookup(W, X)  # shape = (?, 56, 8)
X_img = tf.expand_dims(embedded_chars, -1)     # add a dimension at axis=-1 -> 4-D tensor [?, max_document_length, embedding_size, 1]
print(X_img)  # (temporary check)

# Convolution layer 1
W1 = tf.Variable(tf.random_normal([3, 3, 1, 64], stddev=0.01))
L1 = tf.nn.conv2d(X_img, W1, strides=[1, 1, 1, 1], padding='SAME')
L1 = tf.nn.relu(L1)
L1 = tf.nn.dropout(L1, keep_prob=keep_prob)

# Convolution layer 2
W2 = tf.Variable(tf.random_normal([3, 3, 64, 64], stddev=0.01))
L2 = tf.nn.conv2d(L1, W2, strides=[1, 1, 1, 1], padding='SAME')
L2 = tf.nn.relu(L2)
L2 = tf.nn.dropout(L2, keep_prob=keep_prob)

# Flatten the feature maps
dim = L2.get_shape()[1].value * L2.get_shape()[2].value * L2.get_shape()[3].value
L2_flat = tf.reshape(L2, [-1, dim])

# Fully connected layer (named W_fc so it does not shadow the embedding matrix W)
W_fc = tf.get_variable("W_fc", shape=[dim, 2], initializer=tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.random_normal([2]))
logits = tf.matmul(L2_flat, W_fc) + b

# Cost (loss) function and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

tf.summary.scalar("loss", cost)
summary = tf.summary.merge_all()
global_step = 0

# Create the session
sess = tf.Session()
sess.run(tf.global_variables_initializer())  # initialize the global variables
writer = tf.summary.FileWriter(TB_SUMMARY_DIR, sess.graph)

correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

print('Training started...')
for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = int(total / batch_size)  # number of batches per epoch
    g_b = 0
    for i in range(total_batch):
        batch_xs, batch_ys = next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys, keep_prob: 0.9}
        s, c, _ = sess.run([summary, cost, optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
        writer.add_summary(s, global_step=global_step)
        global_step = global_step + 1
    acc = sess.run(accuracy, feed_dict={X: x_dev, Y: y_dev, keep_prob: 1.0})
    print('Epoch:', (epoch + 1), 'cost =', avg_cost, 'acc =', acc)
print('Training finished')

# Evaluate accuracy on the dev set
print('Accuracy:', sess.run(accuracy, feed_dict={X: x_dev, Y: y_dev, keep_prob: 1.0}))
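# --- Minimal inference sketch, not part of the original script: score one new review
# with the trained graph. The review text and the names new_review / new_ids / pred are
# hypothetical; keep_prob is fed 1.0 so dropout is disabled at inference time.
new_review = "a moving and sharply written story"
new_ids = np.array(list(vocab_processor.transform([clean_str(new_review)])))  # (1, 56) word ids
pred = sess.run(tf.argmax(logits, 1), feed_dict={X: new_ids, keep_prob: 1.0})
print('prediction:', 'positive' if pred[0] == 1 else 'negative')  # [0, 1] = positive, [1, 0] = negative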