# encoding:utf-8
import numpy as np
# Vocabulary: the entries stored in wordsList.npy are decoded from UTF-8
# bytes into ordinary strings.
wordsList = [
    raw_word.decode('UTF-8')
    for raw_word in np.load('wordsList.npy').tolist()
]
# Embedding matrix loaded as-is; presumably row i is the vector for
# wordsList[i] -- TODO confirm against how the .npy files were produced.
wordVectors = np.load('wordVectors.npy')
# print("wordsList", wordsList)
# 'muguti', 'boidin', 'madueke', 'smikle', 'uteritz', 'gusin', ... (400,000 entries)
# print("wordVectors", wordVectors)
# wordVectors [[ 0. 0. 0. ... 0. 0. 0. ]
# [ 0.013441 0.23682 -0.16899 ... -0.56657 0.044691 0.30392 ]
# [ 0.15164 0.30177 -0.16763 ... -0.35652 0.016413 0.10216 ]
# ...
# [-0.51181 0.058706 1.0913 ... -0.25003 -1.125 1.5863 ]
# [-0.75898 -0.47426 0.4737 ... 0.78954 -0.014116 0.6448 ]
# [-0.79149 0.86617 0.11998 ... -0.29996 -0.0063003 0.3954 ]]
# 400,000 x 50-dimensional vectors
import os
from os.path import isfile, join
# Paths of the regular files found directly inside 'pos/' and 'neg/'
# (sub-directories are skipped); each path keeps its directory prefix.
pos_files = [
    'pos/' + name
    for name in filter(lambda entry: isfile(join('pos/', entry)),
                       os.listdir('pos/'))
]
neg_files = [
    'neg/' + name
    for name in filter(lambda entry: isfile(join('neg/', entry)),
                       os.listdir('neg/'))
]
# Count the whitespace-separated words on the first line of every review
# file (positive files first, then negative) and print summary statistics.
num_words = []
for review_path in pos_files + neg_files:
    with open(review_path, "r", encoding='utf-8') as f:
        line = f.readline()
    # The count must be recorded: without this append num_words stays empty,
    # num_files is 0 and the average below divides by zero.
    num_words.append(len(line.split()))
num_files = len(num_words)
print('文件总数', num_files)
print('所有的词的数量', sum(num_words))
# Report 0 instead of raising ZeroDivisionError when no files were found.
print('平均文件词的长度', sum(num_words) / len(num_words) if num_words else 0)
print("num_words", num_words)
# num_words [140, 428, 147, 124, 120, 171, 108, 340, 436, 324, 280, 86, 282, 224, 145, 158, ... (a list of 25,000 entries)
# Word count of each file
# Data visualization: plot a histogram (this code was never made to run)
# import matplotlib.pyplot as plt
# Set the default font
# plt.figure()
# matplotlib.pyplot.hist(num_words, 50, facecolor='g')
# plt.scatter(num_words, 50,)
# plt.xlabel('文本长度')
# plt.ylabel('频次')
# plt.axis([0, 1200, 0, 8000])
# plt.show()
import re
# Matches every run of characters that is not an ASCII letter, digit or space.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
# NOTE(review): sample output recorded earlier in this file describes
# 50-dimensional word vectors -- confirm that 300 matches wordVectors.
num_dimensions = 300  # Dimensions for each word vector


def cleanSentences(string):
    """Return *string* lower-cased with everything but [a-z0-9 ] removed.

    Line breaks are turned into single spaces first so the words on either
    side of them are not glued together when the punctuation is stripped.

    Args:
        string: raw review text.

    Returns:
        The cleaned text; an empty string stays empty.
    """
    lowered = string.lower()
    # The separator literal in this call had been split across two source
    # lines (a syntax error). A newline is what survived of it.
    # NOTE(review): "<br />" is handled as well on the assumption that the
    # reviews embed HTML line breaks -- confirm against the data files.
    for separator in ("<br />", "\n"):
        lowered = lowered.replace(separator, " ")
    return strip_special_chars.sub("", lowered)
max_seq_num = 250  # number of word ids kept per review

# Convert every review into one fixed-length row of word ids.
# The matrix has shape (num_files, max_seq_num): reviews shorter than
# max_seq_num leave their trailing slots at 0, longer ones are truncated.
unknown_word_id = 399999  # id stored for words that are not in wordsList

# Build the word -> id table once; wordsList.index() would rescan the whole
# vocabulary for every single word. setdefault keeps the FIRST position of a
# repeated word, which is the position list.index() would have returned.
word_to_id = {}
for position, known_word in enumerate(wordsList):
    word_to_id.setdefault(known_word, position)

ids = np.zeros((num_files, max_seq_num), dtype='int32')
file_count = 0
# Positive reviews fill the first rows, negative reviews the rows after them.
for review_path in pos_files + neg_files:
    with open(review_path, "r", encoding='utf-8') as f:
        line = f.readline()
    tokens = cleanSentences(line).split()
    # Slicing stops at max_seq_num so a long review cannot index past the row.
    for indexCounter, word in enumerate(tokens[:max_seq_num]):
        ids[file_count][indexCounter] = word_to_id.get(word, unknown_word_id)
    # Advance exactly once per file, whatever its length.
    file_count = file_count + 1
np.save('idsMatrix', ids)
from random import randint
# Reload the id matrix from disk (np.save('idsMatrix', ...) writes
# 'idsMatrix.npy').
ids = np.load('idsMatrix.npy')

# Hyperparameters:
#   batch_size - rows per batch        lstm_units - size of the LSTM cell
#   num_labels - width of a label row  iterations - number of loop steps
#   lr         - learning rate handed to the Adam optimizer
batch_size, lstm_units, num_labels = 24, 64, 2
iterations, lr = 100, 0.001
def get_train_batch():
    """Return a random training batch as ``(arr, labels)``.

    Even batch positions draw a row number from 1..11499 and are labelled
    ``[1, 0]`` (positive); odd positions draw from 13499..24999 and are
    labelled ``[0, 1]`` (negative). Row ``num - 1`` of ``ids`` is copied.

    NOTE(review): the hard-coded ranges assume ``ids`` holds 25,000 rows with
    the positive reviews first -- confirm against the data set. The boundary
    rows (num 11499 and 13499) can also be drawn by ``get_test_batch``.

    Returns:
        arr: float array of shape (batch_size, max_seq_num) of word ids.
        labels: list of batch_size two-element one-hot lists.
    """
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        if i % 2 == 0:
            # positive sample
            num = randint(1, 11499)
            labels.append([1, 0])
        else:
            # negative sample; without this else branch every row received
            # both labels and always used a negative index
            num = randint(13499, 24999)
            labels.append([0, 1])
        arr[i] = ids[num - 1]
    return arr, labels
def get_test_batch():
    """Return a random held-out batch as ``(arr, labels)``.

    Each row number is drawn from 11499..13499, the band that
    ``get_train_batch`` leaves between its two ranges. Numbers up to 12499
    are labelled ``[1, 0]`` (positive), the rest ``[0, 1]`` (negative).
    Row ``num - 1`` of ``ids`` is copied.

    NOTE(review): if exactly 12,500 positive reviews come first, num == 12500
    reads the last positive row but is labelled negative -- confirm the
    intended boundary.

    Returns:
        arr: float array of shape (batch_size, max_seq_num) of word ids.
        labels: list of batch_size two-element one-hot lists.
    """
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        num = randint(11499, 13499)
        if num <= 12499:
            labels.append([1, 0])
        else:
            # exactly one label per row; without this else branch [0, 1] was
            # appended unconditionally and labels drifted out of step with arr
            labels.append([0, 1])
        arr[i] = ids[num - 1]
    return arr, labels
import tensorflow as tf
# Build the TensorFlow graph: embedding lookup -> LSTM -> linear layer,
# plus the accuracy, loss, optimizer and saver objects used by the session.

# Fed per batch with the one-hot label rows: float32, [batch_size, num_labels].
labels = tf.placeholder(tf.float32, [batch_size, num_labels])
# Fed per batch with the word-id rows: int32, [batch_size, max_seq_num].
input_data = tf.placeholder(tf.int32, [batch_size, max_seq_num])
# labels = tf.get_variable(tf.float32, [batch_size, num_labels])
# input_data = tf.placeholder(tf.int32, [batch_size, max_seq_num])
# NOTE(review): the name `data` is rebound by the embedding lookup below
# before this Variable is ever read. It is left in place because removing it
# would presumably shift the auto-generated names of the later variables and
# so affect existing checkpoints -- confirm before deleting.
data = tf.Variable(
tf.zeros([batch_size, max_seq_num, num_dimensions]), dtype=tf.float32)
# data = tf.get_variable(
# tf.zeros([batch_size, max_seq_num, num_dimensions]), dtype=tf.float32)
# Replace each word id in input_data with its row of wordVectors.
data = tf.nn.embedding_lookup(wordVectors, input_data)
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
# NOTE(review): output_keep_prob is a constant 0.5, so dropout is presumably
# also active when accuracy is evaluated -- confirm that this is intended.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.5)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)
# Linear output layer mapping lstm_units features to num_labels scores.
weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
# Swap the first two axes, then take the final slice along the new first
# axis -- presumably the LSTM output at the last time step (TODO confirm).
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
# Unnormalised class scores (logits), one row per batch element.
prediction = (tf.matmul(last, weight) + bias)
# Fraction of the batch whose highest-scoring class matches the label.
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Mean softmax cross-entropy between the logits and the labels.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=prediction, labels=labels))
# Op that performs one Adam step on the loss with learning rate lr.
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)
# saver = tf.train.Saver()
saver = tf.train.Saver(save_relative_paths=True)
# Restore (or freshly initialise) the model, report accuracy on random
# held-out batches, then write a checkpoint under ./models.
with tf.Session() as sess:
    checkpoint_dir = "models"
    # latest_checkpoint returns None when the directory holds no checkpoint,
    # so one call replaces the two existence checks. The old checks used
    # backslash-literal paths and could only ever match on Windows.
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        saver.restore(sess, latest_checkpoint)
    else:
        # Nothing to restore: the variables must be initialised before use.
        # Feature-test the API instead of parsing the version string.
        if hasattr(tf, "global_variables_initializer"):
            init = tf.global_variables_initializer()
        else:
            init = tf.initialize_all_variables()
        sess.run(init)

    # NOTE(review): this loop only evaluates `accuracy`; `optimizer` is never
    # run in the code visible here -- confirm whether training happens
    # elsewhere before relying on the saved checkpoint.
    iterations = 100
    for step in range(iterations):
        next_batch, next_batch_labels = get_test_batch()
        if step % 20 == 0:
            batch_accuracy = sess.run(
                accuracy, {input_data: next_batch, labels: next_batch_labels})
            print("step:", step, " 正确率:", batch_accuracy * 100)

    # Create the target directory if needed, then always save. Previously the
    # save ran only when the directory was missing, and nothing created it.
    os.makedirs(checkpoint_dir, exist_ok=True)
    save_path = saver.save(sess, os.path.join(checkpoint_dir, "model.ckpt"))
    print("Model saved in path: %s" % save_path)