Work in progress...
Sentiment analysis, also called opinion or tendency analysis, covers a family of related tasks such as opinion extraction, opinion mining, sentiment mining, and subjectivity analysis. It is the process of analyzing, summarizing, and reasoning over subjective text that carries emotional color, for example mining review text to determine a user's attitude toward attributes of a digital camera such as zoom, price, size, weight, flash, and ease of use. It is an important topic in NLP. Typical applications include:
Item quality analysis: judging from reviews whether an item is good or bad, e.g. whether a movie is worth watching.
Item attribute analysis: e.g. comparing the comfort, fuel consumption, and handling of several cars in a given price range.
Product feedback analysis: which features users like most and which ones they complain about most.
Public opinion analysis: e.g. analyzing online sentiment around the Meituan Waimai halal incident.
Financial trend analysis: e.g. in May 2012 Derwent Capital Markets, the world's first social-media-based hedge fund, went live; it monitors public mood on Twitter in real time to guide its investments.
In short, sentiment analysis is useful at every scale, from individual items and products on a platform up to financial events, and with the current waves of big data and artificial intelligence its importance will only keep growing. The rest of this post implements a small self-attention LSTM sentiment classifier on the Keras IMDB review dataset in two parts: a batching helper (imported below as createdata) and the main TensorFlow training script.
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Author: Hou-Hou

import numpy as np


class DataIterator:
    """Produce random mini-batches from a pair of aligned datasets."""
    def __init__(self, data1, data2, batch_size):
        self.data1 = data1
        self.data2 = data2
        self.batch_size = batch_size
        self.iter = self.make_random_iter()  # iterator over batches of dataset indices

    def next_batch(self):
        try:
            idxs = next(self.iter)  # fetch the next batch of indices
        except StopIteration:
            # one pass over the data is finished: reshuffle and start again
            self.iter = self.make_random_iter()
            idxs = next(self.iter)
        X = [self.data1[i] for i in idxs]
        Y = [self.data2[i] for i in idxs]
        X = np.array(X)
        Y = np.array(Y)
        return X, Y

    # Split the shuffled indices into chunks of length batch_size
    def make_random_iter(self):
        splits = np.arange(self.batch_size, len(self.data1), self.batch_size)  # (start, stop, step)
        # np.random.permutation produces a random ordering of the indices;
        # np.split cuts it at the positions given by `splits`, and the last
        # (possibly shorter) chunk is dropped so every batch has batch_size items
        it = np.split(np.random.permutation(range(len(self.data1))), splits)[:-1]
        return iter(it)  # iter() turns the list of index arrays into an iterator
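To see what the iterator produces, here is a minimal sketch (not part of the original post) that feeds it a toy dataset:

# Illustration only: exercise DataIterator on toy data
import numpy as np
data_x = np.arange(10).reshape(10, 1)       # 10 "examples" with one feature each
data_y = np.arange(10)                      # 10 "labels"
it = DataIterator(data_x, data_y, batch_size=3)
bx, by = it.next_batch()
print(bx.shape, by.shape)                   # (3, 1) (3,)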
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Author: Hou-Hou
from keras.datasets import imdb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import createdata
# Load the IMDB reviews as lists of word indices (1 = start of review,
# 2 = out-of-vocabulary token, real words start at 3 because index_from=3)
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=None,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)
# Plot the length of every review to get a feel for the length distribution
a = [len(x) for x in X_train]
plt.plot(a)
plt.show()

# Vocabulary size: number of distinct word indices, +1 so index 0 (padding) fits
t = [item for sublist in X_train for item in sublist]
vocabulary = len(set(t)) + 1
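# Optional sanity check (illustration only, not in the original post): the Keras
# word index can be inverted to read a review back as text.  Encoded indices are
# shifted by index_from=3, with 1 = start_char and 2 = oov_char.
# word_index = imdb.get_word_index()
# inv_index = {rank + 3: word for word, rank in word_index.items()}
# print(' '.join(inv_index.get(i, '?') for i in X_train[0][1:20]))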
# Pad or truncate every review to exactly max_length tokens
max_length = 200
x_filter = []
y_filter = []
for i in range(len(X_train)):
    a = len(X_train[i])
    if a < max_length:
        X_train[i] = X_train[i] + [0] * (max_length - a)   # pad with zeros
    else:
        X_train[i] = X_train[i][:max_length]               # truncate
    x_filter.append(X_train[i])
    y_filter.append(y_train[i])
# Hyper-parameters
embedding_size = 100
n_hidden = 200          # number of LSTM hidden units
learning_rate = 0.06
training_iters = 100000
batch_size = 32
beta = 0.0001           # L2 regularization strength

# Other model parameters
n_steps = max_length    # time steps = words per (padded) review
n_classes = 2           # binary classification: negative vs. positive review
da = 350                # hidden units of the self-attention MLP
r = 30                  # number of attention rows extracted per sentence (= rows of the matrix embedding)
display_step = 10
hidden_units = 3000     # width of the fully connected layer after attention
# One-hot labels and the padded index matrix that the model will train on
y_train = np.asarray(pd.get_dummies(y_filter))
X_train = np.asarray([np.asarray(g) for g in x_filter])
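# With the padding above, X_train now has shape [num_examples, max_length]
# (integer word indices) and y_train has shape [num_examples, 2]
# (one-hot labels produced by pd.get_dummies).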
# Directory for the TensorBoard log files
logs_path = './recent_logs/'
# Initialize weights and biases
with tf.name_scope("weights"):
    # uniform initialization in [-1/sqrt(fan_in), 1/sqrt(fan_in)]
    Win = tf.Variable(tf.random_uniform([n_hidden*r, hidden_units], -1/np.sqrt(n_hidden), 1/np.sqrt(n_hidden)), name='W_input')
    Wout = tf.Variable(tf.random_uniform([hidden_units, n_classes], -1/np.sqrt(n_hidden), 1/np.sqrt(n_hidden)), name='W-out')
    Ws1 = tf.Variable(tf.random_uniform([da, n_hidden], -1/np.sqrt(da), 1/np.sqrt(da)), name='Ws1')
    Ws2 = tf.Variable(tf.random_uniform([r, da], -1/np.sqrt(r), 1/np.sqrt(r)), name='Ws2')

with tf.name_scope("biases"):
    biasesin = tf.Variable(tf.random_normal([hidden_units]), name='biases-in')
    biasesout = tf.Variable(tf.random_normal([n_classes]), name='biases-out')
# Input / output placeholders (one mini-batch at a time)
with tf.name_scope('input'):
    x = tf.placeholder("int32", [batch_size, max_length], name='x-input')
    y = tf.placeholder("float32", [batch_size, n_classes], name='y-input')  # one-hot labels, float for the cross-entropy op
# Word embeddings, learned jointly with the rest of the model
with tf.name_scope('embedding'):
    embeddings = tf.Variable(tf.random_uniform([vocabulary, embedding_size], -1, 1), name='embeddings')
    # map word indices to dense vectors: embed has shape [batch_size, max_length, embedding_size]
    embed = tf.nn.embedding_lookup(embeddings, x)
def length(sequence):
    # Treat a time step as "used" when any embedding dimension is non-zero,
    # then count the used steps to get the true length of each sequence
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)  # cast the lengths to int32
    return length
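# Worked example (illustration only): for a sequence that embeds to
# [[0.3, -0.1], [0.0, 0.0]] the mask `used` is [1, 0], so length() returns 1.
# Note this only detects padding if padded positions map to all-zero vectors;
# with the randomly initialized embedding above, index 0 is generally non-zero,
# so in practice length() will usually return n_steps for every review here.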
with tf.variable_scope('forward', reuse=True):
    lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)

with tf.name_scope('model'):
    # outputs: [batch_size, n_steps, n_hidden] hidden states of the LSTM
    outputs, states = tf.nn.dynamic_rnn(lstm_fw_cell, embed, sequence_length=length(embed),
                                        dtype=tf.float32, time_major=False)
    # Multiply the hidden-state matrix by Ws1 (after reshaping) and apply tanh
    h = tf.nn.tanh(tf.transpose(tf.reshape(tf.matmul(Ws1, tf.reshape(outputs, [n_hidden, batch_size*n_steps])),
                                           [da, batch_size, n_steps]), [1, 0, 2]))
    # Multiply the result by Ws2 to get r unnormalized attention rows per example
    a = tf.reshape(tf.matmul(Ws2, tf.reshape(h, [da, batch_size*n_steps])), [batch_size, r, n_steps])
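    # Up to the reshaping above, this follows the structured self-attention
    # formulation A = softmax(Ws2 * tanh(Ws1 * H^T)), where H is the
    # [n_steps, n_hidden] matrix of LSTM states; the softmax over the time
    # steps is applied below with tf.scan.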
    # Row-wise softmax over the time axis: one attention matrix per example
    def fn3(a, x):
        return tf.nn.softmax(x)
    h3 = tf.scan(fn3, a)
    '''
    tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
            swap_memory=False, infer_shape=True, name=None)
    fn:          the function applied at every step
    elems:       fn is applied to the slices along the first dimension of elems
                 until the whole tensor has been traversed
    initializer: initial value of the accumulator, used in place of the first
                 slice of elems for the first call of fn
    '''
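    # A minimal illustration (not part of the model): tf.scan threads an
    # accumulator through elems, so tf.scan(lambda acc, e: acc + e,
    # tf.constant([1, 2, 3])) evaluates to [1, 3, 6].  Here fn3 ignores the
    # accumulator, so the scan simply applies tf.nn.softmax to each
    # [r, n_steps] slice of `a`, normalizing every attention row over the
    # n_steps time positions.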
with tf.name_scope('flattening'):
    # Apply the attention weights to the LSTM outputs (batched matmul)
    h4 = tf.matmul(h3, outputs)
    # Flatten the resulting [batch_size, r, n_hidden] matrix embedding
    last = tf.reshape(h4, [-1, r*n_hidden])
with tf.name_scope('MLP'):
    last = tf.nn.dropout(last, 0.5)  # keep_prob = 0.5
    pred1 = tf.nn.sigmoid(tf.matmul(last, Win) + biasesin)
    pred = tf.matmul(pred1, Wout) + biasesout
# Define loss and optimizer
with tf.name_scope('cross'):
    # Softmax cross-entropy plus an L2 penalty on the attention matrix Ws2
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y) + beta*tf.nn.l2_loss(Ws2))
with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    gvs = optimizer.compute_gradients(cost)
    # tf.clip_by_norm caps each gradient's norm to guard against exploding gradients
    capped_gvs = [(tf.clip_by_norm(grad, 0.5), var) for grad, var in gvs]
    optimized = optimizer.apply_gradients(capped_gvs)
# Evaluate model
with tf.name_scope('Accuracy'):
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))  # cast the booleans to float and average over the batch
# TensorFlow's visualization workflow combines TensorBoard, tf.summary and tf.summary.FileWriter
tf.summary.scalar("cost", cost)
tf.summary.scalar("accuracy", accuracy)
# merge all summaries into a single op that can be run in the session
summary_op = tf.summary.merge_all()
# Initializing the variables
train_iter = createdata.DataIterator(X_train, y_train, batch_size)
init = tf.global_variables_initializer()

# This can warn if the required port is already in use; running the command
# again or freeing the port before the next run resolves it
# Train the model
with tf.Session() as sess:
    sess.run(init)
    # Create the log-file writer object
    writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
    step = 1
    # Keep training until the maximum number of iterations is reached
    while step * batch_size < training_iters:
        batch_x, batch_y = train_iter.next_batch()
        sess.run(optimized, feed_dict={x: batch_x, y: batch_y})
        # Execute the summary operation in the session
        summary = sess.run(summary_op, feed_dict={x: batch_x, y: batch_y})
        # Write the values to the log file using the FileWriter created above
        writer.add_summary(summary, step * batch_size)
        if step % display_step == 2:
            # Calculate batch accuracy
            acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
            # Calculate batch loss
            loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
            print("Iter " + str(step * batch_size) +
                  ", Minibatch Loss= " + "{:.6f}".format(loss) +
                  ", Training Accuracy= " + "{:.2f}".format(acc * 100) + "%")
        step += 1
    print("Optimization Finished!")

# Launch TensorBoard from the log directory with: tensorboard --logdir=./
Reference:
https://www.jianshu.com/p/158c3f02a15b