Today I use TensorFlow to implement word2vec and apply it to Dream of the Red Chamber (红楼梦): train word embeddings, run a word-similarity analysis, count word frequencies, and draw a word cloud. TensorFlow keeps the code short and fairly simple, so I will not explain it at length; the explanations are in the code comments. Note that the code uses the TensorFlow 1.x API (tf.placeholder, tf.Session and friends).
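Before the full program, here is a tiny standalone sketch of the core skip-gram idea that train_by_sentence implements further down: every word is paired with each word within win_len positions on either side, and those (centre, context) pairs are the training data for the NCE loss. This is my own illustration, not part of the program; the example sentence is the one used in the code comments.

# Toy illustration only: building (centre, context) pairs from one segmented
# sentence with a one-sided window of win_len = 2.
sentence = ['这次', '大选', '让']
win_len = 2
pairs = []
for i, centre in enumerate(sentence):
    start = max(0, i - win_len)
    end = min(len(sentence), i + win_len + 1)
    for j in range(start, end):
        if j != i:
            pairs.append((centre, sentence[j]))
print(pairs)
# [('这次', '大选'), ('这次', '让'), ('大选', '这次'), ('大选', '让'), ('让', '这次'), ('让', '大选')]

The full program follows.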
# -*- coding:utf-8 -*-
import tensorflow as tf
import numpy as np
import math
import collections
import pickle as pkl
from pprint import pprint
import re
import jieba
import os.path as path
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from scipy.misc import imread  # note: scipy.misc.imread was removed in SciPy 1.2+; imageio.imread is a drop-in alternative there
import string
import pandas as pd
def draw_wordcloud(word_list=[]):
    str_convert = ' '.join(word_list)
    # write the segmented words to a file
    content = str_convert.encode(encoding='utf_8')
    with open('data.txt', 'wb') as f:
        f.write(content)
    color_mask = imread('1.jpg')  # mask image that gives the cloud its shape
    # build the word cloud
    cloud = WordCloud(
        # background colour
        background_color='black',
        # shape of the cloud
        mask=color_mask,
        # maximum number of words to draw
        max_words=2000,
        # a CJK font is required, otherwise Chinese characters come out garbled
        font_path="STFANGSO.ttf",
        # largest font size
        max_font_size=100,
    )
    # generate the cloud
    word_cloud = cloud.generate(str_convert)
    # save it as an image
    word_cloud.to_file('after.jpg')
    # and display it
    plt.imshow(word_cloud)
    plt.show()
# word-frequency statistics
def word_freq(word_list=[]):
    wordDF = pd.DataFrame({'word': word_list})
    # count the occurrences of each word and sort in descending order
    wordStat = (wordDF.groupby('word')
                      .size()
                      .reset_index(name='count')
                      .sort_values(by='count', ascending=False))
    wordStat.to_csv('cipin.csv', sep=',', header=True, index=True, encoding='utf-8')
class word2vec():
    def __init__(self,
                 vocab_list=None,
                 embedding_size=200,
                 win_len=3,            # one-sided context window length
                 num_sampled=1000,
                 learning_rate=1.0,
                 logdir='/tmp/simple_word2vec',
                 model_path=None
                 ):
        # basic model parameters
        self.batch_size = None  # samples per batch; determined by the data at run time
        if model_path is not None:
            self.load_model(model_path)
        else:
            # model parameters
            assert type(vocab_list) == list
            self.vocab_list = vocab_list
            self.vocab_size = len(vocab_list)
            self.embedding_size = embedding_size
            self.win_len = win_len
            self.num_sampled = num_sampled
            self.learning_rate = learning_rate
            self.logdir = logdir

            self.word2id = {}  # word => id mapping
            for i in range(self.vocab_size):
                self.word2id[self.vocab_list[i]] = i

            # counters used for progress reporting and tensorboard
            self.train_words_num = 0  # number of word pairs trained on
            self.train_sents_num = 0  # number of sentences trained on
            self.train_times_num = 0  # number of training calls (one call may contain several sentences)

            # train loss records
            self.train_loss_records = collections.deque(maxlen=10)  # the 10 most recent losses
            # running mean of those losses
            self.train_loss_k10 = 0

        self.build_graph()
        self.init_op()
        if model_path is not None:
            tf_model_path = os.path.join(model_path, 'tf_vars')
            self.saver.restore(self.sess, tf_model_path)

    def init_op(self):
        self.sess = tf.Session(graph=self.graph)
        self.sess.run(self.init)
        self.summary_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
    def build_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
            self.train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])

            # word-embedding matrix, initialised uniformly in [-1, 1]
            self.embedding_dict = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0)
            )
            self.nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embedding_size],
                                                              stddev=1.0 / math.sqrt(self.embedding_size)))
            self.nce_biases = tf.Variable(tf.zeros([self.vocab_size]))

            # look up the embeddings of the input batch: [batch_size, embedding_size]
            embed = tf.nn.embedding_lookup(self.embedding_dict, self.train_inputs)

            # NCE loss
            self.loss = tf.reduce_mean(
                tf.nn.nce_loss(
                    weights=self.nce_weight,
                    biases=self.nce_biases,
                    labels=self.train_labels,
                    inputs=embed,
                    num_sampled=self.num_sampled,
                    num_classes=self.vocab_size
                )
            )

            # tensorboard: record the loss
            tf.summary.scalar('loss', self.loss)

            # update the embeddings (and NCE weights) from the NCE loss
            self.train_op = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate).minimize(self.loss)

            # similarity to a given list of test words
            self.test_word_id = tf.placeholder(tf.int32, shape=[None])
            vec_l2_model = tf.sqrt(  # L2 norm of every word vector
                tf.reduce_sum(tf.square(self.embedding_dict), 1, keep_dims=True)
            )
            avg_l2_model = tf.reduce_mean(vec_l2_model)
            tf.summary.scalar('avg_vec_model', avg_l2_model)

            # normalise the embeddings so that a dot product is a cosine similarity
            self.normed_embedding = self.embedding_dict / vec_l2_model
            test_embed = tf.nn.embedding_lookup(self.normed_embedding, self.test_word_id)
            # similarity matrix: [number of test words, vocab_size]
            self.similarity = tf.matmul(test_embed, self.normed_embedding, transpose_b=True)

            # variable initialisation
            self.init = tf.global_variables_initializer()
            self.merged_summary_op = tf.summary.merge_all()
            self.saver = tf.train.Saver()
    def train_by_sentence(self, input_sentence=[]):
        # input_sentence: [sub_sent1, sub_sent2, ...]
        # each sub_sent is a list of words, e.g. ['这次', '大选', '让']
        batch_inputs = []
        batch_labels = []
        for sent in input_sentence:
            for i in range(len(sent)):
                # window boundaries around position i
                start = max(0, i - self.win_len)
                end = min(len(sent), i + self.win_len + 1)
                for index in range(start, end):
                    if index == i:
                        continue
                    # the centre word and one context word form a training pair
                    input_id = self.word2id.get(sent[i])
                    label_id = self.word2id.get(sent[index])
                    # skip words outside the vocabulary (id 0 is a valid id, so test for None)
                    if input_id is None or label_id is None:
                        continue
                    batch_inputs.append(input_id)
                    batch_labels.append(label_id)
        if len(batch_inputs) == 0:
            return
        # convert to numpy so that reshape can be used
        batch_inputs = np.array(batch_inputs, dtype=np.int32)
        batch_labels = np.array(batch_labels, dtype=np.int32)
        # tf.nn.nce_loss expects the labels as a column vector
        batch_labels = np.reshape(batch_labels, [len(batch_labels), 1])

        feed_dict = {
            self.train_inputs: batch_inputs,
            self.train_labels: batch_labels
        }
        _, loss_val, summary_str = self.sess.run(
            [self.train_op, self.loss, self.merged_summary_op], feed_dict=feed_dict)

        # train loss
        self.train_loss_records.append(loss_val)
        self.train_loss_k10 = np.mean(self.train_loss_records)
        if self.train_sents_num % 1000 == 0:
            self.summary_writer.add_summary(summary_str, self.train_sents_num)
            print("{a} sentences processed, loss: {b}"
                  .format(a=self.train_sents_num, b=self.train_loss_k10))

        # training counters
        self.train_words_num += len(batch_inputs)
        self.train_sents_num += len(input_sentence)
        self.train_times_num += 1
    def cal_similarity(self, test_word_id_list, top_k=10):
        sim_matrix = self.sess.run(self.similarity,
                                   feed_dict={self.test_word_id: test_word_id_list})
        sim_mean = np.mean(sim_matrix)
        sim_var = np.mean(np.square(sim_matrix - sim_mean))
        test_words = []
        near_words = []
        for i in range(len(test_word_id_list)):
            test_words.append(self.vocab_list[test_word_id_list[i]])
            # position 0 of the sorted list is the test word itself, so skip it
            nearst_id = (-sim_matrix[i, :]).argsort()[1:top_k + 1]
            nearst_word = [self.vocab_list[x] for x in nearst_id]
            near_words.append(nearst_word)
        return test_words, near_words, sim_mean, sim_var
    def save_model(self, save_path):
        if os.path.isfile(save_path):
            raise RuntimeError('the save path should be a dir')
        if not os.path.exists(save_path):
            os.mkdir(save_path)

        # record the model parameters
        model = {}
        var_names = ['vocab_size',          # int, model parameters
                     'vocab_list',          # list
                     'learning_rate',       # float
                     'word2id',             # dict
                     'embedding_size',      # int
                     'logdir',              # str
                     'win_len',             # int
                     'num_sampled',         # int
                     'train_words_num',     # int, training info
                     'train_sents_num',     # int
                     'train_times_num',     # int
                     'train_loss_records',  # deque, training loss
                     'train_loss_k10',      # float
                     ]
        for var in var_names:
            model[var] = getattr(self, var)

        param_path = os.path.join(save_path, 'params.pkl')
        if os.path.exists(param_path):
            os.remove(param_path)
        with open(param_path, 'wb') as f:
            pkl.dump(model, f)

        # save the tensorflow variables
        tf_path = os.path.join(save_path, 'tf_vars')
        if os.path.exists(tf_path):
            os.remove(tf_path)
        self.saver.save(self.sess, tf_path)
    def load_model(self, model_path):
        if not os.path.exists(model_path):
            raise RuntimeError('model path does not exist')
        param_path = os.path.join(model_path, 'params.pkl')
        with open(param_path, 'rb') as f:
            model = pkl.load(f)
        self.vocab_list = model['vocab_list']
        self.vocab_size = model['vocab_size']
        self.logdir = model['logdir']
        self.word2id = model['word2id']
        self.embedding_size = model['embedding_size']
        self.learning_rate = model['learning_rate']
        self.win_len = model['win_len']
        self.num_sampled = model['num_sampled']
        self.train_words_num = model['train_words_num']
        self.train_sents_num = model['train_sents_num']
        self.train_times_num = model['train_times_num']
        self.train_loss_records = model['train_loss_records']
        self.train_loss_k10 = model['train_loss_k10']
if __name__ == '__main__':
    # step 1: read the stop-word list
    stop_words = []
    with open('stop_words.txt', encoding='utf-8') as f:
        line = f.readline()
        while line:
            stop_words.append(line[:-1])
            line = f.readline()
    stop_words = set(stop_words)
    print('Stop words loaded, {n} words in total'.format(n=len(stop_words)))

    # step 2: read the text, clean it, segment it with jieba and build the vocabulary
    raw_word_list = []
    sentence_list = []
    with open('hongloumeng.txt', encoding='utf-8') as f:
        line = f.readline()
        while line:
            while '\n' in line:
                line = line.replace('\n', '')
            while ' ' in line:
                line = line.replace(' ', '')
            if len(line) > 0:  # only process non-empty lines
                raw_words = list(jieba.cut(line, cut_all=False))
                dealed_words = []
                for word in raw_words:
                    if word not in stop_words and word not in ['qingkan520', 'www', 'com', 'http']:
                        raw_word_list.append(word)
                        dealed_words.append(word)
                sentence_list.append(dealed_words)
            line = f.readline()

    word_freq(raw_word_list)
    draw_wordcloud(raw_word_list)

    word_count = collections.Counter(raw_word_list)
    print('The text contains {n1} words, {n2} of them unique; the 40000 most frequent enter the vocabulary'
          .format(n1=len(raw_word_list), n2=len(word_count)))
    word_count = word_count.most_common(40000)
    word_list = [x[0] for x in word_count]

    # build the model and train it
    w2v = word2vec(vocab_list=word_list,    # vocabulary
                   embedding_size=200,
                   win_len=2,
                   learning_rate=1,
                   num_sampled=100,         # number of negative samples
                   logdir='/tmp/280',       # tensorboard log directory
                   )
    num_steps = 10000
    for i in range(num_steps):
        sent = sentence_list[i % len(sentence_list)]
        w2v.train_by_sentence([sent])
    w2v.save_model('model')
    w2v.load_model('model')

    test_word = ['贾母', '王熙凤']
    test_id = [word_list.index(x) for x in test_word]
    test_words, near_words, sim_mean, sim_var = w2v.cal_similarity(test_id)
    print(test_words, near_words, sim_mean, sim_var)
The word-frequency output is shown below; as expected, 宝玉 (Baoyu) and 贾母 (Grandmother Jia) appear very often.
ID word count
39755 道 6365
15158 宝玉 3647
37553 贾母 1189
7783 凤姐 1188
3420 事 935
14335 姑娘 932
28528 王夫人 929
33203 老太太 903
10480 吃 894
13487 太太 854
4841 众人 828
13897 奶奶 794
10153 只见 750
37935 走 748
6132 做 741
10993 听见 722
2340 两个 708
1964 不知 688
36899 请 679
43313 黛玉 678
36692 话 672
37645 贾琏 660
40475 里 652
19177 想 634
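If you want to inspect the saved frequency file later without rerunning the segmentation, a minimal sketch (my own addition; cipin.csv is the file written by word_freq above):

# Read the frequency table back and show the 20 most frequent words.
import pandas as pd
freq = pd.read_csv('cipin.csv', index_col=0, encoding='utf-8')
print(freq.head(20))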
The generated word cloud (saved as after.jpg):
The console output of a run:
Stop words loaded, 1904 words in total
Building prefix dict from the default dictionary ...
Loading model from cache C:\TEMP\jieba.cache
Loading model cost 0.959 seconds.
Prefix dict has been built succesfully.
The text contains 237382 words, 43530 of them unique; the 40000 most frequent enter the vocabulary
2018-02-07 18:14:34.401306: I C:\tf_jenkins\home\workspace\rel-win\M\windows\PY\36\tensorflow\core\platform\cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX
0 sentences processed, loss: 395.44708251953125
1000 sentences processed, loss: 284.55487060546875
2000 sentences processed, loss: 262.3647155761719
3000 sentences processed, loss: 212.4544677734375
4000 sentences processed, loss: 185.51414489746094
5000 sentences processed, loss: 133.9457550048828
6000 sentences processed, loss: 142.91770935058594
7000 sentences processed, loss: 89.031982421875
8000 sentences processed, loss: 94.11927795410156
9000 sentences processed, loss: 76.94000244140625
['贾母', '王熙凤'] [['宝玉', '走', '只见', '王夫人', '做', '凤姐', '东西', '二人', '两个', '平儿'], ['房里', '走出', '姑奶奶', '偏来', '不负', '因指众', '散', '花枝招展', '贾珍遂', '早']] 0.0353934 0.0100291
Finally, the ten words most similar to 贾母 and 王熙凤 according to the trained embeddings. The list for 贾母 looks reasonable (宝玉, 王夫人 and 凤姐 all appear constantly around her), while the list for 王熙凤 is much noisier, presumably because the novel usually refers to her as 凤姐, so the full name 王熙凤 has far fewer training contexts.
贾母:
['宝玉', '走', '只见', '王夫人', '做', '凤姐', '东西', '二人', '两个', '平儿']
王熙凤:
['房里', '走出', '姑奶奶', '偏来', '不负', '因指众', '散', '花枝招展', '贾珍遂', '早']
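As a closing usage note (my own sketch, not from the original run): the directory written by save_model can be reloaded later without retraining, and other characters can be queried; 黛玉 here is just an illustrative query word. The loss curve recorded for TensorBoard can be viewed with tensorboard --logdir /tmp/280.

# Reload the saved model directory and query the neighbours of another character.
# Assumes the 'model' directory written by w2v.save_model('model') above and the
# word2vec class defined earlier in this post.
w2v2 = word2vec(model_path='model')   # restores params.pkl and the tf_vars checkpoint
query_words = ['黛玉']
query_ids = [w2v2.word2id[w] for w in query_words]
test_words, near_words, sim_mean, sim_var = w2v2.cal_similarity(query_ids)
print(test_words, near_words)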