# coding: utf-8
# In[151]:
import collections
import math
import os
import os.path as path
import pickle as pkl
import random
import re

import codecs
import jieba
import numpy as np
import tensorflow as tf
from pprint import pprint
from pymongo import MongoClient
# In[3]:
# step 1: read the stop-word list
stop_words = []
with open('stop_words.txt') as f:
    for line in f:
        stop_words.append(line.strip())
stop_words = set(stop_words)
print('Loaded stop words: {n} entries in total'.format(n=len(stop_words)))
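# jieba is imported at the top but never used below, so the corpus file standata.txt
# read later is assumed to be pre-segmented (space-separated tokens). A minimal sketch
# of that preprocessing step, under that assumption; 'raw_corpus.txt' is a made-up
# input file name, and this helper is not called anywhere in this script.
def segment_corpus(raw_path='raw_corpus.txt', out_path='standata.txt'):
    with open(raw_path, encoding='utf-8') as fin, open(out_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            fout.write(' '.join(jieba.cut(line.strip())) + '\n')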
# In[216]:
def Re(k):
    """Strip newlines, non-breaking spaces, carriage returns and tabs from a string."""
    k = re.sub(r'\n', '', k)
    k = re.sub(r'\xa0', '', k)
    k = re.sub(r'\r', '', k)
    #k = re.sub(r'\s', '', k)
    k = re.sub(r'\t', '', k)
    return k
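# Quick illustrative check of the cleaner on a made-up sample string:
# \t, \r, \n and \xa0 are removed, while regular spaces are kept.
print(repr(Re('hello\tworld\r\n\xa0')))   # -> 'helloworld'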
# In[217]:
# step 2: load the pre-tokenized corpus (space-separated tokens); each line carries
# a fastText-style "__label__xxx" tag at the end, which is stripped off here
text = codecs.open('standata.txt')
content = text.readlines()
data = []
label = []
for coni in content:
    coni = re.sub(r'__label__.*', '', coni)   # drop the label suffix
    coni = Re(coni)                           # remove \n, \xa0, \r, \t
    lis = coni.split(' ')
    for li in lis:
        data.append(li)
# In[219]:
# step 3: drop stop words
raw_word_list = []
for datai in data:
    if datai not in stop_words:
        raw_word_list.append(datai)
# In[220]:
len(raw_word_list)
# In[221]:
# Count token frequencies; the number of tokens occurring more than once
# is what vocabulary_size below is based on.
word_count = collections.Counter(raw_word_list)
g = [i for i in word_count.values() if i == 1]   # tokens that occur exactly once
len(word_count) - len(g)
# In[222]:
vocabulary_size = 16266
def build_dataset(words):
    """Map words to integer ids, keeping the most frequent words and sending
    everything else to the UNK token (id 0)."""
    # Reserve one slot for UNK so the total vocabulary is exactly vocabulary_size;
    # otherwise the largest word id would fall outside the embedding matrix below.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0          # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
# In[223]:
data,count,dictionary,reverse_dictionary=build_dataset(raw_word_list)
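# A small sanity check on the mapping (illustrative; actual output depends on the corpus):
# the five most frequent entries and a round trip through data / reverse_dictionary.
print('Most common tokens (+UNK):', count[:5])
print('Sample indices:', data[:10], [reverse_dictionary[i] for i in data[:10]])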
# In[224]:
dictionary
# In[59]:
# Scratch cell: quick checks of deque(maxlen=...) and true vs. floor division.
collections.deque(maxlen=3)
b = 8
n = 2
print(b / 2.5)    # 3.2
print(b // 2.5)   # 3.0 (floor division rounds down)
# In[108]:
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    """Generate a skip-gram batch: each entry pairs a centre word (batch)
    with one of its context words within skip_window (labels)."""
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1                 # [ skip_window words | centre | skip_window words ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window                   # start from the centre position
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # pick a context position in the window that has not been used yet
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
# In[96]:
# Scratch cell: stepping through the first part of generate_batch by hand.
# Note that num_skips=3 with skip_window=1 would violate the commented-out assertion.
batch_size = 8
num_skips = 3
skip_window = 1
data_index = 0
#assert batch_size % num_skips == 0
#assert num_skips <= 2 * skip_window
batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
span = 2 * skip_window + 1
buffer = collections.deque(maxlen=span)
for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
# In[225]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
# In[138]:
# Hyperparameters
batch_size = 128
embedding_size = 128    # dimensionality of the word vectors
skip_window = 1         # how many words to consider to the left and right
num_skips = 2           # how many (centre, context) pairs to draw per centre word
valid_size = 16         # number of words used to report nearest neighbours
valid_window = 100      # validation words are sampled from the most frequent ids
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64        # number of negative samples for the NCE loss
# In[226]:
vocabulary_size = 16266   # must match the value used in build_dataset above
# In[227]:
graph = tf.Graph()
with graph.as_default():
    # Input placeholders: centre-word ids and their context-word ids.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Embedding matrix and lookup for the current batch.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Noise-contrastive estimation (NCE) output weights, biases and loss.
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                                  stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Cosine similarity between the validation words and the whole vocabulary.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer()
# In[228]:
num_steps = 100001
with tf.Session(graph=graph) as session:
    init.run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000   # average loss over the last 2000 batches
            print('Average loss at step', step, ':', average_loss)
            average_loss = 0
        # Periodically print the nearest neighbours of the validation words.
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
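# pickle is imported as pkl at the top but never used; a minimal sketch of how the
# trained embeddings and vocabulary could be persisted with it. The output file name
# 'word2vec_embeddings.pkl' is an assumption, not something fixed by this script.
with open('word2vec_embeddings.pkl', 'wb') as fout:
    pkl.dump({'embeddings': final_embeddings,
              'dictionary': dictionary,
              'reverse_dictionary': reverse_dictionary}, fout)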
# In[134]:
# Scratch cell: behaviour of random.randint and the targets_to_avoid loop.
for _ in range(6):
    print(random.randint(0, span - 1))
target = 1
target_to_avoid = [1]
while target in target_to_avoid:
    target = random.randint(0, span - 1)
#span
#buffer.append(1)
#buffer.append(5)
#buffer
print(target)
print(target_to_avoid)