import sys

# Load the raw data: one review per line in reviews.txt and one label
# ('positive'/'negative') per line in labels.txt, aligned by line index.
# Context managers guarantee the handles are closed even if reading fails
# (the original opened/closed by hand and would leak on an exception).
# NOTE(review): assumes both files sit in the current working directory.
with open('reviews.txt') as f:
    raw_reviews = f.readlines()
with open('labels.txt') as f:
    raw_labels = f.readlines()
# Tokenize: each review becomes the SET of its space-separated words
# (bag-of-words presence only — duplicates within a review are dropped).
tokens = [set(review.split(" ")) for review in raw_reviews]

# Vocabulary: every non-empty token seen anywhere in the corpus.
vocab = list({word for sent in tokens for word in sent if word})

# Map each word to its (arbitrary but fixed) row index in the weight matrix.
word2index = {word: i for i, word in enumerate(vocab)}

# Encode each review as a list of unique word indices.  Words absent from
# word2index (only the empty string here) are skipped via an explicit
# membership test instead of the original bare `except:` that would have
# hidden any real error as well.
input_dataset = []
for sent in tokens:
    sent_indices = {word2index[word] for word in sent if word in word2index}
    input_dataset.append(list(sent_indices))
# Binary targets: 1 for a positive review, 0 otherwise.
# strip() makes the check robust to a missing trailing newline on the last
# line (the original compared against the exact string 'positive\n', which
# would silently mislabel a final 'positive' with no newline as negative).
target_dataset = [1 if label.strip() == 'positive' else 0 for label in raw_labels]
print(target_dataset[0:10])
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
import numpy as np
# Fix the RNG seed so the weight initialization below is reproducible.
np.random.seed(1)
def sigmoid(x):
    """Logistic activation: squash a scalar or ndarray into the open interval (0, 1)."""
    return 1.0 / (1.0 + np.exp(-x))
# Learning rate and number of passes (epochs) over the training data.
alpha, iteration = (0.01, 2)
# Width of the single hidden layer.
hidden_size = 100
# Weights initialized uniformly in [-0.1, 0.1).
# weight_0_1: word-embedding matrix (|vocab| x hidden);
# weight_1_2: hidden -> single sentiment output.
weight_0_1 = 0.2 * np.random.random((len(vocab),hidden_size)) - 0.1
weight_1_2 = 0.2 * np.random.random((hidden_size,1)) - 0.1
correct, total = (0, 0)
# `epoch` instead of the original `iter`, which shadowed the builtin.
for epoch in range(iteration):
    # Train on (up to) the first 10,000 examples; min() keeps this safe
    # when the dataset is smaller than the original hard-coded 10000.
    for i in range(min(10000, len(input_dataset))):
        x, y = (input_dataset[i], target_dataset[i])

        # Forward pass.  The input is a bag of word indices (an implicit
        # one-hot vector), so summing the selected embedding rows is
        # equivalent to — and far cheaper than — a full matrix multiply.
        layer_1 = sigmoid(np.sum(weight_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weight_1_2))

        # Backpropagation.  The sigmoid-derivative factors are dropped,
        # as in the original — crude but adequate at this tutorial scale.
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weight_1_2.T)

        # These two updates don't depend on each other's result, so this
        # order reproduces the original numerics exactly.
        weight_0_1[x] -= layer_1_delta * alpha
        weight_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        # Targets are 0/1, so |prediction - target| < 0.5 means "correct".
        if np.abs(layer_2_delta) < 0.5:
            correct += 1
        total += 1
        if i % 1000 == 0:
            print('iter:' + str(epoch) + 'index:' + str(i) + 'correct:' + str(correct/total))
iter:0index:0correct:1.0
iter:0index:1000correct:0.46653346653346656
iter:0index:2000correct:0.5977011494252874
iter:0index:3000correct:0.6717760746417861
iter:0index:4000correct:0.7040739815046239
iter:0index:5000correct:0.7218556288742252
iter:0index:6000correct:0.7390434927512082
iter:0index:7000correct:0.7563205256391944
iter:0index:8000correct:0.7695288088988876
iter:0index:9000correct:0.7775802688590157
iter:1index:0correct:0.7872212778722127
iter:1index:1000correct:0.7942005272247977
iter:1index:2000correct:0.8015998666777768
iter:1index:3000correct:0.8098607799400046
iter:1index:4000correct:0.8162274123276909
iter:1index:5000correct:0.8219452036530898
iter:1index:6000correct:0.8267608274482845
iter:1index:7000correct:0.8320687018410682
iter:1index:8000correct:0.8372868173990334
iter:1index:9000correct:0.8411136255986527
# Inspect training example #10: the raw review text and its tokenized form.
sentence, sen = raw_reviews[10], tokens[10]
print(sentence)
print(sen)
this isn t the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame . this is a hybrid of the classic drama without over dramatization mixed with robin s new love of the thriller . but this isn t a thriller per se . this is more a mystery suspense vehicle through which williams attempts to locate a sick boy and his keeper . br br also starring sandra oh and rory culkin this suspense drama plays pretty much like a news report until william s character gets close to achieving his goal . br br i must say that i was highly entertained though this movie fails to teach guide inspect or amuse . it felt more like i was watching a guy williams as he was actually performing the actions from a third person perspective . in other words it felt real and i was able to subscribe to the premise of the story . br br all in all it s worth a watch though it s definitely not friday saturday night fare . br br it rates a . from . . . br br the fiend .
{'', 'hybrid', 'of', 'without', 'must', 'person', 'highly', 'to', 'that', 'se', 'rates', 'was', 'over', 'boy', 'keeper', 'a', 's', 'comedic', 'rory', 'sick', 'mystery', 'starring', 'mixed', 'watch', 'the', 'inspect', 'achieving', 'fiend', 'as', 'also', 'he', 'real', 'vehicle', 'guy', 'plays', 'per', 'premise', 'new', 'oh', '.', 'robin', 'movie', 'br', 'is', 'with', 'insane', 'like', 'from', 'culkin', 'more', 't', 'thriller', 'third', 'words', 'locate', 'close', 'performing', 'drama', 'fails', 'entertained', 'other', 'i', 'guide', 'report', 'felt', 'perspective', 'fare', 'his', 'which', 'all', 'news', 'actions', 'teach', 'watching', 'suspense', 'goal', 'isn', 'gets', 'until', 'love', 'through', 'sandra', 'and', 'actually', 'though', 'friday', 'character', 'definitely', 'story', 'dramatization', 'worth', 'william', 'pretty', '\n', 'classic', 'say', 'quirky', 'not', 'or', 'williams', 'fame', 'attempts', 'able', 'night', 'but', 'this', 'amuse', 'in', 'saturday', 'subscribe', 'it', 'nor', 'much', 'recent'}
# Forward-pass the trained network on example #10 and compare the
# predicted sentiment with the ground-truth label.
layer_1_value = sigmoid(weight_0_1[input_dataset[10]].sum(axis=0))
pre = sigmoid(layer_1_value.dot(weight_1_2))
print(pre)
print('预测为积极' if pre > 0.5 else '预测为消极')
print('真实值:' + raw_labels[10])
[0.92190673]
预测为积极
真实值:positive
# Query word for the similarity probe below.
word = 'man'
import math
from collections import Counter
# Similarity is judged by how close two words' input->hidden weight rows are.
def word_similar(x):
    """Return the 10 vocabulary words whose embedding rows are closest to x's.

    Score is the NEGATIVE Euclidean distance between rows of weight_0_1,
    so values nearer 0 mean more similar; the query word itself always
    ranks first with a score of -0.0.
    """
    x_index = word2index[x]
    # Vectorized: one (|vocab| x hidden) broadcasted subtraction instead of
    # a Python-level loop per word; produces the same per-row distances as
    # the original element-wise computation.
    distances = np.sqrt(((weight_0_1 - weight_0_1[x_index]) ** 2).sum(axis=1))
    scores = Counter()
    # Fill the Counter in word2index order so that ties break exactly as in
    # the original loop; float() keeps plain-Python floats in the output.
    for word, index in word2index.items():
        scores[word] = -float(distances[index])
    return scores.most_common(10)
print(word_similar(word))
[('man', -0.0), ('slingblade', -0.5648583520454541), ('ashok', -0.6107447183616072), ('talen', -0.6244865719084539), ('tsurube', -0.6288293245899911), ('mulroney', -0.6339924052895604), ('conga', -0.6347788631488298), ('unprofessionally', -0.6373325281488093), ('longinotto', -0.6391352679326392), ('causality', -0.6397687287907046)]
数据集链接:
https://download.csdn.net/download/zjh12312311/12868190