Importing packages and the word-embedding data
import numpy as np
import pandas as pd
import w2v_utils
word, word_to_vec_map = w2v_utils.read_glove_vecs("data/glove.6B.50d.txt")
word = pd.DataFrame(list(word))  # read_glove_vecs returns a set of words; convert to a list before building a DataFrame
print(word.shape)  # (vocab_size, 1)
word_to_vec_map = pd.DataFrame(word_to_vec_map)  # one column per word, one row per embedding dimension
print(word_to_vec_map.shape)  # (50, vocab_size)
Measure how similar two words are with cosine similarity.
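For reference, cosine similarity divides the dot product of the two vectors by the product of their L2 norms, so the result lies in [-1, 1] (1 = same direction, 0 = unrelated, -1 = opposite):

cos(u, v) = (u · v) / (‖u‖₂ · ‖v‖₂)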
def cosine_similarity(u, v):
    return np.dot(u.T, v) / (np.sqrt(np.sum(u ** 2)) * np.sqrt(np.sum(v ** 2)))
dis = cosine_similarity(word_to_vec_map["man"], word_to_vec_map["woman"])
dis1 = cosine_similarity(word_to_vec_map["france"], word_to_vec_map["italy"])
dis2 = cosine_similarity(word_to_vec_map["ball"], word_to_vec_map["man"])
dis3 = cosine_similarity(word_to_vec_map["man"] - word_to_vec_map["woman"],
                         word_to_vec_map["boy"] - word_to_vec_map["girl"])
print(dis, dis1, dis2, dis3, sep="\n")
Output (higher means more similar):
0.886033771849582
0.7788637392080092
0.6076481498468148
0.6695094729169302
Finding the word that completes an analogy (man : woman :: boy : ?):
def find_similarity(u, v, ec, word2vec):
    # Find the word et such that u - v ≈ ec - et.
    d = u - v
    max_sim = -9999
    et = ""
    for i in word2vec:
        if (ec - word2vec[i]).sum() == 0:  # skip ec itself: a zero difference vector would make the denominator 0
            continue
        sim = cosine_similarity(d, ec - word2vec[i])  # compute once instead of twice
        if sim > max_sim:
            max_sim = sim
            et = i
    return et

print(find_similarity(word_to_vec_map["man"], word_to_vec_map["woman"], word_to_vec_map["boy"], word_to_vec_map))
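Looping over the full GloVe vocabulary in Python is slow. Here is a vectorized sketch of the same search, assuming word_to_vec_map is the (50, vocab_size) DataFrame built above; find_similarity_vec is a name introduced here, not part of the original:

import numpy as np

def find_similarity_vec(u, v, ec, word2vec):
    # One matrix of candidate differences: column j holds ec - word2vec[:, j].
    d = (u - v).to_numpy()
    cand = ec.to_numpy()[:, None] - word2vec.to_numpy()  # shape (50, vocab_size)
    norms = np.linalg.norm(cand, axis=0)
    norms[norms == 0] = np.inf                           # skip ec itself (zero difference)
    sims = (d @ cand) / (np.linalg.norm(d) * norms)
    return word2vec.columns[np.argmax(sims)]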
Importing packages and the word-embedding data (the emoji package could not be installed in the lab, so it is commented out):
import emo_utils
import numpy as np
import keras as ks
import os
# import emoji
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress TensorFlow log noise
train_x, train_y = emo_utils.read_csv("data/train_emoji.csv")
test_x, test_y = emo_utils.read_csv("data/test.csv")
words_to_index, index_to_words, word_to_vec_map = emo_utils.read_glove_vecs("data/glove.6B.50d.txt")
# print(train_x.shape,train_y.shape)
# print(train_y)
One-hot encoding:
def one_hot(y):  # alternatives: tf.one_hot() | sklearn.preprocessing.OneHotEncoder()
    y_one_hot = np.zeros((y.shape[0], y.max() + 1))
    for i in range(y.shape[0]):
        y_one_hot[i, y[i]] = 1
    return y_one_hot
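Equivalently, a NumPy one-liner (a sketch; one_hot_eye is a name introduced here, with the same behavior as above):

import numpy as np

def one_hot_eye(y):
    # Row y[i] of the identity matrix is exactly the one-hot vector for class y[i].
    return np.eye(y.max() + 1)[y]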
Average the word vectors of each sentence:
def average(x, word2vec):
    sentence_all = []
    for i in range(x.shape[0]):
        words = np.asarray(x[i].lower().split())  # lowercase and split into words
        sentence = []
        for j in words:
            sentence.append(word2vec[j])
        sentence_avg = np.sum(np.asarray(sentence), axis=0) / words.shape[0]
        sentence_all.append(sentence_avg)
    sentence_all = np.asarray(sentence_all)
    return sentence_all
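The same averaging fits in one line per sentence with np.mean (a sketch; average_mean is a name introduced here, and it assumes every word in x appears in the GloVe vocabulary, otherwise word2vec[w] raises KeyError):

import numpy as np

def average_mean(x, word2vec):
    # np.mean over axis 0 averages the word vectors of one sentence.
    return np.asarray([np.mean([word2vec[w] for w in s.lower().split()], axis=0) for s in x])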
Build the model with a single fully connected layer:
def avg_model(sentence_all, y):
    X_Input = ks.layers.Input(shape=(sentence_all.shape[1],))
    a1 = ks.layers.Dense(y.shape[1], activation="softmax")(X_Input)
    model = ks.Model(inputs=X_Input, outputs=a1)
    # 'lr' is the legacy Keras argument name; newer versions use learning_rate=
    model.compile(optimizer=ks.optimizers.Adam(lr=0.003), loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit(sentence_all, y, batch_size=132, epochs=1200)
    return model
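With 50-dimensional averaged inputs and the 5 emoji classes in this dataset, the single Dense layer has only 50 × 5 + 5 = 255 trainable parameters, which is why 1200 epochs over the 132 training examples finish in seconds.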
Run:
train_y = one_hot(train_y)
test_y = one_hot(test_y)
average_train_x = average(train_x, word_to_vec_map)
average_test_x = average(test_x, word_to_vec_map)
model = avg_model(average_train_x, train_y)
loss, acc = model.evaluate(average_test_x, test_y)
print(acc)
# model.predict()
# print(train_y)
Output: training accuracy 0.9697, test accuracy 0.8571.
Epoch 1/1200
132/132 [==============================] - 1s 8ms/step - loss: 2.1337 - acc: 0.2424
Epoch 2/1200
132/132 [==============================] - 0s 15us/step - loss: 2.0982 - acc: 0.2500
...
Epoch 1199/1200
132/132 [==============================] - 0s 15us/step - loss: 0.2240 - acc: 0.9697
0.8571428486279079
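To try the trained averaging model on a new sentence, average its word vectors the same way and take the argmax of the softmax output (a usage sketch; the sentence is a hypothetical example):

sample = average(np.asarray(["i love you"]), word_to_vec_map)  # shape (1, 50)
print(np.argmax(model.predict(sample), axis=1))                # predicted emoji label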
Importing packages:
import keras
import numpy as np
import emo_utils
import os
import pandas as pd
# import emoji
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress TensorFlow log noise
train_x, train_y = emo_utils.read_csv("data/train_emoji.csv")
test_x, test_y = emo_utils.read_csv("data/test.csv")
words_to_index, index_to_words, word_to_vec_map = emo_utils.read_glove_vecs("data/glove.6B.50d.txt")
word_to_vec_map = pd.DataFrame(word_to_vec_map)

def one_hot(y):
    y_one_hot = np.zeros((y.shape[0], y.max() + 1))
    for i in range(y.shape[0]):
        y_one_hot[i, y[i]] = 1
    return y_one_hot
Build the embedded inputs; since sentences have different lengths, pad them with zeros to a uniform length.
def build_embed(x, word2vec, max_sentence_size=20):
    # Output shape: (num_sentences, max_sentence_size, embedding_dim); unused slots stay zero.
    sentence_all = np.zeros((x.shape[0], max_sentence_size, word2vec.shape[0]))
    for i in range(x.shape[0]):
        length = 0
        words = np.asarray(x[i].lower().split())
        sentence = np.zeros((max_sentence_size, word2vec.shape[0]))
        for j in words:
            sentence[length] = word2vec[j]
            length += 1
        sentence_all[i] = sentence
    return sentence_all
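A quick sanity check of the output shape (the expected numbers assume the 132-sentence training set and 50-d GloVe vectors used above):

embed_train = build_embed(train_x, word_to_vec_map)
print(embed_train.shape)  # expected: (132, 20, 50)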
Build the model:
def LSTM_model(x, y, word2vec, max_sentence_size=20):
    embed_x = build_embed(x, word2vec, max_sentence_size)
    input_x = keras.Input(shape=(embed_x.shape[1], embed_x.shape[2]))  # LSTM input: [batch, timesteps, features]
    lstm1 = keras.layers.LSTM(128, return_sequences=True)(input_x)
    drop1 = keras.layers.Dropout(0.5)(lstm1)
    lstm2 = keras.layers.LSTM(128, return_sequences=False)(drop1)  # keep only the last timestep's output
    drop2 = keras.layers.Dropout(0.5)(lstm2)
    classification = keras.layers.Dense(y.shape[1], activation="softmax")(drop2)
    model = keras.Model(input_x, classification)
    model.compile(optimizer=keras.optimizers.Adam(), loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit(embed_x, y, epochs=100)
    return model
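One possible refinement (a sketch, not part of the original): zero padding is fed to the LSTM as real timesteps, and a Masking layer would make it skip them. lstm_model_masked is a name introduced here:

import keras

def lstm_model_masked(max_sentence_size=20, embed_dim=50, num_classes=5):
    x = keras.Input(shape=(max_sentence_size, embed_dim))
    h = keras.layers.Masking(mask_value=0.0)(x)  # ignore all-zero (padded) timesteps
    h = keras.layers.LSTM(128, return_sequences=True)(h)
    h = keras.layers.Dropout(0.5)(h)
    h = keras.layers.LSTM(128)(h)
    h = keras.layers.Dropout(0.5)(h)
    out = keras.layers.Dense(num_classes, activation="softmax")(h)
    model = keras.Model(x, out)
    model.compile(optimizer=keras.optimizers.Adam(), loss="categorical_crossentropy", metrics=["accuracy"])
    return model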
Run:
train_y = one_hot(train_y)
test_y = one_hot(test_y)
model = LSTM_model(train_x, train_y, word_to_vec_map)
embed_test = build_embed(test_x, word_to_vec_map)
loss, acc = model.evaluate(embed_test, test_y)
print(acc)
a = []
a.append("i am angry")
a.append("i am not happy")
a.append("let us play basketball")
a = np.asarray(a)
diy_sentence = build_embed(a, word_to_vec_map)
print(np.argmax(model.predict(diy_sentence), axis=1))
Training log: training accuracy 1.0, test accuracy 0.91.
Epoch 1/100
32/132 [======>.......................] - ETA: 5s - loss: 1.6080 - acc: 0.1875
96/132 [====================>.........] - ETA: 0s - loss: 1.5970 - acc: 0.2708
132/132 [==============================] - 2s 15ms/step - loss: 1.5933 - acc: 0.2803
...
Epoch 100/100
32/132 [======>.......................] - ETA: 0s - loss: 0.0032 - acc: 1.0000
96/132 [====================>.........] - ETA: 0s - loss: 0.0019 - acc: 1.0000
132/132 [==============================] - 0s 1ms/step - loss: 0.0019 - acc: 1.0000
0.910714294229235
Testing with three custom sentences gives the output [3 3 1]: the LSTM model correctly tells apart "not happy", which the fully connected model on averaged vectors cannot, since averaging throws away word order.
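For reference, in the label mapping this dataset is usually distributed with (to the best of my knowledge), label 3 is the disappointed-face emoji and label 1 the baseball emoji, so "i am angry" and "i am not happy" both map to the sad face while "let us play basketball" maps to the ball.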