Sentiment Analysis of the IMDB Dataset with an LSTM

IMDB Sentiment Analysis

Step 1: Import the packages

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import collections
import pandas as pd
import numpy as np
import os
import codecs
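
NLTK ships its tokenizer model and stop-word list separately from the library itself, so they may need a one-off download before Step 3. A minimal sketch (the resource names are the standard NLTK ones):

# One-time download of the NLTK resources used below
nltk.download('punkt')      # tokenizer model used by nltk.word_tokenize
nltk.download('stopwords')  # English stop-word list used in Step 3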

Step 2: Load the reviews and set the labels

pos_list=[]
with open('pos_all.txt','r',encoding='utf8') as f:
    pos_list.extend(f.readlines())
    print(len(pos_list))
neg_list=[]
with open('neg_all.txt','r',encoding='utf8') as f:
    neg_list.extend(f.readlines())
    print(len(neg_list))
# Create the labels: 1 for positive reviews, 0 for negative reviews
label=[1 for i in range(len(pos_list))]
label.extend([0 for i in range(len(neg_list))])
# Merge the review texts (note: list.extend returns None, so concatenate instead)
content=pos_list+neg_list
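
The two files give roughly 7,300 positive and 12,500 negative reviews, so the classes are imbalanced. A quick sanity check that texts and labels line up, using the collections module imported above (an optional check, not part of the original pipeline):

# Sanity check: texts and labels must have the same length,
# and the class counts should match the file sizes printed above
print(len(content), len(label))
print(collections.Counter(label))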

Step 3: Preprocess the dataset, filtering out stop words and punctuation

seq=[]         # one token list per review
all_words=[]   # flat list of every kept token (handy for vocabulary statistics)
stop_words=set(stopwords.words('english'))
for con in content:
    words=nltk.word_tokenize(con)
    line=[]
    for word in words:
        # keep alphabetic tokens only, and drop English stop words
        if word.isalpha() and word.lower() not in stop_words:
            line.append(word)
    seq.append(line)
    all_words.extend(line)
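
To confirm what the filtering keeps, it can help to print one review before and after cleaning (an optional check; index 0 is an arbitrary choice):

# Inspect the effect of the cleaning on one review
print(content[0][:200])   # raw text
print(seq[0][:20])        # first 20 kept tokens: alphabetic, non-stop-word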

Step 4: Build the word index and pad to a maximum length

tokenizer = Tokenizer()
# Build the vocabulary from the cleaned token lists
tokenizer.fit_on_texts(seq)
# Optional: a binary bag-of-words matrix (not used by the LSTM below)
# one_hot_results = tokenizer.texts_to_matrix(seq, mode='binary')

# Get the word index (word -> integer id)
word_index=tokenizer.word_index
print(len(word_index))
sequences=tokenizer.texts_to_sequences(seq)
# Pad/truncate every review to a fixed length of 800 tokens
final_sequences=sequence.pad_sequences(sequences,maxlen=800)
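
The padded rows are just integer ids. Mapping a few of them back to words through the inverse of word_index is a quick way to confirm the encoding (a small sketch built only from the word_index defined above):

# Invert the word index and decode the tail of the first padded review
# (id 0 is the padding value and has no entry in the index)
index_word = {i: w for w, i in word_index.items()}
print(final_sequences.shape)   # (number of reviews, 800)
print([index_word.get(i, '<PAD>') for i in final_sequences[0][-20:]])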

Step 5: Shuffle and split the data

# Convert the labels to a numpy array
label=np.array(label)
# Randomly shuffle the data
indices=np.random.permutation(len(final_sequences))
X=final_sequences[indices]
y=label[indices]
# Split into training and test sets (80% / 20%)
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2)
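
Because the classes are imbalanced, one option is to fix the random seed and stratify the split so both sets keep the same positive/negative ratio (a sketch of an alternative call; random_state=42 is an arbitrary choice):

# Reproducible, stratified alternative to the split above
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
print(np.bincount(ytrain), np.bincount(ytest))   # class counts per split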

Step 6: Build and train the network

# Build the network
vocab_size=len(word_index)+1   # +1 because id 0 is reserved for padding
model=Sequential()
model.add(Embedding(vocab_size,256,input_length=800))
model.add(LSTM(128,dropout=0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(Xtrain,ytrain,batch_size=32,epochs=1,validation_data=(Xtest,ytest))
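
After training, any new text has to go through the same cleaning, tokenizer, and padding before model.predict can score it. A sketch of evaluating on the test set and classifying one unseen review (the sample sentence is invented for illustration):

# Evaluate on the held-out test set
loss, acc = model.evaluate(Xtest, ytest, batch_size=32)
print('test accuracy:', acc)

# Score a new review: same cleaning, same tokenizer, same padding length
new_review = "The film was surprisingly good and the acting was excellent"
tokens = [w for w in nltk.word_tokenize(new_review)
          if w.isalpha() and w.lower() not in stop_words]
encoded = sequence.pad_sequences(tokenizer.texts_to_sequences([tokens]), maxlen=800)
prob = model.predict(encoded)[0][0]   # probability of the positive class
print('positive' if prob > 0.5 else 'negative', prob)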
