MindSpore入门:使用LSTM进行文本情感分析

MindSpore是华为最近开源的深度学习框架,根据官方的说法,开发这款深度学习框架主要是为了充分利用华为自研的昇腾AI处理器(Ascend)的硬件能力,当然这款框架除了运行在Ascend平台也可以运行在CPU和GPU上面。由于该框架只开发到了0.3版本,目前网络上相关的资料比较少,所以这篇博客想要通过一个简单的小项目,介绍一下如何使用MindSpore训练一个深度学习模型。想要更深入地学习MindSpore,可以访问它的官网:https://www.mindspore.cn 和项目代码仓库:https://gitee.com/mindspore/mindspore

这篇Notebook介绍如何使用MindSpore对IMDB数据集中的电影评论进行情感分析。主要思路就是对电影评论中的单词进行词嵌入处理,然后将处理后的数据送入LSTM模型,模型对评论进行打标签(正面或者负面)。

整个处理过程分为三个部分:

  • 准备数据:该教程使用的数据采用IMDB影评数据集,下载地址:http://ai.stanford.edu/~amaas/data/sentiment/ , 如果需要运行该notebook你需要把下载之后的数据解压之后放到 ./data/imdb目录下。由于我们需要对评论中的单词进行词嵌入处理,所以我们还需要用到预训练好的词向量,这里我们不再自己去训练词向量,而是直接采用GloVe,下载地址为:http://nlp.stanford.edu/data/glove.6B.zip。 该文件解压之后包含多个txt文件,每个txt文件的数据都是常用词汇的词向量,只不过向量的维度不同,分为50、100、200、300四种,向量维度越高,词向量的表达能力越强。你可以根据需要选择一个文件使用,将文件放到 ./data/glove目录下面。imdb和glove下载完成之后,我们需要将原始的文本数据经过切词、词嵌入、对齐之后,保存为mindrecord格式。

  • 模型训练:MindSpore为我们定义好了很多常用模型,我们可以直接从model_zoo中选择基于LSTM实现的SentimentNet使用。

  • 模型评估:使用MindSpore定义好的接口可方便地对训练好的模型进行评估,比如准确率等等。

详细的处理流程,可以参考下面的代码。

准备数据

import os
import math
from itertools import chain
import gensim
import numpy as np
from mindspore.mindrecord import FileWriter
def read_imdb(path, seg='train'):
    """Read the IMDB split under <path>/<seg>/{pos,neg}.

    Returns a list of [review_text, label] pairs with label 1 for
    positive reviews and 0 for negative ones; newlines inside a review
    are stripped so each review is a single line of text.
    """
    samples = []
    for sentiment, tag in (('pos', 1), ('neg', 0)):
        folder = os.path.join(path, seg, sentiment)
        for fname in os.listdir(folder):
            with open(os.path.join(folder, fname), 'r', encoding='utf8') as fh:
                text = fh.read().replace('\n', '')
            samples.append([text, tag])
    return samples
def tokenize_samples(raw_data):
    """Lower-case and whitespace-split each review into a token list."""
    return [[word.lower() for word in text.split()] for text in raw_data]
def encode_samples(tokenized_samples, word_to_idx):
    """Map every token to its vocabulary index.

    tokenized_samples: [[word, word, ...], ...]
    word_to_idx: {word: idx, ...}
    Returns [[idx, idx, ...], ...]; tokens missing from the vocabulary
    fall back to index 0.
    """
    lookup = word_to_idx.get
    return [[lookup(token, 0) for token in sample]
            for sample in tokenized_samples]
def pad_samples(features, maxlen=500, pad=0):
    """Truncate or right-pad each index list to exactly ``maxlen`` entries.

    features: list of lists of token indices.
    maxlen: target length of every output row.
    pad: value used to fill short rows.

    Bug fix: the original aliased the caller's list (``padded_feature =
    feature``) and then appended padding to it, mutating the input in
    place. This version always builds a new list and leaves ``features``
    untouched.
    """
    padded_features = []
    for feature in features:
        if len(feature) >= maxlen:
            padded_features.append(feature[:maxlen])
        else:
            # New list: never mutate the caller's data.
            padded_features.append(feature + [pad] * (maxlen - len(feature)))
    return padded_features
def prepare_data(imdb_data_path='./data/imdb/aclImdb'):
    """Load IMDB train/test splits and turn them into padded index matrices.

    Returns (X_train, y_train, X_test, y_test, word_to_idx). The
    vocabulary is built from the training split only, so test-set words
    never seen in training map to index 0 (the padding/OOV slot).
    """
    train_raw = read_imdb(imdb_data_path, seg='train')
    test_raw = read_imdb(imdb_data_path, seg='test')

    y_train = np.array([tag for _, tag in train_raw]).astype(np.int32)
    y_test = np.array([tag for _, tag in test_raw]).astype(np.int32)

    train_tokens = tokenize_samples([text for text, _ in train_raw])
    test_tokens = tokenize_samples([text for text, _ in test_raw])

    # Index 0 is reserved for padding / out-of-vocabulary tokens,
    # so real words are numbered starting at 1.
    vocab = set(chain(*train_tokens))
    word_to_idx = {word: idx for idx, word in enumerate(vocab, start=1)}
    word_to_idx[''] = 0

    X_train = np.array(
        pad_samples(encode_samples(train_tokens, word_to_idx))).astype(np.int32)
    X_test = np.array(
        pad_samples(encode_samples(test_tokens, word_to_idx))).astype(np.int32)
    return X_train, y_train, X_test, y_test, word_to_idx
# Run the full preprocessing pipeline (expects ./data/imdb/aclImdb on disk).
X_train, y_train, X_test, y_test, word_to_idx = prepare_data()
# gensim's load_word2vec_format expects a "<vocab_count> <dim>" header line
# that raw GloVe files lack; this one-off shell command (run once, then keep
# commented out) prepends "400000 50" to the 50-d GloVe file:
#!sed -i '1i\400000 50' ./data/glove/glove.6B.50d.txt
def load_embeddings(glove_file_path, word_to_idx, embed_size=50):
    """Build an embedding matrix whose rows line up with word_to_idx.

    glove_file_path: GloVe file in word2vec text format (with header line).
    word_to_idx: {word: row_index} mapping produced by prepare_data.
    embed_size: expected vector dimension; must match the GloVe file.

    Words absent from the pretrained vocabulary keep an all-zero row,
    which also covers index 0 (the padding/OOV slot).
    """
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        glove_file_path, binary=False, encoding='utf-8')
    assert embed_size == vectors.vector_size
    table = np.zeros((len(word_to_idx), embed_size)).astype(np.float32)
    for token, row in word_to_idx.items():
        try:
            table[row, :] = vectors.word_vec(token)
        except KeyError:
            # Token has no pretrained vector -> leave the zero row.
            continue
    return table
# 50-dimensional GloVe vectors; using another glove.6B file requires
# changing embed_size accordingly.
embeddings = load_embeddings('./data/glove/glove.6B.50d.txt', word_to_idx)
def get_json_data_list(X, y):
    """Pack parallel feature/label arrays into the row dicts FileWriter expects.

    Each row carries a sequential "id", the flattened feature vector, and
    the label as a plain Python int.
    """
    return [
        {"id": row_id, "feature": row.reshape(-1), "label": int(tag)}
        for row_id, (row, tag) in enumerate(zip(X, y))
    ]
def _write_mindrecord_split(file_path, X, y, schema_json):
    """Write one split (features X, labels y) as a 4-shard mindrecord file."""
    writer = FileWriter(file_path, shard_num=4)
    writer.add_schema(schema_json, "nlp_schema")
    writer.add_index(["id", "label"])
    writer.write_raw_data(get_json_data_list(X, y))
    writer.commit()
def convert_np_to_mindrecord(X_train, y_train, X_test, y_test, mindrecord_save_path="./data/mindrecord"):
    """Serialize the train and test splits to mindrecord format.

    Produces aclImdb_train.mindrecord* and aclImdb_test.mindrecord*
    (4 shards each) under mindrecord_save_path, creating the directory
    if needed. The duplicated writer code of the original has been
    factored into _write_mindrecord_split.
    """
    schema_json = {"id": {"type": "int32"},
                   "label": {"type": "int32"},
                   "feature": {"type": "int32", "shape": [-1]}}
    # Ensure the output directory exists instead of failing inside FileWriter.
    os.makedirs(mindrecord_save_path, exist_ok=True)
    _write_mindrecord_split(
        os.path.join(mindrecord_save_path, "aclImdb_train.mindrecord"),
        X_train, y_train, schema_json)
    _write_mindrecord_split(
        os.path.join(mindrecord_save_path, "aclImdb_test.mindrecord"),
        X_test, y_test, schema_json)
!ls ./data/mindrecord
aclImdb_test.mindrecord0     aclImdb_train.mindrecord0
aclImdb_test.mindrecord0.db  aclImdb_train.mindrecord0.db
aclImdb_test.mindrecord1     aclImdb_train.mindrecord1
aclImdb_test.mindrecord1.db  aclImdb_train.mindrecord1.db
aclImdb_test.mindrecord2     aclImdb_train.mindrecord2
aclImdb_test.mindrecord2.db  aclImdb_train.mindrecord2.db
aclImdb_test.mindrecord3     aclImdb_train.mindrecord3
aclImdb_test.mindrecord3.db  aclImdb_train.mindrecord3.db
# Persist the embedding table and both dataset splits so the training
# stage below can run without repeating the preprocessing.
np.savetxt("./data/mindrecord/weight.txt", embeddings)
convert_np_to_mindrecord(X_train, y_train, X_test, y_test)

创建数据集

import mindspore.dataset as mds
def create_dataset(base_path, batch_size, num_epochs, is_train):
    """Build a shuffled, batched, repeated MindDataset over one split.

    base_path: directory holding the mindrecord shards.
    batch_size: rows per batch (incomplete tail batches are dropped).
    num_epochs: how many times the dataset repeats.
    is_train: selects the train or test split.

    Bug fix: columns_list and num_consumer were defined but never used
    (literals were passed to MindDataset instead); they are now wired in.
    """
    columns_list = ["feature", "label"]
    num_consumer = 4
    # Pointing at shard 0 is enough — MindDataset discovers the sibling shards.
    if is_train:
        path = os.path.join(base_path, "aclImdb_train.mindrecord0")
    else:
        path = os.path.join(base_path, "aclImdb_test.mindrecord0")
    dataset = mds.MindDataset(path, columns_list=columns_list,
                              num_parallel_workers=num_consumer)
    dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())
    dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
    dataset = dataset.repeat(count=num_epochs)
    return dataset
# num_epochs here must match the epoch count passed to model.train below.
dataset_train = create_dataset("./data/mindrecord", batch_size=32, num_epochs=10, is_train=True)

定义模型并训练

from mindspore import Tensor, nn, Model, context, Parameter
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.nn import Accuracy
from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor
from mindspore.model_zoo.lstm import SentimentNet

# Configure the execution backend before any network/graph objects are
# built (the original called set_context after constructing the model and
# re-imported context a second time).
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="GPU")

# Reload the GloVe-initialised embedding table saved during preprocessing.
# (Also fixes the "embedding_tabel" typo of the original.)
embedding_table = np.loadtxt(os.path.join("./data/mindrecord", "weight.txt")).astype(np.float32)

network = SentimentNet(vocab_size=embedding_table.shape[0],
                       embed_size=50,        # must match the GloVe file used
                       num_hiddens=100,
                       num_layers=2,
                       bidirectional=False,
                       num_classes=2,        # positive / negative
                       weight=Tensor(embedding_table),
                       batch_size=32)

# sparse=True: labels are class indices (0/1), not one-hot vectors.
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)
loss_callback = LossMonitor(per_print_times=60)
model = Model(network, loss, opt, {'acc': Accuracy()})

# Checkpoint every 390 steps, keeping at most the 10 most recent files.
config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)
checkpoint_cb = ModelCheckpoint(prefix="lstm", directory="./model", config=config_ck)

model.train(10, dataset_train, callbacks=[checkpoint_cb, loss_callback], dataset_sink_mode=False)

评估模型

# Evaluation needs only a single pass over the test split; the original
# used num_epochs=10, which repeated the test set ten times and made
# model.eval take ~10x longer for the same accuracy figure.
dataset_test = create_dataset("./data/mindrecord", batch_size=32, num_epochs=1, is_train=False)
acc = model.eval(dataset_test)
print("accuracy:{}".format(acc))
accuracy:{'acc': 0.6604833546734955}

你可能感兴趣的:(MindSpore入门:使用LSTM进行文本情感分析)