Pytorch-LSTM+Attention文本分类

摘抄笔记

语料链接:https://pan.baidu.com/s/1aDIp3Hxw-Xuxcx-lQ_0w9A
提取码:hpg7

train.txt  pos/neg各500条,一共1000条(用于训练模型)
dev.txt    pos/neg各100条,一共200条(用于调参数)
test.txt    pos/neg各150条,一共300条(用于测试)

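例如:下面是一个正面样本的例子(每条样本占两行:第一行是标签,第二行是文本;样本之间用空行分隔)。
注:原文中的标签在网页转存时丢失,下面的写法是根据 get_dataset 中的切片位置([10] 和 [6:-7])推测还原的,请以实际语料文件为准。
<Polarity>1</Polarity>
<text>sit back in one of those comfortable chairs.</text>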

1. 数据预处理

加载数据、创建vocabulary、创建iterator

import numpy as np
import torch
import torch.nn.functional as F 
from torchtext import data
import math
import time

SEED = 123
BATCH_SIZE = 128
LEARNING_RATE = 1e-3      # learning rate
EMBEDDING_DIM = 100       # word-embedding dimension

# Pick the compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's random number generator for reproducibility.
# The SEED constant is used here so the value is defined in exactly one place.
torch.manual_seed(SEED)

# Two Field objects describe how each column is processed:
#   TEXT  - whitespace-tokenised and lower-cased text column
#   LABEL - label column, stored as float
# str.split is equivalent to `lambda x: x.split()` for string input.
# NOTE(review): data.Field / data.LabelField are the legacy torchtext API
# (moved to torchtext.legacy in 0.9 and removed in 0.12) - confirm the
# installed torchtext version still provides them.
TEXT = data.Field(tokenize=str.split, lower=True)
LABEL = data.LabelField(dtype=torch.float)

# get_dataset: build the examples and fields required by a torchtext Dataset
def get_dataset(corpus_path, text_field, label_field):
    """Parse a corpus file into torchtext examples.

    The file is read as a sequence of records separated by blank lines.
    Within a record the first line carries the label and the second line
    carries the text; both are extracted by fixed character offsets (see
    ``add_example`` below). Any number of consecutive blank lines, including
    leading ones, is treated as a single separator, and a final record that
    is not followed by a blank line is still emitted.

    Args:
        corpus_path: Path of the corpus file to read (decoded as UTF-8).
        text_field: Field object used to process the ``text`` column.
        label_field: Field object used to process the ``label`` column.

    Returns:
        A tuple ``(examples, fields)`` where ``examples`` is a list of
        ``data.Example`` objects and ``fields`` is the
        ``[('text', text_field), ('label', label_field)]`` pairing.

    Raises:
        IndexError: If a record has fewer than two lines, or its first line
            is shorter than 11 characters.
    """
    fields = [('text', text_field), ('label', label_field)]   # column name -> Field pairing
    examples = []

    def add_example(record):
        # Fixed offsets: the label is the single character at index 10 of the
        # first line; the text is the second line without its first 6 and
        # last 7 characters.
        # NOTE(review): presumably the lines look like
        # `<Polarity>1</Polarity>` and `<text>...</text>` - confirm against
        # the corpus files.
        label = record[0][10]
        text = record[1][6:-7]
        examples.append(data.Example.fromlist([text, label], fields=fields))

    with open(corpus_path, encoding='utf-8') as f:
        record = []
        # Iterating over the file separates "blank line" from "end of file",
        # so extra blank lines no longer stop parsing early.
        for raw_line in f:
            line = raw_line.rstrip('\n')
            if line:
                record.append(line)
            elif record:            # blank line closes the current record
                add_example(record)
                record = []
        if record:                  # last record without a trailing blank line
            add_example(record)
    return examples, fields

# Parse every split into the (examples, fields) pair needed to build a Dataset.
# NOTE(review): the file names used here ('trains.txt', 'tests.txt') differ
# from the 'train.txt' / 'test.txt' names given in the corpus description -
# confirm against the files actually present in ./corpus.
train_examples, train_fields = get_dataset('./corpus/trains.txt', TEXT, LABEL)
dev_examples, dev_fields = get_dataset('./corpus/dev.txt', TEXT, LABEL)
test_examples, test_fields = get_dataset('./corpus/tests.txt', TEXT, LABEL)

# Wrap the parsed examples in torchtext Dataset objects.
train_data = data.Dataset(train_examples, train_fields)
dev_data = data.Dataset(dev_examples, dev_fields)
test_data = data.Dataset(test_examples, test_fields)
# Debug aid - uncomment to inspect the parsed samples:
# for t in test_data:
#     print(t.text, t.label)

# Split sizes recorded in the original notes: 1000 / 200 / 300.
print('len of train data:', len(train_data))  # 1000
print('len of dev data:', len(dev_data))      # 200
print('len of test data:', len(test_data))    # 300

# Build the vocabularies from the training split only.
# The text vocabulary is capped with max_size=5000 and is given the
# 'glove.6B.100d' vectors; the label vocabulary is built from the same split.
TEXT.build_vocab(train_data, vectors='glove.6B.100d', max_size=5000)
LABEL.build_vocab(train_data)

# Quick sanity checks; the trailing values are those recorded in the original notes.
text_vocab = TEXT.vocab
print(len(text_vocab))             # 3287
# Recorded output: ['<unk>', '<pad>', 'the', 'and', 'a', 'to', 'is', 'was', 'i', 'of', 'for', 'in']
# NOTE(review): the first two entries were blank in the original notes; they
# are presumably torchtext's default '<unk>' and '<pad>' tokens - confirm.
print(text_vocab.itos[:12])
print(text_vocab.stoi['love'])     # 129
# print(text_vocab.stoi)           # defaultdict, presumably {'<unk>': 0, '<pad>': 1, ...}

# Create one iterator per split; each iteration yields one batch of examples.
# All three share the same batch size and device, with sort disabled.
datasets = (train_data, dev_data, test_data)
iterators = data.BucketIterator.splits(
    datasets,
    batch_size=BATCH_SIZE,
    device=device,
    sort=False,
)
train_iterator, dev_iterator, test_iterator = iterators

你可能感兴趣的:(Pytorch-LSTM+Attention文本分类)