PyTorch Deep Learning in Practice 36 - One-Dimensional Convolutional Neural Networks

Dataset: user reviews collected from a food-delivery platform, with 4,000 positive and roughly 8,000 negative reviews.

## Field description

- label: 1 for a positive review, 0 for a negative review
- review: the review text
Install jieba and pandas:

 pip install jieba pandas -i https://pypi.doubanio.com/simple
import torch
# import torchtext 
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
from torch.utils.data import DataLoader

data = pd.read_csv('waimai_10k.csv')
data.head()    # label: 1 = positive review, 0 = negative review
#   label   review
# 0 1   很快,好吃,味道足,量大
# 1 1   没有送水没有送水没有送水
# 2 1   非常快,态度好。
# 3 1   方便,快捷,味道可口,快递给力
# 4 1   菜味道很棒!送餐很及时!

data.info()   
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 11987 entries, 0 to 11986
# Data columns (total 2 columns):
#  #   Column  Non-Null Count  Dtype 
# ---  ------  --------------  ----- 
#  0   label   11987 non-null  int64 
#  1   review  11987 non-null  object
# dtypes: int64(1), object(1)
# memory usage: 187.4+ KB

data.label.value_counts()    # count positive and negative reviews
# 0    7987
# 1    4000
# Name: label, dtype: int64
# The dataset is imbalanced: the two classes have different sizes. You could downsample the negatives so both classes have 4,000 reviews, or oversample the positives; a minimal sketch follows below.
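
If you did want to balance the classes, a minimal downsampling sketch could look like the following (not applied in the rest of this walkthrough; the variable names and random_state are illustrative assumptions):

# Optional: downsample the negative class so both classes have 4,000 reviews
neg_sampled = data[data.label == 0].sample(4000, random_state=0)
balanced = pd.concat([data[data.label == 1], neg_sampled]).sample(frac=1, random_state=0)   # shuffle the rows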

jieba.lcut('这是日月光华在网易云课堂的课程')    # Chinese word segmentation with jieba
# ['这是', '日月', '光华', '在', '网易', '云', '课堂', '的', '课程']

def pre_text(text):
    text = text.replace(',', '').replace('!', '')   # strip full-width commas and exclamation marks
    return jieba.lcut(text)   # return the list of tokens

data['review'] = data.review.apply(pre_text)   # apply pre_text to every review

data['review']     
# 0                                      [很快, 好吃, 味道, 足量, 大]
# 1                                 [没有, 送水, 没有, 送水, 没有, 送水]
# 2                                        [非常, 快, 态度, 好, 。]
# 3                                 [方便快捷, 味道, 可口, 快, 递给, 力]
# 4                                   [菜, 味道, 很棒, 送餐, 很, 及时]
#                                ...                        
# 11982                   [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
# 11983    [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
# 11984                                  [凉皮, 太辣, ,, 吃不下, 都]
# 11985                                [本来, 迟到, 了, 还, 自己, 点]
# 11986    [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 。, 凉面, 没, 想象, ...
# Name: review, Length: 11987, dtype: object

Text processing takes two steps:
    1. tokenize and build a vocabulary
    2. embedding

# import the vocabulary-building utility (maps tokens to indices, e.g. 日月: 1, 光华: 2, 吃饭: 3, 调料: 4)
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data):
    for text in data:
        yield text     # yield each tokenized review (generator)

# Build the vocabulary, encoding each token as an integer index
vocab = build_vocab_from_iterator(yield_tokens(data['review']),
                                  specials=['<pad>', '<unk>'],    # special tokens: '<pad>' for padding (index 0), '<unk>' for unknown words (index 1)
                                  min_freq=2)   # drop tokens that occur fewer than 2 times; they are too rare to be informative

vocab.set_default_index(vocab['<unk>'])   # make '<unk>' the default index for out-of-vocabulary tokens

vocab['调料']
# 965

vocab(['很快', '好吃', '味道', '足量', '大'])   # calling vocab on a list of token strings returns their indices
# [56, 15, 14, 5229, 114]

vocab['<unk>']   # the default index for unknown tokens
# 1

vocab['山峰']   # a word that never appears in the reviews falls back to '<unk>' (index 1)
# 1
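
To spot-check the vocabulary itself, recent torchtext Vocab objects expose len() and get_itos() (a small inspection sketch; the exact counts depend on the jieba and torchtext versions):

len(vocab)            # number of tokens kept after min_freq filtering, plus the two specials
vocab.get_itos()[:5]  # index-to-token list; index 0 is '<pad>' and index 1 is '<unk>'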

data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 11987 entries, 0 to 11986
# Data columns (total 2 columns):
#  #   Column  Non-Null Count  Dtype 
# ---  ------  --------------  ----- 
#  0   label   11987 non-null  int64 
#  1   review  11987 non-null  object
# dtypes: int64(1), object(1)
# memory usage: 187.4+ KB

i = int(len(data)*0.8)      # use 80% of the rows for training
train_data = data.sample(i)  # randomly sample i rows with DataFrame.sample
train_data.head()
#           label   review
# 5365  0   [两个, 小时, 才, 送到, 慢]
# 11666 0   [第一次, 见, 服务态度, 这么, 差, 的, 点, 了, 一份, 套餐, 和, 一根, ...
# 766   1   [很, 是, 不错, 的, 送餐, 体验, 。]
# 3570  1   [不错, 不错, 胃, 不, 舒服, 才, 点, 的, 粥, 清爽, 料足, 。, 外卖, ...
# 11969 0   [谢谢, 速度, 很快, 辛苦, 了]

len(train_data)
# 9589

# iloc selects rows by position; keep every row whose index is not in train_data
test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]   # the remaining rows become the test set
test_data.head()
#   label   review
# 0 1   [很快, 好吃, 味道, 足量, 大]
# 7 1   [超级, 快, 就, 送到, 了, 这么, 冷, 的, 天气, 骑士, 们, 辛苦, 了, ...
# 17    1   [好吃, 速度, 包装, 也, 有, 品质, 不, 出, 家门, 就, 能, 吃, 到, 餐...
# 18    1   [味道, 好极, 啦, 送餐, 很快, 师傅, 辛苦, 啦]
# 21    1   [送货, 速度, 很快, 一直, 定, 这家, 赞]
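
Note that data.sample without a seed gives a different split on every run. A reproducible variant of the same split (random_state=42 is an arbitrary assumption) would be:

train_data = data.sample(frac=0.8, random_state=42)   # fixed seed for a reproducible 80% split
test_data = data.drop(index=train_data.index)         # the remaining 20%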

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data.values
# array([[0, list(['两个', '小时', '才', '送到', '慢'])],
#        [0,
#         list(['第一次', '见', '服务态度', '这么', '差', '的', '点', '了', '一份', '套餐', '和', '一根', '烤肠', '烤肠', '居然', '直接', '就', '没有', '送过来', '送到', '的', '时候', '饭菜', '都', '凉', '了', '也', '是', '我', '吃', '过', '的', '最', '难吃', '的', '梅菜', '扣肉', '再也', '不订', '他家', '的', '了'])],
#        [1, list(['很', '是', '不错', '的', '送餐', '体验', '。'])],
#        ...,
#        [0, list(['送餐', '服务', '有待', '提高', '味道', '不用说', '晚', '了', '半小时'])],
#        [0,
#         list(['岂', '一个', '慢字', '了', '得', '而且', '明明', '是', '自己', '送', '的', '晚', '还', '赖', '人家', '送', '外卖', '的', '小哥', '速度慢', '真是', '醉', '死', '了'])],
#        [1,
#        list(['一盒', '撒', '了', ',', '小哥', '又', '去', '排队', '拿', '了', '一盒', ',', '棒棒', '哒'])]],
#      dtype=object)

train_data.values[0]
# array([0, list(['两个', '小时', '才', '送到', '慢'])], dtype=object)

# Batch collate function for text; `batch` is one batch of (label, tokens) samples
def collate_batch(batch):
    label_list, text_list = [], []       # lists for labels and encoded reviews
    for (_label, _text) in batch:   # iterate over the samples in the batch
        label_list.append(_label)
        processed_text = torch.tensor(vocab(_text), dtype=torch.int64)   # map tokens to indices via vocab, then wrap in a tensor
        text_list.append(processed_text)
    label_list = torch.tensor(label_list)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)   # pad so every sequence in the batch matches the longest one
    return label_list.to(device), text_list.to(device)

# DataLoader only needs an object with __getitem__ and __len__; the NumPy array train_data.values provides both
train_dl = DataLoader(train_data.values, batch_size=64,
                      collate_fn=collate_batch,
                      shuffle=True)

test_dl = DataLoader(test_data.values, batch_size=64,
                      collate_fn=collate_batch)

label_batch, text_batch = next(iter(train_dl))   # next(iter(...)) returns one batch
label_batch
# tensor([1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
#         1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
#         0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0])

text_batch         # the reviews in the batch have been padded to the same length
# tensor([[1811, 1049,   96,  ...,    0,    0,    0],
#         [  63,  273,  209,  ...,    0,    0,    0],
#         [ 789, 3769,   31,  ...,    0,    0,    0],
#         ...,
#         [ 768,  100,    3,  ...,    0,    0,    0],
#         [ 223,  384,    3,  ...,    0,    0,    0],
#         [ 106,    5,   11,  ...,    0,    0,    0]])
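
pad_sequence fills the shorter sequences with 0, which here happens to be the index of '<pad>'. A quick check of the padded batch:

text_batch.shape   # (batch_size, length of the longest review in this batch)
vocab['<pad>']     # 0, the value pad_sequence used as filler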

The 1D convolutional model:
   1. Embedding                   # embed token indices into dense vectors
   2. Conv1d + MaxPool1d          # 1D convolution + max pooling
   3. Conv1d
   4. nn.AdaptiveAvgPool1d        # adaptive average pooling: maps variable-length sequences to a fixed length
                                  # (nn.AdaptiveAvgPool2d is the 2D counterpart, used for images)
   5. view()                      # flatten to two dimensions
   6. Linear()                    # output layer

vocab_size = len(vocab)
embeding_dim = 100

# input text shape: (batch, length)
# after the embedding layer: (batch, length, features=100)
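
A quick way to see those shapes, using a throwaway embedding layer on the batch fetched above (this layer is only for illustration, not the one inside the model):

em = nn.Embedding(vocab_size, embeding_dim)   # throwaway layer on the CPU
em(text_batch.cpu()).shape                    # -> (64, seq_len, 100)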

class CONV1D_Net(nn.Module):
    def __init__(self, vocab_size, embeding_dim):
        super(CONV1D_Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embeding_dim)
        self.conv1 = nn.Conv1d(embeding_dim, 64, kernel_size=7)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=7)
        self.avgpool = nn.AdaptiveAvgPool1d(output_size=5)  # batch*128*5
        self.fc1 = nn.Linear(128*5, 64)
        self.fc2 = nn.Linear(64, 2)
    def forward(self, x):
        x = self.em(x)                     # (batch, length) -> (batch, length, embeding_dim)
        x = x.permute(0, 2, 1)             # Conv1d expects (batch, channels, length)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.avgpool(x)                # -> (batch, 128, 5) regardless of input length
        x = x.view(-1, x.size(1)*x.size(2))   # flatten to (batch, 128*5)
        x = F.dropout(F.relu(self.fc1(x)), training=self.training)   # F.dropout defaults to training=True, so pass the module's mode explicitly
        x = self.fc2(x)
        return x

model = CONV1D_Net(vocab_size, embeding_dim).to(device)
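
Before training, a quick sanity check that one padded batch flows through the network and yields one logit per class (this reuses text_batch from above; the longest review in the batch needs roughly 20 or more tokens for the two size-7 convolutions, which is the case here):

with torch.no_grad():
    model(text_batch).shape    # torch.Size([64, 2])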

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(dataloader):
    total_acc, total_count, total_loss, = 0, 0, 0
    model.train()
    for label, text in dataloader:
        predicted_label = model(text)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss, = 0, 0, 0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count

def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ," 
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
              epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
    print("Done!")
    
    return train_loss, test_loss, train_acc, test_acc

EPOCHS = 10

train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, 
                                                 train_dl, 
                                                 test_dl)
# epoch: 0, train_loss: 0.55354, train_acc: 71.5% ,test_loss: 0.49842, test_acc: 79.2%
# epoch: 1, train_loss: 0.40823, train_acc: 83.9% ,test_loss: 0.36572, test_acc: 85.1%
# epoch: 2, train_loss: 0.31483, train_acc: 88.1% ,test_loss: 0.40902, test_acc: 85.1%
# epoch: 3, train_loss: 0.25132, train_acc: 90.8% ,test_loss: 0.39407, test_acc: 87.2%
# epoch: 4, train_loss: 0.19346, train_acc: 93.4% ,test_loss: 0.59312, test_acc: 85.9%
# epoch: 5, train_loss: 0.15583, train_acc: 94.8% ,test_loss: 0.41282, test_acc: 86.2%
# epoch: 6, train_loss: 0.11183, train_acc: 96.4% ,test_loss: 0.59546, test_acc: 86.1%
# epoch: 7, train_loss: 0.09305, train_acc: 97.3% ,test_loss: 0.73912, test_acc: 85.4%
# epoch: 8, train_loss: 0.06812, train_acc: 98.0% ,test_loss: 0.85146, test_acc: 84.7%
# epoch: 9, train_loss: 0.05540, train_acc: 98.3% ,test_loss: 0.82380, test_acc: 85.4%
# Done!
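
The training loss keeps falling while the test loss starts climbing after the first few epochs and the test accuracy plateaus around 85-87%, which points to overfitting. Plotting the curves returned by fit makes this easier to see (a sketch that assumes matplotlib is installed):

import matplotlib.pyplot as plt

plt.plot(range(EPOCHS), train_loss, label='train_loss')
plt.plot(range(EPOCHS), test_loss, label='test_loss')
plt.xlabel('epoch')
plt.legend()
plt.show()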
