Dataset: user reviews collected from a food-delivery platform, with 4,000 positive and roughly 8,000 negative samples
## Field description
label: 1 = positive review, 0 = negative review; review: the review text
Install jieba and pandas:
pip install jieba pandas -i https://pypi.doubanio.com/simple
import torch
# import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
from torch.utils.data import DataLoader
data = pd.read_csv('waimai_10k.csv')
data.head()  # label: 1 = positive review, 0 = negative review
# label review
# 0 1 很快,好吃,味道足,量大
# 1 1 没有送水没有送水没有送水
# 2 1 非常快,态度好。
# 3 1 方便,快捷,味道可口,快递给力
# 4 1 菜味道很棒!送餐很及时!
data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 11987 entries, 0 to 11986
# Data columns (total 2 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 label 11987 non-null int64
# 1 review 11987 non-null object
# dtypes: int64(1), object(1)
# memory usage: 187.4+ KB
data.label.value_counts()  # count the positive/negative reviews
# 0 7987
# 1 4000
# Name: label, dtype: int64
# The dataset is imbalanced: the two classes have different numbers of reviews. One option is to downsample the negative reviews so both classes have 4,000 samples, or to oversample the positive reviews (a sketch follows).
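If you want a balanced training set, downsampling the negative class is the simpler route; a minimal sketch (not applied in the rest of this walkthrough; random_state=42 is an arbitrary choice for reproducibility):
neg = data[data.label == 0].sample(n=4000, random_state=42)  # draw 4,000 negative reviews
pos = data[data.label == 1]                                   # keep all 4,000 positive reviews
balanced = pd.concat([pos, neg]).sample(frac=1, random_state=42)  # concatenate and shuffle
balanced.label.value_counts()  # both classes now have 4,000 rows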
jieba.lcut('这是日月光华在网易云课堂的课程')  # Chinese word segmentation
# ['这是', '日月', '光华', '在', '网易', '云', '课堂', '的', '课程']
def pre_text(text):
    text = text.replace(',', '').replace('!', '')  # strip full-width commas and exclamation marks
    return jieba.lcut(text)  # return the list of tokens
data['review'] = data.review.apply(pre_text)  # apply pre_text to every review
data['review']
# 0 [很快, 好吃, 味道, 足量, 大]
# 1 [没有, 送水, 没有, 送水, 没有, 送水]
# 2 [非常, 快, 态度, 好, 。]
# 3 [方便快捷, 味道, 可口, 快, 递给, 力]
# 4 [菜, 味道, 很棒, 送餐, 很, 及时]
# ...
# 11982 [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
# 11983 [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
# 11984 [凉皮, 太辣, ,, 吃不下, 都]
# 11985 [本来, 迟到, 了, 还, 自己, 点]
# 11986 [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 。, 凉面, 没, 想象, ...
# Name: review, Length: 11987, dtype: object
Text processing involves two steps:
1. Tokenize and build a vocabulary
2. Embedding (see the brief example after this list)
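Step 2 maps every token index to a dense vector. A tiny illustration with arbitrary sizes (the model below uses nn.Embedding for exactly this):
em_demo = nn.Embedding(num_embeddings=10, embedding_dim=4)  # a vocabulary of 10 tokens, 4-dimensional vectors
em_demo(torch.tensor([[1, 3, 5]])).shape  # torch.Size([1, 3, 4]) -> (batch, length, features)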
# Import the vocabulary-building tool (it maps tokens to indices, e.g. 日月: 1, 光华: 2, 吃饭: 3, 调料: 4)
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(data):
    for text in data:
        yield text  # yield each tokenized review (generator)
# Build the vocabulary, encoding every token as an integer index.
vocab = build_vocab_from_iterator(yield_tokens(data['review']),
                                  specials=['<pad>', '<unk>'],  # special tokens: '<pad>' (index 0) for padding, '<unk>' (index 1) for unknown words
                                  min_freq=2)  # drop tokens that occur fewer than 2 times
vocab.set_default_index(vocab['<unk>'])  # unknown tokens fall back to the '<unk>' index
vocab['调料']
# 965
vocab(['很快', '好吃', '味道', '足量', '大'])  # the input must be a list of strings
# [56, 15, 14, 5229, 114]
vocab['<unk>']  # the default index
# 1
vocab['山峰']  # a rare word that never appears in the reviews maps to '<unk>' (index 1)
# 1
i = int(len(data)*0.8)  # use 80% of the data for training
train_data = data.sample(i)  # draw a random sample of i rows with the sample method
train_data.head()
# label review
# 5365 0 [两个, 小时, 才, 送到, 慢]
# 11666 0 [第一次, 见, 服务态度, 这么, 差, 的, 点, 了, 一份, 套餐, 和, 一根, ...
# 766 1 [很, 是, 不错, 的, 送餐, 体验, 。]
# 3570 1 [不错, 不错, 胃, 不, 舒服, 才, 点, 的, 粥, 清爽, 料足, 。, 外卖, ...
# 11969 0 [谢谢, 速度, 很快, 辛苦, 了]
len(train_data)
# 9589
# take the remaining rows (selected by index with iloc) as the test set
test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]
test_data.head()
# label review
# 0 1 [很快, 好吃, 味道, 足量, 大]
# 7 1 [超级, 快, 就, 送到, 了, 这么, 冷, 的, 天气, 骑士, 们, 辛苦, 了, ...
# 17 1 [好吃, 速度, 包装, 也, 有, 品质, 不, 出, 家门, 就, 能, 吃, 到, 餐...
# 18 1 [味道, 好极, 啦, 送餐, 很快, 师傅, 辛苦, 啦]
# 21 1 [送货, 速度, 很快, 一直, 定, 这家, 赞]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data.values
# array([[0, list(['两个', '小时', '才', '送到', '慢'])],
# [0,
# list(['第一次', '见', '服务态度', '这么', '差', '的', '点', '了', '一份', '套餐', '和', '一根', '烤肠', '烤肠', '居然', '直接', '就', '没有', '送过来', '送到', '的', '时候', '饭菜', '都', '凉', '了', '也', '是', '我', '吃', '过', '的', '最', '难吃', '的', '梅菜', '扣肉', '再也', '不订', '他家', '的', '了'])],
# [1, list(['很', '是', '不错', '的', '送餐', '体验', '。'])],
# ...,
# [0, list(['送餐', '服务', '有待', '提高', '味道', '不用说', '晚', '了', '半小时'])],
# [0,
# list(['岂', '一个', '慢字', '了', '得', '而且', '明明', '是', '自己', '送', '的', '晚', '还', '赖', '人家', '送', '外卖', '的', '小哥', '速度慢', '真是', '醉', '死', '了'])],
# [1,
# list(['一盒', '撒', '了', ',', '小哥', '又', '去', '排队', '拿', '了', '一盒', ',', '棒棒', '哒'])]],
# dtype=object)
train_data.values[0]
# array([0, list(['两个', '小时', '才', '送到', '慢'])], dtype=object)
# Collate function for one batch of (label, tokenized review) pairs
def collate_batch(batch):
    label_list, text_list = [], []  # lists for labels and encoded reviews
    for (_label, _text) in batch:  # iterate over the samples in the batch
        label_list.append(_label)
        processed_text = torch.tensor(vocab(_text), dtype=torch.int64)  # map tokens to indices with vocab, then convert to a tensor
        text_list.append(processed_text)
    label_list = torch.tensor(label_list)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)  # pad every sequence to the length of the longest one in the batch
    return label_list.to(device), text_list.to(device)
# DataLoader only requires a dataset that implements __getitem__ and __len__, so the numpy array from .values works directly (an explicit Dataset sketch follows the loaders)
train_dl = DataLoader(train_data.values, batch_size=64,
collate_fn=collate_batch,
shuffle=True)
test_dl = DataLoader(test_data.values, batch_size=64,
collate_fn=collate_batch)
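For reference, the same data could be wrapped in an explicit Dataset; a minimal sketch (the class name ReviewDataset is hypothetical, and it is not used below since the numpy array already satisfies the protocol):
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, df):
        self.labels = df.label.tolist()   # integer labels
        self.texts = df.review.tolist()   # lists of tokens
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.labels[idx], self.texts[idx]  # the same (label, token list) pairs as train_data.values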
label_batch, text_batch = next(iter(train_dl))  # next() returns one batch
label_batch
# tensor([1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
# 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
# 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0])
text_batch  # the texts have been padded to the same length
# tensor([[1811, 1049, 96, ..., 0, 0, 0],
# [ 63, 273, 209, ..., 0, 0, 0],
# [ 789, 3769, 31, ..., 0, 0, 0],
# ...,
# [ 768, 100, 3, ..., 0, 0, 0],
# [ 223, 384, 3, ..., 0, 0, 0],
# [ 106, 5, 11, ..., 0, 0, 0]])
# The 1D-convolution model:
1. Embedding             # embed the token indices into dense vectors
2. Conv1d + MaxPool1d    # 1D convolution + max pooling
3. Conv1d
4. nn.AdaptiveAvgPool1d  # adaptive average pooling: maps sequences of different lengths to one fixed length
                         # (nn.AdaptiveAvgPool2d does the same for 2D images; see the sketch after this list)
5. view()                # flatten to 2 dimensions
6. Linear()              # output layer
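A quick illustration of how nn.AdaptiveAvgPool1d maps inputs of different lengths to the same output length (the tensor sizes here are arbitrary):
pool_demo = nn.AdaptiveAvgPool1d(output_size=5)
pool_demo(torch.randn(1, 128, 37)).shape   # torch.Size([1, 128, 5])
pool_demo(torch.randn(1, 128, 112)).shape  # torch.Size([1, 128, 5])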
vocab_size = len(vocab)
embedding_dim = 100
# input text shape: (batch, length)
# after the embedding layer: (batch, length, features=100)
class CONV1D_Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CONV1D_Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(embedding_dim, 64, kernel_size=7)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=7)
        self.avgpool = nn.AdaptiveAvgPool1d(output_size=5)  # output shape: (batch, 128, 5)
        self.fc1 = nn.Linear(128*5, 64)
        self.fc2 = nn.Linear(64, 2)
    def forward(self, x):
        x = self.em(x)                        # (batch, length) -> (batch, length, embedding_dim)
        x = x.permute(0, 2, 1)                # Conv1d expects (batch, channels, length)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.avgpool(x)                   # (batch, 128, 5)
        x = x.view(-1, x.size(1)*x.size(2))   # flatten to (batch, 128*5)
        x = F.dropout(F.relu(self.fc1(x)), training=self.training)  # only apply dropout in training mode
        x = self.fc2(x)
        return x
model = CONV1D_Net(vocab_size, embedding_dim).to(device)
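A quick sanity check of the shape flow with a dummy batch (the batch size of 4 and sequence length of 30 are arbitrary):
dummy = torch.randint(0, vocab_size, (4, 30)).to(device)  # 4 sequences of 30 token indices
model(dummy).shape  # torch.Size([4, 2]) -> one logit per class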
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
def train(dataloader):
total_acc, total_count, total_loss, = 0, 0, 0
model.train()
for label, text in dataloader:
predicted_label = model(text)
loss = loss_fn(predicted_label, label)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
with torch.no_grad():
total_acc += (predicted_label.argmax(1) == label).sum().item()
total_count += label.size(0)
total_loss += loss.item()*label.size(0)
return total_loss/total_count, total_acc/total_count
def test(dataloader):
model.eval()
total_acc, total_count, total_loss, = 0, 0, 0
with torch.no_grad():
for label, text in dataloader:
predicted_label = model(text)
loss = loss_fn(predicted_label, label)
total_acc += (predicted_label.argmax(1) == label).sum().item()
total_count += label.size(0)
total_loss += loss.item()*label.size(0)
return total_loss/total_count, total_acc/total_count
def fit(epochs, train_dl, test_dl):
train_loss = []
train_acc = []
test_loss = []
test_acc = []
for epoch in range(epochs):
epoch_loss, epoch_acc = train(train_dl)
epoch_test_loss, epoch_test_acc = test(test_dl)
train_loss.append(epoch_loss)
train_acc.append(epoch_acc)
test_loss.append(epoch_test_loss)
test_acc.append(epoch_test_acc)
template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ,"
"test_loss: {:.5f}, test_acc: {:.1f}%")
print(template.format(
epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
print("Done!")
return train_loss, test_loss, train_acc, test_acc
EPOCHS = 10
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS,
train_dl,
test_dl)
# epoch: 0, train_loss: 0.55354, train_acc: 71.5% ,test_loss: 0.49842, test_acc: 79.2%
# epoch: 1, train_loss: 0.40823, train_acc: 83.9% ,test_loss: 0.36572, test_acc: 85.1%
# epoch: 2, train_loss: 0.31483, train_acc: 88.1% ,test_loss: 0.40902, test_acc: 85.1%
# epoch: 3, train_loss: 0.25132, train_acc: 90.8% ,test_loss: 0.39407, test_acc: 87.2%
# epoch: 4, train_loss: 0.19346, train_acc: 93.4% ,test_loss: 0.59312, test_acc: 85.9%
# epoch: 5, train_loss: 0.15583, train_acc: 94.8% ,test_loss: 0.41282, test_acc: 86.2%
# epoch: 6, train_loss: 0.11183, train_acc: 96.4% ,test_loss: 0.59546, test_acc: 86.1%
# epoch: 7, train_loss: 0.09305, train_acc: 97.3% ,test_loss: 0.73912, test_acc: 85.4%
# epoch: 8, train_loss: 0.06812, train_acc: 98.0% ,test_loss: 0.85146, test_acc: 84.7%
# epoch: 9, train_loss: 0.05540, train_acc: 98.3% ,test_loss: 0.82380, test_acc: 85.4%
# Done!
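The test loss bottoms out within the first few epochs and then climbs while training accuracy keeps rising, which points to overfitting. Plotting the curves returned by fit makes this easy to see; a minimal sketch, assuming matplotlib is installed:
import matplotlib.pyplot as plt
epochs_range = range(1, EPOCHS + 1)
plt.plot(epochs_range, train_loss, label='train_loss')
plt.plot(epochs_range, test_loss, label='test_loss')
plt.plot(epochs_range, train_acc, label='train_acc')
plt.plot(epochs_range, test_acc, label='test_acc')
plt.xlabel('epoch')
plt.legend()
plt.show()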