最近在学习文本分类,记录一些使用TextCNN进行新闻真假分类的过程
import torch
from torch.utils.data import random_split
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# Load the labelled training set; later cells read its 'text' and 'label' columns.
dataset = pd.read_csv('/home/mw/input/news_data58668156/train_news.csv')
# Preview the first rows (notebook display only).
dataset.head()
# Count the samples per label to check the class balance.
dataset['label'].value_counts()
样本还是相对均衡的
REAL 2548
FAKE 2520
Name: label, dtype: int64
导入线上验证的测试集
# Load the unlabelled test set used for the online submission; its 'text' column is read later.
test_dataset = pd.read_csv('/home/mw/input/news_data58668156/test_news.csv')
# Preview the first rows (notebook display only).
test_dataset.head()
处理训练文本
import re
def handle_data(data):
    """Clean the training texts and encode their labels as integers.

    Newlines and the punctuation characters listed in the pattern below are
    deleted from every entry of ``data['text']`` (nothing is inserted in their
    place). ``data['label']`` is mapped ``'REAL' -> 0`` and ``'FAKE' -> 1``.

    :param data: mapping-like object (e.g. a pandas DataFrame) exposing the
        iterable columns ``'text'`` and ``'label'``.
    :return: ``(texts, labels)`` - two lists of equal length.
    :raises ValueError: if a label is neither ``'REAL'`` nor ``'FAKE'``.
        Skipping such a row would leave the two lists misaligned.
    """
    # Compiled once instead of being rebuilt for every row.
    punctuation = re.compile('[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n。!,]+')
    label_ids = {'REAL': 0, 'FAKE': 1}

    tv_data = []
    label = []
    # Iterate over the values directly: positional ``X[i]`` is a *label*
    # lookup on a pandas Series and fails when the index is not 0..n-1.
    for text, raw_label in zip(data['text'], data['label']):
        if raw_label not in label_ids:
            raise ValueError('unexpected label: {!r}'.format(raw_label))
        cleaned = punctuation.sub('', text.replace('\n', ''))
        tv_data.append(cleaned)
        label.append(label_ids[raw_label])
    return tv_data, label
# Cleaned training texts and their integer labels (0 = REAL, 1 = FAKE).
X_data,label = handle_data(dataset)
处理线上验证文本
def handle_test_data(data):
    """Clean the texts of the unlabelled test set.

    Applies the same cleaning as the training texts: newlines and the
    punctuation characters listed in the pattern below are deleted from every
    entry of ``data['text']``.

    :param data: mapping-like object (e.g. a pandas DataFrame) exposing an
        iterable ``'text'`` column.
    :return: list with one cleaned string per input text, in input order.
    """
    # Compiled once instead of being rebuilt for every row.
    punctuation = re.compile('[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n。!,]+')
    # Iterate over the values directly: positional ``X[i]`` is a *label*
    # lookup on a pandas Series and fails when the index is not 0..n-1.
    return [punctuation.sub('', text.replace('\n', '')) for text in data['text']]
# Cleaned texts of the submission test set, in file order.
test_text = handle_test_data(test_dataset)
生成字典
# Build the vocabulary from the whitespace-separated tokens of both the
# training and the test texts, so every test token has an id.
word_list = " ".join(X_data+test_text).split()
# Sort the unique tokens: a bare ``set`` has no stable order, so the ids (and
# therefore any saved checkpoint) would not be reproducible between runs.
word_list = sorted(set(word_list))
# Ids start at 1; 0 is left free for the padding that ``transform`` appends.
# The embedding table is created with ``vocab_size + 1`` rows, so the largest
# id (``len(word_list)``) is still in range.
word_dict = {w: i for i, w in enumerate(word_list, start=1)}
def transform(sentence, max_len=512, pad_value=0):
    """Truncate or right-pad a token-id list to exactly ``max_len`` items.

    :param sentence: list of token ids; it is not modified.
    :param max_len: target length of the returned list.
    :param pad_value: id appended when the sentence is shorter than
        ``max_len`` (defaults to 0, the value used previously).
    :return: a new list of length ``max_len``.
    """
    if len(sentence) > max_len:
        # Too long: keep only the first ``max_len`` ids.
        return sentence[:max_len]
    # Too short (or exact fit): append padding up to the target length.
    return sentence + [pad_value] * (max_len - len(sentence))
# Encode each training sentence: look up the id of every whitespace-separated
# token, fix the length with ``transform`` and store the result as an ndarray.
x_input = [
    np.asarray(transform([word_dict[token] for token in sentence.split()]))
    for sentence in X_data
]
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
# Convert the encoded sentences and labels to int64 tensors. The per-sentence
# arrays are stacked into one ndarray first, because building a tensor
# straight from a list of ndarrays is very slow.
train_inputs = torch.as_tensor(np.asarray(x_input), dtype=torch.long)
train_labels = torch.as_tensor(label, dtype=torch.long)
# Pair inputs with labels (note: this rebinds ``dataset``, which held the raw DataFrame).
dataset = TensorDataset(train_inputs, train_labels)
# 80 / 20 train / validation split with a fixed seed so the split is repeatable.
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset=dataset,
    lengths=[train_size, val_size],
    generator=torch.Generator().manual_seed(2022),
)
# Batch loaders; only the training data is reshuffled every epoch, so the
# validation batches (and the per-batch averaged loss) stay the same between epochs.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, drop_last=False)
class MyModel(nn.Module):
    """TextCNN classifier: embedding -> parallel n-gram convolutions -> max-pool -> linear."""

    def __init__(self, vocab_size, embedding_dim, num_filter,
                 filter_sizes, output_dim, dropout=0.5, pad_idx=0):
        """
        :param vocab_size: number of rows in the embedding table.
        :param embedding_dim: size of each word vector.
        :param num_filter: output channels of every convolution.
        :param filter_sizes: window heights, i.e. how many consecutive tokens
            each convolution looks at (the "n" of an n-gram); one convolution
            is created per entry.
        :param output_dim: number of output classes.
        :param dropout: dropout probability, applied after the embedding and
            after the concatenated pooled features.
        :param pad_idx: id of the padding token; its embedding row is kept at
            zero and receives no gradient.
        """
        super().__init__()
        # ``pad_idx`` used to be accepted but ignored; it is now forwarded so
        # padded positions do not contribute a learned vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # in_channels is 1 because the text is treated as a single-channel
        # "image" of shape [sent len, emb dim]; each kernel spans the full
        # embedding width.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filter,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filter, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits of shape [batch size, output_dim] for a batch of token ids.

        :param text: integer tensor of shape [batch size, sent len].
        """
        embedded = self.dropout(self.embedding(text))  # [batch size, sent len, emb dim]
        # Add the channel dimension expected by nn.Conv2d.
        embedded = embedded.unsqueeze(1)  # [batch size, 1, sent len, emb dim]
        # One feature map per filter size:
        # [batch size, num_filter, sent len - filter size + 1]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole sequence axis -> [batch size, num_filter]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        # Concatenate the features of all filter sizes:
        # [batch size, num_filter * len(filter_sizes)]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)
vocab_size = len(word_dict) # number of distinct tokens in the vocabulary
dmodel = 128 # dimension of the word embeddings
num_filter = 10 # number of convolution kernels per filter size
filter_size = [2, 3, 4] # kernel heights (tokens per window); three sizes are used
output_dim = 2 # number of classes
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
# The embedding table gets one row more than the vocabulary size.
model = MyModel(vocab_size+1, dmodel, num_filter=num_filter, filter_sizes=filter_size, output_dim=output_dim).to(device)
# Training loop
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and report its metrics.

    Batches are moved to the module-level ``device`` before the forward pass.

    :param dataloader: iterable of ``(inputs, targets)`` batches that also
        exposes ``.dataset``.
    :param model: network being trained.
    :param loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar loss.
    :param optimizer: optimizer stepped once per batch.
    :return: ``(accuracy, mean_loss)`` - accuracy over all samples and the
        loss averaged over the number of batches.
    """
    sample_count = len(dataloader.dataset)  # number of training samples
    batch_count = len(dataloader)  # number of batches
    correct_total = 0
    loss_total = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss against the true labels.
        logits = model(inputs)
        batch_loss = loss_fn(logits, targets)

        # Backward pass: clear old gradients, back-propagate, update weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Accumulate the number of correct predictions and the batch loss.
        correct_total += (logits.argmax(1) == targets).type(torch.float).sum().item()
        loss_total += batch_loss.item()

    return correct_total / sample_count, loss_total / batch_count
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Batches are moved to the module-level ``device`` before the forward pass.
    The function does not switch the model to evaluation mode itself.

    :param dataloader: iterable of ``(texts, targets)`` batches that also
        exposes ``.dataset``.
    :param model: network being evaluated.
    :param loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar loss.
    :return: ``(accuracy, mean_loss)`` - accuracy over all samples and the
        loss averaged over the number of batches.
    """
    sample_count = len(dataloader.dataset)  # number of evaluation samples
    batch_count = len(dataloader)  # number of batches
    loss_total = 0
    correct_total = 0

    # Gradients are not needed for evaluation; disabling them saves memory.
    with torch.no_grad():
        for batch_texts, batch_targets in dataloader:
            batch_texts = batch_texts.to(device)
            batch_targets = batch_targets.to(device)

            logits = model(batch_texts)
            loss_total += loss_fn(logits, batch_targets).item()
            correct_total += (logits.argmax(1) == batch_targets).type(torch.float).sum().item()

    return correct_total / sample_count, loss_total / batch_count
learn_rate = 1e-2  # initial learning rate
# Multiply the base learning rate by 0.90 once every two epochs.
lambda1 = lambda epoch: 0.90 ** (epoch // 2)
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)
import copy
loss_fn = nn.CrossEntropyLoss()  # classification loss
epochs = 20
# Per-epoch history, plotted further below.
train_loss = []
train_acc = []
test_loss = []
test_acc = []
best_acc = 0  # best validation accuracy seen so far; selects the best model
best_model = None  # snapshot of the model at ``best_acc``
for epoch in range(epochs):
    model.train()
    epoch_train_acc, epoch_train_loss = train(train_loader, model, loss_fn, optimizer)
    scheduler.step()  # advance the learning-rate schedule once per epoch
    model.eval()
    epoch_test_acc, epoch_test_loss = test(val_loader, model, loss_fn)
    # Keep a snapshot of the model whenever the validation accuracy improves.
    if epoch_test_acc > best_acc:
        best_acc = epoch_test_acc
        best_model = copy.deepcopy(model)
    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    test_acc.append(epoch_test_acc)
    test_loss.append(epoch_test_loss)
    # Current learning rate (already updated by the scheduler for the next epoch).
    lr = optimizer.state_dict()['param_groups'][0]['lr']
    template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%, Test_loss:{:.3f}, Lr:{:.2E}')
    print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss,
                          epoch_test_acc*100, epoch_test_loss, lr))
# If the validation accuracy never rose above 0, fall back to the final model.
if best_model is None:
    best_model = model
# Save the best model's weights to disk. Previously the weights of the
# last-epoch ``model`` were written, although ``best_model`` was tracked.
PATH = './best_model.pth'  # checkpoint file name
torch.save(best_model.state_dict(), PATH)
print('Done')
Epoch: 1, Train_acc:53.3%, Train_loss:0.879, Test_acc:55.5%, Test_loss:0.667, Lr:1.00E-02
Epoch: 2, Train_acc:55.6%, Train_loss:0.708, Test_acc:67.3%, Test_loss:0.654, Lr:9.00E-03
Epoch: 3, Train_acc:60.0%, Train_loss:0.669, Test_acc:80.6%, Test_loss:0.498, Lr:9.00E-03
Epoch: 4, Train_acc:68.2%, Train_loss:0.587, Test_acc:79.1%, Test_loss:0.412, Lr:8.10E-03
Epoch: 5, Train_acc:74.7%, Train_loss:0.499, Test_acc:85.8%, Test_loss:0.341, Lr:8.10E-03
Epoch: 6, Train_acc:81.0%, Train_loss:0.418, Test_acc:90.4%, Test_loss:0.248, Lr:7.29E-03
Epoch: 7, Train_acc:85.5%, Train_loss:0.323, Test_acc:92.1%, Test_loss:0.200, Lr:7.29E-03
Epoch: 8, Train_acc:88.6%, Train_loss:0.280, Test_acc:91.3%, Test_loss:0.216, Lr:6.56E-03
Epoch: 9, Train_acc:91.3%, Train_loss:0.218, Test_acc:93.3%, Test_loss:0.172, Lr:6.56E-03
Epoch:10, Train_acc:93.6%, Train_loss:0.173, Test_acc:93.4%, Test_loss:0.173, Lr:5.90E-03
Epoch:11, Train_acc:94.6%, Train_loss:0.149, Test_acc:94.0%, Test_loss:0.160, Lr:5.90E-03
Epoch:12, Train_acc:95.0%, Train_loss:0.132, Test_acc:94.0%, Test_loss:0.161, Lr:5.31E-03
Epoch:13, Train_acc:96.4%, Train_loss:0.101, Test_acc:94.3%, Test_loss:0.164, Lr:5.31E-03
Epoch:14, Train_acc:96.6%, Train_loss:0.094, Test_acc:94.5%, Test_loss:0.162, Lr:4.78E-03
Epoch:15, Train_acc:97.4%, Train_loss:0.081, Test_acc:94.1%, Test_loss:0.169, Lr:4.78E-03
Epoch:16, Train_acc:97.9%, Train_loss:0.061, Test_acc:94.4%, Test_loss:0.179, Lr:4.30E-03
Epoch:17, Train_acc:97.4%, Train_loss:0.076, Test_acc:93.8%, Test_loss:0.178, Lr:4.30E-03
Epoch:18, Train_acc:98.0%, Train_loss:0.056, Test_acc:93.6%, Test_loss:0.187, Lr:3.87E-03
Epoch:19, Train_acc:98.0%, Train_loss:0.070, Test_acc:93.6%, Test_loss:0.183, Lr:3.87E-03
Epoch:20, Train_acc:98.6%, Train_loss:0.046, Test_acc:93.5%, Test_loss:0.188, Lr:3.49E-03
Done
import matplotlib.pyplot as plt
# Hide warnings
import warnings
warnings.filterwarnings("ignore") # ignore all warning messages
plt.rcParams['font.sans-serif'] = ['SimHei'] # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly
plt.rcParams['figure.dpi'] = 100 # figure resolution
epochs_range = range(epochs)
plt.figure(figsize=(12, 3))
# Left panel: accuracy per epoch (the "Test" curves hold the values measured on val_loader).
plt.subplot(1, 2, 1)
plt.plot(epochs_range, train_acc, label='Training Accuracy')
plt.plot(epochs_range, test_acc, label='Test Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')
# Right panel: loss per epoch.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, test_loss, label='Test Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
# Restore the saved weights and switch to evaluation mode so dropout is
# disabled regardless of the mode the model was left in.
PATH = './best_model.pth'
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()
# Encode the test sentences the same way as the training sentences.
test_x = [np.asarray(transform([word_dict[n] for n in sen.split()])) for sen in test_text]
test_inputs = torch.as_tensor(np.asarray(test_x), dtype=torch.long)
# Predict in mini-batches and without gradient tracking, so the whole test set
# never has to fit on the device in a single forward pass.
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(test_inputs), 64):
        batch = test_inputs[start:start + 64].to(device)
        batch_predictions.append(model(batch).argmax(dim=1, keepdim=True).cpu())
# ``predict`` keeps its former shape [n_samples, 1].
if batch_predictions:
    predict = torch.cat(batch_predictions)
else:
    predict = torch.empty((0, 1), dtype=torch.long)
# Class 0 is REAL, anything else FAKE (inverse of the training label encoding).
res = ['REAL' if class_id == 0 else 'FAKE' for class_id in predict.squeeze(1).tolist()]
answer = res
# Submission file: a running string id and the predicted label per test row.
dic={"id":[str(i) for i in range(len(res))],"answer":answer}
df=pd.DataFrame(dic)
df.to_csv('answer6.csv',index=False, encoding='utf-8-sig')
df