Task link
This task is quite interesting and related to my day-to-day work, so I'm working through it as a study check-in.
Coding for Tasks 1, 2, 3, 5 and 6 is done. I only have a 2080 at hand right now; I'll rerun things on a 3090 later.
Things have been busy recently, so the notes and Tasks 4 and 7 will be added later.
%pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
import pandas as pd
def load_lcqmc():
    '''LCQMC text-matching dataset'''
train = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.train.data.zip',
sep='\t', names=['query1', 'query2', 'label'])
valid = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.valid.data.zip',
sep='\t', names=['query1', 'query2', 'label'])
test = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.test.data.zip',
sep='\t', names=['query1', 'query2', 'label'])
return train, valid, test
train, valid, test = load_lcqmc()
train
 | query1 | query2 | label |
---|---|---|---|
0 | 喜欢打篮球的男生喜欢什么样的女生 | 爱打篮球的男生喜欢什么样的女生 | 1 |
1 | 我手机丢了,我想换个手机 | 我想买个新手机,求推荐 | 1 |
2 | 大家觉得她好看吗 | 大家觉得跑男好看吗? | 0 |
3 | 求秋色之空漫画全集 | 求秋色之空全集漫画 | 1 |
4 | 晚上睡觉带着耳机听音乐有什么害处吗? | 孕妇可以戴耳机听音乐吗? | 0 |
... | ... | ... | ... |
238761 | 女孩子说我是你的汤是什么意思 | 男孩给女孩说你的眼是海什么意思 | 0 |
238762 | 求重生之老公请接招全文 | 求重生之老公请接招>全文 | 1 |
238763 | 求小说电子书, | 求《甄嬛》小说电子书! | 0 |
238764 | 杭州有什么好玩的地方? | 杭州有什么好玩的地方求推 | 1 |
238765 | 我想做卫生巾代理,哪里有 | 我想做淘宝代理去那找,怎么做 | 0 |
238766 rows × 3 columns
%pip install jieba -i https://pypi.tuna.tsinghua.edu.cn/simple
def diff_len(query1, query2):
return abs(len(query1)- len(query2))
all_data = pd.concat([train, valid, test])
all_data["query1_len"] = all_data["query1"].apply(lambda x: len(x))
all_data["query2_len"] = all_data["query2"].apply(lambda x: len(x))
df1 = pd.DataFrame(pd.value_counts(pd.cut(all_data["query1_len"].tolist(),
                                          [0, 10, 20, 30, 40, 50, 60])),
                   columns=["query1 length distribution"])
df2 = pd.DataFrame(pd.value_counts(pd.cut(all_data["query2_len"].tolist(),
                                          [0, 10, 20, 30, 40, 50, 60])),
                   columns=["query2 length distribution"])
df1.plot(kind="bar")
df2.plot(kind="bar")
Here I look at the length distribution of query1 and query2. For Task 5, a truncation length of 30 covers essentially all of the query texts.
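To sanity-check the choice of 30 as the truncation length, a quick coverage calculation over the length columns built above (a minimal sketch; the printed percentages depend on your run):

# Fraction of queries that fit entirely within a 30-character window.
max_len = 30
cov1 = (all_data["query1_len"] <= max_len).mean()
cov2 = (all_data["query2_len"] <= max_len).mean()
print("query1 coverage at len <= %d: %.4f" % (max_len, cov1))
print("query2 coverage at len <= %d: %.4f" % (max_len, cov2))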
all_data["query_len_dif"] = all_data.apply(lambda x:
diff_len(x["query1"], x["query2"]), axis=1)
true_data = all_data[all_data["label"]==1]
false_data = all_data[all_data["label"]==0]
true_query_len_ave = sum(true_data["query_len_dif"])/len(true_data)
false_query_len_ave = sum(false_data["query_len_dif"])/len(false_data)
print("正确样本query字数差平均值:%.8f \n错误样本query字数差平均值:%.8f" %(true_query_len_ave, false_query_len_ave))
print("正确样本query长度差比错误样本query长度差小 %.2f%%" % \
((false_query_len_ave-true_query_len_ave)/true_query_len_ave*100))
Mean query length difference for positive pairs: 1.48703309
Mean query length difference for negative pairs: 2.57498060
The negative pairs' mean length difference is 73.16% larger than the positive pairs'
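The 73.16% figure is the relative increase measured against the positive pairs' average:

$$\frac{2.57498060 - 1.48703309}{1.48703309} \approx 0.7316$$

So matching pairs tend to have noticeably closer lengths, which suggests the length difference itself carries some signal for the label.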
import jieba
def static_words_num(query1, query2):
words = []
words.extend(jieba.lcut(query1))
words.extend(jieba.lcut(query2))
return len(words)
def static_token_num(query1, query2):
return len(query1) + len(query2)
all_data["words_num"] = all_data.apply(lambda x: static_words_num(x["query1"], x["query2"]), axis=1)
all_data["token_num"] = all_data.apply(lambda x: static_token_num(x["query1"], x["query2"]), axis=1)
print("赛题query文本中所有单词个数:%d 所有字符个数:%d " % (sum(all_data["words_num"]), sum(all_data["token_num"])))
Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.565 seconds.
Prefix dict has been built successfully.
Total number of words across all query texts: 3400570  Total number of characters: 5686532
# Text statistics for query1 and query2
# text lengths of query1 and query2
get_query_len = lambda x: len(x)
all_data["query1_len"] = all_data.apply(lambda x: get_query_len(x["query1"]), axis=1)
all_data["query2_len"] = all_data.apply(lambda x: get_query_len(x["query2"]), axis=1)
# number of words in query1 and query2
import jieba
def get_word_num(query):
return len(jieba.lcut(query))
all_data["query1_word_num"] = all_data.apply(lambda x: get_word_num(x["query1"]), axis=1)
all_data["query2_word_num"] = all_data.apply(lambda x: get_word_num(x["query2"]), axis=1)
Since I go straight to DL models later on, I'm skipping hand-crafted features here (lazy).
%pip install gensim -i https://pypi.tuna.tsinghua.edu.cn/simple
%pip install tqdm -i https://pypi.tuna.tsinghua.edu.cn/simple
import jieba
import os
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim import corpora
def load_stopwords(path=None):
if path is not None and os.path.exists(path):
files = os.listdir(path)
text_set = set()
for file in files:
if ".txt" not in file: continue
with open(os.path.join(path, file), 'r', encoding="UTF-8") as f:
for line in f.readlines():
text_set.add(line[:-1])
print("Load [%d] nums stopwords successfully!" % len(text_set))
return list(text_set)
stop_words = load_stopwords("./stop_words/")
def get_cut_words(sen):
words = []
for word in jieba.lcut(sen):
if word not in stop_words:
words.append(word)
return words
# get_cut_words = lambda x: jieba.lcut(x)
all_data["query1_words"] = all_data.apply(lambda x:
get_cut_words(x["query1"]), axis=1)
all_data["query2_words"] = all_data.apply(lambda x:
get_cut_words(x["query2"]), axis=1)
context = []
all_words = []
for i in range(len(all_data)):
row = all_data.iloc[i]
context.append(row["query1_words"])
context.append(row["query2_words"])
all_words.extend(list(set(row["query1_words"])))
all_words.extend(list(set(row["query2_words"])))
wv_model = Word2Vec(sentences=context, vector_size=100, window=5, min_count=1, workers=4)
wv_model.train(context, total_examples=wv_model.corpus_count, epochs=1)  # one extra training pass over the full corpus
Load [746] nums stopwords successfully!
(1803946, 1896899)
# Compute IDF weights
import math
from collections import Counter
from tqdm import tqdm
doc_num = len(all_data) * 2
def get_idf(all_words):
idf = {}
count = Counter(all_words)
for word, cnt in tqdm(dict(count).items()):
idf[word] = math.log(doc_num/(cnt))
return idf
idf = get_idf(all_words)
100%|██████████| 39937/39937 [00:00<00:00, 2015132.86it/s]
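The weight computed above is the usual IDF. Because all_words keeps at most one occurrence of each word per query (the per-row set() above), cnt is effectively the document frequency, with each query treated as one document:

$$\mathrm{idf}(w) = \log\frac{N}{\mathrm{df}(w)}, \qquad N = 2 \times \text{number of query pairs}$$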
import numpy as np
from sklearn.decomposition import TruncatedSVD
# Compute SIF sentence embeddings; see https://github.com/PrincetonML/SIF/blob/master/src/SIF_embedding.py
def compute_pc(X,npc=1):
svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
svd.fit(X)
return svd.components_
def remove_pc(X, npc=1):
pc = compute_pc(X, npc)
if npc==1:
XX = X - X.dot(pc.transpose()) * pc
else:
XX = X - X.dot(pc.transpose()).dot(pc)
return XX
def sif_weight(count, a=3e-5):
word_num = 0
for k, v in dict(count).items():
word_num += v
sif = {}
for k, v in dict(count).items():
sif[k] = a / (a + v/word_num)
return sif
count = Counter(all_words)
sif = sif_weight(count)
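For reference, the SIF weighting implemented above: each word vector is scaled by a smooth inverse-frequency weight before averaging, and the first principal component of the resulting sentence matrix is removed (remove_pc). With p(t) the word's relative frequency in the corpus:

$$w_{\mathrm{SIF}}(t) = \frac{a}{a + p(t)}, \qquad p(t) = \frac{\mathrm{count}(t)}{\sum_{t'} \mathrm{count}(t')}$$

Here count(t) comes from the same deduplicated all_words list, so it is closer to a document frequency than a raw unigram count.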
def sen2embed(wv_model, data, operation="IDF-Pooling"):
query1_embed, query2_embed = [], []
    for i in tqdm(range(len(data))):
row = data.iloc[i]
embed_lst1, embed_lst2 = [], []
if operation == "Mean-Pooling":
for word in row["query1_words"]:
embed_lst1.append(wv_model.wv[word])
for word in row["query2_words"]:
embed_lst2.append(wv_model.wv[word])
query1_embed.append(np.mean(embed_lst1, axis=0))
query2_embed.append(np.mean(embed_lst2, axis=0))
elif operation == "Max-Pooling":
for word in row["query1_words"]:
embed_lst1.append(wv_model.wv[word])
for word in row["query2_words"]:
embed_lst2.append(wv_model.wv[word])
if len(embed_lst1) == 0:
query1_embed.append(np.zeros(100))
else:
query1_embed.append(np.amax(embed_lst1, axis=0))
if len(embed_lst2) == 0:
query2_embed.append(np.zeros(100))
else:
query2_embed.append(np.amax(embed_lst2, axis=0))
elif operation == "IDF-Pooling":
for word in row["query1_words"]:
embed_lst1.append(wv_model.wv[word]*idf[word])
for word in row["query2_words"]:
embed_lst2.append(wv_model.wv[word]*idf[word])
query1_embed.append(np.mean(embed_lst1, axis=0))
query2_embed.append(np.mean(embed_lst2, axis=0))
elif operation == "SIF-Pooling":
for word in row["query1_words"]:
embed_lst1.append(wv_model.wv[word]*sif[word])
for word in row["query2_words"]:
embed_lst2.append(wv_model.wv[word]*sif[word])
query1_embed.append(np.mean(embed_lst1, axis=0))
query2_embed.append(np.mean(embed_lst2, axis=0))
    query1_embed = list(remove_pc(np.array(query1_embed)))
    query2_embed = list(remove_pc(np.array(query2_embed)))
data["query1_"+operation] = query1_embed
data["query2_"+operation] = query2_embed
sen2embed(wv_model, all_data, "Mean-Pooling")
sen2embed(wv_model, all_data, "Max-Pooling")
sen2embed(wv_model, all_data, "IDF-Pooling")
sen2embed(wv_model, all_data, "SIF-Pooling")
print(all_data[["query1_Mean-Pooling", "query2_Mean-Pooling",
"query1_Max-Pooling", "query2_Max-Pooling",
"query1_IDF-Pooling", "query2_IDF-Pooling",
"query1_SIF-Pooling", "query2_SIF-Pooling"]])
!pip install torch -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install pytorch-lightning -i https://pypi.tuna.tsinghua.edu.cn/simple
For Tasks 5–7 I use the pytorch-lightning framework to write the code quickly; it decouples the data-processing logic, the model structure, and the training loop. So the parts that Tasks 5 and 6 share (data processing, the training procedure, the Siamese Network, etc.) reuse as much code as possible (the Common Code below).
Tasks 5 and 6 are coded and debugged. Task 5 finishes in about an hour on the 2080, but Task 6 is a struggle on it, so I'll run it on a 3090 later.
Task 7: I've been too busy recently; I'll study the relevant material and add it later.
import re
import tqdm
def load_lcqmc():
    '''LCQMC text-matching dataset'''
train = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.train.data.zip',
sep='\t', names=['query1', 'query2', 'label'])
valid = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.valid.data.zip',
sep='\t', names=['query1', 'query2', 'label'])
test = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.test.data.zip',
sep='\t', names=['query1', 'query2', 'label'])
return train, valid, test
def load_stopwords(path=None):
if path is not None and os.path.exists(path):
files = os.listdir(path)
text_set = set()
for file in files:
if ".txt" not in file: continue
with open(os.path.join(path, file), 'r', encoding="UTF-8") as f:
for line in f.readlines():
text_set.add(line[:-1])
print("Load [%d] nums stopwords successfully!" % len(text_set))
return list(text_set)
def cut(content, stop_words, seq_len=30):
content = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]", "", content)
result = []
seg_list = jieba.lcut(content, cut_all=True)
for i in seg_list:
if i not in stop_words:
result.append(i)
    if len(result) < seq_len:  # shorter than the required length: pad with 'PAD'
new_result = ['PAD' for i in range(seq_len)]
new_result[:len(result)] = result
return new_result
else:
return result[:seq_len]
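A quick usage check for cut() (illustrative only; the exact tokens depend on jieba's segmentation and the stop-word list):

stop_words = load_stopwords("./stop_words/")
tokens = cut("我手机丢了,我想换个手机", stop_words, seq_len=30)
print(len(tokens))   # always 30: truncated or padded with 'PAD'
print(tokens[:8])    # a few leading tokens; the tail is 'PAD' padding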
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from torchmetrics import Accuracy
from torchmetrics import F1Score
from torchmetrics import AUROC
class DataModule(pl.LightningDataModule):
def __init__(self,
train_df,
valid_df,
test_df,
batch_size: int = 32,
num_workers: int = 8,
pin_memory: bool = False):
super().__init__()
self.batch_size = batch_size
self.num_workers = num_workers
self.pin_memory = pin_memory
self.train_dataset = SaimDataset(train_df)
self.valid_dataset = SaimDataset(valid_df)
self.test_dataset = SaimDataset(test_df)
    def setup(self, stage=None):  # dataset splitting/preparation would go here; it runs on every GPU
pass
def train_dataloader(self):
return DataLoader(self.train_dataset, batch_size=self.batch_size,
shuffle=False, num_workers=self.num_workers,
pin_memory=self.pin_memory)
def val_dataloader(self):
return DataLoader(self.valid_dataset, batch_size=self.batch_size,
shuffle=False, num_workers=self.num_workers,
pin_memory=self.pin_memory)
def test_dataloader(self):
return DataLoader(self.test_dataset, batch_size=self.batch_size,
shuffle=False, num_workers=self.num_workers,
pin_memory=self.pin_memory)
@staticmethod
def add_dataset_args(parent_parser):
pass
class ContrastiveLoss(torch.nn.Module):
"""
Contrastive loss function.
Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
"""
def __init__(self, margin=2.0):
super(ContrastiveLoss, self).__init__()
self.margin = margin
def forward(self, distance, label):
# distance = F.pairwise_distance(output1, output2) #euclidean_distance
        # label * distance^2 + (1 - label) * max(0, margin - distance)^2, where distance = 1 - cosine similarity
        # torch.clamp truncates (margin - distance) at 0; typical margins: 1.2, 1.5, 1.8, 2 (the distance lies in [0, 2])
        loss_contrastive = torch.mean(label * torch.pow(distance, 2) +
                                      (1 - label) * torch.pow(torch.clamp(self.margin - distance, min=0.0), 2))
return loss_contrastive
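Written out, with y the label and d = 1 - cos(u, v) the cosine distance between the two sentence embeddings:

$$\mathcal{L} = \frac{1}{N}\sum_{i=1}^{N}\Big[\, y_i\, d_i^{2} + (1 - y_i)\,\max(0,\; m - d_i)^{2} \Big]$$

A matching pair (y = 1) is pulled together, while a non-matching pair (y = 0) is only penalised when its distance falls below the margin m.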
class SiamTrainModel(pl.LightningModule):
def __init__(self, hparams={"learning_rate": 1e-3,
"contrastive_loss_margin": 1.0}):
super().__init__()
self.save_hyperparameters(hparams)
self.encode = self.hparams.encode
        self.epoch_size = None  # may change later; exposed so it can be set from outside
        self.learning_rate = self.hparams.learning_rate  # may change later; exposed so it can be set from outside
self.contrastive_loss = ContrastiveLoss(margin=self.hparams.contrastive_loss_margin)
self.metric_acc = Accuracy(task="binary", num_classes=2)
self.metric_f1 = F1Score(task="binary", num_classes=2)
self.metric_auc = AUROC(task="binary", num_classes=2)
def forward(self, batch):
embed1 = self.encode(batch["input1"])
embed2 = self.encode(batch["input2"])
return embed1, embed2
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(),
lr=self.learning_rate)
return optimizer
def training_step(self, batch, batch_idx):
embed1, embed2 = self(batch)
sims = (embed1 * embed2).sum(dim=-1)
loss = self.contrastive_loss(1.0-sims, batch["label"])
self.log("train_loss", loss, on_step=True, on_epoch=False, prog_bar=True, logger=True)
        # training_step must return the loss
return {"loss": loss,
"preds": sims.detach(),
"targets": batch["label"].detach()}
def validation_step(self, batch, batch_idx):
embed1, embed2 = self(batch)
sims = (embed1 * embed2).sum(dim=-1)
loss = self.contrastive_loss(1.0-sims, batch["label"])
return {"loss": loss.detach(),
"preds": sims.detach(),
"targets": batch["label"].detach()}
def validation_epoch_end(self, outputs):
loss = 0.0
preds, targets = [], []
for output in outputs:
loss += output["loss"]
preds.append(torch.where(output["preds"] >= 0.5, 1, 0))
targets.append(output["targets"])
loss /= len(outputs)
preds = torch.cat(preds)
targets = torch.cat(targets)
valid_acc = self.metric_acc(preds, targets)
valid_f1 = self.metric_f1(preds, targets)
valid_auc = self.metric_auc(preds, targets)
metrics = {"valid_loss": loss, "valid_acc": valid_acc,
"valid_f1": valid_f1, "valid_auc": valid_auc}
self.log_dict(metrics, on_step=False, on_epoch=True, prog_bar=True, logger=False)
def test_step(self, batch, batch_idx):
embed1, embed2 = self(batch)
        sims = (embed1 * embed2).sum(dim=-1)  # dot product; with unit-normalized embeddings this would be the cosine similarity
loss = self.contrastive_loss(1.0-sims, batch["label"])
self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return {"loss": loss.detach(),
"preds": sims.detach(),
"targets": batch["label"].detach()}
def test_epoch_end(self, outputs):
loss = 0.0
preds, targets = [], []
for output in outputs:
loss += output["loss"]
preds.append(torch.where(output["preds"] >= 0.5, 1, 0))
targets.append(output["targets"])
loss /= len(outputs)
preds = torch.cat(preds)
targets = torch.cat(targets)
test_acc = self.metric_acc(preds, targets)
test_f1 = self.metric_f1(preds, targets)
test_auc = self.metric_auc(preds, targets)
metrics = {"test_loss": loss, "test_acc": test_acc,
"test_f1": test_f1, "test_auc": test_auc}
self.log_dict(metrics, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return metrics
def predict_step(self, batch, batch_idx):
embed1, embed2 = self(batch)
sims = (embed1 * embed2).sum(dim=-1)
preds = torch.where(sims >= 0.5, 1, 0)
return preds
import pytorch_lightning as pl
from pytorch_lightning import Trainer
def main(hparams):
    pl.seed_everything(1234)  # set the global random seed
train_model = hparams["model"]
data_module = hparams["data_module"]
ckpt_callback = pl.callbacks.ModelCheckpoint(
monitor=hparams["train_monitor"],
dirpath=hparams["save_dir"],
filename="model-{epoch:04d}-{valid_auc:.3f}",
mode=hparams["train_mode"])
early_stopping = pl.callbacks.EarlyStopping(
monitor=hparams["train_monitor"],
patience=hparams["early_stop_patience"],
mode=hparams["train_mode"])
callbacks = [ckpt_callback, early_stopping]
trainer = pl.Trainer(
min_epochs=hparams["min_epochs"],
max_epochs=hparams["max_epochs"],
callbacks=callbacks,
accelerator='gpu',
devices=1)
trainer.fit(model=train_model, datamodule=data_module)
print("Test Data Result:")
result = trainer.test(model=train_model, datamodule=data_module)
return result
import os
import jieba
import pandas as pd
from tqdm import tqdm
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
def cut_words(df, stop_words):
df["query1_words"] = df["query1"].apply(lambda x: cut(x, stop_words))
df["query2_words"] = df["query2"].apply(lambda x: cut(x, stop_words))
return df
def build_vocab(train, valid, test, file_name=""):
word_index = 0
vocab_dict = {}
dataset = [train, valid, test]
    for data in tqdm(dataset, desc='build vocab'):
for i in range(len(data)):
row = data.iloc[i]
words_a = row['query1_words']
words_b = row['query2_words']
for word in words_a:
if word not in vocab_dict.keys():
vocab_dict[word] = word_index
word_index += 1
for word in words_b:
if word not in vocab_dict.keys():
vocab_dict[word] = word_index
word_index += 1
print("Build Vocab [%d] successfully!" % len(vocab_dict))
    if len(file_name) > 0:
        write_pickle(vocab_dict, file_name)  # write_pickle: small helper (not shown here) that saves the vocab to disk
return vocab_dict
def word2vocab_index(df, vocab):
word2index = lambda words: [vocab[word] for word in words]
df["query1_index"] = df["query1_words"].apply(lambda x: word2index(x))
df["query2_index"] = df["query2_words"].apply(lambda x: word2index(x))
return df
def data_process():
train, valid, test = load_lcqmc()
stop_words = load_stopwords("./stop_words/")
# get words
train = cut_words(train, stop_words)
valid = cut_words(valid, stop_words)
test = cut_words(test, stop_words)
# get vocab
vocab = build_vocab(train, valid, test, "vocab.txt")
# trans word to vocab index
train = word2vocab_index(train, vocab)
valid = word2vocab_index(valid, vocab)
test = word2vocab_index(test, vocab)
return train, valid, test, vocab
def load_wv(train, dev, test, file_name=""):
datalist = [train, dev, test]
context = []
for data in datalist:
for i in range(len(data)):
row = data.iloc[i]
            context.append(row['query1_index'])  # note: Word2Vec is trained on vocab indices here, so a vector is learned per index
context.append(row['query2_index'])
wv_model = Word2Vec(sentences=context, vector_size=100, window=5, min_count=1, workers=4)
    wv_model.train(context, total_examples=wv_model.corpus_count, epochs=1)  # one extra training pass over the full corpus
if len(file_name) > 0:
wv_model.save(file_name)
return wv_model
train, valid, test, vocab = data_process()
wv_model = load_wv(train, valid, test)
Load [746] nums stopwords successfully!
build vocab: 100%|██████████| 3/3 [00:24<00:00, 8.05s/it]
Build Vocab [42397] successfully!
Save [42397] nums in [vocab.txt] successfully!
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# Custom dataset
class SaimDataset(Dataset):
def __init__(self, df):
super(Dataset, self).__init__()
self.text1 = df["query1_index"]
self.text2 = df["query2_index"]
self.label = df["label"]
self.len = len(df)
def __getitem__(self, idx):
index1 = np.array(self.text1.iloc[idx], dtype="int64")
index2 = np.array(self.text2.iloc[idx], dtype="int64")
label = np.array(self.label.iloc[idx]).astype("int64")
return {"input1": index1,
"input2": index2,
"label": label}
def __len__(self):
return self.len
import torch
from torch import nn
class LSTM(nn.Module):
    def __init__(self, wv_model, vocab_size, embed_dim):
        super(LSTM, self).__init__()
        # initialize the embedding table from the pretrained word2vec vectors (indexed by vocab id)
        word_vectors = torch.randn([vocab_size, embed_dim])
        for i in range(0, vocab_size):
            word_vectors[i, :] = torch.from_numpy(wv_model.wv[i])
self.embedding = nn.Embedding.from_pretrained(word_vectors, freeze=False)
self.LSTM = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim, num_layers=2)
self.Linear = nn.Sequential(
nn.Linear(embed_dim*30, embed_dim),
#nn.Dropout(p=0.1),
nn.ReLU(),
nn.Linear(embed_dim, embed_dim))
def forward(self, text):
        x = self.embedding(text)
        x = x.transpose(0, 1)  # nn.LSTM defaults to (seq_len, batch, input_size) input
        x, _ = self.LSTM(x)
        x = x.transpose(0, 1)  # back to (batch, seq_len, hidden_size)
x = x.contiguous().view(x.size(0), -1)
return self.Linear(x)
data_module = DataModule(train, valid, test)
text_encode = LSTM(wv_model, len(vocab), 100)
hparams["encode"] = text_encode
for data in data_module.train_dataloader():
print(data["input1"].shape)
output = text_encode(data["input1"])
print(output)
break
torch.Size([32, 30])
x shape: torch.Size([32, 30, 100])
tensor([[-0.0412, -0.0353, -0.1243, ..., 0.0885, -0.0744, 0.1103],
[-0.0414, -0.0400, -0.1309, ..., 0.0911, -0.0751, 0.1125],
[-0.0410, -0.0322, -0.1343, ..., 0.0925, -0.0741, 0.1105],
...,
[-0.0395, -0.0288, -0.1321, ..., 0.0905, -0.0775, 0.1069],
[-0.0360, -0.0414, -0.1215, ..., 0.0882, -0.0800, 0.1011],
[-0.0445, -0.0366, -0.1335, ..., 0.0885, -0.0717, 0.1087]],
       grad_fn=<AddmmBackward0>)
# Before running step 3, run the PL DataLoader, TrainModel and Training (Common Code) cells above
hparams = {
"learning_rate": 1e-3,
"contrastive_loss_margin": 1.0,
"train_monitor": "valid_auc",
"train_mode": "max",
"early_stop_patience": 5,
"min_epochs": 5,
"max_epochs": 100,
"save_dir": "./models/"
}
train, valid, test, vocab = data_process()
data_module = DataModule(train, valid, test)
wv_model = load_wv(train, valid, test)
text_encode = LSTM(wv_model, len(vocab), 100)
hparams["encode"] = text_encode
train_model = SiamTrainModel(hparams)
hparams["model"] = train_model
hparams["data_module"] = data_module
main(hparams)
Load [746] nums stopwords successfully!
build vocab: 100%|██████████| 3/3 [00:24<00:00, 8.05s/it]
Build Vocab [42397] successfully!
Save [42397] nums in [vocab.txt] successfully!
/opt/conda/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py:262: UserWarning: Attribute 'encode' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encode'])`.
rank_zero_warn(
Global seed set to 1234
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:604: UserWarning: Checkpoint directory /usr/yexiaoju/code/practice_project/jupyter/models exists and is not empty.
rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-----------------------------------------------------
0 | encode | LSTM | 4.7 M
1 | contrastive_loss | ContrastiveLoss | 0
2 | metric_acc | BinaryAccuracy | 0
3 | metric_f1 | BinaryF1Score | 0
4 | metric_auc | BinaryAUROC | 0
-----------------------------------------------------
4.7 M Trainable params
0 Non-trainable params
4.7 M Total params
18.846 Total estimated model params size (MB)
Sanity Checking: 0it [00:00, ?it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Test Data Result:
Testing: 0it [00:00, ?it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Test metric DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
test_acc 0.7702400088310242
test_auc 0.7702400088310242
test_f1 0.7962542772293091
test_loss 0.19331854581832886
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
[{'test_loss': 0.19331854581832886,
'test_acc': 0.7702400088310242,
'test_f1': 0.7962542772293091,
'test_auc': 0.7702400088310242}]
%pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
Here I try the Sentence-BERT way of extracting sentence features, rather than BERT-NSP.
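The BertSentence encoder defined further down uses BERT's pooler output (output[1]). Sentence-BERT itself usually mean-pools the last hidden states with the attention mask instead; a minimal sketch of that alternative, which could be dropped into BertSentence.forward:

import torch

def mean_pool(last_hidden_state, attention_mask):
    # average the token embeddings, ignoring padding positions
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)   # (batch, hidden)
    counts = mask.sum(dim=1).clamp(min=1e-9)         # avoid division by zero
    return summed / counts

# hypothetical usage inside a forward pass:
# output = self.sen_bert(input_ids, attention_mask)
# sentence_embed = mean_pool(output.last_hidden_state, attention_mask)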
from collections import defaultdict
# def data_process(tokenizer): # BERT-NSP style
# train, valid, test = load_lcqmc()
# data_df = [train, valid, test]
# for df in data_df:
# inputs = defaultdict(list)
# for i, row in tqdm(df.iterrows(), desc="encode {} data".format(k), total=len(df)):
# seq_a, seq_b, label = row[0], row[1], row[2]
# try:
# inputs_dict = tokenizer.encode_plus(seq_a, seq_b, add_special_tokens=True, \
# return_token_type_ids=True, return_attention_mask=True)
# except TypeError as ex:
# print(row)
# inputs["input_ids"].append(inputs_dict["input_ids"])
# inputs["token_type_ids"].append(inputs_dict["token_type_ids"])
# inputs["attention_mask"].append(inputs_dict["attention_mask"])
# inputs["labels"].append(label)
# df["input_ids"] = inputs["input_ids"]
# df["token_type_ids"] = inputs["token_type_ids"]
# df["attention_mask"] = inputs["attention_mask"]
# df["labels"] = inputs["labels"]
# return train, valid, test
def data_process(tokenizer): # Sentence-BERT
train, valid, test = load_lcqmc()
data_df = [train, valid, test]
for df in data_df:
inputs = defaultdict(list)
for i, row in tqdm(df.iterrows(), desc="process data", total=len(df)):
seq_a, seq_b, label = row[0], row[1], row[2]
try:
inputs_dict1 = tokenizer(seq_a,
add_special_tokens=True,
return_attention_mask=True,
return_tensors="pt",
                                         padding=True)  # note: padding a single sentence is a no-op; fixed-length batching may need padding="max_length" with truncation
inputs_dict2 = tokenizer(seq_b,
add_special_tokens=True,
return_attention_mask=True,
return_tensors="pt",
padding=True)
except TypeError as ex:
print(row)
inputs["input_ids1"].append(inputs_dict1["input_ids"])
inputs["attention_mask1"].append(inputs_dict1["attention_mask"])
inputs["input_ids2"].append(inputs_dict2["input_ids"])
inputs["attention_mask2"].append(inputs_dict2["attention_mask"])
inputs["labels"].append(label)
df["input_ids1"] = inputs["input_ids1"]
df["attention_mask1"] = inputs["attention_mask1"]
df["input_ids2"] = inputs["input_ids2"]
df["attention_mask2"] = inputs["attention_mask2"]
df["labels"] = inputs["labels"]
return train, valid, test
from torch.utils.data import Dataset
class SaimDataset(Dataset):
def __init__(self, df):
super(Dataset, self).__init__()
self.input_ids1 = df["input_ids1"]
self.attention_mask1 = df["attention_mask1"]
self.input_ids2 = df["input_ids2"]
self.attention_mask2 = df["attention_mask2"]
self.labels = df["labels"]
self.len = len(self.labels)
def __getitem__(self, idx):
inputs1 = {"inputs_ids": self.input_ids1[idx],
"attention_mask": self.attention_mask1[idx]}
inputs2 = {"inputs_ids": self.input_ids2[idx],
"attention_mask": self.attention_mask2[idx]}
return {"input1": inputs1,
"input2": inputs2,
"label": self.labels[idx]}
def __len__(self):
return self.len
import torch
from torch import nn
class BertSentence(nn.Module):
def __init__(self, BertModel):
super(BertSentence, self).__init__()
self.sen_bert = BertModel
def forward(self, batch):
inputs_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
output = self.sen_bert(inputs_ids, attention_mask)
        return output[1]  # pooler_output: the [CLS] representation after BERT's pooler layer
# Before running step 3, run the PL DataLoader, TrainModel and Training (Common Code) cells above
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertModel
hparams = {
"learning_rate": 1e-3,
"contrastive_loss_margin": 1.0,
"train_monitor": "valid_auc",
"train_mode": "max",
"early_stop_patience": 5,
"min_epochs": 5,
"max_epochs": 100,
"save_dir": "./models/"
}
model_name = "hfl/chinese-roberta-wwm-ext"
config = BertConfig.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
train, valid, test = data_process(tokenizer)
data_module = DataModule(train, valid, test)
text_encode = BertSentence(bert_model)
hparams["encode"] = text_encode
train_model = SiamTrainModel(hparams)
hparams["model"] = train_model
hparams["data_module"] = data_module
main(hparams)
To be added when I have time.