Coggle 30 Days of ML (January 2023) Check-in

Preface

Task link
These tasks look interesting and are related to my day-to-day work, so I'm studying them and checking in.
Tasks 1, 2, 3, 5 and 6 are coded. I only have a 2080 at hand for now; I'll rerun things on a 3090 later.
I've been busy lately, so the notes and Tasks 4 and 7 will be filled in later.

Task 1: Reading the dataset

%pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
import pandas as pd

def load_lcqmc():
    '''Load the LCQMC text matching dataset.
    '''
    train = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.train.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    valid = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.valid.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    test = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.test.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])
    return train, valid, test

train, valid, test = load_lcqmc()
train
query1 query2 label
0 喜欢打篮球的男生喜欢什么样的女生 爱打篮球的男生喜欢什么样的女生 1
1 我手机丢了,我想换个手机 我想买个新手机,求推荐 1
2 大家觉得她好看吗 大家觉得跑男好看吗? 0
3 求秋色之空漫画全集 求秋色之空全集漫画 1
4 晚上睡觉带着耳机听音乐有什么害处吗? 孕妇可以戴耳机听音乐吗? 0
... ... ... ...
238761 女孩子说我是你的汤是什么意思 男孩给女孩说你的眼是海什么意思 0
238762 求重生之老公请接招全文 求重生之老公请接招>全文 1
238763 求小说电子书, 求《甄嬛》小说电子书! 0
238764 杭州有什么好玩的地方? 杭州有什么好玩的地方求推 1
238765 我想做卫生巾代理,哪里有 我想做淘宝代理去那找,怎么做 0

238766 rows × 3 columns

Task 2: Text data analysis

%pip install jieba -i https://pypi.tuna.tsinghua.edu.cn/simple

1. Analyze the text lengths: do similar and dissimilar text pairs differ in length?

def diff_len(query1, query2):
    return abs(len(query1)- len(query2))

all_data = pd.concat([train, valid, test])
all_data["query1_len"] = all_data["query1"].apply(lambda x: len(x))
all_data["query2_len"] = all_data["query2"].apply(lambda x: len(x))

df1 = pd.DataFrame(pd.value_counts(pd.cut(all_data["query1_len"].tolist(), 
                                          [0, 10, 20, 30, 40, 50, 60])), 
                   columns=["Query1 length distribution"])
df2 = pd.DataFrame(pd.value_counts(pd.cut(all_data["query2_len"].tolist(), 
                                          [0, 10, 20, 30, 40, 50, 60])), 
                   columns=["Query2 length distribution"])

df1.plot(kind="bar")
df2.plot(kind="bar")

[Figures: bar charts of the query1 and query2 length distributions]
The query1/query2 length distributions show that almost all queries are short, so the 30-character truncation used in Task 5 covers essentially all of them.
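
As a quick sanity check (a minimal sketch using the length columns computed above; not part of the original run), the coverage of a 30-character cut-off can be measured directly:

# Sketch: fraction of queries no longer than 30 characters.
max_len = 30
cov1 = (all_data["query1_len"] <= max_len).mean()
cov2 = (all_data["query2_len"] <= max_len).mean()
print("query1 coverage at <=%d chars: %.4f" % (max_len, cov1))
print("query2 coverage at <=%d chars: %.4f" % (max_len, cov2))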

all_data["query_len_dif"] = all_data.apply(lambda x: 
                                           diff_len(x["query1"], x["query2"]), axis=1)

true_data = all_data[all_data["label"]==1]
false_data = all_data[all_data["label"]==0]

true_query_len_ave = sum(true_data["query_len_dif"])/len(true_data)
false_query_len_ave = sum(false_data["query_len_dif"])/len(false_data)
print("正确样本query字数差平均值:%.8f \n错误样本query字数差平均值:%.8f" %(true_query_len_ave, false_query_len_ave))
print("正确样本query长度差比错误样本query长度差小 %.2f%%" % \
      ((false_query_len_ave-true_query_len_ave)/true_query_len_ave*100))
正确样本query字数差平均值:1.48703309 
错误样本query字数差平均值:2.57498060
正确样本query长度差比错误样本query长度差小 73.16%

2. Analyze word and character counts: how many words (segmented with jieba) and how many characters are there across all the texts?

import jieba

def static_words_num(query1, query2):
    words = []
    words.extend(jieba.lcut(query1))
    words.extend(jieba.lcut(query2))
    return len(words)

def static_token_num(query1, query2):
    return len(query1) + len(query2)

all_data["words_num"] = all_data.apply(lambda x: static_words_num(x["query1"], x["query2"]), axis=1)
all_data["token_num"] = all_data.apply(lambda x: static_token_num(x["query1"], x["query2"]), axis=1)
print("赛题query文本中所有单词个数:%d  所有字符个数:%d " % (sum(all_data["words_num"]), sum(all_data["token_num"])))
Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.565 seconds.
Prefix dict has been built successfully.


Total words across all query texts: 3400570  Total characters: 5686532 

Task 3: Text similarity (statistical features) (incomplete)

# Compute statistical text features for query1 and query2

# Text length of query1 and query2
get_query_len = lambda x: len(x)
all_data["query1_len"] = all_data.apply(lambda x: get_query_len(x["query1"]), axis=1)
all_data["query2_len"] = all_data.apply(lambda x: get_query_len(x["query2"]), axis=1)

# Word count of query1 and query2
import jieba
def get_word_num(query):
    return len(jieba.lcut(query))

all_data["query1_word_num"] = all_data.apply(lambda x: get_word_num(x["query1"]), axis=1)
all_data["query2_word_num"] = all_data.apply(lambda x: get_word_num(x["query2"]), axis=1)

I go straight to DL models later, so I'm not building out the hand-crafted features here (lazy). For completeness, a couple of simple count-based similarity features are sketched below.
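
A minimal sketch of such features (not part of the original notebook; the column names are illustrative):

# Sketch: cheap count-based similarity features for a query pair.
import jieba

def jaccard_sim(q1, q2):
    # Word-level Jaccard overlap between the two queries.
    s1, s2 = set(jieba.lcut(q1)), set(jieba.lcut(q2))
    union = s1 | s2
    return len(s1 & s2) / len(union) if union else 0.0

def char_overlap(q1, q2):
    # Character overlap ratio relative to the shorter query.
    s1, s2 = set(q1), set(q2)
    return len(s1 & s2) / max(1, min(len(s1), len(s2)))

all_data["jaccard_sim"] = all_data.apply(lambda x: jaccard_sim(x["query1"], x["query2"]), axis=1)
all_data["char_overlap"] = all_data.apply(lambda x: char_overlap(x["query1"], x["query2"]), axis=1)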

Task 4: Text similarity (word vectors and sentence encoding)

%pip install gensim -i https://pypi.tuna.tsinghua.edu.cn/simple
%pip install tqdm -i https://pypi.tuna.tsinghua.edu.cn/simple

Step 1: Segment the text with jieba, then train word vectors with word2vec

import jieba
import os
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim import corpora

def load_stopwords(path=None):
    text_set = set()
    if path is not None and os.path.exists(path):
        files = os.listdir(path)
        for file in files:
            if ".txt" not in file: continue
            with open(os.path.join(path, file), 'r', encoding="UTF-8") as f:
                for line in f.readlines():
                    text_set.add(line[:-1])

    print("Load [%d] nums stopwords successfully!" % len(text_set))
    return list(text_set)

stop_words = load_stopwords("./stop_words/")

def get_cut_words(sen):
    words = []
    for word in jieba.lcut(sen):
        if word not in stop_words:
            words.append(word)
    return words

# get_cut_words = lambda x: jieba.lcut(x) 
all_data["query1_words"] = all_data.apply(lambda x: 
                                          get_cut_words(x["query1"]), axis=1)
all_data["query2_words"] = all_data.apply(lambda x: 
                                          get_cut_words(x["query2"]), axis=1)

context = []
all_words = []
for i in range(len(all_data)):
    row = all_data.iloc[i]
    context.append(row["query1_words"])
    context.append(row["query2_words"])
    all_words.extend(list(set(row["query1_words"])))
    all_words.extend(list(set(row["query2_words"])))

wv_model = Word2Vec(sentences=context, vector_size=100, window=5, min_count=1, workers=4)
wv_model.train(context, total_examples=1, epochs=1)
Load [746] nums stopwords successfully!
(1803946, 1896899)

Step 2: Compute TF-IDF or BM25 weights for words

# Compute IDF weights (only the IDF part of TF-IDF is needed below)
import math

from collections import Counter
from tqdm import tqdm

doc_num = len(all_data) * 2

def get_idf(all_words):
    idf = {}
    count = Counter(all_words)
    for word, cnt in tqdm(dict(count).items()):
        idf[word] = math.log(doc_num/(cnt))
    return idf

idf =  get_idf(all_words)
100%|██████████| 39937/39937 [00:00<00:00, 2015132.86it/s]
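
Only IDF weights are computed above. For the BM25 variant mentioned in the task, a minimal sketch (reusing the `context` token lists and the `idf` dict from above; k1 and b are the usual defaults and are not tuned here):

def get_bm25_weights(docs, idf, k1=1.5, b=0.75):
    # Per-document BM25 weight for each term: idf * tf*(k1+1) / (tf + k1*(1 - b + b*dl/avgdl))
    avgdl = sum(len(d) for d in docs) / max(1, len(docs))
    weights = []
    for doc in docs:
        tf = Counter(doc)
        dl = len(doc)
        w = {}
        for word, cnt in tf.items():
            denom = cnt + k1 * (1 - b + b * dl / avgdl)
            w[word] = idf.get(word, 0.0) * cnt * (k1 + 1) / denom
        weights.append(w)
    return weights

bm25_weights = get_bm25_weights(context, idf)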

Step 3: Try the following unsupervised sentence encoding methods

  • Mean-Pooling
  • Max-Pooling
  • IDF-Pooling / BM25-Pooling
  • SIF-Pooling
import numpy as np

from sklearn.decomposition import TruncatedSVD

# SIF sentence embeddings, following https://github.com/PrincetonML/SIF/blob/master/src/SIF_embedding.py
def compute_pc(X,npc=1):
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_

def remove_pc(X, npc=1):
    pc = compute_pc(X, npc)
    if npc==1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX

def sif_weight(count, a=3e-5):
    word_num = 0
    for k, v in dict(count).items():
        word_num += v
    sif = {}
    for k, v in dict(count).items():
        sif[k] = a / (a + v/word_num)
    return sif

count = Counter(all_words)
sif = sif_weight(count)

def sen2embed(wv_model, data, operation="IDF-Pooling"):
    query1_embed, query2_embed = [], []
    
    for i in tqdm(range(len(data))):
        row = data.iloc[i]
        embed_lst1, embed_lst2 = [], []
        if operation == "Mean-Pooling":
            for word in row["query1_words"]:
                embed_lst1.append(wv_model.wv[word])
            for word in row["query2_words"]:
                embed_lst2.append(wv_model.wv[word])
            query1_embed.append(np.mean(embed_lst1, axis=0))
            query2_embed.append(np.mean(embed_lst2, axis=0))
        elif operation == "Max-Pooling":
            for word in row["query1_words"]:
                embed_lst1.append(wv_model.wv[word])
            for word in row["query2_words"]:
                embed_lst2.append(wv_model.wv[word])
            if len(embed_lst1) == 0:
                query1_embed.append(np.zeros(100))
            else:
                query1_embed.append(np.amax(embed_lst1, axis=0))
            if len(embed_lst2) == 0:
                query2_embed.append(np.zeros(100))
            else:
                query2_embed.append(np.amax(embed_lst2, axis=0))
        elif operation == "IDF-Pooling":
            for word in row["query1_words"]:
                embed_lst1.append(wv_model.wv[word]*idf[word])
            for word in row["query2_words"]:
                embed_lst2.append(wv_model.wv[word]*idf[word])
            query1_embed.append(np.mean(embed_lst1, axis=0))
            query2_embed.append(np.mean(embed_lst2, axis=0))
        elif operation == "SIF-Pooling":
            for word in row["query1_words"]:
                embed_lst1.append(wv_model.wv[word]*sif[word])
            for word in row["query2_words"]:
                embed_lst2.append(wv_model.wv[word]*sif[word])
            query1_embed.append(np.mean(embed_lst1, axis=0))
            query2_embed.append(np.mean(embed_lst2, axis=0))
            query1_embed = list(remove_pc(np.array(query1_embed)))
            query2_embed = list(remove_pc(np.array(query1_embed)))
            
    data["query1_"+operation] = query1_embed
    data["query2_"+operation] = query2_embed
    
            
sen2embed(wv_model, all_data, "Mean-Pooling")
sen2embed(wv_model, all_data, "Max-Pooling")
sen2embed(wv_model, all_data, "IDF-Pooling")
sen2embed(wv_model, all_data, "SIF-Pooling")

print(all_data[["query1_Mean-Pooling", "query2_Mean-Pooling",
                 "query1_Max-Pooling", "query2_Max-Pooling",
                 "query1_IDF-Pooling", "query2_IDF-Pooling",
                 "query1_SIF-Pooling", "query2_SIF-Pooling"]])

Tasks 5-7: Network Training

!pip install torch  -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install pytorch-lightning -i https://pypi.tuna.tsinghua.edu.cn/simple

For Tasks 5-7 I use the pytorch-lightning framework to write the code quickly; it decouples the data-processing logic, the model structure, and the training module. So the parts that are identical across Tasks 5 and 6 (data handling, the training loop, the Siamese network logic, etc.) share as much code as possible (Common Code).

Tasks 5 and 6 are coded and debugged. Task 5 finishes in about an hour on the 2080, but Task 6 is a struggle on it, so I'll run it on a 3090 later.
Task 7: I've been too busy recently; I'll study the relevant material and fill it in later.

Utils Function(Common Code)

import os
import re
import jieba
import pandas as pd

def load_lcqmc():
    '''Load the LCQMC text matching dataset.
    '''
    train = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.train.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    valid = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.valid.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    test = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.test.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])
    return train, valid, test


def load_stopwords(path=None):
    text_set = set()
    if path is not None and os.path.exists(path):
        files = os.listdir(path)
        for file in files:
            if ".txt" not in file: continue
            with open(os.path.join(path, file), 'r', encoding="UTF-8") as f:
                for line in f.readlines():
                    text_set.add(line[:-1])

    print("Load [%d] nums stopwords successfully!" % len(text_set))
    return list(text_set)

def cut(content, stop_words, seq_len=30):
    content = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]", "", content)
    result = []
    seg_list = jieba.lcut(content, cut_all=True)
    for i in seg_list:
        if i not in stop_words:
            result.append(i)

    if len(result) < seq_len:  # shorter than seq_len: pad with 'PAD' tokens
        new_result = ['PAD' for i in range(seq_len)]
        new_result[:len(result)] = result
        return new_result
    else:
        return result[:seq_len] 
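
A quick sanity check of `cut` (illustrative; the exact segmentation depends on jieba and on the stop-word lists you load):

stop_words = load_stopwords("./stop_words/")
print(cut("我手机丢了,我想换个手机", stop_words, seq_len=10))
# expected shape: a list of exactly 10 tokens, padded with 'PAD' at the end if needed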

PL Dataloader, TrainModel, Training(Common Code)

PL Dataloader

import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl

from torchmetrics import Accuracy
from torchmetrics import F1Score
from torchmetrics import AUROC

class DataModule(pl.LightningDataModule):
    def __init__(self,
                 train_df,
                 valid_df,
                 test_df,
                 batch_size: int = 32,
                 num_workers: int = 8,
                 pin_memory: bool = False):
        
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.train_dataset = SaimDataset(train_df)
        self.valid_dataset = SaimDataset(valid_df)
        self.test_dataset = SaimDataset(test_df)
    
    def setup(self, stage=None):  # dataset splitting would go here; this runs on every GPU
        pass
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size,
                          shuffle=False, num_workers=self.num_workers,
                          pin_memory=self.pin_memory)

    def val_dataloader(self):
        return DataLoader(self.valid_dataset, batch_size=self.batch_size,
                          shuffle=False, num_workers=self.num_workers,
                          pin_memory=self.pin_memory)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size,
                          shuffle=False, num_workers=self.num_workers,
                          pin_memory=self.pin_memory)

    @staticmethod
    def add_dataset_args(parent_parser):
        pass

PL TrainModel

class ContrastiveLoss(torch.nn.Module):
    """
    Contrastive loss function.
    Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    """
    def __init__(self, margin=2.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, distance, label):
        # distance = F.pairwise_distance(output1, output2)  # euclidean distance
        # loss = label * distance^2 + (1 - label) * max(0, margin - distance)^2
        # (margin typically in [1.2, 2.0]; here distance is 1 - similarity)
        loss_contrastive = torch.mean((label) * torch.pow(distance, 2) +
                                      (1-label) * torch.pow(torch.clamp(self.margin - distance, min=0.0), 2))
        return loss_contrastive
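
A tiny check of the loss on dummy values (a sketch; here `distance` plays the role of 1 - similarity, as in the training steps below):

loss_fn = ContrastiveLoss(margin=1.0)
distance = torch.tensor([0.1, 0.9])  # small distance (similar pair), large distance (dissimilar pair)
label = torch.tensor([1.0, 0.0])     # 1 = similar, 0 = dissimilar
print(loss_fn(distance, label))      # tensor(0.0100): mean of 0.1^2 and max(0, 1.0 - 0.9)^2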

    
class SiamTrainModel(pl.LightningModule):
    def __init__(self, hparams={"learning_rate": 1e-3,
                                "contrastive_loss_margin": 1.0}):
        
        super().__init__()
        self.save_hyperparameters(hparams)  
        self.encode = self.hparams.encode
        self.epoch_size = None                                 # may change; exposed so callers can override it
        self.learning_rate = self.hparams.learning_rate        # may change; exposed so callers can override it
        self.contrastive_loss = ContrastiveLoss(margin=self.hparams.contrastive_loss_margin)
        self.metric_acc = Accuracy(task="binary", num_classes=2)
        self.metric_f1 = F1Score(task="binary", num_classes=2)
        self.metric_auc = AUROC(task="binary", num_classes=2)
    
    def forward(self, batch):
        embed1 = self.encode(batch["input1"])
        embed2 = self.encode(batch["input2"])
        return embed1, embed2
    
    def configure_optimizers(self): 
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.learning_rate)
        return optimizer
    
    def training_step(self, batch, batch_idx):
        embed1, embed2 = self(batch)
        sims = (embed1 * embed2).sum(dim=-1)
        loss = self.contrastive_loss(1.0-sims, batch["label"])
        self.log("train_loss", loss, on_step=True, on_epoch=False, prog_bar=True, logger=True)
        # training_step must return the loss
        return {"loss": loss,
                "preds": sims.detach(), 
                "targets": batch["label"].detach()}
    
    def validation_step(self, batch, batch_idx):
        embed1, embed2 = self(batch)
        sims = (embed1 * embed2).sum(dim=-1)
        loss = self.contrastive_loss(1.0-sims, batch["label"])
        return {"loss": loss.detach(), 
                "preds": sims.detach(), 
                "targets": batch["label"].detach()}
    
    def validation_epoch_end(self, outputs):
        loss = 0.0
        preds, targets = [], []
        for output in outputs:
            loss += output["loss"]
            preds.append(torch.where(output["preds"] >= 0.5, 1, 0))
            targets.append(output["targets"])
        loss /= len(outputs)
        preds = torch.cat(preds)
        targets = torch.cat(targets)
        valid_acc = self.metric_acc(preds, targets)
        valid_f1 = self.metric_f1(preds, targets)
        valid_auc = self.metric_auc(preds, targets)
        metrics = {"valid_loss": loss, "valid_acc": valid_acc, 
                   "valid_f1": valid_f1, "valid_auc": valid_auc}
        self.log_dict(metrics, on_step=False, on_epoch=True, prog_bar=True, logger=False)

    def test_step(self, batch, batch_idx):
        embed1, embed2 = self(batch)
        sims = (embed1 * embed2).sum(dim=-1)    # after normalization, the similarity lies in (0, 1)
        loss = self.contrastive_loss(1.0-sims, batch["label"])
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return {"loss": loss.detach(),
                "preds": sims.detach(),
                "targets": batch["label"].detach()}
    
    def test_epoch_end(self, outputs):
        loss = 0.0
        preds, targets = [], []
        for output in outputs:
            loss += output["loss"]
            preds.append(torch.where(output["preds"] >= 0.5, 1, 0))
            targets.append(output["targets"])
    
        loss /= len(outputs)
        preds = torch.cat(preds)
        targets = torch.cat(targets)
        test_acc = self.metric_acc(preds, targets)
        test_f1 = self.metric_f1(preds, targets)
        test_auc = self.metric_auc(preds, targets)
        metrics = {"test_loss": loss, "test_acc": test_acc, 
                   "test_f1": test_f1, "test_auc": test_auc}
        self.log_dict(metrics, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return metrics
        
    def predict_step(self, batch, batch_idx):
        embed1, embed2 = self(batch)
        sims = (embed1 * embed2).sum(dim=-1)
        preds = torch.where(sims >= 0.5, 1, 0)
        return preds

Training Process

import pytorch_lightning as pl

from pytorch_lightning import Trainer


def main(hparams):
    pl.seed_everything(1234)  # set one global random seed
    train_model = hparams["model"]
    data_module = hparams["data_module"]
    
    ckpt_callback = pl.callbacks.ModelCheckpoint(
        monitor=hparams["train_monitor"],
        dirpath=hparams["save_dir"],
        filename="model-{epoch:04d}-{valid_auc:.3f}",
        mode=hparams["train_mode"])
    
    early_stopping = pl.callbacks.EarlyStopping(
        monitor=hparams["train_monitor"],
        patience=hparams["early_stop_patience"],
        mode=hparams["train_mode"])
    callbacks = [ckpt_callback, early_stopping]

    trainer = pl.Trainer(
        min_epochs=hparams["min_epochs"],
        max_epochs=hparams["max_epochs"],
        callbacks=callbacks,
        accelerator='gpu',
        devices=1)
    trainer.fit(model=train_model, datamodule=data_module)
    
    print("Test Data Result:")
    result = trainer.test(model=train_model, datamodule=data_module)
    return result

Task 5: Text matching model (LSTM Siamese network)

LSTM-Siamese Data Process

import os
import jieba
import pandas as pd

from tqdm import tqdm
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

def cut_words(df, stop_words):
    df["query1_words"] = df["query1"].apply(lambda x: cut(x, stop_words))
    df["query2_words"] = df["query2"].apply(lambda x: cut(x, stop_words))
    return df
    
def build_vocab(train, valid, test, file_name=""):
    word_index = 0
    vocab_dict = {}
    dataset = [train, valid, test]
    for data in tqdm(dataset, desc='build vocab'):
        for i in range(len(data)):
            row = data.iloc[i]
            words_a = row['query1_words']
            words_b = row['query2_words']
            for word in words_a:
                if word not in vocab_dict.keys():
                    vocab_dict[word] = word_index
                    word_index += 1
            for word in words_b:
                if word not in vocab_dict.keys():
                    vocab_dict[word] = word_index
                    word_index += 1

    print("Build Vocab [%d] successfully!" % len(vocab_dict))
    if len(file_name) > 0:
        # write_pickle is a small save helper defined elsewhere in the notebook (not shown here)
        write_pickle(vocab_dict, file_name)
    return vocab_dict
    
def word2vocab_index(df, vocab):
    word2index = lambda words: [vocab[word] for word in words]
    df["query1_index"] = df["query1_words"].apply(lambda x: word2index(x))
    df["query2_index"] = df["query2_words"].apply(lambda x: word2index(x))
    return df

def data_process():
    train, valid, test = load_lcqmc()
    stop_words = load_stopwords("./stop_words/")
    # get words
    train = cut_words(train, stop_words)
    valid = cut_words(valid, stop_words)
    test = cut_words(test, stop_words)
    # get vocab
    vocab = build_vocab(train, valid, test, "vocab.txt")
    # map words to vocab indices
    train = word2vocab_index(train, vocab)
    valid = word2vocab_index(valid, vocab)
    test = word2vocab_index(test, vocab)
    return train, valid, test, vocab

def load_wv(train, dev, test, file_name=""):
    datalist = [train, dev, test]
    context = []
    for data in datalist:
        for i in range(len(data)):
            row = data.iloc[i]
            context.append(row['query1_index'])  # word2vec is trained on vocab-index sequences, so its keys are indices rather than words
            context.append(row['query2_index'])
    wv_model = Word2Vec(sentences=context, vector_size=100, window=5, min_count=1, workers=4)
    wv_model.train(context, total_examples=1, epochs=1)
    if len(file_name) > 0: 
        wv_model.save(file_name)
    return wv_model

train, valid, test, vocab = data_process()
wv_model = load_wv(train, valid, test)
Load [746] nums stopwords successfully!
build vocab: 100%|██████████| 3/3 [00:24<00:00,  8.05s/it]
Build Vocab [42397] successfully!
Save [42397] nums in [vocab.txt] successfully!
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Custom dataset for the Siamese model
class SaimDataset(Dataset):
    def __init__(self, df):
        super(SaimDataset, self).__init__()
        self.text1 = df["query1_index"]
        self.text2 = df["query2_index"]
        self.label = df["label"]
        self.len = len(df)

    def __getitem__(self, idx):
        index1 = np.array(self.text1.iloc[idx], dtype="int64")
        index2 = np.array(self.text2.iloc[idx], dtype="int64")
        label = np.array(self.label.iloc[idx]).astype("int64")

        return {"input1": index1,
                "input2": index2,
                "label":  label}

    def __len__(self):
        return self.len

LSTM Network Structure

import torch
from torch import nn

class LSTM(nn.Module):
    def __init__(self, wv_model, vocab_size, embed_dim):
        super(LSTM, self).__init__()
        # Initialize the embedding table from the word2vec vectors (keys are vocab indices).
        word_vectors = torch.randn([vocab_size, embed_dim])
        for i in range(0, vocab_size):
            word_vectors[i, :] = torch.from_numpy(wv_model.wv[i])
        self.embedding = nn.Embedding.from_pretrained(word_vectors, freeze=False)
        
        self.LSTM = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim, num_layers=2)
        self.Linear = nn.Sequential(
            nn.Linear(embed_dim*30, embed_dim),
            #nn.Dropout(p=0.1),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim))

    def forward(self, text):
        x = self.embedding(text)                 # (batch, seq_len, embed_dim)
        x = x.transpose(0, 1)                    # nn.LSTM (batch_first=False) expects (seq_len, batch, features)
        x, _ = self.LSTM(x)
        x = x.transpose(0, 1)                    # back to (batch, seq_len, hidden)
        x = x.contiguous().view(x.size(0), -1)   # flatten to (batch, seq_len * hidden); seq_len is fixed at 30
        return self.Linear(x)
data_module = DataModule(train, valid, test)
text_encode = LSTM(wv_model, len(vocab), 100)
hparams["encode"] = text_encode

for data in data_module.train_dataloader():
    print(data["input1"].shape)
    output = text_encode(data["input1"])
    print(output)
    break
torch.Size([32, 30])
x shape: torch.Size([32, 30, 100])
tensor([[-0.0412, -0.0353, -0.1243,  ...,  0.0885, -0.0744,  0.1103],
        [-0.0414, -0.0400, -0.1309,  ...,  0.0911, -0.0751,  0.1125],
        [-0.0410, -0.0322, -0.1343,  ...,  0.0925, -0.0741,  0.1105],
        ...,
        [-0.0395, -0.0288, -0.1321,  ...,  0.0905, -0.0775,  0.1069],
        [-0.0360, -0.0414, -0.1215,  ...,  0.0882, -0.0800,  0.1011],
        [-0.0445, -0.0366, -0.1335,  ...,  0.0885, -0.0717,  0.1087]],
       grad_fn=<AddmmBackward0>)
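
One detail worth flagging: the test_step comment above assumes the similarity lies in (0, 1), but the LSTM encoder as written does not normalize its output, so the dot product used as `sims` is unbounded. A minimal sketch (my own assumption, not what the notebook trains) that makes the dot product a true cosine similarity:

import torch.nn.functional as F

class NormalizedLSTM(LSTM):
    # Same encoder, but L2-normalize the output so the dot product of two
    # encodings equals their cosine similarity (bounded in [-1, 1]).
    def forward(self, text):
        out = super().forward(text)
        return F.normalize(out, p=2, dim=-1)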

LSTM-Siamese Model Training

# Before running this step, first run the PL Dataloader, TrainModel and Training (Common Code) sections above

hparams = {
    "learning_rate": 1e-3,
    "contrastive_loss_margin": 1.0,
    "train_monitor": "valid_auc",
    "train_mode": "max",
    "early_stop_patience": 5,
    "min_epochs": 5,
    "max_epochs": 100,
    "save_dir": "./models/"
}

train, valid, test, vocab = data_process() 
data_module = DataModule(train, valid, test)
wv_model = load_wv(train, valid, test)
text_encode = LSTM(wv_model, len(vocab), 100)
hparams["encode"] = text_encode
train_model = SiamTrainModel(hparams)

hparams["model"] = train_model
hparams["data_module"] = data_module
main(hparams)
Load [746] nums stopwords successfully!
build vocab: 100%|██████████| 3/3 [00:24<00:00,  8.05s/it]
Build Vocab [42397] successfully!
Save [42397] nums in [vocab.txt] successfully!
/opt/conda/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py:262: UserWarning: Attribute 'encode' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['encode'])`.
  rank_zero_warn(
Global seed set to 1234
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:604: UserWarning: Checkpoint directory /usr/yexiaoju/code/practice_project/jupyter/models exists and is not empty.
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type            | Params
-----------------------------------------------------
0 | encode           | LSTM            | 4.7 M 
1 | contrastive_loss | ContrastiveLoss | 0     
2 | metric_acc       | BinaryAccuracy  | 0     
3 | metric_f1        | BinaryF1Score   | 0     
4 | metric_auc       | BinaryAUROC     | 0     
-----------------------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.846    Total estimated model params size (MB)
 Sanity Checking: 0it [00:00, ?it/s]
 LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
 Test Data Result:
 Testing: 0it [00:00, ?it/s]
 ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
 ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.7702400088310242
        test_auc            0.7702400088310242
         test_f1            0.7962542772293091
        test_loss           0.19331854581832886          
 ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  
[{'test_loss': 0.19331854581832886,
  'test_acc': 0.7702400088310242,
  'test_f1': 0.7962542772293091,
  'test_auc': 0.7702400088310242}]

Task 6: Text matching model (Sentence-BERT)

%pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple

Bert Data Process

Here I try the Sentence-BERT style of extracting sentence features (encoding each sentence separately), rather than the BERT-NSP style of feeding both sentences into one encoder.

from collections import defaultdict

# def data_process(tokenizer):  # BERT-NSP style (both sentences in a single input)
#     train, valid, test = load_lcqmc()
#     data_df = [train, valid, test]
#     for df in data_df:
#         inputs = defaultdict(list)
#         for i, row in tqdm(df.iterrows(), desc="encode {} data".format(k), total=len(df)):
#             seq_a, seq_b, label = row[0], row[1], row[2]
#             try:
#                 inputs_dict = tokenizer.encode_plus(seq_a, seq_b, add_special_tokens=True, \ 
#                                                     return_token_type_ids=True, return_attention_mask=True)
#             except TypeError as ex:
#                 print(row)
                
#             inputs["input_ids"].append(inputs_dict["input_ids"])
#             inputs["token_type_ids"].append(inputs_dict["token_type_ids"])
#             inputs["attention_mask"].append(inputs_dict["attention_mask"])
#             inputs["labels"].append(label)
            
#         df["input_ids"] = inputs["input_ids"]
#         df["token_type_ids"] = inputs["token_type_ids"]
#         df["attention_mask"] = inputs["attention_mask"]
#         df["labels"] = inputs["labels"]
        
#     return train, valid, test

def data_process(tokenizer):  # Sentence-BERT 
    train, valid, test = load_lcqmc()
    data_df = [train, valid, test]
    for df in data_df:
        inputs = defaultdict(list)
        for i, row in tqdm(df.iterrows(), desc="process data", total=len(df)):
            seq_a, seq_b, label = row[0], row[1], row[2]
            try:
                inputs_dict1 = tokenizer(seq_a, 
                                         add_special_tokens=True, 
                                         return_attention_mask=True,
                                         return_tensors="pt", 
                                         padding=True)
                inputs_dict2 = tokenizer(seq_b, 
                                         add_special_tokens=True, 
                                         return_attention_mask=True,
                                         return_tensors="pt", 
                                         padding=True)
            except TypeError as ex:
                print(row)
            inputs["input_ids1"].append(inputs_dict1["input_ids"])
            inputs["attention_mask1"].append(inputs_dict1["attention_mask"])
            inputs["input_ids2"].append(inputs_dict2["input_ids"])
            inputs["attention_mask2"].append(inputs_dict2["attention_mask"])
            inputs["labels"].append(label)
            
        df["input_ids1"] = inputs["input_ids1"]
        df["attention_mask1"] = inputs["attention_mask1"]
        df["input_ids2"] = inputs["input_ids2"]
        df["attention_mask2"] = inputs["attention_mask2"]
        df["labels"] = inputs["labels"]
        
    return train, valid, test
from torch.utils.data import Dataset

class SaimDataset(Dataset):
    def __init__(self, df):
        super(SaimDataset, self).__init__()
        self.input_ids1 = df["input_ids1"]
        self.attention_mask1 = df["attention_mask1"]
        self.input_ids2 = df["input_ids2"]
        self.attention_mask2 = df["attention_mask2"]
        self.labels = df["labels"]
        self.len = len(self.labels)

    def __getitem__(self, idx):
        inputs1 = {"input_ids": self.input_ids1[idx],
                   "attention_mask": self.attention_mask1[idx]}
        inputs2 = {"input_ids": self.input_ids2[idx],
                   "attention_mask": self.attention_mask2[idx]}
        return {"input1": inputs1,
                "input2": inputs2,
                "label": self.labels[idx]}

    def __len__(self):
        return self.len
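
Note that tokenizing each sentence on its own (padding=True only pads within a single call) leaves the per-example input_ids with different lengths, so the default DataLoader collate cannot stack them into a batch. A possible fix is a custom collate_fn that pads to the batch maximum; a sketch assuming the dict keys above (BERT's pad token id is 0):

import torch
from torch.nn.utils.rnn import pad_sequence

def siam_collate_fn(batch):
    # Pad the variable-length BERT inputs of both sides to the longest sequence in the batch.
    def pad_side(side):
        ids = [item[side]["input_ids"].squeeze(0) for item in batch]
        mask = [item[side]["attention_mask"].squeeze(0) for item in batch]
        return {"input_ids": pad_sequence(ids, batch_first=True, padding_value=0),
                "attention_mask": pad_sequence(mask, batch_first=True, padding_value=0)}
    labels = torch.tensor([item["label"] for item in batch])
    return {"input1": pad_side("input1"), "input2": pad_side("input2"), "label": labels}

# e.g. DataLoader(dataset, batch_size=32, collate_fn=siam_collate_fn)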

Sentence-Bert Network Structure

import torch
from torch import nn

class BertSentence(nn.Module):
    def __init__(self, BertModel):
        super(BertSentence, self).__init__()
        self.sen_bert = BertModel

    def forward(self, batch):
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        output = self.sen_bert(input_ids, attention_mask)
        return output[1]  # pooler output: the [CLS] hidden state passed through BERT's pooler
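
The class above uses the pooler output. The Sentence-BERT paper instead recommends mask-aware mean pooling over the last hidden states; a sketch of that variant (an alternative, not what was trained here):

class BertMeanPooling(nn.Module):
    # Mask-aware mean pooling over BERT's last hidden states (Sentence-BERT style).
    def __init__(self, bert_model):
        super().__init__()
        self.sen_bert = bert_model

    def forward(self, batch):
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        last_hidden = self.sen_bert(input_ids, attention_mask)[0]   # (B, L, H)
        mask = attention_mask.unsqueeze(-1).float()                 # (B, L, 1)
        summed = (last_hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts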

Sentence-Bert Model Training

# Before running this step, first run the PL Dataloader, TrainModel and Training (Common Code) sections above
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertModel

hparams = {
    "learning_rate": 1e-3,
    "contrastive_loss_margin": 1.0,
    "train_monitor": "valid_auc",
    "train_mode": "max",
    "early_stop_patience": 5,
    "min_epochs": 5,
    "max_epochs": 100,
    "save_dir": "./models/"
}

model_name = "hfl/chinese-roberta-wwm-ext"
config = BertConfig.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

train, valid, test = data_process(tokenizer)  # the Sentence-BERT data_process returns three DataFrames
data_module = DataModule(train, valid, test)

text_encode = BertSentence(bert_model)
hparams["encode"] = text_encode

train_model = SiamTrainModel(hparams)
hparams["model"] = train_model
hparams["data_module"] = data_module
main(hparams)

Task 7: Text matching model (SimCSE) (not done)

To be added when I have time.
