Fine-tuning a Hugging Face Model (with transformers)

For learning and exchange only; if this infringes on anything, please let me know.

from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
"""
1. Prepare dataset
2. Load pretrained Tokenizer, call it with dataset -> encoding
3. Build PyTorch Dataset with encodings
4. Load pretrained Model
5. a) Load Trainer and train it
   b) or use a native PyTorch training loop (a sketch follows the Trainer example at the end)
"""


model_name = "distilbert-base-uncased"

def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos","neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels

# Large Movie Review Dataset
# http://ai.stanford.edu/~amaas/data/sentiment/
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')
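
The standard aclImdb archive ships 25,000 labeled reviews in each of train/ and test/, so a quick sanity check (my own aside, not in the original post) should print 25000 twice:

assert len(train_texts) == len(train_labels)
print(len(train_texts), len(test_texts))  # 25000 25000 for the standard split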

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, shuffle=True, random_state=42
)

class IMDBDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Wrap each encoded field (input_ids, attention_mask) as a tensor
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
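
Each tokenizer call returns a dict-like BatchEncoding; for DistilBERT it holds input_ids and attention_mask (no token_type_ids). A quick inspection, as an aside:

print(train_encodings.keys())             # dict_keys(['input_ids', 'attention_mask'])
print(len(train_encodings['input_ids']))  # one padded sequence per training text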

train_dataset = IMDBDataset(train_encodings, train_labels)
val_dataset = IMDBDataset(val_encodings, val_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)
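
Each indexed item is a dict of tensors carrying everything the model's forward pass needs, for example:

sample = train_dataset[0]
print(sample.keys())  # dict_keys(['input_ids', 'attention_mask', 'labels'])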

training_args = TrainingArguments(
    output_dir='./results',          # directory for checkpoints and outputs
    num_train_epochs=2,
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    learning_rate=5e-5,
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# The DistilBERT encoder is pretrained; the sequence-classification head on top
# is newly initialized and gets learned during fine-tuning
model = DistilBertForSequenceClassification.from_pretrained(model_name)

trainer = Trainer(
    model=model,                  # the instantiated Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,     # evaluation dataset
)
trainer.train()
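
The script builds test_dataset but never scores it. A minimal sketch of test-set evaluation with Trainer.predict (the accuracy computation is my addition, not part of the original post):

import numpy as np

predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print("test accuracy:", (preds == predictions.label_ids).mean())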

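And for step 5b of the outline, a minimal sketch of the equivalent native PyTorch training loop (the optimizer choice, batch size, and device handling are assumptions mirroring the Trainer settings above):

from torch.utils.data import DataLoader
from torch.optim import AdamW

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DistilBertForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(2):
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)  # the model returns the loss when 'labels' is present
        outputs.loss.backward()
        optimizer.step()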