Feature extraction with a pretrained transformers BERT model and logistic regression

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import torch
import transformers as tfs

train_df = pd.read_csv('./data/train.tsv', delimiter='\t', header=None)  # column 0 = sentence, column 1 = label
train_set = train_df[:3000]   # keep the first 3000 rows as our dataset
tokenizer = tfs.BertTokenizer.from_pretrained('bert-base-uncased')
model = tfs.BertModel.from_pretrained('bert-base-uncased')
inputs = tokenizer(train_set[0].tolist(), padding=True, truncation=True, return_tensors='pt')  # pad/truncate all sentences to a common length
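The tokenizer returns a BatchEncoding whose input_ids, token_type_ids and attention_mask tensors all share the same padded length. A minimal sanity-check sketch (the exact padded length depends on the longest sentence in train_set):

# Inspect the tokenizer output (shapes depend on your data)
print(inputs.keys())               # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
print(inputs['input_ids'].shape)   # torch.Size([3000, L]), L = length of the longest tokenized sentence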

# Extract features: run BERT once over the whole batch, no gradients needed
with torch.no_grad():
    train_last_hidden_states = model(**inputs)

# Take the features from BERT's output
train_features = train_last_hidden_states[0][:, 0, :].numpy()  # [:, 0, :] takes the output vector at the first position of each sequence; that position is [CLS], so its vector should be more representative than the others
train_labels = train_set[1]

# Split into training and validation sets
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels)
# Train a logistic regression classifier on the BERT features
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)
print(lr_clf.score(test_features, test_labels))  # accuracy on the held-out split
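Once the classifier is fitted, new sentences can be scored with the same pipeline: tokenize, run BERT without gradients, take the [CLS] vector, then call lr_clf.predict. A minimal sketch (the example sentences below are made up for illustration):

# Score unseen sentences with the same BERT + logistic-regression pipeline
new_sentences = ["a touching and funny film", "a complete waste of time"]  # hypothetical examples
new_inputs = tokenizer(new_sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    new_outputs = model(**new_inputs)
new_features = new_outputs[0][:, 0, :].numpy()   # [CLS] vectors, shape (2, 768) for bert-base
print(lr_clf.predict(new_features))              # predicted class labels for the new sentences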
