import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import torch
import transformers as tfs
# Load the TSV file (column 0: sentence, column 1: label) and keep the first 3,000 rows
train_df = pd.read_csv('./data/train.tsv', delimiter='\t', header=None)
train_set = train_df[:3000]
# Load the pre-trained BERT tokenizer and model
tokenizer = tfs.BertTokenizer.from_pretrained('bert-base-uncased')
model = tfs.BertModel.from_pretrained('bert-base-uncased')

# Tokenize all sentences in one batch; truncation guards against inputs longer than BERT's 512-token limit
inputs = tokenizer(train_set[0].tolist(), padding=True, truncation=True, return_tensors='pt')
# Run a single forward pass with gradient tracking disabled
with torch.no_grad():
    train_last_hidden_states = model(**inputs)

# Use the final hidden state of the [CLS] token (position 0) as each sentence's feature vector
train_features = train_last_hidden_states[0][:, 0, :].numpy()
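# Optional sanity check (not in the original): bert-base-uncased produces 768-dimensional
# hidden states, so train_features should have shape (3000, 768).
print(train_features.shape)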
train_labels = train_set[1]

# Hold out part of the data for evaluation (train_test_split defaults to a 75%/25% split)
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels)
# Train a logistic regression classifier on the BERT features and report its accuracy on the held-out split
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)
print(lr_clf.score(test_features, test_labels))
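
# A minimal sketch (not part of the original pipeline) of classifying new, hypothetical sentences
# by reusing the tokenizer, BERT model, and trained classifier defined above.
new_sentences = ['a touching and insightful film', 'a dull and lifeless mess']  # hypothetical examples
new_inputs = tokenizer(new_sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    new_hidden_states = model(**new_inputs)
new_features = new_hidden_states[0][:, 0, :].numpy()
print(lr_clf.predict(new_features))  # predicted class labels for the new sentences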