Check the GPU version and availability
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1070
Load the comment data
import pandas as pd

path = './中文文本情感分类/'
comments = pd.read_csv(path + '社交网站评论信息情感分类.csv')

moods = {0: '喜悦', 1: '愤怒', 2: '厌恶', 3: '低落'}

print('文本数量(总体):%d' % comments.shape[0])
for label, mood in moods.items():
    print('文本数量({}):{}'.format(mood, comments[comments.label == label].shape[0]))
文本数量(总体):361744
文本数量(喜悦):199496
文本数量(愤怒):51714
文本数量(厌恶):55267
文本数量(低落):55267
Take a quick look at the dataset
comments[0:5]
   label  review
0      0  啊呀呀!要死啦!么么么!只穿外套就好了,我认为里面那件很多余啊周小伦喜歡 你各種 五角星的...
1      0  嗯……既然大姚通知了……那我也表示下收到……姚,你知道吗?假如外星人入侵地球,只要摧毁我们的...
2      0  风格不一样嘛,都喜欢!最喜欢哪张?
3      0  好呀,试试D .I .Y .去死皮面膜1.将燕麦片加水中浸泡6小时,加入木瓜牛奶搅拌。2.放...
4      0  张老师,谢谢侬的信任!粉丝多少无所谓重在质地近日发现一个现象——他加了你关注,你回加后,他立...
Since there is far too much data, we sample 1,000 examples for each sentiment class here.
df0 = comments.loc[comments.label == 0].sample(1000)[['review', 'label']]
df1 = comments.loc[comments.label == 1].sample(1000)[['review', 'label']]
df2 = comments.loc[comments.label == 2].sample(1000)[['review', 'label']]
df3 = comments.loc[comments.label == 3].sample(1000)[['review', 'label']]
Combine the samples into a new dataset and shuffle it.
df0 = df0.append(df1)
df0 = df0.append(df2)
df0 = df0.append(df3)
df0 = df0.append(df3)   # note: df3 is appended twice here, so label 3 contributes 2,000 rows and the total is 5,000
df0 = df0.sample(frac=1)   # shuffle the rows

len(df0)
5000
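To double-check the class balance after sampling, a quick value_counts() on the label column is enough. This is only a verification sketch; its output is not shown in the original run.

# Count how many rows each sentiment label ended up with after the appends and shuffle
print(df0['label'].value_counts().sort_index())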
Take a quick look at the data
df0[0:5]
        review                                              label
355831  据最新消息,此次火灾造成9人死亡9人受伤。经初步勘察,火灾系售楼处一楼沙盘电路故障引发。            3
348348  上星期买的裙子,今晚穿了,不好看~ ~                                     3
357542  顺便说点什么吧...荷兰,阿姆斯特丹火车站,广播里突然传来耳熟能详的《音乐之声》里的123,...       3
361326  午后,打扮的美美的和喜欢的人坐在咖啡厅,聊聊自己想说的话,或静静坐在一起,什么都不说,只是紧...       3
278616  真的假的?!!!                                                2
Extract the sentences and labels from the dataset
sentences = df0.review.values
labels = df0.label.values
Download the BERT tokenizer
from transformers import BertTokenizer

print('下载 BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
下载 BERT tokenizer...
Take a quick look at the tokenizer
print('原句: ', sentences[0])
print('Tokenize 后的句子: ', tokenizer.tokenize(sentences[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
原句: 据最新消息,此次火灾造成9人死亡9人受伤。经初步勘察,火灾系售楼处一楼沙盘电路故障引发。
Tokenize 后的句子: ['据', '最', '新', '消', '息', ',', '此', '次', '火', '灾', '造', '成', '9', '人', '死', '亡', '9', '人', '受', '伤', '。', '经', '初', '步', '勘', '察', ',', '火', '灾', '系', '售', '楼', '处', '一', '楼', '沙', '盘', '电', '路', '故', '障', '引', '发', '。']
Token IDs: [2945, 3297, 3173, 3867, 2622, 8024, 3634, 3613, 4125, 4135, 6863, 2768, 130, 782, 3647, 767, 130, 782, 1358, 839, 511, 5307, 1159, 3635, 1242, 2175, 8024, 4125, 4135, 5143, 1545, 3517, 1905, 671, 3517, 3763, 4669, 4510, 6662, 3125, 7397, 2471, 1355, 511]
Find the length of the longest sentence
max_len = 0
lengthOfsentence = []

for sent in sentences:
    lengthOfsentence.append(len(sent))
    max_len = max(max_len, len(sent))

print('最长的句子长度为: ', max_len)
最长的句子长度为: 284
type(sentences)
numpy.ndarray
sentences.shape
(5000,)
Based on the plot, most sentences are shorter than 250 characters, so we use max_length = 256 when padding.
import matplotlib.pyplot as plt

plt.plot(lengthOfsentence)
plt.ylabel('sentence length')
plt.show()
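The line plot above only shows raw lengths in sample order; a histogram plus a percentile check makes the "mostly under 250" observation easier to verify. This is an optional sketch, not part of the original notebook.

import numpy as np

lengths = np.array(lengthOfsentence)
# How long are the longest 5% / 1% of sentences?
print('95th / 99th percentile length:', np.percentile(lengths, 95), np.percentile(lengths, 99))

plt.hist(lengths, bins=50)
plt.xlabel('sentence length (characters)')
plt.ylabel('count')
plt.show()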
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,     # add '[CLS]' and '[SEP]'
                        max_length = 256,              # sentences longer than this are truncated
                        pad_to_max_length = True,      # pad every sentence to 256 tokens
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
Take a quick look at the first sentence's token IDs and attention mask
print('原句: ', sentences[0])
print('Token IDs:', input_ids[0])
print('attention_masks:', attention_masks[0])
原句: 国家哀悼日,悲伤无边,情义永在这里有毁灭,有伤痛,有绝望,但还有希望。
Token IDs: tensor([ 101, 1744, 2157, 1500, 2656, 3189, 8024, 2650, 839, 3187, 6804, 8024,
2658, 721, 3719, 1762, 6821, 7027, 3300, 3673, 4127, 8024, 3300, 839,
4578, 8024, 3300, 5318, 3307, 8024, 852, 6820, 3300, 2361, 3307, 511,
102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0])
attention_masks: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
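To confirm the encoding round-trips correctly, the IDs can be decoded back to tokens: the leading 101 and trailing 102 are the [CLS] and [SEP] special tokens, and the zeros are [PAD]. A small verification sketch (not part of the original run):

# Decode the first encoded sentence to check where [CLS]/[SEP]/[PAD] land
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
print(tokens[:40])                                              # padding starts right after [SEP]
print(tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True))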
Build the training and validation datasets
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} 训练数据'.format(train_size))
print('{:>5,} 验证数据'.format(val_size))
4,500 训练数据
500 验证数据
Create the DataLoaders
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = RandomSampler(val_dataset),
            batch_size = batch_size
        )
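As a sanity check, one batch can be pulled from train_dataloader to confirm the tensor shapes are (batch_size, max_length) for the inputs and masks; this sketch is not in the original run.

# Fetch one batch and check shapes: input_ids, attention_mask, labels
batch = next(iter(train_dataloader))
print(batch[0].shape, batch[1].shape, batch[2].shape)   # expected: [16, 256], [16, 256], [16]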
Load the BERT sequence classification model BertForSequenceClassification
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels = 4,
    output_attentions = False,
    output_hidden_states = False,
)

model.cuda()
BertForSequenceClassification(
(bert): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(21128, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
(dropout): Dropout(p=0.1, inplace=False)
(classifier): Linear(in_features=768, out_features=4, bias=True)
)
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
The BERT model has 201 different named parameters.
==== Embedding Layer ====
bert.embeddings.word_embeddings.weight (21128, 768)
bert.embeddings.position_embeddings.weight (512, 768)
bert.embeddings.token_type_embeddings.weight (2, 768)
bert.embeddings.LayerNorm.weight (768,)
bert.embeddings.LayerNorm.bias (768,)
==== First Transformer ====
bert.encoder.layer.0.attention.self.query.weight (768, 768)
bert.encoder.layer.0.attention.self.query.bias (768,)
bert.encoder.layer.0.attention.self.key.weight (768, 768)
bert.encoder.layer.0.attention.self.key.bias (768,)
bert.encoder.layer.0.attention.self.value.weight (768, 768)
bert.encoder.layer.0.attention.self.value.bias (768,)
bert.encoder.layer.0.attention.output.dense.weight (768, 768)
bert.encoder.layer.0.attention.output.dense.bias (768,)
bert.encoder.layer.0.attention.output.LayerNorm.weight (768,)
bert.encoder.layer.0.attention.output.LayerNorm.bias (768,)
bert.encoder.layer.0.intermediate.dense.weight (3072, 768)
bert.encoder.layer.0.intermediate.dense.bias (3072,)
bert.encoder.layer.0.output.dense.weight (768, 3072)
bert.encoder.layer.0.output.dense.bias (768,)
bert.encoder.layer.0.output.LayerNorm.weight (768,)
bert.encoder.layer.0.output.LayerNorm.bias (768,)
==== Output Layer ====
bert.pooler.dense.weight (768, 768)
bert.pooler.dense.bias (768,)
classifier.weight (4, 768)
classifier.bias (4,)
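Beyond listing names and shapes, it can be useful to know the total number of parameters that fine-tuning will update; a quick sketch whose output is not part of the original notebook:

# Total number of trainable parameters in the model
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable parameters: {:,}'.format(total_params))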
Choose the optimizer
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                 )
Set up a learning rate scheduler to adjust the learning rate during training
from transformers import get_linear_schedule_with_warmup

epochs = 4
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
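With num_warmup_steps = 0 this schedule is a straight linear decay from 2e-5 down to 0 over total_steps updates. The sketch below records the values the scheduler would produce, using a throwaway optimizer so the real one is untouched; it assumes a PyTorch version where get_last_lr() is available and is purely illustrative.

# Simulate the linear-decay schedule on a dummy optimizer to visualize the LR curve
dummy_opt = AdamW([torch.nn.Parameter(torch.zeros(1))], lr=2e-5)
dummy_sched = get_linear_schedule_with_warmup(dummy_opt, num_warmup_steps=0,
                                              num_training_steps=total_steps)
lrs = []
for _ in range(total_steps):
    dummy_opt.step()
    dummy_sched.step()
    lrs.append(dummy_sched.get_last_lr()[0])

plt.plot(lrs)
plt.xlabel('training step')
plt.ylabel('learning rate')
plt.show()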
flat_accuracy computes the model's accuracy
import numpy as np

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
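A tiny usage example with made-up logits (not from the model) to show what the function expects: raw logits of shape (batch, num_labels) and an integer label array.

# Two samples, four classes; the argmax of each row is compared against the label
demo_logits = np.array([[0.1, 2.0, 0.3, 0.1],
                        [1.5, 0.2, 0.1, 0.4]])
demo_labels = np.array([1, 0])
print(flat_accuracy(demo_logits, demo_labels))   # 1.0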
format_time formats the elapsed time
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))
Train the model
import os
import random
import numpy as np
from transformers import WEIGHTS_NAME, CONFIG_NAME

output_dir = "./models/"
os.makedirs(output_dir, exist_ok=True)   # make sure the save directory exists
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# fix the random seeds for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()
best_val_accuracy = 0

for epoch_i in range(0, epochs):
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))

    # ---------- training ----------
    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # clip gradients to avoid explosion
        optimizer.step()
        scheduler.step()

        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
        total_train_accuracy += flat_accuracy(logit, label_id)

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)

    print('  训练准确率: {0:.2f}'.format(avg_train_accuracy))
    print('  平均训练损失 loss: {0:.2f}'.format(avg_train_loss))
    print('  训练时间: {:}'.format(training_time))

    # ---------- validation ----------
    t0 = time.time()
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)

        total_eval_loss += loss.item()
        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logit, label_id)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('')
    print('  测试准确率: {0:.2f}'.format(avg_val_accuracy))

    # save the checkpoint whenever the validation accuracy improves
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        torch.save(model.state_dict(), output_model_file)
        model.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(output_dir)

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print('  平均测试损失 Loss: {0:.2f}'.format(avg_val_loss))
    print('  测试时间: {:}'.format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print('训练一共用了 {:} (h:mm:ss)'.format(format_time(time.time() - total_t0)))
Epoch 1 / 4
Batch 40 of 282. Elapsed: 0:00:30.
Batch 80 of 282. Elapsed: 0:01:00.
Batch 120 of 282. Elapsed: 0:01:30.
Batch 160 of 282. Elapsed: 0:02:00.
Batch 200 of 282. Elapsed: 0:02:30.
Batch 240 of 282. Elapsed: 0:03:00.
Batch 280 of 282. Elapsed: 0:03:30.
训练准确率: 0.43
平均训练损失 loss: 1.27
训练时间: 0:03:31
测试准确率: 0.49
平均测试损失 Loss: 1.20
测试时间: 0:00:08
Epoch 2 / 4
Batch 40 of 282. Elapsed: 0:00:30.
Batch 80 of 282. Elapsed: 0:01:00.
Batch 120 of 282. Elapsed: 0:01:30.
Batch 160 of 282. Elapsed: 0:02:01.
Batch 200 of 282. Elapsed: 0:02:31.
Batch 240 of 282. Elapsed: 0:03:01.
Batch 280 of 282. Elapsed: 0:03:31.
训练准确率: 0.54
平均训练损失 loss: 1.09
训练时间: 0:03:32
测试准确率: 0.50
平均测试损失 Loss: 1.20
测试时间: 0:00:08
Epoch 3 / 4
Batch 40 of 282. Elapsed: 0:00:30.
Batch 80 of 282. Elapsed: 0:01:01.
Batch 120 of 282. Elapsed: 0:01:31.
Batch 160 of 282. Elapsed: 0:02:01.
Batch 200 of 282. Elapsed: 0:02:31.
Batch 240 of 282. Elapsed: 0:03:01.
Batch 280 of 282. Elapsed: 0:03:31.
训练准确率: 0.68
平均训练损失 loss: 0.81
训练时间: 0:03:32
测试准确率: 0.53
平均测试损失 Loss: 1.18
测试时间: 0:00:08
Epoch 4 / 4
Batch 40 of 282. Elapsed: 0:00:30.
Batch 80 of 282. Elapsed: 0:01:00.
Batch 120 of 282. Elapsed: 0:01:30.
Batch 160 of 282. Elapsed: 0:02:01.
Batch 200 of 282. Elapsed: 0:02:31.
Batch 240 of 282. Elapsed: 0:03:01.
Batch 280 of 282. Elapsed: 0:03:31.
训练准确率: 0.77
平均训练损失 loss: 0.61
训练时间: 0:03:32
测试准确率: 0.56
平均测试损失 Loss: 1.13
测试时间: 0:00:08
训练一共用了 0:14:40 (h:mm:ss)
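To compare the epochs side by side, the collected training_stats list can be turned into a small DataFrame; a sketch whose table is not shown in the original run:

# Summarize the per-epoch statistics gathered during training
df_stats = pd.DataFrame(training_stats).set_index('epoch')
print(df_stats)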
A quick test
(_, logits) = model(input_ids[-20:].to(device),
                    token_type_ids=None,
                    attention_mask=attention_masks[-20:].to(device),
                    labels=labels[-20:].to(device))

logits = logits.detach().cpu().numpy()
label_ids = labels[-20:].to('cpu').numpy()
acc = flat_accuracy(logits, label_ids)
acc
0.95
pred_flat = np.argmax(logits, axis=1).flatten()
pred_flat
array([3, 1, 3, 3, 1, 2, 3, 3, 3, 3, 3, 3, 1, 3, 2, 3, 0, 2, 0, 0],
dtype=int64)
label_ids
array([3, 1, 3, 3, 1, 2, 3, 3, 3, 3, 3, 2, 1, 3, 2, 3, 0, 2, 0, 0],
dtype=int64)