Kaggle_tweet_emotion_bert_transformers

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import re
import inspect

import tensorflow as tf
from tensorflow import keras
# import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
import datetime
import transformers
from transformers import BertConfig,TFBertPreTrainedModel,BertTokenizer,TFBertMainLayer,TFBertModel



print("tf_version_ : ",tf.__version__)
print("transformers:",transformers.__version__)
tf_version_ :  2.0.0
transformers: 2.5.1
MAX_LENGTH = 36
BATCH_SIZE = 16

#Load data
path_home = r"/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion"
path_data = os.path.join(path_home,"data")
data_train = pd.read_csv(os.path.join(path_data,"train.csv"),encoding="utf-8")
data_test = pd.read_csv(os.path.join(path_data,"test.csv"),encoding="utf-8")
data_submit = pd.read_csv(os.path.join(path_data,"sample_submission.csv"),encoding="utf-8")
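# Quick sanity check on the loaded frames (the row counts, 7613 train and 3263
# test, match the tqdm output further down):
print("train:", data_train.shape, "test:", data_test.shape)
print(data_train.columns.tolist())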

# data_clean
stopwords_english = stopwords.words("english")
# print(stopwords_english)
def cleanword(s):
    s = s.lower()
    # remove URLs
    for deletStr in re.findall(r"http\S*", s):
        if deletStr != "":
            s = s.replace(deletStr, " ")
    # remove @mentions
    for deletStr in re.findall(r"@\S*", s):
        if deletStr != "":
            s = s.replace(deletStr, " ")
    # remove runs of digits
    for deletStr in re.findall(r"\d+", s):
        if deletStr != "":
            s = s.replace(deletStr, " ")
    # remove mojibake sequences starting with \x89 left over from the source encoding
    for deletStr in re.findall(r"\x89\S*", s):
        if deletStr != "":
            s = s.replace(deletStr[:5], " ")

    # replace newlines and punctuation with spaces
    for ch in ["\n", ",", "?", "...", ".", "[", "]", "!", ":", "-",
               "#", "|", "(", ")", ";", "=", ">", "<", "/"]:
        s = s.replace(ch, " ")

    # collapse consecutive spaces and trim
    s_new = re.sub(" +", " ", s).strip()
    
    
    return s_new
data_test['text'] = data_test['text'].apply(cleanword)
data_train['text'] = data_train['text'].apply(cleanword)
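# A quick check of cleanword on a made-up example tweet (not from the dataset):
# URLs, @mentions, digits, and punctuation are dropped and whitespace is collapsed.
print(cleanword("Forest fire near La Ronge, Sask. Canada! http://t.co/abc123 @user"))
# -> forest fire near la ronge sask canada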

# Load the BERT tokenizer and convert text to token IDs

path_bert = "/home/lowry/pro/model/bert_model_h5/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(f"{path_bert}/vocab.txt")

def bert_encode(texts,tokenizer,max_length = MAX_LENGTH):
    input_ids = []
    input_masks = []
    input_segment = []
    for text in tqdm(texts):
        # convert text to token ids
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            max_length = max_length   # tokens beyond max_length are truncated
        )
        input_ids_temp = inputs["input_ids"]               # token ids
        input_masks_temp = inputs["attention_mask"]        # mask: 1 for real tokens, 0 for padding
        input_segment_temp = [0] * max_length              # segment ids: all 0 since there is a single sentence (sentence pairs would use 0 and 1)
        padding_length = max_length - len(input_ids_temp)  # padding length
        input_ids_temp += [0]*padding_length               # pad the ids
        input_masks_temp += [0]*padding_length             # pad the mask
        
        # append to the batch lists
        input_ids.append(input_ids_temp)
        input_masks.append(input_masks_temp)
        input_segment.append(input_segment_temp)
        
    return [
        np.array(input_ids,dtype=np.int32),
        np.array(input_masks,dtype=np.int32),
        np.array(input_segment,dtype=np.int32)
    ]


train_input = bert_encode(data_train.text.values,tokenizer,MAX_LENGTH)
test_input = bert_encode(data_test.text.values,tokenizer,MAX_LENGTH)
train_label = np.array(data_train['target'].tolist(),dtype=np.int32)

# check token lengths: the 99th percentile motivates MAX_LENGTH = 36
word_len = data_train.text.apply(lambda x: len(tokenizer.encode(x)))
print("word_len_percent:", np.percentile(word_len.tolist(), 99))


Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
100%|██████████| 7613/7613 [00:02<00:00, 3689.10it/s]
100%|██████████| 3263/3263 [00:00<00:00, 3741.57it/s]


word_len_percent: 36.0
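# For reference, bert_encode returns three aligned int32 arrays (token ids,
# attention mask, segment ids), each of shape (num_examples, MAX_LENGTH):
print([x.shape for x in train_input])   # [(7613, 36), (7613, 36), (7613, 36)]
print([x.shape for x in test_input])    # [(3263, 36), (3263, 36), (3263, 36)]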
#build model
"""
bert return:

1:last_hidden_state : shape = (batch_size, sequence_length, hidden_size)):

2:pooler_output : shape =  (batch_size, hidden_size)):

3:hidden_states : shape = (1+12) * (batch_size, sequence_length, hidden_size))
                  ps:1+12 = 1 embeddings + 12 layer

"""

class TweetBERT(tf.keras.Model):
    def __init__(self):
        super(TweetBERT,self).__init__()
        config = BertConfig.from_pretrained(f"{path_bert}/config.json",output_hidden_states=True)
        self.hidden_size = config.hidden_size
        self.bert_model = TFBertModel.from_pretrained(f"{path_bert}/tf_model.h5", config=config)
        self.concat = tf.keras.layers.Concatenate(axis=2)
        self.avgpool = tf.keras.layers.GlobalAveragePooling1D()
        self.dropout = tf.keras.layers.Dropout(0.15)
        self.output_ = tf.keras.layers.Dense(1,activation="sigmoid")

        
    def call(self, inputs):
        input_id, input_mask, input_segment = inputs
        # with output_hidden_states=True the model returns (sequence_output, pooler_output, hidden_states)
        sequence_output, pooler_output, hidden_states = self.bert_model(input_id, attention_mask=input_mask, token_type_ids=input_segment)
        # [CLS] vector from each of the last four encoder layers
        h12 = tf.reshape(hidden_states[-1][:,0], (-1, 1, self.hidden_size))
        h11 = tf.reshape(hidden_states[-2][:,0], (-1, 1, self.hidden_size))
        h10 = tf.reshape(hidden_states[-3][:,0], (-1, 1, self.hidden_size))
        h09 = tf.reshape(hidden_states[-4][:,0], (-1, 1, self.hidden_size))

        concat_hidden = self.concat([h12, h11, h10, h09])
        x = self.avgpool(concat_hidden)
#         x = sequence_output[:,0,:]
        x = self.dropout(x)
        x = self.output_(x)
        return x


model = TweetBERT()
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
loss = "binary_crossentropy"
model.compile(loss=loss,optimizer=optimizer,metrics=["accuracy"])

# Train the model and save checkpoints
path_save_model = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion/model/" + "bert-base" + '/'
if not os.path.exists(path_save_model):
    os.mkdir(path_save_model)
path_save_model += "saveModelWeightCheckpoint" 
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath = path_save_model,
    monitor = "val_accuracy",
    mode = "max",
    verbose = 1,
    save_best_only = True,
    save_weights_only = True,
    
)
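# Note: with save_weights_only=True and a filepath without an .h5 suffix, the
# checkpoint is written in TensorFlow checkpoint format, which is what
# model.load_weights() expects further down.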

history = model.fit(
    train_input,
    train_label,
    epochs=3,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks = [checkpoint],
)
WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:
    def call(self,inputs):
        input_id, input_mask,input_segment = inputs
        sequence_output, pooler_output, hidden_states = self.bert_model(input_id,attention_mask=input_mask,token_type_ids=input_segment)
        h12 = tf.reshape(hidden_states[-1][:,0],(-1,1,self.hidden_size))
        h11 = tf.reshape(hidden_states[-2][:,0],(-1,1,self.hidden_size))
        h10 = tf.reshape(hidden_states[-3][:,0],(-1,1,self.hidden_size))
        h09 = tf.reshape(hidden_states[-4][:,0],(-1,1,self.hidden_size))
        
        concat_hidden = self.concat(([h12,h11,h10,h09]))
        x = self.avgpool(concat_hidden)
#         x = sequence_output[:,0,:]
        x = self.dropout(x)
        x = self.output_(x)
        return x

This may be caused by multiline strings or comments not indented at the same level as the code.
Train on 6090 samples, validate on 1523 samples
Epoch 1/3
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_7/bert/pooler/dense/kernel:0', 'tf_bert_model_7/bert/pooler/dense/bias:0'] when minimizing the loss.
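This warning is expected: the classification head only uses the [CLS] vectors from hidden_states, so the BERT pooler dense layer never receives gradients.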
6080/6090 [============================>.] - ETA: 0s - loss: 0.4599 - accuracy: 0.7929
Epoch 00001: val_accuracy improved from -inf to 0.82928, saving model to /home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion/model/bert-base/saveModelWeightCheckpoint
6090/6090 [==============================] - 70s 12ms/sample - loss: 0.4602 - accuracy: 0.7928 - val_loss: 0.3834 - val_accuracy: 0.8293
Epoch 2/3
6080/6090 [============================>.] - ETA: 0s - loss: 0.3485 - accuracy: 0.8546
Epoch 00002: val_accuracy did not improve from 0.82928
6090/6090 [==============================] - 41s 7ms/sample - loss: 0.3483 - accuracy: 0.8548 - val_loss: 0.3990 - val_accuracy: 0.8240
Epoch 3/3
6080/6090 [============================>.] - ETA: 0s - loss: 0.2758 - accuracy: 0.8914
Epoch 00003: val_accuracy did not improve from 0.82928
6090/6090 [==============================] - 41s 7ms/sample - loss: 0.2756 - accuracy: 0.8916 - val_loss: 0.4515 - val_accuracy: 0.8267
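The best validation accuracy (0.82928) came from epoch 1, so those are the weights the checkpoint kept and that get restored below.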
# load_best_model
model = TweetBERT()
model.load_weights(path_save_model)
print(model)
# model.summary()
<__main__.TweetBERT object at 0x7f749ad53b38>
pd.DataFrame(history.history).plot()
plt.show()

[Figure 1: training and validation loss/accuracy curves from history.history]

result = model.predict(test_input)
print(result)
WARNING:tensorflow:Entity > could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of >, which Python reported as:
    def call(self,inputs):
        input_id, input_mask,input_atn = inputs
        sequence_output, pooler_output, hidden_states = self.bert_model(input_id,attention_mask=input_mask,token_type_ids=input_atn)
        h12 = tf.reshape(hidden_states[-1][:,0],(-1,1,self.hidden_size))
        h11 = tf.reshape(hidden_states[-2][:,0],(-1,1,self.hidden_size))
        h10 = tf.reshape(hidden_states[-3][:,0],(-1,1,self.hidden_size))
        h09 = tf.reshape(hidden_states[-4][:,0],(-1,1,self.hidden_size))
        
        concat_hidden = self.concat(([h12,h11,h10,h09]))
        x = self.avgpool(concat_hidden)
#         x = sequence_output[:,0,:]
        x = self.dropout(x)
        x = self.output_(x)
        return x

This may be caused by multiline strings or comments not indented at the same level as the code.
[[0.5131391 ]
 [0.9968698 ]
 [0.98543626]
 ...
 [0.99892104]
 [0.9709744 ]
 [0.98978955]]
# output submission file
# date = datetime.datetime.now().strftime("%Y%m%d")
# path_save_submit = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion/result/"+date+"largebert"+".csv"
# submit = result.round()
# submit = [int(li[0]) for li in submit]
# submit_data = pd.DataFrame({"id":data_test.id,"target":submit})
# submit_data.to_csv(path_save_submit,index=False)

# train_input_1 = [train_input[0][:10],train_input[1][:10],train_input[2][:10]]
# re = model(train_input_1)
# print(re)
# print(re.shape)


