
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import re
import inspect

import tensorflow as tf
from tensorflow import keras
# import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
import datetime
import transformers
from transformers import BertConfig,TFBertPreTrainedModel,BertTokenizer,TFBertMainLayer,TFBertModel

print("tf_version_ : ",tf.__version__)
# Recorded notebook output for the environment this script was run in:
#   tf_version_ :  2.0.0
#   transformers: 2.5.1

#Load data
# All inputs live under <path_home>/data: the labelled training set, the
# unlabelled test set and the sample submission file.
path_home = r"/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion"
path_data = os.path.join(path_home,"data")
data_train = pd.read_csv(os.path.join(path_data,"train.csv"),encoding="utf-8")
data_test = pd.read_csv(os.path.join(path_data,"test.csv"),encoding="utf-8")
data_submit = pd.read_csv(os.path.join(path_data,"sample_submission.csv"),encoding="utf-8")

# data_clean
# English stop-word list taken from the NLTK stopwords corpus.
# NOTE(review): not referenced anywhere else in the visible part of this file.
stopwords_english = stopwords.words("english")
# print(stopwords_english)
# Patterns removed by cleanword(); compiled once because the function runs per row.
_URL_PATTERN = re.compile(r"http\S*")
_MENTION_PATTERN = re.compile(r"@\S*")
_DIGITS_PATTERN = re.compile(r"\d+")
# A "\x89" artifact plus at most four following non-space characters, i.e. the
# first five characters of the run -- the same width the previous code removed.
# NOTE(review): confirm that five characters is the intended width.
_MOJIBAKE_PATTERN = re.compile(r"\x89\S{0,4}")
# Newline and every punctuation mark that is blanked out, each mapped to a space.
_PUNCTUATION_TO_SPACE = str.maketrans({ch: " " for ch in "\n,?.[]!:-#|();=></"})


def cleanword(s):
    """Normalize one tweet into lower-case, space-separated words.

    Steps, in order: lower-case the text; replace URLs (``http...``),
    ``@mentions``, digit runs and ``\\x89`` encoding artifacts with a space;
    replace newlines and a fixed set of punctuation characters with a space;
    collapse all whitespace runs into single spaces and trim the ends.

    Args:
        s: Raw tweet text.

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    s = s.lower()
    # re.sub removes each match where it occurs. The earlier findall/replace
    # approach could leave fragments behind (e.g. "1 12" kept the "2").
    for pattern in (_URL_PATTERN, _MENTION_PATTERN, _DIGITS_PATTERN, _MOJIBAKE_PATTERN):
        s = pattern.sub(" ", s)
    s = s.translate(_PUNCTUATION_TO_SPACE)
    # split() with no argument drops empty pieces, so words stay intact and
    # end up separated by exactly one space.
    return " ".join(s.split())
# Run the same text normalisation over both splits (test first, then train).
for _frame in (data_test, data_train):
    _frame['text'] = _frame['text'].apply(cleanword)

#Load bert config and convert words to token 

# Local directory with the bert-base-uncased files; this file reads vocab.txt,
# config.json and tf_model.h5 from it.
path_bert = "/home/lowry/pro/model/bert_model_h5/bert-base-uncased"
# Tokenizer built from the local vocabulary file.
# NOTE(review): MAX_LENGTH is used below (bert_encode default and its callers)
# but is never assigned in the visible source -- confirm where it is defined.
tokenizer = BertTokenizer.from_pretrained(f"{path_bert}/vocab.txt")

def bert_encode(texts,tokenizer,max_length = MAX_LENGTH):
    """Encode texts into fixed-length BERT inputs.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` here).
        max_length: Length every row is padded to; also passed to the
            tokenizer as ``max_length``.

    Returns:
        A list ``[input_ids, attention_masks, segment_ids]`` of int32 arrays,
        one row per text, in the order ``TweetBERT.call`` unpacks them.
    """
    input_ids = []
    input_masks = []
    input_segment = []
    for text in tqdm(texts):
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            # Tokens beyond this length are dropped by the tokenizer.
            # NOTE(review): relies on the installed transformers version
            # truncating on max_length alone -- confirm when upgrading.
            max_length = max_length,
        )
        ids = inputs["input_ids"]              # token ids
        mask = inputs["attention_mask"]        # 1 for real tokens, 0 for padding
        # Right-pad ids and mask with zeros up to max_length.
        padding_length = max_length - len(ids)
        input_ids.append(ids + [0] * padding_length)
        input_masks.append(mask + [0] * padding_length)
        # Single-sentence input: every position belongs to segment 0
        # (a sentence pair would use 0 and 1).
        input_segment.append([0] * max_length)
    # int32 matches the dtype used for train_label elsewhere in this file.
    return [
        np.array(input_ids, dtype=np.int32),
        np.array(input_masks, dtype=np.int32),
        np.array(input_segment, dtype=np.int32),
    ]

# Encode both splits into [input_ids, attention_masks, segment_ids].
train_input = bert_encode(data_train.text.values,tokenizer,MAX_LENGTH)
test_input = bert_encode(data_test.text.values,tokenizer,MAX_LENGTH)
train_label = np.array(data_train['target'].tolist(),dtype=np.int32)

# Number of token ids the tokenizer produces for each training tweet.
word_len = data_train.text.apply(lambda x : len(tokenizer.encode(x)))

# Recorded notebook output:
#   word_len_percent: 36.0
#build model
# BERT returns:
#
# 1: last_hidden_state : shape = (batch_size, sequence_length, hidden_size)
#
# 2: pooler_output : shape = (batch_size, hidden_size)
#
# 3: hidden_states : (1 + 12) tensors, each of shape (batch_size, sequence_length, hidden_size)
#                    ps: 1 + 12 = 1 embedding output + 12 layers


class TweetBERT(tf.keras.Model):
    """Binary tweet classifier: a BERT encoder followed by a sigmoid unit.

    The position-0 vectors of the last four hidden states are concatenated
    along the feature axis, passed through dropout and mapped to a single
    probability.
    """

    def __init__(self):
        # Keras requires the base initialiser to run before any layer is
        # assigned as an attribute of a subclassed Model.
        super().__init__()
        config = BertConfig.from_pretrained(f"{path_bert}/config.json",output_hidden_states=True)
        self.hidden_size = config.hidden_size
        self.bert_model = TFBertModel.from_pretrained(f"{path_bert}/tf_model.h5", config=config)
        self.concat = tf.keras.layers.Concatenate(axis=2)
        self.avgpool = tf.keras.layers.GlobalAveragePooling1D()
        self.dropout = tf.keras.layers.Dropout(0.15)
        self.output_ = tf.keras.layers.Dense(1,activation="sigmoid")

    def call(self,inputs):
        """Run the forward pass.

        Args:
            inputs: Sequence ``(input_id, input_mask, input_segment)``, as
                produced by ``bert_encode``.

        Returns:
            Output of the final ``Dense(1, activation="sigmoid")`` layer.
        """
        input_id, input_mask, input_segment = inputs
        # The encoder returns (sequence_output, pooler_output, hidden_states);
        # only the per-layer hidden states are used here.
        hidden_states = self.bert_model(
            input_id, attention_mask=input_mask, token_type_ids=input_segment
        )[2]
        # Position-0 vector of each of the last four hidden states, reshaped
        # to (-1, 1, hidden_size) so they can be concatenated on axis 2.
        first_token_states = [
            tf.reshape(hidden_states[layer][:, 0], (-1, 1, self.hidden_size))
            for layer in (-1, -2, -3, -4)
        ]
        concat_hidden = self.concat(first_token_states)
        # The pooled axis has length 1, so this only removes that axis.
        x = self.avgpool(concat_hidden)
        x = self.dropout(x)
        x = self.output_(x)
        return x

model = TweetBERT()
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
loss = "binary_crossentropy"
# The checkpoint below monitors "val_accuracy", so accuracy has to be tracked.
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

path_save_model = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion/model/" + "bert-base" + '/'
# Create the checkpoint directory if it is missing.
os.makedirs(path_save_model, exist_ok=True)
path_save_model += "saveModelWeightCheckpoint" 
# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath = path_save_model,
    monitor = "val_accuracy",
    mode = "max",
    verbose = 1,
    save_best_only = True,
    save_weights_only = True,
)

# NOTE(review): the original fit() arguments were truncated in this file.
# validation_split=0.2 is a placeholder so that "val_accuracy" exists; epochs
# and batch_size fall back to the Keras defaults. Restore the intended values.
history = model.fit(
    train_input,
    train_label,
    validation_split = 0.2,
    callbacks = [checkpoint],
)

# load_best_model
# A fresh model starts from the pretrained encoder and an untrained head, so
# the best checkpoint written during training is restored before predicting.
model = TweetBERT()
model.load_weights(path_save_model)
# model.summary()
data = pd.DataFrame(history.history).plot()


result = model.predict(test_input)

#output submit
# date = datetime.datetime.now().strftime("%Y%m%d")
# path_save_submit = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion/result/"+date+"largebert"+".csv"
# submit = result.round()
# submit = [int(li[0]) for li in submit]
# submit_data = pd.DataFrame({"id":data_test.id,"target":submit})
# submit_data.to_csv(path_save_submit,index=False)

# train_input_1 = [train_input[0][:10],train_input[1][:10],train_input[2][:10]]
# re = model(train_input_1)
# print(re)
# print(re.shape)
