Kaggle是一个数据分析竞赛网站,里面有很多有趣的竞赛和练习。最近刚结束的Tweet Sentiment Extraction竞赛挺有意思:给出推特(tweet)的文本以及情感分类(positive、negative、neutral),需要找出文本中哪些内容支持这个情感分类。例如“Sooo SAD I will miss you here in San Diego!!!”这条推特的分类为negative,其中的“Sooo SAD”就是判断为negative的依据。这个竞赛可以看作NLP中的问答任务,即把tweet文本和情感分类作为上下文,从文本中找出一段词语作为答案。
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import nltk
from nltk.corpus import stopwords
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
# Load the train and test tables; both CSV paths are relative to the
# current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
textID | text | selected_text | sentiment | |
0 | cb774db0d1 | I`d have responded, if I were going | I`d have responded, if I were going | neutral |
1 | 549e992a42 | Sooo SAD I will miss you here in San Diego!!! | Sooo SAD | negative |
2 | 088c60f138 | my boss is bullying me... | bullying me | negative |
3 | 9642c003ef | what interview! leave me alone | leave me alone | negative |
4 | 358bd9e861 | Sons of ****, why couldn`t they put them on t... | Sons of ****, | negative |
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
textID 27481 non-null object
text 27480 non-null object
selected_text 27480 non-null object
sentiment 27481 non-null object
dtypes: object(4)
memory usage: 858.9+ KB
# Number of non-null tweets per sentiment class, largest class first.
temp = (
    train.groupby('sentiment')['text']
    .count()
    .reset_index()
    .sort_values(by='text', ascending=False)
)
sentiment | text | |
1 | neutral | 11117 |
2 | positive | 8582 |
0 | negative | 7781 |
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float between 0.0 and 1.0. When neither string contains a word
        the union is empty, and 0.5 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    # Guard the 0/0 case: two empty inputs score 0.5 by convention here.
    if not a and not b:
        return 0.5
    c = a & b
    # |A union B| = |A| + |B| - |A intersect B|
    return float(len(c)) / (len(a) + len(b) - len(c))
# Score every training row by how much of the tweet its selected_text covers.
# jaccard() calls .lower() on both values, so rows where either column is
# missing cannot be scored and are dropped first.
train = train.dropna(subset=['text', 'selected_text'])
results_jaccard = [
    [text, selected, jaccard(text, selected)]
    for text, selected in zip(train['text'], train['selected_text'])
]
# Bound to jaccard_df (not `jaccard`) so the jaccard() function stays callable.
jaccard_df = pd.DataFrame(results_jaccard, columns=["text", "selected_text", "jaccard_score"])
# With no `on=` given, merge joins on the columns both frames share
# (text, selected_text), attaching jaccard_score to each training row.
train = train.merge(jaccard_df, how='outer')
textID | text | selected_text | sentiment | jaccard_score | |
0 | cb774db0d1 | I`d have responded, if I were going | I`d have responded, if I were going | neutral | 1.000000 |
1 | 549e992a42 | Sooo SAD I will miss you here in San Diego!!! | Sooo SAD | negative | 0.200000 |
2 | 088c60f138 | my boss is bullying me... | bullying me | negative | 0.166667 |
3 | 9642c003ef | what interview! leave me alone | leave me alone | negative | 0.600000 |
4 | 358bd9e861 | Sons of ****, why couldn`t they put them on t... | Sons of ****, | negative | 0.214286 |
# Whitespace-separated word counts of the selected span and of the full
# tweet, followed by the gap between the two counts.
word_count_columns = {'Num_words_ST': 'selected_text', 'Num_word_text': 'text'}
for target_col, source_col in word_count_columns.items():
    train[target_col] = train[source_col].map(lambda value: len(str(value).split()))
train['difference_in_words'] = train['Num_word_text'] - train['Num_words_ST']
# NOTE(review): newer seaborn releases reportedly deprecate `shade=` in favour
# of `fill=` - confirm against the installed version before upgrading.
is_positive = train['sentiment'] == 'positive'
is_negative = train['sentiment'] == 'negative'

# Word counts: selected_text in red, full tweet text in blue.
ax = sns.kdeplot(train['Num_words_ST'], shade=True, color="r")
p1 = ax.set_title('Kernel Distribution of Number Of words')
p1 = sns.kdeplot(train['Num_word_text'], shade=True, color="b")

# Gap between tweet length and selected length: positive in blue, negative in red.
ax = sns.kdeplot(train.loc[is_positive, 'difference_in_words'], shade=True, color="b")
p1 = ax.set_title('Kernel Distribution of Difference in Number Of words')
p2 = sns.kdeplot(train.loc[is_negative, 'difference_in_words'], shade=True, color="r")

# Jaccard score between text and selected_text: positive in blue, negative in red.
ax = sns.kdeplot(train.loc[is_positive, 'jaccard_score'], shade=True, color="b")
p1 = ax.set_title('KDE of Jaccard Scores across different Sentiments')
p2 = sns.kdeplot(train.loc[is_negative, 'jaccard_score'], shade=True, color="r")
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(16.0,6.0), color = 'white',
                   title = None, title_size=40, image_color=False):
    """Draw a word cloud of *text* on a new matplotlib figure.

    Args:
        text: A single string, or an iterable of strings (for example a
            pandas Series of tweets) whose items are joined with spaces.
        mask: Optional image array passed to WordCloud as the drawing mask;
            also the colour source when ``image_color`` is true.
        max_words: Passed to WordCloud as ``max_words``.
        max_font_size: Passed to WordCloud as ``max_font_size``.
        figure_size: ``figsize`` of the matplotlib figure that is created.
        color: Background colour of the cloud.
        title: Title placed on the plot.
        title_size: Font size of the title.
        image_color: When true, recolour the words from ``mask`` using
            ImageColorGenerator; otherwise keep WordCloud's own colours.

    Returns:
        None. The cloud is drawn on the current matplotlib figure.
    """
    # Local name differs from the module-level `stopwords` import on purpose.
    stop_words = set(STOPWORDS) | {'u', "im"}

    # Build one corpus string; str() of a whole Series would only give its
    # truncated repr, so iterables are joined item by item instead.
    if isinstance(text, str):
        corpus = text
    else:
        corpus = " ".join(str(item) for item in text)

    wordcloud = WordCloud(background_color=color,
                          stopwords=stop_words,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          mask=mask)
    # The cloud must be generated before it can be shown or recoloured.
    wordcloud.generate(corpus)

    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                   'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'black',
                                   'verticalalignment': 'bottom'})
    # Pixel axes carry no information for a word cloud.
    plt.axis('off')
# Image whose pixels are used as the word-cloud mask for all three plots.
pos_mask = np.array(Image.open('twitter.jpg'))

# One word cloud per sentiment class; each title names the class it shows.
neutral_sent = train[train['sentiment']=='neutral']
plot_wordcloud(neutral_sent.text,mask=pos_mask,color='white',max_font_size=80,title_size=30,title="WordCloud of Neutral Tweets")

positive_sent = train[train['sentiment']=='positive']
plot_wordcloud(positive_sent.text,mask=pos_mask,color='white',max_font_size=80,title_size=30,title="WordCloud of Positive Tweets")

negative_sent = train[train['sentiment']=='negative']
plot_wordcloud(negative_sent.text,mask=pos_mask,color='white',max_font_size=80,title_size=30,title="WordCloud of Negative Tweets")
# Fixed width of every encoded row (tokens + special tokens + padding).
MAX_LEN = 96
# Directory holding the pretrained model files.
PATH = '../../NLP/models/roberta-base/'
# NOTE(review): the constructor arguments were missing from this call.
# 'vocab.json' / 'merges.txt' are assumed from the Hugging Face file naming
# that build_model() uses for this directory (config.json, tf_model.h5);
# lowercase=True is assumed because the predictions shown for the test set
# are lower-cased. Confirm the file names and both flags before running.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    PATH+'vocab.json',
    PATH+'merges.txt',
    lowercase=True,
    add_prefix_space=True,
)
# Token id placed in the model input to represent each sentiment label
# (presumably the tokenizer's ids for the three label words - verify
# against the vocabulary file).
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
# Encode the training set into fixed-width arrays. Each row is laid out as
#   [0] + text tokens + [2, 2] + [sentiment token] + [2]
# and padded with 1 (the fill value of input_ids). The ids 0 / 2 / 1 are
# presumably RoBERTa's <s> / </s> / <pad> - verify against the vocabulary.
ct = train.shape[0]
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')   # left as all zeros
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')     # one-hot start position
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')       # one-hot end position

# Five positions per row are taken by the special and sentiment tokens.
max_text_tokens = MAX_LEN - 5

# Iterate positionally so the loop does not depend on train having a
# contiguous 0..n-1 index.
rows = zip(train['text'], train['selected_text'], train['sentiment'])
for k, (raw_text, raw_selected, sentiment) in enumerate(rows):
    # Collapse runs of whitespace; the tweet gets a leading space.
    text1 = " "+" ".join(raw_text.split())
    text2 = " ".join(raw_selected.split())

    # Character offset of the selected span, moved back onto the space that
    # precedes it. find() returns -1 when the span is absent; in that case
    # (and for offset 0) the span is treated as starting at the first token.
    idx = text1.find(text2)
    if idx > 0 and text1[idx-1] == " ":
        idx -= 1

    # Over-long tweets are truncated so the row always fits in MAX_LEN.
    text_ids = tokenizer.encode(text1).ids[:max_text_tokens]
    enc2 = tokenizer.encode(text2)

    # Start position = number of tokens before the span, +1 for the leading
    # special token at position 0.
    start_token_idx = 1
    if idx>0:
        start_token_idx = len(tokenizer.encode(text1[0:idx]).ids)+1
    end_token_idx = start_token_idx+len(enc2.ids)-1

    # The prefix and the span are tokenized separately from the full text,
    # so their token counts can overshoot; keep both labels on text tokens.
    last_text_pos = max(len(text_ids), 1)
    start_token_idx = min(start_token_idx, last_text_pos)
    end_token_idx = min(end_token_idx, last_text_pos)

    s_tok = sentiment_id[sentiment]
    row_len = len(text_ids)+5
    input_ids[k,:row_len] = [0] + text_ids + [2,2] + [s_tok] + [2]
    attention_mask[k,:row_len] = 1
    start_tokens[k,start_token_idx] = 1
    end_tokens[k,end_token_idx] = 1
# Encode the test set with the same row layout as the training set:
#   [0] + text tokens + [2, 2] + [sentiment token] + [2], padded with 1.
ct = test.shape[0]
input_ids_t = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((ct,MAX_LEN),dtype='int32')   # left as all zeros

# Five positions per row are taken by the special and sentiment tokens.
max_text_tokens_t = MAX_LEN - 5

# Iterate positionally so the loop does not depend on test having a
# contiguous 0..n-1 index.
for k, (raw_text, sentiment) in enumerate(zip(test['text'], test['sentiment'])):
    # Collapse runs of whitespace; the tweet gets a leading space.
    text1 = " "+" ".join(raw_text.split())
    # Over-long tweets are truncated so the row always fits in MAX_LEN.
    text_ids = tokenizer.encode(text1).ids[:max_text_tokens_t]
    s_tok = sentiment_id[sentiment]
    row_len = len(text_ids)+5
    input_ids_t[k,:row_len] = [0] + text_ids + [2,2] + [s_tok] + [2]
    attention_mask_t[k,:row_len] = 1
def build_model():
    """Build and compile the span-extraction model.

    The model takes three int32 inputs of width MAX_LEN (token ids,
    attention mask, token type ids), feeds them through the pretrained
    TFRobertaModel loaded from PATH, and produces two softmax outputs over
    the MAX_LEN positions: the start position and the end position of the
    answer, in that order.

    Returns:
        A tf.keras Model compiled with categorical cross-entropy and Adam
        (learning rate 3e-5).
    """
    input_shape = (MAX_LEN,)
    ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    att = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    tok = tf.keras.layers.Input(input_shape, dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+'config.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'tf_model.h5',config=config)
    encoder_outputs = bert_model(ids,attention_mask=att,token_type_ids=tok)
    # First element of the encoder output (presumably the per-token hidden
    # states - confirm against the transformers version in use).
    hidden = encoder_outputs[0]

    # The end position of the answer: one score per position, then softmax.
    end_hidden = tf.keras.layers.Dropout(0.1)(hidden)
    end_scores = tf.keras.layers.Dense(1)(end_hidden)
    end_probs = tf.keras.layers.Flatten()(end_scores)
    end_probs = tf.keras.layers.Activation('softmax')(end_probs)

    # The start position of the answer: scored from the encoder output
    # concatenated with the raw end scores.
    start_features = tf.keras.layers.Concatenate()([end_scores,hidden])
    start_scores = tf.keras.layers.Dense(1)(start_features)
    start_probs = tf.keras.layers.Flatten()(start_scores)
    start_probs = tf.keras.layers.Activation('softmax')(start_probs)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[start_probs,end_probs])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float between 0.0 and 1.0. When neither string contains a word
        the union is empty, and 0.5 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    # Guard the 0/0 case: two empty inputs score 0.5 by convention here.
    if not a and not b:
        return 0.5
    c = a & b
    # |A union B| = |A| + |B| - |A intersect B|
    return float(len(c)) / (len(a) + len(b) - len(c))
# 5-fold cross-validation, stratified by sentiment. A fresh model is trained
# per fold; its best weights (lowest val_loss) are saved to roberta-<fold>.h5.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=777)
# Out-of-fold start/end probabilities, filled fold by fold.
val_start = np.zeros((input_ids.shape[0],MAX_LEN))
val_end = np.zeros((input_ids.shape[0],MAX_LEN))
# Plain arrays so rows can be addressed by the positional indices from skf.
fold_texts = train['text'].values
fold_selected = train['selected_text'].values
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):
    print('### Start Fold %i training:'%fold)
    model = build_model()
    weights_path = 'roberta-%i.h5'%(fold)
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_path, monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')
    train_inputs = [input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]]
    train_targets = [start_tokens[idxT,], end_tokens[idxT,]]
    val_inputs = [input_ids[idxV,], attention_mask[idxV,], token_type_ids[idxV,]]
    val_targets = [start_tokens[idxV,], end_tokens[idxV,]]
    # validation_data is what produces the val_loss the checkpoint monitors.
    model.fit(train_inputs, train_targets,
              epochs=3, batch_size=32, verbose=True, callbacks=[sv],
              validation_data=(val_inputs, val_targets))
    # Score the fold with the checkpointed best weights rather than with
    # whatever the last epoch left in memory.
    model.load_weights(weights_path)
    val_start[idxV,],val_end[idxV,] = model.predict(val_inputs)
    val_metric = []
    for k in idxV:
        start = np.argmax(val_start[k])
        end = np.argmax(val_end[k])
        if end>=start:
            # decode() is given a plain list of ints, not a numpy slice.
            selected_text = tokenizer.decode(input_ids[k,start:(end+1)].tolist())
        else:
            # Inconsistent span (end before start): fall back to the whole tweet.
            selected_text = fold_texts[k]
        val_metric.append(jaccard(selected_text, fold_selected[k]))
    val_metric_all = np.mean(val_metric)
    print('Fold %i metric:%f'%(fold, val_metric_all))
### Start Fold 0 training:
Some weights of the model checkpoint at ../../NLP/models/roberta-base/tf_model.h5 were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at ../../NLP/models/roberta-base/tf_model.h5.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.
Train on 21984 samples, validate on 5496 samples
Epoch 1/3
WARNING:tensorflow:Gradients do not exist for variables ['tf_roberta_model/roberta/pooler/dense/kernel:0', 'tf_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_roberta_model/roberta/pooler/dense/kernel:0', 'tf_roberta_model/roberta/pooler/dense/bias:0'] when minimizing the loss.
21952/21984 [============================>.] - ETA: 0s - loss: 2.2670 - activation_1_loss: 1.0392 - activation_loss: 1.2278
Epoch 00001: val_loss improved from inf to 1.76498, saving model to roberta-0.h5
21984/21984 [==============================] - 207s 9ms/sample - loss: 2.2667 - activation_1_loss: 1.0391 - activation_loss: 1.2277 - val_loss: 1.7650 - val_activation_1_loss: 0.8611 - val_activation_loss: 0.9037
Epoch 2/3
21952/21984 [============================>.] - ETA: 0s - loss: 1.8038 - activation_1_loss: 0.8228 - activation_loss: 0.9810
Epoch 00002: val_loss did not improve from 1.76498
21984/21984 [==============================] - 204s 9ms/sample - loss: 1.8032 - activation_1_loss: 0.8225 - activation_loss: 0.9807 - val_loss: 1.7662 - val_activation_1_loss: 0.8408 - val_activation_loss: 0.9254
Epoch 3/3
21952/21984 [============================>.] - ETA: 0s - loss: 1.6722 - activation_1_loss: 0.7702 - activation_loss: 0.9020
Epoch 00003: val_loss improved from 1.76498 to 1.73339, saving model to roberta-0.h5
21984/21984 [==============================] - 206s 9ms/sample - loss: 1.6726 - activation_1_loss: 0.7709 - activation_loss: 0.9017 - val_loss: 1.7334 - val_activation_1_loss: 0.8281 - val_activation_loss: 0.9048
Fold 0 metric:0.701402
### Start Fold 1 training:
### Start Fold 2 training:
### Start Fold 3 training:
### Start Fold 4 training:
# Average the start/end probabilities of the per-fold models on the test set.
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))
n_folds = 5  # must match the number of roberta-<fold>.h5 checkpoints saved during training
for i in range(n_folds):
    # Load fold i's checkpoint; without this every pass would reuse the
    # weights currently in `model` and the average would be of one model.
    model.load_weights('roberta-%i.h5'%i)
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t])
    preds_start += preds[0]/n_folds
    preds_end += preds[1]/n_folds
# Turn the averaged probabilities into one predicted span per test row and
# write the submission file.
preds_text = []
for i, full_text in enumerate(test['text']):
    start = np.argmax(preds_start[i])
    end = np.argmax(preds_end[i])
    if end>=start:
        # decode() is given a plain list of ints, not a numpy slice.
        selected_text = tokenizer.decode(input_ids_t[i, start:(end+1)].tolist()).strip()
    else:
        # Inconsistent span (end before start): fall back to this row's full tweet.
        selected_text = full_text
    preds_text.append(selected_text)
test['selected_text'] = preds_text
test[['textID', 'selected_text']].to_csv('submission.csv', index=False)
textID | text | sentiment | selected_text | |
0 | f87dea47db | Last session of the day http://twitpic.com/67ezh | neutral | last session of the day http://twitpic.com/67ezh |
1 | 96d74cb729 | Shanghai is also really exciting (precisely -... | positive | exciting |
2 | eee518ae67 | Recession hit Veronique Branquinho, she has to... | negative | such a shame! |
3 | 01082688c6 | happy bday! | positive | happy bday! |
4 | 33987a8ee5 | http://twitpic.com/4w75p - I like it!! | positive | i like it!! |
5 | 726e501993 | that`s great!! weee!! visitors! | positive | that`s great!! |
6 | 261932614e | I THINK EVERYONE HATES ME ON HERE lol | negative | hates |
7 | afa11da83f | soooooo wish i could, but im in school and my... | negative | blocked |
8 | e64208b4ef | and within a short time of the last clue all ... | neutral | and within a short time of the last clue all o... |
9 | 37bcad24ca | What did you get? My day is alright.. haven`... | neutral | what did you get? my day is alright.. haven`t ... |
10 | 24c92644a4 | My bike was put on hold...should have known th... | negative | argh total bummer |
11 | 43b390b336 | I checked. We didn`t win | neutral | i checked. we didn`t win |
12 | 69d6b5d93e | .. and you`re on twitter! Did the tavern bore... | neutral | .. and you`re on twitter! did the tavern bore ... |
13 | 5c1e0b61a1 | I`m in VA for the weekend, my youngest son tur... | negative | it makes me kinda sad, |
14 | 504e45d9d9 | Its coming out the socket I feel like my phon... | negative | i feel like my phones hole is not a virgin. th... |
15 | ae93ad52a0 | So hot today =_= don`t like it and i hate my ... | negative | i hate my new timetable, having such a bad week |
16 | 9fce30159a | Miss you | negative | miss you |
17 | 00d5195223 | Cramps . . . | negative | cramps |
18 | 33f19050cf | you guys didn`t say hi or answer my questions... | positive | nice songs. |
19 | f7718b3c23 | I`m going into a spiritual stagnentation, its ... | neutral | i`m going into a spiritual stagnentation, its ... |