Competition link: https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview
Public notebook: https://www.kaggle.com/code/mustafakeser4/feedback-eda/notebook
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
pd.options.display.width = 180
pd.options.display.max_colwidth = 120
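A quick check that the competition files are where the notebook expects them (a minimal sketch, assuming the standard Kaggle input layout):
# walk the default Kaggle input directory; print only a few names per folder
for dirname, _, filenames in os.walk('../input/feedback-prize-effectiveness'):
    for filename in filenames[:5]:
        print(os.path.join(dirname, filename))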
train = pd.read_csv("../input/feedback-prize-effectiveness/train.csv")
test = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
sample_sub = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
print('train:')
display(train)
print('test:')
display(test)
print('sample_sub:')
display(sample_sub)
train.describe()
def count_string(x):
    return len(x.split())
train['word_count'] = train.discourse_text.apply(count_string)
test['word_count'] = test.discourse_text.apply(count_string)
plt.figure(figsize = (10,4))
sns.countplot(data = train,x = 'discourse_type',hue = 'discourse_effectiveness')
def pieplot(df, col, title, explodeplace=0):
    """
    df: DataFrame
    col: column to count
    title: title for the plot
    explodeplace: index of the slice to explode
    """
    labels = df[col].value_counts().index
    sizes = df[col].value_counts()
    # initialize the explode offsets
    # https://vimsky.com/examples/usage/matplotlib-axes-axes-pie-in-python.html
    explode = np.zeros(len(sizes))
    explode[explodeplace] = 0.1
    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    ax1.set_title(title)
    plt.show()
    print(df[col].value_counts())
Examine in turn: the distribution of effectiveness ratings in the training set, the distribution of discourse types in the training set, and the distribution of discourse types in the test set (the test set carries no effectiveness labels):
pieplot(train,"discourse_effectiveness","train discourse_effectiveness",explodeplace=1)
pieplot(train,"discourse_type","train discourse_type",explodeplace=0)
pieplot(test,"discourse_type","test discourse_type",explodeplace=0)
plt.figure(figsize = (10,5))
sns.histplot(data = train,x = 'word_count',hue = 'discourse_effectiveness',kde = True)
plt.title('distribution of word counts (per discourse element)')
train_txt_list = os.listdir("../input/feedback-prize-effectiveness/train")
test_txt_list = os.listdir("../input/feedback-prize-effectiveness/test")
train_word_count = []
for txt in tqdm(train_txt_list):
    with open("../input/feedback-prize-effectiveness/train/" + txt, 'r') as f:
        data = f.read().replace('\n', '')
    train_word_count.append(len(data.split()))
test_word_count = []
for txt in tqdm(test_txt_list):
    with open("../input/feedback-prize-effectiveness/test/" + txt, "r") as f:
        data = f.read().replace('\n', '')
    test_word_count.append(len(data.split()))
# map essay id (filename minus its .txt extension) to the essay's word count
train_count_dict = dict(zip([f[:-4] for f in train_txt_list], train_word_count))
test_count_dict = dict(zip([f[:-4] for f in test_txt_list], test_word_count))
Word counts for each full essay:
essay_count_df = pd.DataFrame(data = train_count_dict.values(),index = train_count_dict.keys(),columns = ['word_count'])
essay_count_df
plt.figure(figsize = (10,5))
sns.histplot(data = essay_count_df,x = 'word_count',kde = True)
plt.title("distribution of word counts (based essays)");
essay_ids = train.essay_id.unique().tolist()
first_sentence_word_count = []
first_sentence_effectiveness = []
for ids in tqdm(essay_ids):
    A = train[train.essay_id == ids]
    first_sentence_word_count.append(A.iloc[0]['word_count'])
    first_sentence_effectiveness.append(A.iloc[0]["discourse_effectiveness"])
essay_count_df['first_discourse'] = first_sentence_word_count
essay_count_df["first_sentence_effectiveness"] = first_sentence_effectiveness
essay_count_df.head()
plt.figure(figsize = (10,5))
sns.scatterplot(data = essay_count_df,y = 'word_count',x="first_discourse",hue="first_sentence_effectiveness")
plt.xlabel("first discourse word count")
plt.ylabel("essay word count");
_, ax = plt.subplots(ncols=2, nrows=1, figsize=(16, 5))
train.groupby("discourse_effectiveness").mean(numeric_only=True).T.plot(kind='bar', ax=ax[0])
ax[0].set_title("average word count by discourse effectiveness")
train.groupby('discourse_type').mean(numeric_only=True).T.plot(kind="bar", ax=ax[1])
ax[1].set_title("average word count by discourse type");
import nltk
from nltk.corpus import stopwords
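If the stopword corpus is not already installed (it ships with the Kaggle image), it can be fetched on first use; a minimal guard:
# download the NLTK stopword list only if it is missing locally
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')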
print(stopwords.words('english'))
def generate_ngrams(text, stop_words, n_gram=1):
    # tokenize, drop empty tokens and stopwords, then slide an n-gram window
    token = [token for token in text.lower().split(' ')
             if token != '' and token not in stop_words]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [' '.join(ngram) for ngram in ngrams]
#generate_ngrams('I love you for ever.',['a'],2)
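Uncommenting a call like that shows what the helper returns; here is a hand-checked example, treating 'for' as a stopword:
generate_ngrams('I love you for ever.', ['for'], n_gram=2)
# -> ['i love', 'love you', 'you ever.']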
grams = ["unigrams","bigrams","trigrams"]
type_effectiveness = ['All','Adequate', 'Ineffective', 'Effective']
stop_words = stopwords.words('english')
import warnings
warnings.filterwarnings('ignore')
# build n-gram frequency tables per effectiveness class (dicts instead of locals())
texts = {}
ngram_counts = {}
for name in type_effectiveness:
    if name == 'All':
        texts[name] = train.discourse_text
    else:
        texts[name] = train[train.discourse_effectiveness == name].discourse_text
    # lowercase and strip punctuation before building n-grams
    cleaned = texts[name].str.lower().str.replace(r'[^\w\s]', '', regex=True)
    for i, ngram in enumerate(grams):
        ngram_counts[f"{name}_{ngram}"] = pd.Series(
            generate_ngrams(' '.join(cleaned.to_list()), stop_words, n_gram=i + 1)
        ).value_counts()
top = 25
for ngram in grams:
    # trigrams: skip the single most frequent entry and widen spacing for long labels
    if ngram != 'trigrams':
        sl = 0
        a = 0
    else:
        sl = 1
        a = 2.5
    fig, axs = plt.subplots(ncols=4, nrows=1, figsize=(16, int(top / 4)), sharey=False)
    plt.subplots_adjust(hspace=0.5, wspace=1 * a)
    sns.barplot(y=ngram_counts[f"All_{ngram}"].index[sl:top], x=ngram_counts[f"All_{ngram}"].values[sl:top], palette='magma_r', ax=axs[0])
    axs[0].set_xlabel('All_frequency')
    axs[0].set_ylabel(ngram)
    sns.barplot(y=ngram_counts[f"Adequate_{ngram}"].index[sl:top], x=ngram_counts[f"Adequate_{ngram}"].values[sl:top], palette='viridis_r', ax=axs[1])
    axs[1].set_xlabel("Adequate_frequency")
    axs[1].set_ylabel(ngram)
    sns.barplot(y=ngram_counts[f"Ineffective_{ngram}"].index[sl:top], x=ngram_counts[f"Ineffective_{ngram}"].values[sl:top], palette='magma_r', ax=axs[2])
    axs[2].set_xlabel("Ineffective_frequency")
    axs[2].set_ylabel(ngram)
    sns.barplot(y=ngram_counts[f"Effective_{ngram}"].index[sl:top], x=ngram_counts[f"Effective_{ngram}"].values[sl:top], palette='viridis_r', ax=axs[3])
    axs[3].set_xlabel("Effective_frequency")
    axs[3].set_ylabel(ngram)
    fig.suptitle(f"TOP {top} {ngram}")
    plt.show()
def word_cloud(is_df=False, df=None, title_plot_df=' ', is_essay=True,
               essay_id='007ACE74B050', discourse_id='0013cc385424'):
    """
    is_df: True if the text comes from df (overrides the essay_id and discourse_id options)
    df: text column of a DataFrame
    title_plot_df: plot title used when is_df is True
    is_essay: True -> word cloud of the whole essay text | False -> word cloud of one discourse text
    """
    path = "../input/feedback-prize-effectiveness/train/"
    if is_df:
        data = ' '.join(df.to_list())
    else:
        if is_essay:
            with open(path + essay_id + '.txt', 'r') as f:
                data = f.read().replace('\n', '')
        else:
            data = train[train['discourse_id'] == discourse_id]['discourse_text'].iloc[0]
    wordcloud = WordCloud(relative_scaling=0.1, colormap="cividis", prefer_horizontal=0.5,
                          background_color="white", mode="RGB").generate(data)
    plt.figure(figsize=(12, 8), dpi=90)
    plt.imshow(wordcloud, interpolation="antialiased", aspect=0.5)
    plt.axis('off')
    plt.margins(x=0, y=0)
    if is_df:
        plt.title(f" {title_plot_df} ")
    else:
        if is_essay:
            plt.title(f"essay id: {essay_id} word cloud")
        else:
            plt.title(f"discourse_id: {discourse_id} word cloud")
    plt.show()
word_cloud(is_df=True, df=train[train.discourse_effectiveness == 'Adequate'].discourse_text,
           title_plot_df="Adequate word cloud")
word_cloud(is_df=True, df=train[train.discourse_effectiveness == 'Ineffective'].discourse_text,
           title_plot_df="Ineffective word cloud")
word_cloud(is_df=True, df=train[train.discourse_effectiveness == 'Effective'].discourse_text,
           title_plot_df="Effective word cloud")
word_cloud(is_essay=True,essay_id="007ACE74B050",discourse_id="0013cc385424")
word_cloud(is_essay=False,essay_id="007ACE74B050",discourse_id="0013cc385424")
Next, fit a separate TF-IDF vectorizer on the discourse texts of each effectiveness rating (plus one on all of them), train a logistic-regression model on top of each, and average the four sets of predicted probabilities:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# encode the three effectiveness labels once; classes_ end up sorted alphabetically
y = le.fit_transform(train.discourse_effectiveness)
# one fixed train/validation split shared by all four models
X_train, X_test, y_train, y_test = train_test_split(
    train.discourse_text.to_list(), y, test_size=0.1, random_state=0)
preds = []
models = {}
vectorizers = {}
for name in ['All', 'Adequate', 'Effective', 'Ineffective']:
    # the vocabulary comes from one class's texts (or from all of them)
    vec = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words)
    if name == 'All':
        vec.fit(train.discourse_text.to_list())
    else:
        vec.fit(train.discourse_text[train.discourse_effectiveness == name].to_list())
    Xtr = vec.transform(X_train)
    Xval = vec.transform(X_test)
    testvec = vec.transform(test.discourse_text.to_list())
    lgsc = LogisticRegression(max_iter=600, penalty='l2', C=1.015)
    lgsc.fit(Xtr, y_train)
    pred = lgsc.predict_proba(Xval)
    print(f"\n{name} logloss train: {log_loss(y_train, lgsc.predict_proba(Xtr))}, logloss val: {log_loss(y_test, pred)}")
    preds.append(lgsc.predict_proba(testvec))
    models[name] = lgsc
    vectorizers[name] = vec
# report on the last fitted model ('Ineffective' vocabulary)
print(classification_report(y_train, np.argmax(lgsc.predict_proba(Xtr), axis=1)))
sns.heatmap(confusion_matrix(y_train, np.argmax(lgsc.predict_proba(Xtr), axis=1)),
            annot=True, fmt='.0f', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("predicted")
plt.ylabel("actual")
plt.title('train confusion matrix')
predtest = np.mean(preds, axis=0)
# write probabilities by class name so the column order matches the sample submission
for i, cls in enumerate(le.classes_):
    sample_sub[cls] = predtest[:, i]
sample_sub.to_csv("submission.csv",index=False)
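A final sanity check (each model's probability rows sum to 1, so their average should too):
# averaged class probabilities should still sum to ~1 in every row
assert np.allclose(sample_sub.iloc[:, 1:].sum(axis=1), 1.0)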
sample_sub