小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分

比赛链接:https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview
公开代码:https://www.kaggle.com/code/mustafakeser4/feedback-eda/notebook

导包加载数据

import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image

# Widen pandas console rendering so display() shows full discourse texts.
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# Load the competition CSVs (Kaggle input layout).
train = pd.read_csv("../input/feedback-prize-effectiveness/train.csv")
test = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
sample_sub = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")

# display() is the IPython/Jupyter rich-output builtin (notebook-only).
print('train:')
display(train)
print('test:')
display(test)
print('sample_sub:')
display(sample_sub)

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第1张图片
小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第2张图片
小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第3张图片

# Quick summary of train; at this point all columns are text, so describe()
# reports count/unique/top/freq rather than numeric stats.
train.describe()

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第4张图片
统计每类语篇要素(discourse_type)下,各评分等级(discourse_effectiveness)的数量分布

def count_string(x):
    """Return the number of whitespace-separated tokens in *x*."""
    tokens = x.split()
    return len(tokens)

# Per-discourse word counts; used by the length-distribution plots below.
train['word_count'] = train.discourse_text.apply(count_string)
test['word_count'] = test.discourse_text.apply(count_string)


# Count of each discourse_type, split by effectiveness rating.
# NOTE(review): seaborn was already imported above as sns; this re-import is
# harmless but redundant.
import seaborn as sns
plt.figure(figsize = (10,4))
sns.countplot(data = train,x = 'discourse_type',hue = 'discourse_effectiveness')

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第5张图片


def pieplot(df, col, title, explodeplace=0):
    """Draw a pie chart of the value counts of *col* in *df*.

    df: DataFrame to summarise.
    col: column whose value counts are plotted.
    title: axes title.
    explodeplace: index of the slice to pull out of the pie.
    """
    counts = df[col].value_counts()
    # Pull the chosen slice 10% away from the centre; the rest stay put.
    # https://vimsky.com/examples/usage/matplotlib-axes-axes-pie-in-python.html
    offsets = np.zeros(len(counts))
    offsets[explodeplace] = 0.1
    fig1, ax1 = plt.subplots()
    ax1.pie(
        counts,
        explode=offsets,
        labels=counts.index,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
    )
    # Equal aspect ratio keeps the pie circular.
    ax1.axis('equal')
    ax1.set_title(title)

    plt.show()
    print(df[col].value_counts())
    

分别考察:训练集整体评分分布、训练集整体问题类型分布、测试集整体评分分布

# Label distribution in train, and discourse_type mix in train vs. test.
pieplot(train,"discourse_effectiveness","train discourse_effectiveness",explodeplace=1)
pieplot(train,"discourse_type","train discourse_type",explodeplace=0)
pieplot(test,"discourse_type","test discourse_type",explodeplace=0)

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第6张图片
小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第7张图片
统计每个评分等级下,语篇要素文本(discourse_text)的词数分布

# Word-count distribution per discourse row, coloured by effectiveness.
plt.figure(figsize = (10,5))
sns.histplot(data = train,x = 'word_count',hue = 'discourse_effectiveness',kde = True)
plt.title('distribution of word counts (based df row)')

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第8张图片

# Word counts for the full essay files (one .txt per essay on disk).
train_txt_list = os.listdir("../input/feedback-prize-effectiveness/train")
test_txt_list = os.listdir("../input/feedback-prize-effectiveness/test")
train_word_count = []

for txt in tqdm(train_txt_list):
    with open("../input/feedback-prize-effectiveness/train/"+txt,'r') as f:
        data = f.read().replace('\n','')
    train_word_count.append(len(data.split()))

test_word_count = []
for txt in tqdm(test_txt_list):
    with open("../input/feedback-prize-effectiveness/test/"+txt, "r") as f:
        data = f.read().replace('\n','')
    test_word_count.append(len(data.split()))

# Map essay_id (filename minus ".txt") -> essay word count.
# NOTE(review): os.listdir order is OS-dependent; downstream code must align
# on these keys rather than assume any particular ordering.
train_count_dict = dict(zip([f[:-4] for f in train_txt_list],train_word_count))
test_count_dict = dict(zip([f[:-4] for f in test_txt_list],test_word_count))

每篇文章的字数统计

# One row per essay, indexed by essay_id, in the (arbitrary) listdir order.
essay_count_df = pd.DataFrame(data = train_count_dict.values(),index = train_count_dict.keys(),columns = ['word_count'])
essay_count_df

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第9张图片
文章的长度分布

# Distribution of whole-essay lengths.
plt.figure(figsize = (10,5))
sns.histplot(data = essay_count_df,x = 'word_count',kde = True)
plt.title("distribution of word counts (based essays)");

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第10张图片

# Attach the first discourse element's word count and effectiveness rating
# to each essay row in essay_count_df.
#
# BUG FIX: the original appended values in the order essays appear in
# `train`, then assigned the bare lists onto `essay_count_df`, which is
# indexed in os.listdir order — the two orders are not guaranteed to match,
# silently misaligning essays with someone else's first-discourse stats.
# Aligning on the essay_id index makes the join explicit and correct, and
# replaces the O(essays x rows) boolean-filter loop with one pass.
essay_ids = train.essay_id.unique().tolist()  # kept for compatibility

# First occurrence per essay_id == the essay's first discourse row.
_first_rows = train.drop_duplicates(subset="essay_id", keep="first").set_index("essay_id")

essay_count_df['first_discourse'] = _first_rows['word_count'].reindex(essay_count_df.index)
essay_count_df["first_sentence_effectiveness"] = _first_rows["discourse_effectiveness"].reindex(essay_count_df.index)
essay_count_df.head()

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第11张图片
首个语篇要素的长度与整篇文章长度的关系(按首要素的评分着色):评分较低的要素文本往往偏长

# Essay length vs. first-discourse length, coloured by the first discourse's
# effectiveness rating.
plt.figure(figsize = (10,5))
sns.scatterplot(data = essay_count_df,y = 'word_count',x="first_discourse",hue="first_sentence_effectiveness")
plt.xlabel("first discourse word count")
plt.ylabel("essay word count");

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第12张图片

# Mean word_count grouped by effectiveness (left) and by discourse type (right).
_, ax = plt.subplots(ncols=2, nrows=1, figsize=(16, 5))
# numeric_only=True: pandas >= 2.0 raises TypeError when mean() hits the text
# columns that older pandas used to drop silently.
train.groupby("discourse_effectiveness").mean(numeric_only=True).T.plot(kind='bar', ax=ax[0])
# Fixed copy-paste bug: this panel is grouped by effectiveness, not type.
ax[0].set_title("average word count for discourse effectiveness")
train.groupby('discourse_type').mean(numeric_only=True).T.plot(kind="bar", ax=ax[1])
ax[1].set_title("average word count for discourse type ")

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第13张图片

停用词

# NLTK English stop-word list (requires the 'stopwords' corpus to have been
# downloaded, e.g. via nltk.download('stopwords'); preinstalled on Kaggle).
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第14张图片

统计每个评分等级下,语篇文本的高频 n-gram(unigram / bigram / trigram)

def generate_ngrams(text, stop_words, n_gram=1):
    """Return the n-grams of *text* as a list of space-joined strings.

    Tokenisation lowercases, splits on single spaces, and drops empty
    tokens and any token found in *stop_words* before forming n-grams.
    """
    words = []
    for word in text.lower().split(' '):
        if word and word not in stop_words:
            words.append(word)
    # n_gram shifted copies zipped together yield consecutive windows.
    shifted = [words[offset:] for offset in range(n_gram)]
    return [' '.join(window) for window in zip(*shifted)]

#generate_ngrams('I love you for ever.',['a'],2)

# Build frequency tables named like "All_unigrams", "Adequate_bigrams", ...
# for every (effectiveness class, n-gram order) pair.  At module scope
# locals() is globals(), which is why these locals()[...] assignments persist
# and are readable by the plotting loop below.
grams = ["unigrams","bigrams","trigrams"]
type_effectiveness = ['All','Adequate', 'Ineffective', 'Effective']
stop_words = stopwords.words('english')
import warnings
warnings.filterwarnings('ignore')
for name in type_effectiveness:
    for i,ngram in enumerate(grams):
        # Select either all discourse texts or only those with this rating.
        if name == 'All':
            locals()[name] = train.discourse_text
        else:
            locals()[name] = train[train.discourse_effectiveness == name].discourse_text
        # NOTE(review): this sets an item labelled "mytext_new" on a Series
        # and reads it back via attribute access — fragile; also str.replace
        # without regex=True warns/changes on newer pandas. Confirm behaviour
        # on the target pandas version.
        locals()[name]["mytext_new"] = locals()[name].str.lower().str.replace('[^\w\s]','')
        # value_counts of the (i+1)-grams over the concatenated cleaned text.
        locals()[name+'_'+ngram] = pd.Series(generate_ngrams(' '.join(locals()[name].mytext_new.to_list()),stop_words, n_gram=i+1)).value_counts()

# Plot the TOP-25 n-grams per effectiveness class: one figure per n-gram
# order, four panels (All / Adequate / Ineffective / Effective) each,
# reading the All_* / Adequate_* / ... tables built above.
name = 'unigrams'  # NOTE(review): leftover — never used below.
top = 25

for ngram in grams:
    # For trigrams, skip the first entry (sl=1) and widen panel spacing (a);
    # presumably the top trigram label is too long to fit — confirm.
    if ngram != 'trigrams':
        sl = 0
        a = 0
    else:
        sl = 1
        a = 2.5
    fig,axs = plt.subplots(ncols = 4,nrows = 1,figsize=(16,int(top/4)),sharey = False)
    plt.subplots_adjust(hspace = 0.5,wspace = 1*a)
    sns.barplot(y=locals()[f"All_{ngram}"].index[sl:top],x = locals()[f"All_{ngram}"].values[sl:top],palette = 'magma_r',ax = axs[0])
    axs[0].set_xlabel('All_frequency')
    axs[0].set_ylabel(ngram)
    sns.barplot(y=locals()[f"Adequate_{ngram}"].index[sl:top], x = locals()[f"Adequate_{ngram}"].values[sl:top],palette='viridis_r',ax=axs[1])
    axs[1].set_xlabel("Adequate_frequency")
    axs[1].set_ylabel(ngram)
    sns.barplot(y=locals()[f"Ineffective_{ngram}"].index[sl:top], x = locals()[f"Ineffective_{ngram}"].values[sl:top],palette='magma_r',ax=axs[2])
    axs[2].set_xlabel("Ineffective_frequency")
    axs[2].set_ylabel(ngram)
    sns.barplot(y=locals()[f"Effective_{ngram}"].index[sl:top], x = locals()[f"Effective_{ngram}"].values[sl:top],palette='viridis_r',ax=axs[3])
    axs[3].set_xlabel("Effective_frequency")
    axs[3].set_ylabel(ngram)
    fig.suptitle(f"TOP {top} {ngram}")
    plt.show()

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第15张图片
小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第16张图片
小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第17张图片

词云图

def word_cloud(is_df = False,df = None,title_plot_df = ' ',is_essay = True,essay_id = '007ACE74B050',discourse_id = '0013cc385424'):
    """Render a word cloud from one of three text sources.

    is_df: if True, use the text Series passed in *df* (the essay/discourse
        id arguments are then ignored).
    df: text column (Series of str) used when is_df is True.
    title_plot_df: plot title used when is_df is True.
    is_essay: when is_df is False — True reads the whole essay file for
        *essay_id*; False uses the single train row matching *discourse_id*.
    essay_id / discourse_id: lookup keys for the two non-df modes.
    """
    path = "../input/feedback-prize-effectiveness/train/"
    if is_df:
        data = ' '.join(df.to_list())
    else:
        if is_essay:
            with open(path + essay_id + '.txt', 'r') as f:
                data = f.read().replace('\n', '')
        else:
            # BUG FIX: the original indexed the filtered Series with [0],
            # which is a *label* lookup and raises KeyError unless the
            # matching row happens to carry index label 0. .iloc[0] takes
            # the first match positionally.
            data = train[train['discourse_id'] == discourse_id]['discourse_text'].iloc[0]
    wordcloud = WordCloud(relative_scaling=0.1, colormap="cividis", prefer_horizontal=0.5,
                          background_color="white", mode="RGB").generate(data)
    plt.figure(figsize=(12, 8), dpi=90)
    plt.imshow(wordcloud, interpolation="antialiased", aspect=0.5)
    plt.axis('off')
    plt.margins(x=0, y=0)
    if is_df:
        plt.title(f" {title_plot_df} ")
    elif is_essay:
        plt.title(f"essay id: {essay_id} word cloud")
    else:
        plt.title(f"discourse_id: {discourse_id} word cloud")
    # Show unconditionally; the original only called show() on the non-df
    # path, relying on notebook auto-display elsewhere.
    plt.show()

# Word cloud over all "Adequate" discourse texts (the essay/discourse id
# arguments are ignored because is_df=True).
word_cloud(is_df=True, df=train[train.discourse_effectiveness=='Adequate'].discourse_text,
           title_plot_df="Adequate word cloud" 
                ,is_essay=True,essay_id="007ACE74B050"
                ,discourse_id="0013cc385424")

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第18张图片

# Word cloud over all "Ineffective" discourse texts.
word_cloud(is_df=True, df=train[train.discourse_effectiveness=='Ineffective'].discourse_text,
           title_plot_df="Ineffective word cloud" 
                ,is_essay=True,essay_id="007ACE74B050"
                ,discourse_id="0013cc385424")

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第19张图片

# Word cloud over all "Effective" discourse texts.
word_cloud(is_df=True, df=train[train.discourse_effectiveness== 'Effective'].discourse_text,
           title_plot_df="Effective word cloud" 
                ,is_essay=True,essay_id="007ACE74B050"
                ,discourse_id="0013cc385424")

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第20张图片

# Word cloud of one whole essay file (is_df defaults to False).
word_cloud(is_essay=True,essay_id="007ACE74B050",discourse_id="0013cc385424")

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第21张图片

# Word cloud of a single discourse row, looked up by discourse_id.
word_cloud(is_essay=False,essay_id="007ACE74B050",discourse_id="0013cc385424")

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第22张图片

训练

分别用全量语料及各评分类别的语篇文本拟合 TF-IDF 词表,各训练一个逻辑回归,最后对测试集概率取平均

import tensorflow_hub as hub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss,accuracy_score,classification_report,confusion_matrix
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# Train one TF-IDF + LogisticRegression model per vocabulary source ('All'
# uses every discourse text; the other three fit the vocabulary only on texts
# with that effectiveness label) and collect test-set probabilities for a
# simple mean ensemble.
#
# Cleanup: the original fitted a TfidfVectorizer over train+test before the
# loop and constructed extra throwaway vectorizers inside it — all of them
# were immediately overwritten and never used. One construction + one fit
# per iteration is equivalent and avoids the wasted (expensive) fits.
le = LabelEncoder()
preds = []
models = {}
vectorizers = {}
for name in ['All','Adequate', 'Effective', 'Ineffective']:
    # Labels and split are identical each iteration (fixed random_state),
    # so every model sees the same train/validation rows.
    X = train.discourse_text.to_list()
    y = le.fit_transform(train.discourse_effectiveness.astype(str))
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=0)
    vec = TfidfVectorizer(ngram_range=(1,2),stop_words=stop_words)
    if name=='All':
        vec.fit(train.discourse_text.to_list())
    else:
        vec.fit(train.discourse_text[train.discourse_effectiveness==name].to_list())
    X_train = vec.transform(X_train)
    X_test = vec.transform(X_test)
    testvec = vec.transform(test.discourse_text.to_list())
    lgsc = LogisticRegression(max_iter=600,penalty='l2',C=1.015)
    lgsc.fit(X_train,y_train)
    pred=lgsc.predict_proba(X_test)
    print(f"\n{name} logloss train: {log_loss(y_train,lgsc.predict_proba(X_train))}, logloss val: {log_loss(y_test,pred)}")
    predtest = lgsc.predict_proba(testvec)
    preds.append(predtest)
    models[name] = lgsc
    vectorizers[name] = vec
    

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第23张图片

# Report for the *last*-trained model (Ineffective vocabulary) evaluated on
# its own training fold — scores here are optimistic by construction.
print(classification_report(y_train,np.argmax(lgsc.predict_proba(X_train),axis=1)))

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第24张图片

# Train-fold confusion matrix for the last-trained model, with the label
# encoder's class names on both axes.
sns.heatmap(confusion_matrix(y_train,np.argmax(lgsc.predict_proba(X_train),axis=1)),annot=True,fmt='.0f',xticklabels=le.classes_,yticklabels=le.classes_)
plt.xlabel("predicted")
plt.ylabel("actual")
plt.title('train confusion matrix')

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第25张图片

提交

# Ensemble: unweighted mean of the four models' test probabilities.
predtest = np.mean(preds,axis=0)

# NOTE(review): assumes sample_sub's probability columns are in the same
# order as LabelEncoder's classes_ (alphabetical: Adequate, Effective,
# Ineffective) — confirm against the sample submission header.
sample_sub.iloc[:,1:]=predtest

sample_sub.to_csv("submission.csv",index=False)
sample_sub

小黑跟师叔们一起这次一定要拿牌,先跑一下别人的EDA代码:议论文评分_第26张图片

你可能感兴趣的:(小黑数据科学成长笔记,python,开发语言)