# Python可视化分析球员裁判数据(一)


from __future__ import absolute_import,division,print_function
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pda
import os ,sys
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
sns.set_context("poster",font_scale=1.3)
import missingno as msno
import pandas_profiling
from sklearn.datasets import make_blobs
import time
# Load the gzip-compressed red-card dataset.
data = pda.read_csv("redcard.csv.gz", compression="gzip")
# Dimensions as (rows, columns).
print(data.shape)
# Column names.
print(data.columns)
# First 20 rows.
print(data.head(20))
# General descriptive statistics, transposed so each source column is a row.
print(data.describe().T)
# dtype of every column.
print(data.dtypes)
# Mean height computed two ways: over all rows (a player is weighted by the
# number of rows they appear in) and over per-player means (each player
# counted once).
print(data["height"].mean())
print(np.mean(data.groupby("playerShort").height.mean()))

# Split the dataset into several independent tables.
player_index = "playerShort"
player_cols = ["birthday", "height", "weight", "position", "photoID", "rater1", "rater2"]

# For every player, count the distinct values seen in each player-level column.
all_cols_unque_player = data.groupby("playerShort").agg(
    {col: "nunique" for col in player_cols}
)
print(all_cols_unque_player)
# Show players that have more than one distinct value in ANY column.
# Masking with `> 1` and then calling dropna() would keep only players whose
# every column is inconsistent, hiding players with a single bad column.
print(all_cols_unque_player[(all_cols_unque_player > 1).any(axis=1)].head())

def get_subgrop(dataframe,g_index,g_columns):
    """Collapse ``dataframe`` to one row per ``g_index`` group.

    Every column in ``g_columns`` is reduced with ``max``. Before that, the
    number of distinct values per group is counted for those same columns;
    if any group holds more than one distinct value in any of them, a
    warning is printed, because ``max`` is then picking one of several
    differing values.

    Args:
        dataframe: Source pandas DataFrame.
        g_index: Column name, or list of column names, to group by.
        g_columns: Columns to carry into the result.

    Returns:
        DataFrame indexed by ``g_index`` with one column per entry of
        ``g_columns``, holding the per-group maximum.
    """
    grouped = dataframe.groupby(g_index)
    # Check the columns that are actually being extracted, not a module-level list.
    distinct_counts = grouped.agg({col: "nunique" for col in g_columns})

    # A single cell above 1 is enough to break the "one value per group"
    # assumption, so test for any offending cell rather than fully offending rows.
    if (distinct_counts > 1).any().any():
        print("Waring:you probably assumed this had all unique values but it doesn't")
    return grouped.agg({col: "max" for col in g_columns})



def save_subgroup(dataframe,g_index,subgroup_name,prefix="raw_"):
    """Write ``dataframe`` to a gzip-compressed CSV and verify the round trip.

    The file name is ``prefix + subgroup_name + ".csv.gz"`` and is printed
    before writing. The file is then read back with ``g_index`` as its index
    column(s) and compared to ``dataframe`` via ``DataFrame.equals``; the
    outcome of that comparison is printed.

    Args:
        dataframe: Table to persist.
        g_index: Index column name(s) used when reading the file back.
        subgroup_name: Middle part of the output file name.
        prefix: Leading part of the output file name.
    """
    target_path = "".join((prefix, subgroup_name, ".csv.gz"))
    print(target_path)

    csv_options = {"compression": "gzip", "encoding": "UTF-8"}
    dataframe.to_csv(target_path, **csv_options)
    reloaded = pda.read_csv(target_path, index_col=g_index, **csv_options)

    # Report whether the reloaded table matches what was written.
    verdict = "保存正确" if dataframe.equals(reloaded) else "保存异常"
    print(verdict)

# Player table: one row per playerShort carrying the player-level columns.
players = get_subgrop(data, g_index=player_index, g_columns=player_cols)
print(players.head())
save_subgroup(players, g_index=player_index, subgroup_name="players")

# Clubs: one row per club with the country of its league.
club_index = "club"
club_cols = ["leagueCountry"]
clubs = get_subgrop(data, g_index=club_index, g_columns=club_cols)
print(clubs.head())
# Number of clubs per league country.
print(clubs.leagueCountry.value_counts())

save_subgroup(clubs, g_index=club_index, subgroup_name="clubs")
# Referees: one row per refNum with the referee's country code.
referee_index = "refNum"
referee_cols = ["refCountry"]

referees = get_subgrop(data, g_index=referee_index, g_columns=referee_cols)
print(referees.head())

# Number of distinct referee countries.
print(referees["refCountry"].nunique())

print(referees.tail())
print(referees.shape)
save_subgroup(referees, g_index=referee_index, subgroup_name="referees")

# Countries: one row per refCountry with its country-level columns.
country_index = "refCountry"
country_cols = [
    "Alpha_3",
    "meanIAT",
    "nIAT",
    "seIAT",
    "meanExp",
    "nExp",
    "seExp",
]

countries = get_subgrop(data, g_index=country_index, g_columns=country_cols)
print(countries.head())

# Rename the Alpha_3 column to countryName.
rename_columns = {"Alpha_3": "countryName"}
countries = countries.rename(columns=rename_columns)
print(countries.head())
print(countries.shape)
# Referee-player pairs (dyads): one row per (refNum, playerShort) combination.
dyad_index = ["refNum", "playerShort"]
dyad_cols = [
    "games",
    "victories",
    "ties",
    "defeats",
    "goals",
    "yellowCards",
    "yellowReds",
    "redCards",
]

dyads = get_subgrop(data, g_index=dyad_index, g_columns=dyad_cols)
print(dyads.head())
print(dyads.shape)
# Pairs in which the referee gave the player more than one red card.
more_than_one_red = dyads["redCards"] > 1
print(dyads[more_than_one_red].head(10))
save_subgroup(dyads, g_index=dyad_index, subgroup_name="dyads")
# Largest red-card count recorded for any single referee-player pair.
print(dyads["redCards"].max())

# 你可能感兴趣的:(Python)