Python数据分析-FIFA2018球员数据分析

#!/usr/bin/env python
# coding: utf-8
# TI=FIFA2018球员数据分析
# 明确分析目的
# 	运动员数量前十名的国家,以及平均身价
# 	各大联赛运动员数量,以及球员平均身价
# 	各俱乐部的平均周薪
# 	英超联赛English Premier League各个俱乐部球员的平均周薪
# 	球员年龄分布情况,不同年龄段平均身价分布
# 引入使用的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the data file
df = pd.read_csv('./FIFA_2018_player.csv')
# Inspect which columns the data has and what their dtypes are
df.info()
# There are 17994 rows in total; league and club have missing values

RangeIndex: 17994 entries, 0 to 17993
Data columns (total 12 columns):
ID             17994 non-null int64
name           17994 non-null object
full_name      17994 non-null object
nationality    17994 non-null object
league         17741 non-null object
club           17741 non-null object
age            17994 non-null int64
birth_date     17994 non-null object
height_cm      17994 non-null float64
weight_kg      17994 non-null float64
eur_value      17994 non-null float64
eur_wage       17994 non-null float64
dtypes: float64(4), int64(2), object(6)
memory usage: 1.6+ MB
# Preview the first rows of the raw data
df.head()
ID name full_name nationality league club age birth_date height_cm weight_kg eur_value eur_wage
0 20801 Cristiano Ronaldo C. Ronaldo dos Santos Aveiro Portugal Spanish Primera División Real Madrid CF 32 1985-02-05 185.0 80.0 95500000.0 565000.0
1 158023 L. Messi Lionel Messi Argentina Spanish Primera División FC Barcelona 30 1987-06-24 170.0 72.0 105000000.0 565000.0
2 190871 Neymar Neymar da Silva Santos Jr. Brazil French Ligue 1 Paris Saint-Germain 25 1992-02-05 175.0 68.0 123000000.0 280000.0
3 176580 L. Suárez Luis Suárez Uruguay Spanish Primera División FC Barcelona 30 1987-01-24 182.0 86.0 97000000.0 510000.0
4 167495 M. Neuer Manuel Neuer Germany German Bundesliga FC Bayern Munich 31 1986-03-27 193.0 92.0 61000000.0 230000.0
# Summary statistics for the numeric columns of the raw data
df.describe()
ID age height_cm weight_kg eur_value eur_wage
count 17994.000000 17994.000000 17994.000000 17994.000000 1.799400e+04 17994.000000
mean 207791.796543 25.120151 181.271980 75.400856 2.370511e+06 11503.834612
std 32328.527723 4.617428 6.690392 6.994824 5.347250e+06 23050.661073
min 16.000000 16.000000 155.000000 49.000000 0.000000e+00 0.000000
25% 192621.250000 21.000000 177.000000 70.000000 3.000000e+05 2000.000000
50% 214186.000000 25.000000 181.000000 75.000000 7.000000e+05 4000.000000
75% 231615.750000 28.000000 186.000000 80.000000 2.000000e+06 12000.000000
max 241489.000000 47.000000 205.000000 110.000000 1.230000e+08 565000.000000
df.count()
# league and club have missing values (fewer non-null entries than the other columns)
ID             17994
name           17994
full_name      17994
nationality    17994
league         17741
club           17741
age            17994
birth_date     17994
height_cm      17994
weight_kg      17994
eur_value      17994
eur_wage       17994
dtype: int64
# For this analysis only some of the columns are needed (they could also have
# been selected at load time):
# ID, nationality, league, club, age, eur_value, eur_wage.
# The data describes FIFA 2018, so age is used as given and birth_date is dropped.
# .copy() makes the subset an independent frame, so modifying it in place
# later on does not raise SettingWithCopyWarning.
df = df[['ID', 'nationality', 'league', 'club', 'age', 'eur_value', 'eur_wage']].copy()
df
ID nationality league club age eur_value eur_wage
0 20801 Portugal Spanish Primera División Real Madrid CF 32 95500000.0 565000.0
1 158023 Argentina Spanish Primera División FC Barcelona 30 105000000.0 565000.0
2 190871 Brazil French Ligue 1 Paris Saint-Germain 25 123000000.0 280000.0
3 176580 Uruguay Spanish Primera División FC Barcelona 30 97000000.0 510000.0
4 167495 Germany German Bundesliga FC Bayern Munich 31 61000000.0 230000.0
... ... ... ... ... ... ... ...
17989 237463 England English League One Scunthorpe United 17 50000.0 1000.0
17990 11728 England English League Two Wycombe Wanderers 47 0.0 1000.0
17991 231381 Scotland English League Two Swindon Town 17 60000.0 1000.0
17992 238813 England English League Two Crewe Alexandra 18 60000.0 1000.0
17993 238308 Ghana English League One Scunthorpe United 18 50000.0 1000.0

17994 rows × 7 columns

# Show the rows whose league is missing
df[df.league.isnull()]
ID nationality league club age eur_value eur_wage
163 188152 Brazil NaN NaN 25 0.0 0.0
168 184826 Portugal NaN NaN 28 0.0 0.0
271 177413 Belgium NaN NaN 28 0.0 0.0
480 176733 Sweden NaN NaN 30 0.0 0.0
494 169195 Brazil NaN NaN 29 0.0 0.0
... ... ... ... ... ... ... ...
17267 234509 India NaN NaN 29 0.0 0.0
17486 234508 India NaN NaN 20 0.0 0.0
17489 223760 India NaN NaN 24 0.0 0.0
17511 233526 India NaN NaN 22 0.0 0.0
17568 231057 New Zealand NaN NaN 20 0.0 0.0

253 rows × 7 columns

# The rows without a league also have no club, and their value and wage are 0.
# Deletion is normally left until later, but since all four of these fields
# are abnormal the rows can be removed right away.
league_missing = df.league.isnull()
df.drop(df[league_missing].index, inplace=True)
# Check the per-column counts after the deletion
df.count()
ID             17741
nationality    17741
league         17741
club           17741
age            17741
eur_value      17741
eur_wage       17741
dtype: int64
# Overall summary of the integer and floating-point columns
df.describe()
ID age eur_value eur_wage
count 17741.000000 17741.000000 1.774100e+04 17741.000000
mean 207756.835522 25.088552 2.404317e+06 11667.887943
std 32421.331072 4.616413 5.377693e+06 23173.181633
min 16.000000 16.000000 0.000000e+00 1000.000000
25% 192621.000000 21.000000 3.250000e+05 2000.000000
50% 214175.000000 25.000000 7.000000e+05 4000.000000
75% 231624.000000 28.000000 2.100000e+06 12000.000000
max 241489.000000 47.000000 1.230000e+08 565000.000000
# On inspection, the minimum of eur_value looks wrong: 0.000000e+00
# Filter the data to see how many rows have eur_value == 0.000000e+00
df[df['eur_value'] == 0.000000e+00].count()
ID             6
nationality    6
league         6
club           6
age            6
eur_value      6
eur_wage       6
dtype: int64
# There are 6 rows with eur_value == 0.000000e+00
df[df['eur_value'] == 0.000000e+00]
ID nationality league club age eur_value eur_wage
2199 3665 France French Ligue 1 ES Troyes AC 40 0.0 16000.0
3105 17605 Belgium Belgian First Division A Club Brugge KV 40 0.0 14000.0
3272 176900 Colombia Colombian Primera A Asociacion Deportivo Cali 40 0.0 2000.0
7734 148745 Norway Norwegian Eliteserien Sogndal 41 0.0 2000.0
17628 149727 England Rep. Ireland Premier Division St. Patrick's Athletic 37 0.0 1000.0
17990 11728 England English League Two Wycombe Wanderers 47 0.0 1000.0
# Fill the zero player values with the column mean (the mean is taken over
# the column as it currently is, zeros included).
# The result is assigned back instead of calling replace(..., inplace=True)
# on the df['eur_value'] selection: that is chained in-place modification,
# which may act on a temporary object and does not update df when pandas
# Copy-on-Write is active. Assigning through a filtered frame, e.g.
# df[df['eur_value'] == 0].loc[:, 'eur_value'] = ..., raises a warning for
# the same reason.
df['eur_value'] = df['eur_value'].replace(0, df['eur_value'].mean())
df.describe()
# The minimum of eur_value is no longer 0, so the fill succeeded
ID age eur_value eur_wage
count 17741.000000 17741.000000 1.774100e+04 17741.000000
mean 207756.835522 25.088552 2.405130e+06 11667.887943
std 32421.331072 4.616413 5.377511e+06 23173.181633
min 16.000000 16.000000 1.000000e+04 1000.000000
25% 192621.000000 21.000000 3.250000e+05 2000.000000
50% 214175.000000 25.000000 7.000000e+05 4000.000000
75% 231624.000000 28.000000 2.100000e+06 12000.000000
max 241489.000000 47.000000 1.230000e+08 565000.000000
# Check for rows that are complete duplicates and show them if there are any
df[df.duplicated()]
ID nationality league club age eur_value eur_wage
# Check a specific column (ID) for duplicated values
df[df['ID'].duplicated()]
ID nationality league club age eur_value eur_wage
# Look through the category counts for league names that do not make sense
df['league'].value_counts()
Argentinian Superliga            780
English Championship             717
English League One               668
English Premier League           654
Spanish Segunda División         637
English League Two               633
Italian Serie B                  625
USA Major League Soccer          625
Spanish Primera División         602
French Ligue 1                   598
Italian Serie A                  559
Colombian Primera A              552
French Ligue 2                   543
German Bundesliga                537
Japanese J1 League               519
Mexican Liga MX                  518
German 3. Liga                   515
German 2. Bundesliga             510
Portuguese Primeira Liga         509
Turkish Süper Lig                502
Holland Eredivisie               488
Russian Premier League           449
Belgian First Division A         436
Polish Ekstraklasa               418
Saudi Professional League        411
Norwegian Eliteserien            393
Swedish Allsvenskan              389
Danish Superliga                 365
Korean K League Classic          336
Scottish Premiership             321
Chilian Primera División         320
Campeonato Brasileiro Série A    320
Rep. Ireland Premier Division    288
Swiss Super League               263
Austrian Bundesliga              259
Australian A-League              236
Greek Super League               111
South African PSL                 56
Czech Liga                        28
Finnish Veikkausliiga             27
Ukrainian Premier League          24
Name: league, dtype: int64
# Data cleaning is finished; start the analysis
# Total number of samples
df.count()
ID             17741
nationality    17741
league         17741
club           17741
age            17741
eur_value      17741
eur_wage       17741
dtype: int64
# Statistical indicators for the numeric columns
df.describe()
ID age eur_value eur_wage
count 17741.000000 17741.000000 1.774100e+04 17741.000000
mean 207756.835522 25.088552 2.405130e+06 11667.887943
std 32421.331072 4.616413 5.377511e+06 23173.181633
min 16.000000 16.000000 1.000000e+04 1000.000000
25% 192621.000000 21.000000 3.250000e+05 2000.000000
50% 214175.000000 25.000000 7.000000e+05 4000.000000
75% 231624.000000 28.000000 2.100000e+06 12000.000000
max 241489.000000 47.000000 1.230000e+08 565000.000000
# Top ten countries by number of players
nationality_data = df.groupby('nationality', as_index=False)  # rows grouped by country
# Count per group, keep the country and ID columns, and label the ID count
# as ath_count
nat_count = (
    nationality_data.count()[['nationality', 'ID']]
    .rename(columns={'ID': 'ath_count'})
)
# Sort by player count, descending, and keep the first ten
nat_head10 = nat_count.sort_values('ath_count', ascending=False).head(10)
nat_head10
nationality ath_count
44 England 1631
57 Germany 1147
135 Spain 1020
53 France 966
5 Argentina 962
18 Brazil 803
75 Italy 800
29 Colombia 591
78 Japan 471
105 Netherlands 430
# Top ten countries by number of players, together with their mean player value
# Aggregate only eur_value: with as_index=False the result already carries the
# 'nationality' key column, and pandas >= 2.0 no longer drops non-numeric
# columns from mean() automatically.
nat_val_mean = nationality_data['eur_value'].mean()
# rename() returns a new frame, so the result has to be assigned for the
# column to actually be called val_mean
nat_val_mean = nat_val_mean.rename(columns={'eur_value': 'val_mean'})
# Left merge keeps the ten countries in their ranking order
nat_head10_val_mean = pd.merge(nat_head10, nat_val_mean, on='nationality', how='left')
nat_head10_val_mean
nationality ath_count eur_value
0 England 1631 1.425410e+06
1 Germany 1147 2.609010e+06
2 Spain 1020 4.465897e+06
3 France 966 3.314264e+06
4 Argentina 962 2.900120e+06
5 Brazil 803 4.001071e+06
6 Italy 800 2.681325e+06
7 Colombia 591 1.719068e+06
8 Japan 471 8.067091e+05
9 Netherlands 430 3.002930e+06
# Number of players and mean player value per league (same approach as for
# the countries)
league_data = df.groupby('league', as_index=False)
# Player count per league. Rows stay in groupby (alphabetical) order; chain
# .sort_values('ath_count', ascending=False) onto this assignment if a ranking
# is wanted -- calling it without assigning the result has no effect.
league_count = league_data.count()[['league', 'ID']].rename(columns={'ID': 'ath_count'})
# Aggregate only eur_value: with as_index=False the result already carries the
# 'league' key column, and pandas >= 2.0 no longer drops non-numeric columns
# from mean() automatically.
lea_val_mean = league_data['eur_value'].mean().rename(columns={'eur_value': 'val_mean'})
lea_val_mean = pd.merge(league_count, lea_val_mean, on='league', how='left')
lea_val_mean
league ath_count val_mean
0 Argentinian Superliga 780 1.453788e+06
1 Australian A-League 236 6.848941e+05
2 Austrian Bundesliga 259 7.276062e+05
3 Belgian First Division A 436 1.956719e+06
4 Campeonato Brasileiro Série A 320 2.249016e+06
5 Chilian Primera División 320 2.238234e+06
6 Colombian Primera A 552 9.465567e+05
7 Czech Liga 28 2.141250e+06
8 Danish Superliga 365 7.188767e+05
9 English Championship 717 1.831032e+06
10 English League One 668 4.875075e+05
11 English League Two 633 2.926687e+05
12 English Premier League 654 9.091483e+06
13 Finnish Veikkausliiga 27 2.940741e+05
14 French Ligue 1 598 5.188201e+06
15 French Ligue 2 543 8.015930e+05
16 German 2. Bundesliga 510 1.238333e+06
17 German 3. Liga 515 4.530777e+05
18 German Bundesliga 537 7.702849e+06
19 Greek Super League 111 3.808333e+06
20 Holland Eredivisie 488 2.171250e+06
21 Italian Serie A 559 7.292030e+06
22 Italian Serie B 625 8.344480e+05
23 Japanese J1 League 519 6.472736e+05
24 Korean K League Classic 336 8.854911e+05
25 Mexican Liga MX 518 2.025782e+06
26 Norwegian Eliteserien 393 6.056217e+05
27 Polish Ekstraklasa 418 7.070096e+05
28 Portuguese Primeira Liga 509 3.506257e+06
29 Rep. Ireland Premier Division 288 1.683310e+05
30 Russian Premier League 449 2.679788e+06
31 Saudi Professional League 411 8.512287e+05
32 Scottish Premiership 321 9.114486e+05
33 South African PSL 56 1.150893e+06
34 Spanish Primera División 602 9.257550e+06
35 Spanish Segunda División 637 1.508854e+06
36 Swedish Allsvenskan 389 6.417095e+05
37 Swiss Super League 263 1.134202e+06
38 Turkish Süper Lig 502 2.961036e+06
39 USA Major League Soccer 625 1.484424e+06
40 Ukrainian Premier League 24 8.283750e+06
# Mean weekly wage per club
club_data = df.groupby('club', as_index=False)
# Aggregate only eur_wage: a frame-wide mean() would also try to average the
# text columns (nationality, league), which pandas >= 2.0 rejects with a
# TypeError instead of silently skipping them.
club_wage_mean = club_data['eur_wage'].mean().rename(columns={'eur_wage': 'wage_mean'})
# Highest-paying clubs first
club_wage_mean = club_wage_mean.sort_values('wage_mean', ascending=False)
club_wage_mean
club wage_mean
219 FC Barcelona 194666.666667
466 Real Madrid CF 170821.428571
222 FC Bayern Munich 123384.615385
330 Juventus 122000.000000
377 Manchester United 109030.303030
... ... ...
97 Bray Wanderers 1000.000000
425 PAOK Thessaloniki 1000.000000
88 Bohemian FC 1000.000000
263 Finn Harps 1000.000000
578 Tigres FC 1000.000000

647 rows × 2 columns

# Restrict the data to the English Premier League
is_epl = df['league'] == 'English Premier League'
EPL_data = df[is_epl]
EPL_data.describe()
ID age eur_value eur_wage
count 654.000000 654.000000 6.540000e+02 654.000000
mean 196333.365443 24.711009 9.091483e+06 57840.978593
std 37997.349392 4.769797 1.222195e+07 50627.145927
min 2147.000000 16.000000 6.000000e+04 2000.000000
25% 183551.250000 20.000000 9.125000e+05 17000.000000
50% 201840.000000 25.000000 5.000000e+06 48000.000000
75% 222604.000000 28.000000 1.137500e+07 82000.000000
max 241384.000000 38.000000 9.050000e+07 325000.000000
# Mean weekly wage of the players at each English Premier League club
EPL_club = EPL_data.groupby('club', as_index=False)
# Aggregate only eur_wage: a frame-wide mean() would also try to average the
# text columns (nationality, league), which pandas >= 2.0 rejects with a
# TypeError instead of silently skipping them.
EPL_club_wage_mean = EPL_club['eur_wage'].mean().rename(columns={'eur_wage': 'wage_mean'})
# Highest-paying clubs first
EPL_club_wage_mean = EPL_club_wage_mean.sort_values('wage_mean', ascending=False)
EPL_club_wage_mean
club wage_mean
11 Manchester United 109030.303030
4 Chelsea 105181.818182
10 Manchester City 95787.878788
0 Arsenal 91121.212121
9 Liverpool 83250.000000
6 Everton 76484.848485
16 Tottenham Hotspur 69218.750000
19 West Ham United 61818.181818
13 Southampton 51181.818182
15 Swansea City 43878.787879
8 Leicester City 43875.000000
18 West Bromwich Albion 42516.129032
14 Stoke City 41093.750000
17 Watford 40848.484848
12 Newcastle United 40000.000000
1 Bournemouth 38303.030303
5 Crystal Palace 35181.818182
3 Burnley 33666.666667
2 Brighton & Hove Albion 30454.545455
7 Huddersfield Town 23181.818182
# Age distribution of the players (counts per age bucket)
# Build 5-year buckets. The summary statistics show ages from 16 to 47, so the
# bucket edges must reach 50: np.arange excludes its stop value, and pd.cut
# assigns no bucket (NaN) to values beyond the last edge, which would drop the
# oldest player from the counts.
bins = np.arange(15, 55, 5)
bins_data = pd.cut(df['age'], bins)
# observed=False keeps every bucket in the result, including empty ones
bin_counts = df['age'].groupby(bins_data, observed=False).count()
print(bin_counts)
# A quick matplotlib plot gives a rough preview, so adjustments can be made
# before producing the final figure
bin_counts.plot(kind='pie')
age
(15, 20]    3300
(20, 25]    6749
(25, 30]    5234
(30, 35]    2192
(35, 40]     258
(40, 45]       7
Name: age, dtype: int64

你可能感兴趣的:(Python)