import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('./FIFA_2018_player.csv')
df.info()
RangeIndex: 17994 entries, 0 to 17993
Data columns (total 12 columns):
ID 17994 non-null int64
name 17994 non-null object
full_name 17994 non-null object
nationality 17994 non-null object
league 17741 non-null object
club 17741 non-null object
age 17994 non-null int64
birth_date 17994 non-null object
height_cm 17994 non-null float64
weight_kg 17994 non-null float64
eur_value 17994 non-null float64
eur_wage 17994 non-null float64
dtypes: float64(4), int64(2), object(6)
memory usage: 1.6+ MB
df.head()
|
ID |
name |
full_name |
nationality |
league |
club |
age |
birth_date |
height_cm |
weight_kg |
eur_value |
eur_wage |
0 |
20801 |
Cristiano Ronaldo |
C. Ronaldo dos Santos Aveiro |
Portugal |
Spanish Primera División |
Real Madrid CF |
32 |
1985-02-05 |
185.0 |
80.0 |
95500000.0 |
565000.0 |
1 |
158023 |
L. Messi |
Lionel Messi |
Argentina |
Spanish Primera División |
FC Barcelona |
30 |
1987-06-24 |
170.0 |
72.0 |
105000000.0 |
565000.0 |
2 |
190871 |
Neymar |
Neymar da Silva Santos Jr. |
Brazil |
French Ligue 1 |
Paris Saint-Germain |
25 |
1992-02-05 |
175.0 |
68.0 |
123000000.0 |
280000.0 |
3 |
176580 |
L. Suárez |
Luis Suárez |
Uruguay |
Spanish Primera División |
FC Barcelona |
30 |
1987-01-24 |
182.0 |
86.0 |
97000000.0 |
510000.0 |
4 |
167495 |
M. Neuer |
Manuel Neuer |
Germany |
German Bundesliga |
FC Bayern Munich |
31 |
1986-03-27 |
193.0 |
92.0 |
61000000.0 |
230000.0 |
df.describe()
|
ID |
age |
height_cm |
weight_kg |
eur_value |
eur_wage |
count |
17994.000000 |
17994.000000 |
17994.000000 |
17994.000000 |
1.799400e+04 |
17994.000000 |
mean |
207791.796543 |
25.120151 |
181.271980 |
75.400856 |
2.370511e+06 |
11503.834612 |
std |
32328.527723 |
4.617428 |
6.690392 |
6.994824 |
5.347250e+06 |
23050.661073 |
min |
16.000000 |
16.000000 |
155.000000 |
49.000000 |
0.000000e+00 |
0.000000 |
25% |
192621.250000 |
21.000000 |
177.000000 |
70.000000 |
3.000000e+05 |
2000.000000 |
50% |
214186.000000 |
25.000000 |
181.000000 |
75.000000 |
7.000000e+05 |
4000.000000 |
75% |
231615.750000 |
28.000000 |
186.000000 |
80.000000 |
2.000000e+06 |
12000.000000 |
max |
241489.000000 |
47.000000 |
205.000000 |
110.000000 |
1.230000e+08 |
565000.000000 |
df.count()
ID 17994
name 17994
full_name 17994
nationality 17994
league 17741
club 17741
age 17994
birth_date 17994
height_cm 17994
weight_kg 17994
eur_value 17994
eur_wage 17994
dtype: int64
df = df[['ID', 'nationality', 'league', 'club', 'age', 'eur_value', 'eur_wage']]
df
|
ID |
nationality |
league |
club |
age |
eur_value |
eur_wage |
0 |
20801 |
Portugal |
Spanish Primera División |
Real Madrid CF |
32 |
95500000.0 |
565000.0 |
1 |
158023 |
Argentina |
Spanish Primera División |
FC Barcelona |
30 |
105000000.0 |
565000.0 |
2 |
190871 |
Brazil |
French Ligue 1 |
Paris Saint-Germain |
25 |
123000000.0 |
280000.0 |
3 |
176580 |
Uruguay |
Spanish Primera División |
FC Barcelona |
30 |
97000000.0 |
510000.0 |
4 |
167495 |
Germany |
German Bundesliga |
FC Bayern Munich |
31 |
61000000.0 |
230000.0 |
... |
... |
... |
... |
... |
... |
... |
... |
17989 |
237463 |
England |
English League One |
Scunthorpe United |
17 |
50000.0 |
1000.0 |
17990 |
11728 |
England |
English League Two |
Wycombe Wanderers |
47 |
0.0 |
1000.0 |
17991 |
231381 |
Scotland |
English League Two |
Swindon Town |
17 |
60000.0 |
1000.0 |
17992 |
238813 |
England |
English League Two |
Crewe Alexandra |
18 |
60000.0 |
1000.0 |
17993 |
238308 |
Ghana |
English League One |
Scunthorpe United |
18 |
50000.0 |
1000.0 |
17994 rows × 7 columns
df[df.league.isnull()]
|
ID |
nationality |
league |
club |
age |
eur_value |
eur_wage |
163 |
188152 |
Brazil |
NaN |
NaN |
25 |
0.0 |
0.0 |
168 |
184826 |
Portugal |
NaN |
NaN |
28 |
0.0 |
0.0 |
271 |
177413 |
Belgium |
NaN |
NaN |
28 |
0.0 |
0.0 |
480 |
176733 |
Sweden |
NaN |
NaN |
30 |
0.0 |
0.0 |
494 |
169195 |
Brazil |
NaN |
NaN |
29 |
0.0 |
0.0 |
... |
... |
... |
... |
... |
... |
... |
... |
17267 |
234509 |
India |
NaN |
NaN |
29 |
0.0 |
0.0 |
17486 |
234508 |
India |
NaN |
NaN |
20 |
0.0 |
0.0 |
17489 |
223760 |
India |
NaN |
NaN |
24 |
0.0 |
0.0 |
17511 |
233526 |
India |
NaN |
NaN |
22 |
0.0 |
0.0 |
17568 |
231057 |
New Zealand |
NaN |
NaN |
20 |
0.0 |
0.0 |
253 rows × 7 columns
df.drop(df[df.league.isnull()].index,inplace=True)
df.count()
ID 17741
nationality 17741
league 17741
club 17741
age 17741
eur_value 17741
eur_wage 17741
dtype: int64
df.describe()
|
ID |
age |
eur_value |
eur_wage |
count |
17741.000000 |
17741.000000 |
1.774100e+04 |
17741.000000 |
mean |
207756.835522 |
25.088552 |
2.404317e+06 |
11667.887943 |
std |
32421.331072 |
4.616413 |
5.377693e+06 |
23173.181633 |
min |
16.000000 |
16.000000 |
0.000000e+00 |
1000.000000 |
25% |
192621.000000 |
21.000000 |
3.250000e+05 |
2000.000000 |
50% |
214175.000000 |
25.000000 |
7.000000e+05 |
4000.000000 |
75% |
231624.000000 |
28.000000 |
2.100000e+06 |
12000.000000 |
max |
241489.000000 |
47.000000 |
1.230000e+08 |
565000.000000 |
df[df['eur_value'] == 0.000000e+00].count()
ID 6
nationality 6
league 6
club 6
age 6
eur_value 6
eur_wage 6
dtype: int64
df[df['eur_value'] == 0.000000e+00]
|
ID |
nationality |
league |
club |
age |
eur_value |
eur_wage |
2199 |
3665 |
France |
French Ligue 1 |
ES Troyes AC |
40 |
0.0 |
16000.0 |
3105 |
17605 |
Belgium |
Belgian First Division A |
Club Brugge KV |
40 |
0.0 |
14000.0 |
3272 |
176900 |
Colombia |
Colombian Primera A |
Asociacion Deportivo Cali |
40 |
0.0 |
2000.0 |
7734 |
148745 |
Norway |
Norwegian Eliteserien |
Sogndal |
41 |
0.0 |
2000.0 |
17628 |
149727 |
England |
Rep. Ireland Premier Division |
St. Patrick's Athletic |
37 |
0.0 |
1000.0 |
17990 |
11728 |
England |
English League Two |
Wycombe Wanderers |
47 |
0.0 |
1000.0 |
df['eur_value'].replace(0, df['eur_value'].mean(), inplace = True)
df.describe()
|
ID |
age |
eur_value |
eur_wage |
count |
17741.000000 |
17741.000000 |
1.774100e+04 |
17741.000000 |
mean |
207756.835522 |
25.088552 |
2.405130e+06 |
11667.887943 |
std |
32421.331072 |
4.616413 |
5.377511e+06 |
23173.181633 |
min |
16.000000 |
16.000000 |
1.000000e+04 |
1000.000000 |
25% |
192621.000000 |
21.000000 |
3.250000e+05 |
2000.000000 |
50% |
214175.000000 |
25.000000 |
7.000000e+05 |
4000.000000 |
75% |
231624.000000 |
28.000000 |
2.100000e+06 |
12000.000000 |
max |
241489.000000 |
47.000000 |
1.230000e+08 |
565000.000000 |
df[df.duplicated()]
|
ID |
nationality |
league |
club |
age |
eur_value |
eur_wage |
df[df['ID'].duplicated()]
|
ID |
nationality |
league |
club |
age |
eur_value |
eur_wage |
df['league'].value_counts()
Argentinian Superliga 780
English Championship 717
English League One 668
English Premier League 654
Spanish Segunda División 637
English League Two 633
Italian Serie B 625
USA Major League Soccer 625
Spanish Primera División 602
French Ligue 1 598
Italian Serie A 559
Colombian Primera A 552
French Ligue 2 543
German Bundesliga 537
Japanese J1 League 519
Mexican Liga MX 518
German 3. Liga 515
German 2. Bundesliga 510
Portuguese Primeira Liga 509
Turkish Süper Lig 502
Holland Eredivisie 488
Russian Premier League 449
Belgian First Division A 436
Polish Ekstraklasa 418
Saudi Professional League 411
Norwegian Eliteserien 393
Swedish Allsvenskan 389
Danish Superliga 365
Korean K League Classic 336
Scottish Premiership 321
Chilian Primera División 320
Campeonato Brasileiro Série A 320
Rep. Ireland Premier Division 288
Swiss Super League 263
Austrian Bundesliga 259
Australian A-League 236
Greek Super League 111
South African PSL 56
Czech Liga 28
Finnish Veikkausliiga 27
Ukrainian Premier League 24
Name: league, dtype: int64
df.count()
ID 17741
nationality 17741
league 17741
club 17741
age 17741
eur_value 17741
eur_wage 17741
dtype: int64
df.describe()
|
ID |
age |
eur_value |
eur_wage |
count |
17741.000000 |
17741.000000 |
1.774100e+04 |
17741.000000 |
mean |
207756.835522 |
25.088552 |
2.405130e+06 |
11667.887943 |
std |
32421.331072 |
4.616413 |
5.377511e+06 |
23173.181633 |
min |
16.000000 |
16.000000 |
1.000000e+04 |
1000.000000 |
25% |
192621.000000 |
21.000000 |
3.250000e+05 |
2000.000000 |
50% |
214175.000000 |
25.000000 |
7.000000e+05 |
4000.000000 |
75% |
231624.000000 |
28.000000 |
2.100000e+06 |
12000.000000 |
max |
241489.000000 |
47.000000 |
1.230000e+08 |
565000.000000 |
nationality_data = df.groupby('nationality', as_index = False)
nat_count = nationality_data.count()[['nationality','ID']]
nat_count.rename(columns = {'ID':'ath_count'}, inplace = True)
nat_head10 = nat_count.sort_values('ath_count', ascending = False).head(10)
nat_head10
|
nationality |
ath_count |
44 |
England |
1631 |
57 |
Germany |
1147 |
135 |
Spain |
1020 |
53 |
France |
966 |
5 |
Argentina |
962 |
18 |
Brazil |
803 |
75 |
Italy |
800 |
29 |
Colombia |
591 |
78 |
Japan |
471 |
105 |
Netherlands |
430 |
nat_val_mean = nationality_data[['nationality','eur_value']].mean()
nat_val_mean.rename(columns = {'eur_value':'val_mean'})
nat_head10_val_mean = pd.merge(nat_head10,nat_val_mean, on = 'nationality', how = 'left')
nat_head10_val_mean
|
nationality |
ath_count |
eur_value |
0 |
England |
1631 |
1.425410e+06 |
1 |
Germany |
1147 |
2.609010e+06 |
2 |
Spain |
1020 |
4.465897e+06 |
3 |
France |
966 |
3.314264e+06 |
4 |
Argentina |
962 |
2.900120e+06 |
5 |
Brazil |
803 |
4.001071e+06 |
6 |
Italy |
800 |
2.681325e+06 |
7 |
Colombia |
591 |
1.719068e+06 |
8 |
Japan |
471 |
8.067091e+05 |
9 |
Netherlands |
430 |
3.002930e+06 |
league_data = df.groupby('league', as_index = False)
league_count = league_data.count()[['league','ID']].rename(columns = {'ID':'ath_count'})
league_count.rename(columns = {'ID':'ath_count'}).sort_values('ath_count', ascending = False)
lea_val_mean = league_data[['league','eur_value']].mean().rename(columns = {'eur_value':'val_mean'})
lea_val_mean = pd.merge(league_count, lea_val_mean, on = 'league', how = 'left')
lea_val_mean
|
league |
ath_count |
val_mean |
0 |
Argentinian Superliga |
780 |
1.453788e+06 |
1 |
Australian A-League |
236 |
6.848941e+05 |
2 |
Austrian Bundesliga |
259 |
7.276062e+05 |
3 |
Belgian First Division A |
436 |
1.956719e+06 |
4 |
Campeonato Brasileiro Série A |
320 |
2.249016e+06 |
5 |
Chilian Primera División |
320 |
2.238234e+06 |
6 |
Colombian Primera A |
552 |
9.465567e+05 |
7 |
Czech Liga |
28 |
2.141250e+06 |
8 |
Danish Superliga |
365 |
7.188767e+05 |
9 |
English Championship |
717 |
1.831032e+06 |
10 |
English League One |
668 |
4.875075e+05 |
11 |
English League Two |
633 |
2.926687e+05 |
12 |
English Premier League |
654 |
9.091483e+06 |
13 |
Finnish Veikkausliiga |
27 |
2.940741e+05 |
14 |
French Ligue 1 |
598 |
5.188201e+06 |
15 |
French Ligue 2 |
543 |
8.015930e+05 |
16 |
German 2. Bundesliga |
510 |
1.238333e+06 |
17 |
German 3. Liga |
515 |
4.530777e+05 |
18 |
German Bundesliga |
537 |
7.702849e+06 |
19 |
Greek Super League |
111 |
3.808333e+06 |
20 |
Holland Eredivisie |
488 |
2.171250e+06 |
21 |
Italian Serie A |
559 |
7.292030e+06 |
22 |
Italian Serie B |
625 |
8.344480e+05 |
23 |
Japanese J1 League |
519 |
6.472736e+05 |
24 |
Korean K League Classic |
336 |
8.854911e+05 |
25 |
Mexican Liga MX |
518 |
2.025782e+06 |
26 |
Norwegian Eliteserien |
393 |
6.056217e+05 |
27 |
Polish Ekstraklasa |
418 |
7.070096e+05 |
28 |
Portuguese Primeira Liga |
509 |
3.506257e+06 |
29 |
Rep. Ireland Premier Division |
288 |
1.683310e+05 |
30 |
Russian Premier League |
449 |
2.679788e+06 |
31 |
Saudi Professional League |
411 |
8.512287e+05 |
32 |
Scottish Premiership |
321 |
9.114486e+05 |
33 |
South African PSL |
56 |
1.150893e+06 |
34 |
Spanish Primera División |
602 |
9.257550e+06 |
35 |
Spanish Segunda División |
637 |
1.508854e+06 |
36 |
Swedish Allsvenskan |
389 |
6.417095e+05 |
37 |
Swiss Super League |
263 |
1.134202e+06 |
38 |
Turkish Süper Lig |
502 |
2.961036e+06 |
39 |
USA Major League Soccer |
625 |
1.484424e+06 |
40 |
Ukrainian Premier League |
24 |
8.283750e+06 |
club_data = df.groupby('club', as_index = False)
club_wage_mean = club_data.mean()[['club','eur_wage']]
club_wage_mean.rename(columns = {'eur_wage':'wage_mean'},inplace = True)
club_wage_mean.sort_values('wage_mean', ascending = False, inplace = True)
club_wage_mean
|
club |
wage_mean |
219 |
FC Barcelona |
194666.666667 |
466 |
Real Madrid CF |
170821.428571 |
222 |
FC Bayern Munich |
123384.615385 |
330 |
Juventus |
122000.000000 |
377 |
Manchester United |
109030.303030 |
... |
... |
... |
97 |
Bray Wanderers |
1000.000000 |
425 |
PAOK Thessaloniki |
1000.000000 |
88 |
Bohemian FC |
1000.000000 |
263 |
Finn Harps |
1000.000000 |
578 |
Tigres FC |
1000.000000 |
647 rows × 2 columns
EPL_data = df[df['league'] == 'English Premier League']
EPL_data.describe()
|
ID |
age |
eur_value |
eur_wage |
count |
654.000000 |
654.000000 |
6.540000e+02 |
654.000000 |
mean |
196333.365443 |
24.711009 |
9.091483e+06 |
57840.978593 |
std |
37997.349392 |
4.769797 |
1.222195e+07 |
50627.145927 |
min |
2147.000000 |
16.000000 |
6.000000e+04 |
2000.000000 |
25% |
183551.250000 |
20.000000 |
9.125000e+05 |
17000.000000 |
50% |
201840.000000 |
25.000000 |
5.000000e+06 |
48000.000000 |
75% |
222604.000000 |
28.000000 |
1.137500e+07 |
82000.000000 |
max |
241384.000000 |
38.000000 |
9.050000e+07 |
325000.000000 |
EPL_club = EPL_data.groupby('club', as_index = False)
EPL_club_wage_mean = EPL_club.mean()[['club','eur_wage']]
EPL_club_wage_mean.rename(columns = {'eur_wage':'wage_mean'}, inplace = True)
EPL_club_wage_mean.sort_values('wage_mean', ascending = False, inplace = True)
EPL_club_wage_mean
|
club |
wage_mean |
11 |
Manchester United |
109030.303030 |
4 |
Chelsea |
105181.818182 |
10 |
Manchester City |
95787.878788 |
0 |
Arsenal |
91121.212121 |
9 |
Liverpool |
83250.000000 |
6 |
Everton |
76484.848485 |
16 |
Tottenham Hotspur |
69218.750000 |
19 |
West Ham United |
61818.181818 |
13 |
Southampton |
51181.818182 |
15 |
Swansea City |
43878.787879 |
8 |
Leicester City |
43875.000000 |
18 |
West Bromwich Albion |
42516.129032 |
14 |
Stoke City |
41093.750000 |
17 |
Watford |
40848.484848 |
12 |
Newcastle United |
40000.000000 |
1 |
Bournemouth |
38303.030303 |
5 |
Crystal Palace |
35181.818182 |
3 |
Burnley |
33666.666667 |
2 |
Brighton & Hove Albion |
30454.545455 |
7 |
Huddersfield Town |
23181.818182 |
bins = np.arange(15, 50, 5)
bins_data = pd.cut(df['age'], bins)
bin_counts = df['age'].groupby(bins_data).count()
print(bin_counts)
bin_counts.plot(kind='pie')
age
(15, 20] 3300
(20, 25] 6749
(25, 30] 5234
(30, 35] 2192
(35, 40] 258
(40, 45] 7
Name: age, dtype: int64