# -*- coding: utf-8 -*-
'''
Created on 2018-07-02
@author: user
@summary: Predicting the winner of the 2018 FIFA World Cup
'''
import numpy as np # linear algebra
import pandas as pd # data processing
import tensorflow as tf
from tensorflow.python.data import Dataset
from sklearn import metrics
from itertools import combinations
# --- Load and clean the input datasets --------------------------------------

# FIFA ranking history: keep only the columns needed for feature generation.
rankings = pd.read_csv('fifa_ranking.csv')
rankings = rankings.loc[:, ['rank', 'country_full', 'country_abrv',
                            'cur_year_avg_weighted', 'rank_date',
                            'two_year_ago_weighted', 'three_year_ago_weighted']]
# Rename "IR Iran" to "Iran". The result is assigned back to the column
# instead of calling an in-place method on `rankings.country_full`: that is a
# chained in-place update and is not guaranteed to modify `rankings` itself.
rankings['country_full'] = rankings['country_full'].replace(
    "^IR Iran*", "Iran", regex=True)
# Single strength score: sum of the current and the two older weighted averages.
rankings['weighted_points'] = (rankings['cur_year_avg_weighted']
                               + rankings['two_year_ago_weighted']
                               + rankings['three_year_ago_weighted'])
rankings['rank_date'] = pd.to_datetime(rankings['rank_date'])

# Historical match results; rename teams so they match the ranking table.
matches = pd.read_csv("results.csv")
matches = matches.replace({'Germany DR': 'Germany', 'China': 'China PR'})
matches['date'] = pd.to_datetime(matches['date'])

# World Cup 2018 group table (team, group and the three group-stage opponents).
world_cup = pd.read_csv("World Cup 2018 Dataset.csv")
world_cup = world_cup.loc[:, ['Team', 'Group', 'First match \nagainst',
                              'Second match\n against', 'Third match\n against']]
world_cup = world_cup.dropna(how='all')
# Fix misspelled / differently spelled team names.
world_cup = world_cup.replace({"IRAN": "Iran", "Costarica": "Costa Rica",
                               "Porugal": "Portugal", "Columbia": "Colombia",
                               "Korea": "Korea Republic"})
world_cup = world_cup.set_index('Team')

# Build a complete day-by-day ranking table: resample each country's rankings
# to daily frequency and forward-fill the gaps between ranking dates.
rankings = (rankings.set_index(['rank_date'])
            .groupby(['country_full'], group_keys=False)
            .resample('D').first()
            .ffill()
            .reset_index())

# Join the rankings onto each match, once for the home team and once for the
# away team. Both merges use the default inner join, so matches without a
# ranking row for either side on the match date are dropped.
matches = matches.merge(rankings,
                        left_on=['date', 'home_team'],
                        right_on=['rank_date', 'country_full'])
matches = matches.merge(rankings,
                        left_on=['date', 'away_team'],
                        right_on=['rank_date', 'country_full'],
                        suffixes=('_home', '_away'))

# Feature generation.
matches['rank_difference'] = matches['rank_home'] - matches['rank_away']
matches['average_rank'] = (matches['rank_home'] + matches['rank_away']) / 2
matches['point_difference'] = matches['weighted_points_home'] - matches['weighted_points_away']
matches['score_difference'] = matches['home_score'] - matches['away_score']
matches['is_won'] = matches['score_difference'] > 0  # take draw as lost
matches['is_stake'] = matches['tournament'] != 'Friendly'

# Logging / display configuration.
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

# Shuffle the rows so the head/tail split below is a random split.
# NOTE: no random seed is set here, so the shuffle differs between runs.
matches = matches.reindex(np.random.permutation(matches.index))
def preprocess_features(matches, feature_names=None):
    """Select the model input features from the match table.

    Args:
      matches: pandas DataFrame that contains the requested feature columns.
      feature_names: optional sequence of column names to select. When omitted,
        the four features used by this script are selected: "average_rank",
        "rank_difference", "point_difference" and "is_stake".
    Returns:
      A new DataFrame (a copy, so later edits do not touch `matches`) holding
      only the selected columns, in the requested order.
    Raises:
      KeyError: raised by pandas if a requested column is missing.
    """
    if feature_names is None:
        feature_names = ["average_rank", "rank_difference",
                         "point_difference", "is_stake"]
    return matches[list(feature_names)].copy()
def preprocess_targets(matches, target_column="is_won"):
    """Extract the prediction target from the match table.

    Args:
      matches: pandas DataFrame that contains the target column.
      target_column: name of the column to use as target; defaults to "is_won".
    Returns:
      A new single-column DataFrame holding the target values unchanged
      (no scaling is applied), with the same index as `matches`.
    Raises:
      KeyError: raised by pandas if the target column is missing.
    """
    return matches[[target_column]].copy()
# Split the shuffled match table into training (first 60%) and validation
# (remaining 40%) sets. The split point is derived from the table length, so
# the two sets never overlap and never skip rows, whatever the number of
# matches (for 18167 matches this yields 10900 / 7267 examples).
TRAINING_FRACTION = 0.6
n_training_examples = int(len(matches) * TRAINING_FRACTION)

training_examples = preprocess_features(matches.iloc[:n_training_examples])
training_targets = preprocess_targets(matches.iloc[:n_training_examples])

validation_examples = preprocess_features(matches.iloc[n_training_examples:])
validation_targets = preprocess_targets(matches.iloc[n_training_examples:])

# Features and targets of the complete (unsplit) match table.
Complete_Data_training = preprocess_features(matches)
Complete_Data_Validation = preprocess_targets(matches)
def construct_feature_columns(input_features):
    """Build the TensorFlow feature columns for the given features.

    Args:
      input_features: iterable of feature names; every element is passed to
        `tf.feature_column.numeric_column`.
    Returns:
      A set with one numeric feature column per input feature.
    """
    feature_columns = set()
    for feature_name in input_features:
        feature_columns.add(tf.feature_column.numeric_column(feature_name))
    return feature_columns
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Input function that feeds (features, labels) batches to an estimator.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """
    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle the individual examples, if specified. This must happen before
    # batching: shuffling after `batch()` would only permute whole batches
    # (each batch would always hold the same examples) and the shuffle buffer
    # would hold 10000 batches instead of 10000 examples.
    if shuffle:
        ds = ds.shuffle(10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
def train_nn_classification_model(
        my_optimizer,
        steps,
        batch_size,
        hidden_units,
        training_examples,
        training_targets,
        validation_examples,
        validation_targets):
    """Train a DNN classifier that predicts the "is_won" target.

    Training is split into 10 periods; after each period the log loss on the
    training data is printed.

    Args:
      my_optimizer: TensorFlow optimizer; it is wrapped so that gradients are
        clipped by norm (3.0) before being applied.
      steps: total number of training steps, spread evenly over the periods.
      batch_size: number of examples per training batch.
      hidden_units: list with the number of units of each hidden layer.
      training_examples: pandas DataFrame of training features.
      training_targets: pandas DataFrame with an "is_won" column.
      validation_examples: pandas DataFrame of validation features.
      validation_targets: pandas DataFrame with an "is_won" column.
    Returns:
      The trained `tf.estimator.DNNClassifier`.
    """
    periods = 10
    # The estimator needs a whole, positive number of steps; plain `/` would
    # produce a float on Python 3. At least one step is run per period.
    steps_per_period = max(1, int(steps // periods))

    # Create a DNNClassifier object (with gradient clipping on the optimizer).
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 3.0)
    dnn_classifier = tf.estimator.DNNClassifier(
        feature_columns=construct_feature_columns(training_examples),
        hidden_units=hidden_units,
        optimizer=my_optimizer)

    # Create input functions.
    training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets["is_won"],
                                            batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                    training_targets["is_won"],
                                                    num_epochs=1,
                                                    shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                      validation_targets["is_won"],
                                                      num_epochs=1,
                                                      shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically
    # assess loss metrics.
    print("Training model...")
    print("LogLoss (on training data):")
    # Per-period losses. They are collected here but neither plotted nor
    # returned by this function.
    training_log_losses = []
    validation_log_losses = []
    for period in range(periods):
        # Train the model, starting from the prior state.
        dnn_classifier.train(
            input_fn=training_input_fn,
            steps=steps_per_period)

        # Take a break and compute predictions.
        training_probabilities = dnn_classifier.predict(input_fn=predict_training_input_fn)
        training_probabilities = np.array([item['probabilities'] for item in training_probabilities])
        validation_probabilities = dnn_classifier.predict(input_fn=predict_validation_input_fn)
        validation_probabilities = np.array([item['probabilities'] for item in validation_probabilities])

        training_log_loss = metrics.log_loss(training_targets, training_probabilities)
        validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)

        # Print the current training loss.
        print(" period %02d : %0.2f" % (period, training_log_loss))

        # Add the loss metrics from this period to our list.
        training_log_losses.append(training_log_loss)
        validation_log_losses.append(validation_log_loss)
    print("Model training finished.")

    return dnn_classifier
# Train the classifier on the training split.
# NOTE: despite its name, `linear_classifier` holds the DNNClassifier returned
# by train_nn_classification_model; later parts of the script use this name.
linear_classifier = train_nn_classification_model(
my_optimizer=tf.train.AdagradOptimizer(learning_rate=0.07),
steps=3000,
batch_size=2000,
hidden_units=[5, 5,6,5],
training_examples=training_examples,
training_targets=training_targets,
validation_examples=validation_examples,
validation_targets=validation_targets)
# Input function that makes a single, unshuffled pass over the validation set.
predict_validation_input_fn = lambda: my_input_fn(validation_examples,
validation_targets["is_won"],
num_epochs=1,
shuffle=False)
validation_probabilities = linear_classifier.predict(input_fn=predict_validation_input_fn)
# Get just the probabilities for the positive class.
validation_probabilities = np.array([item['probabilities'][1] for item in validation_probabilities])
# ROC curve points on the validation set (computed here; not used further in
# the visible part of the script).
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(validation_targets, validation_probabilities)
# Let the estimator compute its own evaluation metrics and report AUC/accuracy.
evaluation_metrics = linear_classifier.evaluate(input_fn=predict_validation_input_fn)
print("AUC on the validation set: %0.2f" % evaluation_metrics['auc'])
print("Accuracy on the validation set: %0.2f" % evaluation_metrics['accuracy'])
# World Cup simulation

# Small margin around 0.5: inside it a group match is predicted as a draw
# rather than as a win for either side.
margin = 0.05

# Rankings at the time of the World Cup: rows of the latest ranking date,
# restricted to the participating teams and indexed by country name.
world_cup_rankings = rankings.loc[(rankings['rank_date'] == rankings['rank_date'].max()) &
                                  rankings['country_full'].isin(world_cup.index.unique())]
world_cup_rankings = world_cup_rankings.set_index(['country_full'])

opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']


def _predict_home_win_probability(home, away):
    """Predict the probability that `home` beats `away`.

    Builds a one-row feature table from the World Cup rankings of both teams
    (the match is always treated as a non-friendly, i.e. is_stake = 1) and
    runs it through the trained classifier.

    Args:
      home: name of the first ("home") team; must be in `world_cup_rankings`.
      away: name of the second ("away") team; must be in `world_cup_rankings`.
    Returns:
      The predicted probability of the positive ("is_won") class.
    """
    home_rank = world_cup_rankings.loc[home, 'rank']
    home_points = world_cup_rankings.loc[home, 'weighted_points']
    opp_rank = world_cup_rankings.loc[away, 'rank']
    opp_points = world_cup_rankings.loc[away, 'weighted_points']

    row = pd.DataFrame({
        'average_rank': [(home_rank + opp_rank) / 2],
        'rank_difference': [home_rank - opp_rank],
        'point_difference': [home_points - opp_points],
        'is_stake': [1.0],
    })
    # my_input_fn requires targets; the value is a placeholder that is only
    # there to satisfy its signature when predicting.
    placeholder_targets = pd.Series([np.nan])

    predictions = linear_classifier.predict(
        input_fn=lambda: my_input_fn(row,
                                     placeholder_targets,
                                     num_epochs=1,
                                     shuffle=False))
    # Get just the probabilities for the positive class. The generator is
    # consumed completely before the single prediction is returned.
    positive_probabilities = np.array([item['probabilities'][1] for item in predictions])
    return positive_probabilities[0]


# --- Group stage: every team plays every other team of its group once -------
world_cup['points'] = 0
# Float from the start: probabilities are accumulated into this column.
world_cup['total_prob'] = 0.0

# Sorted so that the groups are always simulated (and printed) in the same order.
for group in sorted(world_cup['Group'].dropna().unique()):
    print('___Starting group {}:___'.format(group))
    group_teams = world_cup.index[world_cup['Group'] == group]
    for home, away in combinations(group_teams, 2):
        print("{} vs. {}: ".format(home, away))
        home_win_prob = _predict_home_win_probability(home, away)
        world_cup.loc[home, 'total_prob'] += home_win_prob
        world_cup.loc[away, 'total_prob'] += 1-home_win_prob

        if home_win_prob <= 0.5 - margin:
            # Clear advantage for the away team: 3 points.
            print("{} wins with {:.2f}".format(away, 1-home_win_prob))
            world_cup.loc[away, 'points'] += 3
        elif home_win_prob >= 0.5 + margin:
            # Clear advantage for the home team: 3 points.
            world_cup.loc[home, 'points'] += 3
            print("{} wins with {:.2f}".format(home, home_win_prob))
        else:
            # Too close to call: 1 point each.
            print("Draw")
            world_cup.loc[home, 'points'] += 1
            world_cup.loc[away, 'points'] += 1

# --- Knockout stage ----------------------------------------------------------
# Positions into the qualified-team table (ordered A1, A2, B1, B2, ...);
# consecutive pairs meet in the round of 16: A1-B2, C1-D2, ..., A2-B1, C2-D1, ...
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

world_cup = world_cup.sort_values(by=['Group', 'points', 'total_prob'], ascending=False).reset_index()
# Select the top 2 of every group, ordered by group (ascending) and then by
# points / total probability (descending). The explicit sort plus head(2)
# yields the same A1, A2, B1, B2, ... order on every pandas version.
next_round_wc = (world_cup
                 .sort_values(by=['Group', 'points', 'total_prob'],
                              ascending=[True, False, False])
                 .groupby('Group').head(2)
                 .reset_index(drop=True))
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']
labels = list()
odds = list()
for stage in finals:
    print("___Starting of the {}___".format(stage))
    # Consecutive teams in the table play each other.
    teams = list(next_round_wc.index)
    winners = []
    for home, away in zip(teams[0::2], teams[1::2]):
        print("{} vs. {}: ".format(home,away))
        home_win_prob = _predict_home_win_probability(home, away)

        # No draws in the knockout stage: the more likely side advances.
        if home_win_prob <= 0.5:
            print("{0} wins with probability {1:.2f}".format(away, 1-home_win_prob))
            winners.append(away)
        else:
            print("{0} wins with probability {1:.2f}".format(home, home_win_prob))
            winners.append(home)

        # Record a label (team abbreviations with inverse probabilities) and
        # the raw probabilities of this match.
        labels.append("{}({:.2f}) vs. {}({:.2f})".format(world_cup_rankings.loc[home, 'country_abrv'],
                                                        1/home_win_prob,
                                                        world_cup_rankings.loc[away, 'country_abrv'],
                                                        1/(1-home_win_prob)))
        odds.append([home_win_prob, 1-home_win_prob])

    # The winners, in bracket order, form the next round.
    next_round_wc = next_round_wc.loc[winners]
    print("\n")
Model training finished.
AUC on the validation set: 0.74
Accuracy on the validation set: 0.67
___Starting group A:___
Russia vs. Saudi Arabia:
Draw
Russia vs. Egypt:
Egypt wins with 0.67
Russia vs. Uruguay:
Uruguay wins with 0.84
Saudi Arabia vs. Egypt:
Egypt wins with 0.66
Saudi Arabia vs. Uruguay:
Uruguay wins with 0.84
Egypt vs. Uruguay:
Uruguay wins with 0.84
___Starting group C:___
France vs. Australia:
France wins with 0.57
France vs. Peru:
Draw
France vs. Denmark:
Draw
Australia vs. Peru:
Peru wins with 0.84
Australia vs. Denmark:
Denmark wins with 0.84
Peru vs. Denmark:
Draw
___Starting group B:___
Portugal vs. Spain:
Draw
Portugal vs. Morocco:
Portugal wins with 0.62
Portugal vs. Iran:
Portugal wins with 0.62
Spain vs. Morocco:
Spain wins with 0.60
Spain vs. Iran:
Spain wins with 0.60
Morocco vs. Iran:
Draw
___Starting group E:___
Brazil vs. Switzerland:
Draw
Brazil vs. Costa Rica:
Draw
Brazil vs. Serbia:
Brazil wins with 0.59
Switzerland vs. Costa Rica:
Draw
Switzerland vs. Serbia:
Switzerland wins with 0.57
Costa Rica vs. Serbia:
Draw
___Starting group D:___
Argentina vs. Iceland:
Draw
Argentina vs. Croatia:
Draw
Argentina vs. Nigeria:
Argentina wins with 0.64
Iceland vs. Croatia:
Draw
Iceland vs. Nigeria:
Iceland wins with 0.60
Croatia vs. Nigeria:
Croatia wins with 0.60
___Starting group G:___
Belgium vs. Panama:
Belgium wins with 0.68
Belgium vs. Tunisia:
Draw
Belgium vs. England:
Draw
Panama vs. Tunisia:
Tunisia wins with 0.84
Panama vs. England:
England wins with 0.84
Tunisia vs. England:
England wins with 0.61
___Starting group F:___
Germany vs. Mexico:
Germany wins with 0.56
Germany vs. Sweden:
Germany wins with 0.59
Germany vs. Korea Republic:
Germany wins with 0.73
Mexico vs. Sweden:
Draw
Mexico vs. Korea Republic:
Mexico wins with 0.65
Sweden vs. Korea Republic:
Sweden wins with 0.64
___Starting group H:___
Poland vs. Senegal:
Draw
Poland vs. Colombia:
Draw
Poland vs. Japan:
Poland wins with 0.66
Senegal vs. Colombia:
Colombia wins with 0.55
Senegal vs. Japan:
Senegal wins with 0.63
Colombia vs. Japan:
Colombia wins with 0.65
___Starting of the round_of_16___
Uruguay vs. Spain:
Spain wins with probability 0.54
Denmark vs. Croatia:
Denmark wins with probability 0.55
Switzerland vs. Mexico:
Mexico wins with probability 0.51
England vs. Poland:
Poland wins with probability 0.53
Egypt vs. Portugal:
Portugal wins with probability 0.84
Peru vs. Argentina:
Argentina wins with probability 0.56
Brazil vs. Germany:
Germany wins with probability 0.84
Belgium vs. Colombia:
Belgium wins with probability 0.54
___Starting of the quarterfinal___
Spain vs. Denmark:
Denmark wins with probability 0.52
Mexico vs. Poland:
Poland wins with probability 0.59
Portugal vs. Argentina:
Argentina wins with probability 0.53
Germany vs. Belgium:
Belgium wins with probability 0.52
___Starting of the semifinal___
Denmark vs. Poland:
Poland wins with probability 0.51
Argentina vs. Belgium:
Belgium wins with probability 0.57
___Starting of the final___
Poland vs. Belgium:
Belgium wins with probability 0.84