2018年世界杯赔率预测 -DNN

# -*- coding: utf-8 -*-
'''
Created on 2018-07-02
@author: user
@summary:  Predicting the winner of the 2018 FIFA World Cup
'''
import numpy as np # linear algebra
import pandas as pd # data processing
import tensorflow as tf
from tensorflow.python.data import Dataset
from sklearn import metrics
from itertools import combinations


# --- FIFA ranking history ---------------------------------------------------
rankings = pd.read_csv('fifa_ranking.csv')
rankings = rankings.loc[:, ['rank', 'country_full', 'country_abrv',
                            'cur_year_avg_weighted', 'rank_date',
                            'two_year_ago_weighted', 'three_year_ago_weighted']]
# Assign the result back rather than calling replace(inplace=True) on an
# attribute-accessed column: the in-place form mutates an intermediate Series
# and is not guaranteed to write through to `rankings`.
rankings['country_full'] = rankings['country_full'].replace("^IR Iran*", "Iran", regex=True)
rankings['weighted_points'] = (rankings['cur_year_avg_weighted']
                               + rankings['two_year_ago_weighted']
                               + rankings['three_year_ago_weighted'])
rankings['rank_date'] = pd.to_datetime(rankings['rank_date'])

# --- Historical match results -----------------------------------------------
matches = pd.read_csv("results.csv")
# Rename team names so they can be joined against `country_full` below.
matches = matches.replace({'Germany DR': 'Germany', 'China': 'China PR'})
matches['date'] = pd.to_datetime(matches['date'])

# --- World Cup 2018 fixture table -------------------------------------------
world_cup = pd.read_csv("World Cup 2018 Dataset.csv")
world_cup = world_cup.loc[:, ['Team', 'Group', 'First match \nagainst', 'Second match\n against', 'Third match\n against']]
world_cup = world_cup.dropna(how='all')
# Fix spelling/naming differences so team names match the ranking table.
world_cup = world_cup.replace({"IRAN": "Iran",  "Costarica": "Costa Rica",  "Porugal": "Portugal", "Columbia": "Colombia", "Korea" : "Korea Republic"})
world_cup = world_cup.set_index('Team')

# Build a day-by-day ranking table: resample each country's rankings to daily
# frequency and forward-fill the gaps between ranking dates.
rankings = (rankings.set_index(['rank_date'])
            .groupby(['country_full'], group_keys=False)
            .resample('D').first()
            .ffill()
            .reset_index())

# Join the rankings onto each match twice: once for the home team, once for
# the away team. merge() defaults to an inner join, so matches without a
# ranking row for both teams on the match date are dropped. The second merge
# suffixes the overlapping ranking columns with _home / _away.
matches = matches.merge(rankings, left_on=['date', 'home_team'], right_on=['rank_date', 'country_full'])
matches = matches.merge(rankings, left_on=['date', 'away_team'], right_on=['rank_date', 'country_full'], suffixes=('_home', '_away'))

# Feature generation
matches['rank_difference'] = matches['rank_home'] - matches['rank_away']
matches['average_rank'] = (matches['rank_home'] + matches['rank_away'])/2
matches['point_difference'] = matches['weighted_points_home'] - matches['weighted_points_away']
matches['score_difference'] = matches['home_score'] - matches['away_score']
matches['is_won'] = matches['score_difference'] > 0 # take draw as lost
matches['is_stake'] = matches['tournament'] != 'Friendly'

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
# Shuffle the rows so the head/tail split made later is a random split.
matches = matches.reindex(np.random.permutation(matches.index))
def preprocess_features(matches):
    """Return an independent copy of the model's input columns.

    Args:
      matches: pandas DataFrame containing at least the columns
        "average_rank", "rank_difference", "point_difference" and "is_stake".
    Returns:
      A new DataFrame holding only those four columns, in that order, with
      the same index as `matches`.
    """
    feature_names = ["average_rank", "rank_difference", "point_difference", "is_stake"]
    return matches.loc[:, feature_names].copy()

def preprocess_targets(matches):
    """Return the classification label as a one-column DataFrame.

    Args:
      matches: pandas DataFrame containing an "is_won" column.
    Returns:
      A new DataFrame with the single column "is_won", indexed like `matches`.
    """
    return pd.DataFrame({"is_won": matches['is_won']})

# Split the (already shuffled) matches 60/40 into training and validation.
# The sizes are derived from the actual row count so the two sets never
# overlap or leave a gap; with 18167 rows this yields 10900 / 7267.
TRAINING_FRACTION = 0.6
num_training_examples = int(len(matches) * TRAINING_FRACTION)
num_validation_examples = len(matches) - num_training_examples

# First 60% of the rows for training.
training_examples = preprocess_features(matches.head(num_training_examples))
training_targets = preprocess_targets(matches.head(num_training_examples))

# Remaining 40% of the rows for validation.
validation_examples = preprocess_features(matches.tail(num_validation_examples))
validation_targets = preprocess_targets(matches.tail(num_validation_examples))

# Features and targets for the complete data set.
Complete_Data_training = preprocess_features(matches)
Complete_Data_Validation = preprocess_targets(matches)

def construct_feature_columns(input_features):
    """Build one numeric TensorFlow feature column per input feature.

    Args:
      input_features: iterable of feature names (iterating a pandas
        DataFrame yields its column names).
    Returns:
      A set of `tf.feature_column.numeric_column` objects.
    """
    return {tf.feature_column.numeric_column(name) for name in input_features}

def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the input pipeline that feeds (features, labels) batches to an estimator.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """

    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}
    # Construct a dataset from the in-memory arrays.
    ds = Dataset.from_tensor_slices((features, targets)) # warning: 2GB limit

    # Shuffle individual examples BEFORE batching. Shuffling after
    # batch()/repeat() only reorders whole batches, so every batch would
    # always contain the same examples.
    if shuffle:
        ds = ds.shuffle(10000)

    # Configure batching/repeating.
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels

def train_nn_classification_model(
    my_optimizer,
    steps,
    batch_size,
    hidden_units,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
    """Train a DNN classifier and report its training log loss period by period.

    Training is split into 10 periods; after each period the model predicts
    on the training and validation sets and the training log loss is printed.

    Args:
      my_optimizer: TensorFlow optimizer; gradient clipping by norm (3.0) is
        applied to it before use.
      steps: Total number of training steps, divided evenly over the periods.
      batch_size: Batch size used for training.
      hidden_units: List with the number of units in each hidden layer.
      training_examples: pandas DataFrame of training features.
      training_targets: pandas DataFrame with an "is_won" column of labels.
      validation_examples: pandas DataFrame of validation features.
      validation_targets: pandas DataFrame with an "is_won" column of labels.
    Returns:
      The trained `tf.estimator.DNNClassifier`.
    """
    periods = 10
    # Integer division: `steps` passed to train() is a step count, so keep it
    # an int instead of the float that `/` produces on Python 3.
    steps_per_period = steps // periods

    # Create a DNNClassifier object with gradient clipping on the optimizer.
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 3.0)
    dnn_classifier = tf.estimator.DNNClassifier(
      feature_columns=construct_feature_columns(training_examples),
      hidden_units=hidden_units,
      optimizer=my_optimizer)

    # Create input functions: one shuffled/repeating for training, and two
    # single-pass, unshuffled ones for computing predictions.
    training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets["is_won"],
                                            batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(training_examples,
                                                    training_targets["is_won"],
                                                    num_epochs=1,
                                                    shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(validation_examples,
                                                      validation_targets["is_won"],
                                                      num_epochs=1,
                                                      shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    print("LogLoss (on training data):")
    training_log_losses = []
    validation_log_losses = []
    for period in range(0, periods):
        # Train the model, starting from the prior state.
        dnn_classifier.train(
            input_fn=training_input_fn,
            steps=steps_per_period)
        # Take a break and compute predictions.
        training_probabilities = dnn_classifier.predict(input_fn=predict_training_input_fn)
        training_probabilities = np.array([item['probabilities'] for item in training_probabilities])

        validation_probabilities = dnn_classifier.predict(input_fn=predict_validation_input_fn)
        validation_probabilities = np.array([item['probabilities'] for item in validation_probabilities])

        training_log_loss = metrics.log_loss(training_targets, training_probabilities)
        validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)
        # Print the current training loss.
        print("  period %02d : %0.2f" % (period, training_log_loss))
        # Add the loss metrics from this period to our lists.
        training_log_losses.append(training_log_loss)
        validation_log_losses.append(validation_log_loss)
    print("Model training finished.")
    # NOTE: the per-period losses are collected above but not plotted or
    # returned; only the trained classifier is handed back.
    return dnn_classifier



# Train the network on the training split. The variable keeps the name
# `linear_classifier` although it holds the classifier returned by
# train_nn_classification_model.
linear_classifier = train_nn_classification_model(
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets,
    hidden_units=[5, 5, 6, 5],
    batch_size=2000,
    steps=3000,
    my_optimizer=tf.train.AdagradOptimizer(learning_rate=0.07))


def predict_validation_input_fn():
    """Feed the validation set once, in order, to the estimator."""
    return my_input_fn(validation_examples,
                       validation_targets["is_won"],
                       shuffle=False,
                       num_epochs=1)


# Probability of the positive class ("home team won") for every validation row.
validation_probabilities = np.array([
    prediction['probabilities'][1]
    for prediction in linear_classifier.predict(input_fn=predict_validation_input_fn)
])

# ROC curve points, plus the estimator's own evaluation metrics.
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(
    validation_targets, validation_probabilities)
evaluation_metrics = linear_classifier.evaluate(input_fn=predict_validation_input_fn)
print("AUC on the validation set: %0.2f" % evaluation_metrics['auc'])
print("Accuracy on the validation set: %0.2f" % evaluation_metrics['accuracy'])

# World Cup simulation
# Half-width of the band around 0.5 inside which a group match is called a
# draw instead of a win for either side.
margin = 0.05

# Rankings on the latest available ranking date, restricted to World Cup teams.
world_cup_rankings = rankings.loc[(rankings['rank_date'] == rankings['rank_date'].max()) &  rankings['country_full'].isin(world_cup.index.unique())]
world_cup_rankings = world_cup_rankings.set_index(['country_full'])
opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']


def _predict_home_win_probability(home, away):
    """Return the model's probability that `home` beats `away`.

    Builds a single feature row from the two teams' current rank and
    weighted points (the match is always treated as a non-friendly) and
    returns the positive-class probability predicted by `linear_classifier`.
    """
    home_rank = world_cup_rankings.loc[home, 'rank']
    home_points = world_cup_rankings.loc[home, 'weighted_points']
    opp_rank = world_cup_rankings.loc[away, 'rank']
    opp_points = world_cup_rankings.loc[away, 'weighted_points']
    row = pd.DataFrame(
        {
            'average_rank': [(home_rank + opp_rank) / 2],
            'rank_difference': [home_rank - opp_rank],
            'point_difference': [home_points - opp_points],
            'is_stake': [1.0],  # World Cup matches are not friendlies
        },
        columns=validation_examples.columns,
    ).astype(float)
    # The input function requires targets; use a placeholder that is kept
    # separate from the feature row so it is not fed to the model as a feature.
    placeholder_labels = pd.Series([np.nan])
    predictions = linear_classifier.predict(
        input_fn=lambda: my_input_fn(row,
                                     placeholder_labels,
                                     num_epochs=1,
                                     shuffle=False))
    # Consume the whole generator, then take the positive-class probability
    # of the single row.
    return np.array([item['probabilities'][1] for item in predictions])[0]


# --- Group stage --------------------------------------------------------------
world_cup['points'] = 0
# Float from the start: probabilities are accumulated into this column.
world_cup['total_prob'] = 0.0

# Sorted so groups are simulated (and printed) in a deterministic order.
for group in sorted(world_cup['Group'].dropna().unique()):
    print('___Starting group {}:___'.format(group))

    group_teams = world_cup.index[world_cup['Group'] == group]
    for home, away in combinations(group_teams, 2):
        print("{} vs. {}: ".format(home, away))

        home_win_prob = _predict_home_win_probability(home, away)
        world_cup.loc[home, 'total_prob'] += home_win_prob
        world_cup.loc[away, 'total_prob'] += 1-home_win_prob

        if home_win_prob <= 0.5 - margin:
            # Clear away win.
            print("{} wins with {:.2f}".format(away, 1-home_win_prob))
            world_cup.loc[away, 'points'] += 3
        elif home_win_prob >= 0.5 + margin:
            # Clear home win.
            world_cup.loc[home, 'points'] += 3
            print("{} wins with {:.2f}".format(home, home_win_prob))
        else:
            # Too close to call: one point each.
            print("Draw")
            world_cup.loc[home, 'points'] += 1
            world_cup.loc[away, 'points'] += 1

# --- Knockout stage -----------------------------------------------------------
# Row positions (in the A1, A2, B1, B2, ... table built below) listed in
# bracket order: consecutive entries play each other in the round of 16.
pairing = [0,3,4,7,8,11,12,15,1,2,5,6,9,10,13,14]

# Order groups alphabetically and, within a group, by points then total
# probability (best first). Taking head(2) per group then yields
# A1, A2, B1, B2, ... regardless of pandas version (GroupBy.nth changed its
# return shape in pandas 2.0, which breaks nth(...).reset_index()).
world_cup = world_cup.sort_values(by=['Group', 'points', 'total_prob'],
                                  ascending=[True, False, False]).reset_index()
next_round_wc = world_cup.groupby('Group').head(2).reset_index(drop=True) # select the top 2
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']

labels = list()
odds = list()

for f in finals:
    print("___Starting of the {}___".format(f))
    teams = list(next_round_wc.index)
    winners = []

    # Consecutive teams in the current bracket order play each other.
    for home, away in zip(teams[0::2], teams[1::2]):
        print("{} vs. {}: ".format(home,away))
        home_win_prob = _predict_home_win_probability(home, away)

        # No draws in the knockout rounds: the more likely side advances.
        if home_win_prob <= 0.5:
            print("{0} wins with probability {1:.2f}".format(away, 1-home_win_prob))
            winners.append(away)
        else:
            print("{0} wins with probability {1:.2f}".format(home, home_win_prob))
            winners.append(home)

        # Label each side with 1/probability, and record both probabilities.
        labels.append("{}({:.2f}) vs. {}({:.2f})".format(world_cup_rankings.loc[home, 'country_abrv'],
                                                        1/home_win_prob,
                                                        world_cup_rankings.loc[away, 'country_abrv'],
                                                        1/(1-home_win_prob)))
        odds.append([home_win_prob, 1-home_win_prob])

    next_round_wc = next_round_wc.loc[winners]
    print("\n")
    print("\n")
Model training finished.
AUC on the validation set: 0.74
Accuracy on the validation set: 0.67
___Starting group A:___
Russia vs. Saudi Arabia:
Draw
Russia vs. Egypt:
Egypt wins with 0.67
Russia vs. Uruguay:
Uruguay wins with 0.84
Saudi Arabia vs. Egypt:
Egypt wins with 0.66
Saudi Arabia vs. Uruguay:
Uruguay wins with 0.84
Egypt vs. Uruguay:
Uruguay wins with 0.84
___Starting group C:___
France vs. Australia:
France wins with 0.57
France vs. Peru:
Draw
France vs. Denmark:
Draw
Australia vs. Peru:
Peru wins with 0.84
Australia vs. Denmark:
Denmark wins with 0.84
Peru vs. Denmark:
Draw
___Starting group B:___
Portugal vs. Spain:
Draw
Portugal vs. Morocco:
Portugal wins with 0.62
Portugal vs. Iran:
Portugal wins with 0.62
Spain vs. Morocco:
Spain wins with 0.60
Spain vs. Iran:
Spain wins with 0.60
Morocco vs. Iran:
Draw
___Starting group E:___
Brazil vs. Switzerland:
Draw
Brazil vs. Costa Rica:
Draw
Brazil vs. Serbia:
Brazil wins with 0.59
Switzerland vs. Costa Rica:
Draw
Switzerland vs. Serbia:
Switzerland wins with 0.57
Costa Rica vs. Serbia:
Draw
___Starting group D:___
Argentina vs. Iceland:
Draw
Argentina vs. Croatia:
Draw
Argentina vs. Nigeria:
Argentina wins with 0.64
Iceland vs. Croatia:
Draw
Iceland vs. Nigeria:
Iceland wins with 0.60
Croatia vs. Nigeria:
Croatia wins with 0.60
___Starting group G:___
Belgium vs. Panama:
Belgium wins with 0.68
Belgium vs. Tunisia:
Draw
Belgium vs. England:
Draw
Panama vs. Tunisia:
Tunisia wins with 0.84
Panama vs. England:
England wins with 0.84
Tunisia vs. England:
England wins with 0.61
___Starting group F:___
Germany vs. Mexico:
Germany wins with 0.56
Germany vs. Sweden:
Germany wins with 0.59
Germany vs. Korea Republic:
Germany wins with 0.73
Mexico vs. Sweden:
Draw
Mexico vs. Korea Republic:
Mexico wins with 0.65
Sweden vs. Korea Republic:
Sweden wins with 0.64
___Starting group H:___
Poland vs. Senegal:
Draw
Poland vs. Colombia:
Draw
Poland vs. Japan:
Poland wins with 0.66
Senegal vs. Colombia:
Colombia wins with 0.55
Senegal vs. Japan:
Senegal wins with 0.63
Colombia vs. Japan:
Colombia wins with 0.65
___Starting of the round_of_16___
Uruguay vs. Spain:
Spain wins with probability 0.54
Denmark vs. Croatia:
Denmark wins with probability 0.55
Switzerland vs. Mexico:
Mexico wins with probability 0.51
England vs. Poland:
Poland wins with probability 0.53
Egypt vs. Portugal:
Portugal wins with probability 0.84
Peru vs. Argentina:
Argentina wins with probability 0.56
Brazil vs. Germany:
Germany wins with probability 0.84
Belgium vs. Colombia:
Belgium wins with probability 0.54


___Starting of the quarterfinal___
Spain vs. Denmark:
Denmark wins with probability 0.52
Mexico vs. Poland:
Poland wins with probability 0.59
Portugal vs. Argentina:
Argentina wins with probability 0.53
Germany vs. Belgium:
Belgium wins with probability 0.52


___Starting of the semifinal___
Denmark vs. Poland:
Poland wins with probability 0.51
Argentina vs. Belgium:
Belgium wins with probability 0.57


___Starting of the final___
Poland vs. Belgium:
Belgium wins with probability 0.84


你可能感兴趣的:(Algorithm)