%matplotlib inline
import random, os, sys
import h2o
import pandas
import pprint
import operator
import matplotlib
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from tabulate import tabulate
from h2o.automl import H2OAutoML
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool
import psutil
import numpy as np
# Connect to the H2O cluster; a client/server version mismatch is tolerated
# because strict_version_check is disabled.
h2o.init(strict_version_check=False)

# Set to False to run without showing plots; the non-GUI "Agg" backend is
# then selected so no display is required.
interactive = True
# The backend has to be chosen before pyplot is imported. The `warn` keyword
# is no longer accepted by matplotlib.use() in newer Matplotlib releases
# (passing it raises TypeError), so only the backend name is given.
if not interactive:
    matplotlib.use('Agg')
import matplotlib.pyplot as plt

print("Import and Parse airlines data")
data = h2o.import_file(path='data/allyears2k.csv')
data.describe()
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot column ``y`` against column ``x`` using a random sample of ``data``.

    A box plot grouped by ``x`` is drawn when the sampled ``x`` column has
    pandas dtype ``object`` (string values); otherwise a scatter plot is
    drawn. Plots are only produced/shown when the module-level
    ``interactive`` flag is true.

    Args:
        data: H2OFrame holding both columns.
        x: Name of the column used for the x axis (or the grouping column).
        y: Name of the column used for the y axis.
        max_points: Approximate number of rows to sample for plotting; each
            row is kept with probability ``max_points / data.nrow``.
        fit: When true, train a gaussian GLM of ``y`` on ``x`` and overlay
            the fitted line. The line is only drawn for a numeric ``x``.
    """
    if fit:
        lr = H2OGeneralizedLinearEstimator(family="gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()

    # Sample rows on the H2O side so only ~max_points rows are pulled into
    # local memory.
    df = data[[x, y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points) / data.nrow]
    df_py = h2o.as_list(df_subset)

    # The coefficients were already copied out, so the temporary model can be
    # removed from the cluster.
    if fit:
        h2o.remove(lr._id)

    categorical_x = df_py[x].dtype == "object"
    if categorical_x:
        # String-valued x: box-and-whisker plot of y per x level.
        if interactive:
            df_py.boxplot(column=y, by=x)
    else:
        if interactive:
            df_py.plot(x=x, y=y, kind="scatter")

    # A straight fit line only makes sense for a numeric x; for a categorical
    # x there is no single coeff[x] entry to use.
    if fit and not categorical_x:
        # Series.min()/max() skip missing values, whereas the builtin
        # min()/max() can return NaN when the column contains NaN and raise
        # on an empty sample.
        x_min = df_py[x].min()
        x_max = df_py[x].max()
        y_min = coeff["Intercept"] + coeff[x] * x_min
        y_max = coeff["Intercept"] + coeff[x] * x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")

    if interactive:
        plt.show()
# Distance vs. AirTime, with the fitted line overlaid.
scatter_plot(data, x="Distance", y="AirTime", fit=True)
# The scatter plot for Distance is shown below:
# ArrDelay per UniqueCarrier from a sample of up to ~5000 rows, no fit line.
scatter_plot(data, x="UniqueCarrier", y="ArrDelay", max_points=5000, fit=False)
# The box plot for UniqueCarrier is shown below:
# Per-month aggregation: row count plus the sum of the "Cancelled" column.
grouped = data.group_by("Month")
monthly_aggregates = grouped.count().sum("Cancelled")
bpd = monthly_aggregates.frame
bpd.show()
bpd.describe()
# Bare expression: its value is only displayed when it ends a notebook cell.
bpd.dim

# Convert these columns to categorical (factor) columns.
for factor_col in ("Year", "Month", "DayOfWeek", "Cancelled"):
    data[factor_col] = data[factor_col].asfactor()
# Convert the scheduled arrival and departure times to minutes. The values
# are split into an hour part (value / 100) and a minute part (value % 100).
# The hour must be a whole number: plain "/" keeps the fraction (1430 / 100
# = 14.3), which would count the minutes twice. Subtracting the minutes
# first makes the division exact.
mins1 = data["CRSArrTime"] % 100
hour1 = (data["CRSArrTime"] - mins1) / 100
arrTime = hour1*60 + mins1

mins2 = data["CRSDepTime"] % 100
hour2 = (data["CRSDepTime"] - mins2) / 100
depTime = hour2*60 + mins2

# Keep only positive durations; rows where arrival is not after departure
# get a missing value instead.
travel = arrTime - depTime
data["TravelTime"] = (travel > 0).ifelse(travel, h2o.H2OFrame([[None]] * data.nrow))
scatter_plot(data, "Distance", "TravelTime")

# Fill missing Distance values per (Origin, Dest) group. The result is not
# assigned, so this relies on impute() updating `data` in place.
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
# Split the frame with a 0.9 ratio: first part for training, the rest for
# validation.
train, test = data.split_frame(ratios=[0.9])

# Response column and predictor columns for the model.
myY = "IsDepDelayed"
myX = [
    "Origin",
    "Dest",
    "Year",
    "UniqueCarrier",
    "DayOfWeek",
    "Month",
    "Distance",
    "FlightNum",
]

# Binomial GLM with standardized predictors, validated on the held-out split.
data_glm = H2OGeneralizedLinearEstimator(standardize=True, family="binomial")
data_glm.train(x=myX, y=myY, training_frame=train, validation_frame=test)