python--h2o实现广义线性模型

%matplotlib inline
import random, os, sys
import h2o
import pandas
import pprint
import operator
import matplotlib
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from tabulate import tabulate
from h2o.automl import H2OAutoML
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool
import psutil
import numpy as np

# Connect to the H2O server (a client/server version mismatch is tolerated
# because strict_version_check is disabled).
h2o.init(strict_version_check=False)
# Toggle for interactive plotting; set to False for headless runs.
interactive = True
if not interactive:
    # The backend must be chosen before pyplot is imported. The legacy
    # ``warn`` keyword was removed in Matplotlib 3.3 and would raise
    # TypeError there, so it is no longer passed.
    matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Import the dataset
print("Import and Parse airlines data")
data = h2o.import_file(path='data/allyears2k.csv')
data.describe()
# 
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot a random sample of column ``y`` against column ``x``.

    A box plot grouped by ``x`` is drawn when the sampled ``x`` column has
    pandas dtype ``object``; otherwise a scatter plot is drawn. When ``fit``
    is true, a gaussian GLM of ``y`` on ``x`` is trained on the full frame
    and, for a numeric ``x``, its fitted line is overlaid.

    Relies on the module-level names ``h2o``, ``plt`` and ``interactive``.

    Args:
        data: H2OFrame containing both columns.
        x: Name of the predictor column.
        y: Name of the response column.
        max_points: Approximate number of rows to sample for plotting.
        fit: Whether to train the GLM and overlay its fitted line.
    """
    coeff = None
    if fit:
        # Gaussian GLM, i.e. an ordinary linear regression of y on x.
        lr = H2OGeneralizedLinearEstimator(family = "gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
        # Only the coefficients are needed from here on, so drop the model
        # from the cluster right away, using the public model_id rather
        # than the private _id attribute.
        h2o.remove(lr.model_id)

    df = data[[x, y]]
    # Column of uniform random numbers laid out like the source data; it is
    # used to keep roughly max_points rows.
    runif = df[y].runif()
    df_subset = df[runif < float(max_points) / data.nrow]
    # Pull the sampled rows into the Python session.
    df_py = h2o.as_list(df_subset)

    # String-valued x -> box plot; anything else -> scatter plot.
    is_categorical = df_py[x].dtype == "object"
    if interactive:
        if is_categorical:
            df_py.boxplot(column = y, by = x)
        else:
            df_py.plot(x = x, y = y, kind = "scatter")

    # The fitted line needs a coefficient stored under the name x (not the
    # case for a categorical predictor, as far as the coefficient naming
    # goes) and at least one sampled row.
    if coeff is not None and not is_categorical and x in coeff and not df_py.empty:
        # Series.min()/max() skip missing values; the builtin min()/max()
        # can return NaN when the column contains them.
        x_min = df_py[x].min()
        x_max = df_py[x].max()
        y_min = coeff["Intercept"] + coeff[x] * x_min
        y_max = coeff["Intercept"] + coeff[x] * x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive:
        plt.show()
# Scatter plot of AirTime against Distance with the fitted GLM line overlaid.
scatter_plot(data, x="Distance", y="AirTime", fit=True)

Distance 与 AirTime 的散点图(含线性拟合直线)如下:
python--h2o实现广义线性模型_第1张图片

# Plot ArrDelay by UniqueCarrier from a larger sample, without fitting a model.
scatter_plot(data, x="UniqueCarrier", y="ArrDelay", fit=False, max_points=5000)

按 UniqueCarrier 分组的 ArrDelay 箱线图如下:
python--h2o实现广义线性模型_第2张图片

# Group the rows by month, then request a row count and the sum of the
# "Cancelled" column for each group.
grouped = data.group_by("Month")
aggregated = grouped.count().sum("Cancelled")
# Materialise the aggregation result as a frame.
bpd = aggregated.frame
# Inspect the result: contents, summary statistics and dimensions.
bpd.show()
bpd.describe()
bpd.dim

python--h2o实现广义线性模型_第3张图片

# Convert the "Year", "Month", "DayOfWeek" and "Cancelled" columns to factors.
data["Year"] = data["Year"].asfactor()
data["Month"] = data["Month"].asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()

# Compute and plot the scheduled travel time.
# The CRS*Time columns are treated as HHMM-encoded clock times: the hour is
# the integer part of value / 100 and the minutes are the remainder. Floor
# division is required here; true division would leave a fractional hour
# (1430 / 100 = 14.3) and inflate the minutes-since-midnight value.
hour1 = data["CRSArrTime"] // 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1 * 60 + mins1

hour2 = data["CRSDepTime"] // 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2 * 60 + mins2

# Keep only positive durations; every other row gets a missing value taken
# from a one-column frame of None values with the same number of rows.
duration = arrTime - depTime
data["TravelTime"] = (duration > 0).ifelse(duration, h2o.H2OFrame([[None]] * data.nrow))
scatter_plot(data, "Distance", "TravelTime")

python--h2o实现广义线性模型_第4张图片

# Impute missing "Distance" values, grouping by the (Origin, Dest) pair,
# then redraw the plot on the imputed data.
data.impute(column="Distance", by=["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")

# Split the frame 90% / 10%; the smaller part is used for validation below.
train, test = data.split_frame(ratios=[0.9])

# Response column and predictor columns.
myY = "IsDepDelayed"
myX = [
    "Origin",
    "Dest",
    "Year",
    "UniqueCarrier",
    "DayOfWeek",
    "Month",
    "Distance",
    "FlightNum",
]

# Train a binomial GLM with standardisation enabled.
glm_settings = {"family": "binomial", "standardize": True}
data_glm = H2OGeneralizedLinearEstimator(**glm_settings)
data_glm.train(x=myX, y=myY, training_frame=train, validation_frame=test)

你可能感兴趣的:(机器学习)