在看H2O,把阅读官方booklet和API了解到的一些东西随手记了下来,以为备忘,遂成此小文。
# Load the H2O client and connect to an H2O instance on localhost:54321.
# nthreads = -1 is passed through as in the original call; max_mem_size sets
# the memory limit handed to H2O ("4g").
# Fix: the argument was misspelled `max_men_size`, which h2o.init() rejects
# as an unused argument.
library(h2o)
h2o.init(
  ip = "localhost",
  port = 54321,
  nthreads = -1,
  max_mem_size = "4g"
)
H2O:"R,你用table秀一下这些信息"
R:"臣妾做不到啊……"
//话说我得有多无聊,大晚上编这种段子。。。
as.data.frame():H2O->R
as.h2o():R->H2O
str.H2OFrame():查看下其中元素信息确保转换正确。
# Import dataset and display summary
# Load the H2O client, initialise it with default settings, then import the
# airlines CSV from its URL into a frame keyed "airlines.hex" and print a
# summary of the imported frame.
library(h2o)
h2o.init()

airlinesURL <- "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears2k.csv"
airlines.hex <- h2o.importFile(
  path = airlinesURL,
  destination_frame = "airlines.hex"
)

summary(airlines.hex)
# Inspect the ArrDelay column: quantiles (missing values removed) followed
# by a histogram.
# Disabled step kept from the original:
# high_na_columns = h2o.ignoreColumns(data = airlines.hex)
quantile(
  x = airlines.hex$ArrDelay,
  na.rm = TRUE
)
h2o.hist(airlines.hex$ArrDelay)
# Row counts grouped by origin airport and by month, computed on the H2O
# side with na.methods = "rm".
originFlights <- h2o.group_by(
  data = airlines.hex,
  by = "Origin",
  nrow("Origin"),
  gb.control = list(na.methods = "rm")
)
flightsByMonth <- h2o.group_by(
  data = airlines.hex,
  by = "Month",
  nrow("Month"),
  gb.control = list(na.methods = "rm")
)

# Copy both grouped results into local R data frames.
originFlights.R <- as.data.frame(originFlights)
flightsByMonth.R <- as.data.frame(flightsByMonth)
# Cancellation ratio per month: summed Cancelled flag divided by the number
# of flights in that month.

# Print the position of the "Cancelled" column (integer(0) if it is absent).
which(colnames(airlines.hex) == "Cancelled")

cancellationsByMonth <- h2o.group_by(
  data = airlines.hex,
  by = "Month",
  sum("Cancelled"),
  gb.control = list(na.methods = "rm")
)

# flightsByMonth was grouped by the single key "Month", so its second column
# holds the per-month row count. Select it by position rather than by its
# auto-generated name: NOTE(review) that name appears to differ between H2O
# releases (`nrow_Month` vs `nrow`) -- confirm against the installed version.
cancellation_rate <- cancellationsByMonth$sum_Cancelled / flightsByMonth[, 2]

# Put the month next to its ratio, then copy the table into a local R
# data frame.
rates_table <- h2o.cbind(flightsByMonth$Month, cancellation_rate)
rates_table.R <- as.data.frame(rates_table)
# Split the airlines frame with ratios = 0.85; the first piece is used for
# training and the second for testing.
airlines.split <- h2o.splitFrame(
  data = airlines.hex,
  ratios = 0.85
)
airlines.train <- airlines.split[[1]]
airlines.test <- airlines.split[[2]]

# Tabulate the Cancelled column in each piece.
h2o.table(airlines.train$Cancelled)
h2o.table(airlines.test$Cancelled)
# Response column and predictor columns for the model.
Y <- "IsDepDelayed"
X <- c(
  "Origin", "Dest", "DayofMonth", "Year", "UniqueCarrier",
  "DayOfWeek", "Month", "DepTime", "ArrTime", "Distance"
)

# Fit a binomial GLM with alpha = 0.5 on the training frame.
airlines.glm <- h2o.glm(
  x = X,
  y = Y,
  training_frame = airlines.train,
  family = "binomial",
  alpha = 0.5
)

# Print the model summary.
summary(airlines.glm)

# Score the test frame with the fitted model.
pred <- h2o.predict(
  object = airlines.glm,
  newdata = airlines.test
)

# Summarise the p1 column of the predictions.
# NOTE(review): assumes the prediction frame exposes a column named `p1`
# for this response -- confirm against the actual output columns.
summary(pred$p1)
下篇中将看心情介绍一些数据处理中的常用函数以及简单说明一些其他模型的用法。
饿了只好去睡觉……