Kaggle 入门:Bike Sharing Demand 预测

根据天气等因素对自行车租赁数量进行预测

利用 XGBoost 进行预测:

import csv
from math import log, exp

import numpy

__author__ = 'Whiker'
__mtime__ = '2016/5/26'

from datetime import datetime

import pandas
import xgboost


def parse_time(data):
	"""Split a ``YYYY-MM-DD HH:MM:SS`` timestamp string into calendar parts.

	Returns a ``(year, month, day, hour, dow)`` tuple, where ``dow`` is the
	value of ``datetime.weekday()`` (Monday is 0).
	"""
	stamp = datetime.strptime(data, "%Y-%m-%d %H:%M:%S")
	return stamp.year, stamp.month, stamp.day, stamp.hour, stamp.weekday()


def mylog(data):
	"""Return the natural logarithm of ``int(data) + 1``.

	The input is truncated to an integer first; the ``+ 1`` shift keeps a
	value of zero inside the domain of ``log``.
	"""
	return log(int(data) + 1)


# Load the training and test tables.
dataset = pandas.read_csv("input/train.csv")
testset = pandas.read_csv("input/test.csv")

# Targets are the 'casual' and 'registered' columns passed through mylog.
labelData = dataset['casual'].apply(mylog)
labelData2 = dataset['registered'].apply(mylog)
# The test timestamps become the first column of the submission file.
myIndex = testset['datetime']

feature1 = [
	'atemp', 'temp', 'hour', 'humidity', 'windspeed', 'month',
	'dow', 'workingday', 'holiday', 'year', 'weather',
]

# datetime: expand the timestamp string into calendar columns on both tables.
for frame in (dataset, testset):
	parsed_columns = zip(*frame['datetime'].apply(parse_time))
	for column_name, column in zip(('year', 'month', 'day', 'hour', 'dow'), parsed_columns):
		frame[column_name] = column


trainData = dataset[feature1].values
testData = testset[feature1].values

# casual ====================================================
# Rows before `offset` train the model; the remaining rows form the
# validation set that is watched for early stopping.
offset = 6000
xgtrain = xgboost.DMatrix(trainData[:offset, :], label=labelData[:offset])
xgeval = xgboost.DMatrix(trainData[offset:, :], label=labelData[offset:])
xgtest = xgboost.DMatrix(testData)

watchlist = [(xgtrain, 'train'), (xgeval, 'val')]
# "eta" is XGBoost's name for the learning rate. The previous "shrinkage" and
# "tree_num" keys are not XGBoost parameters, so the 0.1 rate was never
# applied; the number of boosting rounds is the third argument to train().
params = {"max_depth": 6, "silent": 1, "eta": 0.1}

xgModel = xgboost.train(list(params.items()), xgtrain, 450, watchlist, early_stopping_rounds=100)
# NOTE(review): best_iteration looks like a 0-based round index, so using it
# as ntree_limit may leave out the best round's tree (and 0 would mean "all
# trees") — confirm against the installed XGBoost version.
preds = xgModel.predict(xgtest, ntree_limit=xgModel.best_iteration).tolist()
# The labels went through mylog (log(x + 1)), so invert that transform.
preds = [exp(i) - 1 for i in preds]


# registered ================================================
feature2 = ['hour', 'humidity', 'atemp', 'temp', 'windspeed',
			'month', 'dow', 'workingday', 'holiday', 'year', 'weather']
trainData = dataset[list(feature2)].iloc[:, :].values
testData = testset[list(feature2)].iloc[:, :].values

# Same split as the casual model: rows before `offset` train, the rest validate.
xgtrain = xgboost.DMatrix(trainData[:offset, :], label=labelData2[:offset])
xgeval = xgboost.DMatrix(trainData[offset:, :], label=labelData2[offset:])
xgtest = xgboost.DMatrix(testData)
watchlist = [(xgtrain, 'train'), (xgeval, 'val')]

# "eta" is XGBoost's name for the learning rate. The previous "shrinkage" and
# "tree_num" keys are not XGBoost parameters, so the 0.1 rate was never
# applied; the number of boosting rounds is the third argument to train().
params = {"max_depth": 6, "silent": 1, "eta": 0.1}

xgModel = xgboost.train(list(params.items()), xgtrain, 450, watchlist, early_stopping_rounds=100)
# NOTE(review): best_iteration looks like a 0-based round index, so using it
# as ntree_limit may leave out the best round's tree (and 0 would mean "all
# trees") — confirm against the installed XGBoost version.
preds2 = xgModel.predict(xgtest, ntree_limit=xgModel.best_iteration).tolist()
# The labels went through mylog (log(x + 1)), so invert that transform.
preds2 = [exp(i) - 1 for i in preds2]


# ==================================================================

# The submitted count is the casual prediction plus the registered one.
# Build the sums as a real list: on Python 3 map() returns a lazy iterator,
# which numpy.column_stack cannot use as a column.
totals = [casual + registered for casual, registered in zip(preds, preds2)]
preds = numpy.column_stack((myIndex, totals)).tolist()

# One row per test timestamp: [datetime, count].
with open("result/sub_xgb_linear.csv", "w") as output:
	writer = csv.writer(output, lineterminator='\n')
	writer.writerow(["datetime", "count"])
	writer.writerows(preds)


利用随机森林(RandomForest)进行预测:

import csv
from math import log, exp

import numpy
from sklearn.ensemble import RandomForestRegressor

__author__ = 'Whiker'
__mtime__ = '2016/5/26'

from datetime import datetime

import pandas


def parse_time(data):
	"""Break a ``YYYY-MM-DD HH:MM:SS`` string into its date/time fields.

	Returns ``(year, month, day, hour, dow)``; ``dow`` comes from
	``datetime.weekday()``, so Monday is 0.
	"""
	moment = datetime.strptime(data, "%Y-%m-%d %H:%M:%S")
	calendar_part = (moment.year, moment.month, moment.day)
	return calendar_part + (moment.hour, moment.weekday())


def mylog(data):
	"""Compute ``log(int(data) + 1)`` using the natural logarithm.

	``data`` is converted with ``int`` before the shift, so fractional
	input is truncated.
	"""
	shifted = 1 + int(data)
	return log(shifted)


# Read the training and test tables.
dataset = pandas.read_csv("input/train.csv")
testset = pandas.read_csv("input/test.csv")

# Targets: the 'casual' and 'registered' columns after mylog, as plain arrays.
casual_series = dataset['casual'].apply(mylog)
registered_series = dataset['registered'].apply(mylog)
labelData = casual_series.values
labelData2 = registered_series.values
# Test timestamps, later written as the first column of the submission.
myIndex = testset['datetime']

feature1 = [
	'atemp', 'temp', 'hour', 'humidity', 'windspeed', 'month',
	'dow', 'workingday', 'holiday', 'year', 'weather',
]

# datetime: derive year/month/day/hour/dow columns on both tables.
calendar_columns = ('year', 'month', 'day', 'hour', 'dow')
for table in (dataset, testset):
	parsed = zip(*table['datetime'].apply(parse_time))
	for name, values in zip(calendar_columns, parsed):
		table[name] = values

trainData = dataset[feature1].values
testData = testset[feature1].values

# casual: fit on the mylog-transformed labels, then undo the transform
# (exp(x) - 1) on the test predictions.
rfModel = RandomForestRegressor(n_estimators=100)
rfModel.fit(trainData, labelData)
preds = [exp(value) - 1 for value in rfModel.predict(testData).tolist()]


# registered ================================================
feature2 = [
	'hour', 'humidity', 'atemp', 'temp', 'windspeed', 'month',
	'dow', 'workingday', 'holiday', 'year', 'weather',
]
trainData = dataset[feature2].values
testData = testset[feature2].values

# Fit on the mylog-transformed 'registered' labels, then undo the transform
# (exp(x) - 1) on the test predictions.
rfModel = RandomForestRegressor(n_estimators=100)
rfModel.fit(trainData, labelData2)
preds2 = [exp(value) - 1 for value in rfModel.predict(testData).tolist()]


# ==================================================================

# The submitted count is the casual prediction plus the registered one.
# Build the sums as a real list: on Python 3 map() returns a lazy iterator,
# which numpy.column_stack cannot use as a column.
totals = [casual + registered for casual, registered in zip(preds, preds2)]
preds = numpy.column_stack((myIndex, totals)).tolist()

# One row per test timestamp: [datetime, count].
with open("result/sub_rf.csv", "w") as output:
	writer = csv.writer(output, lineterminator='\n')
	writer.writerow(["datetime", "count"])
	writer.writerows(preds)



最后评分约为 0.46。

你可能感兴趣的:(kaggle 入门 Bike sharing Demand prediction)