Kaggle竞赛PetFinder日记,第一次提交:简单粗暴先裸奔一下
竞赛题目见:
https://www.kaggle.com/c/petfinder-adoption-prediction
第一次提交:简单粗暴先裸奔一下。不做任何特征工程,直接用随机森林,本地验证集上大约可以得0.4分;先提交一个结果看看线上能得几分:0.287,哈哈,排名第635。这样已经把我的第一次Kaggle流程跑通了,休息一下,回头继续努力。
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")
# Feature columns fed to every model. Defined once so the training frame and
# the competition test frame are always sliced identically.
FEATURE_COLUMNS = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized',
    'Health', 'Quantity', 'Fee', 'State', 'VideoAmt', 'PhotoAmt',
]
TARGET_COLUMN = 'AdoptionSpeed'
# Class labels reported for the target (the same five classes the original
# report named "0".."4").
ADOPTION_SPEED_LABELS = [0, 1, 2, 3, 4]
# Fixed seed so the hold-out split, the printed scores and the submission are
# reproducible from run to run.
RANDOM_STATE = 42


def fill_missing_target(frame, column=TARGET_COLUMN):
    """Return a copy of ``frame`` with ``column`` median-imputed and cast to int.

    Replaces the former ``sklearn.preprocessing.Imputer`` usage (that class no
    longer exists in current scikit-learn) with the equivalent pandas
    operation: missing values are filled with the column median, then the
    column is truncated to integers.

    Args:
        frame: DataFrame containing ``column``.
        column: Name of the column to impute.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        ValueError: If the column has missing values but no observed value to
            compute a median from.
    """
    result = frame.copy()
    values = result[column]
    if values.isna().any():
        median = values.median()
        if pd.isna(median):
            raise ValueError(
                "cannot impute %r: column has no observed values" % column)
        values = values.fillna(median)
    result[column] = values.astype(int)
    return result


def build_submission(pet_ids, predictions):
    """Assemble the submission table from pet ids and predicted classes.

    Args:
        pet_ids: Iterable of pet identifiers, one per prediction.
        predictions: Sequence of predicted AdoptionSpeed values, same length
            as ``pet_ids``.

    Returns:
        DataFrame with the columns ``PetID`` and ``AdoptionSpeed``, in that
        order.
    """
    return pd.DataFrame(
        {'PetID': list(pet_ids), 'AdoptionSpeed': predictions},
        columns=['PetID', 'AdoptionSpeed'])


def main():
    """Train the baseline models, print hold-out scores, write submission.csv."""
    train = fill_missing_target(pd.read_csv("../input/train/train.csv"))
    x = train[FEATURE_COLUMNS]
    y = train[TARGET_COLUMN]

    # Hold out 25% of the labelled data for a local sanity check. Named
    # *_valid so it is not confused with the competition test set below.
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y, test_size=0.25, random_state=RANDOM_STATE)

    # Baseline 1: a single decision tree.
    dtc = DecisionTreeClassifier(random_state=RANDOM_STATE)
    dtc.fit(x_train, y_train)
    dt_predict = dtc.predict(x_valid)
    print(dtc.score(x_valid, y_valid))
    # Pass labels explicitly so the report still works if a class happens to
    # be absent from the hold-out split.
    print(classification_report(
        y_valid, dt_predict,
        labels=ADOPTION_SPEED_LABELS,
        target_names=["0", "1", "2", "3", "4"]))

    # Baseline 2: random forest, scored on the same hold-out split.
    rfc = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    rfc.fit(x_train, y_train)
    print(rfc.score(x_valid, y_valid))

    # Final model: refit the random forest on all labelled data, then predict
    # the competition test set.
    final_model = RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE)
    final_model.fit(x, y)
    test = pd.read_csv("../input/test/test.csv")
    final_result = final_model.predict(test[FEATURE_COLUMNS])

    submission_df = build_submission(test['PetID'], final_result)
    submission_df.to_csv('submission.csv', index=False)


if __name__ == "__main__":
    main()