Tags: MachineLearning
The previous post covered the AdaBoost classifier. Here, following this article, I implement AdaBoost with scikit-learn; the referenced article describes the parameters of the scikit-learn AdaBoost library in detail.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles
x1, y1 = make_gaussian_quantiles(cov=2.0, n_samples=500, n_features=2, n_classes=2, random_state=1)
x2, y2 = make_gaussian_quantiles(mean=(3,3), cov=1.5, n_samples=500, n_features=2, n_classes=2, random_state=1)
X = np.concatenate((x1, x2))  # np.concatenate defaults to axis 0, i.e. it stacks the rows
y = np.concatenate((y1, -y2 + 1))  # flip the second blob's labels so the two classes interleave
# print(X.shape)
plt.scatter(X[:,0], X[:,1], marker='o', c=y)
# plt.show()
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
                         algorithm="SAMME",
                         n_estimators=300, learning_rate=0.8)
bdt.fit(X, y)
print("score:", bdt.score(X,y))
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
np.arange(y_min, y_max, 0.02))
print(xx.shape)
print(xx.ravel().shape)
Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])  # np.c_ concatenates along columns; np.r_ along rows
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()
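To go beyond the single training score above, the ensemble's fit can also be inspected round by round. Here is a small sketch using scikit-learn's staged_score API (my addition, not part of the referenced article):

# staged_score yields the training accuracy after each boosting round,
# i.e. with 1, 2, ..., n_estimators weak learners in the ensemble
staged = list(bdt.staged_score(X, y))
plt.plot(range(1, len(staged) + 1), staged)
plt.xlabel('number of weak learners')
plt.ylabel('training accuracy')
plt.show()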
The example above only uses simulated data, though. Below I use boosting to tackle a classification problem from Kaggle:
This dataset contains many categorical variables; all the features together come to 62 dimensions. The most important part of the whole process is hyperparameter tuning. The upfront data preparation matters too, of course, but here we only look at tuning. Instead of AdaBoost I use its stronger relative GBDT, but the tuning principle is the same.
With the default GBDT, accuracy sits around 0.844 and is then hard to improve. So we search the hyperparameters in rounds: first n_estimators (the number of boosting iterations), then max_depth and min_samples_split, then min_samples_split together with min_samples_leaf, and finally max_features and subsample. Combining all of these hyperparameters reaches an accuracy of 0.8502:
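As a compact summary of that round-by-round procedure, here is a minimal sketch (my own illustration with hypothetical grids, assuming the X_train/y_train split defined later in the notebook; the notebook below does the real searches cell by cell):

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Hypothetical grids, searched one round at a time; each round freezes the
# winners from earlier rounds before searching the next group of parameters.
rounds = [
    {'n_estimators': [60, 120, 300]},
    {'max_depth': [4, 6, 10], 'min_samples_split': [100, 500, 1000]},
    {'min_samples_split': [500, 1000, 1500], 'min_samples_leaf': [20, 80, 140]},
    {'max_features': [5, 10, 15], 'subsample': [0.7, 0.8, 0.9]},
]
best = {'learning_rate': 0.1, 'random_state': 10}
for grid in rounds:
    gs = GridSearchCV(GradientBoostingClassifier(**best), grid, scoring='roc_auc', cv=5)
    gs.fit(X_train, y_train)
    best.update(gs.best_params_)  # carry this round's best values forward
print(best)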
# coding: utf-8
# In[1]:
from string import ascii_uppercase
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn import svm
# In[2]:
adult_df = pd.read_csv("./adult.csv")
# In[3]:
adult_df.describe()
# In[4]:
adult_df.columns
# In[5]:
adult_df.head(10)
# In[6]:
adult_df['income'] = np.where(adult_df['income'] == '>50K', 1, 0)
print((adult_df['income'] == 1).sum())  # rows with income > 50K; len() of the boolean mask would just return the total row count
print((adult_df['income'] == 0).sum())  # rows with income <= 50K
# In[7]:
# define a helper to visualize how each column relates to the share of incomes above $50K
def hist_by(df, column):
    X, y = [], []
    for value in sorted(df[column].unique()):
        print(value)
        X.append(value)
        y.append(df[df[column] == value]['income'].mean() * 100.0)
    index = np.arange(len(X))
    width = 0.35
    plt.bar(index, y, width)
    plt.xticks(index + width / 2, X, rotation=70)
    plt.yticks(np.arange(0, 100, 10))
    plt.ylim(0, 100)
    plt.xlabel(column)
    plt.ylabel('Percentage of people whose income is above $50K')
    plt.tight_layout()
    plt.show()
# In[8]:
hist_by(adult_df, 'sex')
# In[9]:
hist_by(adult_df, "education")
# In[10]:
hist_by(adult_df, "marital.status")
# In[11]:
hist_by(adult_df, 'occupation')
# In[12]:
hist_by(adult_df, 'workclass')
# In[13]:
hist_by(adult_df, 'race')
# In[14]:
# drop rows where occupation or workclass is unknown ('?')
adult_df = adult_df[adult_df['occupation'] != '?']
adult_df = adult_df[adult_df['workclass'] != '?']
# In[15]:
# pd.get_dummies one-hot encodes a categorical column into indicator vectors
education_dummies = pd.get_dummies(adult_df['education'])
marital_dummies = pd.get_dummies(adult_df['marital.status'])
relationship_dummies = pd.get_dummies(adult_df['relationship'])
sex_dummies = pd.get_dummies(adult_df['sex'])
occupation_dummies = pd.get_dummies(adult_df['occupation'])
native_dummies = pd.get_dummies(adult_df['native.country'])
race_dummies = pd.get_dummies(adult_df['race'])
workclass_dummies = pd.get_dummies(adult_df['workclass'])
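As a side note (my addition, not in the original notebook), pandas can produce all of these encodings in one call, which also prefixes each new column with its source column's name:

# one-call alternative to the eight get_dummies calls above
categorical = ['education', 'marital.status', 'relationship', 'sex',
               'occupation', 'native.country', 'race', 'workclass']
encoded_df = pd.get_dummies(adult_df, columns=categorical)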
# In[16]:
def into_bins(column, bins):
    # label the bins A, B, C, ... so get_dummies later produces readable column names
    group_names = list(ascii_uppercase[:len(bins) - 1])
    binned = pd.cut(column, bins, labels=group_names)
    return binned
# In[17]:
adult_df['capital.loss'].describe()
# In[18]:
unique = sorted(adult_df['capital.loss'].unique())
plt.scatter(range(len(unique)), unique)
plt.ylabel('Capital Loss')
plt.tick_params(axis='x', which='both', labelbottom=False, bottom=False)  # hide x ticks (booleans, not 'off', in current matplotlib)
plt.show()
# In[19]:
loss_bins = into_bins(adult_df['capital.loss'], list(range(-1, 4500, 500)))
# print(loss_bins)
loss_dummies = pd.get_dummies(loss_bins)
# In[20]:
adult_df['capital.gain'].describe()
# print(adult_df['capital.gain'])
# In[21]:
unique = sorted(adult_df['capital.gain'].unique())
plt.scatter(range(len(unique)), unique)
plt.ylabel('Capital Gain')
plt.tick_params(axis='x', which='both', labelbottom=False, bottom=False)  # hide x ticks
plt.show()
# In[22]:
gain_bins = into_bins(adult_df['capital.gain'], list(range(-1, 42000, 5000)) + [100000])
gain_dummies = pd.get_dummies(gain_bins)
# In[23]:
X = pd.concat([adult_df[['age', 'hours.per.week']], gain_dummies, occupation_dummies, workclass_dummies, education_dummies, marital_dummies, race_dummies, sex_dummies], axis=1)
y = adult_df['income']
X_scaled = preprocessing.scale(X) # rescale data
from sklearn.decomposition import PCA
# pca = PCA(n_components=20)
# X_scale = pca.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.9, random_state=1)
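One caveat worth flagging (my note, not in the original): preprocessing.scale above is fit on the full dataset before the split, so test-set statistics leak into the scaling. A leakage-free sketch would split first and fit a StandardScaler on the training portion only:

from sklearn.preprocessing import StandardScaler

# split the raw features first, then learn scaling statistics on the
# training split only and apply them to both splits
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=1)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)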
# In[25]:
print("Shape of X:", X_scaled.shape)
# print(gain_bins)
# print(gain_dummies)
print(X_scaled)
# In[26]:
dct = DecisionTreeClassifier()
# clf = svm.SVC()
# In[27]:
# clf = AdaBoostClassifier(base_estimator=dct, algorithm="SAMME", random_state=1, n_estimators=100, learning_rate=1)
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed from current scikit-learn
from sklearn import metrics
clf0 = GradientBoostingClassifier()
clf0.fit(X_train, y_train)
# In[28]:
y_pred = clf0.predict(X_test)
print('Accuracy: {}'.format(accuracy_score(y_pred, y_test)))
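A single train/test split can be a noisy estimate; as a quick sanity check on the ~0.844 baseline (my addition), 5-fold cross-validation gives a steadier number:

from sklearn.model_selection import cross_val_score

# mean accuracy of the default GBDT across 5 folds
scores = cross_val_score(GradientBoostingClassifier(), X_scaled, y, cv=5)
print('CV accuracy: {:.4f} +/- {:.4f}'.format(scores.mean(), scores.std()))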
# In[29]:
param_test1 = {'n_estimators': [20, 40, 60, 80, 100, 120]}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                                             min_samples_leaf=20, max_depth=8,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test1, scoring='roc_auc', cv=5)
gsearch1.fit(X_train, y_train)
# In[30]:
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_  # grid_scores_ was replaced by cv_results_
# In[41]:
param_test2 = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_split': [100, 140, 180, 220]}
gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=120, min_samples_leaf=20,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(X_train, y_train)
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_
# In[43]:
param_test3 = {'min_samples_split': [500, 1000, 1200, 1400, 1500], 'min_samples_leaf': [80, 100, 120, 140]}
gsearch3 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=120, max_depth=10,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(X_train, y_train)
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_
# In[39]:
gbm1 = GradientBoostingClassifier(learning_rate=0.02, n_estimators=800, max_depth=4, min_samples_leaf=140,
                                  min_samples_split=1500, max_features='sqrt', subsample=0.9, random_state=10)
gbm1.fit(X_train, y_train)
# In[40]:
y_pred = gbm1.predict(X_test)
print('Accuracy: {}'.format(accuracy_score(y_pred, y_test)))
# In[44]:
param_test4 = {'max_features': [5, 10, 15, 20]}
gsearch4 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=120, max_depth=10, min_samples_leaf=80,
                                                             min_samples_split=1000, subsample=0.8, random_state=10),
                        param_grid=param_test4, scoring='roc_auc', cv=5)
gsearch4.fit(X_train, y_train)
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_
# In[ ]:
param_test5 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
gsearch5 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=120, max_depth=10, min_samples_leaf=80,
                                                             min_samples_split=1000, max_features=15, random_state=10),
                        param_grid=param_test5, scoring='roc_auc', cv=5)
gsearch5.fit(X_train, y_train)
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_
# In[37]:
gbm2 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=120, max_depth=10, min_samples_leaf=80,
                                  min_samples_split=1000, max_features=15, subsample=0.9, random_state=10)
gbm2.fit(X_train, y_train)
y_pred = gbm2.predict(X_test)
# In[38]:
print('Accuracy: {}'.format(accuracy_score(y_pred, y_test)))
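One last note: the grid searches above optimize roc_auc, while the final number reported is accuracy. To also report the metric the searches actually optimized, here is a quick test-set AUC check (my addition):

from sklearn.metrics import roc_auc_score

# roc_auc_score expects the predicted probability of the positive class
proba = gbm2.predict_proba(X_test)[:, 1]
print('Test AUC: {:.4f}'.format(roc_auc_score(y_test, proba)))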