My teacher recently assigned a project that gave me my first hands-on contact with xgboost. Until now I had only heard the theory in lectures; working through the assignment gave me a much deeper understanding of the model. Task description: identifying target customers for Happy Customer Bank
https://discuss.analyticsvidhya.com/t/hackathon-3-x-predict-customer-worth-for-happy-customer-bank/3802
I won't describe the project itself in detail here.
It took me half a day to figure out how to convert the string features into numeric types for training. It is genuinely tedious; for example, "Female" and "Male" have to be represented as numbers:

Female -> 1, Male -> 0
I converted the multi-valued string features by enumerating their distinct values and mapping each value to an integer code, as in the following code:
array = ['Gender', 'City', 'DOB', 'Lead_Creation_Date', 'Employer_Name', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form', 'Device_Type', 'Var2', 'Source']
#array = train.columns  # would list all column labels
#print len(array)  # can be used to check the array length
for i in range(len(array)):
    class_mapping = {label: idx for idx, label in enumerate(set(train[array[i]]))}
    train[array[i]] = train[array[i]].map(class_mapping)
    # print train[array[i]]
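One caveat I ran into later: because the mapping is rebuilt from set() separately for each DataFrame, train and test can assign different codes to the same category. A minimal sketch of one way around this, assuming the same column list and DataFrames as above — build each column's mapping from the union of both files' values:

# Build one mapping per column from the union of train and test values,
# so the same category always gets the same integer code in both files.
for col in array:
    values = set(train[col]) | set(test[col])
    class_mapping = {label: idx for idx, label in enumerate(values)}
    train[col] = train[col].map(class_mapping)
    test[col] = test[col].map(class_mapping)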
The code for computing the error:
#-*-coding:utf-8-*-
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import time
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
start_time = time.time()
path = "D:\\"
# read in the data
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
#Sex_ohe_1 = pd.get_dummies(train['Gender'])
#print Sex_ohe_1.head()
array = ['Gender', 'City', 'DOB', 'Lead_Creation_Date', 'Employer_Name', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form', 'Device_Type', 'Var2', 'Source']
#array = train.columns  # would list all column labels
#print len(array)  # can be used to check the array length
for i in range(len(array)):
    class_mapping = {label: idx for idx, label in enumerate(set(train[array[i]]))}
    train[array[i]] = train[array[i]].map(class_mapping)
    # print train[array[i]]
for i in range(len(array)):
    class_mapping = {label: idx for idx, label in enumerate(set(test[array[i]]))}
    test[array[i]] = test[array[i]].map(class_mapping)
    # print test[array[i]]
#Use train_test_split to split the training data 80:20 into a training set and a validation set
train1, val = train_test_split(train, test_size = 0.2, random_state=1)
y = train1.Disbursed
x = train1.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
test = test.drop('ID', axis=1)
valy = val.Disbursed
valx = val.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
xgb_val = xgb.DMatrix(valx, label = valy)
xgb_train1 = xgb.DMatrix(x, label = y)
xgb_test = xgb.DMatrix(test)
print xgb_test.feature_names
params = {
'booster':'gbtree',
'objective': 'multi:softmax', # multi-class classification
'num_class':10, # number of classes, used together with multi:softmax (Disbursed is binary, so 2 would be enough)
'gamma':0.1, # controls post-pruning; the larger it is, the more conservative the model. Typically around 0.1-0.2.
'max_depth':12, # depth of each tree; deeper trees overfit more easily
'lambda':2, # L2 regularization on the leaf weights; larger values make the model less prone to overfitting
'subsample':0.7, # row subsampling of the training instances
'colsample_bytree':0.7, # column subsampling when building each tree
'min_child_weight':3,
# This defaults to 1 and is the minimum sum of the hessian h over a leaf. For an
# imbalanced 0-1 classification task where h is around 0.01, min_child_weight = 1
# means a leaf must contain at least about 100 samples.
# It strongly affects the result: it bounds the sum of second derivatives in a leaf,
# and the smaller the value, the easier it is to overfit.
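# A quick sanity check of that rule of thumb: if each sample contributes
# h of roughly 0.01, then min_child_weight = 3 is only satisfied by leaves
# holding at least 3 / 0.01 = 300 samples before a further split is allowed.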
'silent':0 , # 1 suppresses the run log; best left at 0
'eta': 0.01, # learning rate (shrinkage)
'seed':1000,
'nthread':7, # number of CPU threads
#'eval_metric': 'auc'
}
plst = list(params.items())
num_iter = 5000
watchList = [(xgb_train1, "train"), (xgb_val, "val")]
model = xgb.train(plst, xgb_train1, num_iter, watchList, early_stopping_rounds=100)
model.save_model(path + "xgboost.model")
print "best_ntree_limit", model.best_ntree_limit
predict = model.predict(xgb_test, ntree_limit = model.best_ntree_limit)
np.savetxt(path + 'result.csv', np.c_[range(1, len(test)+1), predict], delimiter=',',header='ID, Disbursed',comments='',fmt='%d')
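# Note: this writes sequential IDs 1..len(test) into the submission. The
# competition presumably expects the original test IDs, which were dropped
# above -- keeping a copy of test['ID'] before the drop and writing that
# column instead would be safer.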
cost_time = time.time() - start_time
print "elapsed time:", cost_time
Test results:
train-merror:0.014695 val-merror:0.014365
best_ntree_limit 1
elapsed time: 72.9449999332
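In hindsight, Disbursed is a 0/1 target, and the commented-out 'eval_metric': 'auc' hints that the leaderboard is scored on ROC-AUC, so a binary objective would be a more natural fit than multi:softmax with num_class = 10. A sketch of the parameters I would try instead ('binary:logistic' and 'eval_metric': 'auc' are standard xgboost options; everything else reuses the variables from the script above):

# Sketch: binary objective with AUC as the evaluation metric.
# binary:logistic outputs a probability in [0, 1] for the positive class,
# which can be submitted directly for an AUC-scored competition.
params_binary = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'seed': 1000,
}
model = xgb.train(list(params_binary.items()), xgb_train1, num_iter,
                  watchList, early_stopping_rounds=100)
pred_prob = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)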
The code for computing the accuracy:
#-*-coding:utf-8-*-
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
import time
from sklearn import preprocessing
from sklearn import metrics
from matplotlib import pyplot as plt
from xgboost import plot_importance
start_time = time.time()
path = "D:\\"
# read in the data
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
#Sex_ohe_1 = pd.get_dummies(train['Gender'])
#print Sex_ohe_1.head()
array = ['Gender', 'City', 'DOB', 'Lead_Creation_Date', 'Employer_Name', 'Salary_Account', 'Mobile_Verified', 'Var1', 'Filled_Form', 'Device_Type', 'Var2', 'Source']
#array = train.columns  # would list all column labels
#print len(array)  # can be used to check the array length
for i in range(len(array)):
    class_mapping = {label: idx for idx, label in enumerate(set(train[array[i]]))}
    train[array[i]] = train[array[i]].map(class_mapping)
    # print train[array[i]]
for i in range(len(array)):
    class_mapping = {label: idx for idx, label in enumerate(set(test[array[i]]))}
    test[array[i]] = test[array[i]].map(class_mapping)
    # print test[array[i]]
#Use train_test_split to split the training data 80:20 into a training set and a validation set
train1, val = train_test_split(train, test_size = 0.2, random_state=1)
print len(train1), len(val)
y = train1.Disbursed
x = train1.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
test = test.drop('ID', axis=1)
valy = val.Disbursed
valx = val.drop(['Disbursed', 'ID', 'LoggedIn'], axis=1)
# Standardize the features (fit the scaler on the training split only and
# reuse it for the validation and test sets, so all three share one scale)
scaler = preprocessing.StandardScaler().fit(x)
x = scaler.transform(x)
#x = preprocessing.scale(x)
print(x)
valx = scaler.transform(valx)
#valx = preprocessing.scale(valx)
print(valx)
test = scaler.transform(test)
#test = preprocessing.scale(test)
print(test)
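# Note: gradient-boosted trees split on per-feature thresholds, so
# standardizing the inputs should not materially change the model here;
# it is kept mainly as practice with the preprocessing API.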
xgb_val = xgb.DMatrix(valx, label = valy)
xgb_train1 = xgb.DMatrix(x, label = y)
xgb_test = xgb.DMatrix(test)
print xgb_test.feature_names
params = {
'booster':'gbtree',
'objective': 'multi:softmax', # multi-class classification
'num_class':2, # number of classes, used together with multi:softmax
'gamma':0.1, # controls post-pruning; the larger it is, the more conservative the model. Typically around 0.1-0.2.
'max_depth':6, # depth of each tree; deeper trees overfit more easily
'lambda':2, # L2 regularization on the leaf weights; larger values make the model less prone to overfitting
'subsample':0.7, # row subsampling of the training instances
'colsample_bytree':0.7, # column subsampling when building each tree
'min_child_weight':3,
# This defaults to 1 and is the minimum sum of the hessian h over a leaf. For an
# imbalanced 0-1 classification task where h is around 0.01, min_child_weight = 1
# means a leaf must contain at least about 100 samples.
# It strongly affects the result: it bounds the sum of second derivatives in a leaf,
# and the smaller the value, the easier it is to overfit.
'silent':0 , # 1 suppresses the run log; best left at 0
'eta': 0.05, # learning rate (shrinkage)
'seed':1000,
'nthread':7, # number of CPU threads
#'eval_metric': 'auc'
}
plst = list(params.items())
num_iter = 10000
watchlist = [(xgb_train1,'train'),(xgb_val,'val')]
bst = xgb.train(plst, xgb_train1, num_boost_round=num_iter, evals=watchlist)
#bst = xgb.train(plst, xgb_train1, num_iter)
# note: without early_stopping_rounds, best_ntree_limit simply equals num_iter
train_preds = bst.predict(xgb_train1)
#print "train_preds", train_preds
train_predictions = [round(value) for value in train_preds]
#print "train_predictions", train_predictions
y_train = xgb_train1.get_label()
#print "y_train", y_train
count = 0
for i in range(len(train_predictions)):
    if train_predictions[i] == 1:
        count = count + 1
print "count:", count  # how many training samples were predicted positive
train_acc = accuracy_score(y_train, train_predictions)
print "Train Accuary:%.2f%%" %(train_acc * 100.0)
print xgb_train1
print xgb_test.feature_names
preds = bst.predict(xgb_val)
test_predictions = [round(value) for value in preds]
y_test = xgb_val.get_label()
test_acc = accuracy_score(y_test, test_predictions)
print "Test Accuary:%.2f%%" %(test_acc * 100.0)
# To check whether the model is robust. Macro-averaging computes the metric for each
# class separately and then takes the arithmetic mean over the classes. Micro-averaging
# pools every instance, regardless of class, into one global confusion matrix and
# computes the metric from that. (Source: 谈谈评价指标中的宏平均和微平均; a toy
# comparison of the two is sketched after the test results below.)
print set(y_train)
print set(train_predictions)
train_f1 = metrics.f1_score(y_train, train_preds, average="weighted", labels=np.unique(train_preds))
print "Train F1 score:%.6f" %train_f1
#labels=np.unique(preds) says you are not interested in the scores of labels that were never predicted, and explicitly lists the labels you do care about (those predicted at least once)
print set(y_test)
print set(test_predictions)
test_f1 = metrics.f1_score(y_test, preds, average="weighted", labels=np.unique(preds))
print "Test F1 score:%.6f" %test_f1
bst.save_model(path + "xgboost.model")
print "best_ntree_limit", bst.best_ntree_limit
predict = bst.predict(xgb_test, ntree_limit = bst.best_ntree_limit)
np.savetxt(path + 'result.csv', np.c_[range(1, len(test)+1), predict], delimiter=',',header='ID, Disbursed',comments='',fmt='%d')
cost_time = time.time() - start_time
print "elapsed time:", cost_time
plot_importance(bst)
plt.show()
Test results:
Train Accuracy:99.99%
Test Accuracy:98.54%
Train F1 score:0.999871
Test F1 score:0.978390
best_ntree_limit 10000
elapsed time: 2621.94099998
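To make the macro/micro distinction from the comments above concrete, here is a small self-contained sketch (the imbalanced toy labels are made up for illustration; average='macro' and average='micro' are standard f1_score options):

from sklearn.metrics import f1_score

# Toy imbalanced labels: class 0 dominates, much like Disbursed here.
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0]

# Macro: F1 per class, then an unweighted mean over classes --
# the rare class counts as much as the common one.
print f1_score(y_true, y_pred, average='macro')   # 0.6875

# Micro: one global confusion matrix over all instances --
# dominated by the majority class (equals accuracy here).
print f1_score(y_true, y_pred, average='micro')   # 0.8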
Comments and discussion are welcome; please point out any mistakes.