# NOTE(review): this whole section is duplicated verbatim under the "训练"
# (training) heading further down; the later definitions are the ones the
# CV loop actually uses. Kept here (fixed) to preserve the file's structure.
model1 = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    loss_function='Logloss',
)
model2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,            # number of trees
    max_depth=6,                  # maximum tree depth
    min_child_weight=1,           # minimum sum of instance weight in a leaf
    gamma=0.2,                    # min loss reduction required to split
    subsample=0.8,                # sample 80% of rows per tree
    colsample_bytree=0.8,         # FIX: was misspelled 'colsample_btree' (silently ignored by XGBoost)
    objective='binary:logistic',
    scale_pos_weight=1,           # class-imbalance weight
    random_state=27,              # FIX: the ')' was swallowed by the trailing comment -> SyntaxError
)
model3 = lgb.LGBMClassifier(
    objective='binary',           # FIX: was 'regression' — this model is a binary classifier
    learning_rate=0.1,
    n_estimators=20,
    subsample=0.8,
    colsample_bytree=0.8,
    num_leaves=22,
    max_depth=5,
    min_child_samples=20,
    min_child_weight=0.00005,
    # NOTE(review): feature_fraction/bagging_fraction are LightGBM aliases of
    # colsample_bytree/subsample; passing both triggers a conflict warning.
    feature_fraction=0.6,
    bagging_fraction=0.7,
)
## Show per-cell run time (IPython notebook magic; commented out so this
## file is valid plain Python).
# %load_ext klab-autotime

## Imports (some are unused)
import os  # FIX: was fused with the next line as "import osfrom sklearn.ensemble"
import warnings
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier  # FIX: module/name were split across mashed lines
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

## Data-processing setup
# %matplotlib inline  (notebook magic, commented for plain-Python compatibility)
# FIX: the four statements below were fused onto one line with the magic.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Competition data directory (Kesci platform path).
path = '/home/kesci/data/competition_A/'
train_df = pd.read_csv(path + 'train_set.csv')
test_df = pd.read_csv(path + 'test_set.csv')
# One accumulator frame per model, plus the final blended submission.
submission1 = pd.read_csv(path + 'submission_example.csv')
submission2 = pd.read_csv(path + 'submission_example.csv')
submission3 = pd.read_csv(path + 'submission_example.csv')
submission = pd.read_csv(path + 'submission_example.csv')
print('Train Shape:{}\nTest Shape:{}'.format(train_df.shape, test_df.shape))
train_df.head()  # FIX: was fused onto the print(...) line
# Numeric features (age, weight, height, BMI, waist, blood pressure,
# cholesterol readings, income) — min-max scaled below.
num_columns = ['年龄', '体重', '身高', '体重指数', '腰围', '最高血压', '最低血压',
               '好胆固醇', '坏胆固醇', '总胆固醇', '收入']
# Binary indicator columns (already 0/1).
zero_to_one_columns = ['肥胖腰围', '血脂异常', 'PVD']
# Categorical string features — label-encoded below.
# FIX: this assignment was fused onto the end of the previous list literal.
str_columns = ['性别', '区域', '体育活动', '教育', '未婚',
               '护理来源', '视力不佳', '饮酒', '高血压', '家庭高血压', '糖尿病',
               '家族糖尿病', '家族肝炎', '慢性疲劳', 'ALF']
# Label-encode the categorical string columns.
# FIX: the original re-fit a fresh LabelEncoder on the test column, so the
# same category could map to different integer codes in train vs test.
# Fit once on the union of both splits and transform each with that mapping.
for col in tqdm(str_columns):
    lbl = LabelEncoder()
    lbl.fit(pd.concat([train_df[col], test_df[col]]).astype(str))
    train_df[col] = lbl.transform(train_df[col].astype(str))
    test_df[col] = lbl.transform(test_df[col].astype(str))
# Min-max scale the numeric columns.
# NOTE(review): scalers are still fit separately per split (as in the
# original); strictly one should fit on train and only transform test.
train_df[num_columns] = MinMaxScaler().fit_transform(train_df[num_columns])
test_df[num_columns] = MinMaxScaler().fit_transform(test_df[num_columns])
# Fill missing values with 0.
# FIX: the two fillna statements were fused onto one line.
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)
# Build the feature matrix / label vector ('肝炎' = hepatitis target, 'ID' = key).
all_columns = [c for c in train_df.columns if c not in ['肝炎', 'ID']]
train_x, train_y = train_df[all_columns].values, train_df['肝炎'].values
test_x = test_df[all_columns].values
# Zero the per-model accumulators before fold-averaging.
submission1['hepatitis'] = 0
submission2['hepatitis'] = 0
submission3['hepatitis'] = 0
kfold = StratifiedKFold(n_splits=5, shuffle=False)
## Training: define the three base classifiers used in the CV loop below.
model1 = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    loss_function='Logloss',
)
model2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,            # number of trees
    max_depth=6,                  # maximum tree depth
    min_child_weight=1,           # minimum sum of instance weight in a leaf
    gamma=0.2,                    # min loss reduction required to split
    subsample=0.8,                # sample 80% of rows per tree
    colsample_bytree=0.8,         # FIX: was misspelled 'colsample_btree' (silently ignored by XGBoost)
    objective='binary:logistic',
    scale_pos_weight=1,           # class-imbalance weight
    random_state=27,              # FIX: the ')' was swallowed by the trailing comment -> SyntaxError
)
model3 = lgb.LGBMClassifier(
    objective='binary',           # FIX: was 'regression' — this model is a binary classifier
    learning_rate=0.1,
    n_estimators=20,
    subsample=0.8,
    colsample_bytree=0.8,
    num_leaves=22,
    max_depth=5,
    min_child_samples=20,
    min_child_weight=0.00005,
    # NOTE(review): feature_fraction/bagging_fraction are LightGBM aliases of
    # colsample_bytree/subsample; passing both triggers a conflict warning.
    feature_fraction=0.6,
    bagging_fraction=0.7,
)
# Grid-search space for the LightGBM regressor below.
param_grid = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
# 5-fold CV: fit each model per fold and average its test predictions.
# FIX: the loop body had lost all indentation (notebook scrape); restored.
for train_idx, valid_idx in kfold.split(train_x, train_y):
    X_train, Y_train = train_x[train_idx], train_y[train_idx]
    X_valid, Y_valid = train_x[valid_idx], train_y[valid_idx]
    model1.fit(X_train, Y_train, eval_set=(X_valid, Y_valid), use_best_model=True)
    model2.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)],
               early_stopping_rounds=100, verbose=True)
    model3.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)],
               eval_metric='l1', early_stopping_rounds=5)
    Y_valid_pred_prob1 = model1.predict_proba(X_valid)
    Y_valid_pred_prob2 = model2.predict_proba(X_valid)
    # Accumulate the positive-class probability, averaged over 5 folds.
    submission1['hepatitis'] += model1.predict_proba(test_x)[:, 1] / 5
    submission2['hepatitis'] += model2.predict_proba(test_x)[:, 1] / 5
    # Grid search / hyper-parameter tuning.
    # NOTE(review): placed inside the fold loop — the "/ 5" fold-averaging on
    # the line below only makes sense there; confirm against the original
    # notebook. This rebinds model3, discarding the classifier fit above.
    estimator = lgb.LGBMRegressor(num_leaves=31)
    model3 = GridSearchCV(estimator, param_grid)
    model3.fit(X_train, Y_train)
    # FIX: original referenced undefined names 'submission5' and 'model'.
    submission3['hepatitis'] += model3.predict(test_x) / 5
    print('Best parameters found by grid search are:', model3.best_params_)
## Soft-vote the three models: average their probability columns.
submission['hepatitis'] = 0
blended = (submission1['hepatitis']
           + submission2['hepatitis']
           + submission3['hepatitis'])
submission['hepatitis'] = blended / 3
# Persist the scored submission file.
submission.to_csv('pingfen.csv', index=False)
# 提交 (submission)
# Adapted from: https://blog.csdn.net/qq_44574333/article/details/108964488
# (the sentence above was plain text, not a comment — a SyntaxError in plain Python)