机器学习-GridSearchCV自动调参,RF特征选择

  • 主要思想:通过GridSearchCV对随机森林进行超参数自动调优,再利用随机森林的特征重要性(feature_importances_)完成特征筛选
import numpy as np
import pandas as pd
from time import strftime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

def selectParam(self, clf, param, features=None):
    """Run a 5-fold GridSearchCV on the training data and return the best parameters.

    Parameters
    ----------
    clf : estimator
        Model whose hyper-parameters are being tuned.
    param : dict or list of dicts
        Parameter grid handed to ``GridSearchCV``.
    features : list, optional
        Column subset of ``self.x_train`` to train on.  When ``None``/empty,
        all features are used (the RF-tuning case); pass the RF-selected
        features when tuning a downstream model such as LR.

    Returns
    -------
    dict
        ``get_params()`` of the best estimator found by the grid search.
    """
    # NOTE(review): original signature lacked `self` although the body used
    # self.x_train and the call site is self.selectParam(...) — fixed.
    # The mutable default `features=[]` was replaced with None.
    if features is None:
        features = []

    grid_search = GridSearchCV(estimator=clf, param_grid=param,
                               n_jobs=4, scoring='roc_auc', cv=5)

    # Restrict the design matrix to the requested feature subset, if any.
    if len(features) > 0:
        x1 = np.asarray(self.x_train[features])
    else:
        x1 = np.asarray(self.x_train)
    y1 = np.asarray(self.y_train)

    # Fit on the training matrix and its class labels.
    grid_search.fit(x1, y1)

    # Return the full parameter set of the winning estimator.
    return grid_search.best_estimator_.get_params()


    def top_features(self):
        """Tune a RandomForest, refit with the best parameters, and return the
        features whose importance exceeds a fixed threshold.

        Steps:
          1. Grid-search RF hyper-parameters (``max_depth``, ``n_estimators``).
          2. Refit an RF on the training set using the best parameters.
          3. Rank features by ``feature_importances_`` and keep those above
             ``threshold``.

        Returns
        -------
        pandas.DataFrame
            Columns ``features`` / ``features_importance``, sorted by
            importance descending and filtered to importance > 0.005.
        """
        # Minimum feature importance retained by the filter below.
        threshold = 0.005

        clf_RF = RandomForestClassifier(random_state=10)
        # Only tree depth and tree count are tuned here; the options below are
        # candidates worth enabling on larger / more imbalanced data sets:
        #   "min_samples_split": [5, 10, 15, 20, 25]  (default 2; for big data)
        #   "min_samples_leaf":  [5, 10, 15, 20, 25]  (default 1; prunes leaves)
        #   "bootstrap": [True, False], "criterion": ["gini", "entropy"]
        #   "max_features": keep default for <~50 features, else "sqrt"/"log2"
        #   "class_weight": per-class weights for imbalanced labels
        param_grid = [{
            "max_depth": range(10, 50, 3),      # common useful range is 10-100
            "n_estimators": range(10, 50, 3),
        }]

        best_parameters = self.selectParam(clf_RF, param_grid)
        # random_state is re-supplied explicitly below; drop it from the dict
        # to avoid a duplicate keyword argument (default=None avoids a
        # KeyError if it is ever absent).
        best_parameters.pop('random_state', None)
        clf_RF_prior = RandomForestClassifier(random_state=10, **best_parameters)
        clf_RF_prior.fit(self.x_train, self.y_train)
        print("the best paramers of RF to choose features resulted in score : ",
              clf_RF_prior.score(self.x_test, self.y_test))

        # Build the importance table with a real float column: the original
        # pd.DataFrame([...]).T construction produced object-dtype columns,
        # which makes the sort and the > threshold comparison fragile.
        feature_imp = pd.DataFrame({
            'features': self.x_train.columns,
            'features_importance': clf_RF_prior.feature_importances_,
        })
        feature_imp = feature_imp.sort_values('features_importance',
                                              ascending=False)

        # Keep only the features above the importance threshold.
        return feature_imp[feature_imp['features_importance'] > threshold]

你可能感兴趣的:(模型训练)