# Machine learning workflow demo

# encoding = 'utf-8'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load 'bank.csv' (semicolon-delimited) into a DataFrame.
bank = pd.read_csv('bank.csv', sep=';')


# Keep the full list of column names, then partition the columns by dtype:
# numeric columns on one side, everything else (treated as categorical) on
# the other.
feature_name_list = bank.columns.tolist()
num_columns = bank.select_dtypes(include=np.number).columns
cat_columns = bank.select_dtypes(exclude=np.number).columns


from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per non-numeric column, kept in a dict keyed by column
# name so the fitted encoders stay available after the loop.
lbEncoder_dic = {col: LabelEncoder() for col in cat_columns}
for col, fitted_encoder in lbEncoder_dic.items():
    # Cast to str first, then replace the column by its encoded labels.
    bank[col] = fitted_encoder.fit_transform(bank[col].astype(str))


# Heatmap of the pairwise column correlations; annot=True writes each
# coefficient into its cell and center=0 anchors the colormap at zero.
sns.heatmap(bank.corr(), annot=True, center=0)
plt.show()

# Grid of pairwise plots across all columns of the DataFrame.
sns.pairplot(bank)
# The heatmap above is followed by show(); do the same here, otherwise the
# pairplot figure is never displayed when this runs as a plain script.
plt.show()


# Min-max scale every column except the target 'y' into [0, 1].
# NOTE(review): the min/max are computed on the full dataset, i.e. before the
# train/test split further down — confirm that this is acceptable here.
for col in feature_name_list:
    if col == 'y':
        continue
    # Vectorised conversion instead of a per-element lambda.
    column_values = bank[col].astype(float)
    min_value = column_values.min()
    max_value = column_values.max()
    value_range = max_value - min_value
    if value_range == 0:
        # A constant column carries no information and would divide by zero;
        # map it to 0.0 instead of raising.
        bank[col] = 0.0
    else:
        bank[col] = (column_values - min_value) / value_range


from sklearn.model_selection import train_test_split

# Split features / target into train and test parts with a fixed seed.
# - The DataFrame loaded above is named `bank` (there is no `data`).
# - `drop(columns=...)` replaces the positional axis argument, which newer
#   pandas releases no longer accept.
# - The target is selected as a 1-D Series rather than a one-column DataFrame
#   so that estimators receive y in the shape they expect.
X_train, X_test, y_train, y_test = train_test_split(
    bank.drop(columns='y'), bank['y'], random_state=10
)

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


# Base estimator used inside the hyperparameter search (20 trees per forest).
clf = RandomForestClassifier(n_estimators=20)

# An integer max_features cannot meaningfully exceed the number of input
# columns, so candidates above that count are dropped; if none remain, fall
# back to "all columns".
# NOTE(review): some scikit-learn versions reject values above the column
# count outright — confirm against the installed version.
n_features = X_train.shape[1]
max_features_candidates = [
    k for k in (5, 10, 15, 20, 25, 30, 35, 40, 45) if k <= n_features
] or [n_features]

param_dist = {
    "max_depth": [3, 5, 10, 15, 20, 25, 30, 35],
    "max_features": max_features_candidates,
    "min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Try 20 randomly sampled parameter combinations, each with 5-fold CV.
n_iter_search = 20
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=n_iter_search, cv=5
)
# np.ravel yields a 1-D target whether y_train is a Series or a one-column
# DataFrame.
random_search.fit(X_train, np.ravel(y_train))


# Print the selected hyperparameters; a bare expression shows nothing when
# the file is executed as a script.
print(random_search.best_params_)

# Build the final model from the parameters the search selected rather than
# from hard-coded values, which go stale because the search is not seeded and
# can pick a different combination on every run.
# n_estimators is not part of the search space and stays at the library
# default here, exactly as in the previous hard-coded constructor call.
best_randomForest_model = RandomForestClassifier(**random_search.best_params_)


# np.ravel yields a 1-D target whether y_train is a Series or a one-column
# DataFrame.
best_randomForest_model.fit(X_train, np.ravel(y_train))


from sklearn.metrics import classification_report


# Predict labels for the held-out test features.
pred_x_val = best_randomForest_model.predict(X_test)

# Build the text report comparing true and predicted labels, then print it.
report_text = classification_report(y_true=y_test, y_pred=pred_x_val)
print(report_text)

# You may also be interested in: machine learning, python, artificial intelligence