import pandas as pd
import matplotlib. pyplot as plt
# Load the credit-card default data set from a local Excel workbook.
default = pd.read_excel(
    r'C:\Users\Administrator\Desktop\default of credit card.xls'
)

# Equal aspect ratio so the pie is drawn as a circle rather than an ellipse.
plt.axes(aspect='equal')
# Select a font with CJK glyphs for the Chinese labels and use the ASCII
# hyphen for minus signs.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})

# Frequency of each value in the target column `y`.
counts = default['y'].value_counts()
plt.pie(
    x=counts,
    # Map the codes found in `y` to display labels (0 -> "no default", 1 -> "default").
    labels=pd.Series(counts.index).map({0: '不违约', 1: '违约'}),
    autopct='%.1f%%',
)
plt.show()
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
# Feature matrix: every column except `ID` and the target `y`.
X = default.drop(columns=['ID', 'y'])
y = default['y']

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1234,
)

# Baseline AdaBoost classifier with the library's default settings.
AdaBoost1 = ensemble.AdaBoostClassifier()
AdaBoost1.fit(X_train, y_train)
pred1 = AdaBoost1.predict(X_test)

# Report accuracy and the per-class classification report on the test split.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, pred1))
print('模型的评估报告:\n', metrics.classification_report(y_test, pred1))
# Column 1 of predict_proba: the probability assigned to the second class.
y_score = AdaBoost1.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, the curve outline, then the diagonal reference line.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', linewidth=1)
plt.plot((0, 1), (0, 1), color='red', linestyle='--')
# Annotate the plot with the area under the curve.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
# Feature importances of the baseline model, labelled with the feature names.
importance = pd.Series(AdaBoost1.feature_importances_, index=X.columns)
# Horizontal bar chart of the importances in ascending order.
importance.sort_values().plot.barh()
plt.show()

# Keep only the features whose importance exceeds 0.02.
predictors = importance[importance > 0.02].index.tolist()
# Bare expression: echoes the list in an interactive session; no effect in a script.
predictors
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate depths for the decision tree that AdaBoost uses as its base learner.
max_depth = [3, 4, 5, 6]
params1 = dict(base_estimator__max_depth=max_depth)

# NOTE(review): newer scikit-learn releases rename AdaBoost's `base_estimator`
# argument to `estimator`; confirm against the installed version before upgrading.
base_model = GridSearchCV(
    estimator=ensemble.AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(),
    ),
    param_grid=params1,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    verbose=1,
)
# Search using only the selected predictor columns.
base_model.fit(X_train[predictors], y_train)
# Bare expression: shows the best depth and its CV score in an interactive session.
base_model.best_params_, base_model.best_score_
from sklearn.model_selection import GridSearchCV

# Grid over the number of boosting rounds and the learning rate.
n_estimators = [100, 200, 300]
learning_rate = [0.01, 0.05, 0.1, 0.2]
params2 = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# The base tree depth is fixed at 3 for this search.
adaboost = GridSearchCV(
    estimator=ensemble.AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=3),
    ),
    param_grid=params2,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    verbose=1,
)
adaboost.fit(X_train[predictors], y_train)
# Bare expression: shows the best parameters and CV score in an interactive session.
adaboost.best_params_, adaboost.best_score_
# AdaBoost with explicitly chosen hyperparameters: depth-3 base trees,
# 300 boosting rounds and a learning rate of 0.01.
AdaBoost2 = ensemble.AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=300,
    learning_rate=0.01,
)
# Train and predict on the selected predictor columns only.
AdaBoost2.fit(X_train[predictors], y_train)
pred2 = AdaBoost2.predict(X_test[predictors])

# Report accuracy and the per-class classification report on the test split.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, pred2))
print('模型的评估报告:\n', metrics.classification_report(y_test, pred2))
# Column 1 of predict_proba: the probability assigned to the second class.
y_score = AdaBoost2.predict_proba(X_test[predictors])[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, the curve outline, then the diagonal reference line.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', linewidth=1)
plt.plot((0, 1), (0, 1), color='red', linestyle='--')
# Annotate the plot with the area under the curve.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
# Hyperparameter grid for the gradient-boosting classifier.
learning_rate = [0.01, 0.05, 0.1, 0.2]
n_estimators = [100, 300, 500]
max_depth = [3, 4, 5, 6]
params = dict(
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    max_depth=max_depth,
)

# 5-fold cross-validated search scored by ROC AUC, using 4 parallel jobs.
gbdt_grid = GridSearchCV(
    estimator=ensemble.GradientBoostingClassifier(),
    param_grid=params,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    verbose=1,
)
gbdt_grid.fit(X_train[predictors], y_train)
# Bare expression: shows the best parameters and CV score in an interactive session.
gbdt_grid.best_params_, gbdt_grid.best_score_
# Predict on the test split through the fitted grid-search object.
pred = gbdt_grid.predict(X_test[predictors])

# Report accuracy and the per-class classification report on the test split.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, pred))
print('模型的评估报告:\n', metrics.classification_report(y_test, pred))
# Column 1 of predict_proba: the probability assigned to the second class.
y_score = gbdt_grid.predict_proba(X_test[predictors])[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, the curve outline, then the diagonal reference line.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', linewidth=1)
plt.plot((0, 1), (0, 1), color='red', linestyle='--')
# Annotate the plot with the area under the curve.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Load the credit-card transaction data from the working directory.
creditcard = pd.read_csv(r'./creditcard.csv')

# Equal aspect ratio so the pie is drawn as a circle rather than an ellipse.
plt.axes(aspect='equal')
# Frequency of each value in the `Class` column.
counts = creditcard['Class'].value_counts()
plt.pie(
    x=counts,
    # Map the codes found in `Class` to display labels (0 -> "normal", 1 -> "fraud").
    labels=pd.Series(counts.index).map({0: '正常', 1: '欺诈'}),
    autopct='%.2f%%',
)
plt.show()
from sklearn import model_selection

# Feature matrix: every column except `Time` and the target `Class`.
X = creditcard.drop(['Time', 'Class'], axis=1)
y = creditcard.Class
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1234
)

from imblearn.over_sampling import SMOTE

# Oversample the training split only, so the test split keeps its original
# class distribution.
over_samples = SMOTE(random_state=1234)
# `fit_resample` replaces `fit_sample`, which imbalanced-learn deprecated and
# later removed.
over_samples_X, over_samples_y = over_samples.fit_resample(X_train, y_train)

# Class proportions before and after resampling.
print(y_train.value_counts() / len(y_train))
print(pd.Series(over_samples_y).value_counts() / len(over_samples_y))
# Recorded output of the two print calls above:
# 0 0.998239
# 1 0.001761
# Name: Class, dtype: float64
# 1 0.5
# 0 0.5
# Name: Class, dtype: float64
import xgboost
import numpy as np

# NOTE: this rebinds the name `xgboost` from the module to the classifier
# instance. The binding is kept because later statements call the fitted model
# through this name; use `from xgboost import XGBClassifier` to build further
# classifiers.
xgboost = xgboost.XGBClassifier()
xgboost.fit(over_samples_X, over_samples_y)

# Predict on the DataFrame itself. Wrapping it in np.array() strips the column
# names, and XGBoost then rejects the input with "feature_names mismatch"
# because the model was fitted on named columns.
resample_pred = xgboost.predict(X_test)

# Report accuracy and the per-class classification report on the test split.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, resample_pred))
print('模型的评估报告:\n', metrics.classification_report(y_test, resample_pred))
# Recorded output from an earlier run of the cell above (the predict call failed):
# C:\Users\12048\Anaconda3\lib\site-packages\dask\dataframe\utils.py:13: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
# import pandas.util.testing as tm
# ---------------------------------------------------------------------------
# ValueError Traceback (most recent call last)
# in ()
# 7 xgboost.fit(over_samples_X,over_samples_y)
# 8 # 将模型运用到测试数据集中
# ----> 9 resample_pred = xgboost.predict(np.array(X_test))
# 10
# 11 # 返回模型的预测效果
# ~\Anaconda3\lib\site-packages\xgboost\sklearn.py in predict(self, data, output_margin, ntree_limit, validate_features, base_margin)
# 886 output_margin=output_margin,
# 887 ntree_limit=ntree_limit,
# --> 888 validate_features=validate_features)
# 889 if output_margin:
# 890 # If output_margin is active, simply return the scores
# ~\Anaconda3\lib\site-packages\xgboost\core.py in predict(self, data, output_margin, ntree_limit, pred_leaf, pred_contribs, approx_contribs, pred_interactions, validate_features, training)
# 1569
# 1570 if validate_features:
# -> 1571 self._validate_features(data)
# 1572
# 1573 length = c_bst_ulong()
# ~\Anaconda3\lib\site-packages\xgboost\core.py in _validate_features(self, data)
# 2129
# 2130 raise ValueError(msg.format(self.feature_names,
# -> 2131 data.feature_names))
# 2132
# 2133 def get_split_value_histogram(self, feature, fmap='', bins=None,
# ValueError: feature_names mismatch: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28']
# expected V2, V12, V18, V16, V19, V22, V15, V7, Amount, V21, V5, V3, V6, V14, V13, V8, V20, V26, V10, V25, V11, V9, V17, V27, V1, V4, V24, V23, V28 in input data
# training data did not have the following fields: f28, f14, f24, f22, f19, f21, f4, f18, f1, f3, f15, f5, f10, f25, f11, f16, f2, f7, f0, f27, f6, f20, f26, f17, f12, f9, f8, f13, f23
# Score the DataFrame directly. Wrapping it in np.array() strips the column
# names, which XGBoost rejects with "feature_names mismatch" when the model was
# fitted on named columns. Column 1 is the probability of the second class.
y_score = xgboost.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, the curve outline, then the diagonal reference line.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', lw=1)
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
# Annotate the plot with the area under the curve.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
# Import the class directly: the name `xgboost` was rebound to a fitted
# classifier earlier in this file, so `xgboost.XGBClassifier` raises
# AttributeError at this point.
from xgboost import XGBClassifier

# Fit on the original (unbalanced) training split for comparison.
xgboost2 = XGBClassifier()
xgboost2.fit(X_train, y_train)
pred2 = xgboost2.predict(X_test)
# Bare expression: shows predicted-vs-actual counts in an interactive session.
pd.crosstab(pred2, y_test)
# Recorded output from an earlier run of the cell above (the constructor call failed):
# ---------------------------------------------------------------------------
# AttributeError Traceback (most recent call last)
# in ()
# 1 # 构建XGBoost分类器
# ----> 2 xgboost2 = xgboost.XGBClassifier()
# 3 # 使用非平衡的训练数据集拟合模型
# 4 xgboost2.fit(X_train,y_train)
# 5 # 基于拟合的模型对测试数据集进行预测
# AttributeError: 'XGBClassifier' object has no attribute 'XGBClassifier'
# Report accuracy and the per-class classification report on the test split.
print('模型的准确率为:\n', metrics.accuracy_score(y_test, pred2))
print('模型的评估报告:\n', metrics.classification_report(y_test, pred2))

# Column 1 of predict_proba: the probability assigned to the second class.
y_score = xgboost2.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Shaded area under the curve, the curve outline, then the diagonal reference line.
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
plt.plot(fpr, tpr, color='black', linewidth=1)
plt.plot((0, 1), (0, 1), color='red', linestyle='--')
# Annotate the plot with the area under the curve.
plt.text(0.5, 0.3, 'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()