# -*- coding: utf-8 -*-
"""
Created on Mon Aug 6 20:37:19 2018
@author: wangxihe
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
os.chdir(r'E:\spyderwork\wxh\数据科学\二分类问题')
# 60 attribute columns A0..A59 plus the class label Y
columns = ['A%d' % i for i in range(60)] + ['Y']
sonar=pd.read_csv('sonar.all-data.csv',names=columns,header=None)
sonar.shape
sonar.dtypes
sonar['Y'].value_counts()  # classes are roughly balanced
# Recode the label: R (rock) = 0, M (mine) = 1
sonar['Y'].replace({'R': 0, 'M': 1}, inplace=True)
# Class distribution
sonar['Y'].value_counts().plot(kind='bar')
#%% Traditional statistical approach
# All predictors are continuous, so screen them with two-sample t-tests
columned = []
X = sonar.copy()
for ct in X.columns:
    if ct != 'Y':
        TT0 = X[X['Y'] == 0][ct]
        TT1 = X[X['Y'] == 1][ct]
        # Levene's test for homogeneity of variance
        leveneTest = stats.levene(TT0, TT1, center='median')
        # print('w-value=%6.4f, p-value=%6.4f' % leveneTest)
        _, fp_value = leveneTest
        # Pool the variances only when Levene's test does not reject equality
        Flag = fp_value >= 0.05
        _, p_value = stats.ttest_ind(TT0, TT1, equal_var=Flag)
        if p_value < 0.05:
            columned.append(ct)
            print('p-value=%6.4f' % p_value)
len(columned)  # 34 variables are kept
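#%%
# Optional sketch (not part of the original analysis): collect the t-test
# p-values into a Series to see which variables separate the classes most
# strongly; the logic mirrors the screening loop above.
pvals = {}
for ct in [c for c in X.columns if c != 'Y']:
    g0 = X[X['Y'] == 0][ct]
    g1 = X[X['Y'] == 1][ct]
    _, lp = stats.levene(g0, g1, center='median')
    _, tp = stats.ttest_ind(g0, g1, equal_var=(lp >= 0.05))
    pvals[ct] = tp
print(pd.Series(pvals).sort_values().head(10))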
#%% Collinearity screening via variance inflation factors
def vif(df, col_i):
    """Regress col_i on the remaining columns; VIF = 1 / (1 - R^2)."""
    cols = list(df.columns)
    cols.remove(col_i)
    formula = col_i + ' ~ ' + ' + '.join(cols)
    r2 = ols(formula, df).fit().rsquared
    return 1. / (1. - r2)
#%%
exog = X[columned].copy()
for i in exog.columns:
    print(i, '\t', vif(df=exog, col_i=i))
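#%%
# The drops below were chosen by hand from the printout above. An automated
# alternative (a sketch; the VIF > 10 cutoff is a common rule of thumb, not
# a choice made in this script) would drop the worst offender iteratively:
def drop_high_vif(df, thresh=10.0):
    df = df.copy()
    while True:
        vifs = {c: vif(df, c) for c in df.columns}
        worst = max(vifs, key=vifs.get)
        if vifs[worst] <= thresh:
            return df
        df.drop([worst], axis=1, inplace=True)
# drop_high_vif is not called, so the hand-picked drops below stay in effect.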
# Drop the high-VIF variables identified from the printout above
exog.drop(['A19', 'A45', 'A35', 'A10', 'A47'], axis=1, inplace=True)
#%% Forward selection, minimizing AIC
def forward_select(data, response):
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = float('inf'), float('inf')
    while remaining:
        aic_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {}".format(
                response, ' + '.join(selected + [candidate]))
            aic = smf.glm(
                formula=formula, data=data,
                family=sm.families.Binomial()  # logit is the default link
            ).fit().aic
            aic_with_candidates.append((aic, candidate))
        aic_with_candidates.sort(reverse=True)
        best_new_score, best_candidate = aic_with_candidates.pop()
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print('aic is {}, continuing!'.format(current_score))
        else:
            print('forward selection over!')
            break
    formula = "{} ~ {}".format(response, ' + '.join(selected))
    print('final formula is {}'.format(formula))
    model = smf.glm(
        formula=formula, data=data,
        family=sm.families.Binomial()
    ).fit()
    return model
#%%
data_select = pd.concat([exog,X['Y']],axis=1)
lg_m0 = forward_select(data=data_select, response='Y')
lg_m0.summary()
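#%%
# For reference, AIC = 2k - 2*ln(L), where k is the number of estimated
# parameters and L the maximized likelihood. A quick consistency check
# (a sketch) against the fitted GLM:
k = len(lg_m0.params)           # parameters, intercept included
print(2 * k - 2 * lg_m0.llf)    # should match the stored value below
print(lg_m0.aic)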
train=X.sample(frac=0.8, replace=False,random_state=7).copy()
test=X[~X.index.isin(train.index)].copy()
#'Y ~ A36+A44+A20+A51+A7+A48+A49+A53+A42+A13+A0+A50+A34+A2+A3+A8+A11'
formula='Y ~ A11+A48+A51+A36+A44+A49+A13+A50+A0+A7+A8+A53+A2+A3+A21'
lg = smf.glm(formula=formula, data=train, family=sm.families.Binomial()).fit()
lg.summary()
train['proba']=lg.predict(train)
test['proba']=lg.predict(test)
train['prediction'] = (train['proba'] > 0.3).astype('int')
test['prediction'] = (test['proba'] > 0.3).astype('int')
import sklearn.metrics as metrics
fpr_test, tpr_test, th_test = metrics.roc_curve(test.Y, test.proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(train.Y, train.proba)
plt.figure(figsize=[10, 10])
plt.plot(fpr_test, tpr_test, 'b--')
plt.plot(fpr_train, tpr_train, 'r-')
plt.title('ROC curve')
plt.show()
print('testAUC = %.4f' %metrics.auc(fpr_test, tpr_test))
print('trainAUC = %.4f' %metrics.auc(fpr_train, tpr_train))
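#%%
# The 0.3 cutoff above was picked by hand. A standard alternative (a sketch,
# not used in the original): take the threshold that maximizes Youden's
# J = TPR - FPR on the training ROC.
j = tpr_train - fpr_train
print('Youden-optimal threshold on train: %.3f' % th_train[j.argmax()])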
#%% The gap between train and test AUC suggests overfitting; drop A53, A2 and A3
formula='Y ~ A11+A48+A51+A36+A44+A49+A13+A50+A0+A7+A8+A21'
lg = smf.glm(formula=formula, data=train, family=sm.families.Binomial()).fit()
lg.summary()
#testAUC = 0.8750
#trainAUC = 0.9415
train['proba']=lg.predict(train)
test['proba']=lg.predict(test)
train['prediction'] = (train['proba'] > 0.3).astype('int')
test['prediction'] = (test['proba'] > 0.3).astype('int')
import sklearn.metrics as metrics
fpr_test, tpr_test, th_test = metrics.roc_curve(test.Y, test.proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(train.Y, train.proba)
plt.figure(figsize=[10, 10])
plt.plot(fpr_test, tpr_test, 'b--')
plt.plot(fpr_train, tpr_train, 'r-')
plt.title('ROC curve')
plt.show()
print('testAUC = %.4f' %metrics.auc(fpr_test, tpr_test))
print('trainAUC = %.4f' %metrics.auc(fpr_train, tpr_train))
# All remaining terms are significant
#%% Skewness of the selected predictors
skewdata = train[['A11', 'A48', 'A51', 'A36', 'A44', 'A49', 'A13', 'A50', 'A0', 'A7', 'A8', 'A21']].copy()
skew_var_x = {}
for i in skewdata:
    skew_var_x[i] = abs(skewdata[i].skew())
skew = pd.Series(skew_var_x).sort_values(ascending=False)
# Histograms of the six most skewed variables
for sk in skew[0:6].index:
    skewdata[sk].hist(bins=30)
    plt.xlabel(sk)
    plt.show()
#%% Log-transform the right-skewed variables A50, A0, A51 and A49 in train
skewdata['A50_ln'] = np.log(skewdata['A50'])
skewdata['A0_ln'] = np.log(skewdata['A0'])
skewdata['A51_ln'] = np.log(skewdata['A51'])
skewdata['A49_ln'] = np.log(skewdata['A49'])
skewdata.drop(['A50', 'A0', 'A51', 'A49'], axis=1, inplace=True)
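# Sanity check (a sketch): the log transform should pull the skewness down.
for c in ['A50_ln', 'A0_ln', 'A51_ln', 'A49_ln']:
    print(c, '%.3f' % skewdata[c].skew())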
#%% The histograms all show right skew; apply the same transform to the original dataset
sonar['A50_ln']=np.log(sonar['A50'])
sonar['A0_ln']=np.log(sonar['A0'])
sonar['A51_ln']=np.log(sonar['A51'])
sonar['A49_ln']=np.log(sonar['A49'])
sonar.drop(['A50','A0','A51','A49'],axis=1,inplace=True)
X=sonar.copy()
train=X.sample(frac=0.8, replace=False,random_state=7).copy()
test=X[~X.index.isin(train.index)].copy()
# A49_ln turned out to be problematic, so it is left out of the model below
#%% Refit on the transformed data
# A51_ln replaces A51
formula='Y ~ A11+A48+A51_ln+A36+A44+A13+A7+A8+A21'
lg = smf.glm(formula=formula, data=train, family=sm.families.Binomial()).fit()
lg.summary()
train['proba']=lg.predict(train)
test['proba']=lg.predict(test)
train['prediction'] = (train['proba'] > 0.4).astype('int')
test['prediction'] = (test['proba'] > 0.4).astype('int')
import sklearn.metrics as metrics
fpr_test, tpr_test, th_test = metrics.roc_curve(test.Y, test.proba)
fpr_train, tpr_train, th_train = metrics.roc_curve(train.Y, train.proba)
plt.figure(figsize=[10, 10])
plt.plot(fpr_test, tpr_test, 'b--')
plt.plot(fpr_train, tpr_train, 'r-')
plt.title('ROC curve')
plt.show()
print('testAUC = %.4f' %metrics.auc(fpr_test, tpr_test))
print('trainAUC = %.4f' %metrics.auc(fpr_train, tpr_train))
#%% Machine-learning approach
#%% Baseline settings for algorithm evaluation
n_folds = 10
seed = 7
scoring = 'accuracy'
#%%
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import sklearn.metrics as metrics
sonar=pd.read_csv('sonar.all-data.csv',names=columns,header=None)
sonar['Y'].replace({'R': 0, 'M': 1}, inplace=True)
X=sonar.iloc[:,0:60].copy()
X.head()
Y=sonar['Y'].copy()
seed = 7
train,test,train_y,test_y=train_test_split(X,Y,test_size=0.2,random_state=seed)
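# With only ~208 rows, a stratified split keeps the R/M ratio comparable
# across train and test. An alternative worth considering (not used here):
# train, test, train_y, test_y = train_test_split(
#     X, Y, test_size=0.2, random_state=seed, stratify=Y)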
#%% Baseline models
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['NB'] = GaussianNB()
models['SVM'] = SVC()
results = []
# shuffle must be enabled when random_state is set (scikit-learn >= 0.24)
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
for key in models:
    score = cross_val_score(models[key], train, train_y, scoring=scoring, cv=kfold)
    results.append(score)
    print('model:%s, mean:%f, std:%f' % (key, score.mean(), score.std()))
#%%
fig=plt.figure(figsize=(10,10))
fig.suptitle('Algorithm Comparison')
ax=fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(models.keys())
plt.show()
#%%
#model:LR, mean:0.782721, std:0.093796
#model:LDA, mean:0.746324, std:0.117854
#model:KNN, mean:0.808088, std:0.067507
#model:CART, mean:0.723529, std:0.109480
#model:NB, mean:0.648897, std:0.141868
#model:SVM, mean:0.608824, std:0.118656
#%%
pipelines={}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression())])
pipelines['ScalerLDA'] = Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])
pipelines['ScalerNB'] = Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVC())])
results = []
for key in pipelines:
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipelines[key], train, train_y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))
# Compare the scaled algorithms with a box plot
fig = plt.figure(figsize=(10, 10))
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(pipelines.keys())  # results belong to the pipelines, not the raw models
plt.show()
#%%
#ScalerLR : 0.734191 (0.095885)
#ScalerLDA : 0.746324 (0.117854)
#ScalerKNN : 0.825735 (0.054511)
#ScalerCART : 0.705147 (0.112992)
#ScalerNB : 0.648897 (0.141868)
#ScalerSVM : 0.836397 (0.088697)
#%%
# Fit the scaler on the training data only and reuse it for the test set;
# fitting a second scaler on the test set would leak test-set statistics.
scaler = StandardScaler().fit(train)
train_scaler = scaler.transform(train)
test_scaler = scaler.transform(test)
#%% Tuning - KNN
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
model = KNeighborsClassifier()
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=train_scaler, y=train_y)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
#%% Tuning - SVM
param_grid = {}
param_grid['C'] = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
param_grid['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
model = SVC()
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=train_scaler, y=train_y)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
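# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so the tuned model is available directly:
best_svc = grid_result.best_estimator_
print(best_svc)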
#%% Ensemble methods
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostClassifier())])
ensembles['ScaledGBM'] = Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingClassifier())])
ensembles['ScaledRF'] = Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestClassifier())])
ensembles['ScaledET'] = Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesClassifier())])
results = []
for key in ensembles:
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], train, train_y, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
#%% Ensembles - box plot
fig = plt.figure(figsize=(10, 10))
fig.suptitle('Ensemble Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(ensembles.keys())
plt.show()
#%% Tuning - GBM
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=train_scaler, y=train_y)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
model = GradientBoostingClassifier(n_estimators=300)
model.fit(X=train_scaler, y=train_y)
predictions = model.predict(test_scaler)
prob_test = model.predict_proba(test_scaler)[:, 1]
prob_train = model.predict_proba(train_scaler)[:, 1]
train_prediction = model.predict(train_scaler)
# sum(train_prediction == train_y) / len(train_y) -> 100% training accuracy
fpr_test, tpr_test, th_test = metrics.roc_curve(test_y, prob_test)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_y, prob_train)
plt.figure(figsize=[6,6])
plt.plot(fpr_test,tpr_test,'--b')
plt.plot(fpr_train,tpr_train,'-r')
plt.title('ROC curve')
print('testAUC = %6.4f' %metrics.auc(fpr_test, tpr_test))
print('trainAUC = %6.4f' %metrics.auc(fpr_train, tpr_train))
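#%%
# Tree ensembles expose per-feature importances; a quick look at the main
# drivers of the tuned GBM (a sketch, mapping scores back to the columns of X):
imp = pd.Series(model.feature_importances_, index=X.columns)
print(imp.sort_values(ascending=False).head(10))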
#%% Finalize the model
model = SVC(C=1.5, kernel='rbf')
model.fit(X=train_scaler, y=train_y)
# Evaluate on the held-out test set (test_scaler already comes from the
# train-fitted scaler above)
predictions = model.predict(test_scaler)
train_prediction = model.predict(train_scaler)
print(accuracy_score(train_y, train_prediction))
print(accuracy_score(test_y, predictions))
print(confusion_matrix(test_y, predictions))
print(classification_report(test_y, predictions))
#%% ROC curve (built from hard 0/1 labels, so it has only three points)
fpr_test, tpr_test, th_test = metrics.roc_curve(test_y, predictions)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_y, train_prediction)
plt.figure(figsize=[6,6])
plt.plot(fpr_test,tpr_test,'--b')
plt.plot(fpr_train,tpr_train,'-r')
plt.title('ROC curve')
print('testAUC = %6.4f' %metrics.auc(fpr_test, tpr_test))
print('trainAUC = %6.4f' %metrics.auc(fpr_train, tpr_train))
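#%%
# SVC was fit without probability=True, so the ROC above is built from hard
# 0/1 labels. A smoother curve can be ranked by the signed margin instead
# (a sketch using decision_function):
score_test = model.decision_function(test_scaler)
score_train = model.decision_function(train_scaler)
fpr_test, tpr_test, _ = metrics.roc_curve(test_y, score_test)
fpr_train, tpr_train, _ = metrics.roc_curve(train_y, score_train)
print('testAUC = %6.4f' % metrics.auc(fpr_test, tpr_test))
print('trainAUC = %6.4f' % metrics.auc(fpr_train, tpr_train))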
#%% The models above do well on the training set but overfit. Rebuild using
# the variables chosen by the traditional selection procedure.
#%%
sonar=pd.read_csv('sonar.all-data.csv',names=columns,header=None)
sonar['Y'].replace({'R': 0, 'M': 1}, inplace=True)
sonar['A51_ln']=np.log(sonar['A51'])
X=sonar[['A11','A48','A51_ln','A36','A44','A13','A7','A8','A21']]
X.head()
Y=sonar['Y'].copy()
seed = 7
train,test,train_y,test_y=train_test_split(X,Y,test_size=0.2,random_state=seed)
#%% Tuning - GBM on the selected variables
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
# Re-fit the scaler on the new, smaller feature set; the *_scaler arrays
# left over from the previous section still cover all 60 variables.
scaler = StandardScaler().fit(train)
train_scaler = scaler.transform(train)
test_scaler = scaler.transform(test)
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=train_scaler, y=train_y)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
model = GradientBoostingClassifier(n_estimators=33)
model.fit(X=train_scaler, y=train_y)
predictions = model.predict(test_scaler)
prob_test=model.predict_proba(test_scaler)[:,1]
prob_train= model.predict_proba(train_scaler)[:,1]
train_prediction=model.predict(train_scaler)
# sum(train_prediction == train_y) / len(train_y) -> training accuracy
fpr_test, tpr_test, th_test = metrics.roc_curve(test_y, prob_test)
fpr_train, tpr_train, th_train = metrics.roc_curve(train_y, prob_train)
plt.figure(figsize=[6,6])
plt.plot(fpr_test,tpr_test,'--b')
plt.plot(fpr_train,tpr_train,'-r')
plt.title('ROC curve')
print('testAUC = %6.4f' %metrics.auc(fpr_test, tpr_test))
print('trainAUC = %6.4f' %metrics.auc(fpr_train, tpr_train))