"""
Created on Sun Jun 21 22:13:17 2020
@author: xn_katherine
"""
'''
1. Analyze the project requirements and define the concrete problem
2. Acquire the data
3. Feature preprocessing and feature selection (normalization, discretization,
   dummy encoding, missing-value handling, removing collinearity, etc.; this uses
   feature-relevance techniques such as correlation coefficients, chi-square tests,
   mean mutual information, conditional entropy, posterior probability,
   logistic-regression weights, and so on)
4. Train and tune the model
5. Model diagnostics (overfitting/underfitting, error analysis)
6. Model ensembling
7. Deploy to production
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
file = open("D:t.txt",encoding='utf-8')
cityType = pd.read_excel("D:/city.xlsx",sheet_name="Sheet1")
# Inspect the first line to work out the delimiter (fields are quoted and tab-separated) and the field types
a = file.readline()
a.split()
a.strip().strip('"')
aa = a.strip().strip('"').split('"\t"')
aa[0]
type(aa[0])
aa[1]
eval(aa[1])
type(aa[1])
type(eval(aa[1]))
aa[2]
eval(aa[2])
aa[7]
eval(aa[7])
type(eval(aa[7]))
aa[43]
eval(aa[43])
type(eval(aa[43]))
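'''
Note: eval() executes arbitrary code, so it is risky on untrusted input. A
minimal, equally effective sketch for parsing literal numeric fields uses
ast.literal_eval, which only accepts Python literals:
'''
import ast
ast.literal_eval(aa[1])         # same numeric value as eval(aa[1]), without code execution
type(ast.literal_eval(aa[1]))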
file.close()
# Re-open the file: it was closed above, and readlines() must start from the
# header line again so that data0[0] holds the column names
file = open("D:/t.txt", encoding='utf-8')
data0 = []
for line in file.readlines():
    lineArr = line.strip().strip('"').split('"\t"')
    data0.append(lineArr)
file.close()
data0[0:2]
len(data0)
data0[1][0]
data0[1][2]
data0[1][3]
data0[1][4]
type(data0[1][4])
data0[1][5]
data0[1][6]
data0[1][19]
data0[1][42]
data0[1][43]
type(data0[1][18])
# First pass: keep the text columns (2-6, 19, 42) as strings, replace empty
# numeric cells with the sentinel 1.0e-10, and eval() the remaining numerics
for i in range(1, 573051, 1):
    for j in range(0, 44, 1):
        if (j >= 2 and j <= 6) or j == 19 or j == 42:
            pass                          # text column: keep as-is
        elif len(data0[i][j]) == 0:
            data0[i][j] = 1.0e-10
            print(j)
        else:
            data0[i][j] = eval(data0[i][j])
'''Columns that contain empty strings (set to 1.0e-10 above):
[17] 'ProductRejectRate'
[41] 'lastcallduration'
'''
data0[0][17]
# Count how many values in column 17 are still strings vs. already floats
a = 0
for i in range(1, 573051, 1):
    if isinstance(data0[i][17], str):
        a += 1
a
a = 0
for i in range(1, 573051, 1):
    if isinstance(data0[i][17], float):
        a += 1
a
# Final decision: mark missing values with np.nan (what pandas recognizes as
# missing) rather than the 1.0e-10 sentinel. Guard with isinstance, because
# the first pass already converted the numeric cells to floats
for i in range(1, 573051, 1):
    for j in range(0, 44, 1):
        if isinstance(data0[i][j], str):
            if len(data0[i][j]) == 0:
                data0[i][j] = np.nan
            elif not ((j >= 2 and j <= 6) or j == 19 or j == 42):
                data0[i][j] = eval(data0[i][j])
        elif data0[i][j] == 1.0e-10:
            data0[i][j] = np.nan
data0[0:2]
data = pd.DataFrame(data0[1:],columns=data0[0])
data.head(2)
data.shape
data.describe()
data.columns
data.lastcalltype.describe()
data['lastcalltype'].groupby(data['lastcalltype']).count()
sum(data['lastcalltype']=='')
data.loc[data['lastcalltype']=='','lastcalltype']
data.where(data['lastcalltype']=='')
for col in ['SrCustomerId','CreationTime','provinceName','cityName','paymentType','orderMakerCenter','lastcalltype']:
    data.loc[data[col]=='', col] = np.nan
data.loc[[0,2],'ShippingStatus']
sum(data.ShippingStatus.isin([1,2]))
data.loc[data.ShippingStatus.isin([1,2]),'ShippingStatus']
sum(data['ShippingStatus'] > 2)
data1 = data.loc[data['ShippingStatus'] > 2].copy()   # .copy() so later column assignments do not hit SettingWithCopyWarning
data1.head(2)
data1.shape
data1.describe()
data1[0:2][['ShippingStatus']]
'''test:
a = pd.DataFrame([list('abcd'), list('efgh'), list('ijkl'), list('mnop')],
                 columns=['one', 'two', 'first', 'second'])
a.loc[0, 'one'] = 0
a.loc[a.one.isin(['e']), 'two'] = 1
data1.loc[0,'ShippingStatus'] = 1 # error
data1.loc[0,'y'] = 1 # error
data1.loc[data1.ShippingStatus.isin([3,6,7]),'ShippingStatus'] = 1 # error; use the approach below instead
data1.loc[data1.ShippingStatus.isin([4,5]),'ShippingStatus'] = 0 # error; use the approach below instead
'''
data1.loc[data1.ShippingStatus.isin([3,6,7]),'y'] = 1
data1.loc[data1.ShippingStatus.isin([4,5]),'y'] = 0
data1['y'].groupby(data1['y']).count()
data1.head(2)
data1.shape
data1.describe()
cityType.head()
data2 = pd.merge(data1,cityType,left_on='cityName',right_on='名称.1',how='left')
data2.shape
data2.head(2)
sum(pd.isnull(data2['城市类型']))
a = data2.loc[pd.isnull(data2['城市类型']),'cityName']
a.to_excel("D:/citydefault.xlsx")
data3 = data2.copy()
sum(pd.isnull(data3['城市类型']))
data3.loc[pd.isnull(data3['城市类型']),'城市类型'] = '五线'   # unmatched cities default to tier 5 ('五线')
data3.shape
data3.head(2)
'''
Reference:
A common question about data standardization: what is the difference between
standardizing the whole dataset before splitting into train/test sets, versus
fitting the scaler on the training set only and applying its rule to the test
set (Python implementation)
https://blog.csdn.net/qq_40304090/article/details/90597892
'''
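'''
A minimal sketch of the pattern recommended in the article above: fit the
scaler on the training split only, then reuse the learned statistics on the
test split (the _X_train/_X_test toy arrays are illustrative, not from this
dataset):
'''
from sklearn.preprocessing import StandardScaler
_X_train = np.array([[1.0], [2.0], [3.0]])    # toy training column
_X_test = np.array([[4.0], [5.0]])            # toy test column
_scaler = StandardScaler().fit(_X_train)      # learn mean/std on train only
_scaler.transform(_X_train)
_scaler.transform(_X_test)                    # apply the train statistics to test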
from sklearn.model_selection import train_test_split
data1_X = data3[['Id','SrCustomerId','paymentType','cash','RefillDeposit','PaidByDeposit',
'cashRatio','RefillDepositRatio','PaidByDepositRatio','iszero','ispurezero',
'CurrentCustomerGrade','cus_reject_rate','ProductRejectRate','maker_reject_rate','orderMakerCenter','workAge',
'cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit',
'cus_history_cashRatio','cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio','cus_history_zeroRatio','cus_history_purezeroRatio',
'induration','inthroughcnt','inaverageduration','inthroughrate','outduration','outthroughcnt','outaverageduration','outthroughrate',
'callduration','callthroughcnt','callaverageduration','callthroughrate','lastcallduration','lastcalltype',
'城市类型']]
data1_y = data3['y']
data1_X.shape
data1_y.shape
data1_train_X, data1_test_X, data1_train_y, data1_test_y = train_test_split(data1_X, data1_y, random_state=666, test_size=0.30)
data1_train_X.shape
data1_test_X.shape
data1_train_y.shape
data1_test_y.shape
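'''
Note: y is imbalanced (see the class counts further below). A stratified
variant of the split would keep the class ratio identical in both parts;
a sketch, left commented out to preserve the original split:
data1_train_X, data1_test_X, data1_train_y, data1_test_y = train_test_split(
    data1_X, data1_y, random_state=666, test_size=0.30, stratify=data1_y)
'''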
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False
data1_train = pd.concat([data1_train_X, data1_train_y],axis=1)
data1_test = pd.concat([data1_test_X, data1_test_y],axis=1)
data1_train.shape
data1_test.shape
data1_train[['SrCustomerId','paymentType','orderMakerCenter','lastcalltype','城市类型']].describe()
data1_train[['Id','cash','RefillDeposit','PaidByDeposit','cashRatio','RefillDepositRatio','PaidByDepositRatio','iszero','ispurezero']].describe()
data1_train[['CurrentCustomerGrade','cus_reject_rate','ProductRejectRate','maker_reject_rate','workAge']].describe()
data1_train[['cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit','cus_history_cashRatio',
'cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio','cus_history_zeroRatio','cus_history_purezeroRatio']].describe()
data1_train[['induration','inthroughcnt','inaverageduration','inthroughrate','outduration','outthroughcnt','outaverageduration','outthroughrate',
'callduration','callthroughcnt','callaverageduration','callthroughrate','lastcallduration','y']].describe()
data1_train[['iszero']].groupby(data1_train['iszero']).count()
data1_train[['ispurezero']].groupby(data1_train['ispurezero']).count()
data1_train[['y']].groupby(data1_train['y']).count()
data1_train[['CurrentCustomerGrade']].groupby(data1_train['CurrentCustomerGrade']).count()
'''Out[180]:
CurrentCustomerGrade
CurrentCustomerGrade
1 10889
2 6181
3 132059
4 28656
5 33709
6 72358
7 88308
8 25832
'''
fig,axis = plt.subplots(1,3)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['cash','RefillDeposit','PaidByDeposit']].plot(kind='box',ax=axis,subplots=True,color=color,sym='r+')
data1_train[['cashRatio','RefillDepositRatio','PaidByDepositRatio']].plot(kind='box',ax=axis,subplots=True,color=color,sym='r+')
fig,axis = plt.subplots(1,4)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['cus_reject_rate','ProductRejectRate','maker_reject_rate','workAge']].plot(kind='box',
ax=axis,subplots=True,color=color,sym='r+')
fig,axis = plt.subplots(1,3)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit']].plot(kind='box',
ax=axis,subplots=True,color=color,sym='r+')
data1_train[['cus_history_cashRatio','cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio']].plot(kind='box',
ax=axis,subplots=True,color=color,sym='r+')
data1_train[['cus_history_zeroRatio','cus_history_purezeroRatio','lastcallduration']].plot(kind='box',
ax=axis,subplots=True,color=color,sym='r+')
fig,axis = plt.subplots(1,4)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['induration','inthroughcnt','inaverageduration','inthroughrate']].plot(kind='box',
ax=axis,subplots=True,color=color,sym='r+')
data1_train[['outduration','outthroughcnt','outaverageduration','outthroughrate']].plot(kind='box',
ax=axis,subplots=True,color=color,sym='r+')
data1_train[['callduration','callthroughcnt','callaverageduration','callthroughrate']].plot(kind='box',
ax=axis,subplots=True,color=color,sym='r+')
'''
References:
Boxplot() usage explained in full: https://blog.csdn.net/weixin_40683253/article/details/87857194
Inspecting outlier distributions with box plots: https://blog.csdn.net/zm_1900/article/details/89074306
Drawing box plots in Python: https://blog.csdn.net/qq_41080850/article/details/83829045
'''
fig,axis = plt.subplots(1,1)
p = data1_train[['cash']].boxplot(return_type='dict',ax=axis,notch=False,boxprops={'color':'DarkGreen'},sym='r+')
x = p['fliers'][0].get_xdata()
y = p['fliers'][0].get_ydata()
len(x)
len(y)
type(x)
type(y)
y.min()
y.max()
type(data1_train.loc[data1_train.cash<1501.0,'cash'])
type(data1_train.loc[data1_train.cash<1501.0,'cash'].to_frame())
fig,axis = plt.subplots(1,1)
p1 = data1_train.loc[data1_train.cash<1501.0,'cash'].to_frame().boxplot(return_type='dict',ax=axis,notch=False,boxprops={'color':'DarkGreen'},sym='r+')
from scipy.stats import pearsonr
type(data1_train[['y']].T.values[0])
pearsonr(data1_train[['cash']].T.values[0], data1_train[['y']].T.values[0])
'''
(-0.0013504679129656334, 0.3942351282022717): the correlation coefficient is
tiny and the two-sided p-value is > 0.05, so there is no linear correlation.
Note: the correlation coefficient measures the relationship between two
numeric variables and does not apply to categorical variables; to relate a
categorical variable to a numeric one, use analysis of variance (ANOVA)
instead, as sketched below.
'''
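'''
A minimal sketch of the ANOVA idea above: scipy's one-way F-test compares the
distribution of a numeric column across the groups of the (categorical) label
y. The choice of 'cash' here is purely illustrative:
'''
from scipy.stats import f_oneway
f_oneway(data1_train.loc[data1_train['y'] == 0, 'cash'],
         data1_train.loc[data1_train['y'] == 1, 'cash'])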
from sklearn.model_selection import cross_val_score, ShuffleSplit   # sklearn.cross_validation was removed in newer versions
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
data1_train_X.shape
data1_train_y.shape
data1_train_X.head(2)
data1_train_X.columns
type(data1_train_X.loc[:,'cash'])
type(data1_train_X.loc[:,'cash'].values)
type(data1_train_y.values)
data1_train_X.loc[:,'cash'].values.ndim
type(data1_train_X.loc[:, 'cash'].values.reshape(-1,1))
data1_train_X.loc[:, 'cash'].values.reshape(-1,1).ndim
score = cross_val_score(rf, data1_train_X.loc[:, 'cash'].values.reshape(-1,1), data1_train_y.values, scoring="r2", cv=3)
score
scores = []
for column in ['cash','RefillDeposit']:
    score = cross_val_score(rf, data1_train_X.loc[:, column].values.reshape(-1,1), data1_train_y.values, scoring="r2", cv=3)
    scores.append((round(np.mean(score), 3), column))
print(sorted(scores, reverse=True))
data1_train.head(2)
data1_train.shape
data1_train.loc[:,'y'].head(3)
type(data1_train.loc[:,'y'])
type(data1_train[['y']])
type(data1_train[['y']].values)
type(data1_train['y'])
type(data1_train['y'].values)
sum(data1_train.ProductRejectRate.isnull())
sum(data1_train.lastcallduration.isnull())
# Fill missing ProductRejectRate with the mean of the corresponding y group,
# and missing lastcallduration with 0
ProductRejectRate_mean = data1_train['ProductRejectRate'].groupby(data1_train['y']).mean()
for index in data1_train.loc[data1_train.ProductRejectRate.isnull()].index:
    data1_train.loc[index,'ProductRejectRate'] = ProductRejectRate_mean[data1_train.loc[index,'y']]
data1_train.loc[data1_train.lastcallduration.isnull(),'lastcallduration'] = 0
scores = []
for column in [ 'cash', 'RefillDeposit', 'PaidByDeposit', 'cashRatio', 'RefillDepositRatio', 'PaidByDepositRatio',
'iszero', 'ispurezero', 'cus_reject_rate', 'ProductRejectRate', 'maker_reject_rate', 'workAge',
'cus_history_cash', 'cus_history_RefillDeposit', 'cus_history_PaidByDeposit',
'cus_history_cashRatio', 'cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio',
'cus_history_zeroRatio', 'cus_history_purezeroRatio',
'induration', 'inthroughcnt','inaverageduration', 'inthroughrate', 'outduration', 'outthroughcnt',
'outaverageduration', 'outthroughrate', 'callduration',
'callthroughcnt', 'callaverageduration', 'callthroughrate', 'lastcallduration']:
    score = cross_val_score(rf, data1_train.loc[:,column].values.reshape(-1,1), data1_train.loc[:,'y'].values, scoring="r2", cv=3)
    scores.append((round(np.mean(score),3), column))
print(sorted(scores,reverse=True))
'''
[(0.051, 'maker_reject_rate'), (0.05, 'ProductRejectRate'), (0.041, 'cus_reject_rate'), (0.04, 'cash'), (0.037, 'workAge'),
(0.034, 'cus_history_cash'), (0.033, 'outduration'), (0.033, 'cus_history_cashRatio'), (0.033, 'cashRatio'), (0.033, 'callduration'),
(0.029, 'outthroughcnt'), (0.028, 'callthroughcnt'), (0.026, 'cus_history_PaidByDepositRatio'), (0.026, 'cus_history_PaidByDeposit'),
(0.025, 'iszero'), (0.023, 'cus_history_zeroRatio'), (0.018, 'cus_history_RefillDepositRatio'), (0.018, 'cus_history_RefillDeposit'),
(0.014, 'ispurezero'), (0.013, 'cus_history_purezeroRatio'), (0.012, 'PaidByDepositRatio'), (0.011, 'PaidByDeposit'), (0.009, 'outaverageduration'),
(0.009, 'callaverageduration'), (0.005, 'outthroughrate'), (0.005, 'callthroughrate'), (0.004, 'inthroughcnt'), (0.003, 'inaverageduration'),
(0.003, 'RefillDeposit'), (0.002, 'inthroughrate'), (0.002, 'induration'), (0.002, 'RefillDepositRatio'), (0.001, 'lastcallduration')]
Setting the threshold at 0.01, i.e. keeping the features that score >= 0.01, selects 22 features for modeling.
A chi-square test could also be used for feature selection, as sketched below.
'''
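'''
A minimal sketch of the chi-square feature selection mentioned above.
sklearn's chi2 requires non-negative inputs, so only a few count/rate-style
columns are used here; the column list and k are illustrative:
'''
from sklearn.feature_selection import SelectKBest, chi2
_chi2_cols = ['cash', 'cus_reject_rate', 'ProductRejectRate', 'maker_reject_rate']
_selector = SelectKBest(chi2, k=2).fit(data1_train[_chi2_cols], data1_train['y'])
sorted(zip(_selector.scores_, _chi2_cols), reverse=True)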
data1_train.columns
a=['Id', 'SrCustomerId', 'paymentType', 'cash', 'RefillDeposit', 'PaidByDeposit', 'cashRatio', 'RefillDepositRatio',
'PaidByDepositRatio', 'iszero', 'ispurezero', 'CurrentCustomerGrade', 'cus_reject_rate', 'ProductRejectRate', 'maker_reject_rate',
'orderMakerCenter', 'workAge', 'cus_history_cash', 'cus_history_RefillDeposit', 'cus_history_PaidByDeposit',
'cus_history_cashRatio', 'cus_history_RefillDepositRatio', 'cus_history_PaidByDepositRatio', 'cus_history_zeroRatio',
'cus_history_purezeroRatio', 'induration', 'inthroughcnt', 'inaverageduration', 'inthroughrate', 'outduration', 'outthroughcnt',
'outaverageduration', 'outthroughrate', 'callduration', 'callthroughcnt', 'callaverageduration', 'callthroughrate',
'lastcallduration', 'lastcalltype', '城市类型', 'y']
data1_train_X.head()
data1_train_X.describe()
data1_train_X['lastcalltype'].describe()
data1_train_X[['ProductRejectRate','lastcallduration']].describe()
sum(data1_train_X.lastcalltype.isnull())
sum(data1_train_X.ProductRejectRate.isnull())
sum(data1_train_X.lastcallduration.isnull())
data1_test_X.describe()
data1_test_X['lastcalltype'].describe()
data1_test_X[['ProductRejectRate','lastcallduration']].describe()
sum(data1_test_X.lastcalltype.isnull())
sum(data1_test_X.ProductRejectRate.isnull())
sum(data1_test_X.lastcallduration.isnull())
'''
Training set:
'lastcalltype'      207 missing — assign a separate category value 'no' for now
                    (time permitting, impute it later with a RandomForest fitted on the existing features; see the sketch below)
'ProductRejectRate'   3 missing — fill with the mean of the corresponding y group
'lastcallduration'  207 missing — fill with 0 (most likely the last call before the order was matched; perhaps there was no call?)
Test set:
'lastcalltype'       86 missing — assign a separate category value 'no' for now (same RandomForest idea as above)
'ProductRejectRate'   2 missing — fill with the mean of the corresponding y group
'lastcallduration'   86 missing — fill with 0 (most likely the last call before the order was matched; perhaps there was no call?)
'''
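'''
A minimal sketch of the RandomForest imputation of 'lastcalltype' mentioned
above, kept commented out because the 'no' category is used below instead.
The feature list _feats is illustrative; rows with a known lastcalltype train
the classifier, which then predicts the missing rows:

from sklearn.ensemble import RandomForestClassifier
known = data1_train_X.loc[data1_train_X['lastcalltype'].notnull()]
unknown = data1_train_X.loc[data1_train_X['lastcalltype'].isnull()]
_feats = ['callduration', 'callthroughcnt', 'callthroughrate']
rfc = RandomForestClassifier(n_estimators=100).fit(known[_feats], known['lastcalltype'])
data1_train_X.loc[data1_train_X['lastcalltype'].isnull(), 'lastcalltype'] = rfc.predict(unknown[_feats])
'''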
data2_train = pd.concat([data1_train_X, data1_train_y],axis=1)
data2_test = pd.concat([data1_test_X, data1_test_y],axis=1)
data2_train.shape
data2_test.shape
data2_test['lastcalltype'].unique()
sum(data2_train['lastcalltype'].isnull())
sum(data2_test['lastcalltype'].isnull())
data2_train.loc[data2_train.lastcalltype.isnull(),'lastcalltype'] = 'no'
data2_test.loc[data2_test.lastcalltype.isnull(),'lastcalltype'] = 'no'
sum(data2_train['ProductRejectRate'].isnull())
sum(data2_test['ProductRejectRate'].isnull())
ProductRejectRate_mean = data2_train['ProductRejectRate'].groupby(data2_train['y']).mean()
for index in data2_train.loc[data2_train.ProductRejectRate.isnull()].index:
    data2_train.loc[index,'ProductRejectRate'] = ProductRejectRate_mean[data2_train.loc[index,'y']]
# Note: strictly speaking, the training-set group means should also be used for
# the test set; computing them from test-set y values, as below, peeks at the test labels
ProductRejectRate_mean = data2_test['ProductRejectRate'].groupby(data2_test['y']).mean()
for index in data2_test.loc[data2_test.ProductRejectRate.isnull()].index:
    data2_test.loc[index,'ProductRejectRate'] = ProductRejectRate_mean[data2_test.loc[index,'y']]
sum(data2_train.lastcallduration.isnull())
sum(data2_test.lastcallduration.isnull())
data2_train.loc[data2_train.lastcallduration.isnull(),'lastcallduration']=0
data2_test.loc[data2_test.lastcallduration.isnull(),'lastcallduration']=0
'''
Features to dummy-encode: 'paymentType','CurrentCustomerGrade','orderMakerCenter','lastcalltype','城市类型'
'''
dummies_paymentType_train = pd.get_dummies(data2_train['paymentType'],prefix='paymentType')
dummies_CurrentCustomerGrade_train = pd.get_dummies(data2_train['CurrentCustomerGrade'],prefix='CurrentCustomerGrade')
dummies_orderMakerCenter_train = pd.get_dummies(data2_train['orderMakerCenter'],prefix='orderMakerCenter')
dummies_lastcalltype_train = pd.get_dummies(data2_train['lastcalltype'],prefix='lastcalltype')
dummies_citytype_train = pd.get_dummies(data2_train['城市类型'],prefix='城市类型')
data3_train = pd.concat([data2_train,dummies_paymentType_train,dummies_CurrentCustomerGrade_train,dummies_orderMakerCenter_train,
dummies_lastcalltype_train,dummies_citytype_train],axis=1)
data3_train.drop(['paymentType','CurrentCustomerGrade','orderMakerCenter','lastcalltype','城市类型'],axis=1,inplace=True)
data3_train.head()
data3_train.shape
dummies_paymentType_test = pd.get_dummies(data2_test['paymentType'],prefix='paymentType')
dummies_CurrentCustomerGrade_test = pd.get_dummies(data2_test['CurrentCustomerGrade'],prefix='CurrentCustomerGrade')
dummies_orderMakerCenter_test = pd.get_dummies(data2_test['orderMakerCenter'],prefix='orderMakerCenter')
dummies_lastcalltype_test = pd.get_dummies(data2_test['lastcalltype'],prefix='lastcalltype')
dummies_citytype_test = pd.get_dummies(data2_test['城市类型'],prefix='城市类型')
data3_test = pd.concat([data2_test,dummies_paymentType_test,dummies_CurrentCustomerGrade_test,dummies_orderMakerCenter_test,
dummies_lastcalltype_test,dummies_citytype_test],axis=1)
data3_test.drop(['paymentType','CurrentCustomerGrade','orderMakerCenter','lastcalltype','城市类型'],axis=1,inplace=True)
data3_test.shape
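'''
Because get_dummies runs on train and test separately, a category present in
only one split would leave the two frames with mismatched columns. A minimal
guard (a sketch added here, not in the original flow): reindex test to the
train columns, filling any missing dummy column with 0:
'''
data3_test = data3_test.reindex(columns=data3_train.columns, fill_value=0)
data3_test.shape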
data4_train = data3_train.copy()
data4_test = data3_test.copy()
import sklearn.preprocessing as preprocessing
list1 = ['cash','RefillDeposit','PaidByDeposit','workAge',
'cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit',
'induration','inthroughcnt','inaverageduration','outduration','outthroughcnt','outaverageduration',
'callduration','callthroughcnt','callaverageduration','lastcallduration']
scaler = preprocessing.StandardScaler()
for col in list1:
    # Fit on the training column only, then reuse the learned mean/std on the
    # test column (per the standardization reference above)
    data4_train[col] = scaler.fit_transform(data4_train[col].values.reshape(-1,1))
    data4_test[col] = scaler.transform(data4_test[col].values.reshape(-1,1))
data4_train.head()
data4_train.shape
data4_test.head()
data4_test.shape
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
data_train_X = data4_train.drop(['Id','SrCustomerId','y'],axis=1)
data_train_y = data4_train['y']
data_test_X = data4_test.drop(['Id','SrCustomerId','y'],axis=1)
data_test_y = data4_test['y']
data_train_X.shape
data_test_X.shape
data_train_y.shape
data_test_y.shape
def PolynomialLogisticRegression(degree, C, penalty):
    return Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('log_reg', LogisticRegression(C=C, penalty=penalty))
    ])
clf = PolynomialLogisticRegression(degree=2, C=1.0, penalty='l2')
clf.fit(data_train_X,data_train_y)
clf.score(data_train_X,data_train_y)
clf.score(data_test_X,data_test_y)
'''
clf = LogisticRegression(penalty='l2',tol=1e-6, C=1.0, solver='liblinear')
clf.fit(data_train_X, data_train_y)
(penalty='l2', dual=False, tol=0.0001, C=1.0,fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None,
solver='liblinear', max_iter=100, multi_class='ovr', verbose=0,warm_start=False, n_jobs=1)
'''
data_test_y_predict = clf.predict(data_test_X)
data_test_y_predict_log_proba = clf.predict_log_proba(data_test_X)
data_test_y_predict_proba = clf.predict_proba(data_test_X)
clf.score(data_test_X, data_test_y)
# Hand-rolled confusion-matrix cells: true/false negatives and positives
def TN(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 0) & (y_predict == 0))
TN(data_test_y, data_test_y_predict)
def FP(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 0) & (y_predict == 1))
FP(data_test_y, data_test_y_predict)
def FN(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 1) & (y_predict == 0))
FN(data_test_y, data_test_y_predict)
def TP(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 1) & (y_predict == 1))
TP(data_test_y, data_test_y_predict)
def confusion_matrix(y_true, y_predict):
    return np.array([
        [TN(y_true, y_predict), FP(y_true, y_predict)],
        [FN(y_true, y_predict), TP(y_true, y_predict)]
    ])
confusion_matrix(data_test_y, data_test_y_predict)
'''
array([[154297, 615],
[ 15063, 593]], dtype=int64)
'''
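'''
As a cross-check: sklearn's built-in confusion matrix uses the same
[[TN, FP], [FN, TP]] layout for 0/1 labels. Imported under an alias so it
does not shadow the hand-rolled confusion_matrix above:
'''
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
sk_confusion_matrix(data_test_y, data_test_y_predict)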
def precision_score(y_true, y_predict):
    tp = TP(y_true, y_predict)
    fp = FP(y_true, y_predict)
    try:
        return tp / (tp + fp)
    except ZeroDivisionError:
        return 0.0
precision_score(data_test_y, data_test_y_predict)
def recall_score(y_true, y_predict):
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    try:
        return tp / (tp + fn)
    except ZeroDivisionError:
        return 0.0
recall_score(data_test_y, data_test_y_predict)
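'''
Precision and recall combine into the F1 score (their harmonic mean); a
minimal sketch built on the two functions above:
'''
def f1_score_manual(y_true, y_predict):
    p = precision_score(y_true, y_predict)
    r = recall_score(y_true, y_predict)
    try:
        return 2 * p * r / (p + r)
    except ZeroDivisionError:
        return 0.0
f1_score_manual(data_test_y, data_test_y_predict)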
decision_scores = clf.decision_function(data_test_X)
min(decision_scores)
max(decision_scores)
def TPR(y_true, y_predict):
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    try:
        return tp / (tp + fn)
    except ZeroDivisionError:
        return 0.0
def FPR(y_true, y_predict):
    fp = FP(y_true, y_predict)
    tn = TN(y_true, y_predict)
    try:
        return fp / (fp + tn)
    except ZeroDivisionError:
        return 0.0
fprs = []
tprs = []
thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
for threshold in thresholds:
    y_predict = np.array(decision_scores >= threshold, dtype=int)
    fprs.append(FPR(data_test_y, y_predict))
    tprs.append(TPR(data_test_y, y_predict))
plt.plot(fprs, tprs,color='red')
plt.show()
from sklearn.metrics import roc_curve
y_score = clf.predict_proba(data_test_X)
print(y_score)
fpr, tpr, _ = roc_curve(data_test_y, y_score[:,1])
print("fpr:",fpr)
print("tpr:", tpr)
y_score2 = clf.decision_function(data_test_X)
print(y_score2)
fpr2, tpr2, _ = roc_curve(data_test_y, y_score2)
print("fpr:",fpr2)
print("tpr:", tpr2)
from sklearn.metrics import auc
def plot_roc_curve(fpr, tpr):
    plt.figure()
    lw = 2
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0,1], [0,1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0, 1.0])
    plt.ylim([0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('roc')
    plt.legend(loc="lower right")
    plt.show()
plot_roc_curve(fpr, tpr)
fprs, tprs, thresholds = roc_curve(data_test_y, decision_scores)
plt.plot(fprs, tprs)
plt.show()
from sklearn.metrics import roc_auc_score
roc_auc_score(data_test_y, decision_scores)
df = pd.DataFrame({"columns":list(data_test_X.columns)[:], "coef":list(clf.coef_[0].T)})
df
from sklearn.model_selection import cross_val_score   # replaces the removed sklearn.cross_validation
clf = LogisticRegression(penalty='l2',tol=1e-6, C=1.0, solver='liblinear')
all_data_X = pd.concat([data4_train,data4_test],axis=0).drop(['Id','SrCustomerId','y'],axis=1)
all_data_X.head()
all_data_X.shape
all_data_y = pd.concat([data4_train['y'],data4_test['y']],axis=0)
all_data_y.head()
all_data_y.shape
s = cross_val_score(clf, all_data_X, all_data_y, cv=5)
print(s)
from sklearn.model_selection import learning_curve   # replaces the removed sklearn.learning_curve
def plot_learning_curve_nxn(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                            train_sizes=np.linspace(0.05, 1.0, 20), verbose=0, plot=True):
    '''
    Plot the learning curve of the data on the given model. Parameters:
    ------------
    estimator : the classifier you are using
    title : title of the chart
    X : input features, numpy array
    y : input target vector
    ylim : tuple (ymin, ymax) setting the lowest and highest points of the y-axis
    cv : number of folds for cross-validation; one fold is used as the CV set
         and the remaining n-1 as the training set (default 3)
    n_jobs : number of parallel jobs (default 1)
    '''
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"Number of training samples")
        plt.ylabel(u"Score")
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"Cross-validation score")
        plt.legend(loc="best")
        plt.draw()
        plt.show()
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
plot_learning_curve_nxn(clf, u"Learning curve", all_data_X, all_data_y, cv=5)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(0.05, 1.0, 20), verbose=0, plot=True):
    '''
    Plot the learning curve of the data on the given model. Parameters:
    ------------
    estimator : the classifier you are using
    title : title of the chart
    X : input features, numpy array
    y : input target vector
    ylim : tuple (ymin, ymax) setting the lowest and highest points of the y-axis
    cv : number of folds for cross-validation; one fold is used as the CV set
         and the remaining n-1 as the training set (default 3)
    n_jobs : number of parallel jobs (default 1)
    '''
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"Number of training samples")
        plt.ylabel(u"Score")
        plt.gca().invert_yaxis()
        '''
        On gca(): https://blog.csdn.net/Dontla/article/details/98327176
        The current figure and axes are obtained with plt.gcf() and plt.gca(),
        short for Get Current Figure and Get Current Axes. Many pyplot functions
        operate on the current Figure or Axes object; for example, plt.plot()
        fetches the current Axes ax via plt.gca() and then calls ax.plot() to do
        the actual drawing.
        invert_yaxis() flips the Y axis.
        '''
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"Cross-validation score")
        plt.legend(loc="best")
        plt.draw()
        plt.show()
        plt.gca().invert_yaxis()
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff
plot_learning_curve(clf, u"Learning curve", all_data_X, all_data_y)
'''
data_train_X = data4_train.drop(['Id','SrCustomerId','y'],axis=1)  # training-set features
data_train_y = data4_train['y']                                    # training-set target
data_test_X = data4_test.drop(['Id','SrCustomerId','y'],axis=1)    # test-set features
data_test_y = data4_test['y']                                      # test-set target
data_train_X.shape  # (397992, 57)
data_test_X.shape   # (170568, 57)
data_train_y.shape  # (397992,)
data_test_y.shape   # (170568,)
'''
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import accuracy_score
clf = LogisticRegression(penalty='l2', tol=1e-6, C=1.0)
bagging_clf = BaggingRegressor(clf, n_estimators=2, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(data_train_X, data_train_y)
predictions = bagging_clf.predict(data_test_X)
# BaggingRegressor averages its two base estimators' 0/1 outputs, so
# predictions take the values 0, 0.5 and 1; threshold them before
# accuracy_score, which expects discrete class labels
print(accuracy_score(data_test_y, (predictions >= 0.5).astype(int)))
def TN(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 0) & (y_predict == 0))
TN(data_test_y, predictions)
def FP(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 0) & (y_predict == 1))
FP(data_test_y, predictions)
def FN(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 1) & (y_predict == 0))
FN(data_test_y, predictions)
def TP(y_true, y_predict):
    assert len(y_true) == len(y_predict)
    return np.sum((y_true == 1) & (y_predict == 1))
TP(data_test_y, predictions)
def confusion_matrix(y_true, y_predict):
    return np.array([
        [TN(y_true, y_predict), FP(y_true, y_predict)],
        [FN(y_true, y_predict), TP(y_true, y_predict)]
    ])
confusion_matrix(data_test_y, predictions)
'''
array([[154226, 546],
[ 15010, 532]], dtype=int64)
'''
precision_score(data_test_y, predictions)
recall_score(data_test_y, predictions)
len(predictions)
len(data_test_y)
np.unique(data_test_y)
np.unique(predictions)
a = pd.DataFrame(predictions,columns=['result'])
a.columns
a['result'].groupby(a['result']).count()
'''
result
0.0 169236
0.5 254
1.0 1078
Name: result, dtype: int64
'''
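'''
The 0.5 rows above are the averaged scores from the two base estimators; the
raw predictions are not class labels. A minimal sketch that thresholds them
at 0.5 before computing the classification metrics:
'''
predictions_label = (predictions >= 0.5).astype(int)
confusion_matrix(data_test_y, predictions_label)
precision_score(data_test_y, predictions_label)
recall_score(data_test_y, predictions_label)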
def forward_select(data, response):
    '''
    Forward stepwise feature selection by AIC for a binomial (logistic) GLM.
    Note: patsy formulas need syntactically valid column names; dummy columns
    containing dots or non-ASCII characters may require Q() quoting.
    '''
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = float('inf'), float('inf')
    while remaining:
        aic_with_candidates = []
        for candidate in remaining:
            formula = "{}~{}".format(response, "+".join(selected + [candidate]))
            # Binomial's default link is logit, so no explicit link is needed
            aic = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit().aic
            aic_with_candidates.append((aic, candidate))
        aic_with_candidates.sort(reverse=True)
        best_new_score, best_candidate = aic_with_candidates.pop()
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print("aic is {}, continuing!".format(current_score))
        else:
            print("forward selection over!")
            break
    formula = "{}~{}".format(response, "+".join(selected))
    print("final formula is {}".format(formula))
    model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit()
    return model
data_for_select = pd.concat([data_train_X,data_train_y],axis=1)
lg_model = forward_select(data=data_for_select,response='y')
lg_model.summary().tables[1]
import sklearn.metrics as metrics
data_test = pd.concat([data_test_X,data_test_y],axis=1)
fpr,tpr,th = metrics.roc_curve(data_test_y,lg_model.predict(data_test))
plt.figure(figsize=[6,6])
plt.plot(fpr,tpr,'b--')
plt.title("ROC curve")
plt.show()
print("AUC = %.4f"%metrics.auc(fpr,tpr))