# test  -- FIX: stray top-level token commented out. As bare code it raised
# NameError when the script ran, and it also pushed the '# -*- coding -*-'
# declaration below line 2, where Python ignores it.

# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 22:13:17 2020

@author: xn_katherine
"""

### 项目流程
'''
1、分析项目需求,明确具体问题
2、获取数据
3、特征预处理与特征选择(归一化、离散化、因子化、缺失值处理、去除共线性等,需要运用特征有效性分析的相关技术,如相关系数、卡方检验、平均互信息、条件熵、后验概率、逻辑回归权重等方法)
4、训练模型与调优
5、模型诊断( 过拟合/欠拟合、误差分析)
6、模型融合
7、上线运行
'''

# 一、业务理解

# 二、获取数据及初步的数据分析
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
#import os

### Load the data

# Raw order export: each line is a row of double-quote-wrapped fields
# separated by tab, i.e. the literal separator between fields is '"\t"'.
# NOTE(review): "D:t.txt" is a drive-relative Windows path; "D:/t.txt" was
# probably intended -- confirm.
file = open("D:t.txt",encoding='utf-8')
cityType =  pd.read_excel("D:/city.xlsx",sheet_name="Sheet1")


# Probe the first line to understand the format: strip trailing whitespace
# and the outer quotes, then split on the '"\t"' separators.
a = file.readline()
aa = a.strip().strip('"').split('"\t"')
aa[0]        # id string, e.g. '13279119'
type(aa[1])  # str; eval(aa[1]) -> int, eval(aa[7]) -> float, eval(aa[2]) raises (free text)
file.close()


# Read the whole file into a list of field lists.
# BUG FIX: the original called file.readlines() AFTER file.close(), which
# raises "I/O operation on closed file"; it had also already consumed the
# header line with the probe above. Re-open (as a context manager) and read
# from the top, so data0[0] is the header row -- later code builds the
# DataFrame as pd.DataFrame(data0[1:], columns=data0[0]).
data0 = []
with open("D:t.txt",encoding='utf-8') as f:
    for line in f:
        data0.append(line.strip().strip('"').split('"\t"'))

# Spot-check the parsed rows: data0[0] is the header row, data0[1:] are records.
data0[0:2]
len(data0) #573051
data0[1][0]
data0[1][2]
data0[1][3]
data0[1][4]
type(data0[1][4])
data0[1][5]
data0[1][6]
data0[1][19]
data0[1][42]

data0[1][43]
type(data0[1][18])


#str -> int/float conversion, approach 1.
# Columns 2..6, 19 and 42 are genuinely textual and stay str; empty cells get
# a tiny sentinel float; every other cell is parsed as a numeric literal.
import ast  # ast.literal_eval parses literals without eval()'s arbitrary-code risk

_text_cols = {2, 3, 4, 5, 6, 19, 42}
for i in range(1, len(data0)):          # FIX: was a hard-coded 573051
    for j in range(len(data0[i])):      # FIX: was a hard-coded 44
        if j in _text_cols:
            continue                    # keep as str (the original's self-assignment was a no-op)
        elif len(data0[i][j]) == 0:
            data0[i][j] = 1.0e-10       # sentinel for empty fields (could also be float('inf'))
            print(j)
        else:
            data0[i][j] = ast.literal_eval(data0[i][j])  # FIX: was eval()

'''包含空字符串,设置为1.0e-10 的列 有:
[17]'ProductRejectRate'
[41]'lastcallduration' 
'''

# Count cells in column 17 ('ProductRejectRate') by their current type.
data0[0][17]

# how many cells in column 17 are still raw strings
a = sum(1 for i in range(1, 573051, 1) if isinstance(data0[i][17], str))

# how many cells in column 17 were converted to float
a = sum(1 for i in range(1, 573051, 1) if isinstance(data0[i][17], float))
a

#str -> int/float conversion, approach 2: empty fields become np.nan instead
#of a sentinel. (Alternative to approach 1 above; meant for freshly parsed rows.)
import ast  # ast.literal_eval parses literals without eval()'s arbitrary-code risk

for i in range(1, len(data0)):      # FIX: was a hard-coded 573051
    for j in range(len(data0[i])):  # FIX: was a hard-coded 44
        cell = data0[i][j]
        if not isinstance(cell, str):
            continue                # robustness: tolerate cells already converted by approach 1
        if len(cell) == 0:
            data0[i][j] = np.nan
        elif((j>=2 and j<=6) or j==19 or j==42):
            pass                    # textual columns stay str
        else:
            data0[i][j] = ast.literal_eval(cell)  # FIX: was eval()

data0[0:2]


#Build a DataFrame: the first parsed row is the header, the rest are records.
data = pd.DataFrame(data0[1:],columns=data0[0])
data.head(2)
data.shape #(573050, 44)
data.describe()

data.columns
data.lastcalltype.describe()
data['lastcalltype'].groupby(data['lastcalltype']).count()
sum(data['lastcalltype']=='') #300 empty strings
data.loc[data['lastcalltype']=='','lastcalltype']
data.where(data['lastcalltype']=='')

# Replace empty strings with NaN in the string-typed columns.
for col in ['SrCustomerId','CreationTime','provinceName','cityName','paymentType','orderMakerCenter','lastcalltype']:
    data.loc[data[col]=='',col] = np.nan


#Drop samples with ShippingStatus in (1,2); then label 3/6/7 as y=1 (rejected)
#and 4/5 as y=0 (not rejected).
data.loc[[0,2],'ShippingStatus']
sum(data.ShippingStatus.isin([1,2])) #4490
data.loc[data.ShippingStatus.isin([1,2]),'ShippingStatus']
sum(data['ShippingStatus'] > 2) #568560

# BUG FIX: take an explicit copy. The original kept a slice of `data`, so the
# label assignments below triggered SettingWithCopyWarning and could silently
# fail to write back (the 'error' the scratch block below records).
data1 = data.loc[data['ShippingStatus'] > 2].copy()
data1.head(2)
data1.shape #(568560, 44) 
data1.describe()
data1[0:2][['ShippingStatus']]

'''test:
a=pd.DataFrame([list('abcd'), list('efgh'), list('ijkl'), list('mnop')],
                columns=['one', 'two','first', 'second'])
a.loc[0,'one']=0
a.loc[a.one.isin(['e']),'two']=1

data1.loc[0,'ShippingStatus']=1 #error
data1.loc[0,'y']=1 #error
data1.loc[data1.ShippingStatus.isin([3,6,7]),'ShippingStatus'] = 1 #error,使用下方方式代替
data1.loc[data1.ShippingStatus.isin([4,5]),'ShippingStatus'] = 0 #error,使用下方方式代替
'''
# Derive the binary target y from ShippingStatus.
data1.loc[data1.ShippingStatus.isin([3,6,7]),'y'] = 1
data1.loc[data1.ShippingStatus.isin([4,5]),'y'] = 0
data1['y'].groupby(data1['y']).count()    #0.0    516407    1.0     52153
data1.head(2)
data1.shape #(568560, 45)
data1.describe()

# Attach the city-tier category: join on cityName against the lookup table.
cityType.head()
data2 = pd.merge(data1,cityType,left_on='cityName',right_on='名称.1',how='left')
data2.shape #(568560, 49)
data2.head(2)
sum(pd.isnull(data2['城市类型'])) #17924 rows with no matching city tier
a = data2.loc[pd.isnull(data2['城市类型']),'cityName']
#a.to_csv("D:/citydefault.csv") # produced garbled output, hence to_excel below
a.to_excel("D:/citydefault.xlsx") # export the unmatched cities; after inspection they are all assigned tier 5 ('五线')

# Set every missing city tier ('城市类型') to '五线' (tier 5).
data3 = data2.copy()
sum(pd.isnull(data3['城市类型'])) #17924
data3.loc[pd.isnull(data3['城市类型']),'城市类型']='五线'
data3.shape #(568560, 49)
data3.head(2)






### Split into train/test sets and separate features from the target;
### 'ShippingStatus' itself is excluded -> data1_X (features), data1_y (target)
'''
参考:
数据标准化常见问题:对整个数据集数据标准化后再划分训练集、测试集和先对训练级标准化再将规则用于测试集有什么区别(Python实现)
https://blog.csdn.net/qq_40304090/article/details/90597892
'''

from sklearn.model_selection import train_test_split

data1_X = data3[['Id','SrCustomerId','paymentType','cash','RefillDeposit','PaidByDeposit',
 'cashRatio','RefillDepositRatio','PaidByDepositRatio','iszero','ispurezero',
 'CurrentCustomerGrade','cus_reject_rate','ProductRejectRate','maker_reject_rate','orderMakerCenter','workAge',
 'cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit',
 'cus_history_cashRatio','cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio','cus_history_zeroRatio','cus_history_purezeroRatio',
 'induration','inthroughcnt','inaverageduration','inthroughrate','outduration','outthroughcnt','outaverageduration','outthroughrate',
 'callduration','callthroughcnt','callaverageduration','callthroughrate','lastcallduration','lastcalltype',
 '城市类型']]
data1_y = data3['y']
data1_X.shape #(568560, 40) feature matrix of the full sample set
data1_y.shape #(568560,) target vector of the full sample set

data1_train_X, data1_test_X, data1_train_y, data1_test_y = train_test_split(data1_X, data1_y, random_state=666, test_size=0.30)  #(sklearn的train_test_split:https://www.cnblogs.com/bonelee/p/8036024.html) 
data1_train_X.shape #(397992, 40) 
data1_test_X.shape #(170568, 40) 
data1_train_y.shape #(397992,)
data1_test_y.shape #(170568,)



### Initial data analysis (analyse data1_train only; the cleaning / preprocessing /
### feature-selection rules derived here are then applied to the test set).
#For data1_train: describe() + boxplots + outlier analysis (3-sigma / boxplot),
#missing values, normalisation, feature selection.
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=['SimHei'] # default font: SimHei (renders CJK labels)
mpl.rcParams['axes.unicode_minus']=False # render minus signs correctly with that font

# Recombine features + target so train/test can be analysed as single frames.
data1_train = pd.concat([data1_train_X, data1_train_y],axis=1)
data1_test = pd.concat([data1_test_X, data1_test_y],axis=1)
data1_train.shape # (397992, 41)
data1_test.shape # (170568, 41)

# describe() column groups: categoricals, amounts/ratios, rates, history, call stats.
data1_train[['SrCustomerId','paymentType','orderMakerCenter','lastcalltype','城市类型']].describe()
data1_train[['Id','cash','RefillDeposit','PaidByDeposit','cashRatio','RefillDepositRatio','PaidByDepositRatio','iszero','ispurezero']].describe()
data1_train[['CurrentCustomerGrade','cus_reject_rate','ProductRejectRate','maker_reject_rate','workAge']].describe()
data1_train[['cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit','cus_history_cashRatio',
      'cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio','cus_history_zeroRatio','cus_history_purezeroRatio']].describe()
data1_train[['induration','inthroughcnt','inaverageduration','inthroughrate','outduration','outthroughcnt','outaverageduration','outthroughrate',
      'callduration','callthroughcnt','callaverageduration','callthroughrate','lastcallduration','y']].describe()


# Frequency counts for the binary / ordinal columns
# ( boxplot tutorial: https://blog.csdn.net/qq_41080850/article/details/83829045 )

data1_train[['iszero']].groupby(data1_train['iszero']).count()      #0:307227       1:90765 
data1_train[['ispurezero']].groupby(data1_train['ispurezero']).count()      #0:338762       1:59230 
data1_train[['y']].groupby(data1_train['y']).count()   #0.0  361495    1.0   36497
data1_train[['CurrentCustomerGrade']].groupby(data1_train['CurrentCustomerGrade']).count()
'''Out[180]: 
                      CurrentCustomerGrade
CurrentCustomerGrade                      
1                                    10889
2                                     6181
3                                   132059
4                                    28656
5                                    33709
6                                    72358
7                                    88308
8                                    25832
'''

# Boxplots per feature group.
# NOTE(review): each consecutive .plot(...) call below reuses the SAME axes
# array, so the later call draws into the axes the earlier call used --
# intended for interactive, one-statement-at-a-time execution.
fig,axis = plt.subplots(1,3)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['cash','RefillDeposit','PaidByDeposit']].plot(kind='box',ax=axis,subplots=True,color=color,sym='r+')
data1_train[['cashRatio','RefillDepositRatio','PaidByDepositRatio']].plot(kind='box',ax=axis,subplots=True,color=color,sym='r+')

fig,axis = plt.subplots(1,4)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['cus_reject_rate','ProductRejectRate','maker_reject_rate','workAge']].plot(kind='box',
           ax=axis,subplots=True,color=color,sym='r+')

fig,axis = plt.subplots(1,3)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit']].plot(kind='box',
           ax=axis,subplots=True,color=color,sym='r+')
data1_train[['cus_history_cashRatio','cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio']].plot(kind='box',
           ax=axis,subplots=True,color=color,sym='r+')
data1_train[['cus_history_zeroRatio','cus_history_purezeroRatio','lastcallduration']].plot(kind='box',
           ax=axis,subplots=True,color=color,sym='r+')

fig,axis = plt.subplots(1,4)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Red')
data1_train[['induration','inthroughcnt','inaverageduration','inthroughrate']].plot(kind='box',
           ax=axis,subplots=True,color=color,sym='r+')
data1_train[['outduration','outthroughcnt','outaverageduration','outthroughrate']].plot(kind='box',
           ax=axis,subplots=True,color=color,sym='r+')
data1_train[['callduration','callthroughcnt','callaverageduration','callthroughrate']].plot(kind='box',
           ax=axis,subplots=True,color=color,sym='r+')



### Using 'cash' as the example feature: extract its boxplot outliers,
### then apply feature-selection techniques below.
'''
箱型图boxplot()所有用法详解: https://blog.csdn.net/weixin_40683253/article/details/87857194 
箱线图查看异常值分布 : https://blog.csdn.net/zm_1900/article/details/89074306
Python实现箱形图的绘制 : https://blog.csdn.net/qq_41080850/article/details/83829045
'''
fig,axis = plt.subplots(1,1)
p = data1_train[['cash']].boxplot(return_type='dict',ax=axis,notch=False,boxprops={'color':'DarkGreen'},sym='r+')
x = p['fliers'][0].get_xdata() # 'fliers' is the boxplot artist holding the outlier points
y = p['fliers'][0].get_ydata()
len(x) #37418
len(y) #37418 outliers in total
type(x) #numpy.ndarray
type(y) #numpy.ndarray
y.min() #1501.0 -- smallest outlier, i.e. the upper whisker bound
y.max() #76748.4

type(data1_train.loc[data1_train.cash<1501.0,'cash']) # Series has no boxplot(); convert with to_frame()
type(data1_train.loc[data1_train.cash<1501.0,'cash'].to_frame()) #pandas.core.frame.DataFrame
fig,axis = plt.subplots(1,1)
p1 = data1_train.loc[data1_train.cash<1501.0,'cash'].to_frame().boxplot(return_type='dict',ax=axis,notch=False,boxprops={'color':'DarkGreen'},sym='r+')


### Feature selection for 'cash', technique (1): Pearson correlation with y
import numpy as np
from scipy.stats import pearsonr
type(data1_train[['y']].T.values[0])
pearsonr(data1_train[['cash']].T.values[0], data1_train[['y']].T.values[0])  # (correlation coefficient, two-sided p-value)
'''
(-0.0013504679129656334, 0.3942351282022717) 相关性系数很小,双边检验P值>0.05,不具有线性相关性
注:相关系数是度量 数值型变量与数值型变量之间的关系,不适用于分类变量。度量 分类变量与数值型变量的关系应该使用方差分析。
'''


### Feature ranking with a learned model (single-feature random forests)
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# the same functions live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor

# Small, shallow forest: fast and enough for a relative ranking.
rf = RandomForestRegressor(n_estimators=20, max_depth=4)

# Shape / type exploration: cross_val_score needs a 2-D X, hence reshape(-1,1).
data1_train_X.shape #(397992, 40) 
data1_train_y.shape #(397992,) 
data1_train_X.head(2)
data1_train_X.columns
type(data1_train_X.loc[:,'cash'])  #pandas.core.series.Series
type(data1_train_X.loc[:,'cash'].values)  #numpy.ndarray
type(data1_train_y.values)  #numpy.ndarray
data1_train_X.loc[:,'cash'].values.ndim #1
type(data1_train_X.loc[:, 'cash'].values.reshape(-1,1)) #numpy.ndarray
data1_train_X.loc[:, 'cash'].values.reshape(-1,1).ndim #2

#test1: r2 of a model trained on 'cash' alone
score = cross_val_score(rf, data1_train_X.loc[:, 'cash'].values.reshape(-1,1), data1_train_y.values, scoring="r2", cv=3)
score #array([0.03960862, 0.03897603, 0.03949953])

#test2: rank two features by their mean single-feature r2
scores = []
for column in ['cash','RefillDeposit']:
     score = cross_val_score(rf, data1_train_X.loc[:, column].values.reshape(-1,1), data1_train_y.values, scoring="r2", cv=3)
     scores.append((round(np.mean(score), 3), column))
print(sorted(scores, reverse=True)) #[(0.04, 'cash'), (0.003, 'RefillDeposit')]


## Score every feature by training a single-feature model; first impute the
## missing values in ['ProductRejectRate','lastcallduration'].
data1_train.head(2)
data1_train.shape #(397992, 41)
data1_train.loc[:,'y'].head(3)
type(data1_train.loc[:,'y']) #pandas.core.series.Series
type(data1_train[['y']]) #pandas.core.frame.DataFrame
type(data1_train[['y']].values) #numpy.ndarray
type(data1_train['y']) #pandas.core.series.Series
type(data1_train['y'].values) #numpy.ndarray
sum(data1_train.ProductRejectRate.isnull()) # 3 missing rows
sum(data1_train.lastcallduration.isnull()) #207 missing rows

#Impute: 'ProductRejectRate' -> mean of the row's y-group; 'lastcallduration' -> 0
ProductRejectRate_mean = data1_train['ProductRejectRate'].groupby(data1_train['y']).mean()
for index in data1_train.loc[data1_train.ProductRejectRate.isnull()].index:
    data1_train.loc[index,'ProductRejectRate'] = ProductRejectRate_mean[data1_train.loc[index,'y']]

data1_train.loc[data1_train.lastcallduration.isnull(),'lastcallduration']=0

# Rank all numeric features: train a single-feature random forest per column
# and sort by mean 3-fold CV r2.
scores = []
for column in [ 'cash', 'RefillDeposit', 'PaidByDeposit', 'cashRatio', 'RefillDepositRatio', 'PaidByDepositRatio', 
               'iszero', 'ispurezero', 'cus_reject_rate', 'ProductRejectRate', 'maker_reject_rate', 'workAge', 
               'cus_history_cash', 'cus_history_RefillDeposit', 'cus_history_PaidByDeposit',
               'cus_history_cashRatio', 'cus_history_RefillDepositRatio','cus_history_PaidByDepositRatio', 
               'cus_history_zeroRatio', 'cus_history_purezeroRatio', 
               'induration', 'inthroughcnt','inaverageduration', 'inthroughrate', 'outduration', 'outthroughcnt',
               'outaverageduration', 'outthroughrate', 'callduration',
               'callthroughcnt', 'callaverageduration', 'callthroughrate', 'lastcallduration']:
    score = cross_val_score(rf,data1_train.loc[:,column].values.reshape(-1,1),data1_train.loc[:,'y'].values,scoring="r2",cv=3)
    scores.append((round(np.mean(score),3),column))
print(sorted(scores,reverse=True))
'''
[(0.051, 'maker_reject_rate'), (0.05, 'ProductRejectRate'), (0.041, 'cus_reject_rate'), (0.04, 'cash'), (0.037, 'workAge'),
 (0.034, 'cus_history_cash'), (0.033, 'outduration'), (0.033, 'cus_history_cashRatio'), (0.033, 'cashRatio'), (0.033, 'callduration'),
 (0.029, 'outthroughcnt'), (0.028, 'callthroughcnt'), (0.026, 'cus_history_PaidByDepositRatio'), (0.026, 'cus_history_PaidByDeposit'), 
 (0.025, 'iszero'), (0.023, 'cus_history_zeroRatio'), (0.018, 'cus_history_RefillDepositRatio'), (0.018, 'cus_history_RefillDeposit'), 
 (0.014, 'ispurezero'), (0.013, 'cus_history_purezeroRatio'), (0.012, 'PaidByDepositRatio'), (0.011, 'PaidByDeposit'), (0.009, 'outaverageduration'),
 (0.009, 'callaverageduration'), (0.005, 'outthroughrate'), (0.005, 'callthroughrate'), (0.004, 'inthroughcnt'), (0.003, 'inaverageduration'),
 (0.003, 'RefillDeposit'), (0.002, 'inthroughrate'), (0.002, 'induration'), (0.002, 'RefillDepositRatio'), (0.001, 'lastcallduration')]

设置阈值为0.01,即选择得分大于等于0.01的特征进行建模,共22个特征

还可使用卡方检验进行特征选择,此处略
'''
#Categorical features: 'paymentType', 'CurrentCustomerGrade', 'orderMakerCenter', 'lastcalltype', '城市类型' (5 in total);
#Columns to drop before modelling: 'Id', 'SrCustomerId', 'y'
data1_train.columns
a=['Id', 'SrCustomerId', 'paymentType', 'cash', 'RefillDeposit', 'PaidByDeposit', 'cashRatio', 'RefillDepositRatio',
       'PaidByDepositRatio', 'iszero', 'ispurezero', 'CurrentCustomerGrade', 'cus_reject_rate', 'ProductRejectRate', 'maker_reject_rate',
       'orderMakerCenter', 'workAge', 'cus_history_cash', 'cus_history_RefillDeposit', 'cus_history_PaidByDeposit',
       'cus_history_cashRatio', 'cus_history_RefillDepositRatio', 'cus_history_PaidByDepositRatio', 'cus_history_zeroRatio',
       'cus_history_purezeroRatio', 'induration', 'inthroughcnt', 'inaverageduration', 'inthroughrate', 'outduration', 'outthroughcnt',
       'outaverageduration', 'outthroughrate', 'callduration', 'callthroughcnt', 'callaverageduration', 'callthroughrate',
       'lastcallduration', 'lastcalltype', '城市类型', 'y']




# Part 3: feature preprocessing and feature selection
#3.1 Missing values: derive the treatment on data1_train_X, then apply the
#same treatment to data1_test_X.
data1_train_X.head()

data1_train_X.describe()
data1_train_X['lastcalltype'].describe() #397785 non-null
data1_train_X[['ProductRejectRate','lastcallduration']].describe() #397989  397785 
sum(data1_train_X.lastcalltype.isnull()) #207 missing rows
sum(data1_train_X.ProductRejectRate.isnull()) # 3 missing rows
sum(data1_train_X.lastcallduration.isnull()) #207 missing rows

data1_test_X.describe()
data1_test_X['lastcalltype'].describe() #170482 non-null
data1_test_X[['ProductRejectRate','lastcallduration']].describe() #170566  170482 
sum(data1_test_X.lastcalltype.isnull()) #86 missing rows
sum(data1_test_X.ProductRejectRate.isnull()) # 2 missing rows
sum(data1_test_X.lastcallduration.isnull()) #86 missing rows


'''
训练集:
'lastcalltype' 缺失207条  —— 先单独设置一个类目值no(后面有时间使用RandomForest随机森林算法,根据已有的特征值拟合一下)
'ProductRejectRate' 缺失3条  —— 缺失值处理成相应y值的平均值
'lastcallduration' 缺失207条  —— 缺失值处理成0(很可能是未匹配到订单前的最后一次通话,可能无通话?)

测试集:
'lastcalltype' 缺失86条  —— 先单独设置一个类目值no(后面有时间使用RandomForest随机森林算法,根据已有的特征值拟合一下)
'ProductRejectRate' 缺失2条  —— 缺失值处理成相应y值的平均值
'lastcallduration' 缺失86条  —— 缺失值处理成0(很可能是未匹配到订单前的最后一次通话,可能无通话?)
'''

# Recombine features + target so the y-dependent imputation below can see y.
data2_train = pd.concat([data1_train_X, data1_train_y],axis=1)
data2_test = pd.concat([data1_test_X, data1_test_y],axis=1)
data2_train.shape # (397992, 41)
data2_test.shape # (170568, 41)

#'lastcalltype' (non-numeric): give missing values their own category 'no'
#(could later be re-imputed with a RandomForest fitted on the other features)
data2_test['lastcalltype'].unique()  #array(['callin', 'callout', nan], dtype=object)
sum(data2_train['lastcalltype'].isnull()) #207
sum(data2_test['lastcalltype'].isnull()) #86

data2_train.loc[data2_train.lastcalltype.isnull(),'lastcalltype'] = 'no'
data2_test.loc[data2_test.lastcalltype.isnull(),'lastcalltype'] = 'no'

#'ProductRejectRate': impute with the mean of the sample's y-group
sum(data2_train['ProductRejectRate'].isnull()) #3
sum(data2_test['ProductRejectRate'].isnull()) #2

ProductRejectRate_mean = data2_train['ProductRejectRate'].groupby(data2_train['y']).mean()
for index in data2_train.loc[data2_train.ProductRejectRate.isnull()].index:
    data2_train.loc[index,'ProductRejectRate'] = ProductRejectRate_mean[data2_train.loc[index,'y']]

# NOTE(review): the test-set imputation below keys on the TEST labels y --
# that is label leakage and is not reproducible at prediction time; consider
# reusing the train-set group means (or an overall train mean) instead.
ProductRejectRate_mean = data2_test['ProductRejectRate'].groupby(data2_test['y']).mean()
for index in data2_test.loc[data2_test.ProductRejectRate.isnull()].index:
    data2_test.loc[index,'ProductRejectRate'] = ProductRejectRate_mean[data2_test.loc[index,'y']]
    
#'lastcallduration': impute missing values with 0 (probably no call happened)
sum(data2_train.lastcallduration.isnull()) #207
sum(data2_test.lastcallduration.isnull()) #86
 
data2_train.loc[data2_train.lastcallduration.isnull(),'lastcallduration']=0
data2_test.loc[data2_test.lastcallduration.isnull(),'lastcallduration']=0



#3.2 One-hot encode the categorical features with pandas.get_dummies()
'''
需要因子化的特征:'paymentType','CurrentCustomerGrade','orderMakerCenter','lastcalltype','城市类型'
'''
dummies_paymentType_train = pd.get_dummies(data2_train['paymentType'],prefix='paymentType')
dummies_CurrentCustomerGrade_train = pd.get_dummies(data2_train['CurrentCustomerGrade'],prefix='CurrentCustomerGrade')
dummies_orderMakerCenter_train = pd.get_dummies(data2_train['orderMakerCenter'],prefix='orderMakerCenter')
dummies_lastcalltype_train = pd.get_dummies(data2_train['lastcalltype'],prefix='lastcalltype')
dummies_citytype_train = pd.get_dummies(data2_train['城市类型'],prefix='城市类型')
data3_train = pd.concat([data2_train,dummies_paymentType_train,dummies_CurrentCustomerGrade_train,dummies_orderMakerCenter_train,
                         dummies_lastcalltype_train,dummies_citytype_train],axis=1)
data3_train.drop(['paymentType','CurrentCustomerGrade','orderMakerCenter','lastcalltype','城市类型'],axis=1,inplace=True)
data3_train.head()
data3_train.shape # (397992, 60)

dummies_paymentType_test = pd.get_dummies(data2_test['paymentType'],prefix='paymentType')
dummies_CurrentCustomerGrade_test = pd.get_dummies(data2_test['CurrentCustomerGrade'],prefix='CurrentCustomerGrade')
dummies_orderMakerCenter_test = pd.get_dummies(data2_test['orderMakerCenter'],prefix='orderMakerCenter')
dummies_lastcalltype_test = pd.get_dummies(data2_test['lastcalltype'],prefix='lastcalltype')
dummies_citytype_test = pd.get_dummies(data2_test['城市类型'],prefix='城市类型')
data3_test = pd.concat([data2_test,dummies_paymentType_test,dummies_CurrentCustomerGrade_test,dummies_orderMakerCenter_test,
                        dummies_lastcalltype_test,dummies_citytype_test],axis=1)
data3_test.drop(['paymentType','CurrentCustomerGrade','orderMakerCenter','lastcalltype','城市类型'],axis=1,inplace=True)
# FIX: get_dummies derives columns from the categories PRESENT in each split,
# so train and test can end up with different dummy columns. Align the test
# frame to the train columns, filling categories absent from the test set
# with 0, so downstream models see an identical feature layout.
data3_test = data3_test.reindex(columns=data3_train.columns, fill_value=0)
data3_test.shape # (170568, 60)



#3.3 Standardise the wide-ranging numeric features.

data4_train = data3_train.copy()
data4_test = data3_test.copy()
import sklearn.preprocessing as preprocessing

list1 = ['cash','RefillDeposit','PaidByDeposit','workAge',
 'cus_history_cash','cus_history_RefillDeposit','cus_history_PaidByDeposit',
 'induration','inthroughcnt','inaverageduration','outduration','outthroughcnt','outaverageduration',
 'callduration','callthroughcnt','callaverageduration','lastcallduration']
# BUG FIX: fit the scaler on the TRAINING data only and reuse its learned
# mean/std to transform the test data. The original called fit_transform on
# the test set too, scaling it with test-set statistics (exactly the mistake
# the reference cited at the train/test split above warns about).
for col in list1:
    scaler = preprocessing.StandardScaler()
    data4_train[col] = scaler.fit_transform(data4_train[col].values.reshape(-1,1))
    data4_test[col] = scaler.transform(data4_test[col].values.reshape(-1,1))


data4_train.head()
data4_train.shape # (397992, 60)
data4_test.head()
data4_test.shape # (170568, 60)








# Part 4: model training and tuning
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

data_train_X = data4_train.drop(['Id','SrCustomerId','y'],axis=1) # training-set features
data_train_y = data4_train['y']  # training-set target
data_test_X = data4_test.drop(['Id','SrCustomerId','y'],axis=1) # test-set features
data_test_y = data4_test['y']  # test-set target
data_train_X.shape # (397992, 57) 
data_test_X.shape # (170568, 57)
data_train_y.shape # (397992,)
data_test_y.shape # (170568,)

#Pipeline factory: add polynomial terms in front of a logistic regression.
def PolynomialLogisticRegression(degree,C,penalty):
    """Return a Pipeline: polynomial feature expansion of the given degree,
    then standardization, then a LogisticRegression(C=C, penalty=penalty)."""
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('log_reg', LogisticRegression(C=C, penalty=penalty)),
    ]
    return Pipeline(steps)

# Fit a degree-2 polynomial logistic regression with L2 regularisation.
clf = PolynomialLogisticRegression(degree=2, C=1.0, penalty='l2') 

clf.fit(data_train_X,data_train_y)

clf.score(data_train_X,data_train_y) # mean accuracy on the training set
clf.score(data_test_X,data_test_y) # mean accuracy on the test set
    
'''
clf = LogisticRegression(penalty='l2',tol=1e-6, C=1.0, solver='liblinear')
clf.fit(data_train_X, data_train_y)

(penalty='l2', dual=False, tol=0.0001, C=1.0,fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, 
                         solver='liblinear', max_iter=100, multi_class='ovr', verbose=0,warm_start=False, n_jobs=1)
'''

# Predictions for the test set data_test_X
data_test_y_predict = clf.predict(data_test_X) # predicted class labels

data_test_y_predict_log_proba = clf.predict_log_proba(data_test_X) # log-probability estimates
data_test_y_predict_proba = clf.predict_proba(data_test_X) # probability estimates

#pd.DataFrame({data4_test.SrCustomerId,data_test_X,data_test_y,data_test_y_predict,data_test_y_predict_proba,data_test_y_predict_log_proba}) #error



# Classification accuracy on the test set
clf.score(data_test_X, data_test_y)  # 0.9080835795694386





# 五、模型诊断( ROC曲线、模型系数关联分析、交叉验证、学习曲线-过拟合/欠拟合、误差分析)
#5.1 ROC曲线

#Confusion-matrix cell: TN (true negatives)
def TN(y_true, y_predict):
    """Count samples where both the truth and the prediction are 0.

    Raises ValueError on length mismatch. (FIX: the original used a bare
    `assert`, which is stripped under `python -O`.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 0) & (y_predict == 0))

TN(data_test_y, data_test_y_predict) #154297 true negatives on the test set

#Confusion-matrix cell: FP (false positives)
def FP(y_true, y_predict):
    """Count samples where the truth is 0 but the prediction is 1.

    Raises ValueError on length mismatch. (FIX: the original used a bare
    `assert`, which is stripped under `python -O`.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 0) & (y_predict == 1))

FP(data_test_y, data_test_y_predict) #615 false positives on the test set

#Confusion-matrix cell: FN (false negatives)
def FN(y_true, y_predict):
    """Count samples where the truth is 1 but the prediction is 0.

    Raises ValueError on length mismatch. (FIX: the original used a bare
    `assert`, which is stripped under `python -O`.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 1) & (y_predict == 0))

FN(data_test_y, data_test_y_predict) #15063 false negatives on the test set

#Confusion-matrix cell: TP (true positives)
def TP(y_true, y_predict):
    """Count samples where both the truth and the prediction are 1.

    Raises ValueError on length mismatch. (FIX: the original used a bare
    `assert`, which is stripped under `python -O`.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 1) & (y_predict == 1))

TP(data_test_y, data_test_y_predict) #593 true positives on the test set

# Assemble the 2x2 confusion matrix from the four helpers above
def confusion_matrix(y_true, y_predict):
    """Return np.array([[TN, FP], [FN, TP]]) for a binary problem.

    NOTE(review): this shadows sklearn.metrics.confusion_matrix if that
    name is ever imported into this module.
    """
    return np.array([
        [TN(y_true, y_predict), FP(y_true, y_predict)],
        [FN(y_true, y_predict), TP(y_true, y_predict)]
    ])

confusion_matrix(data_test_y, data_test_y_predict)
'''
array([[154297,    615],
       [ 15063,    593]], dtype=int64)
'''

# Precision = TP / (TP + FP)
def precision_score(y_true, y_predict):
    """Return the precision; 0.0 when nothing was predicted positive.

    FIX: tp and fp are numpy integers, so 0/0 yields nan with a
    RuntimeWarning instead of raising ZeroDivisionError -- the original
    bare try/except never fired (and would also have hidden real errors).
    Test the denominator explicitly instead.
    """
    tp = TP(y_true, y_predict)
    fp = FP(y_true, y_predict)
    denom = tp + fp
    if denom == 0:
        return 0.0
    return tp / denom

precision_score(data_test_y, data_test_y_predict) # 0.4908940397350993

# Recall = TP / (TP + FN)
def recall_score(y_true, y_predict):
    """Return the recall; 0.0 when there are no actual positives.

    FIX: tp and fn are numpy integers, so 0/0 yields nan with a
    RuntimeWarning instead of raising ZeroDivisionError -- the original
    bare try/except never fired. Test the denominator explicitly instead.
    """
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    denom = tp + fn
    if denom == 0:
        return 0.0
    return tp / denom

recall_score(data_test_y, data_test_y_predict)  # 0.03787685232498723


##ROC evaluation, approach 1: sweep the decision threshold by hand
##(ref: https://mp.weixin.qq.com/s/elI6-BX-AfKGuVjPPPE0rw )
#decision_function returns the per-sample confidence score the threshold is applied to
decision_scores = clf.decision_function(data_test_X)  # confidence score for each test sample
min(decision_scores) #-38.208671618208804
max(decision_scores) #3.118950170921355
#y_predict = np.array(decision_scores >= 5, dtype='int') 
# TPR (= recall)
def TPR(y_true, y_predict):
    """True-positive rate TP/(TP+FN); 0.0 when there are no actual positives.

    FIX: numpy-integer 0/0 returns nan with a RuntimeWarning rather than
    raising, so the original bare except never ran. Check the denominator.
    """
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    denom = tp + fn
    if denom == 0:
        return 0.0
    return tp / denom
# FPR
def FPR(y_true, y_predict):
    """False-positive rate FP/(FP+TN); 0.0 when there are no actual negatives.

    FIX: numpy-integer 0/0 returns nan with a RuntimeWarning rather than
    raising, so the original bare except never ran. Check the denominator.
    """
    fp = FP(y_true, y_predict)
    tn = TN(y_true, y_predict)
    denom = fp + tn
    if denom == 0:
        return 0.0
    return fp / denom
#ROC curve: sweep the threshold and collect one (FPR, TPR) pair per step
fprs = [] 
tprs = [] 
# Thresholds: 0.1-wide steps from the minimum to the maximum decision score
thresholds = np.arange(np.min(decision_scores), np.max(decision_scores), 0.1)
for threshold in thresholds:
    # decision_scores >= threshold is a boolean vector; cast to int so that
    # scores >= threshold are predicted as class 1 and the rest as class 0
    y_predict = np.array(decision_scores >= threshold, dtype=int)
    # record FPR and TPR for this threshold
    fprs.append(FPR(data_test_y, y_predict))
    tprs.append(TPR(data_test_y, y_predict))
# Plot the ROC curve: x axis = FPR, y axis = TPR
plt.plot(fprs, tprs,color='red')
plt.show()


##ROC evaluation, approach 2: sklearn.metrics.roc_curve
##(ref: https://blog.csdn.net/tttwister/article/details/81159487)
from sklearn.metrics import roc_curve
#roc_curve from predicted probabilities (only the positive-class column is used)
y_score = clf.predict_proba(data_test_X) #per sample: [P(negative), P(positive)]; roc_curve needs the positive column
print(y_score)
fpr, tpr, _ = roc_curve(data_test_y, y_score[:,1])
print("fpr:",fpr)
print("tpr:", tpr)
#roc_curve from decision-function scores (equivalent ranking)
y_score2 = clf.decision_function(data_test_X)
print(y_score2)
fpr2, tpr2, _ = roc_curve(data_test_y, y_score2)
print("fpr:",fpr2)
print("tpr:", tpr2)

# Plot the ROC curve with its AUC
from sklearn.metrics import auc
def plot_roc_curve(fpr, tpr):
    """Draw the ROC curve plus the diagonal chance line; show the AUC in the legend."""
    plt.figure()
    lw = 2
    roc_auc = auc(fpr,tpr)
    plt.plot(fpr, tpr, color='darkorange',lw=lw,label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0,1], [0,1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0, 1.0])
    plt.ylim([0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('roc')
    plt.legend(loc="lower right") 
    plt.show() 
    
plot_roc_curve(fpr, tpr) 


## ROC curve directly via sklearn:
from sklearn.metrics import roc_curve
fprs, tprs, thresholds = roc_curve(data_test_y, decision_scores)
plt.plot(fprs, tprs)
plt.show()

# AUC (area under the ROC curve)
from sklearn.metrics import roc_auc_score
roc_auc_score(data_test_y, decision_scores) # 0.7896574584167956 


#5.2 Model-coefficient analysis: pair each learned coefficient with its feature name
# NOTE(review): clf here is a Pipeline (no coef_ attribute), and after the
# degree-2 polynomial expansion the coefficient count would not match the raw
# column count anyway -- this works only with a plain LogisticRegression
# (see the commented-out block above); confirm which clf is active.
df = pd.DataFrame({"columns":list(data_test_X.columns)[:], "coef":list(clf.coef_[0].T)})
df


#5.3 Cross validation  #cross_val_score docs: https://blog.csdn.net/zs15321583801/article/details/79677035
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

clf = LogisticRegression(penalty='l2',tol=1e-6, C=1.0, solver='liblinear')

all_data_X = pd.concat([data4_train,data4_test],axis=0).drop(['Id','SrCustomerId','y'],axis=1) # features of the full sample set
all_data_X.head()
all_data_X.shape #(568560, 57)

all_data_y = pd.concat([data4_train['y'],data4_test['y']],axis=0) # target of the full sample set
all_data_y.head()
all_data_y.shape #(568560,)

# 5-fold CV accuracy of the logistic regression on the full sample set
s = cross_val_score(clf,all_data_X,all_data_y,cv=5)
print(s) # [0.90827786 0.90778539 0.90827705 0.90798603 0.90816192]


    
#5.4 Learning curves: diagnose over-/under-fitting.
#(Over-fitting -> more data / simpler model; under-fitting -> better features / more complex model.)

import matplotlib as mpl
mpl.rcParams['font.sans-serif']=['SimHei'] # default font: SimHei (renders CJK labels)
mpl.rcParams['axes.unicode_minus']=False # render minus signs correctly with that font
# FIX: sklearn.learning_curve was removed in scikit-learn 0.20; the function
# now lives in sklearn.model_selection.
from sklearn.model_selection import learning_curve

def plot_learning_curve_nxn(estimator,title,X,y,ylim=None,cv=None,n_jobs=1,
                        train_sizes=np.linspace(0.05,1.0,20),verbose=0,plot=True):
    '''
    Plot the learning curve of `estimator` on (X, y).

    Parameters
    ----------
    estimator : the classifier/regressor to evaluate
    title : figure title
    X : input feature matrix (numpy array)
    y : input target vector
    ylim : optional (ymin, ymax) tuple bounding the y-axis
    cv : number of cross-validation folds; one fold is the CV set and the
         remaining cv-1 folds are used for training (default 3)
    n_jobs : number of parallel jobs (default 1)

    Returns
    -------
    (midpoint, diff): midpoint between the final train/CV score bands and the
    gap between them -- a rough indicator of over-/under-fitting.
    '''
    train_sizes,train_scores,test_scores = learning_curve(
            estimator,X,y,cv=cv,n_jobs=n_jobs,train_sizes=train_sizes,verbose=verbose)    
    # Mean and std of the scores across folds, per training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)    
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.grid()      
        # Shaded bands: +/- one std around the mean train/CV scores.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"交叉验证集上得分")    
        plt.legend(loc="best")        
        plt.draw()
        plt.show()
    
    # Summary numbers from the last (largest) training size.
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

#Call the function above to draw the learning curve
plot_learning_curve_nxn(clf,u"学习曲线",all_data_X,all_data_y,cv=5)



### Plotting code
# Use sklearn's learning_curve to obtain training/CV scores and draw the
# learning curve with matplotlib (y-axis inverted once, as in the original plot).
def plot_learning_curve(estimator,title,X,y,ylim=None,cv=None,n_jobs=1,
                        train_sizes=np.linspace(0.05,1.0,20),verbose=0,plot=True):
    """Plot the learning curve of *estimator* on (X, y).

    Parameters
    ----------
    estimator : classifier/regressor to evaluate
    title : figure title
    X : feature matrix (numpy-compatible)
    y : target vector
    ylim : optional (ymin, ymax) tuple fixing the y-axis limits
    cv : number of cross-validation folds (sklearn's default when None)
    n_jobs : number of parallel jobs (default 1)
    train_sizes : relative training-set sizes to evaluate
    verbose : verbosity forwarded to learning_curve
    plot : when True, draw the figure

    Returns
    -------
    (midpoint, diff) : midpoint of, and gap between, the final training and
    cross-validation score bands — a rough over-/under-fitting indicator.
    """
    train_sizes,train_scores,test_scores = learning_curve(
            estimator,X,y,cv=cv,n_jobs=n_jobs,train_sizes=train_sizes,verbose=verbose)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        # Invert the y-axis once so the curve matches the original tutorial's
        # orientation.  FIX: the original also called plt.gca().invert_yaxis()
        # again AFTER plt.show(), which re-inverted (i.e. undid) the axis on
        # whatever axes gca() returned at that point; that redundant second
        # call has been removed.
        plt.gca().invert_yaxis()
        plt.grid()
        
        # Shaded bands: mean +/- one standard deviation across the CV folds.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"交叉验证集上得分")
    
        plt.legend(loc="best")
        
        plt.draw()
        plt.show()
    
    # Gap between the top of the training band and the bottom of the CV band
    # at the largest training size.
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

     
# Draw the learning curve for the trained classifier (default CV splits).
plot_learning_curve(clf,u"学习曲线",all_data_X,all_data_y)




### 6. Model ensembling (bagging)
'''
data_train_X = data4_train.drop(['Id','SrCustomerId','y'],axis=1) #样本集(训练集)的特征值
data_train_y = data4_train['y']  #样本集(训练集)的目标值 
data_test_X = data4_test.drop(['Id','SrCustomerId','y'],axis=1) #测试集的特征值
data_test_y = data4_test['y']  #测试集的目标值 
data_train_X.shape # (397992, 57) 
data_test_X.shape # (170568, 57)
data_train_y.shape # (397992,)
data_test_y.shape # (170568,)
'''
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import accuracy_score

# Base learner: L2-regularised logistic regression.
# NOTE(review): LogisticRegression is not imported in this chunk — presumably
# imported earlier in the file; verify.
clf = LogisticRegression(penalty='l2',tol=1e-6, C=1.0)

# Bag two copies of the base learner, each fit on an 80% bootstrap sample of
# the rows, using all features, in parallel (n_jobs=-1).
# NOTE(review): BaggingRegressor averages member outputs, so predictions can
# be fractional (0.5 is observed further below); for a 0/1 target,
# BaggingClassifier would be the natural choice — confirm before relying on
# accuracy_score here.
bagging_clf = BaggingRegressor(clf,n_estimators=2,max_samples=0.8,max_features=1.0,
                               bootstrap=True,bootstrap_features=False,n_jobs=-1)

bagging_clf.fit(data_train_X,data_train_y)

predictions = bagging_clf.predict(data_test_X)

# Exact-match accuracy; fractional averaged predictions never match 0/1 labels.
print(accuracy_score(data_test_y,predictions))

# Confusion-matrix component: TN (true negatives)
def TN(y_true, y_predict):
    """Count true negatives: entries where truth and prediction are both 0.

    Both arguments are array-likes supporting elementwise ``==`` (numpy
    arrays / pandas Series).  Returns the integer count.

    Raises
    ------
    ValueError : if the two inputs differ in length.  (The original used
    ``assert``, which is silently stripped under ``python -O``; an explicit
    check keeps the validation in optimised runs.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 0) & (y_predict == 0))

TN(data_test_y, predictions) # true negatives on the test set: 154226
# Confusion-matrix component: FP (false positives)
def FP(y_true, y_predict):
    """Count false positives: entries where truth is 0 but prediction is 1.

    Both arguments are array-likes supporting elementwise ``==`` (numpy
    arrays / pandas Series).  Returns the integer count.

    Raises
    ------
    ValueError : if the two inputs differ in length.  (The original used
    ``assert``, which is silently stripped under ``python -O``.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 0) & (y_predict == 1))

FP(data_test_y, predictions) # false positives on the test set: 546
# Confusion-matrix component: FN (false negatives)
def FN(y_true, y_predict):
    """Count false negatives: entries where truth is 1 but prediction is 0.

    Both arguments are array-likes supporting elementwise ``==`` (numpy
    arrays / pandas Series).  Returns the integer count.

    Raises
    ------
    ValueError : if the two inputs differ in length.  (The original used
    ``assert``, which is silently stripped under ``python -O``.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 1) & (y_predict == 0))

FN(data_test_y, predictions) # false negatives on the test set: 15010
# Confusion-matrix component: TP (true positives)
def TP(y_true, y_predict):
    """Count true positives: entries where truth and prediction are both 1.

    Both arguments are array-likes supporting elementwise ``==`` (numpy
    arrays / pandas Series).  Returns the integer count.

    Raises
    ------
    ValueError : if the two inputs differ in length.  (The original used
    ``assert``, which is silently stripped under ``python -O``.)
    """
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    return np.sum((y_true == 1) & (y_predict == 1))

TP(data_test_y, predictions) # true positives on the test set: 532
# Assemble and return the 2x2 confusion matrix.
def confusion_matrix(y_true, y_predict):
    """Return the confusion matrix [[TN, FP], [FN, TP]] as a numpy array.

    Row 0 holds the actual-negative counts, row 1 the actual-positive counts.
    """
    tn = TN(y_true, y_predict)
    fp = FP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    tp = TP(y_true, y_predict)
    return np.array([[tn, fp],
                     [fn, tp]])

# Inspect the confusion matrix on the test set.
confusion_matrix(data_test_y, predictions)
'''
array([[154226,    546],
       [ 15010,    532]], dtype=int64)
'''
# Precision
# NOTE(review): precision_score/recall_score are not imported in this chunk —
# presumably `from sklearn.metrics import ...` earlier in the file; verify.
precision_score(data_test_y, predictions) # 0.4935064935064935
# Recall
recall_score(data_test_y, predictions)  # 0.034229828850855744

# Decision threshold: decision_scores1 = bagging_clf.decision_function(data_test_X) #error:'BaggingRegressor' object has no attribute 'decision_function'

len(predictions)  #170568
len(data_test_y)  #170568
np.unique(data_test_y)  #array([0., 1.])
np.unique(predictions)  #array([0. , 0.5, 1. ])
# Count how many test samples received each (averaged) prediction value.
a = pd.DataFrame(predictions,columns=['result'])
a.columns
a['result'].groupby(a['result']).count()
'''
result
0.0    169236
0.5       254
1.0      1078
Name: result, dtype: int64
'''





### Training routine reused from the personal-loan default prediction model
def forward_select(data,response):
    """Forward stepwise selection for a binomial (logit) GLM, minimising AIC.

    Starting from an empty formula, repeatedly add the candidate column whose
    inclusion yields the lowest AIC, and stop as soon as no remaining
    candidate improves on the current score.

    Parameters
    ----------
    data : DataFrame holding the response column and all candidate features
    response : name of the response column in *data*

    Returns
    -------
    The fitted statsmodels GLM for the finally selected formula.
    """
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    candidates = set(data.columns)
    candidates.remove(response)
    selected = []
    # float('inf') guarantees the very first candidate counts as an improvement.
    current_score, best_new_score = float('inf'), float('inf')
    while candidates:
        scored = []
        for col in candidates:
            formula = "{}~{}".format(response,"+".join(selected + [col]))
            fitted = smf.glm(formula=formula,data=data,
                             family=sm.families.Binomial(sm.families.links.logit)).fit()
            scored.append((fitted.aic, col))
        # min over (aic, name) tuples == sort(reverse=True) + pop in the original.
        best_new_score, best_candidate = min(scored)
        if current_score > best_new_score:
            candidates.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print("aic is {},continuing!".format(current_score))
        else:
            print("forward selection over!")
            break
    formula = "{}~{}".format(response,"+".join(selected))
    print("final formula is {}".format(formula))
    model = smf.glm(formula=formula,data=data,
                    family=sm.families.Binomial(sm.families.links.logit)).fit()
    return model


# Build the training frame (features + response) and run forward selection.
data_for_select = pd.concat([data_train_X,data_train_y],axis=1)
lg_model = forward_select(data=data_for_select,response='y')
lg_model.summary().tables[1]

# ROC curve
import sklearn.metrics as metrics
data_test = pd.concat([data_test_X,data_test_y],axis=1)
fpr,tpr,th = metrics.roc_curve(data_test_y,lg_model.predict(data_test))
plt.figure(figsize=[6,6])
plt.plot(fpr,tpr,'b--')
plt.title("ROC curve")
plt.show()

print("AUC = %.4f"%metrics.auc(fpr,tpr)) # AUC = 0.7331

# You may also be interested in: (test) — leftover blog-scrape footer, commented out
# because as bare code it is an annotation statement that evaluates the
# undefined name `test` and raises NameError at import.