信用卡反欺诈

1、 脱敏后的数据文件

信用卡反欺诈_第1张图片信用卡反欺诈_第2张图片
最后一列Class,0为正常,1为欺诈

2、程序解读

2.1 读取文件

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.gridspec as gridspec

# Load the credit-card transactions CSV into a DataFrame.
data = pd.read_csv('../dataset/creditcard.csv')
# .iloc selects by integer position (0-based): positions 0..28 form the
# feature matrix and position 30 is the label vector.
# NOTE(review): position 29 is left out of the features; judging by the
# head() listing that is presumably the Amount column -- confirm intended.
feature_frame = data.iloc[:, :29]
label_series = data.iloc[:, 30]
x_train = np.array(feature_frame)
y_train = np.array(label_series)

2.2 查看前5行记录

# DataFrame.head(n) returns the first n rows; preview the first five.
print(data.head(5))
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9       V10       V11       V12       V13       V14  \
0  0.098698  0.363787  0.090794 -0.551600 -0.617801 -0.991390 -0.311169   
1  0.085102 -0.255425 -0.166974  1.612727  1.065235  0.489095 -0.143772   
2  0.247676 -1.514654  0.207643  0.624501  0.066084  0.717293 -0.165946   
3  0.377436 -1.387024 -0.054952 -0.226487  0.178228  0.507757 -0.287924   
4 -0.270533  0.817739  0.753074 -0.822843  0.538196  1.345852 -1.119670   

        V15       V16       V17       V18       V19       V20       V21  \
0  1.468177 -0.470401  0.207971  0.025791  0.403993  0.251412 -0.018307   
1  0.635558  0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775   
2  2.345865 -2.890083  1.109969 -0.121359 -2.261857  0.524980  0.247998   
3 -0.631418 -1.059647 -0.684093  1.965775 -1.232622 -0.208038 -0.108300   
4  0.175121 -0.451449 -0.237033 -0.038195  0.803487  0.408542 -0.009431   

        V22       V23       V24       V25       V26       V27       V28  \
0  0.277838 -0.110474  0.066928  0.128539 -0.189115  0.133558 -0.021053   
1 -0.638672  0.101288 -0.339846  0.167170  0.125895 -0.008983  0.014724   
2  0.771679  0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752   
3  0.005274 -0.190321 -1.175575  0.647376 -0.221929  0.062723  0.061458   
4  0.798278 -0.137458  0.141267 -0.206010  0.502292  0.219422  0.215153   

   Amount  Class  
0  149.62      0  
1    2.69      0  
2  378.66      0  
3  123.50      0  
4   69.99      0  

2.3 统计描述

# Per-column descriptive statistics (count, mean, std, min, quartiles, max)
# summarising central tendency, dispersion and shape; NaN values are excluded.
summary = data.describe()
print(summary)
                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.759072e-12 -8.251146e-13 -9.655448e-13  8.321385e-13   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V8            V9  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   1.649983e-13  4.248434e-13 -3.054696e-13  8.777981e-14 -1.179757e-12   
std    1.380247e+00  1.332271e+00  1.237094e+00  1.194353e+00  1.098632e+00   
min   -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01   
25%   -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01   
50%   -5.433583e-02 -2.741871e-01  4.010308e-02  2.235804e-02 -5.142873e-02   
75%    6.119264e-01  3.985649e-01  5.704361e-01  3.273459e-01  5.971390e-01   
max    3.480167e+01  7.330163e+01  1.205895e+02  2.000721e+01  1.559499e+01   

                V10           V11           V12           V13           V14  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   7.092627e-13  1.874974e-12  1.053347e-12  7.127607e-13 -1.474787e-13   
std    1.088850e+00  1.020713e+00  9.992014e-01  9.952742e-01  9.585956e-01   
min   -2.458826e+01 -4.797473e+00 -1.868371e+01 -5.791881e+00 -1.921433e+01   
25%   -5.354257e-01 -7.624942e-01 -4.055715e-01 -6.485393e-01 -4.255740e-01   
50%   -9.291738e-02 -3.275735e-02  1.400326e-01 -1.356806e-02  5.060132e-02   
75%    4.539234e-01  7.395934e-01  6.182380e-01  6.625050e-01  4.931498e-01   
max    2.374514e+01  1.201891e+01  7.848392e+00  7.126883e+00  1.052677e+01   

                V15           V16           V17           V18           V19  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean  -5.231430e-13 -2.282231e-13 -6.425412e-13  4.950748e-13  7.057401e-13   
std    9.153160e-01  8.762529e-01  8.493371e-01  8.381762e-01  8.140405e-01   
min   -4.498945e+00 -1.412985e+01 -2.516280e+01 -9.498746e+00 -7.213527e+00   
25%   -5.828843e-01 -4.680368e-01 -4.837483e-01 -4.988498e-01 -4.562989e-01   
50%    4.807155e-02  6.641332e-02 -6.567575e-02 -3.636312e-03  3.734823e-03   
75%    6.488208e-01  5.232963e-01  3.996750e-01  5.008067e-01  4.589494e-01   
max    8.877742e+00  1.731511e+01  9.253526e+00  5.041069e+00  5.591971e+00   

                V20           V21           V22           V23           V24  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   1.766109e-12 -3.405785e-13 -5.723165e-13 -9.725860e-13  1.464148e-12   
std    7.709250e-01  7.345240e-01  7.257016e-01  6.244603e-01  6.056471e-01   
min   -5.449772e+01 -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00   
25%   -2.117214e-01 -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01   
50%   -6.248109e-02 -2.945017e-02  6.781943e-03 -1.119293e-02  4.097606e-02   
75%    1.330408e-01  1.863772e-01  5.285536e-01  1.476421e-01  4.395266e-01   
max    3.942090e+01  2.720284e+01  1.050309e+01  2.252841e+01  4.584549e+00   

                V25           V26           V27           V28         Amount  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  284807.000000   
mean  -6.987110e-13 -5.617884e-13  3.332082e-12 -3.518875e-12      88.349619   
std    5.212781e-01  4.822270e-01  4.036325e-01  3.300833e-01     250.120109   
min   -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01       0.000000   
25%   -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02       5.600000   
50%    1.659350e-02 -5.213911e-02  1.342146e-03  1.124383e-02      22.000000   
75%    3.507156e-01  2.409522e-01  9.104512e-02  7.827995e-02      77.165000   
max    7.519589e+00  3.517346e+00  3.161220e+01  3.384781e+01   25691.160000   

               Class  
count  284807.000000  
mean        0.001727  
std         0.041527  
min         0.000000  
25%         0.000000  
50%         0.000000  
75%         0.000000  
max         1.000000  

2.4 空值统计

# Count the missing values in every column.
missing_per_column = data.isnull().sum()
print(missing_per_column)
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

2.5 维度统计描述

# Time dimension: summarise the Time column separately for each class.
for title, class_value in (("Fraud", 1), ("Normal", 0)):
    print(title)
    # The boolean mask keeps only rows whose Class equals class_value.
    print(data.Time[data.Class == class_value].describe())
    print()
Fraud
count       492.000000
mean      80746.806911
std       47835.365138
min         406.000000
25%       41241.500000
50%       75568.500000
75%      128483.000000
max      170348.000000
Name: Time, dtype: float64

Normal
count    284315.000000
mean      94838.202258
std       47484.015786
min           0.000000
25%       54230.000000
50%       84711.000000
75%      139333.000000
max      172792.000000
Name: Time, dtype: float64

2.6 维度分布可视化(Time、Amount)

# Two stacked histograms of transaction time sharing the x axis:
# fraud on top, normal below.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(12, 4))

bins = 50

for axis, class_value, title in ((ax1, 1, 'Fraud'), (ax2, 0, 'Normal')):
    axis.hist(data.Time[data.Class == class_value], bins=bins)
    axis.set_title(title)

# pyplot-level labels are applied to the current axes.
plt.xlabel('Time (in Seconds)')
plt.ylabel('Number of Transactions')
plt.show()


信用卡反欺诈_第3张图片代码

# Amount dimension: summarise the transaction amount for each class.
fraud_amount = data.Amount[data.Class == 1]
normal_amount = data.Amount[data.Class == 0]

print("Fraud")
print(fraud_amount.describe())
print()
print("Normal")
print(normal_amount.describe())
Fraud
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

Normal
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64
# Two stacked histograms of transaction amount sharing the x axis:
# fraud on top, normal below.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(12, 4))

bins = 30

for axis, class_value, title in ((ax1, 1, 'Fraud'), (ax2, 0, 'Normal')):
    axis.hist(data.Amount[data.Class == class_value], bins=bins)
    axis.set_title(title)

# NOTE(review): these pyplot-level calls act on the current axes only
# (presumably the lower panel, the last one created), so the log y-scale
# is not applied to both panels -- confirm that is intended.
plt.xlabel('Amount ($)')
plt.ylabel('Number of Transactions')
plt.yscale('log')
plt.show()

信用卡反欺诈_第4张图片

# Indicator column: 0 when Amount is at most 2125.87 (the largest fraud
# amount in the Amount summary), 1 otherwise.
data['Amount_max_fraud'] = (~(data.Amount <= 2125.87)).astype('int64')

# Amount against time, one scatter panel per class on a shared x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(12, 6))

for axis, class_value, title in ((ax1, 1, 'Fraud'), (ax2, 0, 'Normal')):
    selected = data.Class == class_value
    axis.scatter(data.Time[selected], data.Amount[selected])
    axis.set_title(title)

plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()

信用卡反欺诈_第5张图片

# Analyse the anonymized features: take the column labels at positions 1..28.
v_features = data.columns[1:29]

# One tall figure with a single-column grid of 28 rows, one per feature.
plt.figure(figsize=(12, 28 * 4))
gs = gridspec.GridSpec(28, 1)
is_fraud = data.Class == 1
is_normal = data.Class == 0
for i, cn in enumerate(v_features):
    ax = plt.subplot(gs[i])
    # Overlay both classes so the difference in shape is visible.
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # consider histplot/kdeplot when upgrading.
    sns.distplot(data.loc[is_fraud, cn], bins=50)
    sns.distplot(data.loc[is_normal, cn], bins=50)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()

信用卡反欺诈_第6张图片

# Remove the features whose distributions are very similar for the two
# types of transactions.
similar_features = ['V28', 'V27', 'V26', 'V25', 'V24', 'V23', 'V22', 'V20', 'V15', 'V13', 'V8']
data = data.drop(similar_features, axis=1)

# Based on the plots above, each rule marks the value range where fraudulent
# transactions are more common: (source column, comparison, cutoff).
threshold_rules = [
    ('V1', '<', -3),
    ('V2', '>', 2.5),
    ('V3', '<', -4),
    ('V4', '>', 2.5),
    ('V5', '<', -4.5),
    ('V6', '<', -2.5),
    ('V7', '<', -3),
    ('V9', '<', -2),
    ('V10', '<', -2.5),
    ('V11', '>', 2),
    ('V12', '<', -2),
    ('V14', '<', -2.5),
    ('V16', '<', -2),
    ('V17', '<', -2),
    ('V18', '<', -2),
    ('V19', '>', 1.5),
    ('V21', '>', 0.6),
]
for column, comparison, cutoff in threshold_rules:
    values = data[column]
    flagged = values < cutoff if comparison == '<' else values > cutoff
    # The new indicator column is named after its source with a trailing '_'.
    data[column + '_'] = flagged.astype('int64')

print('每个单一属性的欺诈记录与整车记录的差异统计:')
print(data.describe())
print(data.sum())
每个单一属性的欺诈记录与整车记录的差异统计:
                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.759072e-12 -8.251146e-13 -9.655448e-13  8.321385e-13   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V9           V10  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   1.649983e-13  4.248434e-13 -3.054696e-13 -1.179757e-12  7.092627e-13   
std    1.380247e+00  1.332271e+00  1.237094e+00  1.098632e+00  1.088850e+00   
min   -1.137433e+02 -2.616051e+01 -4.355724e+01 -1.343407e+01 -2.458826e+01   
25%   -6.915971e-01 -7.682956e-01 -5.540759e-01 -6.430976e-01 -5.354257e-01   
50%   -5.433583e-02 -2.741871e-01  4.010308e-02 -5.142873e-02 -9.291738e-02   
75%    6.119264e-01  3.985649e-01  5.704361e-01  5.971390e-01  4.539234e-01   
max    3.480167e+01  7.330163e+01  1.205895e+02  1.559499e+01  2.374514e+01   

                V11           V12           V14           V16           V17  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   1.874974e-12  1.053347e-12 -1.474787e-13 -2.282231e-13 -6.425412e-13   
std    1.020713e+00  9.992014e-01  9.585956e-01  8.762529e-01  8.493371e-01   
min   -4.797473e+00 -1.868371e+01 -1.921433e+01 -1.412985e+01 -2.516280e+01   
25%   -7.624942e-01 -4.055715e-01 -4.255740e-01 -4.680368e-01 -4.837483e-01   
50%   -3.275735e-02  1.400326e-01  5.060132e-02  6.641332e-02 -6.567575e-02   
75%    7.395934e-01  6.182380e-01  4.931498e-01  5.232963e-01  3.996750e-01   
max    1.201891e+01  7.848392e+00  1.052677e+01  1.731511e+01  9.253526e+00   

                V18           V19           V21         Amount          Class  \
count  2.848070e+05  2.848070e+05  2.848070e+05  284807.000000  284807.000000   
mean   4.950748e-13  7.057401e-13 -3.405785e-13      88.349619       0.001727   
std    8.381762e-01  8.140405e-01  7.345240e-01     250.120109       0.041527   
min   -9.498746e+00 -7.213527e+00 -3.483038e+01       0.000000       0.000000   
25%   -4.988498e-01 -4.562989e-01 -2.283949e-01       5.600000       0.000000   
50%   -3.636312e-03  3.734823e-03 -2.945017e-02      22.000000       0.000000   
75%    5.008067e-01  4.589494e-01  1.863772e-01      77.165000       0.000000   
max    5.041069e+00  5.591971e+00  2.720284e+01   25691.160000       1.000000   

       Amount_max_fraud            V1_            V2_            V3_  \
count     284807.000000  284807.000000  284807.000000  284807.000000   
mean           0.002117       0.047042       0.024771       0.009838   
std            0.045965       0.211730       0.155427       0.098699   
min            0.000000       0.000000       0.000000       0.000000   
25%            0.000000       0.000000       0.000000       0.000000   
50%            0.000000       0.000000       0.000000       0.000000   
75%            0.000000       0.000000       0.000000       0.000000   
max            1.000000       1.000000       1.000000       1.000000   

                 V4_            V5_            V6_            V7_  \
count  284807.000000  284807.000000  284807.000000  284807.000000   
mean        0.052794       0.004579       0.006274       0.010059   
std         0.223622       0.067510       0.078963       0.099791   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

                 V9_           V10_           V11_           V12_  \
count  284807.000000  284807.000000  284807.000000  284807.000000   
mean        0.031530       0.005049       0.018244       0.048408   
std         0.174746       0.070877       0.133833       0.214628   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

                V14_           V16_           V17_           V18_  \
count  284807.000000  284807.000000  284807.000000  284807.000000   
mean        0.013697       0.021165       0.002173       0.013943   
std         0.116230       0.143935       0.046569       0.117254   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

                V19_           V21_  
count  284807.000000  284807.000000  
mean        0.032952       0.041958  
std         0.178512       0.200494  
min         0.000000       0.000000  
25%         0.000000       0.000000  
50%         0.000000       0.000000  
75%         0.000000       0.000000  
max         1.000000       1.000000  
Time                2.700365e+10
V1                  5.009022e-07
V2                 -2.350312e-07
V3                 -2.744665e-07
V4                  2.368500e-07
V5                  4.533991e-08
V6                  1.209676e-07
V7                 -8.687127e-08
V9                 -3.359903e-07
V10                 2.020664e-07
V11                 5.340173e-07
V12                 3.000407e-07
V14                -4.247506e-08
V16                -6.495627e-08
V17                -1.830887e-07
V18                 1.412354e-07
V19                 2.010940e-07
V21                -9.702072e-08
Amount              2.516259e+07
Class               4.920000e+02
Amount_max_fraud    6.030000e+02
V1_                 1.339800e+04
V2_                 7.055000e+03
V3_                 2.802000e+03
V4_                 1.503600e+04
V5_                 1.304000e+03
V6_                 1.787000e+03
V7_                 2.865000e+03
V9_                 8.980000e+03
V10_                1.438000e+03
V11_                5.196000e+03
V12_                1.378700e+04
V14_                3.901000e+03
V16_                6.028000e+03
V17_                6.190000e+02
V18_                3.971000e+03
V19_                9.385000e+03
V21_                1.195000e+04
dtype: float64
# Add a 'Normal' column that mirrors Class: 1.0 for Class 0, 0.0 for Class 1
# (any other Class value would stay NaN).
data['Normal'] = data.Class.map({0: 1.0, 1: 0.0})

# The label column 'Class' is renamed to 'Fraud'.
data = data.rename(columns={'Class': 'Fraud'})

# Show how many rows fall into each class under both target columns.
print('欺诈记录的占比:')
normal_counts = data['Normal'].value_counts()
fraud_counts = data['Fraud'].value_counts()
print(normal_counts)
print()
print(fraud_counts)

# Raise the column display limit so head() prints every column.
pd.set_option("display.max_columns", 101)
print(data.head())
欺诈记录的占比:
1.0    284315
0.0       492
Name: Normal, dtype: int64

0    284315
1       492
Name: Fraud, dtype: int64
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V9       V10       V11       V12       V14       V16       V17  \
0  0.363787  0.090794 -0.551600 -0.617801 -0.311169 -0.470401  0.207971   
1 -0.255425 -0.166974  1.612727  1.065235 -0.143772  0.463917 -0.114805   
2 -1.514654  0.207643  0.624501  0.066084 -0.165946 -2.890083  1.109969   
3 -1.387024 -0.054952 -0.226487  0.178228 -0.287924 -1.059647 -0.684093   
4  0.817739  0.753074 -0.822843  0.538196 -1.119670 -0.451449 -0.237033   

        V18       V19       V21  Amount  Fraud  Amount_max_fraud  V1_  V2_  \
0  0.025791  0.403993 -0.018307  149.62      0                 0    0    0   
1 -0.183361 -0.145783 -0.225775    2.69      0                 0    0    0   
2 -0.121359 -2.261857  0.247998  378.66      0                 0    0    0   
3  1.965775 -1.232622 -0.108300  123.50      0                 0    0    0   
4 -0.038195  0.803487 -0.009431   69.99      0                 0    0    0   

   V3_  V4_  V5_  V6_  V7_  V9_  V10_  V11_  V12_  V14_  V16_  V17_  V18_  \
0    0    0    0    0    0    0     0     0     0     0     0     0     0   
1    0    0    0    0    0    0     0     0     0     0     0     0     0   
2    0    0    0    0    0    0     0     0     0     0     1     0     0   
3    0    0    0    0    0    0     0     0     0     0     0     0     0   
4    0    0    0    0    0    0     0     0     0     0     0     0     0   

   V19_  V21_  Normal  
0     0     0     1.0  
1     0     0     1.0  
2     0     0     1.0  
3     0     0     1.0  
4     0     0     1.0 
# Separate frames holding only the fraudulent and only the normal rows.
Fraud = data.loc[data.Fraud == 1]
Normal = data.loc[data.Normal == 1]

# Training set: a random 80% of the fraud rows plus a random 80% of the
# normal rows. The fraud sample size is kept for the class weighting.
fraud_sample = Fraud.sample(frac=0.8)
count_Frauds = len(fraud_sample)
normal_sample = Normal.sample(frac=0.8)
X_train = pd.concat([fraud_sample, normal_sample], axis=0)

# Test set: every row whose index is not in the training set.
in_train = data.index.isin(X_train.index)
X_test = data.loc[~in_train]

# Shuffle both frames so rows are presented in random order.
X_train = shuffle(X_train)
X_test = shuffle(X_test)

# Targets: the two label columns, copied out before they are dropped below.
target_columns = ['Fraud', 'Normal']
y_train = X_train[target_columns].copy()
y_test = X_test[target_columns].copy()

# Features: everything except the two label columns.
X_train = X_train.drop(target_columns, axis=1)
X_test = X_test.drop(target_columns, axis=1)

# Sanity check: each feature frame must be as long as its target frame.
print()
print('切割[学习、校验]处理后的记录数量:')
for part in (X_train, y_train, X_test, y_test):
    print(len(part))
切割[学习、校验]处理后的记录数量:
227846
227846
56961
56961
# Because the data is imbalanced, `ratio` is used as a weight on the fraud
# label. It is the number of training rows divided by the number of
# fraudulent training rows, so: (# of fraud rows) * ratio = (# of training rows).
ratio = len(X_train) / count_Frauds
print()
print('数据的占比:', ratio)

# Scale the Fraud target in both splits by the same weight.
y_train['Fraud'] = y_train['Fraud'] * ratio
y_test['Fraud'] = y_test['Fraud'] * ratio
print('训练数据的数量:\n', y_train.Fraud)
# Fixed: the test label previously printed y_train.Fraud a second time.
print('测试数据的数量:\n', y_test.Fraud)
数据的占比: 578.2893401015228
训练数据的数量:
 22023     0.0
185560    0.0
112703    0.0
165996    0.0
245243    0.0
238885    0.0
39966     0.0
112043    0.0
171013    0.0
255567    0.0
283619    0.0
203942    0.0
82908     0.0
245906    0.0
225464    0.0
13679     0.0
107609    0.0
140858    0.0
156028    0.0
158914    0.0
72341     0.0
208184    0.0
111027    0.0
217998    0.0
229747    0.0
281186    0.0
259994    0.0
112170    0.0
204651    0.0
184758    0.0

118430    0.0
15155     0.0
28982     0.0
193685    0.0
209645    0.0
201038    0.0
226108    0.0
219122    0.0
266437    0.0
45419     0.0
99879     0.0
167812    0.0
117954    0.0
20935     0.0
238062    0.0
13355     0.0
71356     0.0
54123     0.0
95958     0.0
280240    0.0
271372    0.0
259493    0.0
149400    0.0
231110    0.0
30784     0.0
186483    0.0
74528     0.0
187912    0.0
17719     0.0
42839     0.0
Name: Fraud, Length: 227846, dtype: float64
测试数据的数量:
 22023     0.0
185560    0.0
112703    0.0
165996    0.0
245243    0.0
238885    0.0
39966     0.0
112043    0.0
171013    0.0
255567    0.0
283619    0.0
203942    0.0
82908     0.0
245906    0.0
225464    0.0
13679     0.0
107609    0.0
140858    0.0
156028    0.0
158914    0.0
72341     0.0
208184    0.0
111027    0.0
217998    0.0
229747    0.0
281186    0.0
259994    0.0
112170    0.0
204651    0.0
184758    0.0

118430    0.0
15155     0.0
28982     0.0
193685    0.0
209645    0.0
201038    0.0
226108    0.0
219122    0.0
266437    0.0
45419     0.0
99879     0.0
167812    0.0
117954    0.0
20935     0.0
238062    0.0
13355     0.0
71356     0.0
54123     0.0
95958     0.0
280240    0.0
271372    0.0
259493    0.0
149400    0.0
231110    0.0
30784     0.0
186483    0.0
74528     0.0
187912    0.0
17719     0.0
42839     0.0

你可能感兴趣的:(机器学习)