Santander Customer Transaction Prediction(2)

https://www.kaggle.com/c/santander-customer-transaction-prediction/leaderboard

import pandas as pd
import matplotlib.pyplot as plt
# Initialize a SparkSession
def spark_init(master = "yarn",appName="test"):  
    from pyspark.sql import SparkSession 
    from pyspark.sql import types  
    spark = SparkSession.builder.master(master).appName(appName).getOrCreate() 
    return spark
# Load a single data source into a DataFrame
def get_input_data(path, header=True, inferSchema=True, schema=None, sep=',',encoding='UTF-8'): 
    df = spark.read.csv(path, header=header, inferSchema=inferSchema, schema=schema,sep=sep,encoding=encoding)
    return df
# Persist the processed data to HDFS
def output_data(df, location, size=1, format="csv", mode="overwrite", header=False, delimiter=','): 
    df.repartition(size).write.save(location, format=format, header=header, delimiter=delimiter,
                                    mode=mode)
    return location
# Data source description
# Data exploration
# Descriptive statistics of the data
def data_describe(df):
    import json
    des = df.describe().toPandas()
    return json.dumps(des.to_dict(), ensure_ascii=False)
# Correlation between two variables
def data_single_corr(df, col1, col2):
    return df.corr(col1, col2)
# Correlation among all numeric variables
def data_all_corr(df):
    import json
    import pandas as pd
    numerical = [t[0] for t in df.dtypes if t[1] in ('int', 'double')]
    n_numerical = len(numerical)
    corr = []
    for i in range(0, n_numerical):
        temp = [None] * i
        for j in range(i, n_numerical):
            temp.append(df.corr(numerical[i], numerical[j]))
        corr.append(temp)  
    df01 = pd.DataFrame(corr,columns=numerical) 
    return json.dumps(df01.to_dict(), ensure_ascii=False)
# Preprocessing: sampling
def sample(df,withReplacement=False, fraction=0.5, seed=None):
    sample1 = df.sample(
    withReplacement=withReplacement,  # False = sampling without replacement
    fraction=fraction,
    seed=seed)  
    return sample1
# Column selection
def field_select(df, select_fields):
    """
    Select only the given columns.
    :return: DataFrame restricted to select_fields
    """
    df = df.select(*select_fields)
    return df
# Missing-value imputation (only mean fill provided for now)
def fill_mean(df, col): 
    from pyspark.sql.functions import mean
    mean_val = df.select(mean(df[col])).collect()
    mean_value = mean_val[0][0]  # extract the scalar mean
    return df.na.fill(mean_value, [col])
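# Note: Spark ML (>= 2.2) also ships an Imputer estimator that can mean-fill several numeric
# columns at once; a minimal sketch (not part of the original helpers), assuming a DataFrame df
# and illustrative "_filled" output column names:
def fill_mean_imputer(df, cols):
    from pyspark.ml.feature import Imputer
    imputer = Imputer(strategy="mean", inputCols=cols, outputCols=[c + "_filled" for c in cols])
    return imputer.fit(df).transform(df)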

# One-hot encode string categorical features
def deal_categoricalCol(categoricalColumns): 
    from pyspark.ml.feature import StringIndexer, OneHotEncoder
    stage = []
    new_cols = []
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
        # encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
        outputCol = categoricalCol + "classVec"
        encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=outputCol)
        stage += [stringIndexer, encoder]
        new_cols.append(outputCol)
    return stage, new_cols

# fit pipeline
def fit_pipeline(train, stages): 
    from pyspark.ml import Pipeline
    pipeline = Pipeline(stages=stages)
    # fit the pipeline on the training data
    pipelineModel = pipeline.fit(train)  
    return pipelineModel

# Save the fitted model
def save_model(pipelineModel, path):
    """
    :type pipelineModel: PipelineModel
    :param pipelineModel: fitted pipeline model to persist
    :return: the save path
    """
    pipelineModel.save(path)
    return path

# Model prediction: load a saved PipelineModel, score the test data and persist the result
def model_pre(test, input_path, output_path, size=1, format="csv", mode="overwrite", header=False, delimiter=','):
    """
    :param input_path: path of the saved PipelineModel
    :param output_path: where to write the predictions
    """
    from pyspark.ml import PipelineModel
    pipeModel = PipelineModel.load(input_path)
    predictions = pipeModel.transform(test)
    predictions.repartition(size).write.save(output_path, format=format, header=header, delimiter=delimiter,
                                    mode=mode)
    predictions.show()
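# Rough usage sketch of how the helpers above compose (paths and parameters are placeholders,
# not from the original run):
#     spark = spark_init(master="local[*]", appName="demo")
#     df = get_input_data("hdfs:///path/to/train.csv")      # read the raw data
#     print(data_describe(df))                              # JSON summary statistics
#     df_small = sample(df, fraction=0.1, seed=42)          # 10% sample for quick iteration
#     output_data(df_small, "hdfs:///path/to/sample_out")   # persist the result to HDFS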
    

In [2]:

spark = spark_init(master='local[*]',appName='sant_demo01')
base_path =  "F:\\001experience\\MatchSummary\\resources\\sant\\" 
spark

Out[2]:

SparkSession - in-memory
Version: v2.2.0 | Master: local[*] | AppName: sant_demo01

Data exploration

  1. Check the data distribution (a class-balance sketch follows this list)
  2. Distinguish continuous and discrete variables
  3. Correlation analysis
  4. Hypothesis testing
  5. Visualization
  6. Statistical analysis
  7. Problem understanding
  8. Root-cause analysis of anomalous data
  9. Root-cause analysis of missing data
  10. Analysis of erroneous data
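
A minimal sketch for step 1, checking how the binary target is distributed. This check is not part of the original notebook; df_train is the training DataFrame loaded in In [3] below.

from pyspark.sql import functions as F
counts = df_train.groupBy('target').count()
total = df_train.count()
counts.withColumn('ratio', F.col('count') / total).show()
# The describe output below shows the mean of target is ~0.1005, i.e. roughly 10% positives.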

At Santander, our mission is to help people and businesses prosper. We are always looking for ways to help our customers understand their financial health and identify which products and services might help them achieve their monetary goals.

Our data science team is continually challenging our machine learning algorithms, working with the global data science community to make sure we can more accurately identify new ways to solve our most common challenge: binary classification problems such as, is a customer satisfied? Will a customer buy this product? Can a customer pay this loan?

In this challenge, we invite Kagglers to help us identify which customers will make a specific transaction in the future, irrespective of the amount of money transacted. The data provided for this competition has the same structure as the real data available to solve this problem.

In [3]:

df_train = get_input_data(base_path + 'train.csv')  
pd.DataFrame(df_train.take(10), columns=df_train.columns).transpose() 

Out[3]:

  0 1 2 3 4 5 6 7 8 9
ID_code train_0 train_1 train_2 train_3 train_4 train_5 train_6 train_7 train_8 train_9
target 0 0 0 0 0 0 0 0 0 0
var_0 8.9255 11.5006 8.6093 11.0604 9.8369 11.4763 11.8091 13.558 16.1071 12.5088
var_1 -6.7863 -4.1473 -2.7457 -2.1518 -1.4834 -2.3182 -0.0832 -7.9881 2.4426 1.9743
var_2 11.9081 13.8588 12.0805 8.9522 12.8746 12.608 9.3494 13.8776 13.9307 8.896
var_3 5.093 5.389 7.8928 7.1957 6.6375 8.6264 4.2916 7.5985 5.6327 5.4508
var_4 11.4607 12.3622 10.5825 12.5846 12.2772 10.9621 11.1355 8.6543 8.8014 13.6043
var_5 -9.2834 7.0433 -9.0837 -1.8361 2.4486 3.5609 -8.0198 0.831 6.163 -16.2859
var_6 5.1187 5.6208 6.9427 5.8428 5.9405 4.5322 6.1961 5.689 4.4514 6.0637
var_7 18.6266 16.5338 14.6155 14.925 19.2514 15.2255 12.0771 22.3262 10.1854 16.841
var_8 -4.92 3.1468 -4.9193 -5.8609 6.2654 3.5855 -4.3781 5.0647 -3.1882 0.1287
var_9 5.747 8.0851 5.9525 8.245 7.6784 5.979 7.9232 7.1971 9.0827 7.9682
var_10 2.9252 -0.4032 -0.3249 2.3061 -9.4458 0.801 -5.1288 1.4532 0.9501 0.8787
var_11 3.1821 8.0585 -11.2648 2.8102 -12.1419 -0.6192 -7.5271 -6.7033 1.7982 3.0537
var_12 14.0137 14.0239 14.1929 13.8463 13.8481 13.638 14.1629 14.2919 14.0654 13.9639
var_13 0.5745 8.4135 7.3124 11.9704 7.8895 1.2589 13.3058 10.9699 -3.0572 0.8071
var_14 8.7989 5.4345 7.5244 6.4569 7.7894 8.1939 7.8412 6.919 11.1642 9.924
var_15 14.5691 13.7003 14.6472 14.8372 15.0553 14.9894 14.3363 14.2459 14.8757 15.2659
var_16 5.7487 13.8275 7.6782 10.743 8.4871 12.0763 7.5951 9.5376 10.0075 11.39
var_17 -7.2393 -15.5849 -1.7395 -0.4299 -3.068 -1.471 11.0922 -0.7226 -8.9472 1.5367
var_18 4.284 7.8 4.7011 15.9426 6.5263 6.7341 21.1976 5.1548 3.8349 5.4649
var_19 30.7133 28.5708 20.4775 13.7257 11.3152 14.8241 6.2946 17.1535 0.856 13.6196
var_20 10.535 3.4287 17.7559 20.301 21.4246 19.7172 15.8877 13.7326 10.6958 23.7806
var_21 16.2191 2.7407 18.1377 12.5579 18.9608 11.9882 24.2595 14.4195 6.3738 4.4221
var_22 2.5791 8.5524 1.2145 6.8202 10.1102 1.0468 8.1159 1.2375 6.558 6.1695
var_23 2.4716 3.3716 3.5137 2.7229 2.7142 3.8663 3.9769 3.1711 2.6182 3.2978
var_24 14.3831 6.9779 5.6777 12.1354 14.208 4.7252 7.6851 9.1258 13.2506 4.5923
var_25 13.4325 13.891 13.2177 13.7367 13.5433 13.9427 13.36 13.325 13.7929 13.3778
var_26 -5.1488 -11.7684 -7.994 0.8135 3.1736 -1.2796 -0.5156 3.3883 -14.4918 -3.22
var_27 -0.4073 -2.5586 -2.9029 -0.9059 -3.3423 -4.3763 0.069 -0.4418 -2.5407 -2.3302
... ... ... ... ... ... ... ... ... ... ...
var_170 -4.7645 5.5378 -7.0927 -7.1541 1.4493 -6.1449 0.2619 8.9519 3.1838 -3.0868
var_171 -8.4254 5.0988 -3.9116 -6.192 -2.6627 -2.0285 -1.1405 -2.3522 -1.7865 -1.2558
var_172 20.8773 22.033 7.2569 18.2366 19.8056 18.4106 25.1675 6.1335 4.9105 24.2683
var_173 3.1531 5.5134 -5.8234 11.7134 2.3705 1.4457 2.6965 0.0876 3.5803 -4.5382
var_174 18.5618 30.2645 25.682 14.7483 18.4685 21.8853 17.0152 19.5642 32.9149 18.2209
var_175 7.7423 10.4968 10.9202 8.1013 16.3309 9.2654 12.7942 13.2008 13.0201 7.5652
var_176 -10.1245 -7.2352 -0.3104 11.8771 -3.3456 -6.5247 -3.0403 -11.1786 -2.4845 6.3377
var_177 13.7241 16.5721 8.8438 13.9552 13.5261 10.7687 8.1735 17.3041 11.0988 14.6223
var_178 -3.5189 -7.3477 -9.7009 -10.4701 1.7189 -7.6283 4.5637 -0.6535 7.4609 -13.896
var_179 1.7202 11.0752 2.4013 5.6961 5.1743 1.0208 3.8973 0.0592 -2.1408 2.391
var_180 -8.4051 -5.5937 -4.2935 -3.7546 -7.6938 7.1968 -8.1416 5.114 -3.9172 2.7878
var_181 9.0164 9.4878 9.3908 8.4117 9.7685 11.1227 10.057 10.5478 7.7291 11.3457
var_182 3.0657 -14.91 -13.2648 1.8986 4.891 2.2257 15.7862 6.9736 -11.4027 -9.6774
var_183 14.3691 9.4245 3.1545 7.2601 12.2198 6.4056 3.3593 6.9724 2.0696 10.3382
var_184 25.8398 22.5441 23.0866 -0.4639 11.8503 21.055 11.914 24.0369 -1.7937 19.0645
var_185 5.8764 -4.8622 -5.3 -0.0498 -7.8931 -13.6509 -4.287 -4.822 -0.003 -7.6785
var_186 11.8411 7.6543 5.3745 7.9336 6.4209 4.7691 7.5015 8.4947 11.5024 6.758
var_187 -19.7159 -15.9319 -6.266 -12.8279 5.927 -8.9114 -29.9763 -5.9076 -18.3172 -21.607
var_188 17.5743 13.3175 10.1934 12.4124 16.0201 15.1007 17.2867 18.8663 13.1403 20.8112
var_189 0.5857 -0.3566 -0.8417 1.8489 -0.2829 2.4286 1.8539 1.9731 0.7014 -0.1873
var_190 4.4354 7.6421 2.9057 4.4666 -1.4905 -6.3068 8.783 13.17 1.4298 0.5543
var_191 3.9642 7.7214 9.7905 4.7433 9.5214 6.6025 6.4521 6.5491 14.751 6.316
var_192 3.1364 2.5837 1.6704 0.7178 -0.1508 5.2912 3.5325 3.9906 1.6395 1.0371
var_193 1.691 10.9516 1.6858 1.4214 9.1942 0.4403 0.1777 5.8061 1.4181 3.6885
var_194 18.5227 15.4305 21.6042 23.0347 13.2876 14.9452 18.3314 23.1407 14.837 14.8344
var_195 -2.3978 2.0339 3.1417 -1.2706 -1.5121 1.0314 0.5845 -0.3776 -1.994 0.4467
var_196 7.8784 8.1267 -6.5213 -2.9275 3.9267 -3.6241 9.1104 4.2178 -1.0733 14.1287
var_197 8.5635 8.7889 8.2675 10.2922 9.5031 9.767 9.1143 9.4237 8.1975 7.9133
var_198 12.7803 18.356 14.7222 17.9697 17.9974 12.5809 10.8869 8.6624 19.5114 16.2375
var_199 -1.0914 1.9518 0.3965 -8.9996 -8.8104 -4.7602 -3.2097 3.4806 4.8453 14.2514

202 rows × 10 columns

In [20]:

df_des = df_train.toPandas().describe(include = 'all')
# df_des.transpose()
df_des

Out[20]:

  ID_code target var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 ... var_190 var_191 var_192 var_193 var_194 var_195 var_196 var_197 var_198 var_199
count 200000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 ... 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000 200000.000000
unique 200000 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
top train_46809 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
freq 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
mean NaN 0.100490 10.679914 -1.627622 10.715192 6.796529 11.078333 -5.065317 5.408949 16.545850 ... 3.234440 7.438408 1.927839 3.331774 17.993784 -0.142088 2.303335 8.908158 15.870720 -3.326537
std NaN 0.300653 3.040051 4.050044 2.640894 2.043319 1.623150 7.863267 0.866607 3.418076 ... 4.559922 3.023272 1.478423 3.992030 3.135162 1.429372 5.454369 0.921625 3.010945 10.438015
min NaN 0.000000 0.408400 -15.043400 2.117100 -0.040200 5.074800 -32.562600 2.347300 5.349700 ... -14.093300 -2.691700 -3.814500 -11.783400 8.694400 -5.261000 -14.209600 5.960600 6.299300 -38.852800
25% NaN 0.000000 8.453850 -4.740025 8.722475 5.254075 9.883175 -11.200350 4.767700 13.943800 ... -0.058825 5.157400 0.889775 0.584600 15.629800 -1.170700 -1.946925 8.252800 13.829700 -11.208475
50% NaN 0.000000 10.524750 -1.608050 10.580000 6.825000 11.108250 -4.833150 5.385100 16.456800 ... 3.203600 7.347750 1.901300 3.396350 17.957950 -0.172700 2.408900 8.888200 15.934050 -2.819550
75% NaN 0.000000 12.758200 1.358625 12.516700 8.324100 12.261125 0.924800 6.003000 19.102900 ... 6.406200 9.512525 2.949500 6.205800 20.396525 0.829600 6.556725 9.593300 18.064725 4.836800
max NaN 1.000000 20.315000 10.376800 19.353000 13.188300 16.671400 17.251600 8.447700 27.691800 ... 18.440900 16.716500 8.402400 18.281800 27.928800 4.272900 18.321500 12.000400 26.079100 28.500700

11 rows × 202 columns

In [5]:

# df_des = df_des.transpose()
df_des = pd.read_csv("./describe.csv",index_col=False) 
df_des['range'] = df_des['max']-df_des['min']  # range (max - min)
df_des['var'] = df_des['std']/df_des['mean']   # coefficient of variation (std / mean)
df_des['dis'] = df_des['75%']-df_des['25%']    # interquartile range (75% - 25%)
df_des.to_csv("./describe.csv") 
df_des

Out[5]:

  Unnamed: 0 Unnamed: 0.1 IDcode count mean std min 25% 50% 75% max range var dis
0 0 0 target 200000.0 0.100490 0.300653 0.0000 0.000000 0.00000 0.000000 1.0000 1.0000 2.991870 0.000000
1 1 1 var_0 200000.0 10.679914 3.040051 0.4084 8.453850 10.52475 12.758200 20.3150 19.9066 0.284651 4.304350
2 2 2 var_1 200000.0 -1.627622 4.050044 -15.0434 -4.740025 -1.60805 1.358625 10.3768 25.4202 -2.488320 6.098650
3 3 3 var_2 200000.0 10.715192 2.640894 2.1171 8.722475 10.58000 12.516700 19.3530 17.2359 0.246463 3.794225
4 4 4 var_3 200000.0 6.796529 2.043319 -0.0402 5.254075 6.82500 8.324100 13.1883 13.2285 0.300642 3.070025
5 5 5 var_4 200000.0 11.078333 1.623150 5.0748 9.883175 11.10825 12.261125 16.6714 11.5966 0.146516 2.377950
6 6 6 var_5 200000.0 -5.065317 7.863267 -32.5626 -11.200350 -4.83315 0.924800 17.2516 49.8142 -1.552374 12.125150
7 7 7 var_6 200000.0 5.408949 0.866607 2.3473 4.767700 5.38510 6.003000 8.4477 6.1004 0.160217 1.235300
8 8 8 var_7 200000.0 16.545850 3.418076 5.3497 13.943800 16.45680 19.102900 27.6918 22.3421 0.206582 5.159100
9 9 9 var_8 200000.0 0.284162 3.332634 -10.5055 -2.317800 0.39370 2.937900 10.1513 20.6568 11.727941 5.255700
10 10 10 var_9 200000.0 7.567236 1.235070 3.9705 6.618800 7.62960 8.584425 11.1506 7.1801 0.163213 1.965625
11 11 11 var_10 200000.0 0.394340 5.500793 -20.7313 -3.594950 0.48730 4.382925 18.6702 39.4015 13.949350 7.977875
12 12 12 var_11 200000.0 -3.245596 5.970253 -26.0950 -7.510600 -3.28695 0.852825 17.1887 43.2837 -1.839494 8.363425
13 13 13 var_12 200000.0 14.023978 0.190059 13.4346 13.894000 14.02550 14.164200 14.6545 1.2199 0.013552 0.270200
14 14 14 var_13 200000.0 8.530232 4.639536 -6.0111 5.072800 8.60425 12.274775 22.3315 28.3426 0.543893 7.201975
15 15 15 var_14 200000.0 7.537606 2.247908 1.0133 5.781875 7.52030 9.270425 14.9377 13.9244 0.298226 3.488550
16 16 16 var_15 200000.0 14.573126 0.411711 13.0769 14.262800 14.57410 14.874500 15.8633 2.7864 0.028251 0.611700
17 17 17 var_16 200000.0 9.333264 2.557421 0.6351 7.452275 9.23205 11.055900 17.9506 17.3155 0.274011 3.603625
18 18 18 var_17 200000.0 -5.696731 6.712612 -33.3802 -10.476225 -5.66635 -0.810775 19.0259 52.4061 -1.178327 9.665450
19 19 19 var_18 200000.0 15.244013 7.851370 -10.6642 9.177950 15.19625 21.013325 41.7480 52.4122 0.515046 11.835375
20 20 20 var_19 200000.0 12.438567 7.996694 -12.4025 6.276475 12.45390 18.433300 35.1830 47.5855 0.642895 12.156825
21 21 21 var_20 200000.0 13.290894 5.876254 -5.4322 8.627800 13.19680 17.879400 31.2859 36.7181 0.442126 9.251600
22 22 22 var_21 200000.0 17.257883 8.196564 -10.0890 11.551000 17.23425 23.089050 49.0443 59.1333 0.474946 11.538050
23 23 23 var_22 200000.0 4.305430 2.847958 -5.3225 2.182400 4.27515 6.293200 14.5945 19.9170 0.661480 4.110800
24 24 24 var_23 200000.0 3.019540 0.526893 1.2098 2.634100 3.00865 3.403800 4.8752 3.6654 0.174495 0.769700
25 25 25 var_24 200000.0 10.584400 3.777245 -0.6784 7.613000 10.38035 13.479600 25.4460 26.1244 0.356869 5.866600
26 26 26 var_25 200000.0 13.667496 0.285535 12.7200 13.456400 13.66250 13.863700 14.6546 1.9346 0.020892 0.407300
27 27 27 var_26 200000.0 -4.055133 5.922210 -24.2431 -8.321725 -4.19690 -0.090200 15.6751 39.9182 -1.460423 8.231525
28 28 28 var_27 200000.0 -1.137908 1.523714 -6.1668 -2.307900 -1.13210 0.015625 3.2431 9.4099 -1.339049 2.323525
29 29 29 var_28 200000.0 5.532980 0.783367 2.0896 4.992100 5.53485 6.093700 8.7874 6.6978 0.141581 1.101600
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
171 171 171 var_170 200000.0 -0.004962 4.424621 -14.5060 -3.258500 0.00280 3.096400 16.7319 31.2379 -891.690223 6.354900
172 172 172 var_171 200000.0 -0.831777 5.378008 -22.4793 -4.720350 -0.80735 2.956800 17.9173 40.3966 -6.465689 7.677150
173 173 173 var_172 200000.0 19.817094 8.674171 -11.4533 13.731775 19.74800 25.907725 53.5919 65.0452 0.437712 12.175950
174 174 174 var_173 200000.0 -0.677967 5.966674 -22.7487 -5.009525 -0.56975 3.619900 18.8554 41.6041 -8.800833 8.629425
175 175 175 var_174 200000.0 20.210677 7.136427 -2.9953 15.064600 20.20610 25.641225 43.5468 46.5421 0.353102 10.576625
176 176 176 var_175 200000.0 11.640613 2.892167 3.2415 9.371600 11.67980 13.745500 20.8548 17.6133 0.248455 4.373900
177 177 177 var_176 200000.0 -2.799585 7.513939 -29.1165 -8.386500 -2.53845 2.704400 20.2452 49.3617 -2.683948 11.090900
178 178 178 var_177 200000.0 11.882933 2.628895 4.9521 9.808675 11.73725 13.931300 20.5965 15.6444 0.221233 4.122625
179 179 179 var_178 200000.0 -1.014064 8.579810 -29.2734 -7.395700 -0.94205 5.338750 29.8413 59.1147 -8.460821 12.734450
180 180 180 var_179 200000.0 2.591444 2.798956 -7.8561 0.625575 2.51230 4.391125 13.4487 21.3048 1.080076 3.765550
181 181 181 var_180 200000.0 -2.741666 5.261243 -22.0374 -6.673900 -2.68880 0.996200 12.7505 34.7879 -1.918995 7.670100
182 182 182 var_181 200000.0 10.085518 1.371862 5.4165 9.084700 10.03605 11.011300 14.3939 8.9774 0.136023 1.926600
183 183 183 var_182 200000.0 0.719109 8.963434 -26.0011 -6.064425 0.72020 7.499175 29.2487 55.2498 12.464637 13.563600
184 184 184 var_183 200000.0 8.769088 4.474924 -4.8082 5.423100 8.60000 12.127425 23.7049 28.5131 0.510307 6.704325
185 185 185 var_184 200000.0 12.756676 9.318280 -18.4897 5.663300 12.52100 19.456150 44.3634 62.8531 0.730463 13.792850
186 186 186 var_185 200000.0 -3.983261 4.725167 -22.5833 -7.360000 -3.94695 -0.590650 12.9975 35.5808 -1.186256 6.769350
187 187 187 var_186 200000.0 8.970274 3.189759 -3.0223 6.715200 8.90215 11.193800 21.7392 24.7615 0.355592 4.478600
188 188 188 var_187 200000.0 -10.335043 11.574708 -47.7536 -19.205125 -10.20975 -1.466000 22.7861 70.5397 -1.119948 17.739125
189 189 189 var_188 200000.0 15.377174 3.944604 4.4123 12.501550 15.23945 18.345225 29.3303 24.9180 0.256523 5.843675
190 190 190 var_189 200000.0 0.746072 0.976348 -2.5543 0.014900 0.74260 1.482900 4.0341 6.5884 1.308652 1.468000
191 191 191 var_190 200000.0 3.234440 4.559922 -14.0933 -0.058825 3.20360 6.406200 18.4409 32.5342 1.409803 6.465025
192 192 192 var_191 200000.0 7.438408 3.023272 -2.6917 5.157400 7.34775 9.512525 16.7165 19.4082 0.406441 4.355125
193 193 193 var_192 200000.0 1.927839 1.478423 -3.8145 0.889775 1.90130 2.949500 8.4024 12.2169 0.766881 2.059725
194 194 194 var_193 200000.0 3.331774 3.992030 -11.7834 0.584600 3.39635 6.205800 18.2818 30.0652 1.198170 5.621200
195 195 195 var_194 200000.0 17.993784 3.135162 8.6944 15.629800 17.95795 20.396525 27.9288 19.2344 0.174236 4.766725
196 196 196 var_195 200000.0 -0.142088 1.429372 -5.2610 -1.170700 -0.17270 0.829600 4.2729 9.5339 -10.059738 2.000300
197 197 197 var_196 200000.0 2.303335 5.454369 -14.2096 -1.946925 2.40890 6.556725 18.3215 32.5311 2.368031 8.503650
198 198 198 var_197 200000.0 8.908158 0.921625 5.9606 8.252800 8.88820 9.593300 12.0004 6.0398 0.103459 1.340500
199 199 199 var_198 200000.0 15.870720 3.010945 6.2993 13.829700 15.93405 18.064725 26.0791 19.7798 0.189717 4.235025
200 200 200 var_199 200000.0 -3.326537 10.438015 -38.8528 -11.208475 -2.81955 4.836800 28.5007 67.3535 -3.137802 16.045275

201 rows × 14 columns

In [68]:

df_des['stddev'].min()

Out[68]:

0.007186267883143137

In [69]:

df_des['stddev'].quantile(q=0.5)

Out[69]:

3.944604281951493

In [24]:

# Bar chart of the median (50%) of each variable
df_des.plot(kind='bar',x='IDcode',y='50%',color='red',figsize=(14,6))
plt.show()

[Figure 1: bar chart of the median (50%) of each variable]

In [5]:

df_train.select('var_193').distinct().count()

Out[5]:

110557

In [7]:

df_train.select('var_45').distinct().count()

Out[7]:

169968

In [21]:

df_train.select('var_91').distinct().count()

Out[21]:

7962

In [24]:

# Count the distinct values of every column
for col in df_train.columns:
    print(col,df_train.select(col).distinct().count())
ID_code 200000
target 2
var_0 94672
var_1 108932
var_2 86555
var_3 74597
var_4 63515
var_5 141030
var_6 38599
var_7 103063
var_8 98617
var_9 49417
var_10 128764
var_11 130193
var_12 9561
var_13 115181
var_14 79122
var_15 19810
var_16 86918
var_17 137823
var_18 139515
var_19 144180
var_20 127764
var_21 140062
var_22 90661
var_23 24913
var_24 105101
var_25 14853
var_26 127089
var_27 60186
var_28 35859
var_29 88339
var_30 145977
var_31 77388
var_32 85964
var_33 112239
var_34 25164
var_35 122384
var_36 96404
var_37 79040
var_38 115366
var_39 112674
var_40 141878
var_41 131896
var_42 31592
var_43 15188
var_44 127702
var_45 169968
var_46 93450
var_47 154781
var_48 152039
var_49 140641
var_50 32308
var_51 143455
var_52 121313
var_53 33460
var_54 144776
var_55 128077
var_56 103045
var_57 35545
var_58 113908
var_59 37744
var_60 113763
var_61 159369
var_62 74778
var_63 97098
var_64 59379
var_65 108347
var_66 47722
var_67 137253
var_68 451
var_69 110346
var_70 153193
var_71 13527
var_72 110115
var_73 142582
var_74 161058
var_75 129383
var_76 139317
var_77 106809
var_78 72254
var_79 53212
var_80 136432
var_81 79065
var_82 144829
var_83 144281
var_84 133766
var_85 108437
var_86 140594
var_87 125296
var_88 84918
var_89 103522
var_90 157210
var_91 7962
var_92 110743
var_93 26708
var_94 89146
var_95 29388
var_96 148099
var_97 158739
var_98 33266
var_99 69301
var_100 150727
var_101 122295
var_102 146237
var_103 9376
var_104 72627
var_105 39115
var_106 71065
var_107 137827
var_108 8525
var_109 112172
var_110 106121
var_111 46464
var_112 60482
var_113 116496
var_114 43084
var_115 86729
var_116 63467
var_117 164469
var_118 143667
var_119 112403
var_120 158269
var_121 64695
var_122 121768
var_123 129893
var_124 91022
var_125 16059
var_126 32411
var_127 95711
var_128 98200
var_129 113425
var_130 36638
var_131 21465
var_132 57923
var_133 19236
var_134 131620
var_135 140774
var_136 156615
var_137 144397
var_138 117429
var_139 137294
var_140 121384
var_141 134444
var_142 128613
var_143 94372
var_144 40595
var_145 108526
var_146 84314
var_147 137559
var_148 10608
var_149 148504
var_150 83660
var_151 109667
var_152 95823
var_153 73728
var_154 119342
var_155 127457
var_156 40634
var_157 126534
var_158 144556
var_159 112830
var_160 156274
var_161 11071
var_162 57396
var_163 123168
var_164 122744
var_165 119403
var_166 17902
var_167 140955
var_168 97227
var_169 18242
var_170 113721
var_171 125914
var_172 143366
var_173 128120
var_174 134945
var_175 92659
var_176 142521
var_177 85720
var_178 145236
var_179 90091
var_180 123477
var_181 56164
var_182 149196
var_183 117529
var_184 145185
var_185 120747
var_186 98060
var_187 157031
var_188 108813
var_189 41765
var_190 114959
var_191 94266
var_192 59066
var_193 110557
var_194 97069
var_195 57870
var_196 125560
var_197 40537
var_198 94153
var_199 149430

In [6]:

df_train.count()

Out[6]:

200000

In [6]:

dd = float(df_des[df_des['IDcode']=='var_199']['50%'])
dd

Out[6]:

-2.81955

In [ ]:

# Data cleaning
1. Missing-value handling
2. Outlier handling (an IQR-based sketch follows this list)
3. Removal of multicollinear variables
4. Data binning
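
For step 2, a common rule flags values outside 1.5×IQR of the quartiles. A rough sketch using the quartile columns already computed in df_des; the filter itself is illustrative and is not applied in the original pipeline.

# 1.5*IQR bounds per variable, from the quartiles in df_des ('dis' = 75% - 25%)
bounds = df_des[['IDcode', '25%', '75%', 'dis']].copy()
bounds['lower'] = bounds['25%'] - 1.5 * bounds['dis']
bounds['upper'] = bounds['75%'] + 1.5 * bounds['dis']
# Example: keep only the rows of var_0 that fall inside its bounds
row = bounds[bounds['IDcode'] == 'var_0'].iloc[0]
df_clip = df_train.filter((df_train['var_0'] >= float(row['lower'])) &
                          (df_train['var_0'] <= float(row['upper'])))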

In [14]:

df_std01 = df_des[df_des['std']<0.1][['IDcode','std']].sort_values(by=['std'],ascending=True)
select_cols01 = df_std01['IDcode'].tolist()
# select_cols01.remove('target')
df_std02 = df_des[df_des['std']>=0.1][['IDcode','std']].sort_values(by=['std'],ascending=True)
select_cols02 = df_std02['IDcode'].tolist()
select_cols02.remove('target')
# print(select_cols)
stages = []
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler
import pyspark.ml.feature as ft
def deal_binarizer(categoricalColumns): 
    stage = []
    new_cols = []
    for categoricalCol in categoricalColumns:
        outputCol = categoricalCol+"bin"
        dd = float(df_des[df_des['IDcode']==categoricalCol]['50%'])
#         print(categoricalCol,dd)
        binarizer = ft.Binarizer(threshold=dd, inputCol=categoricalCol, outputCol=outputCol) 
        stage += [binarizer]
        new_cols.append(outputCol)
    return stage, new_cols
stage, new_cols = deal_binarizer(select_cols01)
stages += stage
inputCols = new_cols + select_cols02
print(inputCols)
assembler = VectorAssembler(inputCols=inputCols, outputCol="asfeatures")
stages += [assembler]
label_stringIdx = StringIndexer(inputCol='target', outputCol='label')
stages += [label_stringIdx]
scaler = StandardScaler(inputCol="asfeatures", outputCol="features",
                        withStd=True, withMean=False)
stages += [scaler]

from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(df_train)
df = pipelineModel.transform(df_train)
train, test = df.randomSplit([0.7, 0.3], seed=2019)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))
# Baseline: logistic regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features',regParam=0.01, labelCol='label', maxIter=200)
lrModel = lr.fit(train)

import matplotlib.pyplot as plt
import numpy as np
beta = np.sort(lrModel.coefficients)
plt.plot(beta)
plt.ylabel('Beta Coefficients')
plt.show()


trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))


pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()

predictions = lrModel.transform(test)
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(predictions))
print('areaUnderROC', evaluator.evaluate(predictions,
{evaluator.metricName: 'areaUnderROC'}))
print('areaUnderPR', evaluator.evaluate(predictions,{evaluator.metricName: 'areaUnderPR'}))
# predictions.select('features').show()
predictions = predictions.select('features', 'target', 'label', 'rawPrediction', 'prediction', 'probability')
# predictions.show()
pd.DataFrame(predictions.take(10), columns=predictions.columns).transpose() 
['var_68bin', 'var_91', 'var_108', 'var_103', 'var_12', 'var_148', 'var_161', 'var_71', 'var_25', 'var_43', 'var_125', 'var_169', 'var_166', 'var_133', 'var_15', 'var_131', 'var_23', 'var_34', 'var_93', 'var_95', 'var_50', 'var_42', 'var_98', 'var_53', 'var_126', 'var_28', 'var_57', 'var_130', 'var_59', 'var_105', 'var_6', 'var_197', 'var_144', 'var_156', 'var_189', 'var_114', 'var_111', 'var_66', 'var_9', 'var_79', 'var_181', 'var_162', 'var_195', 'var_132', 'var_192', 'var_64', 'var_27', 'var_112', 'var_4', 'var_116', 'var_121', 'var_99', 'var_106', 'var_104', 'var_78', 'var_153', 'var_62', 'var_3', 'var_31', 'var_14', 'var_37', 'var_81', 'var_150', 'var_88', 'var_146', 'var_16', 'var_32', 'var_29', 'var_115', 'var_177', 'var_2', 'var_124', 'var_94', 'var_179', 'var_22', 'var_46', 'var_175', 'var_143', 'var_152', 'var_198', 'var_191', 'var_0', 'var_168', 'var_63', 'var_36', 'var_194', 'var_127', 'var_186', 'var_128', 'var_8', 'var_7', 'var_56', 'var_89', 'var_24', 'var_65', 'var_77', 'var_110', 'var_145', 'var_85', 'var_188', 'var_72', 'var_69', 'var_151', 'var_193', 'var_1', 'var_39', 'var_159', 'var_129', 'var_119', 'var_92', 'var_60', 'var_38', 'var_58', 'var_33', 'var_109', 'var_170', 'var_113', 'var_183', 'var_138', 'var_190', 'var_13', 'var_185', 'var_140', 'var_101', 'var_154', 'var_52', 'var_165', 'var_122', 'var_35', 'var_180', 'var_163', 'var_171', 'var_196', 'var_164', 'var_10', 'var_157', 'var_87', 'var_55', 'var_142', 'var_155', 'var_20', 'var_44', 'var_26', 'var_41', 'var_173', 'var_11', 'var_75', 'var_123', 'var_134', 'var_84', 'var_17', 'var_141', 'var_174', 'var_67', 'var_147', 'var_80', 'var_73', 'var_176', 'var_107', 'var_135', 'var_139', 'var_86', 'var_167', 'var_18', 'var_49', 'var_5', 'var_158', 'var_76', 'var_30', 'var_19', 'var_51', 'var_21', 'var_40', 'var_83', 'var_54', 'var_82', 'var_96', 'var_178', 'var_102', 'var_172', 'var_118', 'var_137', 'var_182', 'var_100', 'var_184', 'var_136', 'var_149', 'var_199', 'var_47', 'var_160', 'var_48', 'var_187', 'var_61', 'var_70', 'var_120', 'var_97', 'var_90', 'var_117', 'var_74', 'var_45']
Training Dataset Count: 140208
Test Dataset Count: 59792

[Figure 2: sorted logistic regression beta coefficients]

[Figure 3: training ROC curve]

Training set areaUnderROC: 0.8572874986254869

[Figure 4: training precision-recall curve]

Test Area Under ROC 0.8604638791224748
areaUnderROC 0.8604638791224721
areaUnderPR 0.5145853422620837

Out[14]:

  0 1 2 3 4 5 6 7 8 9
features [2.0000262487479796, 47.42117806123396, 82.294... [0.0, 46.302865299428504, 82.68222664303848, 7... [2.0000262487479796, 45.757139153802285, 84.51... [0.0, 44.64668799313429, 83.20358606228598, 8.... [0.0, 44.38397948845709, 82.9966788488178, 7.3... [2.0000262487479796, 46.29958963228788, 85.204... [0.0, 44.481594369247375, 82.99434091420234, 9... [0.0, 47.517482675168, 83.12409628536034, 9.30... [2.0000262487479796, 45.63659460302771, 82.554... [0.0, 46.1672526798071, 82.50746603053287, 10....
target 0 0 1 0 0 0 0 1 0 0
label 0 0 1 0 0 0 0 1 0 0
rawPrediction [2.745411097253725, -2.745411097253725] [2.2770635627955, -2.2770635627955] [1.7640231694663067, -1.7640231694663067] [2.3870306837485726, -2.3870306837485726] [2.699555439548069, -2.699555439548069] [1.9689919906092934, -1.9689919906092934] [2.236223494259175, -2.236223494259175] [2.352430409935469, -2.352430409935469] [2.8009650886646655, -2.8009650886646655] [2.6047753141937164, -2.6047753141937164]
prediction 0 0 0 0 0 0 0 0 0 0
probability [0.9396536620507552, 0.0603463379492449] [0.9069595547320315, 0.09304044526796858] [0.8537128180771243, 0.14628718192287574] [0.9158329668960973, 0.08416703310390271] [0.9370004063505419, 0.06299959364945804] [0.8775028018243339, 0.12249719817566608] [0.9034555593377342, 0.09654444066226563] [0.9131272157024063, 0.08687278429759375] [0.942727953398344, 0.05727204660165593] [0.9311682791393624, 0.06883172086063759]

In [20]:

df_test = get_input_data(base_path + 'test.csv')  
pd.DataFrame(df_test.take(10), columns=df_test.columns).transpose() 
df_test = pipelineModel.transform(df_test)
predictions = lrModel.transform(df_test)
# predictions.select('features').show()
predictions = predictions.select('ID_code',  'prediction')
# predictions.show()
pd.DataFrame(predictions.take(10), columns=predictions.columns).transpose() 

Out[20]:

  0 1 2 3 4 5 6 7 8 9
ID_code test_0 test_1 test_2 test_3 test_4 test_5 test_6 test_7 test_8 test_9
prediction 0 0 0 0 0 0 0 0 0 0

In [19]:

predictions.show(5)
+--------------------+--------------------+----------+--------------------+
|            features|       rawPrediction|prediction|         probability|
+--------------------+--------------------+----------+--------------------+
|[0.0,46.405721247...|[1.84608169708193...|       0.0|[0.86366639261870...|
|[2.00002624874797...|[1.86154755297036...|       0.0|[0.86547722599330...|
|[0.0,45.986435853...|[2.29001370850934...|       0.0|[0.90804659470819...|
|[0.0,45.318199756...|[1.90419746231964...|       0.0|[0.87036585836244...|
|[2.00002624874797...|[2.07114740649738...|       0.0|[0.88806706897729...|
+--------------------+--------------------+----------+--------------------+
only showing top 5 rows

In [23]:

predictions = predictions.toPandas()
predictions['target'] = predictions['prediction'].apply(lambda x: int(x))
predictions[["ID_code","target"]].to_csv("./submit01.csv",index=False)
predictions[["ID_code","target"]]

Out[23]:

  ID_code target
0 test_0 0
1 test_1 0
2 test_2 0
3 test_3 0
4 test_4 0
5 test_5 0
6 test_6 0
7 test_7 0
8 test_8 0
9 test_9 0
10 test_10 0
11 test_11 0
12 test_12 0
13 test_13 0
14 test_14 0
15 test_15 0
16 test_16 0
17 test_17 0
18 test_18 0
19 test_19 0
20 test_20 0
21 test_21 0
22 test_22 0
23 test_23 0
24 test_24 0
25 test_25 0
26 test_26 0
27 test_27 0
28 test_28 0
29 test_29 0
... ... ...
199970 test_199970 0
199971 test_199971 0
199972 test_199972 0
199973 test_199973 0
199974 test_199974 0
199975 test_199975 0
199976 test_199976 0
199977 test_199977 0
199978 test_199978 0
199979 test_199979 0
199980 test_199980 0
199981 test_199981 0
199982 test_199982 0
199983 test_199983 0
199984 test_199984 0
199985 test_199985 0
199986 test_199986 0
199987 test_199987 0
199988 test_199988 0
199989 test_199989 0
199990 test_199990 0
199991 test_199991 0
199992 test_199992 0
199993 test_199993 0
199994 test_199994 0
199995 test_199995 0
199996 test_199996 0
199997 test_199997 0
199998 test_199998 0
199999 test_199999 0

200000 rows × 2 columns
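
Since the competition is scored on AUC, submitting the positive-class probability usually ranks better than the hard 0/1 prediction above. A hedged sketch (not what this notebook submitted): it assumes predictions_full is the Spark DataFrame returned by lrModel.transform(df_test), i.e. taken before the select('ID_code', 'prediction') step, so it still carries the probability vector column.

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
# element 1 of the probability vector is P(label == 1); StringIndexer maps the majority class 0 to index 0
prob_pos = udf(lambda v: float(v[1]), DoubleType())
submit = (predictions_full
          .withColumn('target', prob_pos('probability'))
          .select('ID_code', 'target')
          .toPandas())
submit.to_csv('./submit_prob.csv', index=False)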

In [ ]:

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 20)
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))


from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=100)
gbtModel = gbt.fit(train)
predictions = gbtModel.transform(test)

evaluator = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

print(gbt.explainParams())


# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# paramGrid = (ParamGridBuilder()
#              .addGrid(gbt.maxDepth, [12, 8, 20])
#              .addGrid(gbt.maxBins, [200, 600])
#              .addGrid(gbt.maxIter, [100, 1000])
#              .build())
# cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
# # Run cross validations.  This can take about 6 minutes since it is training over 20 trees!
# cvModel = cv.fit(train)
# predictions = cvModel.transform(test)
# print('Test Area Under ROC', evaluator.evaluate(predictions))
# print('areaUnderROC', evaluator.evaluate(predictions,
# {evaluator.metricName: 'areaUnderROC'}))
# print('areaUnderPR', evaluator.evaluate(predictions,{evaluator.metricName: 'areaUnderPR'}))

# predictions.show()
pd.DataFrame(predictions.take(10), columns=predictions.columns).transpose() 
