基于客户提取为所属客户经理的信息

本地处理

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Python 2-only workaround for "UnicodeDecodeError: 'utf8' codec can't decode byte 0x9a in position 12":
# reset the interpreter's default encoding to utf-8. `reload(sys)` re-exposes
# sys.setdefaultencoding, which site.py normally deletes at startup.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *  # 为使用dataframe的方法
import re
from pyspark.sql.types import *
import datetime
import pandas as pd


# NOTE(fix): SparkConf was referenced below but never imported (only
# SparkContext was, at the top of the file) — this raised NameError.
from pyspark import SparkConf

# Local-mode Spark session for this exploration.
conf = SparkConf().setAppName("miniProject").setMaster("local")
sc1 = SparkContext.getOrCreate(conf)
spark = SparkSession(sc1)

1. 客户经理信息简单查询

# 客户经理数据——已经将代码的csv传到本地,读取
# Account-manager data — the CSV was exported to the local filesystem beforehand.
cm_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/secret.csv', sep=',', encoding='utf-8')
# Drop the dumped index column (first) and the trailing column.
# .ix was deprecated in pandas 0.20 and removed in 1.0 — use positional .iloc.
cm_df = cm_df.iloc[:, 1:-1]
cm_df.info()

RangeIndex: 104467 entries, 0 to 104466
Data columns (total 12 columns):
staff_id             104467 non-null int64
staff_name           104467 non-null object
user_id_card_edit    104467 non-null object
id_county            104431 non-null object
id_province          104431 non-null object
id_city              104431 non-null object
id                   104467 non-null int64
role_id              104467 non-null int64
presona_id           104467 non-null int64
status               104467 non-null int64
city_number          104467 non-null int64
num                  104467 non-null int64
dtypes: int64(7), object(5)
memory usage: 9.6+ MB
cm_df.describe()
staff_id id role_id presona_id status city_number num
count 104467.000000 104467.000000 104467.000000 104467.000000 104467.000000 104467.0 104467.000000
mean 191295.747298 191295.747298 58.994228 3135.484143 1.857151 0.0 -0.000029
std 78064.637688 78064.637688 0.400920 2029.885308 0.349920 0.0 0.031703
min 218.000000 218.000000 0.000000 43.000000 1.000000 0.0 -4.000000
25% 162123.500000 162123.500000 59.000000 1603.000000 2.000000 0.0 0.000000
50% 206533.000000 206533.000000 59.000000 2374.000000 2.000000 0.0 0.000000
75% 247621.500000 247621.500000 59.000000 4828.000000 2.000000 0.0 0.000000
max 294921.000000 294921.000000 155.000000 9002.000000 2.000000 0.0 4.000000
# 城市数据
# 已经将代码的csv传到cluster,放在hdfs根目录
ctcode_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/xz.csv', sep=',', encoding='utf-8')
ctcode_df.head()
ctcode_df.info()

RangeIndex: 3219 entries, 0 to 3218
Data columns (total 2 columns):
xzqhdm         3219 non-null int64
xzqhdm_name    3219 non-null object
dtypes: int64(1), object(1)
memory usage: 50.4+ KB
ctcode_df['xzqhdm'] = ctcode_df['xzqhdm'].astype(str)
ctcode_df.info()

RangeIndex: 3219 entries, 0 to 3218
Data columns (total 2 columns):
xzqhdm         3219 non-null object
xzqhdm_name    3219 non-null object
dtypes: object(2)
memory usage: 50.4+ KB
# 链接
tmp_county = pd.merge(left=cm_df, right=ctcode_df, left_on='id_county', right_on='xzqhdm', how='left')


tmp_province = pd.merge(left=tmp_county, right=ctcode_df, left_on='id_province', right_on='xzqhdm', how='left')

tmp_city = pd.merge(left=tmp_province, right=ctcode_df, left_on='id_city', right_on='xzqhdm', how='left')

tmp_city.drop(['xzqhdm_x','xzqhdm_y','xzqhdm'], axis=1, inplace=True)
tmp_city.head()
tmp_city.count()
staff_id             104467
staff_name           104467
user_id_card_edit    104467
id_county            104431
id_province          104431
id_city              104431
id                   104467
role_id              104467
presona_id           104467
status               104467
city_number          104467
num                  104467
xzqhdm_name_x         45367
xzqhdm_name_y         65507
xzqhdm_name           51701
dtype: int64
# 不同省份的人数
tmp_city.groupby(['xzqhdm_name_y'])['staff_id'].count()
xzqhdm_name_y
上海市          240
云南省         1699
内蒙古自治区      1731
北京市           79
吉林省         2200
四川省         4520
天津市          274
宁夏回族自治区      369
安徽省         3906
山东省         5584
山西省         1285
广东省         3889
广西壮族自治区     1823
新疆维吾尔自治区      55
江苏省         4799
江西省         2386
河北省         3110
河南省         4545
浙江省         2202
海南省          337
湖北省         3596
湖南省         2706
甘肃省         1031
福建省         3204
西藏自治区          2
贵州省         1230
辽宁省         2562
重庆市         1347
陕西省         1929
青海省          132
黑龙江省        2735
Name: staff_id, dtype: int64
dis_ct_prov = tmp_city.groupby(['xzqhdm_name_y'])['staff_id'].count().sort_values(ascending=False) 
dis_ct_city = tmp_city.groupby(['xzqhdm_name_x'])['staff_id'].count().sort_values(ascending=False) 
dis_ct_county = tmp_city.groupby(['xzqhdm_name'])['staff_id'].count().sort_values(ascending=False) 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
from pylab import *  
mpl.rcParams['font.sans-serif'] = ['SimHei']  
mpl.rcParams['axes.unicode_minus'] = False  

fig = plt.figure()
dis_ct_prov.plot(kind='bar', )
plt.title('province')
plt.show()

dis_ct_prov[:10]
xzqhdm_name_y
山东省     5584
江苏省     4799
河南省     4545
四川省     4520
安徽省     3906
广东省     3889
湖北省     3596
福建省     3204
河北省     3110
黑龙江省    2735
Name: staff_id, dtype: int64
fig2 = plt.figure()
dis_ct_city.plot(kind='bar', )
plt.title('city')
plt.show()

dis_ct_city[:10]
xzqhdm_name_x
市中区    241
惠安县    187
睢宁县    177
宁海县    164
沭阳县    161
灌云县    159
兴化市    148
仙游县    142
临泉县    135
沛县     134
Name: staff_id, dtype: int64
fig3 = plt.figure()
dis_ct_county.plot(kind='bar', )
plt.title('city')
plt.show()

dis_ct_county[:10]
xzqhdm_name
徐州市    892
阜阳市    618
泉州市    604
漳州市    566
潍坊市    556
淮安市    526
盐城市    521
湛江市    509
南充市    508
宿迁市    505
Name: staff_id, dtype: int64
# Number of account managers per employment status (status: 1 = active).
# NOTE(fix): the original `count(staff_id)` passed an undefined bare name and
# raised NameError — select the column, then count per group.
tmp_city.groupby('status')['staff_id'].count()

2、客户信息处理

#!/usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

# from pyspark.sql import functions as F

def fraud_normal(already_num, cd):
    """Label a loan record for sampling.

    'loan_fraud' when the overdue count `cd` is 2 or more; 'loan_normal'
    when more than 4 installments were already paid; 'drop' otherwise
    (record excluded from the modeling sample).
    """
    if cd >= 2:
        return 'loan_fraud'
    if already_num > 4:
        return 'loan_normal'
    return 'drop'
# Fraud_normal=F.udf(fraud_normal)

# Read the pre-assembled client data (CSV exported to the local filesystem).
cl_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/datasource/client.csv', sep=',', encoding='utf-8')
# Drop the dumped index column and the trailing column.
# .ix was removed in pandas 1.0 — use positional .iloc.
cl_df = cl_df.iloc[:, 1:-1]
cl_df[['cd', 'already_num']].sort_values(['cd', 'already_num']).head()
cd already_num
160569 -8.0 17
114941 -3.0 10
1041 0.0 0
1176 0.0 0
2354 0.0 0
cl_df[cl_df['loan_type']==2].head()
cl_df['flag'] = cl_df[cl_df['loan_type']==2].apply(lambda x: fraud_normal(x['already_num'],x['cd']), axis=1)
cl_df_flaged=cl_df[cl_df['flag']!='drop']
cl_df_flaged.columns, cl_df_flaged.count()
(Index([             u'id',          u'id_num',      u'account_id',
          u'loan_staff_id',       u'education',       u'child_sum',
                 u'is_car',  u'account_number',        u'zs_money',
                u'zipCode',          u'status',             u'amt',
              u'loan_type',              u'cd',            u'year',
        u'intopieces_date',           u'count',   u'total_account',
        u'avg_month_men_a',       u'repay_num',     u'already_num',
             u'user_phone',     u'presona_pid',            u'flag'],
       dtype='object'), id                 771956
 id_num             771956
 account_id         771956
 loan_staff_id      771956
 education          771956
 child_sum          771806
 is_car             771956
 account_number     771956
 zs_money           771956
 zipCode            254976
 status             771956
 amt                771956
 loan_type          771956
 cd                 768909
 year               771956
 intopieces_date    771956
 count              771956
 total_account      771956
 avg_month_men_a    771956
 repay_num          771956
 already_num        771956
 user_phone         752550
 presona_pid        751350
 flag               271373
 dtype: int64)

3. 链接经理和客户

def et_id_sex(x):
    """Derive gender from a Chinese ID-card number string.

    Returns 0 (female) when the sequence digit is even, 1 (male) when odd,
    and None for any length other than 18 or 15. For 18-digit IDs the
    sequence digit is the 17th character; for legacy 15-digit IDs it is
    the last character.
    """
    if len(x) == 18:
        seq_digit = x[16]
    elif len(x) == 15:
        seq_digit = x[-1]
    else:
        return None
    return 0 if float(seq_digit) % 2 == 0 else 1
import datetime

def et_id_age(x):
    """Derive age from a Chinese ID-card number string.

    18-digit IDs carry the full birth date YYYYMMDD at positions 6-13;
    legacy 15-digit IDs carry only YYMMDD at positions 6-11 and are all
    19xx births. Returns None for any other length.
    """
    if len(x) == 18:
        # positions 6-13 hold YYYYMMDD (the original sliced one extra char,
        # which calculate_age silently ignored)
        return calculate_age(x[6:14])
    elif len(x) == 15:
        # NOTE(fix): 15-digit IDs omit the century, so x[6:12] alone was
        # parsed by calculate_age as year=YYMM, month=DD, day='' and crashed
        # on int(''). Prepend '19' to form a full YYYYMMDD string.
        return calculate_age('19' + x[6:12])
    else:
        return None

def calculate_age(input_born, today=(2018, 5, 10)):
    """Return the age, rounded to the nearest whole year, as of `today`.

    :param input_born: birth date string 'YYYYMMDD' (at least 8 characters;
        anything beyond the first 8 is ignored)
    :param today: (year, month, day) reference date. A tuple default replaces
        the original mutable list default (it was never mutated, but the
        pattern is a known pitfall).
    :returns: int age, or None when the date is malformed or not before `today`
    """
    y_born = input_born[0:4]
    m_born = input_born[4:6]
    d_born = input_born[6:8]

    try:
        # datetime.date validates the calendar for us. NOTE(fix): the original
        # range check (month 1-12, day 1-31) let impossible dates such as
        # Feb 30 through and then crashed with an uncaught ValueError; it also
        # crashed on non-numeric input. Both now return None.
        born = datetime.date(int(y_born), int(m_born), int(d_born))
    except ValueError:
        return None

    ref = datetime.date(today[0], today[1], today[2])
    if ref <= born:
        print('error_date')  # birth date not in the past — keep original diagnostic
        return None

    # Day-of-year of each date: subtracting Dec 31 of the previous year
    # yields 1 for Jan 1, 365/366 for Dec 31.
    born_days = (born - datetime.date(born.year - 1, 12, 31)).days
    target_days = (ref - datetime.date(ref.year - 1, 12, 31)).days
    sub_days = target_days - born_days

    years = ref.year - born.year
    # Round to the nearest year: if the reference day is within ~half a year
    # (183 days) of the birthday, keep `years`; otherwise step one year in
    # the direction of the offset, exactly as the original branches did.
    if abs(sub_days) < 183:
        return years
    return years + 1 if sub_days >= 0 else years - 1

3.1 客户经理数据读入与处理

# Administrative-division code table (uploaded to the cluster, HDFS root).
ctcode_df = pd.read_csv('/home/hadoop/xxx/project_0509_khjl/datasource/xz.csv', sep=',', encoding='utf-8')
# Codes must be strings so they can join against the string-typed id_* columns.
ctcode_df['xzqhdm'] = ctcode_df['xzqhdm'].astype(str)

# Account-manager data (CSV exported to the local filesystem beforehand).
cm_df = pd.read_csv('/home/hadoop/xxx/project_0509_khjl/datasource/manager.csv', sep=',', encoding='utf-8')
# Drop the dumped index column and the trailing column; .ix was removed in
# pandas 1.0 — use positional .iloc.
cm_df = cm_df.iloc[:, 1:-1]

# Attach human-readable place names by joining the manager table against the
# administrative-code lookup three times: county, then province, then city.
tmp_county = pd.merge(left=cm_df, right=ctcode_df, left_on='id_county', right_on='xzqhdm', how='left')

tmp_province = pd.merge(left=tmp_county, right=ctcode_df, left_on='id_province', right_on='xzqhdm', how='left')

tmp_city = pd.merge(left=tmp_province, right=ctcode_df, left_on='id_city', right_on='xzqhdm', how='left')

# Gender derived from the ID-card number (0 = female, 1 = male)
tmp_city['sex'] = tmp_city['user_id_card_edit'].apply(et_id_sex)

# Age derived from the ID-card birth date
tmp_city['age'] = tmp_city['user_id_card_edit'].apply(et_id_age)

# Rename the manager's employment-status column so it cannot collide with the
# client-side `status` column after the later manager-client merge.
tmp_city.rename(columns={u'status':'status_m'}, inplace=True)

cm_df_location = tmp_city.drop(['xzqhdm_x','xzqhdm_y','xzqhdm'], axis=1, inplace=False)
/home/hadoop/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)


error_date

3.2 链接经理与客户信息

joint_cm_cl = pd.merge(left=cm_df_location, right=cl_df_flaged, left_on='staff_id', right_on='loan_staff_id', how='left')
joint_cm_cl.tail()
cl_df_flaged['loan_staff_id'].dtypes
dtype('int64')
cm_df_location['staff_id'].dtypes
dtype('int64')
cm_df_location.sort_values('staff_id').tail()
cl_df_flaged.sort_values('loan_staff_id').tail()
joint_cm_cl.count()
staff_id             740250
staff_name           740250
user_id_card_edit    740250
id_county            740214
id_province          740214
id_city              740214
id_x                 740250
role_id              740250
presona_id           740250
status_m             740250
city_number          740250
num                  740250
xzqhdm_name_x        314714
xzqhdm_name_y        461990
xzqhdm_name          362371
sex                  740214
age                  740122
id_y                 690472
id_num               690472
account_id           690472
loan_staff_id        690472
education            690472
child_sum            690359
is_car               690472
account_number       690472
zs_money             690472
zipCode              221233
status               690472
amt                  690472
loan_type            690472
cd                   687819
year                 690472
intopieces_date      690472
count                690472
total_account        690472
avg_month_men_a      690472
repay_num            690472
already_num          690472
user_phone           690472
presona_pid          689456
flag                 236757
dtype: int64
cl_df_flaged.count()
id                 771956
id_num             771956
account_id         771956
loan_staff_id      771956
education          771956
child_sum          771806
is_car             771956
account_number     771956
zs_money           771956
zipCode            254976
status             771956
amt                771956
loan_type          771956
cd                 768909
year               771956
intopieces_date    771956
count              771956
total_account      771956
avg_month_men_a    771956
repay_num          771956
already_num        771956
user_phone         752550
presona_pid        751350
flag               271373
dtype: int64
joint_cm_cl.groupby('flag').agg('count')
staff_id staff_name user_id_card_edit id_county id_province id_city id_x role_id presona_id status_m ... cd year intopieces_date count total_account avg_month_men_a repay_num already_num user_phone presona_pid
flag
loan_fraud 64186 64186 64186 64186 64186 64186 64186 64186 64186 64186 ... 64186 64186 64186 64186 64186 64186 64186 64186 64186 64104
loan_normal 172571 172571 172571 172571 172571 172571 172571 172571 172571 172571 ... 172569 172571 172571 172571 172571 172571 172571 172571 172571 172352

2 rows × 40 columns

joint_cm_cl.ix[584100:584129,'flag'].notnull()
584100     True
584101     True
584102    False
584103    False
584104     True
584105    False
584106     True
584107    False
584108    False
584109    False
584110     True
584111    False
584112    False
584113     True
584114    False
584115    False
584116    False
584117    False
584118    False
584119    False
584120    False
584121    False
584122     True
584123    False
584124     True
584125    False
584126     True
584127    False
584128     True
584129     True
Name: flag, dtype: bool

3.3 选取感兴趣数据一

feature_use1 = ['staff_id','id_province', 'sex', 'status_m', 'age', 'num', 'presona_id', 'city_number','flag']
dataset_use1 = joint_cm_cl.loc[:, feature_use1]
dataset_use1.info()

Int64Index: 740250 entries, 0 to 740249
Data columns (total 9 columns):
staff_id       740250 non-null int64
id_province    740214 non-null object
sex            740214 non-null float64
status_m       740250 non-null int64
age            740122 non-null float64
num            740250 non-null int64
presona_id     740250 non-null int64
city_number    740250 non-null int64
flag           236757 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 56.5+ MB
dataset_use1.describe()
/home/hadoop/anaconda2/lib/python2.7/site-packages/numpy/lib/function_base.py:4291: RuntimeWarning: Invalid value encountered in percentile
  interpolation=interpolation)
staff_id sex status_m age num presona_id city_number
count 740250.000000 740214.000000 740250.000000 740122.000000 740250.000000 740250.000000 740250.0
mean 206828.474852 0.578088 1.715630 28.350092 -0.000009 3265.831038 0.0
std 49171.210610 0.493865 0.451114 4.882471 0.018777 2031.866055 0.0
min 218.000000 0.000000 1.000000 18.000000 -4.000000 43.000000 0.0
25% 184968.000000 NaN 1.000000 NaN 0.000000 1676.000000 0.0
50% 214036.000000 NaN 2.000000 NaN 0.000000 2457.000000 0.0
75% 238753.000000 NaN 2.000000 NaN 0.000000 5110.000000 0.0
max 294921.000000 1.000000 2.000000 1716.000000 4.000000 9002.000000 0.0

经验证,其中sex=NaN的客户经理,溯源其身份证,通过函数验证身份证信息属假的,故考虑去除sex=NaN的客户经理信息(相应的flag也是NaN)

# Drop observations whose sex is NaN (their ID numbers failed validation,
# see the note above this cell).
dataset_use1 = dataset_use1[dataset_use1['sex'].notnull()]
# Drop observations whose flag is NaN — these likely arose because cd or
# already_num was None when the flag was generated.
dataset_use1 = dataset_use1[dataset_use1['flag'].notnull()]
# Drop observations whose age is NaN.
dataset_use1[dataset_use1['age'].isnull()]
dataset_use1 = dataset_use1[dataset_use1['age'].notnull()]
dataset_use1[dataset_use1['flag'].isnull()].groupby('staff_id').count()['sex']
Series([], Name: sex, dtype: int64)
# 计算客户经理对应的客户的违约情况,0-1,无违约,100%违约。获取客户经理对应的客户数量
def groupby_calcu(data_df, goal_id='staff_id', flag_id='flag'):
    whole_num = data_df.groupby(goal_id).count()['sex']
    whole_num.rename('whole_count', inplace=True)
    
    loan_fraud_num = data_df[data_df[flag_id]=='loan_fraud'].groupby(goal_id).count()['sex']
    loan_fraud_num.rename('fraud_count', inplace=True)
    
    tmp_df = pd.concat([whole_num, loan_fraud_num], axis=1, join='outer')
    tmp_df['fraud_count'] = tmp_df['fraud_count'].fillna(value=0, inplace=False)
    
    # 类型转换,减少内存
    tmp_df['whole_count'] = tmp_df['whole_count'].astype('int32')
    tmp_df['fraud_count'] = tmp_df['fraud_count'].astype('float32')
    
    flag_new_name = '%s_perc' %(flag_id)
    
    # 方式1:突破内存限制
    for i in tmp_df.index.values:
        tmp_df.ix[i, flag_new_name] = (tmp_df.ix[i, 'fraud_count']/tmp_df.ix[i, 'whole_count']).astype('float32')

    # 方式2:受到内存限制,报错!!!死机!!!——不推荐!!!
    # np.float32(wf_df['f_count'][:100000]/wf_df['whole_count'][:100000])
    
    df = pd.DataFrame({goal_id:tmp_df.index, flag_new_name:tmp_df[flag_new_name], 'num_client':tmp_df['whole_count']}, columns=[goal_id, flag_new_name, 'num_client'])

    unique_data_df = data_df.drop_duplicates(goal_id).sort_values(goal_id)
    unique_data_df_final = pd.merge(left=unique_data_df, right=df, left_on=goal_id, right_on=goal_id, how='left')
    
    whole_num, loan_fraud_num, tmp_df, df, unique_data_df = None, None, None, None, None
    del whole_num, loan_fraud_num, tmp_df, df, unique_data_df
    
    return unique_data_df_final
 
# NOTE(fix): `.ix[:,]` was a no-op whole-frame selection and .ix is gone from
# modern pandas — pass the frame directly (groupby_calcu does not mutate it).
dataset_use2 = groupby_calcu(dataset_use1)
dataset_use2.head()
staff_id id_province sex status_m age num presona_id city_number flag flag_perc num_client
0 4735 350000 0.0 2 28.0 0 112 0 loan_fraud 1.0 1
1 4857 620000 0.0 2 29.0 0 43 0 loan_fraud 1.0 2
2 5365 340000 1.0 2 32.0 0 141 0 loan_normal 0.0 2
3 5373 410000 1.0 2 28.0 0 142 0 loan_normal 0.0 2
4 5910 410000 0.0 2 25.0 0 1110 0 loan_normal 0.5 4
# w_num1 = dataset_use1.groupby('staff_id').count()['sex']
# w_num1 = w_num1.rename('whole_count')
# f_num1 = dataset_use1[dataset_use1['flag']=='fraud_count'].groupby('staff_id').count()['sex']
# f_num1 = f_num1.rename('f_count')

# wf_df = pd.concat([w_num1, f_num1], axis=1, join='outer')
# wf_df['f_count'] = wf_df['f_count'].fillna(value=int(0), inplace=False)

# wf_df['whole_count'] = wf_df['whole_count'].astype('int32')
# wf_df['f_count'] = wf_df['f_count'].astype('float32')
# wf_df.dtypes
dataset_use2.describe()
staff_id sex status_m age num presona_id city_number flag_perc num_client
count 39374.000000 39374.000000 39374.000000 39374.000000 39374.000000 39374.000000 39374.0 39374.000000 39374.000000
mean 219081.539468 0.616727 1.799055 27.900797 0.000000 3240.945421 0.0 0.268053 6.012851
std 42961.619859 0.486190 0.400712 4.126361 0.014254 2052.084694 0.0 0.313801 7.872544
min 4735.000000 0.000000 1.000000 19.000000 -1.000000 43.000000 0.0 0.000000 1.000000
25% 196255.250000 0.000000 2.000000 25.000000 0.000000 1676.000000 0.0 0.000000 1.000000
50% 226812.500000 1.000000 2.000000 28.000000 0.000000 2449.000000 0.0 0.181818 3.000000
75% 249347.750000 1.000000 2.000000 30.000000 0.000000 4961.000000 0.0 0.444444 7.000000
max 286704.000000 1.000000 2.000000 52.000000 2.000000 9002.000000 0.0 1.000000 103.000000
# flagd的处理
def flag_pd(x):
    """Encode the flag label as a binary int: 1 for 'loan_normal', 0 otherwise."""
    return 1 if x == 'loan_normal' else 0

dataset_use2['flag_01'] = dataset_use2['flag'].apply(flag_pd)

3.3选取感兴趣的字段二——为建模准备

dataset_use3 = dataset_use2[['sex','age','num_client', 'status_m', 'presona_id', 'id_province', 'flag_perc']].reset_index()
# Binarize flag_perc at the 0.5 threshold.
# NOTE(fix): the original assigned through a boolean-mask slice (chained
# indexing), which wrote to a temporary copy — hence the SettingWithCopyWarning
# in the pasted output — and left the frame unchanged. `.loc[mask, col] = v`
# writes into the frame itself.
dataset_use3.loc[dataset_use3['flag_perc'] <= 0.5, 'flag_perc'] = 0
dataset_use3.loc[dataset_use3['flag_perc'] > 0.5, 'flag_perc'] = 1
/home/hadoop/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/hadoop/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
dataset_use3.dtypes, dataset_use3.head()
(index            int64
 sex            float64
 age            float64
 num_client       int32
 status_m         int64
 presona_id       int64
 id_province     object
 flag_perc      float32
 dtype: object,
    index  sex   age  num_client  status_m  presona_id id_province  flag_perc
 0      0  0.0  28.0           1         2         112      350000        1.0
 1      1  0.0  29.0           2         2          43      620000        1.0
 2      2  1.0  32.0           2         2         141      340000        0.0
 3      3  1.0  28.0           2         2         142      410000        0.0
 4      4  0.0  25.0           4         2        1110      410000        0.5)
dataset_use3['sex'] = dataset_use3['sex'].astype(int).astype(str)
dataset_use3['age'] = dataset_use3['age'].astype(int)
dataset_use3['status_m'] = dataset_use3['status_m'].astype(int).astype(str)
dataset_use3['presona_id'] = dataset_use3['presona_id'].astype(str)
dataset_use3['id_province'] = dataset_use3['id_province'].astype(int).astype(str)
dataset_use3['flag_perc'] = dataset_use3['flag_perc'].astype(int)

data_tmp_onehot = pd.get_dummies(dataset_use3[['sex','age', 'num_client', 'status_m', 'id_province']])
data_tmp_onehot['flag_perc'] = dataset_use3['flag_perc']
data_tmp_onehot.info()

RangeIndex: 39374 entries, 0 to 39373
Data columns (total 38 columns):
age                   39374 non-null int64
num_client            39374 non-null int32
sex_0                 39374 non-null float64
sex_1                 39374 non-null float64
status_m_1            39374 non-null float64
status_m_2            39374 non-null float64
id_province_110000    39374 non-null float64
id_province_120000    39374 non-null float64
id_province_130000    39374 non-null float64
id_province_140000    39374 non-null float64
id_province_150000    39374 non-null float64
id_province_210000    39374 non-null float64
id_province_220000    39374 non-null float64
id_province_230000    39374 non-null float64
id_province_310000    39374 non-null float64
id_province_320000    39374 non-null float64
id_province_330000    39374 non-null float64
id_province_340000    39374 non-null float64
id_province_350000    39374 non-null float64
id_province_360000    39374 non-null float64
id_province_370000    39374 non-null float64
id_province_400000    39374 non-null float64
id_province_410000    39374 non-null float64
id_province_420000    39374 non-null float64
id_province_430000    39374 non-null float64
id_province_440000    39374 non-null float64
id_province_450000    39374 non-null float64
id_province_460000    39374 non-null float64
id_province_500000    39374 non-null float64
id_province_510000    39374 non-null float64
id_province_520000    39374 non-null float64
id_province_530000    39374 non-null float64
id_province_610000    39374 non-null float64
id_province_620000    39374 non-null float64
id_province_630000    39374 non-null float64
id_province_640000    39374 non-null float64
id_province_650000    39374 non-null float64
flag_perc             39374 non-null int64
dtypes: float64(35), int32(1), int64(2)
memory usage: 11.3 MB

3.4绘图与描述统计分析

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib notebook

3.4.1单变量的分布情况

flg = plt.figure()

你可能感兴趣的:(python,数据科学项目)