本地处理
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
import re
from pyspark.sql.types import *
import datetime
import pandas as pd
# Build a local-mode Spark session for this notebook.
# SparkConf was never imported above (only SparkContext was), so the
# original first line raised NameError; import it here before use.
from pyspark import SparkConf

conf = SparkConf().setAppName("miniProject").setMaster("local")
# getOrCreate returns the already-running context if the kernel has one.
sc1 = SparkContext.getOrCreate(conf)
spark = SparkSession(sc1)
1. 客户经理信息简单查询
# Load the manager table, then drop its first and last columns by position.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` is the
# positional equivalent of the integer slice that was used here.
cm_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/secret.csv', sep=',', encoding='utf-8')
cm_df = cm_df.iloc[:, 1:-1]
cm_df.info()
RangeIndex: 104467 entries, 0 to 104466
Data columns (total 12 columns):
staff_id 104467 non-null int64
staff_name 104467 non-null object
user_id_card_edit 104467 non-null object
id_county 104431 non-null object
id_province 104431 non-null object
id_city 104431 non-null object
id 104467 non-null int64
role_id 104467 non-null int64
presona_id 104467 non-null int64
status 104467 non-null int64
city_number 104467 non-null int64
num 104467 non-null int64
dtypes: int64(7), object(5)
memory usage: 9.6+ MB
# Summary statistics (count, mean, std, quantiles) of the numeric columns.
cm_df.describe()
|
staff_id |
id |
role_id |
presona_id |
status |
city_number |
num |
count |
104467.000000 |
104467.000000 |
104467.000000 |
104467.000000 |
104467.000000 |
104467.0 |
104467.000000 |
mean |
191295.747298 |
191295.747298 |
58.994228 |
3135.484143 |
1.857151 |
0.0 |
-0.000029 |
std |
78064.637688 |
78064.637688 |
0.400920 |
2029.885308 |
0.349920 |
0.0 |
0.031703 |
min |
218.000000 |
218.000000 |
0.000000 |
43.000000 |
1.000000 |
0.0 |
-4.000000 |
25% |
162123.500000 |
162123.500000 |
59.000000 |
1603.000000 |
2.000000 |
0.0 |
0.000000 |
50% |
206533.000000 |
206533.000000 |
59.000000 |
2374.000000 |
2.000000 |
0.0 |
0.000000 |
75% |
247621.500000 |
247621.500000 |
59.000000 |
4828.000000 |
2.000000 |
0.0 |
0.000000 |
max |
294921.000000 |
294921.000000 |
155.000000 |
9002.000000 |
2.000000 |
0.0 |
4.000000 |
# Load the region code -> region name lookup table (columns `xzqhdm`,
# `xzqhdm_name`) and inspect its first rows and dtypes.
ctcode_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/xz.csv', sep=',', encoding='utf-8')
ctcode_df.head()
ctcode_df.info()
RangeIndex: 3219 entries, 0 to 3218
Data columns (total 2 columns):
xzqhdm 3219 non-null int64
xzqhdm_name 3219 non-null object
dtypes: int64(1), object(1)
memory usage: 50.4+ KB
# Cast the region code from int64 to str -- presumably so it can match the
# object-typed id_county / id_province / id_city columns in the merges below.
ctcode_df['xzqhdm'] = ctcode_df['xzqhdm'].astype(str)
ctcode_df.info()
RangeIndex: 3219 entries, 0 to 3218
Data columns (total 2 columns):
xzqhdm 3219 non-null object
xzqhdm_name 3219 non-null object
dtypes: object(2)
memory usage: 50.4+ KB
# Attach region names by left-joining the lookup table three times.
# Because the lookup columns repeat, pandas suffixes them by merge order:
#   xzqhdm_name_x -> name matched on id_county   (first merge)
#   xzqhdm_name_y -> name matched on id_province (second merge)
#   xzqhdm_name   -> name matched on id_city     (third merge, no clash left)
tmp_county = pd.merge(left=cm_df, right=ctcode_df, left_on='id_county', right_on='xzqhdm', how='left')
tmp_province = pd.merge(left=tmp_county, right=ctcode_df, left_on='id_province', right_on='xzqhdm', how='left')
tmp_city = pd.merge(left=tmp_province, right=ctcode_df, left_on='id_city', right_on='xzqhdm', how='left')
# The three copies of the join key are no longer needed.
tmp_city.drop(['xzqhdm_x','xzqhdm_y','xzqhdm'], axis=1, inplace=True)
tmp_city.head()
# Non-null count per column; shows how many rows found a region name.
tmp_city.count()
staff_id 104467
staff_name 104467
user_id_card_edit 104467
id_county 104431
id_province 104431
id_city 104431
id 104467
role_id 104467
presona_id 104467
status 104467
city_number 104467
num 104467
xzqhdm_name_x 45367
xzqhdm_name_y 65507
xzqhdm_name 51701
dtype: int64
# Number of managers per province name (`xzqhdm_name_y`).
tmp_city.groupby(['xzqhdm_name_y'])['staff_id'].count()
xzqhdm_name_y
上海市 240
云南省 1699
内蒙古自治区 1731
北京市 79
吉林省 2200
四川省 4520
天津市 274
宁夏回族自治区 369
安徽省 3906
山东省 5584
山西省 1285
广东省 3889
广西壮族自治区 1823
新疆维吾尔自治区 55
江苏省 4799
江西省 2386
河北省 3110
河南省 4545
浙江省 2202
海南省 337
湖北省 3596
湖南省 2706
甘肃省 1031
福建省 3204
西藏自治区 2
贵州省 1230
辽宁省 2562
重庆市 1347
陕西省 1929
青海省 132
黑龙江省 2735
Name: staff_id, dtype: int64
# Managers per region, largest first.  The column suffixes follow the merge
# order that built tmp_city: `xzqhdm_name_x` holds the county name (merge on
# id_county), `xzqhdm_name_y` the province name (merge on id_province) and the
# unsuffixed `xzqhdm_name` the city name (merge on id_city).
# The original code grouped the city series by the county column and the
# county series by the city column; they are swapped back here.
dis_ct_prov = tmp_city.groupby(['xzqhdm_name_y'])['staff_id'].count().sort_values(ascending=False)
dis_ct_city = tmp_city.groupby(['xzqhdm_name'])['staff_id'].count().sort_values(ascending=False)
dis_ct_county = tmp_city.groupby(['xzqhdm_name_x'])['staff_id'].count().sort_values(ascending=False)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
from pylab import *
# Use the SimHei font for sans-serif text (presumably so the Chinese region
# names render in the plots) and turn off the Unicode minus glyph.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
# Bar chart of manager counts per province, then the ten largest values.
fig = plt.figure()
dis_ct_prov.plot(kind='bar', )
plt.title('province')
plt.show()
dis_ct_prov[:10]
xzqhdm_name_y
山东省 5584
江苏省 4799
河南省 4545
四川省 4520
安徽省 3906
广东省 3889
湖北省 3596
福建省 3204
河北省 3110
黑龙江省 2735
Name: staff_id, dtype: int64
# Bar chart of the `dis_ct_city` series, then its ten largest values.
fig2 = plt.figure()
dis_ct_city.plot(kind='bar', )
plt.title('city')
plt.show()
dis_ct_city[:10]
xzqhdm_name_x
市中区 241
惠安县 187
睢宁县 177
宁海县 164
沭阳县 161
灌云县 159
兴化市 148
仙游县 142
临泉县 135
沛县 134
Name: staff_id, dtype: int64
# Bar chart of the `dis_ct_county` series, then its ten largest values.
fig3 = plt.figure()
dis_ct_county.plot(kind='bar', )
# NOTE(review): this title repeats the previous figure's 'city' although the
# series plotted is `dis_ct_county` -- confirm which label is intended.
plt.title('city')
plt.show()
dis_ct_county[:10]
xzqhdm_name
徐州市 892
阜阳市 618
泉州市 604
漳州市 566
潍坊市 556
淮安市 526
盐城市 521
湛江市 509
南充市 508
宿迁市 505
Name: staff_id, dtype: int64
# Number of managers per status value.  The original `count(staff_id)` passed
# an undefined bare name and raised NameError; select the column, then count.
tmp_city.groupby('status')['staff_id'].count()
2. 客户信息处理
import pandas as pd
import numpy as np
def fraud_normal(already_num, cd):
    """Label one loan record from its `cd` and `already_num` values.

    Returns 'loan_fraud' when ``cd >= 2``.  Otherwise returns 'loan_normal'
    when ``already_num > 4`` and 'drop' for everything else.
    """
    if cd >= 2:
        return 'loan_fraud'
    if already_num > 4:
        return 'loan_normal'
    return 'drop'
# Load the client table, then drop its first and last columns by position.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.iloc` is the
# positional equivalent of the integer slice that was used here.
cl_df = pd.read_csv('file:///home/hadoop/xxx/project_0509_khjl/datasource/client.csv', sep=',', encoding='utf-8')
cl_df = cl_df.iloc[:, 1:-1]
# Smallest (cd, already_num) pairs, to eyeball the low end of both columns.
cl_df[['cd', 'already_num']].sort_values(['cd', 'already_num']).head()
|
cd |
already_num |
160569 |
-8.0 |
17 |
114941 |
-3.0 |
10 |
1041 |
0.0 |
0 |
1176 |
0.0 |
0 |
2354 |
0.0 |
0 |
# Peek at the rows with loan_type == 2.
cl_df[cl_df['loan_type']==2].head()
# Label only the loan_type == 2 rows.  The assignment aligns on the index, so
# every other row gets NaN in `flag`.
cl_df['flag'] = cl_df[cl_df['loan_type']==2].apply(lambda x: fraud_normal(x['already_num'],x['cd']), axis=1)
# NaN != 'drop' evaluates to True, so the unlabelled (NaN) rows are kept here
# together with the 'loan_fraud' / 'loan_normal' ones.
cl_df_flaged=cl_df[cl_df['flag']!='drop']
cl_df_flaged.columns, cl_df_flaged.count()
(Index([ u'id', u'id_num', u'account_id',
u'loan_staff_id', u'education', u'child_sum',
u'is_car', u'account_number', u'zs_money',
u'zipCode', u'status', u'amt',
u'loan_type', u'cd', u'year',
u'intopieces_date', u'count', u'total_account',
u'avg_month_men_a', u'repay_num', u'already_num',
u'user_phone', u'presona_pid', u'flag'],
dtype='object'), id 771956
id_num 771956
account_id 771956
loan_staff_id 771956
education 771956
child_sum 771806
is_car 771956
account_number 771956
zs_money 771956
zipCode 254976
status 771956
amt 771956
loan_type 771956
cd 768909
year 771956
intopieces_date 771956
count 771956
total_account 771956
avg_month_men_a 771956
repay_num 771956
already_num 771956
user_phone 752550
presona_pid 751350
flag 271373
dtype: int64)
3. 链接经理和客户
def et_id_sex(x):
    """Derive the sex flag encoded in an ID-card number.

    For an 18-character number the 17th character (index 16) is read; for a
    15-character number the last character is read.  (GB 11643-1999 encodes
    sex in that digit: odd for male, even for female.)

    :param x: ID-card number as a string.
    :return: 1 if the digit is odd, 0 if it is even, or None when `x` has
        no length, a length other than 15 or 18, or a non-digit character
        in the position read.
    """
    try:
        length = len(x)
    except TypeError:
        # Missing values (e.g. NaN floats) have no length; treat as invalid.
        return None

    if length == 18:
        digit = x[16]
    elif length == 15:
        digit = x[-1]
    else:
        return None

    try:
        return int(digit) % 2
    except (TypeError, ValueError):
        # Malformed number: report "unknown" like the length check does
        # instead of aborting a whole DataFrame.apply run.
        return None
import datetime
def et_id_age(x):
    """Derive an age from the birth date embedded in an ID-card number.

    :param x: ID-card number as a string.
    :return: whatever `calculate_age` returns for the embedded birth date,
        or None when `x` is neither 18 nor 15 characters long.
    """
    if len(x) == 18:
        # Characters 7-14 hold YYYYMMDD.  The original slice x[6:15] took one
        # character too many (harmless only because the extra one was ignored).
        return calculate_age(x[6:14])
    if len(x) == 15:
        # 15-digit numbers carry a two-digit year (YYMMDD).  Passing those six
        # characters straight through made calculate_age read 'YYMM' as the
        # year and an empty string as the day, so it either raised ValueError
        # or returned None.  First-generation numbers imply a 19YY birth year.
        return calculate_age('19' + x[6:12])
    return None
def calculate_age(input_born, today=(2018, 5, 10)):
    """Return the age, rounded to the nearest year, for a birth-date string.

    The base value is ``today.year - born.year``.  It is raised by one when
    today's day-of-year is 183 or more days after the birthday's day-of-year,
    and lowered by one when it is 183 or more days before it.

    :param input_born: birth date as a 'YYYYMMDD' string; only the first
        eight characters are read.
    :param today: reference date as a (year, month, day) sequence.
    :return: the age as an int; None when `input_born` is not a real
        calendar date, or when it is not strictly before `today` (in that
        case 'error_date' is printed as well).
    """
    try:
        born = datetime.date(
            int(input_born[0:4]), int(input_born[4:6]), int(input_born[6:8])
        )
    except (TypeError, ValueError):
        # Covers non-digit text, month/day out of range and impossible dates
        # such as Feb 30 or year 0, which previously escaped as ValueError.
        return None

    today = datetime.date(today[0], today[1], today[2])
    if today <= born:
        print('error_date')
        return None

    years = today.year - born.year
    # tm_yday is the 1-based day of the year, i.e. the same number the
    # original obtained by subtracting Dec 31 of the previous year, but it
    # also works for year 1 (there is no year 0 to subtract from).
    day_gap = today.timetuple().tm_yday - born.timetuple().tm_yday
    if day_gap >= 183:
        return years + 1
    if day_gap <= -183:
        return years - 1
    return years
3.1 客户经理数据读入与处理
# Reload the region lookup and the manager table from the datasource folder.
ctcode_df = pd.read_csv('/home/hadoop/xxx/project_0509_khjl/datasource/xz.csv', sep=',', encoding='utf-8')
ctcode_df['xzqhdm'] = ctcode_df['xzqhdm'].astype(str)
cm_df = pd.read_csv('/home/hadoop/xxx/project_0509_khjl/datasource/manager.csv', sep=',', encoding='utf-8')
# Drop the first and last columns by position.  `.ix` was removed in pandas
# 1.0; `.iloc` is the positional equivalent of this integer slice.
cm_df = cm_df.iloc[:, 1:-1]
# Attach region names; the suffixes follow the merge order
# (_x: matched on id_county, _y: on id_province, none: on id_city).
tmp_county = pd.merge(left=cm_df, right=ctcode_df, left_on='id_county', right_on='xzqhdm', how='left')
tmp_province = pd.merge(left=tmp_county, right=ctcode_df, left_on='id_province', right_on='xzqhdm', how='left')
tmp_city = pd.merge(left=tmp_province, right=ctcode_df, left_on='id_city', right_on='xzqhdm', how='left')
# Derive sex and age from the ID-card number.
tmp_city['sex'] = tmp_city['user_id_card_edit'].apply(et_id_sex)
tmp_city['age'] = tmp_city['user_id_card_edit'].apply(et_id_age)
# The client table also has a `status` column; rename the manager one so the
# two stay distinguishable after the join.
tmp_city.rename(columns={u'status':'status_m'}, inplace=True)
cm_df_location = tmp_city.drop(['xzqhdm_x','xzqhdm_y','xzqhdm'], axis=1, inplace=False)
/home/hadoop/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
error_date
3.2 链接经理与客户信息
# Left-join clients onto managers: every manager row is kept, and a manager
# without a matching client gets NaN in all client columns.
joint_cm_cl = pd.merge(left=cm_df_location, right=cl_df_flaged, left_on='staff_id', right_on='loan_staff_id', how='left')
joint_cm_cl.tail()
# Check the join-key dtype on the client side.
cl_df_flaged['loan_staff_id'].dtypes
dtype('int64')
# Check the join-key dtype on the manager side.
cm_df_location['staff_id'].dtypes
dtype('int64')
# Compare the largest key values on both sides, then count the non-null
# values per column of the joined frame.
cm_df_location.sort_values('staff_id').tail()
cl_df_flaged.sort_values('loan_staff_id').tail()
joint_cm_cl.count()
staff_id 740250
staff_name 740250
user_id_card_edit 740250
id_county 740214
id_province 740214
id_city 740214
id_x 740250
role_id 740250
presona_id 740250
status_m 740250
city_number 740250
num 740250
xzqhdm_name_x 314714
xzqhdm_name_y 461990
xzqhdm_name 362371
sex 740214
age 740122
id_y 690472
id_num 690472
account_id 690472
loan_staff_id 690472
education 690472
child_sum 690359
is_car 690472
account_number 690472
zs_money 690472
zipCode 221233
status 690472
amt 690472
loan_type 690472
cd 687819
year 690472
intopieces_date 690472
count 690472
total_account 690472
avg_month_men_a 690472
repay_num 690472
already_num 690472
user_phone 690472
presona_pid 689456
flag 236757
dtype: int64
# Non-null count per column of the client frame, for comparison with the join.
cl_df_flaged.count()
id 771956
id_num 771956
account_id 771956
loan_staff_id 771956
education 771956
child_sum 771806
is_car 771956
account_number 771956
zs_money 771956
zipCode 254976
status 771956
amt 771956
loan_type 771956
cd 768909
year 771956
intopieces_date 771956
count 771956
total_account 771956
avg_month_men_a 771956
repay_num 771956
already_num 771956
user_phone 752550
presona_pid 751350
flag 271373
dtype: int64
# Non-null count of every column per flag value (rows with NaN flag are
# excluded by groupby).
joint_cm_cl.groupby('flag').agg('count')
|
staff_id |
staff_name |
user_id_card_edit |
id_county |
id_province |
id_city |
id_x |
role_id |
presona_id |
status_m |
... |
cd |
year |
intopieces_date |
count |
total_account |
avg_month_men_a |
repay_num |
already_num |
user_phone |
presona_pid |
flag |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
loan_fraud |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
... |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64186 |
64104 |
loan_normal |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
... |
172569 |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
172571 |
172352 |
2 rows × 40 columns
# Spot-check which rows in a sample range carry a flag.  `.ix` was removed in
# pandas 1.0; on this integer-labelled index `.loc` selects the same
# inclusive label range 584100..584129.
joint_cm_cl.loc[584100:584129, 'flag'].notnull()
584100 True
584101 True
584102 False
584103 False
584104 True
584105 False
584106 True
584107 False
584108 False
584109 False
584110 True
584111 False
584112 False
584113 True
584114 False
584115 False
584116 False
584117 False
584118 False
584119 False
584120 False
584121 False
584122 True
584123 False
584124 True
584125 False
584126 True
584127 False
584128 True
584129 True
Name: flag, dtype: bool
3.3 选取感兴趣数据一
# Keep only the manager attributes of interest plus the client flag.
feature_use1 = ['staff_id','id_province', 'sex', 'status_m', 'age', 'num', 'presona_id', 'city_number','flag']
dataset_use1 = joint_cm_cl.loc[:, feature_use1]
dataset_use1.info()
Int64Index: 740250 entries, 0 to 740249
Data columns (total 9 columns):
staff_id 740250 non-null int64
id_province 740214 non-null object
sex 740214 non-null float64
status_m 740250 non-null int64
age 740122 non-null float64
num 740250 non-null int64
presona_id 740250 non-null int64
city_number 740250 non-null int64
flag 236757 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 56.5+ MB
# Summary statistics of the numeric columns.
dataset_use1.describe()
/home/hadoop/anaconda2/lib/python2.7/site-packages/numpy/lib/function_base.py:4291: RuntimeWarning: Invalid value encountered in percentile
interpolation=interpolation)
|
staff_id |
sex |
status_m |
age |
num |
presona_id |
city_number |
count |
740250.000000 |
740214.000000 |
740250.000000 |
740122.000000 |
740250.000000 |
740250.000000 |
740250.0 |
mean |
206828.474852 |
0.578088 |
1.715630 |
28.350092 |
-0.000009 |
3265.831038 |
0.0 |
std |
49171.210610 |
0.493865 |
0.451114 |
4.882471 |
0.018777 |
2031.866055 |
0.0 |
min |
218.000000 |
0.000000 |
1.000000 |
18.000000 |
-4.000000 |
43.000000 |
0.0 |
25% |
184968.000000 |
NaN |
1.000000 |
NaN |
0.000000 |
1676.000000 |
0.0 |
50% |
214036.000000 |
NaN |
2.000000 |
NaN |
0.000000 |
2457.000000 |
0.0 |
75% |
238753.000000 |
NaN |
2.000000 |
NaN |
0.000000 |
5110.000000 |
0.0 |
max |
294921.000000 |
1.000000 |
2.000000 |
1716.000000 |
4.000000 |
9002.000000 |
0.0 |
经验证,sex=NaN 的客户经理,其身份证号经函数校验为无效(属虚假信息),且其对应的 flag 也是 NaN,故去除这些 sex=NaN 的客户经理记录。
# Drop rows whose sex could not be derived from the ID-card number.
dataset_use1 = dataset_use1[dataset_use1['sex'].notnull()]
# Drop rows without a client flag.
dataset_use1 = dataset_use1[dataset_use1['flag'].notnull()]
# Show the rows still missing an age, then drop them too.
dataset_use1[dataset_use1['age'].isnull()]
dataset_use1 = dataset_use1[dataset_use1['age'].notnull()]
# Sanity check: no row without a flag should remain (expects an empty Series).
dataset_use1[dataset_use1['flag'].isnull()].groupby('staff_id').count()['sex']
Series([], Name: sex, dtype: int64)
def groupby_calcu(data_df, goal_id='staff_id', flag_id='flag'):
    """Collapse `data_df` to one row per `goal_id` and add fraud statistics.

    :param data_df: DataFrame holding at least the `goal_id` and `flag_id`
        columns.
    :param goal_id: name of the column to group by.
    :param flag_id: name of the column holding the 'loan_fraud' label.
    :return: a new DataFrame with the first row seen for every `goal_id`
        value, sorted by `goal_id`, plus two columns:
        ``'<flag_id>_perc'`` (float32), the share of the group's rows whose
        flag equals 'loan_fraud', and ``'num_client'`` (int32), the number
        of rows in the group.

    The other columns, including `flag_id` itself, come from whichever row
    of the group appeared first; they are not aggregated.

    Changes from the original: `.ix` (removed in pandas 1.0) is gone; the
    ratio is computed in one vectorised step instead of a per-row loop; rows
    are counted with ``size()`` rather than through non-null values of an
    unrelated 'sex' column; and the statistics frame carries `goal_id` as a
    plain column only, so the merge key is unambiguous.
    """
    flag_new_name = '%s_perc' % (flag_id)

    # Rows per group, and rows per group labelled as fraud.
    whole_num = data_df.groupby(goal_id).size()
    fraud_rows = data_df[data_df[flag_id] == 'loan_fraud']
    # Groups without any fraud row get an explicit 0 rather than NaN.
    fraud_num = fraud_rows.groupby(goal_id).size().reindex(whole_num.index, fill_value=0)

    whole_count = whole_num.values.astype('int32')
    fraud_count = fraud_num.values.astype('float32')
    stats = pd.DataFrame(
        {
            goal_id: whole_num.index.values,
            flag_new_name: (fraud_count / whole_count).astype('float32'),
            'num_client': whole_count,
        },
        columns=[goal_id, flag_new_name, 'num_client'],
    )

    unique_data_df = data_df.drop_duplicates(goal_id).sort_values(goal_id)
    return pd.merge(left=unique_data_df, right=stats, on=goal_id, how='left')
# One row per manager with the fraud ratio and client count attached.
# The original passed `dataset_use1.ix[:,]`, which only selected the whole
# frame; `.ix` was removed in pandas 1.0, so pass the frame itself.
dataset_use2 = groupby_calcu(dataset_use1)
dataset_use2.head()
|
staff_id |
id_province |
sex |
status_m |
age |
num |
presona_id |
city_number |
flag |
flag_perc |
num_client |
0 |
4735 |
350000 |
0.0 |
2 |
28.0 |
0 |
112 |
0 |
loan_fraud |
1.0 |
1 |
1 |
4857 |
620000 |
0.0 |
2 |
29.0 |
0 |
43 |
0 |
loan_fraud |
1.0 |
2 |
2 |
5365 |
340000 |
1.0 |
2 |
32.0 |
0 |
141 |
0 |
loan_normal |
0.0 |
2 |
3 |
5373 |
410000 |
1.0 |
2 |
28.0 |
0 |
142 |
0 |
loan_normal |
0.0 |
2 |
4 |
5910 |
410000 |
0.0 |
2 |
25.0 |
0 |
1110 |
0 |
loan_normal |
0.5 |
4 |
# Summary statistics of the per-manager frame.
dataset_use2.describe()
|
staff_id |
sex |
status_m |
age |
num |
presona_id |
city_number |
flag_perc |
num_client |
count |
39374.000000 |
39374.000000 |
39374.000000 |
39374.000000 |
39374.000000 |
39374.000000 |
39374.0 |
39374.000000 |
39374.000000 |
mean |
219081.539468 |
0.616727 |
1.799055 |
27.900797 |
0.000000 |
3240.945421 |
0.0 |
0.268053 |
6.012851 |
std |
42961.619859 |
0.486190 |
0.400712 |
4.126361 |
0.014254 |
2052.084694 |
0.0 |
0.313801 |
7.872544 |
min |
4735.000000 |
0.000000 |
1.000000 |
19.000000 |
-1.000000 |
43.000000 |
0.0 |
0.000000 |
1.000000 |
25% |
196255.250000 |
0.000000 |
2.000000 |
25.000000 |
0.000000 |
1676.000000 |
0.0 |
0.000000 |
1.000000 |
50% |
226812.500000 |
1.000000 |
2.000000 |
28.000000 |
0.000000 |
2449.000000 |
0.0 |
0.181818 |
3.000000 |
75% |
249347.750000 |
1.000000 |
2.000000 |
30.000000 |
0.000000 |
4961.000000 |
0.0 |
0.444444 |
7.000000 |
max |
286704.000000 |
1.000000 |
2.000000 |
52.000000 |
2.000000 |
9002.000000 |
0.0 |
1.000000 |
103.000000 |
def flag_pd(x):
    """Map a flag value to a binary label: 1 for 'loan_normal', else 0."""
    return 1 if x == 'loan_normal' else 0
# Binary version of the flag column: 1 for 'loan_normal', 0 otherwise.
dataset_use2['flag_01'] = dataset_use2['flag'].apply(flag_pd)
3.3选取感兴趣的字段二——为建模准备
# Modelling frame: selected manager features plus the fraud ratio.
dataset_use3 = dataset_use2[['sex','age','num_client', 'status_m', 'presona_id', 'id_province', 'flag_perc']].reset_index()
# Binarise the fraud ratio at 0.5.  The original used chained indexing
# (`df[mask]['flag_perc'] = ...`), which assigns into a temporary copy and
# leaves `dataset_use3` unchanged (hence the SettingWithCopyWarning).  A
# single `.loc[rows, column]` assignment writes to the frame itself.
dataset_use3.loc[dataset_use3["flag_perc"] <= 0.5, 'flag_perc'] = 0
dataset_use3.loc[dataset_use3["flag_perc"] > 0.5, 'flag_perc'] = 1
/home/hadoop/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
if __name__ == '__main__':
/home/hadoop/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
from ipykernel import kernelapp as app
# Inspect column dtypes and the first rows of the modelling frame.
dataset_use3.dtypes, dataset_use3.head()
(index int64
sex float64
age float64
num_client int32
status_m int64
presona_id int64
id_province object
flag_perc float32
dtype: object,
index sex age num_client status_m presona_id id_province flag_perc
0 0 0.0 28.0 1 2 112 350000 1.0
1 1 0.0 29.0 2 2 43 620000 1.0
2 2 1.0 32.0 2 2 141 340000 0.0
3 3 1.0 28.0 2 2 142 410000 0.0
4 4 0.0 25.0 4 2 1110 410000 0.5)
# Cast the categorical features to str so get_dummies one-hot encodes them;
# age is cast to int and stays numeric.
dataset_use3['sex'] = dataset_use3['sex'].astype(int).astype(str)
dataset_use3['age'] = dataset_use3['age'].astype(int)
dataset_use3['status_m'] = dataset_use3['status_m'].astype(int).astype(str)
dataset_use3['presona_id'] = dataset_use3['presona_id'].astype(str)
dataset_use3['id_province'] = dataset_use3['id_province'].astype(int).astype(str)
# NOTE: astype(int) truncates toward zero, so any flag_perc value that is
# still a fraction below 1 at this point becomes 0.
dataset_use3['flag_perc'] = dataset_use3['flag_perc'].astype(int)
# NOTE(review): presona_id is cast to str above but is not in the column list
# passed to get_dummies -- confirm whether leaving it out is intended.
data_tmp_onehot = pd.get_dummies(dataset_use3[['sex','age', 'num_client', 'status_m', 'id_province']])
# Re-attach the target column next to the encoded features.
data_tmp_onehot['flag_perc'] = dataset_use3['flag_perc']
data_tmp_onehot.info()
RangeIndex: 39374 entries, 0 to 39373
Data columns (total 38 columns):
age 39374 non-null int64
num_client 39374 non-null int32
sex_0 39374 non-null float64
sex_1 39374 non-null float64
status_m_1 39374 non-null float64
status_m_2 39374 non-null float64
id_province_110000 39374 non-null float64
id_province_120000 39374 non-null float64
id_province_130000 39374 non-null float64
id_province_140000 39374 non-null float64
id_province_150000 39374 non-null float64
id_province_210000 39374 non-null float64
id_province_220000 39374 non-null float64
id_province_230000 39374 non-null float64
id_province_310000 39374 non-null float64
id_province_320000 39374 non-null float64
id_province_330000 39374 non-null float64
id_province_340000 39374 non-null float64
id_province_350000 39374 non-null float64
id_province_360000 39374 non-null float64
id_province_370000 39374 non-null float64
id_province_400000 39374 non-null float64
id_province_410000 39374 non-null float64
id_province_420000 39374 non-null float64
id_province_430000 39374 non-null float64
id_province_440000 39374 non-null float64
id_province_450000 39374 non-null float64
id_province_460000 39374 non-null float64
id_province_500000 39374 non-null float64
id_province_510000 39374 non-null float64
id_province_520000 39374 non-null float64
id_province_530000 39374 non-null float64
id_province_610000 39374 non-null float64
id_province_620000 39374 non-null float64
id_province_630000 39374 non-null float64
id_province_640000 39374 non-null float64
id_province_650000 39374 non-null float64
flag_perc 39374 non-null int64
dtypes: float64(35), int32(1), int64(2)
memory usage: 11.3 MB
3.4绘图与描述统计分析
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib notebook
3.4.1单变量的分布情况
flg = plt.figure()