B产品的分析

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings

warnings.filterwarnings('ignore')

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
def diff_max_min(x):
    return x.max() - x.min()
path1 = 'B产品/'
path2 = '其他数据表/'
train = pd.read_csv(path1 + 'train_b.csv')
train.head(2)
id core_cust_id prod_code a2 a3 y
0 70e7f0465877447aa44c8d3120d0414c 9cb1f66b15 SSTJMZKF001 3 2021-07-01 0
1 b069c78512614452a7e815d231d8c580 9a5deb2794 SSTJMZKF001 3 2021-08-01 0
train.shape[0]
339516
test = pd.read_csv(path1 + 'test_b.csv')test.head(2)
id core_cust_id prod_code c2 c3
0 2360e70c585a4d8a922ad2590e0bf494 a030075b9 SSTJMZKF001 3 2021-10-01
1 8c565852b73b4a5fafc19b88a5ad8899 a030075b9 SSTJMZKF002 3 2021-10-01
test.rename(columns={'c2':'a2', 'c3':'a3'}, inplace=True)
test.shape[0]
51461

训练集339516, 测试集51461, 总共:390977

df

train['type'] = 'train'test['type'] = 'test'df = pd.concat([train, test])
df.head(2).append(df.tail(2))
id core_cust_id prod_code a2 a3 y type
0 70e7f0465877447aa44c8d3120d0414c 9cb1f66b15 SSTJMZKF001 3 2021-07-01 0.0 train
1 b069c78512614452a7e815d231d8c580 9a5deb2794 SSTJMZKF001 3 2021-08-01 0.0 train
51459 323ed9139ecb4a5286f61e58ce3e3bab 4d885e237d SSTJMZKF002 3 2021-10-01 NaN test
51460 0330a1471d6b4f868392eaa450110db3 7c195ef1fa SSTJMZKF001 3 2021-10-01 NaN test
df.shape[0]
390977

B产品表

h = pd.read_csv(path2 + 'h.csv')
h.head(2)
prod_code h1 h2 h3 h4 h5 h6 h7 h8
0 ZYGR2016286 0 1 1 2 0 0 NaN 209912
1 ZYGR2015103 0 1 1 2 1 0 NaN 209912
h_columns = ['prod_code', '计价类型', '周期类型', '模式', '风险等级', '是否允许变更分红方式', '产品品种', '模式2', '数据日期']h.columns = h_columnsh.head(2)
prod_code 计价类型 周期类型 模式 风险等级 是否允许变更分红方式 产品品种 模式2 数据日期
0 ZYGR2016286 0 1 1 2 0 0 NaN 209912
1 ZYGR2015103 0 1 1 2 1 0 NaN 209912
h.info()
RangeIndex: 2696 entries, 0 to 2695Data columns (total 9 columns): #   Column      Non-Null Count  Dtype  ---  ------      --------------  -----   0   prod_code   2696 non-null   object  1   计价类型        2696 non-null   int64   2   周期类型        2696 non-null   int64   3   模式          2696 non-null   int64   4   风险等级        2696 non-null   int64   5   是否允许变更分红方式  2696 non-null   int64   6   产品品种        2696 non-null   int64   7   模式2         65 non-null     float64 8   数据日期        2696 non-null   int64  dtypes: float64(1), int64(7), object(1)memory usage: 189.7+ KB
h.nunique()
prod_code     2696计价类型             2周期类型             2模式               2风险等级             3是否允许变更分红方式       2产品品种             1模式2              7数据日期             3dtype: int64
h.drop(['产品品种', '模式2'], axis=1, inplace=True)
l = pd.read_csv(path2 + 'l.csv')l_columns = ['prod_code', '募集方式', '管理方式', '业务模式', '收益特点', '期限', '投资模式', '数据日期']l.columns = l_columnsl.head(2)
prod_code 募集方式 管理方式 业务模式 收益特点 期限 投资模式 数据日期
0 YXFB2017031 0.0 2.0 3.0 1 NaN 0 209912
1 ZYGR2016014 1.0 2.0 1.0 2 4.0 1 209912
l.info()

RangeIndex: 3002 entries, 0 to 3001
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   prod_code  3002 non-null   object 
 1   募集方式       2502 non-null   float64
 2   管理方式       2502 non-null   float64
 3   业务模式       2502 non-null   float64
 4   收益特点       3002 non-null   int64  
 5   期限         567 non-null    float64
 6   投资模式       3002 non-null   int64  
 7   数据日期       3002 non-null   int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 187.8+ KB
l.nunique()
prod_code    3002
募集方式            2
管理方式            2
业务模式            2
收益特点            2
期限              5
投资模式            2
数据日期            3
dtype: int64
b_prod = pd.merge(h, l, on=['prod_code', '数据日期'], how='outer')b_prod.head(3)
prod_code 计价类型 周期类型 模式 风险等级 是否允许变更分红方式 数据日期 募集方式 管理方式 业务模式 收益特点 期限 投资模式
0 ZYGR2016286 0.0 1.0 1.0 2.0 0.0 209912 0.0 2.0 1.0 2 NaN 1
1 ZYGR2015103 0.0 1.0 1.0 2.0 1.0 209912 1.0 2.0 1.0 2 6.0 1
2 YQ2017167 0.0 1.0 1.0 1.0 0.0 209912 0.0 2.0 1.0 2 NaN 0
b_prod['数据日期'] = b_prod['数据日期'].astype('object')
b_prod.info()
Int64Index: 3002 entries, 0 to 3001Data columns (total 13 columns): #   Column      Non-Null Count  Dtype  ---  ------      --------------  -----   0   prod_code   3002 non-null   object  1   计价类型        2696 non-null   float64 2   周期类型        2696 non-null   float64 3   模式          2696 non-null   float64 4   风险等级        2696 non-null   float64 5   是否允许变更分红方式  2696 non-null   float64 6   数据日期        3002 non-null   object  7   募集方式        2502 non-null   float64 8   管理方式        2502 non-null   float64 9   业务模式        2502 non-null   float64 10  收益特点        3002 non-null   int64   11  期限          567 non-null    float64 12  投资模式        3002 non-null   int64  dtypes: float64(9), int64(2), object(2)memory usage: 328.3+ KB

df + 产品表

# df = pd.merge(df, b_prod, on='prod_code', how='left')# df.head(2)
# df.info()
# df.nunique()
gc.collect()
13611

B产品流水表

# o = pd.read_csv(path2 + 'o.csv', thousands=',')
o1 = pd.read_csv('o_bc.csv', thousands=',')
o = pd.read_csv('o.csv', thousands=',')
o = pd.concat([o, o1])
o_columns = ['流水号', '业务代码', '渠道标识', 'core_cust_id', 'prod_code', '净值', '申请金额', 
             '交易状态', '资金状态', '总金额', '超额管理费', 'trade_date']
o.columns = o_columns
o.head(2)
流水号 业务代码 渠道标识 core_cust_id prod_code 净值 申请金额 交易状态 资金状态 总金额 超额管理费 trade_date
0 ALC202107210000007437420 2 1 b1a66424c4 SSTJMZKF002 1.0085 75110.0 3 3 110.0 110.0 20210721
1 ALC202107140000007373365 2 1 e733784b55 SSTJMZKF002 1.0071 18110.0 3 3 110.0 110.0 20210714
o['trade_date'] = o['trade_date'].astype('str')
o['trade_date'].min(), o['trade_date'].max()
('20210104', '20211130')
o['datetime'] = pd.to_datetime(o['trade_date'],errors='coerce', format='%Y-%m-%d')   #先转化为datetime类型,默认format='%Y-%m-%d %H:%M:%S'# ss['dayofyear_s']  = ss['datetime'].dt.dayofyear.fillna(0).astype("int") #一年中的第n天o['record_month'] = o['datetime'].dt.month.fillna(0).astype('int')o['record_day'] = o['datetime'].dt.day.fillna(0).astype("int")# r['record_hour'] = r['datetime'].dt.hour.fillna(0).astype("int")    #转化提取小时# r['record_minute'] = r['datetime'].dt.minute.fillna(0).astype("int") #转化提取分钟# r['click_second'] = r['datetime'].dt.second.fillna(0).astype("int") #转化提取秒# df['date'] = df['datetime'].dt.date   #转化提取年-月-日# df['year'] =df['datetime'].dt.year.fillna(0).astype("int")   #转化提取年 ,#如果有NaN元素则默认转化float64型,要转换数据类型则需要先填充空值,在做数据类型转换o['record_dayofyear']  = o['datetime'].dt.dayofyear.fillna(0).astype("int") #一年中的第n天o['record_weekofyear'] = o['datetime'].dt.weekofyear.fillna(0).astype("int") #一年中的第n周o['record_weekday']    = o['datetime'].dt.weekday.fillna(0).astype("int") #周几,一周里的第几天,Monday=0, Sunday=6o['record_quarter']    = o['datetime'].dt.quarter.fillna(0).astype("int")  #季度o['record_is_wknd']    = o['datetime'].dt.dayofweek // 4                  #是否周末
o.head()
流水号 业务代码 渠道标识 core_cust_id prod_code 净值 申请金额 交易状态 资金状态 总金额 超额管理费 trade_date datetime record_month record_day record_dayofyear record_weekofyear record_weekday record_quarter record_is_wknd
0 ALC202107210000007437420 2 1 b1a66424c4 SSTJMZKF002 1.0085 75110.0 3 3 110.0 110.0 20210721 2021-07-21 7 21 202 29 2 3 0
1 ALC202107140000007373365 2 1 e733784b55 SSTJMZKF002 1.0071 18110.0 3 3 110.0 110.0 20210714 2021-07-14 7 14 195 28 2 3 0
2 LC2021022300000006329444 8 5 1ef76cc3c0 YZFB0032 -999.0000 110.0 3 6 110.0 110.0 20210223 2021-02-23 2 23 54 8 1 1 0
3 LC2021051300000006917384 8 5 72c91d39c4 201811140151 1.0000 110.0 3 6 110.0 110.0 20210513 2021-05-13 5 13 133 19 3 2 0
4 LC2021062400000007193297 10 5 72c91d39c4 DXTY0289 1.0000 110.0 3 0 110.0 110.0 20210624 2021-06-24 6 24 175 25 3 2 0
o.info()
Int64Index: 53839 entries, 0 to 11346Data columns (total 12 columns): #   Column        Non-Null Count  Dtype  ---  ------        --------------  -----   0   流水号           53839 non-null  object  1   业务代码          53839 non-null  int64   2   渠道标识          53839 non-null  int64   3   core_cust_id  53839 non-null  object  4   prod_code     53839 non-null  object  5   净值            53839 non-null  float64 6   申请金额          53839 non-null  float64 7   交易状态          53839 non-null  int64   8   资金状态          53839 non-null  int64   9   总金额           53839 non-null  float64 10  超额管理费         53839 non-null  float64 11  trade_date    53839 non-null  int64  dtypes: float64(4), int64(5), object(3)memory usage: 5.3+ MB
o.nunique()
流水号             53839业务代码                9渠道标识                4core_cust_id    15683prod_code         114净值                106申请金额              798交易状态                5资金状态                5总金额              1970超额管理费            1839trade_date        228dtype: int64
o.nunique()
流水号             42492业务代码                9渠道标识                4core_cust_id    13678prod_code         112净值                 87申请金额              741交易状态                5资金状态                5总金额              1123超额管理费             960trade_date        188dtype: int64

B产品毫无作用

df+产品流水:待进一步处理

#日期的年月#申请金额apply_amonto.rename(columns={'申请金额': 'apply_amt'}, inplace=True)#交易状态 + 资金状态 + 业务代码 + 渠道标识 排序组合o['deal_bus'] = o['交易状态'].astype('str') + o['业务代码'].astype('str')o['if_buy'] = o['deal_bus'].apply(lambda x: 1 if x=='32' else 0)o['deal_bus_channel'] = o['交易状态'].astype('str') + o['业务代码'].astype('str') + o['渠道标识'].astype('str')o['deal_bus_fund'] = o['交易状态'].astype('str') + o['业务代码'].astype('str') + o['资金状态'].astype('str')o['deal_bus_c_f'] = o['交易状态'].astype('str') + o['业务代码'].astype('str') + o['渠道标识'].astype('str') + o['资金状态'].astype('str')#p_record['deal_bus'] = p_record['交易状态'].astype('str') + p_record['业务代码'].astype('str')#整个月的平均值与最大值或最小值相比,如果等,那就是不买,如果不等,那就购买

o.head(1)

num_b = []nums_b = []for i in range(1, 12):    num = o[(o['record_month']==i) & (o['if_buy']==1)].shape[0]    num_b.append(num)    nums = o[(o['record_month']==i)].shape[0]    nums_b.append(nums)    print('{}月的购买数量为{}, 总共流水数为{}'.format(i, num, nums))
1月的购买数量为0, 总共流水数为14032月的购买数量为0, 总共流水数为12133月的购买数量为1276, 总共流水数为57304月的购买数量为4053, 总共流水数为74065月的购买数量为3060, 总共流水数为70096月的购买数量为2535, 总共流水数为52597月的购买数量为2688, 总共流水数为51588月的购买数量为3235, 总共流水数为47559月的购买数量为2877, 总共流水数为455910月的购买数量为3224, 总共流水数为556911月的购买数量为2654, 总共流水数为5778
x_b = [i for i in range(1, 12)]plt.plot(x_b[2:], num_b[2:])plt.plot(x_b[2:], nums_b[2:])
[]


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rE4kvdFe-1643361700218)(output_45_1.png)]

st = o[o['record_month']==8].groupby(['core_cust_id', 'prod_code'])['if_buy'].sum().reset_index()st['if_buy'].value_counts()
1     24180      9712      2723       464       195        37        19        113       115       1Name: if_buy, dtype: int64
2418+272+46+19+3+4
2762
o2 = o[['流水号', 'core_cust_id', 'prod_code', '净值', 'apply_amt', '总金额', '资金状态', 'record_month', 'trade_date', 'record_month', 'if_buy']]
o2.head(2)
流水号 core_cust_id prod_code 净值 apply_amt 总金额 资金状态 record_month trade_date record_month if_buy
0 ALC202107210000007437420 b1a66424c4 SSTJMZKF002 1.0085 75110.0 110.0 3 7 20210721 7 1
1 ALC202107140000007373365 e733784b55 SSTJMZKF002 1.0071 18110.0 110.0 3 7 20210714 7 1
# o2[(o2['record_month']==8) & (o2['if_buy']==1)][:50]o2.head(5)
流水号 core_cust_id prod_code 净值 apply_amt 总金额 资金状态 record_month trade_date record_month if_buy
0 ALC202107210000007437420 b1a66424c4 SSTJMZKF002 1.0085 75110.0 110.0 3 7 20210721 7 1
1 ALC202107140000007373365 e733784b55 SSTJMZKF002 1.0071 18110.0 110.0 3 7 20210714 7 1
2 LC2021022300000006329444 1ef76cc3c0 YZFB0032 -999.0000 110.0 110.0 6 2 20210223 2 0
3 LC2021051300000006917384 72c91d39c4 201811140151 1.0000 110.0 110.0 6 5 20210513 5 0
4 LC2021062400000007193297 72c91d39c4 DXTY0289 1.0000 110.0 110.0 0 6 20210624 6 0
o.sort_values(['core_cust_id', 'prod_code'], inplace=True)
o.reset_index(inplace=True)
del o['index']
o[(o['record_month']==8) & (o['if_buy']==1)].shape[0]
3235
oo = o[(o['record_month']==8) & (o['if_buy']==1)]
oo.drop_duplicates(['core_cust_id', 'prod_code'], inplace=True)
oo.shape[0]
2762
oo[oo['record_day']==30].shape[0]
149

用户对产品的交易次数

oo.drop_duplicates(['core_cust_id', 'prod_code', 'apply_amt', '净值']).shape[0]
2762
oo[:5]
流水号 业务代码 渠道标识 core_cust_id prod_code 净值 apply_amt 交易状态 资金状态 总金额 超额管理费 trade_date datetime record_month record_day record_dayofyear record_weekofyear record_weekday record_quarter record_is_wknd deal_bus if_buy deal_bus_channel deal_bus_fund deal_bus_c_f
43 ALC202108110000007585651 2 1 1087ee18a7 SSTJMZKF002 1.0117 142610.0 3 3 110.0 110.0 20210811 2021-08-11 8 11 223 32 2 3 0 32 1 321 323 3213
92 ALC202108300000007699259 2 1 1087ee2dc3 SSTJMZKF001 1.0208 150110.0 3 3 110.0 110.0 20210830 2021-08-30 8 30 242 35 0 3 0 32 1 321 323 3213
96 ALC202108240000007666267 2 1 1087ee2e87 SSTJMZKF001 1.0200 154610.0 3 3 110.0 110.0 20210824 2021-08-24 8 24 236 34 1 3 0 32 1 321 323 3213
135 ALC202108030000007537694 2 1 1087ee5402 SSTJMZKF001 1.0178 15110.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
151 ALC202108190000007632217 2 1 1087ee55fd SSTJMZKF001 1.0200 1350110.0 3 3 110.0 110.0 20210819 2021-08-19 8 19 231 33 3 3 0 32 1 321 323 3213
o[(o['record_month']==8) & (o['if_buy']==1)][:100]
流水号 业务代码 渠道标识 core_cust_id prod_code 净值 apply_amt 交易状态 资金状态 总金额 超额管理费 trade_date datetime record_month record_day record_dayofyear record_weekofyear record_weekday record_quarter record_is_wknd deal_bus if_buy deal_bus_channel deal_bus_fund deal_bus_c_f
43 ALC202108110000007585651 2 1 1087ee18a7 SSTJMZKF002 1.0117 142610.0 3 3 110.0 110.0 20210811 2021-08-11 8 11 223 32 2 3 0 32 1 321 323 3213
44 ALC202108180000007627651 2 1 1087ee18a7 SSTJMZKF002 1.0127 90110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
92 ALC202108300000007699259 2 1 1087ee2dc3 SSTJMZKF001 1.0208 150110.0 3 3 110.0 110.0 20210830 2021-08-30 8 30 242 35 0 3 0 32 1 321 323 3213
96 ALC202108240000007666267 2 1 1087ee2e87 SSTJMZKF001 1.0200 154610.0 3 3 110.0 110.0 20210824 2021-08-24 8 24 236 34 1 3 0 32 1 321 323 3213
135 ALC202108030000007537694 2 1 1087ee5402 SSTJMZKF001 1.0178 15110.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
151 ALC202108190000007632217 2 1 1087ee55fd SSTJMZKF001 1.0200 1350110.0 3 3 110.0 110.0 20210819 2021-08-19 8 19 231 33 3 3 0 32 1 321 323 3213
165 ALC202108040000007543279 2 1 1087ee578a SSTJMZKF001 1.0186 1399610.0 3 3 110.0 110.0 20210804 2021-08-04 8 4 216 31 2 3 0 32 1 321 323 3213
174 ALC202108060000007559516 2 1 1087ee578a SSTJMZKF002 1.0117 150110.0 3 3 110.0 110.0 20210806 2021-08-06 8 6 218 31 4 3 1 32 1 321 323 3213
178 ALC202108240000007666537 2 1 1087ee61b5 SSTJMZKF002 1.0132 15110.0 3 3 110.0 110.0 20210824 2021-08-24 8 24 236 34 1 3 0 32 1 321 323 3213
182 ALC202108120000007593777 2 1 1087ee62dc SSTJMZKF001 1.0194 45110.0 3 3 110.0 110.0 20210812 2021-08-12 8 12 224 32 3 3 0 32 1 321 323 3213
183 ALC202108160000007609755 2 1 1087ee62dc SSTJMZKF002 1.0127 45110.0 3 3 110.0 110.0 20210816 2021-08-16 8 16 228 33 0 3 0 32 1 321 323 3213
202 ALC202108300000007694893 2 1 1087ee6dd1 SSTJMZKF001 1.0208 300110.0 3 3 110.0 110.0 20210830 2021-08-30 8 30 242 35 0 3 0 32 1 321 323 3213
204 ALC202108310000007709314 2 1 1087ee6dd1 SSTJMZKF002 1.0140 150110.0 3 3 110.0 110.0 20210831 2021-08-31 8 31 243 35 1 3 0 32 1 321 323 3213
227 ALC202108030000007536410 2 1 1087ee7987 SSTJMZKF001 1.0178 315110.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
241 ALC202108180000007626876 2 1 1087ee8222 SSTJMZKF002 1.0127 15110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
251 ALC202108260000007678005 2 1 1087ee8541 SSTJMZKF002 1.0140 30110.0 3 3 110.0 110.0 20210826 2021-08-26 8 26 238 34 3 3 0 32 1 321 323 3213
258 ALC202108120000007594293 2 1 10ff23a3aa SSTJMZKF001 1.0194 123110.0 3 3 110.0 110.0 20210812 2021-08-12 8 12 224 32 3 3 0 32 1 321 323 3213
263 ALC202108020000007519975 2 1 10ff23a3aa SSTJMZKF002 1.0109 120110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
287 ALC202108270000007685686 2 2 10ff23b1ba SSTJMZKF002 1.0140 75110.0 3 3 110.0 110.0 20210827 2021-08-27 8 27 239 34 4 3 1 32 1 322 323 3223
323 ALC202108160000007609399 2 1 10ff23d864 SSTJMZKF001 1.0194 150110.0 3 3 110.0 110.0 20210816 2021-08-16 8 16 228 33 0 3 0 32 1 321 323 3213
327 ALC202108100000007579204 2 1 10ff23d869 SSTJMZKF001 1.0186 75110.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
330 ALC202108020000007517266 2 1 10ff23d869 SSTJMZKF001 1.0178 75110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
368 ALC202108200000007640207 2 1 10ff23f166 SSTJMZKF001 1.0200 150110.0 3 3 110.0 110.0 20210820 2021-08-20 8 20 232 33 4 3 1 32 1 321 323 3213
373 ALC202108030000007538763 2 1 10ff23f28f SSTJMZKF001 1.0178 15110.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
375 ALC202108030000007538770 2 1 10ff23f28f SSTJMZKF001 1.0178 7610.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
391 ALC202108170000007617318 2 1 10ff23f6dd SSTJMZKF001 1.0194 90110.0 3 3 110.0 110.0 20210817 2021-08-17 8 17 229 33 1 3 0 32 1 321 323 3213
395 ALC202108270000007682839 2 1 10ff23f6dd SSTJMZKF002 1.0140 120110.0 3 3 110.0 110.0 20210827 2021-08-27 8 27 239 34 4 3 1 32 1 321 323 3213
400 ALC202108030000007536210 2 1 10ff23f7a3 SSTJMZKF001 1.0178 58610.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
402 ALC202108180000007625004 2 1 10ff23f7a3 SSTJMZKF002 1.0127 184610.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
425 ALC202108110000007584513 2 1 10ff2407a7 SSTJMZKF002 1.0117 15110.0 3 3 110.0 110.0 20210811 2021-08-11 8 11 223 32 2 3 0 32 1 321 323 3213
428 ALC202108200000007635938 2 1 10ff2407a7 SSTJMZKF002 1.0132 85610.0 3 3 110.0 110.0 20210820 2021-08-20 8 20 232 33 4 3 1 32 1 321 323 3213
430 ALC202108100000007576554 2 1 10ff2407a7 SSTJMZKF002 1.0117 30110.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
433 ALC202108020000007494374 2 1 10ff240b90 SSTJMZKF002 1.0109 81110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
437 ALC202108270000007685958 2 1 10ff2410a3 SSTJMZKF002 1.0140 30110.0 3 3 110.0 110.0 20210827 2021-08-27 8 27 239 34 4 3 1 32 1 321 323 3213
442 ALC202108100000007576353 2 1 10ff24161f SSTJMZKF001 1.0186 37610.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
447 ALC202108100000007576360 2 1 10ff24161f SSTJMZKF002 1.0117 45110.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
562 ALC202108020000007520147 2 1 10ff244ada SSTJMZKF002 1.0109 16610.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
563 ALC202108020000007520133 2 1 10ff244ada SSTJMZKF002 1.0109 228110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
594 ALC202108040000007544729 2 1 10ff244d35 SSTJMZKF002 1.0109 120110.0 3 3 110.0 110.0 20210804 2021-08-04 8 4 216 31 2 3 0 32 1 321 323 3213
598 ALC202108230000007661626 2 1 10ff244f8c SSTJMZKF001 1.0200 495110.0 3 3 110.0 110.0 20210823 2021-08-23 8 23 235 34 0 3 0 32 1 321 323 3213
612 ALC202108060000007553782 2 1 10ff24562c SSTJMZKF002 1.0117 225110.0 3 3 110.0 110.0 20210806 2021-08-06 8 6 218 31 4 3 1 32 1 321 323 3213
626 ALC202108100000007576431 2 1 10ff2459b7 SSTJMZKF002 1.0117 150110.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
639 ALC202108230000007655132 2 1 10ff246e67 SSTJMZKF001 1.0200 90110.0 3 3 110.0 110.0 20210823 2021-08-23 8 23 235 34 0 3 0 32 1 321 323 3213
658 ALC202108200000007636530 2 1 10ff246ecc SSTJMZKF002 1.0132 27110.0 3 3 110.0 110.0 20210820 2021-08-20 8 20 232 33 4 3 1 32 1 321 323 3213
661 ALC202108160000007608465 2 1 10ff24724b SSTJMZKF001 1.0194 31610.0 3 3 110.0 110.0 20210816 2021-08-16 8 16 228 33 0 3 0 32 1 321 323 3213
663 ALC202108180000007629341 2 1 10ff247761 SSTJMZKF002 1.0127 15110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
665 ALC202108180000007629074 2 1 10ff247761 SSTJMZKF002 1.0127 120110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
678 ALC202108030000007536327 2 1 10ff247f99 SSTJMZKF001 1.0178 15110.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
703 ALC202108020000007522986 2 1 10ff24888f SSTJMZKF002 1.0109 75110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
707 ALC202108310000007706589 2 1 10ff2488f4 SSTJMZKF001 1.0208 225110.0 3 3 110.0 110.0 20210831 2021-08-31 8 31 243 35 1 3 0 32 1 321 323 3213
726 ALC202108030000007537438 2 1 10ff24895a SSTJMZKF001 1.0178 150110.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
768 ALC202108240000007667112 2 2 10ff2491f2 SSTJMZKF001 1.0200 150110.0 3 3 110.0 110.0 20210824 2021-08-24 8 24 236 34 1 3 0 32 1 322 323 3223
776 ALC202108060000007552881 2 1 10ff249643 SSTJMZKF001 1.0186 30110.0 3 3 110.0 110.0 20210806 2021-08-06 8 6 218 31 4 3 1 32 1 321 323 3213
788 ALC202108250000007672921 2 1 10ff24a190 SSTJMZKF001 1.0208 15110.0 3 3 110.0 110.0 20210825 2021-08-25 8 25 237 34 2 3 0 32 1 321 323 3213
833 ALC202108030000007537223 2 1 10ff24b51b SSTJMZKF001 1.0178 120110.0 3 3 110.0 110.0 20210803 2021-08-03 8 3 215 31 1 3 0 32 1 321 323 3213
864 ALC202108160000007612176 2 1 10ff24ba93 SSTJMZKF002 1.0127 150110.0 3 3 110.0 110.0 20210816 2021-08-16 8 16 228 33 0 3 0 32 1 321 323 3213
870 ALC202108200000007642125 2 1 10ff24bbc3 SSTJMZKF002 1.0132 135110.0 3 3 110.0 110.0 20210820 2021-08-20 8 20 232 33 4 3 1 32 1 321 323 3213
871 ALC202108190000007632855 2 1 10ff24bbc3 SSTJMZKF002 1.0132 15110.0 3 3 110.0 110.0 20210819 2021-08-19 8 19 231 33 3 3 0 32 1 321 323 3213
912 ALC202108180000007628292 2 1 10ff24d523 SSTJMZKF002 1.0127 150110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
915 ALC202108200000007641380 2 1 10ff24d718 SSTJMZKF002 1.0132 30110.0 3 3 110.0 110.0 20210820 2021-08-20 8 20 232 33 4 3 1 32 1 321 323 3213
916 ALC202108050000007549306 2 2 10ff24d9ce SSTJMZKF002 1.0117 150110.0 3 3 110.0 110.0 20210805 2021-08-05 8 5 217 31 3 3 0 32 1 322 323 3223
934 ALC202108020000007517268 2 1 10ff24ec2b SSTJMZKF002 1.0109 15110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
943 ALC202108250000007672948 2 2 10ff24f397 SSTJMZKF002 1.0132 150110.0 3 3 110.0 110.0 20210825 2021-08-25 8 25 237 34 2 3 0 32 1 322 323 3223
974 ALC202108040000007546238 2 1 10ff24fbca SSTJMZKF002 1.0109 30110.0 3 3 110.0 110.0 20210804 2021-08-04 8 4 216 31 2 3 0 32 1 321 323 3213
984 ALC202108040000007542238 2 1 10ff24fc96 SSTJMZKF002 1.0109 285110.0 3 3 110.0 110.0 20210804 2021-08-04 8 4 216 31 2 3 0 32 1 321 323 3213
985 ALC202108180000007629157 2 1 10ff24fc96 SSTJMZKF002 1.0127 135110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
1030 ALC202108310000007706732 2 1 10ff25097c SSTJMZKF001 1.0208 150110.0 3 3 110.0 110.0 20210831 2021-08-31 8 31 243 35 1 3 0 32 1 321 323 3213
1082 ALC202108170000007617418 2 1 10ff333597 SSTJMZKF001 1.0194 15110.0 3 3 110.0 110.0 20210817 2021-08-17 8 17 229 33 1 3 0 32 1 321 323 3213
1083 ALC202108180000007628072 2 1 10ff33372b SSTJMZKF001 1.0200 150110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
1097 ALC202108170000007618242 2 1 10ff335ab9 SSTJMZKF002 1.0127 240110.0 3 3 110.0 110.0 20210817 2021-08-17 8 17 229 33 1 3 0 32 1 321 323 3213
1117 ALC202108110000007587848 2 1 10ff3385aa SSTJMZKF002 1.0117 16610.0 3 3 110.0 110.0 20210811 2021-08-11 8 11 223 32 2 3 0 32 1 321 323 3213
1163 ALC202108100000007581893 2 1 10ff34318e SSTJMZKF001 1.0186 120110.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
1164 ALC202108050000007550230 2 1 10ff34318e SSTJMZKF001 1.0186 84110.0 3 3 110.0 110.0 20210805 2021-08-05 8 5 217 31 3 3 0 32 1 321 323 3213
1186 ALC202108230000007656206 2 1 10ff34525c SSTJMZKF001 1.0200 60110.0 3 3 110.0 110.0 20210823 2021-08-23 8 23 235 34 0 3 0 32 1 321 323 3213
1222 ALC202108120000007593807 2 1 12dbfa1167 SSTJMZKF002 1.0127 15110.0 3 3 110.0 110.0 20210812 2021-08-12 8 12 224 32 3 3 0 32 1 321 323 3213
1398 ALC202108300000007697805 2 2 12dbfa64f9 SSTJMZKF001 1.0208 45110.0 3 3 110.0 110.0 20210830 2021-08-30 8 30 242 35 0 3 0 32 1 322 323 3223
1405 ALC202108310000007701049 2 1 12dbfa681e SSTJMZKF002 1.0140 22610.0 3 3 110.0 110.0 20210831 2021-08-31 8 31 243 35 1 3 0 32 1 321 323 3213
1411 ALC202108310000007709598 2 1 12dbfa681e SSTJMZKF002 1.0140 15110.0 3 3 110.0 110.0 20210831 2021-08-31 8 31 243 35 1 3 0 32 1 321 323 3213
1416 ALC202108020000007531945 2 1 12dbfa6880 SSTJMZKF001 1.0178 75110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
1483 ALC202108020000007530552 2 1 13532fa758 SSTJMZKF002 1.0109 30110.0 3 3 110.0 110.0 20210802 2021-08-02 8 2 214 31 0 3 0 32 1 321 323 3213
1484 ALC202108110000007586048 2 1 13532faad9 SSTJMZKF002 1.0117 300110.0 3 3 110.0 110.0 20210811 2021-08-11 8 11 223 32 2 3 0 32 1 321 323 3213
1501 ALC202108270000007683188 2 1 13532fb886 SSTJMZKF002 1.0140 30110.0 3 3 110.0 110.0 20210827 2021-08-27 8 27 239 34 4 3 1 32 1 321 323 3213
1502 ALC202108300000007694268 2 1 13532fb886 SSTJMZKF002 1.0140 15110.0 3 3 110.0 110.0 20210830 2021-08-30 8 30 242 35 0 3 0 32 1 321 323 3213
1507 ALC202108040000007545078 2 1 13532fb886 SSTJMZKF002 1.0109 48110.0 3 3 110.0 110.0 20210804 2021-08-04 8 4 216 31 2 3 0 32 1 321 323 3213
1510 ALC202108040000007545374 2 1 13532fb886 SSTJMZKF002 1.0109 27110.0 3 3 110.0 110.0 20210804 2021-08-04 8 4 216 31 2 3 0 32 1 321 323 3213
1521 ALC202108050000007550087 2 1 13532fc054 SSTJMZKF001 1.0186 450110.0 3 3 110.0 110.0 20210805 2021-08-05 8 5 217 31 3 3 0 32 1 321 323 3213
1611 ALC202108310000007708497 2 1 13532fed49 SSTJMZKF001 1.0208 37610.0 3 3 110.0 110.0 20210831 2021-08-31 8 31 243 35 1 3 0 32 1 321 323 3213
1639 ALC202108300000007700121 2 1 13532ff706 SSTJMZKF001 1.0208 15110.0 3 3 110.0 110.0 20210830 2021-08-30 8 30 242 35 0 3 0 32 1 321 323 3213
1641 ALC202108240000007663925 2 1 13532ff706 SSTJMZKF001 1.0200 15110.0 3 3 110.0 110.0 20210824 2021-08-24 8 24 236 34 1 3 0 32 1 321 323 3213
1647 ALC202108040000007542107 2 1 13532ff706 SSTJMZKF002 1.0109 15110.0 3 3 110.0 110.0 20210804 2021-08-04 8 4 216 31 2 3 0 32 1 321 323 3213
1650 ALC202108110000007588997 2 1 13532ff706 SSTJMZKF002 1.0117 75110.0 3 3 110.0 110.0 20210811 2021-08-11 8 11 223 32 2 3 0 32 1 321 323 3213
1652 ALC202108050000007551715 2 1 13532ff70a SSTJMZKF001 1.0186 52610.0 3 3 110.0 110.0 20210805 2021-08-05 8 5 217 31 3 3 0 32 1 321 323 3213
1653 ALC202108260000007678351 2 1 13532ff70a SSTJMZKF001 1.0208 135110.0 3 3 110.0 110.0 20210826 2021-08-26 8 26 238 34 3 3 0 32 1 321 323 3213
1659 ALC202108110000007585615 2 1 13532ff70a SSTJMZKF002 1.0117 150110.0 3 3 110.0 110.0 20210811 2021-08-11 8 11 223 32 2 3 0 32 1 321 323 3213
1665 ALC202108300000007696531 2 1 13532ff70b SSTJMZKF001 1.0208 150110.0 3 3 110.0 110.0 20210830 2021-08-30 8 30 242 35 0 3 0 32 1 321 323 3213
1686 ALC202108180000007628699 2 1 13532ffd4b SSTJMZKF002 1.0127 450110.0 3 3 110.0 110.0 20210818 2021-08-18 8 18 230 33 2 3 0 32 1 321 323 3213
1704 ALC202108310000007704681 2 1 13533003f0 SSTJMZKF001 1.0208 55610.0 3 3 110.0 110.0 20210831 2021-08-31 8 31 243 35 1 3 0 32 1 321 323 3213
1706 ALC202108100000007576098 2 1 13533003f0 SSTJMZKF001 1.0186 30110.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
1716 ALC202108250000007670596 2 1 13533003f0 SSTJMZKF002 1.0132 67610.0 3 3 110.0 110.0 20210825 2021-08-25 8 25 237 34 2 3 0 32 1 321 323 3213
1717 ALC202108100000007576092 2 1 13533003f0 SSTJMZKF002 1.0117 30110.0 3 3 110.0 110.0 20210810 2021-08-10 8 10 222 32 1 3 0 32 1 321 323 3213
# p_data['trade_date'] = p_data['trade_date'].astype('str')# # p_data['trade_date'] = pd.to_datetime(p_data['trade_date'], format='%Y%m%d', errors='coerce')# p_data['date'] = p_data['trade_date'].apply(lambda x:x[:6])
# o.rename(columns={'trade_date':'a3'},inplace = True)# o.rename(columns={'prod_class':'a2'},inplace = True)
# df = df.merge(p_record, on= ['core_cust_id','prod_code','a3'], how='left')
o_col_all = ['净值', '总金额', '超额管理费', 'apply_amt']o_cols = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel', 'deal_bus_fund', 'deal_bus_c_f']       
o['date'] = o['trade_date'].apply(lambda x: x[:6])

#最高最低收益率、收益率
dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}
dict_1 = {'2021-07-01':['202105', '202106'], '2021-08-01':['202106', '202107'], 
          '2021-09-01':['202107', '202108'], '2021-10-01':['202108', '202109']}

data = o.copy()
dfs = []

split_month = ['redu_1', 'redu_2', 'redu_b', 'redu_b1', 'redu_b2']

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    # 5种不同月份拆分
    # 111111####################
    stat = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat.columns = ['core_cust_id', 'uid_count_1']
    stat['pid_nunique_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['prod_code'].agg(
        'nunique').values
    stat['pid_mean_count_1'] = stat['uid_count_1'] / stat['pid_nunique_1']
    stat['buy_num_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['if_buy'].agg(
        'sum').values
    stat['buy_rate_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')['if_buy'].agg(
        'mean').values
#     stat['buy_rate'] = stat['buy_num'] / stat['uid_count_1']
    for col in o_col_all:
        stat[f'{col}_apply_mean_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg('mean').values
        stat[f'{col}_apply_max_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg(
            'max').values  # 我们加的
        stat[f'{col}_apply_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg(
            'min').values  # 我们加的
        stat[f'{col}_apply_std_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg(
            'std').values  # 我们加的
        stat[f'{col}_apply_median_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg(
            'median').values  # 我们加的
        stat[f'{col}_apply_sum_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg('sum').values
        stat[f'{col}_apply_max_min_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg(
            diff_max_min).values
    for col in o_cols:
        stat[f'{col}_apply_nunique_1'] = data[data['date'] == dict_[month]].groupby('core_cust_id')[col].agg(
            'nunique').values

    # 2222222###########
    stat1 = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[
        'prod_code'].count().reset_index()
    stat1.columns = ['core_cust_id', f'uid_count_2']
    stat1['pid_nunique_2'] = \
    data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[
        'prod_code'].agg('nunique').values
    stat1['pid_mean_count_2'] = stat1['uid_count_2'] / stat1['pid_nunique_2']
    stat1['buy_num_2'] = data[data['date'] == dict_1[month][0]].groupby('core_cust_id')['if_buy'].agg(
        'sum').values
    stat1['buy_rate_2'] = data[data['date'] == dict_1[month][0]].groupby('core_cust_id')['if_buy'].agg(
        'mean').values
    for col in o_col_all:
        stat1[f'{col}_apply_mean_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            'mean').values
        stat1[f'{col}_apply_max_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            'max').values  # 我们加的
        stat1[f'{col}_apply_min_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            'min').values  # 我们加的
        stat1[f'{col}_apply_std_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            'std').values  # 我们加的
        stat1[f'{col}_apply_median_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            'median').values  # 我们加的
        stat1[f'{col}_apply_sum_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            'sum').values
        stat1[f'{col}_apply_max_min_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            diff_max_min).values
    for col in o_cols:
        stat1[f'{col}_apply_nunique_2'] = \
        data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby('core_cust_id')[col].agg(
            'nunique').values

    # 333333333##################################
    stat2 = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat2.columns = ['core_cust_id', f'uid_count_b']
    stat2['pid_nunique_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat2['pid_mean_count_b'] = stat2['uid_count_b'] / stat2['pid_nunique_b']
    stat2['buy_num_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['if_buy'].agg(
        'sum').values
    stat2['buy_rate_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')['if_buy'].agg(
        'mean').values
    for col in o_col_all:
        stat2[f'{col}_apply_mean_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg(
            'mean').values
        stat2[f'{col}_apply_max_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg(
            'max').values  # 我们加的
        stat2[f'{col}_apply_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg(
            'min').values  # 我们加的
        stat2[f'{col}_apply_std_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg(
            'std').values  # 我们加的
        stat2[f'{col}_apply_median_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg(
            'median').values  # 我们加的
        stat2[f'{col}_apply_sum_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg('sum').values
        stat2[f'{col}_apply_max_min_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg(
            diff_max_min).values
    for col in o_cols:
        stat2[f'{col}_apply_nunique_b'] = data[data['date'] <= dict_[month]].groupby('core_cust_id')[col].agg('nunique').values

    # 4444444444#######################
    stat3 = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].count().reset_index()
    
    stat3.columns = ['core_cust_id', 'uid_count_b1']
    stat3['pid_nunique_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['prod_code'].agg(
        'nunique').values
    stat3['pid_mean_count_b1'] = stat3['uid_count_b1'] / stat3['pid_nunique_b1']
    stat3['buy_num_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['if_buy'].agg(
        'sum').values
    stat3['buy_rate_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')['if_buy'].agg(
        'mean').values
    for col in o_col_all:
        stat3[f'{col}_apply_mean_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')[col].agg(
            'mean').values
        stat3[f'{col}_apply_max_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')[col].agg(
            'max').values  # 我们加的
        stat3[f'{col}_apply_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')[col].agg(
            'min').values  # 我们加的
        stat3[f'{col}_apply_std_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')[col].agg(
            'std').values  # 我们加的
        stat3[f'{col}_apply_median_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')[col].agg(
            'median').values  # 我们加的
        stat3[f'{col}_apply_sum_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')[col].agg('sum').values
        stat3[f'{col}_apply_max_min_b1'] = data[data['date'] < dict_[month]].groupby('core_cust_id')[col].agg(
            diff_max_min).values
    for col in o_cols:
        stat3[f'{col}_apply_nunique_b1'] = data[
            data['date'] < dict_[month]].groupby('core_cust_id')[col].agg('nunique').values
    #55555555555555555#######################
    stat4 = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].count().reset_index()
#     print(stat4.columns)
    stat4.columns = ['core_cust_id', 'uid_count_b2']
    stat4['pid_nunique_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['prod_code'].agg(
        'nunique').values
    stat4['pid_mean_count_b2'] = stat4['uid_count_b2'] / stat4['pid_nunique_b2']
    stat4['buy_num_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['if_buy'].agg(
        'sum').values
    stat4['buy_rate_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')['if_buy'].agg(
        'mean').values
    for col in o_col_all:
        stat4[f'{col}_apply_mean_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            'mean').values
        stat4[f'{col}_apply_max_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            'max').values  # 我们加的
        stat4[f'{col}_apply_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            'min').values  # 我们加的
        stat4[f'{col}_apply_std_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            'std').values  # 我们加的
        stat4[f'{col}_apply_median_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            'median').values  # 我们加的
        stat4[f'{col}_apply_sum_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            'sum').values
        stat4[f'{col}_apply_max_min_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            diff_max_min).values
    for col in o_cols:
        stat4[f'{col}_apply_nunique_b2'] = data[data['date'] < dict_1[month][0]].groupby('core_cust_id')[col].agg(
            'nunique').values

    # 以core_cust_id/prod_code来groupby,查看用户在该产品中产生多少笔交易
    #########################
    stat_5 = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_5.columns = ['core_cust_id', 'prod_code', 'flow_redu_b1']
    stat_5['cp_buy_num_b1'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'sum').values
    stat_5['cp_buy_rate_b1'] = data[data['date'] < dict_[month]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'mean').values
    stat_6 = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_6.columns = ['core_cust_id', 'prod_code', 'flow_redu_b']
    stat_6['cp_buy_num_b'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'sum').values
    stat_6['cp_buy_rate_b'] = data[data['date'] <= dict_[month]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'mean').values
    stat_7 = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])[
        '流水号'].count().reset_index()
    stat_7.columns = ['core_cust_id', 'prod_code', 'flow_redu_b2']
    stat_7['cp_buy_num_b2'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'sum').values
    stat_7['cp_buy_rate_b2'] = data[data['date'] < dict_1[month][0]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'mean').values
    stat_8 = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_8.columns = ['core_cust_id', 'prod_code', 'flow_redu_1']
    stat_8['cp_buy_num_1'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'sum').values
    stat_8['cp_buy_rate_1'] = data[data['date'] == dict_[month]].groupby(['core_cust_id', 'prod_code'])['if_buy'].agg(
        'mean').values
    stat_9 = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(
        ['core_cust_id', 'prod_code'])['流水号'].count().reset_index()
    stat_9.columns = ['core_cust_id', 'prod_code', 'flow_redu_2']
    stat_9['cp_buy_num_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(
        ['core_cust_id', 'prod_code'])['if_buy'].agg('sum').values
    stat_9['cp_buy_rate_2'] = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(
        ['core_cust_id', 'prod_code'])['if_buy'].agg('mean').values
    ###########################
    # 按照col来groupby

    tmp_df = tmp_df.merge(stat, on=['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat1, on=['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat2, on=['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat3, on=['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat4, on=['core_cust_id'], how='left')
    tmp_df = tmp_df.merge(stat_5, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_6, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_7, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_8, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_9, on=['core_cust_id', 'prod_code'], how='left')

    # 以core_cust_id和freq_lis来groupby
#     freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态', 'deal_bus', 'deal_bus_channel', 
#                 'deal_bus_fund', 'deal_bus_c_f']
    freq_lis = ['业务代码', '渠道标识', '资金状态', '交易状态']
    
    # 频度统计
    
#     for freq in freq_lis:
#         group = ['core_cust_id']
#         group.append(freq)
# #         print(data[data['date'] < dict_[month]].columns)
#         stat_count = data[data['date'] < dict_[month]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', freq, f'{freq}_freq_b1']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id']
#         group.append(freq)
#         stat_count = data[data['date'] <= dict_[month]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id']
#         group.append(freq)
#         stat_count = data[data['date'] == dict_[month]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_1']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id']
#         group.append(freq)
#         stat_count = data[data['date'] < dict_1[month][0]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_b2']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id']
#         group.append(freq)
#         stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(group)[
#         '流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', f'{freq}', f'{freq}_freq_2']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id'], how='left')

#     # 以core_cust_id和prod_code来groupby
#     for freq in freq_lis:
#         group = ['core_cust_id', 'prod_code']
#         group.append(freq)
#         stat_count = data[data['date'] < dict_[month]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'cp_{freq}_freq_b1']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id', 'prod_code']
#         group.append(freq)
#         stat_count = data[data['date'] <= dict_[month]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'cp_{freq}_freq_b']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id', 'prod_code']
#         group.append(freq)
#         stat_count = data[data['date'] == dict_[month]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'cp_{freq}_freq_b']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id', 'prod_code']
#         group.append(freq)
#         stat_count = data[data['date'] < dict_1[month][0]].groupby(group)['流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'cp_{freq}_freq_b2']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')
#     for freq in freq_lis:
#         group = ['core_cust_id', 'prod_code']
#         group.append(freq)
#         stat_count = data[(data['date'] == dict_1[month][0]) & (data['date'] == dict_1[month][1])].groupby(group)[
#         '流水号'].count().reset_index()
#         stat_count.columns = ['core_cust_id', 'prod_code', f'{freq}', f'cp_{freq}_freq_2']
#         tmp_df = tmp_df.merge(stat_count, on=['core_cust_id', 'prod_code'], how='left')

    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)

2021-07-01
2021-08-01
2021-09-01
2021-10-01
# df.info()
df.nunique()
id                                   390977
core_cust_id                         212637
prod_code                                 2
a2                                        1
a3                                        4
y                                         2
type                                      2
uid_count_1                              22
pid_nunique_1                             6
pid_mean_count_1                         33
净值_apply_mean_1                         702
净值_apply_max_1                           42
净值_apply_min_1                           43
净值_apply_std_1                          812
净值_apply_median_1                       202
净值_apply_sum_1                          835
净值_apply_max_min_1                      159
总金额_apply_mean_1                        910
总金额_apply_max_1                         689
总金额_apply_min_1                         504
总金额_apply_std_1                         489
总金额_apply_median_1                      724
总金额_apply_sum_1                         992
总金额_apply_max_min_1                     385
超额管理费_apply_mean_1                      796
超额管理费_apply_max_1                       585
超额管理费_apply_min_1                       443
超额管理费_apply_std_1                       416
超额管理费_apply_median_1                    641
超额管理费_apply_sum_1                       846
超额管理费_apply_max_min_1                   311
apply_amt_apply_mean_1                  703
apply_amt_apply_max_1                   377
apply_amt_apply_min_1                   311
apply_amt_apply_std_1                   693
apply_amt_apply_median_1                434
apply_amt_apply_sum_1                  1037
apply_amt_apply_max_min_1               259
业务代码_apply_nunique_1                      6
渠道标识_apply_nunique_1                      2
资金状态_apply_nunique_1                      5
交易状态_apply_nunique_1                      3
deal_bus_apply_nunique_1                  8
deal_bus_channel_apply_nunique_1          8
deal_bus_fund_apply_nunique_1             8
deal_bus_c_f_apply_nunique_1              8
uid_count_2                               0
pid_nunique_2                             0
pid_mean_count_2                          0
净值_apply_mean_2                           0
净值_apply_max_2                            0
净值_apply_min_2                            0
净值_apply_std_2                            0
净值_apply_median_2                         0
净值_apply_sum_2                            0
净值_apply_max_min_2                        0
总金额_apply_mean_2                          0
总金额_apply_max_2                           0
总金额_apply_min_2                           0
总金额_apply_std_2                           0
总金额_apply_median_2                        0
总金额_apply_sum_2                           0
总金额_apply_max_min_2                       0
超额管理费_apply_mean_2                        0
超额管理费_apply_max_2                         0
超额管理费_apply_min_2                         0
超额管理费_apply_std_2                         0
超额管理费_apply_median_2                      0
超额管理费_apply_sum_2                         0
超额管理费_apply_max_min_2                     0
apply_amt_apply_mean_2                    0
apply_amt_apply_max_2                     0
apply_amt_apply_min_2                     0
apply_amt_apply_std_2                     0
apply_amt_apply_median_2                  0
apply_amt_apply_sum_2                     0
apply_amt_apply_max_min_2                 0
业务代码_apply_nunique_2                      0
渠道标识_apply_nunique_2                      0
资金状态_apply_nunique_2                      0
交易状态_apply_nunique_2                      0
deal_bus_apply_nunique_2                  0
deal_bus_channel_apply_nunique_2          0
deal_bus_fund_apply_nunique_2             0
deal_bus_c_f_apply_nunique_2              0
uid_count_b                              48
pid_nunique_b                            14
pid_mean_count_b                        146
净值_apply_mean_b                        4033
净值_apply_max_b                           76
净值_apply_min_b                           62
净值_apply_std_b                         5427
净值_apply_median_b                       453
净值_apply_sum_b                         4794
净值_apply_max_min_b                      414
总金额_apply_mean_b                       1790
总金额_apply_max_b                         839
总金额_apply_min_b                          11
总金额_apply_std_b                        1902
总金额_apply_median_b                      366
总金额_apply_sum_b                        1936
总金额_apply_max_min_b                     835
超额管理费_apply_mean_b                     1438
超额管理费_apply_max_b                       689
超额管理费_apply_min_b                         1
超额管理费_apply_std_b                      1589
超额管理费_apply_median_b                    294
超额管理费_apply_sum_b                      1529
超额管理费_apply_max_min_b                   689
apply_amt_apply_mean_b                 2578
apply_amt_apply_max_b                   618
apply_amt_apply_min_b                   443
apply_amt_apply_std_b                  4182
apply_amt_apply_median_b                725
apply_amt_apply_sum_b                  3875
apply_amt_apply_max_min_b               585
业务代码_apply_nunique_b                      7
渠道标识_apply_nunique_b                      3
资金状态_apply_nunique_b                      5
交易状态_apply_nunique_b                      4
deal_bus_apply_nunique_b                 10
deal_bus_channel_apply_nunique_b         10
deal_bus_fund_apply_nunique_b            11
deal_bus_c_f_apply_nunique_b             11
uid_count_b1                             43
pid_nunique_b1                           13
pid_mean_count_b1                       131
净值_apply_mean_b1                       3199
净值_apply_max_b1                          69
净值_apply_min_b1                          54
净值_apply_std_b1                        4263
净值_apply_median_b1                      376
净值_apply_sum_b1                        3804
净值_apply_max_min_b1                     340
总金额_apply_mean_b1                      1170
总金额_apply_max_b1                        621
总金额_apply_min_b1                         11
总金额_apply_std_b1                       1239
总金额_apply_median_b1                     252
总金额_apply_sum_b1                       1272
总金额_apply_max_min_b1                    617
超额管理费_apply_mean_b1                     888
超额管理费_apply_max_b1                      481
超额管理费_apply_min_b1                        1
超额管理费_apply_std_b1                      967
超额管理费_apply_median_b1                   189
超额管理费_apply_sum_b1                      948
超额管理费_apply_max_min_b1                  481
apply_amt_apply_mean_b1                2205
apply_amt_apply_max_b1                  594
apply_amt_apply_min_b1                  435
apply_amt_apply_std_b1                 3452
apply_amt_apply_median_b1               685
apply_amt_apply_sum_b1                 3329
apply_amt_apply_max_min_b1              550
业务代码_apply_nunique_b1                     7
渠道标识_apply_nunique_b1                     3
资金状态_apply_nunique_b1                     5
交易状态_apply_nunique_b1                     4
deal_bus_apply_nunique_b1                10
deal_bus_channel_apply_nunique_b1        10
deal_bus_fund_apply_nunique_b1           10
deal_bus_c_f_apply_nunique_b1            10
uid_count_b2                             36
pid_nunique_b2                           13
pid_mean_count_b2                       114
净值_apply_mean_b2                       2407
净值_apply_max_b2                          61
净值_apply_min_b2                          46
净值_apply_std_b2                        3137
净值_apply_median_b2                      284
净值_apply_sum_b2                        2857
净值_apply_max_min_b2                     268
总金额_apply_mean_b2                       581
总金额_apply_max_b2                        332
总金额_apply_min_b2                         11
总金额_apply_std_b2                        621
总金额_apply_median_b2                     134
总金额_apply_sum_b2                        658
总金额_apply_max_min_b2                    328
超额管理费_apply_mean_b2                     347
超额管理费_apply_max_b2                      203
超额管理费_apply_min_b2                        1
超额管理费_apply_std_b2                      381
超额管理费_apply_median_b2                    77
超额管理费_apply_sum_b2                      393
超额管理费_apply_max_min_b2                  203
apply_amt_apply_mean_b2                1893
apply_amt_apply_max_b2                  562
apply_amt_apply_min_b2                  422
apply_amt_apply_std_b2                 2781
apply_amt_apply_median_b2               653
apply_amt_apply_sum_b2                 2815
apply_amt_apply_max_min_b2              493
业务代码_apply_nunique_b2                     7
渠道标识_apply_nunique_b2                     3
资金状态_apply_nunique_b2                     5
交易状态_apply_nunique_b2                     4
deal_bus_apply_nunique_b2                10
deal_bus_channel_apply_nunique_b2        10
deal_bus_fund_apply_nunique_b2           10
deal_bus_c_f_apply_nunique_b2            10
flow_redu_b1                             26
flow_redu_b                              28
flow_redu_b2                             22
flow_redu_1                              17
flow_redu_2                               0
dtype: int64
#51461df.shape[0]
390977
gc.collect()
68

客户风险表

产品风险等级和客户风险等级的大小关系

e1 = pd.read_csv('e_bc.csv')e1.head(2)
core_cust_id e1 e2
0 9361c799f2 3 20211113
1 9cb2061f63 2 20211014
# e = pd.read_csv(path2 + 'e.csv')# e_columns = ['core_cust_id', '客户风险等级', '评估日期']# e.columns = e_columnse = pd.read_csv('e.csv')e.head(2)
core_cust_id e1 e2
0 d4931873cb 3 20200608
1 af52580627 3 20200330
e = pd.concat([e, e1])
e.info()
Int64Index: 351573 entries, 0 to 35456Data columns (total 3 columns): #   Column        Non-Null Count   Dtype ---  ------        --------------   -----  0   core_cust_id  351573 non-null  object 1   e1            351573 non-null  int64  2   e2            351573 non-null  int64 dtypes: int64(2), object(1)memory usage: 10.7+ MB
len(e.drop_duplicates())
351523
e['e2'] = e['e2'].astype('str')
e['date'] = e['e2'].apply(lambda x:x[:6])
e['date'].value_counts().sort_index()
201303       30201304        3201305        9201306        7201307       12201308       19201309       25201310       30201311       28201312       52201401       43201402       31201403       65201404       54201405      100201406      108201407       80201408      101201409      209201410      199201411      205201412      213201502        1201503        2201504       18201505       78201506      137201507      358201508      823201509     1202201510     1127201511     1080201512      810201601      564201602      731201603     1204201604      829201605      646201606      526201607      515201608      547201609      819201610      944201611     1052201612      932201701      628201702     2213201703     1850201704     1110201705     1138201706     1304201707     1381201708     2389201709     2127201710     1489201711     3243201712     2703201801     1683201802     2160201803     2901201804     2376201805     2377201806     2250201807     2876201808     4684201809     4648201810     4484201811     4459201812     5921201901     4310201902     3310201903     4164201904     3573201905     3974201906     3316201907     3585201908     3639201909     4491201910     4556201911     3937201912     6040202001     6065202002     2832202003     5075202004     8605202005     8177202006     7860202007     8949202008     8705202009    10822202010     8405202011     8608202012    10566202101     7893202102     5814202103     8806202104     9652202105     8180202106     9567202107    13409202108    12828202109    17441202110    20209202111    15248Name: date, dtype: int64
  • #登记和更新风险等级人数,与购买有关系么?
  • 与df合并后查看一下在各产品中的表现
# 客户风险表dfs = []dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}for month in sorted(df['a3'].unique()):    print(month)    tmp_df = df[df['a3'] == month]        stat_1 = e[e['date'] == dict_[month]].groupby('core_cust_id')['e2'].count().reset_index()    stat_1.columns = ['core_cust_id','risk_count']    stat_1['risk_change']       = e[e['date'] == dict_[month]].groupby('core_cust_id')['e2'].agg('nunique').values    stat_1['risk_level_mean']   = e[e['date'] == dict_[month]].groupby('core_cust_id')['e2'].agg('mean').values    stat_1['risk_max']          = e[e['date'] == dict_[month]].groupby('core_cust_id')['e2'].agg('max').values    stat_1['risk_min']          = e[e['date'] == dict_[month]].groupby('core_cust_id')['e2'].agg('min').values        stat_2 = e[e['date'] < dict_[month]].groupby('core_cust_id')['e2'].count().reset_index()    stat_2.columns = ['core_cust_id','risk_count_b']    stat_2['risk_change_b']       = e[e['date'] < dict_[month]].groupby('core_cust_id')['e2'].agg('nunique').values    stat_2['risk_level_mean_b']   = e[e['date'] < dict_[month]].groupby('core_cust_id')['e2'].agg('mean').values    stat_2['risk_max_b']          = e[e['date'] < dict_[month]].groupby('core_cust_id')['e2'].agg('max').values    stat_2['risk_min_b']          = e[e['date'] < dict_[month]].groupby('core_cust_id')['e2'].agg('min').values        #最新评估的风险    stat_3 = e[e['date'] <= dict_[month]]    stat_3.drop_duplicates('core_cust_id', 'last', inplace=True)    stat_3['date1'] = dict_[month]    stat_3['risk_diff_date'] = stat_3['date1'].astype('int') - stat_3['date'].astype('int') + 1    stat_3.drop(['date1', 'e2'], axis=1, inplace=True)        tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')    tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')    tmp_df = tmp_df.merge(stat_3, on='core_cust_id', how='left')    dfs.append(tmp_df)df = pd.concat(dfs).reset_index(drop=True)
2021-07-012021-08-012021-09-012021-10-01
df.head(2)
id core_cust_id prod_code a2 a3 y type uid_count_1 pid_nunique_1 pid_mean_count_1 净值_apply_mean_1 净值_apply_max_1 净值_apply_min_1 净值_apply_std_1 净值_apply_median_1 净值_apply_sum_1 净值_apply_max_min_1 总金额_apply_mean_1 总金额_apply_max_1 总金额_apply_min_1 总金额_apply_std_1 总金额_apply_median_1 总金额_apply_sum_1 总金额_apply_max_min_1 超额管理费_apply_mean_1 超额管理费_apply_max_1 超额管理费_apply_min_1 超额管理费_apply_std_1 超额管理费_apply_median_1 超额管理费_apply_sum_1 超额管理费_apply_max_min_1 apply_amt_apply_mean_1 apply_amt_apply_max_1 apply_amt_apply_min_1 apply_amt_apply_std_1 apply_amt_apply_median_1 apply_amt_apply_sum_1 apply_amt_apply_max_min_1 业务代码_apply_nunique_1 渠道标识_apply_nunique_1 资金状态_apply_nunique_1 交易状态_apply_nunique_1 deal_bus_apply_nunique_1 deal_bus_channel_apply_nunique_1 deal_bus_fund_apply_nunique_1 deal_bus_c_f_apply_nunique_1 uid_count_2 pid_nunique_2 pid_mean_count_2 净值_apply_mean_2 净值_apply_max_2 净值_apply_min_2 净值_apply_std_2 净值_apply_median_2 净值_apply_sum_2 净值_apply_max_min_2 总金额_apply_mean_2 总金额_apply_max_2 总金额_apply_min_2 总金额_apply_std_2 总金额_apply_median_2 总金额_apply_sum_2 总金额_apply_max_min_2 超额管理费_apply_mean_2 超额管理费_apply_max_2 超额管理费_apply_min_2 超额管理费_apply_std_2 超额管理费_apply_median_2 超额管理费_apply_sum_2 超额管理费_apply_max_min_2 apply_amt_apply_mean_2 apply_amt_apply_max_2 apply_amt_apply_min_2 apply_amt_apply_std_2 apply_amt_apply_median_2 apply_amt_apply_sum_2 apply_amt_apply_max_min_2 业务代码_apply_nunique_2 渠道标识_apply_nunique_2 资金状态_apply_nunique_2 交易状态_apply_nunique_2 deal_bus_apply_nunique_2 deal_bus_channel_apply_nunique_2 deal_bus_fund_apply_nunique_2 deal_bus_c_f_apply_nunique_2 uid_count_b pid_nunique_b pid_mean_count_b 净值_apply_mean_b 净值_apply_max_b 净值_apply_min_b 净值_apply_std_b 净值_apply_median_b 净值_apply_sum_b 净值_apply_max_min_b 总金额_apply_mean_b 总金额_apply_max_b 总金额_apply_min_b 总金额_apply_std_b 总金额_apply_median_b 总金额_apply_sum_b 总金额_apply_max_min_b 超额管理费_apply_mean_b 超额管理费_apply_max_b 超额管理费_apply_min_b 超额管理费_apply_std_b 超额管理费_apply_median_b 超额管理费_apply_sum_b 超额管理费_apply_max_min_b apply_amt_apply_mean_b apply_amt_apply_max_b apply_amt_apply_min_b apply_amt_apply_std_b apply_amt_apply_median_b apply_amt_apply_sum_b apply_amt_apply_max_min_b 业务代码_apply_nunique_b 渠道标识_apply_nunique_b 资金状态_apply_nunique_b 交易状态_apply_nunique_b deal_bus_apply_nunique_b deal_bus_channel_apply_nunique_b deal_bus_fund_apply_nunique_b deal_bus_c_f_apply_nunique_b uid_count_b1 pid_nunique_b1 pid_mean_count_b1 净值_apply_mean_b1 净值_apply_max_b1 净值_apply_min_b1 净值_apply_std_b1 净值_apply_median_b1 净值_apply_sum_b1 净值_apply_max_min_b1 总金额_apply_mean_b1 总金额_apply_max_b1 总金额_apply_min_b1 总金额_apply_std_b1 总金额_apply_median_b1 总金额_apply_sum_b1 总金额_apply_max_min_b1 超额管理费_apply_mean_b1 超额管理费_apply_max_b1 超额管理费_apply_min_b1 超额管理费_apply_std_b1 超额管理费_apply_median_b1 超额管理费_apply_sum_b1 超额管理费_apply_max_min_b1 apply_amt_apply_mean_b1 apply_amt_apply_max_b1 apply_amt_apply_min_b1 apply_amt_apply_std_b1 apply_amt_apply_median_b1 apply_amt_apply_sum_b1 apply_amt_apply_max_min_b1 业务代码_apply_nunique_b1 渠道标识_apply_nunique_b1 资金状态_apply_nunique_b1 交易状态_apply_nunique_b1 deal_bus_apply_nunique_b1 deal_bus_channel_apply_nunique_b1 deal_bus_fund_apply_nunique_b1 deal_bus_c_f_apply_nunique_b1 uid_count_b2 pid_nunique_b2 pid_mean_count_b2 净值_apply_mean_b2 净值_apply_max_b2 净值_apply_min_b2 净值_apply_std_b2 净值_apply_median_b2 净值_apply_sum_b2 净值_apply_max_min_b2 总金额_apply_mean_b2 总金额_apply_max_b2 总金额_apply_min_b2 总金额_apply_std_b2 总金额_apply_median_b2 总金额_apply_sum_b2 总金额_apply_max_min_b2 超额管理费_apply_mean_b2 超额管理费_apply_max_b2 超额管理费_apply_min_b2 超额管理费_apply_std_b2 超额管理费_apply_median_b2 超额管理费_apply_sum_b2 超额管理费_apply_max_min_b2 apply_amt_apply_mean_b2 apply_amt_apply_max_b2 apply_amt_apply_min_b2 apply_amt_apply_std_b2 apply_amt_apply_median_b2 apply_amt_apply_sum_b2 apply_amt_apply_max_min_b2 业务代码_apply_nunique_b2 渠道标识_apply_nunique_b2 资金状态_apply_nunique_b2 交易状态_apply_nunique_b2 deal_bus_apply_nunique_b2 deal_bus_channel_apply_nunique_b2 deal_bus_fund_apply_nunique_b2 deal_bus_c_f_apply_nunique_b2 flow_redu_b1 flow_redu_b flow_redu_b2 flow_redu_1 flow_redu_2 risk_count risk_change risk_level_mean risk_max risk_min risk_count_b risk_change_b risk_level_mean_b risk_max_b risk_min_b 客户风险等级 date risk_diff_date
0 70e7f0465877447aa44c8d3120d0414c 9cb1f66b15 SSTJMZKF001 3 2021-07-01 0.0 train NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.0 1.5 2.0 1.0 2.0 201910 197.0
1 0df607cd59144c9fa0ddd0863372a0de c446c41e48 SSTJMZKF002 3 2021-07-01 0.0 train NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

B产品的产品表与df合并后都只有一个值,所以客户风险承受能力就是客户风险等级

df.shape[0]
390977
# 客户风险等级和产品风险等级比
#df['risk_tolerance'] = df['客户风险等级'] - df['风险等级']
#df.info()

客户信息

d_bc = pd.read_csv('d_bc.csv')
d_bc.head(2)
core_cust_id d1 d2 d3
0 9cb2061f63 1 4.0 60
1 e538d4d33a 1 4.0 28
d = pd.read_csv('d.csv')# d_columns = ['core_cust_id', '性别', '客户等级', '年龄']# d.columns = d_columnsd.head(2)
core_cust_id d1 d2 d3
0 d4931873cb 1 4.0 40
1 af52580627 2 3.0 32
d = pd.concat([d, d_bc])
d.info()
Int64Index: 271919 entries, 0 to 7863Data columns (total 4 columns): #   Column        Non-Null Count   Dtype  ---  ------        --------------   -----   0   core_cust_id  271919 non-null  object  1   d1            271919 non-null  int64   2   d2            236613 non-null  float64 3   d3            271919 non-null  int64  dtypes: float64(1), int64(2), object(1)memory usage: 10.4+ MB
sns.countplot(x='d1', hue='d2', data=d)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BdhNiP2a-1643361700228)(output_94_1.png)]

from scipy.stats import *# sns.distplot(d[d['性别']==1]['年龄'], fit=lognorm) #拟合标准正态分布# sns.distplot(d[d['性别']==2]['年龄'],hist=False, fit=norm) #拟合标准正态分布sns.distplot(d[d['d1']==1]['d3']) #拟合标准正态分布sns.distplot(d[d['d1']==2]['d3'],hist=False, fit=lognorm) #拟合标准正态分布


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-C0GzDFdr-1643361700229)(output_95_1.png)]

sns.distplot(d[d['d1']==2]['d3'],hist=False, fit=skewnorm) #拟合标准正态分布


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gYKp3dpM-1643361700230)(output_96_1.png)]

d[(d['d3']==55) & (d['d1']==1)].shape[0], d[d['d3']==55].shape[0]
(2062, 4075)
d[(d['d3']==72) & (d['d1']==1)].shape[0], d[d['d3']==72].shape[0]
(1095, 2204)
d['age_gap'] = d['d3'].apply(lambda x: 1 if x<56 and x>72 else 2)
d['age_grade'] = d['d3'] / d['d2']
d['d3'].max(), d['d3'].min()
(101, 18)
d['d3'].value_counts()
32     906731     890233     840834     809735     765630     763139     741736     699829     695737     693538     690040     638028     626327     563942     563743     550848     546058     543850     539949     531944     530451     527445     523446     510947     510841     505526     499653     499252     480657     431656     422259     420925     420355     407564     390254     383767     364163     342624     341866     340165     338968     317961     313662     299469     294623     262170     259560     252271     234772     220422     198973     188674     178021     161475     147520     129076     122977     113378      98079      87019      79180      71181      61282      60483      50584      42685      35686      27587      22018      21689      13588      12990       7291       5592       4493       1794       1395        796        497        399        398        1100       1101       1Name: d3, dtype: int64
#bin = [i*15 for i in range(1, 8)]d['age_bin10'] = pd.cut(d['年龄'], 10, labels=False)d['age_bin15'] = pd.cut(d['年龄'], 15, labels=False)d['age_bin20'] = pd.cut(d['年龄'], 20, labels=False)d['age_bin25'] = pd.cut(d['年龄'], 25, labels=False)d['age_bin30'] = pd.cut(d['年龄'], 30, labels=False)
age_counts = d['d3'].value_counts().reset_index()age_counts.columns = ['d3', 'age_频度']d = d.merge(age_counts, on=['d3'], how='left')
d['age_freq3'] = pd.cut(d['age_频度'], 3, labels=False)d['age_freq5'] = pd.cut(d['age_频度'], 5, labels=False)d['age_freq7'] = pd.cut(d['age_频度'], 7, labels=False)d['age_freq9'] = pd.cut(d['age_频度'], 9, labels=False)d['age_freq11'] = pd.cut(d['age_频度'], 11, labels=False)
cust_grade_counts = d['d2'].value_counts().reset_index()cust_grade_counts.columns = ['d2', 'cust_grade_频度'] d = d.merge(cust_grade_counts, on=['d2'], how='left')
d.head()
core_cust_id d1 d2 d3 age_gap age_grade age_频度 age_freq3 age_freq5 age_freq7 age_freq9 age_freq11 cust_grade_频度
0 d4931873cb 1 4.0 40 2 10.000000 6380 2 3 4 6 7 171960.0
1 af52580627 2 3.0 32 2 10.666667 9067 2 4 6 8 10 41350.0
2 8a11960fe9 1 NaN 58 2 NaN 5438 1 2 4 5 6 NaN
3 cfeaff22c6 2 4.0 53 2 13.250000 4992 1 2 3 4 6 171960.0
4 17fb48e6ce 1 4.0 34 2 8.500000 8097 2 4 6 8 9 171960.0
df = df.merge(d, on='core_cust_id', how='left')
#按照月份、将单个产品中的年龄统计量做出来stat0 = df.groupby('prod_code')['d3'].count().reset_index()stat0.columns = ['prod_code', 'prod_age_count']stat0['prod_age_max'] = df.groupby('prod_code')['d3'].agg('max').valuesstat0['prod_age_min'] = df.groupby('prod_code')['d3'].agg('min').valuesstat0['prod_age_sum'] = df.groupby('prod_code')['d3'].agg('sum').valuesstat0['prod_age_std'] = df.groupby('prod_code')['d3'].agg('std').valuesstat0['prod_age_median'] = df.groupby('prod_code')['d3'].agg('median').valuesstat0['prod_age_nunique'] = df.groupby('prod_code')['d3'].agg('nunique').values# stat0['prod_age_mode'] = df.groupby('prod_code')['年龄'].agg('mode').valuesdf = pd.merge(df, stat0, on='prod_code', how='left')
#51461
df.shape[0]
390977
gc.collect()
91

APP点击行为

# r = pd.read_csv(path2 + 'r.csv')
r1 = pd.read_csv('r_bc.csv')
r = pd.read_csv('r.csv')
r = pd.concat([r, r1])
r.head(2)
r1 core_cust_id r3 prod_code r5
0 20210825_11277467902 a15a1d681a 2 91318017 2021-08-25 14:18:10
1 20210824_11229966502 a15a1d681a 1 GRHLA20211386 2021-08-24 14:55:49
r['r5'].min(), r['r5'].max()
('2021-01-01 00:06:51', '2021-10-24 23:59:59')
r.info()
Int64Index: 1054997 entries, 0 to 277921Data columns (total 5 columns): #   Column        Non-Null Count    Dtype ---  ------        --------------    -----  0   r1            1054997 non-null  object 1   core_cust_id  1054997 non-null  object 2   r3            1054997 non-null  int64  3   prod_code     1054990 non-null  object 4   r5            1054997 non-null  objectdtypes: int64(1), object(4)memory usage: 48.3+ MB
#是否周末点击?
r['datetime'] = pd.to_datetime(r['r5'],errors='coerce', format='%Y-%m-%d %H:%M:%S')   #先转化为datetime类型,默认format='%Y-%m-%d %H:%M:%S'# ss['dayofyear_s']  = ss['datetime'].dt.dayofyear.fillna(0).astype("int") #一年中的第n天r['click_month'] = r['datetime'].dt.month.fillna(0).astype('int')r['click_day'] = r['datetime'].dt.day.fillna(0).astype("int")r['click_hour'] = r['datetime'].dt.hour.fillna(0).astype("int")    #转化提取小时r['click_minute'] = r['datetime'].dt.minute.fillna(0).astype("int") #转化提取分钟r['click_second'] = r['datetime'].dt.second.fillna(0).astype("int") #转化提取秒# df['date'] = df['datetime'].dt.date   #转化提取年-月-日# df['year'] =df['datetime'].dt.year.fillna(0).astype("int")   #转化提取年 ,#如果有NaN元素则默认转化float64型,要转换数据类型则需要先填充空值,在做数据类型转换r['click_dayofyear']  = r['datetime'].dt.dayofyear.fillna(0).astype("int") #一年中的第n天r['click_weekofyear'] = r['datetime'].dt.weekofyear.fillna(0).astype("int") #一年中的第n周r['click_weekday']    = r['datetime'].dt.weekday.fillna(0).astype("int") #周几,一周里的第几天,Monday=0, Sunday=6r['click_quarter']    = r['datetime'].dt.quarter.fillna(0).astype("int")  #季度r['click_is_wknd']    = r['datetime'].dt.dayofweek // 4                  #是否周末
r.head(3)
r1 core_cust_id r3 prod_code r5 datetime click_month click_day click_hour click_minute click_second click_dayofyear click_weekofyear click_weekday click_quarter click_is_wknd
0 20210825_11277467902 a15a1d681a 2 91318017 2021-08-25 14:18:10 2021-08-25 14:18:10 8 25 14 18 10 237 34 2 3 0
1 20210824_11229966502 a15a1d681a 1 GRHLA20211386 2021-08-24 14:55:49 2021-08-24 14:55:49 8 24 14 55 49 236 34 1 3 0
2 20210824_11234138402 a15a1d681a 1 GRHLA20211386 2021-08-24 16:17:15 2021-08-24 16:17:15 8 24 16 17 15 236 34 1 3 0
r.info()
Int64Index: 1054997 entries, 0 to 277921Data columns (total 16 columns): #   Column            Non-Null Count    Dtype         ---  ------            --------------    -----          0   r1                1054997 non-null  object         1   core_cust_id      1054997 non-null  object         2   r3                1054997 non-null  int64          3   prod_code         1054990 non-null  object         4   r5                1054997 non-null  object         5   datetime          1054997 non-null  datetime64[ns] 6   click_month       1054997 non-null  int32          7   click_day         1054997 non-null  int32          8   click_hour        1054997 non-null  int32          9   click_minute      1054997 non-null  int32          10  click_second      1054997 non-null  int32          11  click_dayofyear   1054997 non-null  int32          12  click_weekofyear  1054997 non-null  int32          13  click_weekday     1054997 non-null  int32          14  click_quarter     1054997 non-null  int32          15  click_is_wknd     1054997 non-null  int64         dtypes: datetime64[ns](1), int32(9), int64(2), object(4)memory usage: 100.6+ MB
sns.countplot(x='click_month', data=r)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3fAoiLjH-1643361700231)(output_120_1.png)]

sns.countplot(x='click_day', data=r)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DGb1fJ9N-1643361700232)(output_121_1.png)]

sns.countplot(x='click_hour', data=r)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-h1pxP64s-1643361700233)(output_122_1.png)]

# sns.countplot(x='click_minute', data=r)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-yCOe4XRn-1643361700234)(output_123_1.png)]

sns.countplot(x='click_weekofyear', data=r)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-j69M5jTj-1643361700234)(output_124_1.png)]

sns.countplot(x='click_weekday', data=r)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VnrTuads-1643361700235)(output_125_1.png)]

sns.countplot(x='click_is_wknd', data=r)


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-n7j5bRp2-1643361700236)(output_126_1.png)]

r['r3'].nunique()
2

合并r表:

r.columns
Index(['r1', 'core_cust_id', 'r3', 'prod_code', 'r5', 'click_day',       'click_hour', 'click_minute', 'click_second', 'click_time',       'click_month', 'click_d', 'datetime', 'weekofyear_r', 'weekday_r',       'quarter_r', 'is_wknd_r'],      dtype='object')
  • 点击类型r3: 是否有变化–nunique有几个值
  • click_hour: 众数mode,min,max,median,nunique
  • click_month: 众数mode,min,max,median,nunique,月均次数
  • click_d: 众数mode,min,max,median,nunique,日均次数***---------- #每个月中哪一天
  • weekofyear: 众数mode,min,max,median,nunique,周均次数
  • weekday: 众数mode,min,max,median,nunique,周内日均次数
  • quarter: 众数mode,min,max,median,nunique,季度均次数
  • is_wknd: 众数mode,sum,median,
  • 以上所有需要补充sum和mean么?
r['r5'].max(), r['r5'].min()
('2021-10-24 23:59:59', '2021-01-01 00:06:51')
r['date'] = r['r5'].apply(lambda x: x[:7])

dict_ = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dict_1 = {'2021-07-01':'2021-05', '2021-08-01':'2021-06', '2021-09-01':'2021-07', '2021-10-01':'2021-08'}

r_col = [ 'click_hour', 'click_month', 'click_d', 'weekofyear_r', 'weekday_r', 'quarter_r']
    
dfs = []
#单个用户点击几个月、每个月点击次数
for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]

    stat_1 = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat_1.columns = ['core_cust_id','uid_click_action_count_b']
    stat_1['pid_click_action_nunique_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat_1['uid_click_r3_nunique_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['r3'].agg('nunique').values
    #点了几个月
    #stat_1['pid_click_month_nunique_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['click_month'].agg('nunique').values
    #月均次数
    #stat_1['pid_click_month_mean_b'] = stat_1['uid_click_action_count_b'] / stat_1['pid_click_month_nunique_b']
#     stat_1['uid_click_iswknd_mode_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['is_wknd_r'].agg('mode').values
    stat_1['uid_click_iswknd_sum_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['is_wknd_r'].agg('sum').values
    stat_1['uid_click_iswknd_median_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')['is_wknd_r'].agg('median').values
    for col in r_col:
#         stat_1[f'uid_click_{col}_mode_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')[col].agg('mode').values
        stat_1[f'uid_click_{col}_median_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')[col].agg('median').values
        stat_1[f'uid_click_{col}_sum_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')[col].agg('sum').values
        stat_1[f'uid_click_{col}_max_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')[col].agg('max').values
        stat_1[f'uid_click_{col}_min_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')[col].agg('min').values
        stat_1[f'uid_click_{col}_nunique_b'] = r[(r['date'] <= dict_[month])].groupby('core_cust_id')[col].agg('nunique').values
        stat_1[f'uid_click_{col}_smean_b'] = stat_1['uid_click_action_count_b'] / stat_1[f'uid_click_{col}_nunique_b']
   
    
    stat_2 = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')['prod_code'].count().reset_index()
    stat_2.columns = ['core_cust_id','uid_click_action_count_b1']
    stat_2['pid_click_action_nunique_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')['prod_code'].agg('nunique').values
    stat_2['uid_click_r3_nunique_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')['r3'].agg('nunique').values
    
#     stat_2['uid_click_iswknd_mode_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')['is_wknd_r'].agg('mode').values
    stat_2['uid_click_iswknd_sum_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')['is_wknd_r'].agg('sum').values
    stat_2['uid_click_iswknd_median_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')['is_wknd_r'].agg('median').values
    for col in r_col:
#         stat_2[f'uid_click_{col}_mode_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')[col].agg('mode').values
        stat_2[f'uid_click_{col}_median_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')[col].agg('median').values
        stat_2[f'uid_click_{col}_sum_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')[col].agg('sum').values
        stat_2[f'uid_click_{col}_max_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')[col].agg('max').values
        stat_2[f'uid_click_{col}_min_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')[col].agg('min').values
        stat_2[f'uid_click_{col}_nunique_b1'] = r[(r['date'] <= dict_1[month])].groupby('core_cust_id')[col].agg('nunique').values
        stat_2[f'uid_click_{col}_smean_b1'] = stat_2['uid_click_action_count_b1'] / stat_2[f'uid_click_{col}_nunique_b1']
    
    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')
    
    #用户在某产品点击次数
    stat_3 = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
    stat_3.columns = ['core_cust_id', 'prod_code','cp_click_action_count_b']
    stat_3['cp_click_r3_nunique_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['r3'].agg('nunique').values
    #有多少天关注该产品
   # stat_3['cp_click_day_nunique_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['click_day'].agg('nunique').values
    #关注天频率
    #stat_3['cp_click_day_freq_b'] = stat_3['cp_click_action_count_b'] / stat_3['cp_click_day_nunique_b']
    #关注了几个月
    #stat_3['cp_click_month_nunique_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['click_month'].agg('nunique').values
    #月均次数
    #stat_3['cp_click_month_mean_b'] = stat_3['cp_click_action_count_b'] / stat_3['cp_click_month_nunique_b']
#     stat_3['cp_click_iswknd_mode_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['is_wknd_r'].agg('mode').values
    stat_3['cp_click_iswknd_sum_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['is_wknd_r'].agg('sum').values
    stat_3['cp_click_iswknd_median_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])['is_wknd_r'].agg('median').values
    for col in r_col:
#         stat_3[f'cp_click_{col}_mode_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('mode').values
        stat_3[f'cp_click_{col}_median_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('median').values
        stat_3[f'cp_click_{col}_sum_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('sum').values
        stat_3[f'cp_click_{col}_max_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('max').values
        stat_3[f'cp_click_{col}_min_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('min').values
        stat_3[f'cp_click_{col}_nunique_b'] = r[(r['date'] <= dict_[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('nunique').values
        stat_3[f'cp_click_{col}_smean_b'] = stat_3['cp_click_action_count_b'] / stat_3[f'cp_click_{col}_nunique_b']
    
    
    stat_4 = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])['r1'].count().reset_index()
    stat_4.columns = ['core_cust_id', 'prod_code', 'cp_click_action_count_b1']
    stat_4['cp_click_r3_nunique_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])['r3'].agg('nunique').values
    
#     stat_4['cp_click_iswknd_mode_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])['is_wknd_r'].agg('mode').values
    stat_4['cp_click_iswknd_sum_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])['is_wknd_r'].agg('sum').values
    stat_4['cp_click_iswknd_median_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])['is_wknd_r'].agg('median').values
    for col in r_col:
#         stat_4[f'cp_click_{col}_mode_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('mode').values
        stat_4[f'cp_click_{col}_median_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('median').values
        stat_4[f'cp_click_{col}_sum_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('sum').values
        stat_4[f'cp_click_{col}_max_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('max').values
        stat_4[f'cp_click_{col}_min_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('min').values
        stat_4[f'cp_click_{col}_nunique_b1'] = r[(r['date'] <= dict_1[month])].groupby(['core_cust_id', 'prod_code'])[col].agg('nunique').values
        stat_4[f'cp_click_{col}_smean_b1'] = stat_4['cp_click_action_count_b1'] / stat_4[f'cp_click_{col}_nunique_b1']
    
    
    tmp_df = tmp_df.merge(stat_3, on=['core_cust_id', 'prod_code'], how='left')
    tmp_df = tmp_df.merge(stat_4, on=['core_cust_id', 'prod_code'], how='left')
    ##############################################################

    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)
2021-07-012021-08-012021-09-012021-10-01
gc.collect()
31
df.shape[0]
390977

账户交易流水

s = pd.read_csv(path2 + 's.csv', thousands=',')s.head(2)
s1 s2 s3 s4 s5 s6 s7
0 01004320210415202104151FT21105818179741 4 NaN 75617.810 2021-04-15 9809df0ffe 2021-04-15
1 01004320210630202106301FT21181810270081 4 NaN 75635.105 2021-06-30 9809df0ffe 2021-06-30
  • s2 交易类型代码
  • s3 借方客户编号
  • s4 借方金额
  • s5 借方交易日期
  • s6 贷方客户编号
  • s7 处理日期
s.nunique()
s.info()
s['deal_date']   = s['s5'].apply(lambda x:x[:7])s['handle_date'] = s['s7'].apply(lambda x:x[:7])
l5 = s['deal_date'].tolist()
l7 = s['handle_date'].tolist()
# for i in range(len(l5)):#     if l5[i] != l7[i]:#         print(i, l5[i], l7[i])
  • 将s3借方:花钱吃饭;s6贷方:发工资到银行卡,分开再合并
  • s3对应的s4为负值,s6对应的s4为正值
  • s1可以删除用rank重新排序
s1 = s[s['s3'].notnull()]s1.drop('s6', axis=1, inplace=True)
s1['s4'] = s1['s4']*(-1)

不要交易号,因为有一些数据同一天发生交易的金额一样,所以判断可能是付款取消,再次付款导致

s1.rename(columns={'s3':'core_cust_id'}, inplace=True)
s1.drop_duplicates(['s2', 'core_cust_id', 's4', 's5'], keep='last', inplace=True)
s1.head(2)
s1 s2 core_cust_id s4 s5 s7 deal_date handle_date
190 01004320210815202108151FT21227030881371 6 4b3451caa2 -12110.0 2021-08-15 2021-08-15 2021-08 2021-08
200 01004320210809202108091FT21221429469501 6 a030fed1b -150110.0 2021-08-09 2021-08-09 2021-08 2021-08
s2 = s[s['s6'].notnull()]
s2.drop('s3', axis=1, inplace=True)
s2.rename(columns={'s6':'core_cust_id'}, inplace=True)
s2.drop_duplicates(['s2', 's4', 's5', 'core_cust_id'], keep='last', inplace=True)
s2.head(2)
s1 s2 s4 s5 core_cust_id s7 deal_date handle_date
0 01004320210415202104151FT21105818179741 4 75617.810 2021-04-15 9809df0ffe 2021-04-15 2021-04 2021-04
1 01004320210630202106301FT21181810270081 4 75635.105 2021-06-30 9809df0ffe 2021-06-30 2021-06 2021-06
ss = pd.concat([s1, s2])
ss.reset_index(inplace=True)
ss.drop('index', axis=1, inplace=True)
ss.head(2)
s1 s2 core_cust_id s4 s5 s7 deal_date handle_date
0 01004320210815202108151FT21227030881371 6 4b3451caa2 -12110.0 2021-08-15 2021-08-15 2021-08 2021-08
1 01004320210809202108091FT21221429469501 6 a030fed1b -150110.0 2021-08-09 2021-08-09 2021-08 2021-08
ss.shape
(6017813, 8)
gc.collect()
13384
ss['interval_month'] = ss['deal_date']==ss['handle_date']
ss['interval_month'] = ss['interval_month'].apply(lambda x: 1 if x=='False' else 0)
ss.head(2)
s1 s2 core_cust_id s4 s5 s7 deal_date handle_date interval_month
0 01004320210815202108151FT21227030881371 6 4b3451caa2 -12110.0 2021-08-15 2021-08-15 2021-08 2021-08 0
1 01004320210809202108091FT21221429469501 6 a030fed1b -150110.0 2021-08-09 2021-08-09 2021-08 2021-08 0
  • 查看每个人交易时间段是否在周末,节假日等
ss['datetime_s'] = pd.to_datetime(ss['s5'],errors='coerce')   #先转化为datetime类型,默认format='%Y-%m-%d %H:%M:%S'
# df['date'] = df['datetime'].dt.date   #转化提取年-月-日
# df['year'] =df['datetime'].dt.year.fillna(0).astype("int")   #转化提取年 ,
#如果有NaN元素则默认转化float64型,要转换数据类型则需要先填充空值,在做数据类型转换
# df['month'] = df['datetime'].dt.month.fillna(0).astype("int")  #转化提取月
# df['monthofyear'] = df['year'].map(str) + '-' + df['month'].map(str) #转化获取年-月
# df['day'] = df['datetime'].dt.day.fillna(0).astype("int")      #转化提取天
# df['hour'] = df['datetime'].dt.hour.fillna(0).astype("int")    #转化提取小时
# df['minute'] = df['datetime'].dt.minute.fillna(0).astype("int") #转化提取分钟
# df['second'] = df['datetime'].dt.second.fillna(0).astype("int") #转化提取秒
ss['dayofyear_s']  = ss['datetime_s'].dt.dayofyear.fillna(0).astype("int") #一年中的第n天
ss['weekofyear_s'] = ss['datetime_s'].dt.weekofyear.fillna(0).astype("int") #一年中的第n周
ss['weekday_s']    = ss['datetime_s'].dt.weekday.fillna(0).astype("int") #周几,一周里的第几天,Monday=0, Sunday=6
ss['quarter_s']    = ss['datetime_s'].dt.quarter.fillna(0).astype("int")  #季度
ss['is_wknd_s']    = ss['datetime_s'].dt.dayofweek // 4                  #是否周末
# df['is_month_start'] = df['datetime'].dt.is_month_start.astype(int)
# df['is_month_end'] = df['datetime'].dt.is_month_end.astype(int)
display(ss.head(2))
s1 s2 core_cust_id s4 s5 s7 deal_date handle_date interval_month datetime_s dayofyear_s weekofyear_s weekday_s quarter_s is_wknd_s
0 01004320210815202108151FT21227030881371 6 4b3451caa2 -12110.0 2021-08-15 2021-08-15 2021-08 2021-08 0 2021-08-15 227 32 6 3 1
1 01004320210809202108091FT21221429469501 6 a030fed1b -150110.0 2021-08-09 2021-08-09 2021-08 2021-08 0 2021-08-09 221 32 0 3 0
ss['s5'].max(), ss['s5'].min()
('2021-09-30', '2021-01-01')
ss['date'] = ss['s7'].apply(lambda x: x[:7])

dict_ = {'2021-07-01':'2021-06', '2021-08-01':'2021-07', '2021-09-01':'2021-08', '2021-10-01':'2021-09'}
dict_1 = {'2021-07-01':['2021-05', '2021-06'], '2021-08-01':['2021-06', '2021-07'], 
          '2021-09-01':['2021-07', '2021-08'], '2021-10-01':['2021-08', '2021-09']}

dfs = []

for month in sorted(df['a3'].unique()):
    print(month)
    tmp_df = df[df['a3'] == month]
    #上个月交易情况
    stat_1 = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s1'].count().reset_index()
    stat_1.columns = ['core_cust_id','deal_count']
    stat_1['deal_nunique']   = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg('nunique').values
    stat_1['single_deal']    = stat_1['deal_count'] / stat_1['deal_nunique']
    stat_1['deal_mean']      = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg('mean').values
    stat_1['borrow_sum']     = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg('sum').values
    stat_1['borrow_max']     = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg('max').values    # 我们加的
    stat_1['borrow_min']     = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg('min').values    # 我们加的
    stat_1['borrow_std']     = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg('std').values    # 我们加的
    stat_1['borrow_median']  = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg('median').values # 我们加的
    stat_1['deal_diff_max_min']  = ss[ss['date'] == dict_[month]].groupby('core_cust_id')['s4'].agg(diff_max_min).values
    
    #上两个月交易情况
    stat_2 = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s1'].count().reset_index()
    stat_2.columns = ['core_cust_id','deal_count_2']
    stat_2['deal_nunique_2']   = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg('nunique').values
    stat_2['single_deal_2']    = stat_2['deal_count_2'] / stat_2['deal_nunique_2']
    stat_2['deal_mean_2']      = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg('mean').values
    stat_2['borrow_sum_2']     = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg('sum').values
    stat_2['borrow_max_2']     = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg('max').values    # 我们加的
    stat_2['borrow_min_2']     = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg('min').values    # 我们加的
    stat_2['borrow_std_2']     = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg('std').values    # 我们加的
    stat_2['borrow_median_2']  = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg('median').values # 我们加的
    stat_2['deal_diff_max_min_2']  = ss[(ss['date'] == dict_1[month][0]) & (ss['date'] == dict_1[month][1])].groupby('core_cust_id')['s4'].agg(diff_max_min).values
    
    #上个月之前交易情况
    stat_3 = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s1'].count().reset_index()
    stat_3.columns = ['core_cust_id','deal_count_b1']
    stat_3['deal_nunique_b1']   = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg('nunique').values
    stat_3['single_deal_b1']    = stat_3['deal_count_b1'] / stat_3['deal_nunique_b1']
    stat_3['deal_mean_b1']      = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg('mean').values
    stat_3['borrow_sum_b1']     = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg('sum').values
    stat_3['borrow_max_b1']     = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg('max').values    # 我们加的
    stat_3['borrow_min_b1']     = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg('min').values    # 我们加的
    stat_3['borrow_std_b1']     = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg('std').values    # 我们加的
    stat_3['borrow_median_b1']  = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg('median').values # 我们加的
    stat_3['deal_diff_max_min_b1']  = ss[ss['date'] < dict_[month]].groupby('core_cust_id')['s4'].agg(diff_max_min).values
    
    
    #上上个月之前交易情况
    stat_4 = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s1'].count().reset_index()
    stat_4.columns = ['core_cust_id','deal_count_b2']
    stat_4['deal_nunique_b2']   = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg('nunique').values
    stat_4['single_deal_b2']    = stat_4['deal_count_b2'] / stat_4['deal_nunique_b2']
    stat_4['deal_mean_b2']      = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg('mean').values
    stat_4['borrow_sum_b2']     = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg('sum').values
    stat_4['borrow_max_b2']     = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg('max').values    # 我们加的
    stat_4['borrow_min_b2']     = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg('min').values    # 我们加的
    stat_4['borrow_std_b2']     = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg('std').values    # 我们加的
    stat_4['borrow_median_b2']  = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg('median').values # 我们加的
    stat_4['deal_diff_max_min_b2']  = ss[ss['date'] < dict_1[month][0]].groupby('core_cust_id')['s4'].agg(diff_max_min).values
    

    tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_3, on='core_cust_id', how='left')
    tmp_df = tmp_df.merge(stat_4, on='core_cust_id', how='left')
    
    dfs.append(tmp_df)

df = pd.concat(dfs).reset_index(drop=True)
2021-07-012021-08-012021-09-012021-10-01
gc.collect()
70
df.shape[0]
390977

F资产信息表

f = pd.read_csv(path2 + 'f.csv', thousands=',')
f.head(2)
core_cust_id f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22
0 48e055617a 2020-11-26 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20210830
1 48e055617a 2020-11-26 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20210730
drop_col = ['f'+str(i) for i in range(2,22)]
f.drop_duplicates(drop_col, keep='last', inplace=True)
f.head(2)
core_cust_id f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22
37 28fa6f2d3 2018-07-21 157.280 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 125.93 110.0 110.0 110.0 110.0 20210930
38 28fa6f2d3 2018-07-21 143.195 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 177.5 110.0 110.0 110.0 110.0 118.37 110.0 110.0 110.0 110.0 20210830
f['f_start_datetime'] = pd.to_datetime(f['f1'],errors='coerce')   #先转化为datetime类型,默认format='%Y-%m-%d %H:%M:%S'f['f_end_datetime'] = pd.to_datetime(f['f22'].astype('str'), format='%Y-%m-%d', errors='coerce')f['f_diff_time'] = f['f_end_datetime'] - f['f_start_datetime']
f['f_diff_time'] = f['f_diff_time'].dt.days
f.head(2)
core_cust_id f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f_start_datetime f_end_datetime f_diff_time
37 28fa6f2d3 2018-07-21 157.280 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 125.93 110.0 110.0 110.0 110.0 20210930 2018-07-21 2021-09-30 1167
38 28fa6f2d3 2018-07-21 143.195 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 177.5 110.0 110.0 110.0 110.0 118.37 110.0 110.0 110.0 110.0 20210830 2018-07-21 2021-08-30 1136
# 单个用户的资产信息统计了多次,那么需要考虑单个用户的资产变化情况,资产统计次数等,f{i}的统计量特征,f1频度,f22频度
f['core_cust_id'].nunique()
30490
f['f22'].value_counts()
20210930    28081
20210830    25696
20210730    24054
20210630    23063
Name: f22, dtype: int64
# df['date'] = df['datetime'].dt.date   #转化提取年-月-日f['year_f'] =f['f_start_datetime'].dt.year.fillna(0).astype("int")   #转化提取年 ,#如果有NaN元素则默认转化float64型,要转换数据类型则需要先填充空值,在做数据类型转换f['month_f'] = f['f_start_datetime'].dt.month.fillna(0).astype("int")  #转化提取月# df['monthofyear'] = df['year'].map(str) + '-' + df['month'].map(str) #转化获取年-月f['day_f'] = f['f_start_datetime'].dt.day.fillna(0).astype("int")      #转化提取天# df['hour'] = df['datetime'].dt.hour.fillna(0).astype("int")    #转化提取小时# df['minute'] = df['datetime'].dt.minute.fillna(0).astype("int") #转化提取分钟# df['second'] = df['datetime'].dt.second.fillna(0).astype("int") #转化提取秒# ss['dayofyear']  = ss['datetime'].dt.dayofyear.fillna(0).astype("int") #一年中的第n天f['weekofyear_f'] = f['f_start_datetime'].dt.weekofyear.fillna(0).astype("int") #一年中的第n周f['weekday_f']    = f['f_start_datetime'].dt.weekday.fillna(0).astype("int") #周几,一周里的第几天,Monday=0, Sunday=6f['quarter_f']    = f['f_start_datetime'].dt.quarter.fillna(0).astype("int")  #季度f['is_wknd_f']    = f['f_start_datetime'].dt.dayofweek // 4                  #是否周末# df['is_month_start'] = df['datetime'].dt.is_month_start.astype(int)# df['is_month_end'] = df['datetime'].dt.is_month_end.astype(int)display(f.head(2))
core_cust_id f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f_start_datetime f_end_datetime f_diff_time year_f month_f day_f weekofyear_f weekday_f quarter_f is_wknd_f
37 28fa6f2d3 2018-07-21 157.280 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 125.93 110.0 110.0 110.0 110.0 20210930 2018-07-21 2021-09-30 1167 2018 7 21 29 5 3 1
38 28fa6f2d3 2018-07-21 143.195 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 177.5 110.0 110.0 110.0 110.0 118.37 110.0 110.0 110.0 110.0 20210830 2018-07-21 2021-08-30 1136 2018 7 21 29 5 3 1
# f.loc[f['f22']==20210630,'f22'] = '2021-07-01'
f['f22'] = f['f22'].astype('str')
f.head(2)
core_cust_id f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f_start_datetime f_end_datetime f_diff_time year_f month_f day_f weekofyear_f weekday_f quarter_f is_wknd_f
37 28fa6f2d3 2018-07-21 157.280 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 125.93 110.0 110.0 110.0 110.0 20210930 2018-07-21 2021-09-30 1167 2018 7 21 29 5 3 1
38 28fa6f2d3 2018-07-21 143.195 110.0 110.0 110.0 110.0 185.0 110.0 110.0 110.0 110.0 177.5 110.0 110.0 110.0 110.0 118.37 110.0 110.0 110.0 110.0 20210830 2018-07-21 2021-08-30 1136 2018 7 21 29 5 3 1
f.info()

Int64Index: 100894 entries, 37 to 1038595
Data columns (total 33 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   core_cust_id      100894 non-null  object        
 1   f1                100894 non-null  object        
 2   f2                58531 non-null   float64       
 3   f3                42937 non-null   float64       
 4   f4                34636 non-null   float64       
 5   f5                12879 non-null   float64       
 6   f6                20731 non-null   float64       
 7   f7                58531 non-null   float64       
 8   f8                42937 non-null   float64       
 9   f9                34636 non-null   float64       
 10  f10               12879 non-null   float64       
 11  f11               20731 non-null   float64       
 12  f12               58531 non-null   float64       
 13  f13               42937 non-null   float64       
 14  f14               34636 non-null   float64       
 15  f15               12879 non-null   float64       
 16  f16               20731 non-null   float64       
 17  f17               58531 non-null   float64       
 18  f18               42937 non-null   float64       
 19  f19               34636 non-null   float64       
 20  f20               12879 non-null   float64       
 21  f21               20731 non-null   float64       
 22  f22               100894 non-null  object        
 23  f_start_datetime  100894 non-null  datetime64[ns]
 24  f_end_datetime    100894 non-null  datetime64[ns]
 25  f_diff_time       100894 non-null  int64         
 26  year_f            100894 non-null  int32         
 27  month_f           100894 non-null  int32         
 28  day_f             100894 non-null  int32         
 29  weekofyear_f      100894 non-null  int32         
 30  weekday_f         100894 non-null  int32         
 31  quarter_f         100894 non-null  int32         
 32  is_wknd_f         100894 non-null  int64         
dtypes: datetime64[ns](2), float64(20), int32(6), int64(2), object(3)
memory usage: 23.9+ MB
  • f2 定期季日均—f3 大额存单季日均----f4 A 理财产品季日均—f5 代销资管季日均----f6 代销基金季日均—
  • f7 定期时点余额–f8 大额存单时点余额–f9 A 理财产品时点余额–f10 代销资管时点余额–f11 代销基金时点余额
  • f12 定期月日均–f13 大额存单月日均–f14 A 理财产品月日均–f15 代销资管月日均–f16 代销基金月日均
  • f17 定期年日均–f18 大额存单年日均–f19 A 理财产品年日均–f20 代销资管年日均–f21 代销基金年日均
# def diff_max_min(x):#     return x.max() - x.min()
f['date'] = f['f22'].apply(lambda x: x[:6])#used_cols = [c for c in f.columns if c not in ['core_cust_id','f1','f22','date']]#      f4   /   f9  /  f14  /  f19     属于A产品# used_cols = ['f' + str(i) for i in range(2, 22)]  used_cols = ['f2', 'f3', 'f5', 'f6', 'f7', 'f8', 'f10', 'f11', 'f12', 'f13', 'f15', 'f16', 'f17', 'f18', 'f20', 'f21'] #对比全量和非A的分数dict_ = {'2021-07-01':'202106', '2021-08-01':'202107', '2021-09-01':'202108', '2021-10-01':'202109'}dfs = []for month in sorted(df['a3'].unique()):    print(month)    tmp_df = df[df['a3'] == month]        stat_1 = f[f['date'] == dict_[month]].groupby('core_cust_id')[used_cols].mean().reset_index()    stat_1.columns = ['core_cust_id'] + [f'{c}_mean' for c in used_cols]    for col in used_cols:         stat_1[f'{col}_nunique'] = f[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg('nunique').values        stat_1[f'{col}_max'] = f[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg('max').values        stat_1[f'{col}_min'] = f[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg('min').values#         stat_1[f'{col}_mean'] = stat_1[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg('mean').values        stat_1[f'{col}_sum'] = f[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg('sum').values        stat_1[f'{col}_std'] = f[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg('std').values        stat_1[f'{col}_median'] = f[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg('median').values        stat_1[f'{col}_diff_max_min'] = f[f['date'] == dict_[month]].groupby('core_cust_id')[col].agg(diff_max_min).values            stat_2 = f[f['date'] < dict_[month]].groupby('core_cust_id')[used_cols].mean().reset_index()    stat_2.columns = ['core_cust_id'] + [f'{c}_mean_b' for c in used_cols]    for col in used_cols:         stat_2[f'{col}_nunique_b'] =  f[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg('nunique').values        stat_2[f'{col}_max_b'] =      f[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg('max').values        stat_2[f'{col}_min_b'] =      f[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg('min').values#         stat_2[f'{col}_mean_b'] =     stat_2[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg('mean').values        stat_2[f'{col}_sum_b'] =      f[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg('sum').values        stat_2[f'{col}_std_b'] =      f[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg('std').values        stat_2[f'{col}_median_b'] =   f[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg('median').values        stat_2[f'{col}_diff_max_min_b'] = f[f['date'] < dict_[month]].groupby('core_cust_id')[col].agg(diff_max_min).values         tmp_df = tmp_df.merge(stat_1, on='core_cust_id', how='left')    tmp_df = tmp_df.merge(stat_2, on='core_cust_id', how='left')    dfs.append(tmp_df)df = pd.concat(dfs).reset_index(drop=True)
2021-07-012021-08-012021-09-012021-10-01
gc.collect()
15464
ff = f[['core_cust_id', 'f22', 'f_diff_time', 'year_f', 'month_f', 'day_f', 'weekofyear_f', 'weekday_f', 'quarter_f', 'is_wknd_f']]
ff.loc[ff['f22']=='20210630','f22'] = '2021-07-01'
ff.loc[ff['f22']=='20210730','f22'] = '2021-08-01'
ff.loc[ff['f22']=='20210830','f22'] = '2021-09-01'
ff.loc[ff['f22']=='20210930','f22'] = '2021-10-01'
ff.rename(columns={'f22' : 'a3'}, inplace=True)
df = pd.merge(df, ff, on=['core_cust_id', 'a3'], how='left')
#51461
df.shape[0]
390977
gc.collect()
34
df.nunique()
id                                   390977
core_cust_id                         212637
prod_code                                 2
a2                                        1
a3                                        4
y                                         2
type                                      2
uid_count_1                              22
pid_nunique_1                             6
pid_mean_count_1                         33
净值_apply_mean_1                         702
净值_apply_max_1                           42
净值_apply_min_1                           43
净值_apply_std_1                          812
净值_apply_median_1                       202
净值_apply_sum_1                          835
净值_apply_max_min_1                      159
总金额_apply_mean_1                        910
总金额_apply_max_1                         689
总金额_apply_min_1                         504
总金额_apply_std_1                         489
总金额_apply_median_1                      724
总金额_apply_sum_1                         992
总金额_apply_max_min_1                     385
超额管理费_apply_mean_1                      796
超额管理费_apply_max_1                       585
超额管理费_apply_min_1                       443
超额管理费_apply_std_1                       416
超额管理费_apply_median_1                    641
超额管理费_apply_sum_1                       846
超额管理费_apply_max_min_1                   311
apply_amt_apply_mean_1                  703
apply_amt_apply_max_1                   377
apply_amt_apply_min_1                   311
apply_amt_apply_std_1                   693
apply_amt_apply_median_1                434
apply_amt_apply_sum_1                  1037
apply_amt_apply_max_min_1               259
业务代码_apply_nunique_1                      6
渠道标识_apply_nunique_1                      2
资金状态_apply_nunique_1                      5
交易状态_apply_nunique_1                      3
deal_bus_apply_nunique_1                  8
deal_bus_channel_apply_nunique_1          8
deal_bus_fund_apply_nunique_1             8
deal_bus_c_f_apply_nunique_1              8
uid_count_2                               0
pid_nunique_2                             0
pid_mean_count_2                          0
净值_apply_mean_2                           0
净值_apply_max_2                            0
净值_apply_min_2                            0
净值_apply_std_2                            0
净值_apply_median_2                         0
净值_apply_sum_2                            0
净值_apply_max_min_2                        0
总金额_apply_mean_2                          0
总金额_apply_max_2                           0
总金额_apply_min_2                           0
总金额_apply_std_2                           0
总金额_apply_median_2                        0
总金额_apply_sum_2                           0
总金额_apply_max_min_2                       0
超额管理费_apply_mean_2                        0
超额管理费_apply_max_2                         0
超额管理费_apply_min_2                         0
超额管理费_apply_std_2                         0
超额管理费_apply_median_2                      0
超额管理费_apply_sum_2                         0
超额管理费_apply_max_min_2                     0
apply_amt_apply_mean_2                    0
apply_amt_apply_max_2                     0
apply_amt_apply_min_2                     0
apply_amt_apply_std_2                     0
apply_amt_apply_median_2                  0
apply_amt_apply_sum_2                     0
apply_amt_apply_max_min_2                 0
业务代码_apply_nunique_2                      0
渠道标识_apply_nunique_2                      0
资金状态_apply_nunique_2                      0
交易状态_apply_nunique_2                      0
deal_bus_apply_nunique_2                  0
deal_bus_channel_apply_nunique_2          0
deal_bus_fund_apply_nunique_2             0
deal_bus_c_f_apply_nunique_2              0
uid_count_b                              48
pid_nunique_b                            14
pid_mean_count_b                        146
净值_apply_mean_b                        4033
净值_apply_max_b                           76
净值_apply_min_b                           62
净值_apply_std_b                         5427
净值_apply_median_b                       453
净值_apply_sum_b                         4794
净值_apply_max_min_b                      414
总金额_apply_mean_b                       1790
总金额_apply_max_b                         839
总金额_apply_min_b                          11
总金额_apply_std_b                        1902
总金额_apply_median_b                      366
总金额_apply_sum_b                        1936
总金额_apply_max_min_b                     835
超额管理费_apply_mean_b                     1438
超额管理费_apply_max_b                       689
超额管理费_apply_min_b                         1
超额管理费_apply_std_b                      1589
超额管理费_apply_median_b                    294
超额管理费_apply_sum_b                      1529
超额管理费_apply_max_min_b                   689
apply_amt_apply_mean_b                 2578
apply_amt_apply_max_b                   618
apply_amt_apply_min_b                   443
apply_amt_apply_std_b                  4182
apply_amt_apply_median_b                725
apply_amt_apply_sum_b                  3875
apply_amt_apply_max_min_b               585
业务代码_apply_nunique_b                      7
渠道标识_apply_nunique_b                      3
资金状态_apply_nunique_b                      5
交易状态_apply_nunique_b                      4
deal_bus_apply_nunique_b                 10
deal_bus_channel_apply_nunique_b         10
deal_bus_fund_apply_nunique_b            11
deal_bus_c_f_apply_nunique_b             11
uid_count_b1                             43
pid_nunique_b1                           13
pid_mean_count_b1                       131
净值_apply_mean_b1                       3199
净值_apply_max_b1                          69
净值_apply_min_b1                          54
净值_apply_std_b1                        4263
净值_apply_median_b1                      376
净值_apply_sum_b1                        3804
净值_apply_max_min_b1                     340
总金额_apply_mean_b1                      1170
总金额_apply_max_b1                        621
总金额_apply_min_b1                         11
总金额_apply_std_b1                       1239
总金额_apply_median_b1                     252
总金额_apply_sum_b1                       1272
总金额_apply_max_min_b1                    617
超额管理费_apply_mean_b1                     888
超额管理费_apply_max_b1                      481
超额管理费_apply_min_b1                        1
超额管理费_apply_std_b1                      967
超额管理费_apply_median_b1                   189
超额管理费_apply_sum_b1                      948
超额管理费_apply_max_min_b1                  481
apply_amt_apply_mean_b1                2205
apply_amt_apply_max_b1                  594
apply_amt_apply_min_b1                  435
apply_amt_apply_std_b1                 3452
apply_amt_apply_median_b1               685
apply_amt_apply_sum_b1                 3329
apply_amt_apply_max_min_b1              550
业务代码_apply_nunique_b1                     7
渠道标识_apply_nunique_b1                     3
资金状态_apply_nunique_b1                     5
交易状态_apply_nunique_b1                     4
deal_bus_apply_nunique_b1                10
deal_bus_channel_apply_nunique_b1        10
deal_bus_fund_apply_nunique_b1           10
deal_bus_c_f_apply_nunique_b1            10
uid_count_b2                             36
pid_nunique_b2                           13
pid_mean_count_b2                       114
净值_apply_mean_b2                       2407
净值_apply_max_b2                          61
净值_apply_min_b2                          46
净值_apply_std_b2                        3137
净值_apply_median_b2                      284
净值_apply_sum_b2                        2857
净值_apply_max_min_b2                     268
总金额_apply_mean_b2                       581
总金额_apply_max_b2                        332
总金额_apply_min_b2                         11
总金额_apply_std_b2                        621
总金额_apply_median_b2                     134
总金额_apply_sum_b2                        658
总金额_apply_max_min_b2                    328
超额管理费_apply_mean_b2                     347
超额管理费_apply_max_b2                      203
超额管理费_apply_min_b2                        1
超额管理费_apply_std_b2                      381
超额管理费_apply_median_b2                    77
超额管理费_apply_sum_b2                      393
超额管理费_apply_max_min_b2                  203
apply_amt_apply_mean_b2                1893
apply_amt_apply_max_b2                  562
apply_amt_apply_min_b2                  422
apply_amt_apply_std_b2                 2781
apply_amt_apply_median_b2               653
apply_amt_apply_sum_b2                 2815
apply_amt_apply_max_min_b2              493
业务代码_apply_nunique_b2                     7
渠道标识_apply_nunique_b2                     3
资金状态_apply_nunique_b2                     5
交易状态_apply_nunique_b2                     4
deal_bus_apply_nunique_b2                10
deal_bus_channel_apply_nunique_b2        10
deal_bus_fund_apply_nunique_b2           10
deal_bus_c_f_apply_nunique_b2            10
flow_redu_b1                             26
flow_redu_b                              28
flow_redu_b2                             22
flow_redu_1                              17
flow_redu_2                               0
risk_count                                3
risk_change                               2
risk_level_mean                          10
risk_max                                  5
risk_min                                  5
risk_count_b                             14
risk_change_b                             5
risk_level_mean_b                        80
risk_max_b                                5
risk_min_b                                5
客户风险等级                                    5
date                                     98
risk_diff_date                          123
性别                                        2
客户等级                                      4
年龄                                       84
age_gap                                   1
age_grade                               221
age_bin10                                10
age_bin15                                15
age_bin20                                20
age_bin25                                25
age_bin30                                30
age_频度                                   80
age_freq3                                 3
age_freq5                                 5
age_freq7                                 7
age_freq9                                 9
age_freq11                               11
cust_grade_频度                             4
prod_age_count                            2
prod_age_max                              2
prod_age_min                              1
prod_age_sum                              2
prod_age_std                              2
prod_age_median                           2
prod_age_nunique                          2
uid_click_action_count_b                436
pid_click_action_nunique_b              169
uid_click_r3_nunique_b                    2
uid_click_iswknd_sum_b                  193
uid_click_iswknd_median_b                 3
uid_click_click_hour_median_b            47
uid_click_click_hour_sum_b             2518
uid_click_click_hour_max_b               24
uid_click_click_hour_min_b               24
uid_click_click_hour_nunique_b           24
uid_click_click_hour_smean_b           1610
uid_click_click_month_median_b           15
uid_click_click_month_sum_b            1392
uid_click_click_month_max_b               7
uid_click_click_month_min_b               7
uid_click_click_month_nunique_b           7
uid_click_click_month_smean_b          1003
uid_click_click_d_median_b               61
uid_click_click_d_sum_b                2904
uid_click_click_d_max_b                  31
uid_click_click_d_min_b                  31
uid_click_click_d_nunique_b              31
uid_click_click_d_smean_b              1833
uid_click_weekofyear_r_median_b          85
uid_click_weekofyear_r_sum_b           3387
uid_click_weekofyear_r_max_b             32
uid_click_weekofyear_r_min_b             32
uid_click_weekofyear_r_nunique_b         32
uid_click_weekofyear_r_smean_b         1849
uid_click_weekday_r_median_b             13
uid_click_weekday_r_sum_b               845
uid_click_weekday_r_max_b                 7
uid_click_weekday_r_min_b                 7
uid_click_weekday_r_nunique_b             7
uid_click_weekday_r_smean_b             683
uid_click_quarter_r_median_b              5
uid_click_quarter_r_sum_b               753
uid_click_quarter_r_max_b                 3
uid_click_quarter_r_min_b                 3
uid_click_quarter_r_nunique_b             3
uid_click_quarter_r_smean_b             624
uid_click_action_count_b1               429
pid_click_action_nunique_b1             166
uid_click_r3_nunique_b1                   2
uid_click_iswknd_sum_b1                 195
uid_click_iswknd_median_b1                3
uid_click_click_hour_median_b1           47
uid_click_click_hour_sum_b1            2513
uid_click_click_hour_max_b1              24
uid_click_click_hour_min_b1              24
uid_click_click_hour_nunique_b1          24
uid_click_click_hour_smean_b1          1615
uid_click_click_month_median_b1          15
uid_click_click_month_sum_b1           1352
uid_click_click_month_max_b1              7
uid_click_click_month_min_b1              7
uid_click_click_month_nunique_b1          7
uid_click_click_month_smean_b1         1012
uid_click_click_d_median_b1              61
uid_click_click_d_sum_b1               2895
uid_click_click_d_max_b1                 31
uid_click_click_d_min_b1                 31
uid_click_click_d_nunique_b1             31
uid_click_click_d_smean_b1             1819
uid_click_weekofyear_r_median_b1         84
uid_click_weekofyear_r_sum_b1          3261
uid_click_weekofyear_r_max_b1            32
uid_click_weekofyear_r_min_b1            32
uid_click_weekofyear_r_nunique_b1        32
uid_click_weekofyear_r_smean_b1        1822
uid_click_weekday_r_median_b1            13
uid_click_weekday_r_sum_b1              839
uid_click_weekday_r_max_b1                7
uid_click_weekday_r_min_b1                7
uid_click_weekday_r_nunique_b1            7
uid_click_weekday_r_smean_b1            671
uid_click_quarter_r_median_b1             5
uid_click_quarter_r_sum_b1              738
uid_click_quarter_r_max_b1                3
uid_click_quarter_r_min_b1                3
uid_click_quarter_r_nunique_b1            3
uid_click_quarter_r_smean_b1            630
cp_click_action_count_b                 150
cp_click_r3_nunique_b                     1
cp_click_iswknd_sum_b                    72
cp_click_iswknd_median_b                  3
cp_click_click_hour_median_b             47
cp_click_click_hour_sum_b               913
cp_click_click_hour_max_b                24
cp_click_click_hour_min_b                24
cp_click_click_hour_nunique_b            24
cp_click_click_hour_smean_b             494
cp_click_click_month_median_b            12
cp_click_click_month_sum_b              508
cp_click_click_month_max_b                6
cp_click_click_month_min_b                6
cp_click_click_month_nunique_b            6
cp_click_click_month_smean_b            290
cp_click_click_d_median_b                61
cp_click_click_d_sum_b                 1083
cp_click_click_d_max_b                   31
cp_click_click_d_min_b                   31
cp_click_click_d_nunique_b               31
cp_click_click_d_smean_b                559
cp_click_weekofyear_r_median_b           51
cp_click_weekofyear_r_sum_b            1312
cp_click_weekofyear_r_max_b              24
cp_click_weekofyear_r_min_b              24
cp_click_weekofyear_r_nunique_b          23
cp_click_weekofyear_r_smean_b           498
cp_click_weekday_r_median_b              13
cp_click_weekday_r_sum_b                293
cp_click_weekday_r_max_b                  7
cp_click_weekday_r_min_b                  7
cp_click_weekday_r_nunique_b              7
cp_click_weekday_r_smean_b              267
cp_click_quarter_r_median_b               5
cp_click_quarter_r_sum_b                264
cp_click_quarter_r_max_b                  3
cp_click_quarter_r_min_b                  3
cp_click_quarter_r_nunique_b              3
cp_click_quarter_r_smean_b              209
cp_click_action_count_b1                138
cp_click_r3_nunique_b1                    1
cp_click_iswknd_sum_b1                   67
cp_click_iswknd_median_b1                 3
cp_click_click_hour_median_b1            47
cp_click_click_hour_sum_b1              871
cp_click_click_hour_max_b1               24
cp_click_click_hour_min_b1               24
cp_click_click_hour_nunique_b1           24
cp_click_click_hour_smean_b1            464
cp_click_click_month_median_b1           12
cp_click_click_month_sum_b1             471
cp_click_click_month_max_b1               6
cp_click_click_month_min_b1               6
cp_click_click_month_nunique_b1           6
cp_click_click_month_smean_b1           281
cp_click_click_d_median_b1               61
cp_click_click_d_sum_b1                1029
cp_click_click_d_max_b1                  31
cp_click_click_d_min_b1                  31
cp_click_click_d_nunique_b1              31
cp_click_click_d_smean_b1               524
cp_click_weekofyear_r_median_b1          51
cp_click_weekofyear_r_sum_b1           1224
cp_click_weekofyear_r_max_b1             24
cp_click_weekofyear_r_min_b1             24
cp_click_weekofyear_r_nunique_b1         22
cp_click_weekofyear_r_smean_b1          473
cp_click_weekday_r_median_b1             13
cp_click_weekday_r_sum_b1               279
cp_click_weekday_r_max_b1                 7
cp_click_weekday_r_min_b1                 7
cp_click_weekday_r_nunique_b1             7
cp_click_weekday_r_smean_b1             250
cp_click_quarter_r_median_b1              5
cp_click_quarter_r_sum_b1               247
cp_click_quarter_r_max_b1                 3
cp_click_quarter_r_min_b1                 3
cp_click_quarter_r_nunique_b1             3
cp_click_quarter_r_smean_b1             198
deal_count                              277
deal_nunique                            202
single_deal                            2232
deal_mean                             73099
borrow_sum                            74470
borrow_max                            32457
borrow_min                            24461
borrow_std                            67088
borrow_median                         36804
deal_diff_max_min                     48290
deal_count_2                              0
deal_nunique_2                            0
single_deal_2                             0
deal_mean_2                               0
borrow_sum_2                              0
borrow_max_2                              0
borrow_min_2                              0
borrow_std_2                              0
borrow_median_2                           0
deal_diff_max_min_2                       0
deal_count_b1                           980
deal_nunique_b1                         581
single_deal_b1                         9244
deal_mean_b1                         122604
borrow_sum_b1                        124481
borrow_max_b1                         37325
borrow_min_b1                         20383
borrow_std_b1                        122418
borrow_median_b1                      41283
deal_diff_max_min_b1                  65336
deal_count_b2                           869
deal_nunique_b2                         517
single_deal_b2                         7963
deal_mean_b2                         114017
borrow_sum_b2                        115903
borrow_max_b2                         34360
borrow_min_b2                         19338
borrow_std_b2                        113487
borrow_median_b2                      39598
deal_diff_max_min_b2                  61295
f2_mean                                7351
f3_mean                                2446
f5_mean                                1975
f6_mean                                6853
f7_mean                                3326
f8_mean                                1113
f10_mean                               1721
f11_mean                               6341
f12_mean                               4592
f13_mean                               1637
f15_mean                               1836
f16_mean                               6640
f17_mean                              15999
f18_mean                               4912
f20_mean                               2684
f21_mean                               7338
f2_nunique                                2
f2_max                                 7351
f2_min                                 7351
f2_sum                                 7352
f2_std                                    0
f2_median                              7351
f2_diff_max_min                           1
f3_nunique                                2
f3_max                                 2446
f3_min                                 2446
f3_sum                                 2447
f3_std                                    0
f3_median                              2446
f3_diff_max_min                           1
f5_nunique                                2
f5_max                                 1975
f5_min                                 1975
f5_sum                                 1976
f5_std                                    0
f5_median                              1975
f5_diff_max_min                           1
f6_nunique                                2
f6_max                                 6853
f6_min                                 6853
f6_sum                                 6854
f6_std                                    0
f6_median                              6853
f6_diff_max_min                           1
f7_nunique                                2
f7_max                                 3326
f7_min                                 3326
f7_sum                                 3327
f7_std                                    0
f7_median                              3326
f7_diff_max_min                           1
f8_nunique                                2
f8_max                                 1113
f8_min                                 1113
f8_sum                                 1114
f8_std                                    0
f8_median                              1113
f8_diff_max_min                           1
f10_nunique                               2
f10_max                                1721
f10_min                                1721
f10_sum                                1722
f10_std                                   0
f10_median                             1721
f10_diff_max_min                          1
f11_nunique                               2
f11_max                                6341
f11_min                                6341
f11_sum                                6342
f11_std                                   0
f11_median                             6341
f11_diff_max_min                          1
f12_nunique                               2
f12_max                                4592
f12_min                                4592
f12_sum                                4593
f12_std                                   0
f12_median                             4592
f12_diff_max_min                          1
f13_nunique                               2
f13_max                                1637
f13_min                                1637
f13_sum                                1638
f13_std                                   0
f13_median                             1637
f13_diff_max_min                          1
f15_nunique                               2
f15_max                                1836
f15_min                                1836
f15_sum                                1837
f15_std                                   0
f15_median                             1836
f15_diff_max_min                          1
f16_nunique                               2
f16_max                                6640
f16_min                                6640
f16_sum                                6641
f16_std                                   0
f16_median                             6640
f16_diff_max_min                          1
f17_nunique                               2
f17_max                               15999
f17_min                               15999
f17_sum                               16000
f17_std                                   0
f17_median                            15999
f17_diff_max_min                          1
f18_nunique                               2
f18_max                                4912
f18_min                                4912
f18_sum                                4913
f18_std                                   0
f18_median                             4912
f18_diff_max_min                          1
f20_nunique                               2
f20_max                                2684
f20_min                                2684
f20_sum                                2685
f20_std                                   0
f20_median                             2684
f20_diff_max_min                          1
f21_nunique                               2
f21_max                                7338
f21_min                                7338
f21_sum                                7339
f21_std                                   0
f21_median                             7338
f21_diff_max_min                          1
f2_mean_b                              8488
f3_mean_b                              2931
f5_mean_b                              1747
f6_mean_b                              4647
f7_mean_b                              2807
f8_mean_b                              1086
f10_mean_b                             1438
f11_mean_b                             4193
f12_mean_b                             4616
f13_mean_b                             2079
f15_mean_b                             1523
f16_mean_b                             4424
f17_mean_b                            12085
f18_mean_b                             3745
f20_mean_b                             2061
f21_mean_b                             4836
f2_nunique_b                              4
f2_max_b                               5463
f2_min_b                               5238
f2_sum_b                               9220
f2_std_b                               4754
f2_median_b                            7300
f2_diff_max_min_b                      3690
f3_nunique_b                              4
f3_max_b                               1960
f3_min_b                               2026
f3_sum_b                               3688
f3_std_b                               1134
f3_median_b                            2569
f3_diff_max_min_b                       859
f5_nunique_b                              4
f5_max_b                               1535
f5_min_b                               1219
f5_sum_b                               1848
f5_std_b                                846
f5_median_b                            1668
f5_diff_max_min_b                       789
f6_nunique_b                              4
f6_max_b                               4010
f6_min_b                               3169
f6_sum_b                               4701
f6_std_b                               2584
f6_median_b                            4466
f6_diff_max_min_b                      2347
f7_nunique_b                              4
f7_max_b                               2571
f7_min_b                               2179
f7_sum_b                               3851
f7_std_b                                889
f7_median_b                            2634
f7_diff_max_min_b                       783
f8_nunique_b                              4
f8_max_b                               1037
f8_min_b                               1024
f8_sum_b                               1961
f8_std_b                                 70
f8_median_b                            1050
f8_diff_max_min_b                        53
f10_nunique_b                             4
f10_max_b                              1362
f10_min_b                              1038
f10_sum_b                              1566
f10_std_b                               639
f10_median_b                           1395
f10_diff_max_min_b                      607
f11_nunique_b                             4
f11_max_b                              3469
f11_min_b                              3154
f11_sum_b                              4247
f11_std_b                              2328
f11_median_b                           4063
f11_diff_max_min_b                     2213
f12_nunique_b                             4
f12_max_b                              3507
f12_min_b                              3047
f12_sum_b                              5552
f12_std_b                              2104
f12_median_b                           4121
f12_diff_max_min_b                     1669
f13_nunique_b                             4
f13_max_b                              1477
f13_min_b                              1517
f13_sum_b                              2887
f13_std_b                               676
f13_median_b                           1834
f13_diff_max_min_b                      445
f15_nunique_b                             4
f15_max_b                              1409
f15_min_b                              1124
f15_sum_b                              1645
f15_std_b                               700
f15_median_b                           1487
f15_diff_max_min_b                      678
f16_nunique_b                             4
f16_max_b                              3737
f16_min_b                              3267
f16_sum_b                              4478
f16_std_b                              2420
f16_median_b                           4289
f16_diff_max_min_b                     2322
f17_nunique_b                             4
f17_max_b                             10601
f17_min_b                              7600
f17_sum_b                             12232
f17_std_b                              7247
f17_median_b                          11760
f17_diff_max_min_b                     6906
f18_nunique_b                             4
f18_max_b                              3342
f18_min_b                              2761
f18_sum_b                              4410
f18_std_b                              1659
f18_median_b                           3670
f18_diff_max_min_b                     1553
f20_nunique_b                             4
f20_max_b                              1688
f20_min_b                              1747
f20_sum_b                              2154
f20_std_b                              1049
f20_median_b                           2037
f20_diff_max_min_b                     1022
f21_nunique_b                             4
f21_max_b                              4322
f21_min_b                              3447
f21_sum_b                              4893
f21_std_b                              2693
f21_median_b                           4760
f21_diff_max_min_b                     2586
f_diff_time                            1162
year_f                                    4
month_f                                  12
day_f                                    31
weekofyear_f                             53
weekday_f                                 7
quarter_f                                 4
is_wknd_f                                 2
dtype: int64


你可能感兴趣的:(数据分析,python,数据挖掘,机器学习)