# 2022/02/08 17:33
import pandas as pd
import numpy as np
# Stand-in share used when a bucket is empty, so the PSI logarithm stays finite.
_ZERO_SHARE = 0.0001
# Width of the guard bucket placed just below the first regular bin edge.
_EDGE_EPS = 0.00001


def _sub_psi(e_perc, a_perc):
    """Return the PSI contribution of one bucket.

    A share that is exactly zero is replaced by ``_ZERO_SHARE`` so that
    neither the ratio nor the logarithm is undefined.
    """
    if a_perc == 0:
        a_perc = _ZERO_SHARE
    if e_perc == 0:
        e_perc = _ZERO_SHARE
    return (e_perc - a_perc) * np.log(e_perc / a_perc)


def _equal_width_edges(values, buckets):
    """Build the bin edges used by the 'bins' (equal-width) mode.

    The regular edges start at ``min(values)`` and advance in steps of
    ``(max - min + 1) / buckets``, rounded to 4 decimals; ``-inf`` and
    ``inf`` are added as outer edges.

    One extra guard edge is inserted just below the first regular edge.
    Without it, ``pd.cut`` would produce ``(-inf, min]`` as the first bucket,
    so the minimum itself and anything smaller would be counted together.
    With it, smaller values land in their own ``(-inf, guard]`` bucket.
    For data whose minimum is >= 0 the guard is ``-0.00001``; for a negative
    minimum it is moved below the first edge so the edges stay increasing
    (a fixed ``-0.00001`` would make ``pd.cut`` reject them).
    """
    low = np.min(values)
    high = np.max(values)
    step = (high - low + 1) / buckets
    edges = [round(i, 4) for i in np.arange(low, high, step)]
    guard = -_EDGE_EPS
    if edges:
        guard = min(guard, edges[0] - _EDGE_EPS)
    return [-np.inf, guard] + edges + [np.inf]


def _psi_single(expected_array, actual_array, buckettype, buckets):
    """Calculate the PSI for a single variable.

    Args:
        expected_array: Series of reference values; it defines the buckets.
        actual_array: Series of values compared against the reference.
        buckettype: 'bins' (equal width) or 'quantiles' (equal frequency).
        buckets: number of buckets requested.

    Returns:
        (psi_value, stat_df): the summed PSI and a DataFrame with one row per
        bucket (label, both counts, both shares, per-bucket PSI). When the
        column cannot be bucketed, 'error!!!' is printed and
        ``(nan, None)`` is returned.
    """
    try:
        if buckettype == 'bins':
            grouping = pd.cut(
                expected_array,
                bins=_equal_width_edges(expected_array, buckets),
            )
        else:
            grouping = pd.qcut(expected_array, buckets, duplicates='drop')

        # observed=False keeps empty buckets in the result, so every bucket
        # has a row regardless of the pandas default for categoricals.
        expected_bins = expected_array.groupby(grouping, observed=False).count()
        intervals = list(expected_bins.index)
        expected_cnt = list(expected_bins.values)

        # Count `actual` inside each (left, right] interval taken from `expected`.
        actual_cnt = [
            int(((actual_array > iv.left) & (actual_array <= iv.right)).sum())
            for iv in intervals
        ]

        n_expected = len(expected_array)
        n_actual = len(actual_array)
        stat_df = pd.DataFrame({
            "bucket": [str(iv) for iv in intervals],
            "expected_cnt": expected_cnt,
            "actual_cnt": actual_cnt,
            "expected_perc": [v / n_expected for v in expected_cnt],
            "actual_perc": [c / n_actual for c in actual_cnt],
        })
        stat_df['psi'] = [
            _sub_psi(e, a)
            for e, a in zip(stat_df['expected_perc'], stat_df['actual_perc'])
        ]
        psi_value = stat_df['psi'].sum()
    except (ValueError, TypeError, ZeroDivisionError) as exc:
        # Best effort per column: report the problem and keep going with the
        # remaining columns instead of aborting the whole calculation.
        print('error!!!', exc)
        return (np.nan, None)
    return (psi_value, stat_df)


def calculate_psi(expected, actual, buckettype='bins', buckets=10):
    """Calculate the PSI (population stability index) across all variables.

    Every column of ``expected`` is bucketed, the same buckets are applied to
    the column of the same name in ``actual``, and the per-bucket shares are
    compared.

    Args:
        expected: DataFrame that defines the buckets (described by the
            original author as the test set).
        actual: DataFrame compared against ``expected`` (described by the
            original author as the training set); it must contain every
            column of ``expected``.
        buckettype: bucketing mode:
            'bins': equal-width buckets spanning the range of ``expected``,
                with open-ended first and last buckets.
            'quantiles': equal-frequency buckets from ``pd.qcut``. These
                buckets are bounded, so values of ``actual`` outside the
                range seen in ``expected`` are not counted in any bucket.
        buckets: number of buckets.

    Returns:
        psi_values: one-row DataFrame with the PSI of each column.
        stat_values: list with, per column, a DataFrame of bucket counts,
            shares and per-bucket PSI, or None when that column failed
            (its PSI is then NaN).

    Raises:
        ValueError: if ``buckettype`` is neither 'bins' nor 'quantiles'.
    """
    if buckettype not in ('bins', 'quantiles'):
        raise ValueError(
            "buckettype must be 'bins' or 'quantiles', got %r" % (buckettype,)
        )

    stat_values = []
    psi_list = []
    for col in expected.columns:
        psi_value, stat_df = _psi_single(
            expected[col], actual[col], buckettype, buckets
        )
        psi_list.append([psi_value])
        stat_values.append(stat_df)

    psi_values = pd.DataFrame(psi_list).T
    psi_values.columns = expected.columns
    return psi_values, stat_values
# Example run: compare two monthly slices of the same features.
path = "mx_case_clipper.csv"
df = pd.read_csv(path)
col = ['bubble_90day_rate', 'bubble_90day_cnt']
# Assign the filled columns back: `df[col]` returns a copy, so calling
# `fillna(..., inplace=True)` on it would leave the NaNs in `df` untouched.
df[col] = df[col].fillna(0)
df_06 = df[df['year_month'] == '2021-06'][col]
df_07 = df[df['year_month'] == '2021-07'][col]
buckets = 10
# 2021-07 is passed as `expected` (it defines the buckets), 2021-06 as `actual`.
psi_values, stat_values = calculate_psi(df_07, df_06, buckettype='bins', buckets=buckets)
'''
result:
psi_values:
bubble_90day_rate bubble_90day_cnt
0 0.021738 0.035018
stat_values:
[ bucket expected_cnt actual_cnt expected_perc actual_perc \
0 (-inf, -1e-05] 0 0 0.000000 0.000000
1 (-1e-05, 0.0] 209 245 0.323529 0.339806
2 (0.0, 4.1] 384 393 0.594427 0.545076
3 (4.1, 8.2] 34 50 0.052632 0.069348
4 (8.2, 12.3] 10 18 0.015480 0.024965
5 (12.3, 16.4] 4 5 0.006192 0.006935
6 (16.4, 20.5] 2 2 0.003096 0.002774
7 (20.5, 24.6] 2 4 0.003096 0.005548
8 (24.6, 28.7] 0 0 0.000000 0.000000
9 (28.7, 32.8] 0 0 0.000000 0.000000
10 (32.8, 36.9] 0 1 0.000000 0.001387
11 (36.9, inf] 1 3 0.001548 0.004161
psi
0 0.000000
1 0.000799
2 0.004277
3 0.004611
4 0.004534
5 0.000084
6 0.000035
7 0.001430
8 0.000000
9 0.000000
10 0.003384
11 0.002584 ,
bucket expected_cnt actual_cnt expected_perc actual_perc \
0 (-inf, -1e-05] 0 0 0.000000 0.000000
1 (-1e-05, 0.0] 209 245 0.323529 0.339806
2 (0.0, 42.7] 318 336 0.492260 0.466019
3 (42.7, 85.4] 62 80 0.095975 0.110957
4 (85.4, 128.1] 23 26 0.035604 0.036061
5 (128.1, 170.8] 20 11 0.030960 0.015257
6 (170.8, 213.5] 6 9 0.009288 0.012483
7 (213.5, 256.2] 6 6 0.009288 0.008322
8 (256.2, 298.9] 0 3 0.000000 0.004161
9 (298.9, 341.6] 1 2 0.001548 0.002774
10 (341.6, 384.3] 0 0 0.000000 0.000000
11 (384.3, inf] 1 3 0.001548 0.004161
psi
0 0.000000
1 0.000799
2 0.001437
3 0.002173
4 0.000006
5 0.011113
6 0.000944
7 0.000106
8 0.015140
9 0.000715
10 0.000000
11 0.002584 ]
'''