PSI模型稳定计算代码

PSI

2022/02/08 17:33

  1. 参考资料:
    PSI解释:
    https://zhuanlan.zhihu.com/p/79682292
    https://mwburke.github.io/data%20science/2018/04/29/population-stability-index.html
  2. 代码
    https://github.com/mwburke/population-stability-index/blob/master/psi.py
import pandas as pd
import numpy as np
 
 
def calculate_psi(expected, actual, buckettype='bins', buckets=10):
    '''Calculate the PSI (population stability index) for every column.

    Buckets are always derived from ``expected``; the rows of ``actual`` are
    then counted into those same buckets and the two distributions compared.

    Args:
       expected: DataFrame whose columns define the buckets (the original
                 author passes the test-set frame here).
       actual: DataFrame compared against ``expected``; must contain every
               column of ``expected`` (the original author passes the
               training-set frame here).
       buckettype: bucketing mode:
                   'bins': equal-width buckets over [min, max] of expected
                   'quantiles': equal-frequency buckets (pd.qcut)
       buckets: number of buckets.

    Returns:
       psi_values: one-row DataFrame, one PSI value per column of
                   ``expected`` (NaN for a column that could not be processed).
       stat_values: list with one entry per column: a DataFrame holding the
                    per-bucket counts, percentages and PSI contribution, or
                    None when that column could not be processed.

    Raises:
       ValueError: if ``buckettype`` is not 'bins' or 'quantiles'.
    '''
    if buckettype not in ('bins', 'quantiles'):
        raise ValueError(
            "buckettype must be 'bins' or 'quantiles', got %r" % (buckettype,)
        )

    def sub_psi(e_perc, a_perc):
        '''Return one bucket's PSI contribution.

        A zero share is replaced by a very small number so that the
        logarithm and the division stay defined.
        '''
        if a_perc == 0:
            a_perc = 0.0001
        if e_perc == 0:
            e_perc = 0.0001
        return (e_perc - a_perc) * np.log(e_perc / a_perc)

    def psi(expected_array, actual_array, buckets):
        '''Calculate the PSI for a single variable.

        Args:
           expected_array: Series of original values (defines the buckets)
           actual_array: Series of new values
           buckets: number of buckets to split the values into

        Returns:
           (psi_value, stat_df): the summed PSI and the per-bucket table,
           or (nan, None) if the column could not be processed.
        '''
        try:
            if buckettype == 'bins':
                lo = np.min(expected_array)
                hi = np.max(expected_array)
                # The "+ 1" widens the step exactly as the original formula did.
                step = (hi - lo + 1) / buckets
                edges = [round(i, 4) for i in np.arange(lo, hi, step)]

                # pd.cut intervals are right-closed, so the minimum would land
                # in (-inf, min] together with anything smaller.  An extra
                # edge just below keeps the minimum in its own bucket, e.g.
                # (-inf, -1e-05], (-1e-05, 0.0], (0.0, ...] when min is 0.
                lower = -0.00001
                if edges and edges[0] < lower:
                    # A negative minimum would make the fixed -1e-05 edge
                    # non-monotonic; shift it below the smallest edge instead.
                    lower = min(lo, edges[0]) - 0.00001
                bins = [-np.inf, lower] + edges + [np.inf]
                grouper = pd.cut(expected_array, bins=bins)
            else:
                grouper = pd.qcut(expected_array, buckets, duplicates='drop')

            # observed=False keeps empty buckets so every category has a row.
            expected_bins = expected_array.groupby(grouper, observed=False).count()

            # Buckets built on expected, with their counts and shares.
            bucket_labels = [str(b) for b in expected_bins.index.categories]
            expected_cnt = list(expected_bins.values)
            expected_percents = [v / len(expected_array) for v in expected_cnt]

            # Count the actual values falling into each expected bucket.
            actual_cnt = []
            actual_percents = []
            for interval in expected_bins.index:
                in_bucket = (actual_array > interval.left) & (actual_array <= interval.right)
                cnt = int(in_bucket.sum())
                actual_cnt.append(cnt)
                actual_percents.append(cnt / len(actual_array))

            # Per-bucket counts, shares and PSI contribution.
            stat_df = pd.DataFrame({
                "bucket": bucket_labels,
                "expected_cnt": expected_cnt,
                "actual_cnt": actual_cnt,
                "expected_perc": expected_percents,
                "actual_perc": actual_percents,
                "psi": [sub_psi(e, a) for e, a in zip(expected_percents, actual_percents)],
            }, columns=['bucket', 'expected_cnt', 'actual_cnt',
                        'expected_perc', 'actual_perc', 'psi'])
            return (stat_df['psi'].sum(), stat_df)
        except Exception:
            # Best effort: one bad column (non-numeric, empty, ...) must not
            # abort the whole run; report it and mark its PSI as NaN.
            print('error!!!')
            return (np.nan, None)

    stat_values = []
    psi_list = []
    for col in expected.columns:
        psi_value, stat_df = psi(expected[col], actual[col], buckets)
        psi_list.append([psi_value])
        stat_values.append(stat_df)
    psi_values = pd.DataFrame(psi_list).T
    psi_values.columns = expected.columns
    return psi_values, stat_values
 
# Demo: compare two months of the sample data on two columns.
path = "mx_case_clipper.csv"
df = pd.read_csv(path)
col = ['bubble_90day_rate','bubble_90day_cnt']
# Selecting with a list of columns returns a copy, so an in-place fillna on
# that selection never reaches df; assign the filled result back instead.
df[col] = df[col].fillna(0)
df_06 = df[df['year_month'] == '2021-06'][col]
df_07 = df[df['year_month'] == '2021-07'][col]
buckets = 10
 
# 2021-07 is passed as `expected` (defines the buckets), 2021-06 as `actual`.
psi_values,stat_values = calculate_psi(df_07, df_06, buckettype='bins', buckets=buckets)
'''
result:
psi_values:
 
     bubble_90day_rate   bubble_90day_cnt
0       0.021738          0.035018
 
stat_values:
[            bucket  expected_cnt  actual_cnt  expected_perc  actual_perc  \
  0   (-inf, -1e-05]             0           0       0.000000     0.000000  
  1    (-1e-05, 0.0]           209         245       0.323529     0.339806  
  2       (0.0, 4.1]           384         393       0.594427     0.545076  
  3       (4.1, 8.2]            34          50       0.052632     0.069348  
  4      (8.2, 12.3]            10          18       0.015480     0.024965  
  5     (12.3, 16.4]             4           5       0.006192     0.006935  
  6     (16.4, 20.5]             2           2       0.003096     0.002774  
  7     (20.5, 24.6]             2           4       0.003096     0.005548  
  8     (24.6, 28.7]             0           0       0.000000     0.000000  
  9     (28.7, 32.8]             0           0       0.000000     0.000000  
  10    (32.8, 36.9]             0           1       0.000000     0.001387  
  11     (36.9, inf]             1           3       0.001548     0.004161  
   
           psi 
  0   0.000000 
  1   0.000799 
  2   0.004277 
  3   0.004611 
  4   0.004534 
  5   0.000084 
  6   0.000035 
  7   0.001430 
  8   0.000000 
  9   0.000000 
  10  0.003384 
  11  0.002584  ,
              bucket  expected_cnt  actual_cnt  expected_perc  actual_perc  \
  0   (-inf, -1e-05]             0           0       0.000000     0.000000  
  1    (-1e-05, 0.0]           209         245       0.323529     0.339806  
  2      (0.0, 42.7]           318         336       0.492260     0.466019  
  3     (42.7, 85.4]            62          80       0.095975     0.110957  
  4    (85.4, 128.1]            23          26       0.035604     0.036061  
  5   (128.1, 170.8]            20          11       0.030960     0.015257  
  6   (170.8, 213.5]             6           9       0.009288     0.012483  
  7   (213.5, 256.2]             6           6       0.009288     0.008322  
  8   (256.2, 298.9]             0           3       0.000000     0.004161  
  9   (298.9, 341.6]             1           2       0.001548     0.002774  
  10  (341.6, 384.3]             0           0       0.000000     0.000000  
  11    (384.3, inf]             1           3       0.001548     0.004161  
   
           psi 
  0   0.000000 
  1   0.000799 
  2   0.001437 
  3   0.002173 
  4   0.000006 
  5   0.011113 
  6   0.000944 
  7   0.000106 
  8   0.015140 
  9   0.000715 
  10  0.000000 
  11  0.002584  ]
 
 
'''

你可能感兴趣的:(机器学习,深度学习,数据挖掘)