dataframe中连续数值的离散化

通过等宽分箱的方法对 DataFrame 中的连续型数值进行离散化

def binning(x, n=10):
    """Summarise an equal-width binning of ``x``.

    ``x`` is cut with ``pd.cut(x, n)`` and one summary row is produced per
    resulting interval.

    Args:
        x: One-dimensional numeric values (anything ``pd.cut`` accepts,
            e.g. a Series or a list).
        n: Bin specification forwarded to ``pd.cut``; an int asks for that
            many equal-width bins.

    Returns:
        A DataFrame sorted by ``min`` with a fresh 0..k-1 index and columns:
        ``min`` / ``max`` (smallest / largest value observed in the bin),
        ``total`` (number of values in the bin) and ``proba``
        (``total / len(x)``). Intervals that received no values keep a row
        with NaN ``min``/``max`` and zero ``total``/``proba``; those rows
        sort last.
    """
    frame = pd.DataFrame({'x': x, 'bucket': pd.cut(x, n)})
    # observed=False is spelled out so that empty intervals keep their row
    # regardless of the pandas version's default for categorical groupers.
    grouped = frame.groupby('bucket', observed=False)['x']
    counts = grouped.count()
    summary = pd.DataFrame({
        'min': grouped.min(),
        'max': grouped.max(),
        'total': counts,
        'proba': counts / len(x),
    })
    return summary.sort_values(by='min').reset_index(drop=True)

# Replace a raw value with the probability of the bin it falls into.
def replace_proba(value, cut, proba):
    """Return the probability paired with the first edge ``value`` does not exceed.

    Args:
        value: Scalar to look up.
        cut: Iterable of upper bin edges, in the order they should be tried.
        proba: Iterable of probabilities, positionally aligned with ``cut``.

    Returns:
        The element of ``proba`` at the position of the first edge in ``cut``
        for which ``value > edge`` is false, or ``None`` when ``value`` is NaN
        or exceeds every edge.
    """
    # NaN compares false against everything, so without this guard a missing
    # value would be handed the probability of the first bin.
    if value != value:
        return None
    # zip walks both inputs by position, so a Series whose index is not
    # 0..n-1 (or any plain iterable) is handled the same way as a list.
    for upper, bin_proba in zip(cut, proba):
        # Written as "not >" on purpose: a NaN edge still matches here,
        # exactly as it did with the original comparison.
        if not value > upper:
            return bin_proba
    return None
# Collapse one row of probabilities into a single score.
def risk_score(row_var):
    """Return ``exp`` of the root-mean-square of the natural logs of ``row_var``.

    Args:
        row_var: Sized iterable of probabilities (one per variable).

    Returns:
        ``exp(sqrt(mean(log(p) ** 2)))`` over the entries of ``row_var``.
    """
    squared_logs = (np.power(np.log(prob), 2) for prob in row_var)
    mean_square = sum(squared_logs) / len(row_var)
    root_mean_square = np.power(mean_square, 0.5)
    return np.exp(root_mean_square)

使用:

# Assumes `df` is an existing DataFrame and `col` names one of its numeric columns.
bin_col = binning(df[col])
# Map every value in the column to the probability of its bin, using each
# bin's observed maximum as the upper edge.
df[col] = df[col].apply(replace_proba, args=(bin_col['max'], bin_col['proba']))

你可能感兴趣的:(dataframe中连续数值的离散化)