离散特征信息增益计算
数据来自李航《统计学习方法》5.2.1节中的贷款申请样本数据表
利用pandas的value_counts(),快速计算
import numpy as np
import pandas as pd
def ent(data):
    """Return the Shannon entropy (base 2) of the values in ``data``.

    :param data: collection of discrete values, e.g. a list or a pandas Series
    :return: ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of
        every distinct value in ``data``
    """
    # ``Series.value_counts`` replaces the deprecated top-level
    # ``pd.value_counts`` helper.
    prob = pd.Series(data).value_counts() / len(data)
    # Categorical data can report categories with a zero count; drop them so
    # that 0 * log2(0) does not turn the whole result into NaN.
    prob = prob[prob > 0]
    # Adding 0.0 normalises the IEEE "-0.0" produced by single-valued input.
    return (-prob * np.log2(prob)).sum() + 0.0


def get_info_gain(data, feat, label):
    """Return the information gain of column ``feat`` with respect to ``label``.

    The gain is ``ent(label) - sum_v P(feat == v) * ent(label | feat == v)``.

    :param data: DataFrame containing both the ``feat`` and ``label`` columns
    :param feat: name of the discrete feature column
    :param label: name of the target column
    :return: entropy of ``label`` minus its conditional entropy given ``feat``
    """
    # Entropy of the label inside each feature-value group. Selecting the
    # label column before ``apply`` keeps the grouping column out of the
    # callback's input, which recent pandas versions warn about.
    cond_ent = data.groupby(feat)[label].apply(ent)
    # Relative frequency of each feature value.
    weights = data[feat].value_counts() / len(data[feat])
    # Both Series are indexed by feature value, so the product aligns on it.
    return ent(data[label]) - (cond_ent * weights).sum()


if __name__ == '__main__':
    data = pd.DataFrame({
        '年龄': ['青年', '青年', '青年', '青年', '青年',
               '中年', '中年', '中年', '中年', '中年',
               '老年', '老年', '老年', '老年', '老年'],
        '有工作': ['否', '否', '是', '是', '否',
                '否', '否', '是', '否', '否',
                '否', '否', '是', '是', '否'],
        '有自己的房子': ['否', '否', '否', '是', '否',
                   '否', '否', '是', '是', '是',
                   '是', '是', '否', '否', '否'],
        '贷款情况': ['一般', '好', '好', '一般', '一般',
                 '一般', '好', '好', '非常好', '非常好',
                 '非常好', '好', '好', '非常好', '一般'],
        '类别': ['否', '否', '是', '是', '否',
               '否', '否', '是', '是', '是',
               '是', '是', '是', '是', '否'],
    })

    print(ent(data['类别']))
    # 0.9709505944546686

    label = '类别'
    for feat in ['年龄', '有工作', '有自己的房子', '贷款情况']:
        print(get_info_gain(data, feat, label))
    # 0.08300749985576883
    # 0.32365019815155627
    # 0.4199730940219749
    # 0.36298956253708536
reference: python详细步骤计算信息增益