pandas [Python] 实现 Fisher score:衡量特征用于分类的重要性,可用于特征选择

import pandas as pd
import numpy as np


# Layout of the input CSV: columns 0..51 hold the features, column 52 holds
# the integer class label.
FEATURE_COLUMNS = range(52)
LABEL_COLUMN = 52


def fisher_scores(data, feature_cols, label_col):
    """Compute the Fisher score of each feature with respect to a class label.

    For a feature i the score is

        F_i = sum_k n_k * (mu_ik - mu_i)**2  /  sum_k n_k * sigma_ik**2

    where n_k is the number of rows in class k, mu_ik and sigma_ik are the
    mean and standard deviation of feature i inside class k, and mu_i is the
    mean of feature i over all rows. A larger score means the class means
    are spread further apart relative to the within-class spread.

    Args:
        data: DataFrame holding the feature columns and the label column.
        feature_cols: Iterable of column labels to score.
        label_col: Column label of the class label.

    Returns:
        A list of floats, one score per entry of ``feature_cols``, in the
        same order. A score is ``inf`` when the within-class spread is zero
        but the class means differ, and ``nan`` when both numerator and
        denominator are zero or when a class has an undefined standard
        deviation (e.g. a class with a single row, since the sample standard
        deviation with ddof=1 is used).
    """
    grouped = data.groupby(label_col)
    # Class sizes are indexed by the actual label values, so they line up
    # with the per-class statistics below whatever the labels are
    # (0-based, 1-based, non-contiguous, ...).
    class_sizes = grouped.size()

    scores = []
    for col in feature_cols:
        # Per-class mean and sample standard deviation of this feature.
        stats = grouped[col].agg(['mean', 'std'])
        overall_mean = data[col].mean()

        # skipna=False keeps an undefined per-class statistic visible as NaN
        # in the final score instead of silently dropping that class.
        between = (class_sizes * (stats['mean'] - overall_mean) ** 2).sum(
            skipna=False)
        within = (class_sizes * stats['std'] ** 2).sum(skipna=False)

        # A zero within-class spread yields inf/nan by IEEE rules; silence
        # the numpy warning because that outcome is part of the contract.
        with np.errstate(divide='ignore', invalid='ignore'):
            score = np.float64(between) / np.float64(within)
        scores.append(float(score))

    return scores


if __name__ == '__main__':
    data = pd.read_csv('data/data1to21.csv', header=None)

    # Labels are stored as numbers in the CSV; make them proper integers.
    data[LABEL_COLUMN] = data[LABEL_COLUMN].astype(int)

    # Compute the Fisher score of every feature column.
    fisher_score = fisher_scores(data, FEATURE_COLUMNS, LABEL_COLUMN)

    # One score per line, no index column and no header row.
    pd.DataFrame(fisher_score).to_csv('fisher_score.csv', index=False,
                                      header=False)

你可能感兴趣的:(pandas[python]实现fisher score,衡量特征用于分类的重要性,可用于特征选择)