Python 特征选择:多类别标签的 Fisher score 计算

"""
特征选择中多类别标签的Fisher score计算
注意:.mat 数据集必须满足以下格式:
  - 特征集为 n*m 的矩阵(n 为样本数,m 为特征数)
  - 标签集为 n*1 的列向量
如果用 MATLAB 打开后发现数据不满足以上格式,可以先对其进行转置。
如果数据是 csv 文件,也可以先转换成上述格式再使用。
"""


import pandas as pd
from collections import defaultdict
import numpy as np
import scipy.io as sio


def F_S(sample, label, loop):
    """Rank features by multi-class Fisher score and return the best ones.

    For every feature ``j`` the score is ``S_B(j) / S_W(j)`` with::

        S_B(j) = sum_k (n_k / n) * (mean_kj - mean_j) ** 2
        S_W(j) = sum_k sum_{i in class k} (x_ij - mean_kj) ** 2 / n

    where ``n`` is the number of samples, ``n_k`` the number of samples in
    class ``k``, ``mean_kj`` the mean of feature ``j`` inside class ``k`` and
    ``mean_j`` the mean of feature ``j`` over all samples.

    Args:
        sample: Feature matrix, array-like of shape (n, m); one row per
            sample, one column per feature. A 1-D input is treated as a
            single feature.
        label: Class labels, array-like with n entries (an (n, 1) column
            vector or a flat sequence both work). Any number of distinct
            classes is supported.
        loop: How many top-ranked features to return.

    Returns:
        numpy.ndarray of feature column indices, ordered from the highest
        Fisher score to the lowest and truncated to at most ``loop`` entries.
        Features whose within-class scatter is zero get a NaN score and are
        therefore ranked after every feature with a finite score.

    Raises:
        ValueError: If ``sample`` is empty or its row count does not match
            the number of labels.
    """
    features = np.asarray(sample, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    labels = np.asarray(label).ravel()

    n_samples, n_features = features.shape
    if n_samples == 0:
        raise ValueError("sample must contain at least one row")
    if labels.shape[0] != n_samples:
        raise ValueError(
            "sample has %d rows but label has %d entries"
            % (n_samples, labels.shape[0])
        )

    # inverse[i] is the position of sample i's class in the sorted unique
    # labels; counts[k] is the size of class k.
    _, inverse, counts = np.unique(
        labels, return_inverse=True, return_counts=True
    )
    inverse = np.asarray(inverse).ravel()

    overall_mean = features.mean(axis=0)
    # Scatter is accumulated per feature (one slot per column), so a
    # feature's score never depends on the columns that precede it.
    between = np.zeros(n_features)
    within = np.zeros(n_features)
    for class_pos, class_size in enumerate(counts):
        rows = features[inverse == class_pos]
        class_mean = rows.mean(axis=0)
        # Each class is weighted by its own size, not by a shared constant.
        between += class_size / n_samples * (class_mean - overall_mean) ** 2
        within += ((rows - class_mean) ** 2).sum(axis=0) / n_samples

    # Zero within-class scatter makes the ratio undefined; keep it as NaN.
    scores = np.full(n_features, np.nan)
    defined = within != 0
    scores[defined] = between[defined] / within[defined]

    # Descending order; NaN sorts last, and a stable sort keeps ties in
    # column order so the result is deterministic.
    ranking = np.argsort(-scores, kind="stable")
    return ranking[:loop]  # indices of the top-`loop` features


# Demo run: read a MATLAB .mat file and print the indices of its
# highest-ranked features.
file = 'Data\\U_Sonar.mat'
m = sio.loadmat(file)
# The file is expected to hold the feature matrix under 'X' and the
# labels under 'Y'.
sample, label = m['X'], m['Y']
a = F_S(sample, label, 50)  # request up to 50 feature indices
print(a)

网上关于 Fisher score 的文章大多只针对二分类标签,针对多类别标签的较少。我在 GitHub 上看到了一位大佬写的二分类标签 Fisher score 实现,于是在其基础上做了一些改动,使其支持多类别标签。本人研一小菜鸟一枚,算法尚未进行优化,实现也比较简单,希望大佬们批评指正~~

你可能感兴趣的:(python,机器学习,数据挖掘)