# ReliefF algorithm
#
# Feature weighting with ReliefF, demonstrated on the iris dataset.

import numpy as np
import pandas as pd
from sklearn import datasets


# Load the iris dataset and build one table whose last column is the class label.
data = datasets.load_iris()
x = data.data
y = data.target
arr = np.column_stack((x, y))
# Number of nearest neighbours taken from each class for every sampled row.
k = 10


# Fetch the nearest neighbours of a sample within one class.
def countNeighborDist(randomSample, randomIndex, label, data=None, n_neighbors=None):
    """Return the rows of class ``label`` that are closest to ``randomSample``.

    Distances are Euclidean and computed over every column except the last,
    which holds the class label. The row at ``randomIndex`` is never returned,
    so a sample is not counted as its own neighbour.

    Args:
        randomSample: 1-D row (features followed by the label) to search around.
        randomIndex: Row index of ``randomSample`` inside ``data``.
        label: Class label whose rows are searched.
        data: 2-D table with the label in the last column. Defaults to the
            module-level ``arr``.
        n_neighbors: Number of rows to return. Defaults to the module-level ``k``.

    Returns:
        A new array with ``n_neighbors`` rows of ``data``, nearest first.

    Raises:
        ValueError: If ``n_neighbors`` is negative or larger than the number
            of available rows of class ``label``.
    """
    if data is None:
        data = arr
    if n_neighbors is None:
        n_neighbors = k

    candidates = np.where(data[:, -1] == label)[0]
    # The query row must not be reported as its own neighbour.
    candidates = candidates[candidates != randomIndex]
    if not 0 <= n_neighbors <= len(candidates):
        raise ValueError(
            "cannot pick %d neighbours from %d candidate rows of class %r"
            % (n_neighbors, len(candidates), label)
        )

    offsets = data[candidates, :-1] - randomSample[:-1]
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))
    # A stable sort keeps the lower row index first when distances tie, which
    # is the order a repeated "take the minimum, then remove it" scan yields.
    nearest = candidates[np.argsort(distances, kind="stable")[:n_neighbors]]
    return data[nearest]


# Draw one random row and collect its nearest neighbours from every class.
def getSample():
    """Pick a random row of ``arr`` and gather its neighbours class by class.

    Returns:
        A pair ``(row, neighbours)``: the randomly chosen row, and a dict that
        maps each class label (as ``int``) to the rows ``countNeighborDist``
        returned for that class.
    """
    picked = np.random.randint(arr.shape[0])
    anchor = arr[picked]
    per_class = {
        int(cls): countNeighborDist(anchor, picked, cls)
        for cls in np.unique(arr[:, -1])
    }
    return anchor, per_class


# Per-feature difference between a sample and a set of neighbour rows.
def updateEqualLabel(randomSample, KSample, data=None):
    """Return the mean range-normalised difference per feature.

    For each feature, the absolute difference between ``randomSample`` and
    each neighbour row is divided by that feature's (max - min) range over
    ``data`` and then averaged over the neighbour rows.

    Args:
        randomSample: 1-D row whose last entry is the class label.
        KSample: 2-D array of neighbour rows laid out like ``randomSample``.
        data: 2-D table used for the per-feature min/max range. Defaults to
            the module-level ``arr``.

    Returns:
        1-D array with one value per feature (all columns of ``randomSample``
        except the last).

    Raises:
        ValueError: If ``KSample`` has no rows.
    """
    if data is None:
        data = arr
    featNum = len(randomSample) - 1

    neighbors = np.asarray(KSample, dtype=float)[:, :featNum]
    if neighbors.shape[0] == 0:
        raise ValueError("KSample must contain at least one neighbour row")
    sample = np.asarray(randomSample, dtype=float)[:featNum]

    columns = np.asarray(data, dtype=float)[:, :featNum]
    span = columns.max(axis=0) - columns.min(axis=0)
    # A feature with zero range cannot separate samples; report a difference
    # of 0 for it instead of dividing by zero.
    constant = span == 0
    safe_span = np.where(constant, 1.0, span)

    scaled = np.abs(neighbors - sample) / safe_span
    scaled[:, constant] = 0.0
    # Average over the rows actually supplied rather than a global count.
    return scaled.sum(axis=0) / neighbors.shape[0]


# Per-feature difference between a sample and neighbours from another class.
def updateNotEqualLabel(randomSample, KSample):
    """Return the prior-weighted per-feature difference to another class.

    The averaged difference from ``updateEqualLabel`` is scaled by
    ``P(C) / (1 - P(class of randomSample))``, where ``C`` is the class of the
    rows in ``KSample`` and both priors are class frequencies in ``arr``.

    Args:
        randomSample: 1-D row whose last entry is the class label.
        KSample: 2-D array of neighbour rows; the class is read from the
            label of its first row.

    Returns:
        1-D array with one weighted difference per feature.
    """
    # Class frequencies must be counted on the label column (last column of
    # every row), not on a row slice of the whole table.
    labels = arr[:, -1]
    P0 = np.mean(labels == randomSample[-1])
    P1 = np.mean(labels == KSample[0, -1])
    P = P1 / (1 - P0)
    # updateEqualLabel already averages over the neighbours, so no further
    # division by k is applied here.
    diff = updateEqualLabel(randomSample, KSample)
    return diff * P


def main(iter_times=1000):
    """Estimate ReliefF feature weights on ``arr`` and print them.

    Each round draws one random row, subtracts its averaged difference to
    same-class neighbours and adds the prior-weighted differences to the
    neighbours of every other class, each scaled by ``1 / iter_times``.

    Args:
        iter_times: Number of random rows to sample.
    """
    featNum = arr.shape[1] - 1
    w = np.zeros(featNum)
    for _ in range(iter_times):
        randomSample, sort = getSample()
        # Both terms belong to this round only; start them fresh so that
        # earlier rounds are not added to the weights again.
        w0 = np.zeros(featNum)
        w1 = np.zeros(featNum)
        for label, neighbors in sort.items():
            if randomSample[-1] == label:
                w0 = updateEqualLabel(randomSample, neighbors)
            else:
                w1 = w1 + updateNotEqualLabel(randomSample, neighbors)
        w = w - w0 / iter_times + w1 / iter_times
    print('特征权重分布:\n', w)
"""
得出特征的权值分布,根据设定阈值选择特征
"""
# Run the ReliefF demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

# You may also be interested in: (feature, selection)