# ReliefF algorithm
import numpy as np
import pandas as pd
from sklearn import datasets
# Load the Iris dataset and build one matrix: the feature columns followed
# by the target as the last column.
data = datasets.load_iris()
x = data.data
y = data.target
arr = np.column_stack((x, y))
# Number of nearest neighbours taken from each class.
k = 10
def countNeighborDist(randomSample, randomIndex, label, data=None, numNeighbors=None):
    """Return the nearest neighbours of a sample within one class.

    Distances are Euclidean over the feature columns, i.e. every column
    except the last one, which holds the class label.  The row at
    ``randomIndex`` is never returned, so a sample is not picked as its own
    neighbour.

    Args:
        randomSample: 1-D row (features followed by the label) that the
            distances are measured from.
        randomIndex: Row index of ``randomSample`` in the data matrix.
        label: Class label whose rows are searched.
        data: 2-D matrix with the label in the last column.  Defaults to
            the module-level ``arr``.
        numNeighbors: Maximum number of rows to return.  Defaults to the
            module-level ``k``.

    Returns:
        A 2-D array holding the selected rows, nearest first.  It contains
        fewer than ``numNeighbors`` rows when the class does not have that
        many other samples.
    """
    if data is None:
        data = arr
    if numNeighbors is None:
        numNeighbors = k

    # Rows of the requested class, excluding the query row itself.
    candidates = np.where(data[:, -1] == label)[0]
    candidates = candidates[candidates != randomIndex]

    offsets = data[candidates, :-1] - randomSample[:-1]
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))

    # A stable sort resolves equal distances by row order.  Slicing instead
    # of popping a fixed number of minima also avoids failing on classes
    # with too few samples.
    nearest = np.argsort(distances, kind="stable")[:max(numNeighbors, 0)]
    return data[candidates[nearest]]
def getSample():
    """Draw one random row of ``arr`` and gather its neighbours per class.

    Returns:
        A pair ``(randomSample, sort)``: the drawn row, and a dict that maps
        each distinct value of the label column (converted to ``int``) to
        the rows ``countNeighborDist`` returns for that class.
    """
    randomIndex = np.random.randint(arr.shape[0])
    randomSample = arr[randomIndex]
    neighboursByClass = {
        int(cls): countNeighborDist(randomSample, randomIndex, cls)
        for cls in np.unique(arr[:, -1])
    }
    return randomSample, neighboursByClass
def updateEqualLabel(randomSample, KSample, data=None):
    """Return the mean range-normalised feature distance to a set of neighbours.

    For every feature, the absolute difference between ``randomSample`` and
    each neighbour row is divided by that feature's value range
    (max - min over the data matrix) and averaged over the neighbours.

    Args:
        randomSample: 1-D row, features followed by the label.
        KSample: 2-D array of neighbour rows in the same layout.  All rows
            supplied are averaged.
        data: 2-D matrix used to derive each feature's min and max.
            Defaults to the module-level ``arr``.

    Returns:
        A 1-D array with one value per feature.  It is all zeros when
        ``KSample`` has no rows.
    """
    if data is None:
        data = arr

    featNum = len(randomSample) - 1
    neighbours = np.asarray(KSample, dtype=float)
    if neighbours.shape[0] == 0:
        return np.zeros(featNum)

    features = np.asarray(data)[:, :featNum]
    span = np.max(features, axis=0) - np.min(features, axis=0)

    target = np.asarray(randomSample, dtype=float)[:featNum]
    meanGap = np.mean(np.abs(neighbours[:, :featNum] - target), axis=0)

    # A constant feature has zero range and cannot separate samples, so it
    # contributes 0 instead of the NaN that 0/0 would produce.
    return np.divide(meanGap, span, out=np.zeros(featNum), where=span > 0)
def updateNotEqualLabel(randomSample, KSample):
    """Return the prior-weighted feature distance to neighbours of another class.

    The per-feature value from ``updateEqualLabel`` is scaled by
    ``P(C) / (1 - P(class of randomSample))``, where ``C`` is the class of
    the rows in ``KSample`` and both priors are class frequencies in the
    label column of the module-level ``arr``.

    Args:
        randomSample: 1-D row, features followed by the label.
        KSample: 2-D array of neighbour rows; the label of its first row is
            taken as the class of the whole set.

    Returns:
        A 1-D array with one value per feature.
    """
    # Only the label column is compared; matching against feature cells
    # would count values that merely happen to equal a label.
    labels = arr[:, -1]
    samplePrior = np.mean(labels == randomSample[-1])
    missPrior = np.mean(labels == KSample[0, -1])
    scale = missPrior / (1 - samplePrior)

    # updateEqualLabel already averages over the neighbours, so the result
    # is not divided by the neighbour count again.
    diff = updateEqualLabel(randomSample, KSample)
    return diff * scale
def main():
    """Estimate ReliefF feature weights over random draws and print them.

    Each iteration draws a sample with ``getSample``, subtracts the
    same-class distance term and adds the summed other-class terms, each
    divided by the number of iterations.

    Returns:
        The 1-D weight vector, one entry per feature column of ``arr``.
    """
    iter_times = 1000
    featNum = arr.shape[1] - 1
    w = np.zeros(featNum)
    for _ in range(iter_times):
        randomSample, sort = getSample()
        w0 = np.zeros(featNum)
        # The other-class term is rebuilt for every draw; carrying it over
        # would add the contributions of earlier draws again each time.
        w1 = np.zeros(featNum)
        for label, neighbours in sort.items():
            if randomSample[-1] == label:
                w0 = updateEqualLabel(randomSample, neighbours)
            else:
                w1 = w1 + updateNotEqualLabel(randomSample, neighbours)
        w = w - w0 / iter_times + w1 / iter_times
    print('特征权重分布:\n', w)
    return w
"""
Produces the feature weight distribution; features are then selected against a chosen threshold.
"""
# Run the weight estimation only when this file is executed as a script.
if __name__ == "__main__":
    main()