特征选择之互信息 Mutual Information

%%%   dataset中的最后一列为分类类别,k为要选择的特征个数

function result=MutualInformation(dataset,k)
%MUTUALINFORMATION Rank features by mutual information with the class label.
%
%   result = MutualInformation(dataset, k)
%
%   dataset : numeric matrix with one sample per row. The last column is
%             the class label; every other column is a discrete feature.
%   k       : number of features to select. Values larger than the number
%             of feature columns are clamped to that number.
%
%   result  : k-by-2 matrix. Column 1 is the feature's column index in
%             dataset, column 2 is I(f;Y) = H(Y) - H(Y|f) in nats. Rows are
%             sorted by mutual information in descending order, so the
%             first rows are the most informative features.

feature_count = size(dataset,2) - 1;
N = size(dataset,1);

% Nothing to rank: no samples or no feature columns.
if N == 0 || feature_count < 1
    result = zeros(0,2);
    return;
end

labels = dataset(:,feature_count+1);
classes = unique(labels);
class_total = length(classes);

% H(Y) = -sum_c (n_c/N)*log(n_c/N) = log(N) - sum_c n_c*log(n_c)/N
class_counts = zeros(class_total,1);
for c = 1:class_total
    class_counts(c) = nnz(labels == classes(c));
end
H_Y = log(N) - sum(class_counts .* log(class_counts)) / N;

character_order = zeros(feature_count,2);
for i = 1:feature_count
    feature = dataset(:,i);
    feature_values = unique(feature);

    % H(Y|f) = sum_j sum_c (o_jc/N) * log(o_j/o_jc)
    H_Y_f = 0;
    for j = 1:length(feature_values)
        has_value = (feature == feature_values(j));
        o_j = nnz(has_value);
        labels_for_value = labels(has_value);
        for m = 1:class_total
            o_jk = nnz(labels_for_value == classes(m));
            % An empty cell contributes 0 (limit of x*log(1/x) as x -> 0);
            % evaluating it directly would give 0*Inf = NaN.
            if o_jk > 0
                H_Y_f = H_Y_f + o_jk * log(o_j / o_jk) / N;
            end
        end
    end

    character_order(i,:) = [i, H_Y - H_Y_f];
end

% Highest mutual information first, then keep the top k.
character_order = sortrows(character_order, -2);
k = max(0, min(k, feature_count));
result = character_order(1:k,:);
end


你可能感兴趣的:(数据挖掘)