文章标题

%%
% Compare five classifiers (logistic regression, SVM, CART, random forest,
% AdaBoost) on the diabetes CSV using a hand-written 4-fold cross-validation.
% Result: acc(k,m) holds the accuracy of method m on fold k; the column means
% are displayed at the end.

% Read the CSV file.
clear
% NOTE(review): csvread offsets are zero-based, so (2,1) skips the first two
% lines and the first column of the file. If the file has a single header
% line this also drops the first data row -- confirm this is intended.
M=csvread('C:\Users\Administrator\Desktop\classification\pima-indians-diabetes-database\diabetes.csv',2,1);

Sfold=4;                 % number of cross-validation folds
acc=zeros(Sfold,5);      % per-fold accuracy, one column per method
nRows=size(M,1);

%% Impute missing values
% Zeros in columns 2..7 of M are treated as missing and replaced by the mean
% of the non-zero entries of the same column. This does not depend on the
% fold, so it is done once, before the cross-validation loop.
% NOTE(review): the original comment mentioned "columns 2 to 6" while the
% loop covered 2:7; the loop range is kept as-is.
% (Alternative that was considered: delete every row containing a zero.)
imputeCols=2:7;
res=zeros(1,numel(imputeCols));   % column means of the non-zero entries
for j=imputeCols
    col=M(:,j);
    isMissing=(col==0);
    res(j-1)=mean(col(~isMissing));
    M(isMissing,j)=res(j-1);
end

%% Cross-validation
% NOTE(review): svmtrain/svmclassify and classregtree are legacy Statistics
% Toolbox APIs; confirm they exist in the installed MATLAB release.
times=0;
start_idx=1;
while times<Sfold        % was `while times`, which is false for times==0
    % Fold k covers rows start_idx..end_idx, i.e. about 1/Sfold of the data.
    end_idx=round((times+1)*nRows/Sfold);

    % Select the fold with a mask instead of deleting rows from M, so that
    % every fold is cut from the same, complete data set.
    isTest=false(nRows,1);
    isTest(start_idx:end_idx)=true;

    test_data=M(isTest,1:7);      % 1/4 of the rows for testing
    test_label=M(isTest,8);
    train_data=M(~isTest,1:7);    % remaining 3/4 for training
    train_label=M(~isTest,8);

    %%% Logistic regression
    factor=glmfit(train_data,train_label,'binomial','link','logit');
    logitFit=round(glmval(factor,test_data,'logit'));  % probability -> 0/1
    accuracy_lr=mean(logitFit==test_label);
    acc(times+1,1)=accuracy_lr;

    %%% SVM
    factor_svm=svmtrain(train_data,train_label);
    svm_label=svmclassify(factor_svm,test_data);
    accuracy_svm=mean(svm_label==test_label);
    acc(times+1,2)=accuracy_svm;

    %%% CART (pruned classification tree)
    factor_cart=classregtree(train_data,train_label,'method','classification');
    pruned_tree=prune(factor_cart,'level',2);
    cart_label=eval(pruned_tree,test_data);
    % Predicted classes are converted from text to numbers before comparing.
    cart_label_result=str2double(cart_label);
    accuracy_cart=mean(cart_label_result==test_label);
    acc(times+1,3)=accuracy_cart;

    %%% Random forest
    factor_rf=TreeBagger(500,train_data,train_label);
    [rf_label,~]=predict(factor_rf,test_data);
    % Predicted classes are converted from text to numbers before comparing.
    rf_label_result=str2double(rf_label);
    accuracy_rf=mean(rf_label_result==test_label);
    acc(times+1,4)=accuracy_rf;

    %%% Boosted trees (AdaBoostM1)
    Ensemble_factor=fitensemble(train_data,train_label,'AdaBoostM1',100,'Tree');
    Ensemble_label=predict(Ensemble_factor,test_data);
    accuracy_Ensemble=mean(Ensemble_label==test_label);
    acc(times+1,5)=accuracy_Ensemble;

    % Advance to the next fold.
    start_idx=end_idx+1;
    times=times+1;
end
mean(acc)  % display the mean accuracy of each method over all folds

你可能感兴趣的:(diabetes)