% 数据预处理---空值(补全或删除)与异常值(剔除)(基于MATLAB)

% clc
% clear all
% 
% Import the raw data from the spreadsheet into the numeric matrix Data.
% NOTE(review): the range 'D:M' spans 10 spreadsheet columns, but later
% code in this script indexes Data up to column 29 - confirm the range.
Data = xlsread('F:\实验数据\数据汇总表111','D:M');  

% Figure 1: bar chart of the first 15 values of Data, labelled z1..z15.
% NOTE(review): Data(1:15) is linear (column-major) indexing; this presumably
% expects Data to be a vector of per-variable missing-value ratios - confirm.
figure(1)
% Terminated with a semicolon so the label list is not echoed to the console.
x3str={'z1','z2','z3','z4','z5','z6','z7','z8','z9','z10','z11','z12','z13','z14','z15'};
bar(Data(1:15))
xtextp=1:15;                                   % x-position of each label (same as the bar positions)
ytextp=-0.1*0.18*ones(1,length(xtextp));       % labels sit slightly below the x-axis
text(xtextp-0.3,ytextp,x3str)                  % shift left by 0.3 to roughly centre the text under each bar
set(gca,'XTickLabel',[]);                      % remove the default tick labels (1,2,3,...)
title('变量z1-z15空值百分比图','fontsize',12)
ylim([0,0.85])

% Figure 2: bar chart of values 16..29 of Data, labelled z16..z27, y1, y2.
% NOTE(review): Data(16:29) is linear (column-major) indexing; this presumably
% expects Data to be a vector of per-variable missing-value ratios - confirm.
figure(2)
x3str={'z16','z17','z18','z19','z20','z21','z22','z23','z24','z25','z26','z27','y1','y2'};
bar(Data(16:29))
xtextp=1:14;                                   % x-position of each label (same as the bar positions)
ytextp=-0.1*0.04*ones(1,length(xtextp));       % labels sit slightly below the x-axis
text(xtextp-0.3,ytextp,x3str)                  % shift left by 0.3 to roughly centre the text under each bar
set(gca,'XTickLabel',[]);                      % remove the default tick labels (1,2,3,...)
% Title corrected: this chart starts at z16 (the original said z11).
title('变量z16-z27、y1、y2空值百分比图','fontsize',12)


% 
% [M N] = size(Data);   %矩阵B的行数与列数
% % 缺失值补全,
% % 补全依据是缺失的值认为与它上一行的数据一样,即每个时间段测一次
% % 未测的时间段认定与它最近的上次测试结果一致.
% for i = 2:M
%     for j = 1:N
%         
%         AA = isnan(Data);
%         
%         if AA(i,j) == 1
%             
%             Data(i,j) = Data(i-1,j);
%             
%         else
%             
%             Data(i,j) = Data(i,j);
%             
%         end
%     end
% end
% 
% 

% % 删除数据中的空值
% [U V] = size(Data);
% BB = isnan(Data);
% [u,v] = find(BB()==1);
% Data(u,:) = [];

% Box plots of every variable, used to inspect the spread and spot outliers
% visually. boxplot comes from the Statistics and Machine Learning Toolbox.
% The plots use figures 3-5 so they do not overwrite the bar charts that
% this script draws in figures 1 and 2.
% Group names are passed through 'Labels' so boxplot places them itself
% instead of having the tick labels overridden afterwards.
figure(3);
boxplot(Data(:,1:10), 'Labels', ...
    {'z1','z2','z3','z4','z5','z6','z7','z8','z9','z10'})
title('变量z1-z10箱体图','fontsize',12)

figure(4);
boxplot(Data(:,11:27), 'Labels', ...
    {'z11','z12','z13','z14','z15','z16','z17','z18','z19', ...
     'z20','z21','z22','z23','z24','z25','z26','z27'})
title('变量z11-z27箱体图','fontsize',12)

figure(5);
boxplot(Data(:,28:29), 'Labels', {'y1','y2'})
title('变量y1,y2箱体图','fontsize',12)

%% Remove outliers with Chauvenet's criterion (equal confidence probability)
% For each column, a sample is flagged when |x - mean| > w * std, where
% w = 1 + 0.4*ln(count) is the approximate Chauvenet coefficient. Every row
% that contains at least one flagged sample is then deleted from Data.
%
% Missing values (NaN) are excluded from the statistics: a single NaN would
% otherwise make mean/std NaN, every comparison false, and silently disable
% outlier detection for that column. For columns without NaN the result is
% identical to using all rows.
[m, n] = size(Data);
Y = false(m, n);                    % Y(r,c) is true when Data(r,c) is an outlier

for i = 1:n
    x  = Data(:, i);
    xv = x(~isnan(x));              % actual measurements in this column
    w  = 1 + 0.4*log(numel(xv));    % Chauvenet coefficient (approximate formula)
    % NaN entries of x compare false, so missing values are never flagged.
    Y(:, i) = abs(x - mean(xv)) > w*std(xv);
end

[u, v] = find(Y);      % row and column of every flagged sample
uu = unique(u);        % each affected row once
Data(uu, :) = [];      % drop the rows that contain an outlier

% 你可能感兴趣的:(机器学习)