转自课件:
简介:
判别分析(Discriminant Analysis)是用以判别个体所属类别的一种统计方法,它根据已掌握的类别信息,建立判别准则(判别函数),进而判别未知样本所属的类别。判别分析的方法主要有距离判别、Fisher判别、贝叶斯判别、逐步判别。
实例分析:
Fisher于1936年发表的鸢尾花(Iris)数据被广泛地作为判别分析的例子。数据是对3种鸢尾花:刚毛鸢尾花(setosa第1组)、变色鸢尾花(versicolor第2组)和佛吉尼亚鸢尾花(virginica第3组)各抽取一个容量为50的样本,测量其花萼长(sepallen)x1、花萼宽(sepalwid)x2、花瓣长(petallen)x3、花瓣宽(petalwid)x4,单位为mm,分组变量为species。
1.建立数据.
/* Display format that maps the numeric species codes 1-3 to species names. */
proc format;
   value specname
      1 = 'Setosa'
      2 = 'Versicolor'
      3 = 'Virginica'
   ;
run;
/* TITLE is a global statement, so it is set ahead of the step that loads the data. */
title 'Discriminant Analysis of Fisher (1936) Iris Data';

/* Read the iris measurements from in-stream data.                        */
/* Each observation is five numbers: four measurements plus the species   */
/* code; the trailing @@ lets several observations share one input line.  */
data iris;
   input sepallen sepalwid petallen petalwid species @@;
   label
      sepallen = 'Sepal Length in mm.'
      sepalwid = 'Sepal Width in mm.'
      petallen = 'Petal Length in mm.'
      petalwid = 'Petal Width in mm.'
   ;
   /* Show species codes through the specname. format defined by PROC FORMAT. */
   format species specname.;
   datalines;
50 33 14 02 1 64 28 56 22 3 65 28 46 15 2 67 31 56 24 3
63 28 51 15 3 46 34 14 03 1 69 31 51 23 3 62 22 45 15 2
59 32 48 18 2 46 36 10 02 1 61 30 46 14 2 60 27 51 16 2
65 30 52 20 3 56 25 39 11 2 65 30 55 18 3 58 27 51 19 3
68 32 59 23 3 51 33 17 05 1 57 28 45 13 2 62 34 54 23 3
77 38 67 22 3 63 33 47 16 2 67 33 57 25 3 76 30 66 21 3
49 25 45 17 3 55 35 13 02 1 67 30 52 23 3 70 32 47 14 2
64 32 45 15 2 61 28 40 13 2 48 31 16 02 1 59 30 51 18 3
55 24 38 11 2 63 25 50 19 3 64 32 53 23 3 52 34 14 02 1
49 36 14 01 1 54 30 45 15 2 79 38 64 20 3 44 32 13 02 1
67 33 57 21 3 50 35 16 06 1 58 26 40 12 2 44 30 13 02 1
77 28 67 20 3 63 27 49 18 3 47 32 16 02 1 55 26 44 12 2
50 23 33 10 2 72 32 60 18 3 48 30 14 03 1 51 38 16 02 1
61 30 49 18 3 48 34 19 02 1 50 30 16 02 1 50 32 12 02 1
61 26 56 14 3 64 28 56 21 3 43 30 11 01 1 58 40 12 02 1
51 38 19 04 1 67 31 44 14 2 62 28 48 18 3 49 30 14 02 1
51 35 14 02 1 56 30 45 15 2 58 27 41 10 2 50 34 16 04 1
46 32 14 02 1 60 29 45 15 2 57 26 35 10 2 57 44 15 04 1
50 36 14 02 1 77 30 61 23 3 63 34 56 24 3 58 27 51 19 3
57 29 42 13 2 72 30 58 16 3 54 34 15 04 1 52 41 15 01 1
71 30 59 21 3 64 31 55 18 3 60 30 48 18 3 63 29 56 18 3
49 24 33 10 2 56 27 42 13 2 57 30 42 12 2 55 42 14 02 1
49 31 15 02 1 77 26 69 23 3 60 22 50 15 3 54 39 17 04 1
66 29 46 13 2 52 27 39 14 2 60 34 45 16 2 50 34 15 02 1
44 29 14 02 1 50 20 35 10 2 55 24 37 10 2 58 27 39 12 2
47 32 13 02 1 46 31 15 02 1 69 32 57 23 3 62 29 43 13 2
74 28 61 19 3 59 30 42 15 2 51 34 15 02 1 50 35 13 03 1
56 28 49 20 3 60 22 40 10 2 73 29 63 18 3 67 25 58 18 3
49 31 15 01 1 67 31 47 15 2 63 23 44 13 2 54 37 15 02 1
56 30 41 13 2 63 25 49 15 2 61 28 47 12 2 64 29 43 13 2
51 25 30 11 2 57 28 41 13 2 65 30 58 22 3 69 31 54 21 3
54 39 13 04 1 51 35 14 03 1 72 36 61 25 3 65 32 51 20 3
61 29 47 14 2 56 29 36 13 2 69 31 49 15 2 64 27 53 19 3
68 30 55 21 3 55 25 40 13 2 48 34 16 02 1 48 30 14 01 1
45 23 13 03 1 57 25 50 20 3 57 38 17 03 1 51 38 15 03 1
55 23 40 13 2 66 30 44 14 2 68 28 48 14 2 54 34 17 02 1
51 37 15 04 1 52 35 15 02 1 58 28 51 24 3 67 30 50 17 2
63 33 60 25 3 53 37 15 02 1
;
run;
/* List the iris data set to confirm it was read as intended. */
proc print
   data=iris;
run;
2.调用逐步判别stepdisc过程和判别分析discrim过程
/* Stepwise discriminant analysis used to screen the candidate variables.   */
/* The partial R-square and F statistics it reports help choose the best    */
/* variables for building the discriminant function.                        */
/* slentry= / slstay= are the long forms of sle= / sls= (significance       */
/* levels for a variable to enter and to stay in the model).                */
proc stepdisc data=iris
              slentry=0.3
              slstay=0.05
              short;
   class species;
   var sepallen sepalwid petallen petalwid;
run;
/* Discriminant analysis on a single variable.                               */
/* NOTE(review): the original note said the analysis was based on petalwid,  */
/* but the VAR statement names petallen; the comment now follows the code.   */
/* Confirm which variable was intended.                                      */
proc discrim data=iris
             method=normal
             pool=test
             anova
             short
             crosslisterr;
   class species;
   var petallen;
run;
/* Discriminant analysis using all four measurements.                 */
/* The statistics requested through outstat= are saved in plotiris.   */
proc discrim data=iris
             outstat=plotiris
             method=normal
             pool=test
             manova
             listerr
             crosslisterr;
   class species;
   var petallen petalwid sepalwid sepallen;
run;
/* List the plotiris data set written by the outstat= option of PROC DISCRIM. */
proc print
   data=plotiris;
run;
3.利用判别函数来判别未知分类的数据
用已知分类的样本数据iris作为判别标准,来判别不知分类的数据集newiris中的新数据的分类。注意iris数据集应与newiris数据集中具有相同的数据变量名和含义。为简单起见,我们将iris数据集中的已知分类species变量去掉,形成一个不知分类的新数据集newiris。可如下调用程序:
/* Build the "unclassified" test set: a copy of iris without the class variable. */
data newiris;
   set iris;
   drop species;
run;
/* List the unclassified test set.                                         */
/* The original step printed "plotdata", a data set this program never     */
/* creates, so PROC PRINT would fail; the data set built just above is     */
/* newiris.                                                                */
proc print data=newiris;
run;
/* Classify newiris with a normal-theory rule calibrated on iris, using a   */
/* pooled (equal) variance estimate. testout= and testoutd= save the        */
/* results for the test data in plotp1 and plotd1.                          */
proc discrim data=iris
             testdata=newiris
             testout=plotp1
             testoutd=plotd1
             method=normal
             pool=yes
             short
             noclassify
             crosslisterr;
   class species;
   var petallen;
   title2 'Using Normal Density Estimates with Equal Variance';
run;
/* Same normal-theory classification of newiris, but with separate         */
/* (unequal) within-group variances. testout= and testoutd= save the       */
/* results for the test data in plotp2 and plotd2.                         */
proc discrim data=iris
             testdata=newiris
             testout=plotp2
             testoutd=plotd2
             method=normal
             pool=no
             short
             noclassify
             crosslisterr;
   class species;
   var petallen;
   title2 'Using Normal Density Estimates with Unequal Variance';
run;
/* List the test-data results of the PROC DISCRIM step directly above.     */
/* The original steps printed "plotp" and "plotd", which no step creates;  */
/* the PROC DISCRIM calls write plotp1/plotd1 and plotp2/plotd2. The       */
/* plotp2/plotd2 pair belongs to the most recent (unequal-variance) run;   */
/* plotp1/plotd1 can be listed the same way.                               */
proc print data=plotp2;
run;
proc print data=plotd2;
run;
/* Nonparametric classification of newiris: normal kernel, radius r=.4,    */
/* with pool=yes (equal bandwidth). Results for the test data are saved    */
/* in plotp3 and plotd3.                                                   */
proc discrim data=iris
             testdata=newiris
             testout=plotp3
             testoutd=plotd3
             method=npar
             kernel=normal
             r=.4
             pool=yes
             short
             noclassify
             crosslisterr;
   class species;
   var petallen;
   title2 'Using Kernel Density Estimates with Equal Bandwidth';
run;
/* Nonparametric classification of newiris: normal kernel, radius r=.4,    */
/* with pool=no (unequal bandwidth). Results for the test data are saved   */
/* in plotp4 and plotd4.                                                   */
proc discrim data=iris
             testdata=newiris
             testout=plotp4
             testoutd=plotd4
             method=npar
             kernel=normal
             r=.4
             pool=no
             short
             noclassify
             crosslisterr;
   class species;
   var petallen;
   title2 'Using Kernel Density Estimates with Unequal Bandwidth';
run;
4.调用典型判别CANDISC过程,输出典型变量散布图。
/* Canonical discriminant analysis on the four measurements.               */
/* out=outcan saves the output data set that is printed and plotted later  */
/* (the plot uses its can1 and can2 variables).                            */
proc candisc data=iris
             out=outcan
             distance
             anova;
   class species;
   var sepallen sepalwid petallen petalwid;
run;
/* List the outcan data set produced by PROC CANDISC. */
proc print
   data=outcan;
run;
/* One-character format for species codes, used as the plotting symbol. */
proc format;
   value specfmt
      1 = '+'
      2 = 'c'
      3 = '*'
   ;
run;
/* Scatter plot of the second canonical variable against the first.        */
/* Each point is drawn with the species symbol supplied by specfmt.        */
proc plot data=outcan
          formchar='|----|---'
          hpct=80
          vpct=50;
   format species specfmt.;
   title2 'Plot of Canonical Variables';
   plot can2*can1=species;
run;