IML 编程的基本函数(矩阵运算,数据管理)

本文内容主要参考《Statistical Programming with SAS/IML Software》、iml帮助文档

SAS/IML 是 SAS 中基于矩阵语言的模块,用矩阵做统计计算非常方便,典型例子是多元线性模型的求解。矩阵的行代表观测,列代表变量;矩阵既可以是数值型,也可以是字符型。下面介绍 IML 的一些基本函数,当然只是其中的一小部分。

矩阵运算函数

proc iml; /* start a SAS/IML session */
  x=1;
  y={1 2 3};
  z={1 2 3,
     3 5 6};
  o={'abc' 'opq',
     'lmn' 'rst'}; 
  n_z=nrow(z);/* matrix dimensions: NROW() and NCOL() return the number of rows and columns */
  p_o=ncol(o);
  type_o=type(o);/* TYPE() reports the matrix type: 'C' for character, 'N' for numeric ('U' when the matrix has no value) */
  nlen=nleng(o);/* NLENG() returns the storage length of one element, not the number of strings */
  len=length(o); /* LENGTH() returns a matrix holding the length of every string in o */
  print x,y,z,o n_z; /* display the matrices; a comma starts a new line of output */
quit;

 

利用函数创建矩阵

proc iml;
   c=j(10,1,3.14); /* J() builds a constant matrix: 10 rows, 1 column, every element 3.14 */
   r=j(1,5);    /* 1 row, 5 columns; the fill value defaults to 1 */
   m=j(10,5,0); /* 10 x 5 matrix of zeros */
   miss=j(3,2,.); /* 3 x 2 matrix of missing values */
   g=repeat({0 1},3,2);/* REPEAT() tiles {0 1}: 3 copies down, 2 copies across */
   j=1:5;/* the index operator ":" produces 1 through 5 with an increment of 1 */
   k=do(1,10,2);/* DO() produces an evenly spaced sequence: from 1 towards 10 in steps of 2 */
   seed=j(10,1,1);
   need=uniform(seed);/* pseudo-random values, one per element of seed; every seed element is 1 */
   /* when many pseudo-random numbers are needed, prefer the RANDGEN routine */
   call randseed(123);/* seed for RANDGEN */
   x=j(10,1); /* x is a 10 x 1 column vector */
   e=x;       /* copy x into e so that both have the same shape */
   call randgen(x,"Uniform"); /* fill x with random values */
   call randgen(e,"Normal");
   y=3*x+2+e;                 /* simulated response of a regression model */
print c,r,m,miss,need,y;

quit;
proc iml;
   /* Transposing, reshaping and subscripting a 4 x 3 matrix. */
   s={1 2 3,4 5 6,7 8 9,10 11 12};
   transpose=t(s);  /* T() returns the transpose */
   s2=shape(s,6);   /* SHAPE() re-blocks the 12 elements into 6 rows */
   s3=s[3,2];       /* a single element is addressed by its row and column */
   s4=s[1,2];
   s5=s[s4,s4];     /* an extracted value can itself be used as a subscript */
   /* VECDIAG() is documented for square matrices and returns the main diagonal
      as a column vector (not a diagonal matrix); s is 4 x 3, so take the
      diagonal of its leading 3 x 3 block. */
   s6=vecdiag(s[1:3,1:3]);
print transpose,s2,s3,s4,s5,s6 ;
quit;


IML 中用于数据管理的语句

APPEND               adds observations to the end of a SAS data set
CLOSE                closes a SAS data set
CREATE               creates and opens a new SAS data set for input and output
DELETE               marks observations for deletion in a SAS data set
EDIT                 opens an existing SAS data set for input and output
FIND                 finds observations
INDEX                indexes variables in a SAS data set
LIST                 lists observations
PURGE                purges all deleted observations from a SAS data set
READ                 reads observations into IML variables
REPLACE              writes observations back into a SAS data set
RESET DEFLIB         names default libname
SAVE                 saves changes and reopens a SAS data set
SETIN                selects an open SAS data set for input
SETOUT               selects an open SAS data set for output
SHOW CONTENTS        shows contents of the current input SAS data set
SHOW DATASETS        shows SAS data sets currently open
SORT                 sorts a SAS data set
SUMMARY              produces summary statistics for numeric variables
USE                  opens an existing SAS data set for input

简单例子:

data class;/* copy SASHELP.CLASS into a data set named CLASS; a one-level name normally resolves to the WORK library */
   set sashelp.class;
run;

/* Data-management statements in action.
   Requires a data set named CLASS (a copy of SASHELP.CLASS) in the WORK library.
   The statements only work inside a SAS/IML session, hence the PROC IML / QUIT
   wrapper. */
proc iml;
   reset deflib=work;               /* one-level data set names resolve to WORK */

   use class;                       /* open CLASS for input */
   show datasets contents;          /* open data sets, then the variables of the current one */

   m={3 6 9 13};
   n={name sex age weight};
   list point m var n;              /* a range is either POINT or ALL, never both */
   list all var n where(sex='F');

   read all var {name height weight};   /* one column vector per variable */
   htwt=height/weight;                  /* ratio that is written to RATIO2 further down */
   read all var _num_ into x;           /* every numeric variable, as one matrix */
   read all var _num_ into female where(sex="F");
   print x,female;

   close class;
   edit class;                          /* reopen for update */
   find all where(name='Henry') into d; /* names in SASHELP.CLASS are mixed case */
   if ncol(d)>0 then delete point d;    /* nothing to delete when no observation matched */
   delete all where(age>12);

   /* The original CREATE CLASS2 FROM CLASS named a matrix that does not exist,
      so the loop below reads the open CLASS data set directly. */
   setin class point 0;                 /* rewind: the first READ NEXT fetches observation 1 */
   sum=0;
   do data;                             /* iterate until the end of the data set */
      read next var {weight};           /* WEIGHT is now a scalar: the current observation */
      sum=sum+weight;
   end;
   print sum;

   create ratio2 var {name htwt};       /* new data set built from the two vectors */
   append;
   show contents;
   close ratio2;

   setin class;                         /* make CLASS the current input data set again */
   summary var {height weight} class {sex} stat {mean std} opt {save};

   index sex;                           /* index on the current input data set */
   setout class;
   purge;                               /* physically remove the observations marked for deletion */
   close class;

   sort class out=sorted by name;       /* sort only after the data set has been closed */
quit;


简单的逻辑编程

/* IF-THEN/ELSE: p is 1 only when the largest element of a is below 20. */
proc iml;
   a={17 22,13 10};
   if max(a)<20 then p=1;
   else p=0;
print a,p;
quit;
/* Iterative DO with BY: adds up i = 1, 3, 5, 7, 9. */
proc iml;
   y=0;
   do i=1 to 10 by 2;
      y=y+i;
   end;
print y;
quit;
/* DO WHILE: the condition is tested before each pass, so the loop stops once count reaches 5. */
proc iml;
   count=1;
   do while (count<5);
      count=count+1;
   end;
print count;
quit;
/* DO UNTIL: the condition is tested after each pass, so the loop stops once count exceeds 5. */
proc iml;
   count=1;
   do until(count>5);
      count=count+1;
 end;
print count;
quit;
/* module without arguments, all symbols are global. */
proc iml;
   a = 10; /* a is global */
   b = 20; /* b is global */
   c = 30; /* c is global */
   start Mod1; /* begin module */
      p = a+b; /* p is global: it is still visible after the module returns */
      c = 40; /* c already global: the session-level value is overwritten */
   finish; /* end module */
   run Mod1; /* run the module */
print a b c p;
quit; /* end the session, as every other PROC IML step in this file does */

描述性统计量计算

Sums:
VEC_SUM: The sum of the elements of the first row of the matrix X
COL_TOT: The column totals (sum of the elements in each column of the matrix X)
MAT_SUM: The sum of the elements of the matrix X
SUBMAT: A submatrix including the first two columns and first two rows of the matrix X
SM_SUM: The sum of the elements of a submatrix including the first two rows and the first two columns of the matrix X

PROC IML;
   x={1 2 3,4 5 6,7 8 9};
   vec_sum=x[1,+];       /* "+" in the column position sums across row 1 */
   col_tot=x[+,];        /* "+" in the row position sums down every column */
   mat_sum=SUM(x);       /* sum of all elements */
   submat=x[1:2,1:2];    /* rows 1-2 and columns 1-2 */
   sm_sum=SUM(submat);
   PRINT vec_sum col_tot mat_sum sm_sum;
QUIT;

Means:
VEC_MEAN: The mean of the elements of the first row of the matrix X
MAT_MEAN: The mean of the elements of the matrix X
SM_MEAN: The mean of the elements of a submatrix including the first two rows and the first two columns of the matrix X

PROC IML;
   x={1 2 3,4 5 6,7 8 9};
   vec_mean=x[1,:];      /* ":" is the mean reduction operator: mean of row 1 */
   mat_mean=x[:];        /* mean of all elements */
   submat=x[1:2,1:2];    /* rows 1-2 and columns 1-2 */
   sm_mean=submat[:];
   PRINT vec_mean mat_mean sm_mean;
QUIT;

Variances:
VEC_VAR: The variance of the elements of the first row of the matrix X
MAT_VAR: The variance of the elements of the matrix X
SM_VAR: The variance of the elements of a submatrix including the first two rows and the first two columns of the matrix X

PROC IML;
   x={1 2 3,4 5 6,7 8 9};

   /* Sample variance of row 1: (sum of squares - sum^2/n) / (n-1). */
   ssq=SSQ(x[1,]);
   sum=x[1,+];
   vec_var=(ssq-sum*sum/NCOL(x))/(NCOL(x)-1);

   /* Sample variance of all elements of x.
      jn is a matrix of ones with the same shape as x. J(NROW(x)) alone would
      give an nrow x nrow matrix, which conforms with x only when x is square. */
   jn=J(NROW(x),NCOL(x));
   mat_mean=x[:];
   mat_var=SSQ(x-mat_mean*jn)/(NROW(x)*NCOL(x)-1);

   /* Sample variance of the leading 2 x 2 submatrix. */
   submat=x[1:2,1:2];
   ssq_sub=SSQ(submat);
   sum_sub=SUM(submat);
   sm_var=(ssq_sub - sum_sub*sum_sub/(NCOL(submat)*NROW(submat))) /
           (NCOL(submat)*NROW(submat)-1);
   PRINT vec_var mat_var sm_var;
QUIT;

Other basic statistics:

PROC IML;
   /* 30 scores stored as a column vector (elements are separated by commas) */
   scores={90,62,66,68,70,72,73,74,78,78,78,79,80,81,82,
           82,82,84,84,85,85,85,85,87,88,89,89,89,89,61};
   PRINT scores;
   count=NROW(scores);     /* number of scores */
   sum=SUM(scores);
   mean=sum/count; /* =scores[:] */
   median=MEDIAN(scores);
/** Frequency Distribution **/
START freq(x);
   /* Print a two-column table: every distinct value of x next to the number
      of elements of x equal to it. */
   vals=T(UNION(x));                 /* distinct values as a column vector */
   nVals=NROW(vals);
   counts=J(nVals,1);
   DO k=1 TO nVals;
      counts[k]=SUM(x=vals[k]);      /* the comparison yields 0/1 flags; their sum is the count */
   END;
   freq=vals||counts;                /* keep the name: PRINT shows it with the table */
   headers={" SCORE" "FREQUENCY"};   /* column headings */
   PRINT "Frequency Distribution" freq [COLNAME=headers];
FINISH;
RUN freq(scores);
/** End Frequency Distribution**/

/** Start Mode **/
START mode(x);
   /* Return the most frequent value(s) of x as a column vector;
      tied values are all returned. */
   vals=T(UNION(x));                 /* distinct values as a column vector */
   nVals=NROW(vals);
   counts=J(nVals,1);
   DO k=1 TO nVals;
      counts[k]=SUM(x=vals[k]);      /* occurrences of the k-th distinct value */
   END;
   top=MAX(counts);
   RETURN(vals[LOC(counts=top),1]);
FINISH;
mode=MODE(scores);   /* most frequent score(s), via the MODE module defined above */
/** End Mode **/

/* sample variance from the sum of squares: (ssq - sum^2/n) / (n-1) */
ssq=SSQ(scores);
variance=(ssq-sum*sum/count)/(count-1);
stdev=SQRT(variance);
quartiles=QUARTILE(scores);

/** Start Percentile **/
START percentile(x,pct);
   /* Return the pct-th percentile (0-100) of the column vector x.
      The position is pct/100*(n+1) in the sorted data, with linear
      interpolation between neighbours (SAS PCTLDEF=4).
      IML passes arguments by reference, so the work is done on local copies:
      neither the caller's data nor the caller's pct is modified. */
   sorted=x;
   CALL SORT(sorted,1);
   n=NROW(sorted);
   pos=pct/100*(n+1);
   lo=FLOOR(pos);
   hi=CEIL(pos);
   IF lo<1 THEN RETURN(sorted[1]);   /* position before the first value: use the minimum */
   IF hi>n THEN RETURN(sorted[n]);   /* position after the last value: use the maximum */
   IF lo=hi THEN RETURN(sorted[lo]); /* position falls exactly on an observation */
   RETURN(sorted[lo]+(pos-lo)*(sorted[hi]-sorted[lo]));
FINISH;
percentile=PERCENTILE(scores,75);   /* 75th percentile, via the PERCENTILE module defined above */
/** End Percentile **/


CALL SORT(scores,1); /* sorts the scores in ascending order */
quantile=scores[CEIL(count*.45)];   /* sorted score at position ceil(0.45*count) */
/* a comma in PRINT starts a new line of output */
PRINT count sum mean median mode,
      variance stdev quartiles, "75th Percentile:"
      percentile, "45th Quantile:" quantile;
QUIT;


IML简单运用,相关系数的计算

ods trace output; /* NOTE(review): presumably meant to enable ODS tracing - confirm that OUTPUT is a valid option here */
PROC IML;
/* Module to compute correlations */
/* Takes no arguments, so every symbol is global: it reads x and nm from the
   session and leaves n, sum, xpx, s and corr behind. */
start corr;
   n = nrow(x); /* number of observations */
   sum = x[+,] ; /* compute column sums */
   xpx = t(x)*x-t(sum)*sum/n; /* compute sscp matrix */
   s = diag(1/sqrt(vecdiag(xpx))); /* scaling matrix */
   corr = s*xpx*s; /* correlation matrix */
   print "Correlation Matrix",,corr[rowname=nm colname=nm] ;
finish corr;
/* Module to standardize data */
/* Takes no arguments, so every symbol is global: it reads x and nm from the
   session and overwrites x with the standardized values. */
start std;
   n = nrow(x); /* number of observations; set here so the module does not rely on another module having defined n */
   mean = x[+,] /n; /* means for columns */
   x = x-repeat(mean,n,1); /* center x to mean zero */
   ss = x[##,] ; /* sum of squares for columns */
   std = sqrt(ss/(n-1)); /* standard deviation estimate*/
   x = x*diag(1/std); /* scaling to std dev 1 */
   print ,"Standardized Data",,X[colname=nm] ;
finish std;
/* Sample run */
x = { 1 2 3,3 2 1,4 2 1,0 4 1,24 1 0,1 3 8}; /* 6 observations, 3 variables */
nm={age weight height};                       /* labels used by both modules when printing */
run corr;
run std;
quit; /* end the session, as every other PROC IML step in this file does */


 

你可能感兴趣的:(编程,Module,Class,input,Matrix,statistics)