重复数据的相关问题

临床试验中通常会遇到这样的情况:一个病床号对应了不同的患者或者一个患者对应了不同的病床号。这种情况可以归类为重复数据问题。下面有几种方法来学习重复数据的检查与相关处理。

1.利用data步来实现

proc sort data=clean.patients out=tmp;
   by Patno;
run;

data dup;
   set tmp;
   by Patno;
   if first.Patno and last.Patno then delete;
run;

2.利用proc freq实现

proc freq data=clean.patients noprint;
   tables Patno / out=dup_no(keep=Patno Count
                             where=(Count gt 1));
run;
proc sort data=clean.patients out=tmp;
   by Patno;
run;
proc sort data=dup_no;
   by Patno;
run;
data dup;
   merge tmp dup_no(in=Yes_dup drop=Count);
   by Patno;
   if Yes_dup;
run;

3.利用proc sql语句处理 

proc freq data=clean.patients noprint;
   tables Patno / out=dup_no(keep=Patno Count
                             where=(Count gt 1));
run;
proc sql noprint;
   select quote(Patno)
      into :Dup_list separated by " "
      from dup_no;
quit;
title "Duplicates selected using SQL and a macro variable";
proc print data=clean.patients;
   where Patno in (&Dup_list);
run;

下面数据来源于《cody's data cleaning techniques using sas》一书

建立样本数据

data clean.patients;
   input @1  Patno    $3. @4  gender   $1.
         @5  Visit    mmddyy10.
         @15 HR       3.
         @18 SBP      3.
         @21 DBP      3.
         @24 Dx       $3.
         @27 AE       $1.;
   LABEL Patno   = "Patient Number"
         Gender  = "Gender"
         Visit   = "Visit Date"
         HR      = "Heart Rate"
         SBP     = "Systolic Blood Pressure"
         DBP     = "Diastolic Blood Pressure"
         Dx      = "Diagnosis Code"
         AE      = "Adverse Event?";
    format visit mmddyy10.;
datalines;
001M11/11/1998 88140 80  10
002F11/13/1998 84120 78  X0
003X10/21/1998 68190100  31
004F01/01/1999101200120  5A
XX5M05/07/1998 68120 80  10
006 06/15/1999 72102 68  61
007M08/32/1998 88148102   0
   M11/11/1998 90190100   0
008F08/08/1998210        70
009M09/25/1999 86240180  41
010f10/19/1999    40120  10
011M13/13/1998 68300 20  41
012M10/12/98   60122 74   0
013208/23/1999 74108 64  1
014M02/02/1999 22130 90   1
002F11/13/1998 84120 78  X0
003M11/12/1999 58112 74   0
015F           82148 88  31
017F04/05/1999208    84  20
019M06/07/1999 58118 70   0
123M15/12/1999 60        10
321F          900400200  51
020F99/99/9999 10 20  8   0
022M10/10/1999 48114 82  21
023f12/31/1998 22 34 78   0
024F11/09/199876 120 80  10
025M01/01/1999 74102 68  51
027FNOTAVAIL  NA 166106  70
028F03/28/1998 66150 90  30
029M05/15/1998           41
006F07/07/1999 82148 84  10
;
run;

你可能感兴趣的:(重复数据的相关问题)