临床试验中通常会遇到这样的情况:一个病床号对应了不同的患者或者一个患者对应了不同的病床号。这种情况可以归类为重复数据问题。下面有几种方法来学习重复数据的检查与相关处理。
1.利用data步来实现
proc sort data=clean.patients out=tmp;
by Patno;
run;
data dup;
set tmp;
by Patno;
if first.Patno and last.Patno then delete;
run;
2.利用proc freq实现
proc freq data=clean.patients noprint;
tables Patno / out=dup_no(keep=Patno Count
where=(Count gt 1));
run;
proc sort data=clean.patients out=tmp;
by Patno;
run;
proc sort data=dup_no;
by Patno;
run;
data dup;
merge tmp dup_no(in=Yes_dup drop=Count);
by Patno;
if Yes_dup;
run;
3.利用proc sql语句处理
proc freq data=clean.patients noprint;
tables Patno / out=dup_no(keep=Patno Count
where=(Count gt 1));
run;
proc sql noprint;
select quote(Patno)
into :Dup_list separated by " "
from dup_no;
quit;
title "Duplicates selected using SQL and a macro variable";
proc print data=clean.patients;
where Patno in (&Dup_list);
run;
附
下面数据来源于《cody's data cleaning techniques using sas》一书
建立样本数据:
data clean.patients;
input @1 Patno $3. @4 gender $1.
@5 Visit mmddyy10.
@15 HR 3.
@18 SBP 3.
@21 DBP 3.
@24 Dx $3.
@27 AE $1.;
LABEL Patno = "Patient Number"
Gender = "Gender"
Visit = "Visit Date"
HR = "Heart Rate"
SBP = "Systolic Blood Pressure"
DBP = "Diastolic Blood Pressure"
Dx = "Diagnosis Code"
AE = "Adverse Event?";
format visit mmddyy10.;
datalines;
001M11/11/1998 88140 80 10
002F11/13/1998 84120 78 X0
003X10/21/1998 68190100 31
004F01/01/1999101200120 5A
XX5M05/07/1998 68120 80 10
006 06/15/1999 72102 68 61
007M08/32/1998 88148102 0
M11/11/1998 90190100 0
008F08/08/1998210 70
009M09/25/1999 86240180 41
010f10/19/1999 40120 10
011M13/13/1998 68300 20 41
012M10/12/98 60122 74 0
013208/23/1999 74108 64 1
014M02/02/1999 22130 90 1
002F11/13/1998 84120 78 X0
003M11/12/1999 58112 74 0
015F 82148 88 31
017F04/05/1999208 84 20
019M06/07/1999 58118 70 0
123M15/12/1999 60 10
321F 900400200 51
020F99/99/9999 10 20 8 0
022M10/10/1999 48114 82 21
023f12/31/1998 22 34 78 0
024F11/09/199876 120 80 10
025M01/01/1999 74102 68 51
027FNOTAVAIL NA 166106 70
028F03/28/1998 66150 90 30
029M05/15/1998 41
006F07/07/1999 82148 84 10
;
run;