本文为原创文章,转载请注明出处。
在实际数据挖掘建模过程中,如果已有的变量与目标变量相关性不强,可以采用多种方法衍生变量。本代码提供一种暴力衍生的方式:对相关性最强的变量进行随机抽取,可按多种方式进行衍生,并选择与目标变量相关度最高的前n个变量加入训练集,并映射到测试集。
代码在 SAS 9.4 M2 环境下测试可用。
/*---------------------------------------------------------------------------
  Setup and variable sampling.
    1. Copy the training table into WORK.CREDIT_MODEL.
    2. List every numeric column except the target and the id in &var_list.
    3. Run PROC CORR (Spearman, RANK option) of those columns against the
       target and reshape the one-row ODS result into WORK.CORR, one row
       per variable, in the order PROC CORR returned them.
    4. Draw &n_pairs random pairs of variables into WORK.SAMPLE (two
       consecutive rows per pair) to be crossed later.
---------------------------------------------------------------------------*/
%let lib=weier;      /* libref that holds train_temp_2                     */
%let target=y;       /* target variable                                    */
%let id=uid;         /* record identifier                                  */
%let n_pairs=5000;   /* number of random variable pairs to draw            */

data credit_model;
  set &lib..train_temp_2;
run;

/* Numeric candidate variables. The column metadata keeps each name in the
   case it was created with, while SAS itself treats names as
   case-insensitive, so the exclusions compare upper-cased values. */
%let var_list=;
proc sql noprint;
  select name into :var_list separated by ' '
  from dictionary.columns
  where libname='WORK' and memname='CREDIT_MODEL' and type='num'
    and upcase(name) ne upcase("&target")
    and upcase(name) ne upcase("&id");
quit;

/* Spearman correlation of every candidate with the target. */
ods listing close;
ods output spearmancorr=spearman;
proc corr data=credit_model spearman rank;
  var &var_list;
  with &target;
run;
ods listing;

/* Transpose the Best1..BestN / R1..RN / P1..PN columns into rows. */
data corr(keep=variable scorr spvalue ranksp);
  length variable $ 32;   /* SAS names can be up to 32 characters long */
  set spearman;
  array best(*) best:;
  array r(*) r:;
  array p(*) p:;
  do i=1 to dim(best);
    variable=best(i);   /* variable name                      */
    scorr=r(i);         /* correlation coefficient            */
    spvalue=p(i);       /* p-value                            */
    ranksp=i;           /* position in the PROC CORR ranking  */
    output;
  end;
run;

/* Pool of variable names to sample from. */
data var_im(keep=variable);
  set corr;
run;

/* Each replicate is one pair: two distinct names drawn without replacement. */
proc surveyselect data=var_im noprint
  seed=2018202
  out=sample
  method=srs
  n=2
  reps=&n_pairs;
run;
%macro creatvar_2(var_number,out,max_first=100);
  /*-------------------------------------------------------------------------
    Build a table of variable pairs from the first &var_number rows of
    WORK.VAR_IM (i.e. the top-n variables in that table's order).

    Parameters
      var_number  number of leading VAR_IM rows to combine
      out         output data set; columns: var1, var2 (names) and no
                  (running pair number)
      max_first   cap on the index of the first member of a pair. The
                  default of 100 keeps the generated code small for trial
                  runs; pass max_first=&var_number (or larger) for all pairs.

    NOTE(review): the layout written here (one row per pair, columns
    var1/var2) is not the layout %cross_var reads from WORK.SAMPLE (a single
    "variable" column, two rows per pair) - confirm before enabling the call.
  -------------------------------------------------------------------------*/
  %local i j last_first;

  /* var_1 .. var_N receive the variable names. */
  proc sql noprint;
    select variable into :var_1 - :var_&var_number
    from var_im;
  quit;

  /* Never reference more names than VAR_IM actually supplied. */
  %let var_number=%sysfunc(min(&var_number,&sqlobs));
  %let last_first=%sysfunc(min(&max_first,%eval(&var_number-1)));

  data &out;
    length var1 var2 $40;
    no=0;
    /* All pairs i<j: i stops one short of the end, j runs to the end. */
    %do i=1 %to &last_first;
      %do j=%eval(&i+1) %to &var_number;
        no=no+1;
        var1="&&var_&i";
        var2="&&var_&j";
        output;
      %end;
    %end;
    stop;   /* no implicit output, so zero pairs yields an empty table */
  run;
%mend;
/* Disabled: deterministic top-n pairing instead of the random sample. */
/*%creatvar_2(30,sample);*/

/* Numeric columns of WORK.CREDIT_MODEL except the target and the id.
   The column metadata keeps each name in the case it was created with,
   while SAS treats names as case-insensitive, so compare upper-cased. */
%let var_list=;
proc sql noprint;
  select name into :var_list separated by ' '
  from dictionary.columns
  where libname='WORK' and memname='CREDIT_MODEL' and type='num'
    and upcase(name) ne upcase("&target")
    and upcase(name) ne upcase("&id");
quit;
%macro cross_var(methold,dsout,top_n=500);
  /*-------------------------------------------------------------------------
    Brute-force crossing of the variable pairs listed in WORK.SAMPLE.

    Parameters
      methold  1 = sum of reciprocals, 2 = sum of squares, 3 = product
      dsout    name of the output data set (written to WORK and to &lib)
      top_n    how many of the best-ranked derived variables to keep
               (default 500; %cross_test uses the same cutoff)

    Reads   WORK.SAMPLE (column "variable", two consecutive rows per pair),
            WORK.CREDIT_MODEL, and the global macro variables &var_list,
            &target, &id and &lib.
    Writes  WORK.CORR (ranking of the derived variables), WORK.&dsout and
            &lib..&dsout (id, target, kept derived variables, then the
            original variables in &var_list).
  -------------------------------------------------------------------------*/
  %local i obs vars var_corr;

  %if &methold ne 1 and &methold ne 2 and &methold ne 3 %then %do;
    %put ERROR: cross_var: methold must be 1, 2 or 3 (got &methold).;
    %return;
  %end;

  /* Odd rows hold the first member of a pair, even rows the second.
     int() discards a trailing unpaired row. */
  %let obs=0;
  data _null_;
    set sample nobs=nobs;
    if mod(_n_,2)=1 then
      call symputx(cats("var1_",ceil(_n_/2)),compress(variable),'L');
    else
      call symputx(cats("var2_",ceil(_n_/2)),compress(variable),'L');
    if _n_=1 then call symputx("obs",int(nobs/2),'L');
  run;

  %if &obs=0 %then %do;
    %put ERROR: cross_var: WORK.SAMPLE holds no variable pairs.;
    %return;
  %end;

  /* One derived column per pair, named <var1>_<var2>; the original inputs
     are dropped so that only derived columns get ranked below. */
  data &dsout(drop=&var_list);
    set credit_model;
    %do i=1 %to &obs;
      %let vars=%sysfunc(cat(&&var1_&i.,_,&&var2_&i));
      %if %length(&vars) > 32 %then %do;
        /* A name longer than 32 characters is not a valid SAS name. */
        %put WARNING: cross_var: skipping &vars (name exceeds 32 characters).;
      %end;
      %else %if &methold=1 %then %do;
        &vars=1/(&&var1_&i)+1/(&&var2_&i);
      %end;
      %else %if &methold=2 %then %do;
        &vars=(&&var1_&i)**2+(&&var2_&i)**2;
      %end;
      %else %do;
        &vars=(&&var1_&i)*(&&var2_&i);
      %end;
    %end;
  run;

  /* Names of the derived columns (case-insensitive exclusion of target/id). */
  proc sql noprint;
    select name into :var_corr separated by ' '
    from dictionary.columns
    where libname='WORK' and memname=upcase("&dsout") and type='num'
      and upcase(name) ne upcase("&target")
      and upcase(name) ne upcase("&id");
  quit;

  /* Spearman correlation of every derived column with the target. */
  ods listing close;
  ods output spearmancorr=spearman;
  proc corr data=&dsout spearman rank;
    var &var_corr;
    with &target;
  run;
  ods listing;

  /* Transpose Best1..BestN / R1..RN / P1..PN into one row per variable. */
  data corr(keep=variable scorr spvalue ranksp);
    length variable $ 32;   /* SAS names can be up to 32 characters long */
    set spearman;
    array best(*) best:;
    array r(*) r:;
    array p(*) p:;
    do i=1 to dim(best);
      variable=best(i);
      scorr=r(i);
      spvalue=p(i);
      ranksp=i;
      output;
    end;
  run;

  /* Keep the &top_n best-ranked derived variables. */
  proc sql noprint;
    select variable into :var_corr separated by ' '
    from corr
    where ranksp<=&top_n;
  quit;

  /* One-to-one merge (no BY): both inputs carry the rows of CREDIT_MODEL
     in the same order. */
  data &lib..&dsout &dsout;
    merge &dsout(keep=&id &target &var_corr) credit_model(keep=&var_list);
  run;
%mend;
/* Build the training features with method 2 (sum of squares of each pair);
   the result is written to WORK.train_model_cross and &lib..train_model_cross. */
%cross_var(2,train_model_cross);
%macro cross_test(methold,dsout,top_n=500);
  /*-------------------------------------------------------------------------
    Re-create the selected derived variables on the test set.

    Parameters
      methold  1 = sum of reciprocals, 2 = sum of squares, 3 = product;
               must match the method used for the training set
      dsout    name of the output data set (written to WORK and to &lib)
      top_n    ranking cutoff applied to WORK.CORR (default 500)

    Reads   WORK.CORR (derived-variable ranking left by %cross_var),
            WORK.SAMPLE (the variable pairs, two consecutive rows each) and
            &lib..test_temp_2.
    Writes  WORK.CORR_NEW, WORK.&dsout and &lib..&dsout.

    The two source variables of each derived column are looked up in
    WORK.SAMPLE rather than parsed out of the derived name, so input
    variables may have any valid name.
  -------------------------------------------------------------------------*/
  %local i obs vars n_wanted;

  %if &methold ne 1 and &methold ne 2 and &methold ne 3 %then %do;
    %put ERROR: cross_test: methold must be 1, 2 or 3 (got &methold).;
    %return;
  %end;

  /* Rebuild the (var1, var2, derived name) list from the sampled pairs. */
  data _cross_pairs(keep=var1 var2 derived pair_no);
    length var1 var2 $32 derived $65;
    retain var1;
    set sample(keep=variable);
    if mod(_n_,2)=1 then var1=compress(variable);
    else do;
      var2=compress(variable);
      derived=upcase(cats(var1,'_',var2));
      pair_no=_n_/2;
      output;
    end;
  run;

  proc sql noprint;
    /* When several pairs produce the same derived name, keep the last one:
       in the training step the last assignment is the one that survives. */
    create table corr_new as
    select c.*, p.var1, p.var2
    from corr as c
         inner join
         (select * from _cross_pairs
          group by derived
          having pair_no=max(pair_no)) as p
      on upcase(c.variable)=p.derived
    where c.ranksp<=&top_n
    order by c.ranksp;

    select count(*) into :n_wanted
    from corr
    where ranksp<=&top_n;

    drop table _cross_pairs;
  quit;
  %let n_wanted=&n_wanted;   /* strip the padding added by INTO */

  /* Source variables of each selected derived column -> macro variables. */
  %let obs=0;
  data _null_;
    set corr_new nobs=nobs;
    call symputx(cats("var1_",_n_),var1,'L');
    call symputx(cats("var2_",_n_),var2,'L');
    if _n_=1 then call symputx("obs",nobs,'L');
  run;

  %if &obs ne &n_wanted %then
    %put WARNING: cross_test: &obs of &n_wanted selected variables could be matched to a pair in WORK.SAMPLE.;

  data &lib..&dsout &dsout;
    set &lib..test_temp_2;
    %do i=1 %to &obs;
      %let vars=%sysfunc(cat(&&var1_&i.,_,&&var2_&i));
      %if &methold=1 %then %do;
        &vars=1/(&&var1_&i)+1/(&&var2_&i);
      %end;
      %else %if &methold=2 %then %do;
        &vars=(&&var1_&i)**2+(&&var2_&i)**2;
      %end;
      %else %do;
        &vars=(&&var1_&i)*(&&var2_&i);
      %end;
    %end;
  run;
%mend;
/* Apply method 2 (sum of squares) to &lib..test_temp_2; the result is
   written to WORK.test_model_cross and &lib..test_model_cross. */
%cross_test(2,test_model_cross);
如有任何问题,可通过 QQ 与作者联系:646509377。