KDD Cup 1998年竞赛的目标是估计一个直邮的回复量,以便获得最多的捐款。数据集的格式是以逗号作为分隔符,其中学习数据集”cup98lrn.txt”包含了95412条记录,481个字段,验证数据集“cup98val.txt”包含了96367条记录,479个字段。每条记录都包含一个CONTROLN字段,该字段是记录的唯一标识符;有两个目标变量TARGET_B和TARGET_D,TARGET_B是一个二进制变量,表示当一条记录中的TARGET_D变量中有捐款时,该条记录是否对邮件做了回复。学习数据集和验证数据集的数据格式相同,但是在验证数据集中没有包含TARGET_B和TARGET_D这两个变量。
#1)加载并查看数据集
cup98 <- read.csv("F:\\R\\Rworkspace\\cup98lrn/cup98lrn.txt")
dim(cup98)
## [1] 95412 481
str(cup98[, 1:10])
## 'data.frame': 95412 obs. of 10 variables:
## $ ODATEDW : int 8901 9401 9001 8701 8601 9401 8701 9401 8801 9401 ...
## $ OSOURCE : Factor w/ 896 levels " ","AAA","AAD",..: 343 122 50 128 1 220 255 613 487 549 ...
## $ TCODE : int 0 1 1 0 0 0 0 0 1 1 ...
## $ STATE : Factor w/ 57 levels "AA","AE","AK",..: 20 9 33 9 14 4 21 24 18 48 ...
## $ ZIP : Factor w/ 19938 levels "00801","00802",..: 9940 16858 336 18629 2937 3841 5897 12146 7439 4251 ...
## $ MAILCODE: Factor w/ 2 levels " ","B": 1 1 1 1 1 1 1 1 1 1 ...
## $ PVASTATE: Factor w/ 3 levels " ","E","P": 1 1 1 1 1 1 1 1 1 1 ...
## $ DOB : int 3712 5202 0 2801 2001 0 6001 0 0 3211 ...
## $ NOEXCH : Factor w/ 4 levels " ","0","1","X": 2 2 2 2 2 2 2 2 2 2 ...
## $ RECINHSE: Factor w/ 2 levels " ","X": 1 1 1 1 2 1 1 1 1 1 ...
summary(cup98[1:3, 1:20])
## ODATEDW OSOURCE TCODE STATE ZIP
## Min. :8901 AMH :1 Min. :0.0000 CA :1 27017 :1
## 1st Qu.:8951 BOA :1 1st Qu.:0.5000 IL :1 61081 :1
## Median :9001 GRI :1 Median :1.0000 NC :1 91326 :1
## Mean :9101 :0 Mean :0.6667 AA :0 00801 :0
## 3rd Qu.:9201 AAA :0 3rd Qu.:1.0000 AE :0 00802 :0
## Max. :9401 AAD :0 Max. :1.0000 AK :0 00820 :0
## (Other):0 (Other):0 (Other):0
## MAILCODE PVASTATE DOB NOEXCH RECINHSE RECP3 RECPGVG RECSWEEP
## :3 :3 Min. : 0 :0 :3 :3 :3 :3
## B:0 E:0 1st Qu.:1856 0:3 X:0 X:0 X:0 X:0
## P:0 Median :3712 1:0
## Mean :2971 X:0
## 3rd Qu.:4457
## Max. :5202
##
## MDMAUD DOMAIN CLUSTER AGE AGEFLAG HOMEOWNR
## XXXX :3 R2 :1 Min. :14.0 Min. :46.0 :2 :1
## C1CM :0 S1 :1 1st Qu.:25.0 1st Qu.:49.5 E:1 H:1
## C1LM :0 T2 :1 Median :36.0 Median :53.0 I:0 U:1
## C1MM :0 :0 Mean :31.0 Mean :53.0
## C2CM :0 C1 :0 3rd Qu.:39.5 3rd Qu.:56.5
## C2LM :0 C2 :0 Max. :43.0 Max. :60.0
## (Other):0 (Other):0 NA's :1
## CHILD03
## :3
## B:0
## F:0
## M:0
##
##
##
head(cup98[, 1:10])
## ODATEDW OSOURCE TCODE STATE ZIP MAILCODE PVASTATE DOB NOEXCH RECINHSE
## 1 8901 GRI 0 IL 61081 3712 0
## 2 9401 BOA 1 CA 91326 5202 0
## 3 9001 AMH 1 NC 27017 0 0
## 4 8701 BRY 0 CA 95953 2801 0
## 5 8601 0 FL 33176 2001 0 X
## 6 9401 CWR 0 AL 35603 0 0
#从上可知:共计95412条数据,481个属性;有些数据存在缺失值,既有因子类型,也有数字类型
#2)查看目标变量TARGET_B的分布情况,并画饼图
str(cup98[, c("TARGET_B", "TARGET_D")])
## 'data.frame': 95412 obs. of 2 variables:
## $ TARGET_B: int 0 0 0 0 0 0 0 0 0 0 ...
## $ TARGET_D: num 0 0 0 0 0 0 0 0 0 0 ...
#TARGET_B为整数类型,TARGET_D为数字类型
unique(cup98$TARGET_B)
## [1] 0 1
unique(cup98$TARGET_D)
## [1] 0.00 4.00 7.00 5.00 13.00 10.00 25.00 8.00 20.00 16.00
## [11] 26.00 15.00 3.00 60.00 23.00 6.00 11.00 18.00 16.87 2.50
## [21] 50.00 10.70 35.00 14.00 21.00 17.00 100.00 2.00 12.00 19.00
## [31] 40.00 38.00 45.00 30.00 12.50 7.50 1.00 9.00 36.00 22.00
## [41] 24.00 41.00 51.00 28.00 32.00 43.00 31.00 33.00 75.00 47.00
## [51] 27.00 37.00 42.00 18.25 34.00 13.92 29.00 200.00 46.00 44.00
## [61] 53.00 5.25 95.00 17.50 48.00 101.00 150.00 4.50 55.00 102.00
## [71] 44.21
#查看TARGET_B属性0/1的比例
(response.percentage <- round(100*prop.table(table(cup98$TARGET_B)), digits = 1))
##
## 0 1
## 94.9 5.1
#根据0/1的百分比生成饼图的标签
(mylabels <- paste("TARGET_B=", names(response.percentage), "\n", response.percentage, sep = ""))
## [1] "TARGET_B=0\n94.9" "TARGET_B=1\n5.1"
#画出TARGET_B的0/1分布的饼图
pie(response.percentage, labels = mylabels)
#3)查看目标变量TARGET_D的分布情况
cup98pos <- cup98[cup98$TARGET_D>0, ]
dim(cup98pos)
## [1] 4843 481
targetPos <- cup98pos$TARGET_D
summary(targetPos)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 10.00 13.00 15.62 20.00 200.00
#捐款的数额最小为1美元,最大为200美元
boxplot(targetPos)
#4)查看捐款数额大于0并且不是所有的捐款都是整数美元的记录,并将非整数美元的捐款兑换为整数美元
#捐款的总人数
length(targetPos)
## [1] 4843
#捐款不是整数美元的人数
sum(!(targetPos %in% 1:200))
## [1] 21
targetPos[!(targetPos %in% 1:200)]
## [1] 16.87 2.50 10.70 12.50 7.50 12.50 12.50 18.25 13.92 12.50 12.50
## [12] 5.25 12.50 12.50 12.50 17.50 12.50 12.50 4.50 44.21 12.50
targetPos <- round(targetPos)
barplot(table(targetPos), las=2)
#从上图可知:大部分客户的捐款数额不超过25美元,但都是5的倍数。
#5)对TARGET_D进行分解
cup98$TARGET_D2 <- cut(cup98$TARGET_D, right = F, breaks=c(0, 0.1, 10, 15, 20, 25, 30, 50, max(cup98$TARGET_D)))
#各个区间所占捐款的比例
table(cup98$TARGET_D2)
##
## [0,0.1) [0.1,10) [10,15) [15,20) [20,25) [25,30) [30,50) [50,200)
## 90569 1132 1378 806 745 435 233 110
cup98pos$TARGET_D2 <- cut(cup98pos$TARGET_D, right=F, breaks=c(0, 0.1, 10, 15, 20, 25, 30, 50, max(cup98pos$TARGET_D)))
table(cup98pos$TARGET_D2)
##
## [0,0.1) [0.1,10) [10,15) [15,20) [20,25) [25,30) [30,50) [50,200)
## 0 1132 1378 806 745 435 233 110
#6)变量选择:RFA_2R可以删除,都为“L”,NOEXCH字段99.7%的值为“0”,也可以删除
table(cup98$RFA_2R)
##
## L
## 95412
round(100*prop.table(table(cup98$NOEXCH)), digits = 3)
##
## 0 1 X
## 0.007 99.657 0.299 0.037
#图片:根据业务筛选变量
varSet <- c(
#demographics
"ODATEDW", "OSOURCE", "STATE", "ZIP", "PVASTATE", "DOB", "RECINHSE", "MDMAUD",
"DOMAIN", "CLUSTER", "AGE", "HOMEOWNR", "CHILD03", "CHILD07", "CHILD12", "CHILD18",
"NUMCHLD", "INCOME", "GENDER", "WEALTH1", "HIT",
#donor interests
"COLLECT1", "VETERANS", "BIBLE", "CATLG", "HOMEE", "PETS", "CDPLAY", "STEREO",
"PCOWNERS", "PHOTO", "CRAFTS", "FISHER", "GARDENIN", "BOATS", "WALKER", "KIDSTUFF",
"CARDS", "PLATES", "PEPSTRFL",
#summary variables of promotion history
"CARDPROM", "MAXADATE", "NUMPROM", "CARDPM12", "NUMPRM12",
#summary variables of giving history
"RAMNTALL", "NGIFTALL", "CARDGIFT", "MINRAMNT", "MAXRAMNT", "LASTGIFT", "LASTDATE",
"FISTDATE", "TIMELAG", "AVGGIFT",
#ID & targets
"CONTROLN", "TARGET_B", "TARGET_D", "TARGET_D2", "HPHONE_D",
#RFA
"RFA_2F", "RFA_2A", "MDMAUD_R", "MDMAUD_F", "MDMAUD_A",
#OTHERS
"CLUSTER2", "GEOCODE2")
cup98 <- cup98[, varSet]
dim(cup98)
## [1] 95412 67
数据探索性分析需要遵循3个步骤:第一步,要查看单个变量的分布情况,这样做事为了了解每一个变量值的分布情况并找出缺失值和离群点,以便确定变量是否需要进行转换或者是否应该用于建模。第二步,要查看因变量与自变量之间的关系,这可以用于特征选择。第三步,查看自变量之间的关系,以便删除冗余变量。
#1)查看邮购反馈HIT的分布
idx.num <- which(sapply(cup98, is.numeric))
#设置画板:一行两列,即一页画板2张图,使用完之后还原layout(matrix(1))
layout(matrix(c(1, 2), 1, 2))
myHist <- function(x) {
hist(cup98[, x], main=NULL, xlab=x)
}
sapply(names(idx.num[4:5]), myHist)
## AGE NUMCHLD
## breaks Numeric,21 Numeric,13
## counts Integer,20 Integer,12
## density Numeric,20 Numeric,12
## mids Numeric,20 Numeric,12
## xname "cup98[, x]" "cup98[, x]"
## equidist TRUE TRUE
layout(matrix(1))
layout(matrix(c(1, 2), 1, 2))
boxplot(cup98$HIT)
cup98$HIT[cup98$HIT > 200]
## [1] 240 240 240 240 240 240 240 240 240 240 240 240 240 240 240 240 240
## [18] 240 240 240 240 240 240 240 241 240 240 240 240 241 240 240 240 240
## [35] 240 240 240 240 240 240 240 240 240 240 240 240 240 240 240 240 240
## [52] 240 241 241 240 240 240 240 240 240 240 240 240 240 240 240 240 240
## [69] 240 240 240 240 240 240
boxplot(cup98$HIT[cup98$HIT < 200])
layout(matrix(1))
#从上面作图可知:有些值为240或241并与大部分的HIT值远远分开,①与该领域专家研究确定;②可能是离群点,在建模时删除这些值;③数据重构不需要删除离群点,使用记录中Hit的均值或中位数进行替换。
#2)查看捐赠者不同年龄段的分布
AGE2 <- cut(cup98pos$AGE, right = F, breaks=seq(0, 100, by=5))
boxplot(cup98pos$TARGET_D~AGE2, ylim=c(0, 40), las=3)
#从上图可知:30-60岁的人群平均捐赠金额比其它年龄段的高,因为30-60岁年龄段的人群是主要劳动力。
#3)查看捐赠者在不同性别上的分布情况:共同账号“J”的捐赠者数量少于性别为男性“M”或女性“F”的捐赠者
attach(cup98pos)
layout(matrix(c(1, 2), 1, 2))
boxplot(TARGET_D~GENDER, ylim=c(0, 80))
plot(density(TARGET_D[GENDER=="F"]), xlim=c(0, 60), col=1, lty=1)
lines(density(TARGET_D[GENDER=="M"]), col=2, lty=2)
lines(density(TARGET_D[GENDER=="J"]), col=3, lty=3)
legend("topright", c("Female", "Male", "Joint account"), col=1:3, lty=1:3)
layout(matrix(1))
detach(cup98pos)
#4)查看目标变量与其他数值型变量之间的相关系数
correlation <- cor(cup98$TARGET_D, cup98[, idx.num], use = "pairwise.complete.obs")
correlation <- abs(correlation)
(correlation <- correlation[, order(correlation, decreasing = T)])
## TARGET_D TARGET_B LASTGIFT RAMNTALL AVGGIFT
## 1.0000000000 0.7742323755 0.0616784458 0.0448101061 0.0442990841
## MAXRAMNT INCOME CLUSTER2 NUMPRM12 WEALTH1
## 0.0392237509 0.0320627023 0.0290870830 0.0251337775 0.0248673117
## MINRAMNT LASTDATE NUMPROM CLUSTER CARDPM12
## 0.0201578686 0.0188471021 0.0173371740 0.0171274879 0.0163577542
## NUMCHLD CONTROLN CARDPROM FISTDATE ODATEDW
## 0.0149204899 0.0133664439 0.0113023931 0.0075324932 0.0069484311
## HIT CARDGIFT NGIFTALL MAXADATE TIMELAG
## 0.0066483728 0.0064498822 0.0048990126 0.0044963520 0.0036115917
## DOB HPHONE_D AGE RFA_2F
## 0.0027541472 0.0024315898 0.0022823598 0.0009047682
#查看任意两个数值型变量之间的相关系数,并绘制散布图
cor(cup98[, idx.num])
## ODATEDW DOB CLUSTER AGE NUMCHLD INCOME WEALTH1
## ODATEDW 1.000000000 0.0994502795 NA NA NA NA NA
## DOB 0.099450280 1.0000000000 NA NA NA NA NA
## CLUSTER NA NA 1 NA NA NA NA
## AGE NA NA NA 1 NA NA NA
## NUMCHLD NA NA NA NA 1 NA NA
## INCOME NA NA NA NA NA 1 NA
## WEALTH1 NA NA NA NA NA NA 1
## HIT -0.070752160 0.0234507927 NA NA NA NA NA
## CARDPROM -0.919799306 -0.0764315011 NA NA NA NA NA
## MAXADATE -0.010789690 0.0677392489 NA NA NA NA NA
## NUMPROM -0.869340411 -0.0895733975 NA NA NA NA NA
## CARDPM12 -0.149368049 0.0209680497 NA NA NA NA NA
## NUMPRM12 -0.159956207 -0.0490645616 NA NA NA NA NA
## RAMNTALL -0.467448826 -0.0684069242 NA NA NA NA NA
## NGIFTALL -0.718927940 -0.1210833285 NA NA NA NA NA
## CARDGIFT -0.747006191 -0.1184121815 NA NA NA NA NA
## MINRAMNT 0.406741498 0.0280761398 NA NA NA NA NA
## MAXRAMNT 0.004886311 0.0000974447 NA NA NA NA NA
## LASTGIFT 0.083575102 0.0206641991 NA NA NA NA NA
## LASTDATE 0.026900121 -0.0407820080 NA NA NA NA NA
## FISTDATE 0.976866221 0.0984428534 NA NA NA NA NA
## TIMELAG NA NA NA NA NA NA NA
## AVGGIFT 0.216869458 0.0208277233 NA NA NA NA NA
## CONTROLN 0.079036076 -0.0236482005 NA NA NA NA NA
## TARGET_B -0.032269845 -0.0017033496 NA NA NA NA NA
## TARGET_D -0.006948431 0.0027541472 NA NA NA NA NA
## HPHONE_D -0.073434849 0.0769163569 NA NA NA NA NA
## RFA_2F -0.065081811 -0.0485171328 NA NA NA NA NA
## CLUSTER2 NA NA NA NA NA NA NA
## HIT CARDPROM MAXADATE NUMPROM CARDPM12
## ODATEDW -0.0707521600 -0.919799306 -1.078969e-02 -0.86934041 -0.149368049
## DOB 0.0234507927 -0.076431501 6.773925e-02 -0.08957340 0.020968050
## CLUSTER NA NA NA NA NA
## AGE NA NA NA NA NA
## NUMCHLD NA NA NA NA NA
## INCOME NA NA NA NA NA
## WEALTH1 NA NA NA NA NA
## HIT 1.0000000000 0.070475532 1.871469e-02 0.07410355 0.037822596
## CARDPROM 0.0704755316 1.000000000 5.523412e-02 0.94905182 0.406925723
## MAXADATE 0.0187146891 0.055234121 1.000000e+00 0.06254818 0.167730985
## NUMPROM 0.0741035452 0.949051819 6.254818e-02 1.00000000 0.403413622
## CARDPM12 0.0378225963 0.406925723 1.677310e-01 0.40341362 1.000000000
## NUMPRM12 0.0457923224 0.321836443 1.131148e-01 0.51244581 0.613360777
## RAMNTALL 0.0339408708 0.550445111 -9.386919e-04 0.62450193 0.245872119
## NGIFTALL 0.0435449508 0.775785976 5.713401e-05 0.79450398 0.241082428
## CARDGIFT 0.0445941817 0.779219371 -8.561992e-03 0.74847932 0.185616938
## MINRAMNT -0.0295728954 -0.415771224 -4.010021e-03 -0.39084897 -0.155123166
## MAXRAMNT -0.0023220327 0.022911177 1.632801e-03 0.06639027 0.057910365
## LASTGIFT -0.0072132620 -0.059093683 3.207576e-03 -0.02428896 0.030766973
## LASTDATE 0.0191769202 -0.007465905 -1.718122e-03 0.05663170 0.145944155
## FISTDATE -0.0708111145 -0.911159962 -1.456488e-02 -0.86813055 -0.158884895
## TIMELAG NA NA NA NA NA
## AVGGIFT -0.0147053387 -0.189499838 -6.367767e-04 -0.14012767 -0.013944256
## CONTROLN 0.0004204939 -0.115830917 -8.913870e-02 -0.20112769 -0.137348031
## TARGET_B 0.0061886745 0.032466507 -6.913899e-03 0.03316131 0.019614841
## TARGET_D 0.0066483728 0.011302393 -4.496352e-03 0.01733717 0.016357754
## HPHONE_D 0.2134981765 0.062646094 2.404688e-02 0.05680419 0.009329881
## RFA_2F -0.0004476504 0.131908145 -7.268394e-03 0.12056162 0.294564046
## CLUSTER2 NA NA NA NA NA
## NUMPRM12 RAMNTALL NGIFTALL CARDGIFT
## ODATEDW -0.159956207 -0.4674488264 -7.189279e-01 -0.747006191
## DOB -0.049064562 -0.0684069242 -1.210833e-01 -0.118412182
## CLUSTER NA NA NA NA
## AGE NA NA NA NA
## NUMCHLD NA NA NA NA
## INCOME NA NA NA NA
## WEALTH1 NA NA NA NA
## HIT 0.045792322 0.0339408708 4.354495e-02 0.044594182
## CARDPROM 0.321836443 0.5504451111 7.757860e-01 0.779219371
## MAXADATE 0.113114831 -0.0009386919 5.713401e-05 -0.008561992
## NUMPROM 0.512445807 0.6245019252 7.945040e-01 0.748479318
## CARDPM12 0.613360777 0.2458721194 2.410824e-01 0.185616938
## NUMPRM12 1.000000000 0.3837234762 3.037213e-01 0.183812467
## RAMNTALL 0.383723476 1.0000000000 5.997817e-01 0.501330901
## NGIFTALL 0.303721295 0.5997817385 1.000000e+00 0.914586069
## CARDGIFT 0.183812467 0.5013309014 9.145861e-01 1.000000000
## MINRAMNT -0.083060144 -0.0628026948 -3.790845e-01 -0.370640291
## MAXRAMNT 0.142544468 0.5574275003 -4.365491e-02 -0.067850242
## LASTGIFT 0.097370411 0.3237506544 -1.813680e-01 -0.190158107
## LASTDATE 0.333015536 0.0556938292 9.433850e-02 0.069994315
## FISTDATE -0.172496644 -0.4781509390 -7.271117e-01 -0.745287053
## TIMELAG NA NA NA NA
## AVGGIFT 0.088357336 0.3655595066 -2.569922e-01 -0.268607185
## CONTROLN -0.313830159 -0.1017185514 -1.292632e-01 -0.081800423
## TARGET_B 0.018639427 0.0146514065 5.089622e-02 0.054027167
## TARGET_D 0.025133778 0.0448101061 4.899013e-03 0.006449882
## HPHONE_D 0.006704794 0.0014006272 3.608749e-02 0.048155608
## RFA_2F 0.156378426 0.0820955973 3.468415e-01 0.320664755
## CLUSTER2 NA NA NA NA
## MINRAMNT MAXRAMNT LASTGIFT LASTDATE
## ODATEDW 0.406741498 0.0048863109 0.0835751015 0.026900121
## DOB 0.028076140 0.0000974447 0.0206641991 -0.040782008
## CLUSTER NA NA NA NA
## AGE NA NA NA NA
## NUMCHLD NA NA NA NA
## INCOME NA NA NA NA
## WEALTH1 NA NA NA NA
## HIT -0.029572895 -0.0023220327 -0.0072132620 0.019176920
## CARDPROM -0.415771224 0.0229111768 -0.0590936833 -0.007465905
## MAXADATE -0.004010021 0.0016328014 0.0032075759 -0.001718122
## NUMPROM -0.390848967 0.0663902747 -0.0242889563 0.056631698
## CARDPM12 -0.155123166 0.0579103653 0.0307669726 0.145944155
## NUMPRM12 -0.083060144 0.1425444684 0.0973704112 0.333015536
## RAMNTALL -0.062802695 0.5574275003 0.3237506544 0.055693829
## NGIFTALL -0.379084483 -0.0436549062 -0.1813680236 0.094338495
## CARDGIFT -0.370640291 -0.0678502417 -0.1901581073 0.069994315
## MINRAMNT 1.000000000 0.2932493274 0.5315475400 -0.025563775
## MAXRAMNT 0.293249327 1.0000000000 0.5626203544 -0.010541563
## LASTGIFT 0.531547540 0.5626203544 1.0000000000 -0.065573416
## LASTDATE -0.025563775 -0.0105415633 -0.0655734157 1.000000000
## FISTDATE 0.407530722 0.0035759985 0.0836714433 0.033170260
## TIMELAG NA NA NA NA
## AVGGIFT 0.754562553 0.7668103009 0.7841871205 -0.024475849
## CONTROLN 0.040789867 -0.0119468975 -0.0004266941 -0.041289922
## TARGET_B -0.031008020 -0.0168123183 -0.0355261295 0.041157610
## TARGET_D 0.020157869 0.0392237509 0.0616784458 0.018847102
## HPHONE_D -0.048412526 -0.0230696642 -0.0291955819 0.006470935
## RFA_2F -0.280924428 -0.1441743491 -0.3004592822 0.242880145
## CLUSTER2 NA NA NA NA
## FISTDATE TIMELAG AVGGIFT CONTROLN TARGET_B
## ODATEDW 0.976866221 NA 0.2168694576 0.0790360762 -0.032269845
## DOB 0.098442853 NA 0.0208277233 -0.0236482005 -0.001703350
## CLUSTER NA NA NA NA NA
## AGE NA NA NA NA NA
## NUMCHLD NA NA NA NA NA
## INCOME NA NA NA NA NA
## WEALTH1 NA NA NA NA NA
## HIT -0.070811115 NA -0.0147053387 0.0004204939 0.006188675
## CARDPROM -0.911159962 NA -0.1894998376 -0.1158309174 0.032466507
## MAXADATE -0.014564884 NA -0.0006367767 -0.0891386981 -0.006913899
## NUMPROM -0.868130548 NA -0.1401276720 -0.2011276950 0.033161307
## CARDPM12 -0.158884895 NA -0.0139442563 -0.1373480305 0.019614841
## NUMPRM12 -0.172496644 NA 0.0883573364 -0.3138301587 0.018639427
## RAMNTALL -0.478150939 NA 0.3655595066 -0.1017185514 0.014651406
## NGIFTALL -0.727111652 NA -0.2569922245 -0.1292632151 0.050896222
## CARDGIFT -0.745287053 NA -0.2686071846 -0.0818004231 0.054027167
## MINRAMNT 0.407530722 NA 0.7545625533 0.0407898667 -0.031008020
## MAXRAMNT 0.003575998 NA 0.7668103009 -0.0119468975 -0.016812318
## LASTGIFT 0.083671443 NA 0.7841871205 -0.0004266941 -0.035526129
## LASTDATE 0.033170260 NA -0.0244758488 -0.0412899221 0.041157610
## FISTDATE 1.000000000 NA 0.2162506047 0.0880128284 -0.032700683
## TIMELAG NA 1 NA NA NA
## AVGGIFT 0.216250605 NA 1.0000000000 0.0115070224 -0.032443507
## CONTROLN 0.088012828 NA 0.0115070224 1.0000000000 0.013165284
## TARGET_B -0.032700683 NA -0.0324435069 0.0131652838 1.000000000
## TARGET_D -0.007532493 NA 0.0442990841 0.0133664439 0.774232376
## HPHONE_D -0.071569432 NA -0.0443229794 -0.1087474012 -0.002625629
## RFA_2F -0.068630177 NA -0.2771682878 -0.0165448955 0.072311406
## CLUSTER2 NA NA NA NA NA
## TARGET_D HPHONE_D RFA_2F CLUSTER2
## ODATEDW -0.0069484311 -0.073434849 -0.0650818111 NA
## DOB 0.0027541472 0.076916357 -0.0485171328 NA
## CLUSTER NA NA NA NA
## AGE NA NA NA NA
## NUMCHLD NA NA NA NA
## INCOME NA NA NA NA
## WEALTH1 NA NA NA NA
## HIT 0.0066483728 0.213498176 -0.0004476504 NA
## CARDPROM 0.0113023931 0.062646094 0.1319081451 NA
## MAXADATE -0.0044963520 0.024046877 -0.0072683936 NA
## NUMPROM 0.0173371740 0.056804193 0.1205616181 NA
## CARDPM12 0.0163577542 0.009329881 0.2945640464 NA
## NUMPRM12 0.0251337775 0.006704794 0.1563784256 NA
## RAMNTALL 0.0448101061 0.001400627 0.0820955973 NA
## NGIFTALL 0.0048990126 0.036087494 0.3468415023 NA
## CARDGIFT 0.0064498822 0.048155608 0.3206647555 NA
## MINRAMNT 0.0201578686 -0.048412526 -0.2809244275 NA
## MAXRAMNT 0.0392237509 -0.023069664 -0.1441743491 NA
## LASTGIFT 0.0616784458 -0.029195582 -0.3004592822 NA
## LASTDATE 0.0188471021 0.006470935 0.2428801451 NA
## FISTDATE -0.0075324932 -0.071569432 -0.0686301765 NA
## TIMELAG NA NA NA NA
## AVGGIFT 0.0442990841 -0.044322979 -0.2771682878 NA
## CONTROLN 0.0133664439 -0.108747401 -0.0165448955 NA
## TARGET_B 0.7742323755 -0.002625629 0.0723114063 NA
## TARGET_D 1.0000000000 -0.002431590 0.0009047682 NA
## HPHONE_D -0.0024315898 1.000000000 0.0076071769 NA
## RFA_2F 0.0009047682 0.007607177 1.0000000000 NA
## CLUSTER2 NA NA NA 1
#pairs(cup98)
#绘制数值变量的散布图,并基于目标变量设置点的颜色:使用函数jitter()添加少量的噪声数据,在存在大量重叠时间点的情况这种设置很有用
color <- ifelse(cup98$TARGET_D > 0, "blue", "black")
pch <- ifelse(cup98$TARGET_D > 0, "+", ".")
plot(jitter(cup98$AGE), jitter(cup98$HIT), pch=pch, col=color, cex=0.7, ylim=c(0, 70), xlab = "AGE", ylab="HIT")
legend("topleft", c("TARGET_D>0", "TARGET_D=0"), col=c("blue", "black"), pch=c("+", "."))
#5)对于分类变量,使用卡方检验查看自变量与因变量之间的关系
myChisqTest <- function(x) {
t1 <- table(cup98pos[, x], cup98pos$TARGET_D2)
plot(t1, main=x, las=1)
print(x)
print(chisq.test(t1))
}
myChisqTest("GENDER")
## [1] "GENDER"
##
## Pearson's Chi-squared test
##
## data: t1
## X-squared = NaN, df = 42, p-value = NA
#卡方值越大,说明相关性越小
#对所有的分类变量求与因变量的卡方值
idx.cat <- which(sapply(cup98pos, is.factor))
#sapply(names(idx.cat), myChisqTest)
#1)创建训练集和测试集的大小
nRec <- nrow(cup98)
trainSize <- round(nRec*0.7)
testSize <- nRec - trainSize
#2)ctree模型参数
MinSplit <- 1000
MinBucket <- 400
MaxSurrogate <- 4
MaxDepth <- 10
(strParameters <- paste(MinSplit, MinBucket, MaxSurrogate, MaxDepth, sep="-"))
## [1] "1000-400-4-10"
LoopNum <- 10
cost <- 0.68
#筛选创建决策树所需的属性
varSet2 <- c("AGE", "AVGGIFT", "CARDGIFT", "CARDPM12", "CARDPROM",
"CLUSTER2", "DOMAIN", "GENDER", "GEOCODE2", "HIT", "HOMEOWNR", "HPHONE_D",
"INCOME", "LASTGIFT", "MAXRAMNT", "MDMAUD_F", "MDMAUD_R", "MINRAMNT",
"NGIFTALL", "NUMPRM12", "PCOWNERS", "PEPSTRFL", "PETS", "RAMNTALL",
"RECINHSE", "RFA_2A", "RFA_2F", "STATE", "TIMELAG")
cup98 <- cup98[, c("TARGET_D", varSet2)]
dim(cup98)
## [1] 95412 30
#使用pdf()函数设置图形区域和点的大小,以便能够在A4纸上打印出一棵合适大小的决策树
pdf(paste("F:\\R\\Rworkspace/evaluation-tree-", strParameters, ".pdf", sep=""), width=12, height=9, paper="a4r", pointsize=6)
cat(date(), "\n")
## Sun Feb 14 17:40:45 2016
cat(" trainSize=", trainSize, ", testSize=", testSize, "\n")
## trainSize= 66788 , testSize= 28624
cat(" MinSplit=", MinSplit, ", MinBucket=", MinBucket, ", MaxSurrogate=", MaxSurrogate, ", MaxDepth=", MaxDepth, "\n\n")
## MinSplit= 1000 , MinBucket= 400 , MaxSurrogate= 4 , MaxDepth= 10
#运行多次并获取平均结果
allTotalDonation <- matrix(0, nrow=testSize, ncol=LoopNum)
allAvgDonation <- matrix(0, nrow=testSize, ncol=LoopNum)
allDonationPercentile <- matrix(0, nrow=testSize, ncol=LoopNum)
#创建多棵决策树
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
for(loopCnt in 1:LoopNum) {
#1.输出当前日期和循环数
cat(date(), ":iteration = ", loopCnt, "\n")
#2.创建训练集和测试集数据
trainIdx <- sample(1:nRec, trainSize)
trainData <- cup98[trainIdx, ]
testData <- cup98[-trainIdx, ]
#3.创建模型:
myCtree <- ctree(TARGET_D~., data=trainData, controls=ctree_control(minsplit=MinSplit, minbucket=MinBucket,
maxsurrogate=MaxSurrogate, maxdepth=MaxDepth))
#4.查看模型的大小并保存
print(object.size(myCtree), units="auto") #存储对象内存的估计值
save(myCtree, file=paste("F:\\R\\Rworkspace/cup98-ctree", strParameters, "-run-", loopCnt, ".rdata", sep=""))
#5.画出决策树图
figTitle <- paste("Tree", loopCnt)
plot(myCtree, main=figTitle, type="simple", ip_args=list(pval=F), ep_args=list(digits=0, abbreviate=T), tp_args=list(digits=2))
#6.预测
pred <- predict(myCtree, newdata=testData)
plot(pred, testData$TARGET_D)
print(sum(testData$TARGET_D[pred > cost] - cost)) #测试数据净捐赠数额
#7.对每次采样数据的捐赠总额、平均进行统计
s1 <- sort(pred, decreasing=T, method="quick", index.return=T) #method排序方法为quick,index.return为T返回排序索引
totalDonation <- cumsum(testData$TARGET_D[s1$ix]) #返回累积捐款和,是个向量与变量长度相等
avgDonation <- totalDonation/(1:testSize) #每个捐款累积和的平均捐款额
donationPercentile <- 100*totalDonation/sum(testData$TARGET_D) #每个捐款累积和所占捐款的百分比
allTotalDonation[, loopCnt] <- totalDonation
allAvgDonation[, loopCnt] <- avgDonation
allDonationPercentile[, loopCnt] <- donationPercentile
plot(totalDonation, type="l")
grid() #向图中添加网格
}
## Sun Feb 14 17:40:46 2016 :iteration = 1
## 26.1 Mb
## [1] 3741.06
## Sun Feb 14 17:41:10 2016 :iteration = 2
## 22.5 Mb
## [1] 3796.26
## Sun Feb 14 17:41:33 2016 :iteration = 3
## 18.4 Mb
## [1] 3789.26
## Sun Feb 14 17:41:55 2016 :iteration = 4
## 19.9 Mb
## [1] 3278.37
## Sun Feb 14 17:42:19 2016 :iteration = 5
## 26.1 Mb
## [1] 2490.37
## Sun Feb 14 17:42:42 2016 :iteration = 6
## 31.8 Mb
## [1] 3946.88
## Sun Feb 14 17:43:07 2016 :iteration = 7
## 33.9 Mb
## [1] 3755
## Sun Feb 14 17:43:32 2016 :iteration = 8
## 27.6 Mb
## [1] 3573.84
## Sun Feb 14 17:43:55 2016 :iteration = 9
## 22 Mb
## [1] 3506.08
## Sun Feb 14 17:44:17 2016 :iteration = 10
## 30.2 Mb
## [1] 3290.64
graphics.off()
cat(date(),":Loop completed.\n\n")
## Sun Feb 14 17:44:42 2016 :Loop completed.
#共耗时4分钟左右
fnlTotalDonation <- rowMeans(allTotalDonation) #10次捐款累积和的平均值
fnlAvgDonation <-rowMeans(allAvgDonation) #10次平均捐款额的平均值
fnlDonationPercentile <- rowMeans(allDonationPercentile) #10次累积捐款额百分比的平均值
rm(trainData, testData, pred)
#把结果保存到csv文件中:
results <- data.frame(cbind(allTotalDonation, fnlTotalDonation)) #分别10次捐款累积和及其平均值
names(results) <- c(paste("run", 1:LoopNum), "Average")
write.csv(results, paste("F:\\R\\Rworkspace/evaluation-TotalDonation-", strParameters, ".csv", sep=""))
#1)画出每次采样时,决策树预测捐款总额与发送邮件的关系图
results <- read.csv("F:\\R\\Rworkspace/evaluation-TotalDonation-1000-400-4-10.csv")
head(results)
## X run.1 run.2 run.3 run.4 run.5 run.6 run.7 run.8 run.9 run.10 Average
## 1 1 0 0 0 0 0 0 0 0 0 0 0.0
## 2 2 0 0 0 0 0 0 50 0 0 0 5.0
## 3 3 0 0 0 0 0 0 50 0 0 0 5.0
## 4 4 0 0 0 0 0 0 50 0 0 33 8.3
## 5 5 0 0 0 0 0 0 50 0 0 33 8.3
## 6 6 0 0 0 0 0 0 50 0 0 33 8.3
results[, 2:12] <- results[, 2:12] - cost*(1:testSize)
idx.pos <- c(seq(1, nrow(results), by=10), nrow(results)) #对每10个数据点只绘制一个点,以便缩小保存图表文件的大小
#画出平均的性能
plot(results[idx.pos, 12], type="l", lty=1, col=1, ylim=c(0, 4500), xlab="Number of Mails", ylab="Amount of Donations($)")
for(fCnt in 1:LoopNum) {
lines(results[idx.pos, fCnt+1], pty=".", type="l", lty=1+fCnt, col=1+fCnt)
}
legend("bottomright", col=1:(LoopNum+1), lty=1:(LoopNum+1), legend=c("Average", paste("Run", 1:LoopNum)))
#从上图可知:黑色实线表明所有9次运行的平均性能,其它线表示单次运行结果的性能。其中,运行7次时性能最好
#2)画出每次采样时,决策树预测捐款的百分比图和平均百分比图
donationPercentile <- sapply(2:12, function(i) 100*results[, i]/results[testSize, i]) #results[testSize, i]为捐款累积和的最后一行,即所有的捐款
percentile <- 100*(1:testSize)/testSize
plot(percentile[idx.pos], donationPercentile[idx.pos, 11], pty=".", type="l", lty=1, col=1, ylim=c(0, 170), xlab="Contact Percentile(%)", ylab="Donation Percentile(%)")
grid(col="gray", lty="dotted")
for(fCnt in 1:LoopNum) {
lines(percentile[idx.pos], donationPercentile[idx.pos, fCnt], pty=".", type="l", lty=1+fCnt, col=1+fCnt)
}
legend("bottomright", col=1:(LoopNum+1), lty=1:(LoopNum+1), legend=c("Average", paste("Run", 1:LoopNum)))
#从上图可知:黑色实线表明所有9次运行的平均性能,其它线表示单次运行结果的性能。其中,运行7次时性能最好
#3)对评定结果绘制图表:图中有两个Y轴,为运行9次得到的平均结果,实线表示所有筹集到的捐款数额的百分比,虚线表示通过客户接触获得的捐款额的平均值。每一个通过接触的客户的平均捐款数额在图表左边时比较多,随着接触的客户越来越多,每一个客户的平均捐款减少。因此,建立的预测模型能够有效的从客户排名榜列表中捕获捐款最多的客户。
avgDonation <- sapply(2:12, function(i) results[, i]/(1:testSize))
yTitle <- c("Total Donation Amount Percentile(%)", "Average Donation Amount per Contact($)")
par(mar=c(5,4,4,5)+.1)
plot(percentile[idx.pos], donationPercentile[idx.pos, 7], pty=".", type="l", lty="solid", col="red", ylab=yTitle[1], xlab="Contact Percentile(%)")
grid(col="gray", lty="dotted")
par(new=T)
plot(percentile[idx.pos], avgDonation[idx.pos, 7], type="l", lty="dashed", col="blue", xaxt="n", yaxt="n", xlab="", ylab="", ylim=c(0, max(avgDonation[, 7])))
axis(4)
mtext(yTitle[2], side=4, line=2)
legend("right", col=c("red", "blue"), lty=c("solid", "dashed"), legend=yTitle)
注意:此处没有运行,具体选择参数过程请自己运行。下面预测是使用上面的模型。 6组参数分别设置MinSplit、MinBucket、MaxSurrogate和MaxDepth的值。例如,第一组参数设置为“1000-400-4-5”,表明MinSplit设为1000,MinBucke设为400,MaxSurrogate设为4,MaxDepth设为5。将MinSplit分别设置为1000、700和200进行测试,相应的将MinBucket设置为400、200和50。同时,MaxDepth也需要分别设置5,6,8和10,而整个实验过程中MaxSurrogate的值始终为4. 下图中横坐标表示接触客户(已排序)的百分比,纵坐标表示捐款数额。建立模型的目的是为了在接触相同数量的客户情况下筹集到更多的捐款。代码如下
#对比不同参数的结果
parameters <- c("1000-400-4-5", "1000-400-4-6", "1000-400-4-8", "1000-400-4-10")
#parameters <- c("1000-400-4-10", "700-200-4-10", "200-50-4-10")
paraNum <- length(parameters)
percentile <- 100*(1:testSize)/testSize
#1)第一个结果
results <- read.csv(paste("F:\\R\\Rworkspace/evaluation-TotalDonation-", parameters[1], ".csv", sep=""))
avgResult <- results$Average - cost*(1:testSize)
plot(percentile, avgResult, pty=1, type="l", lty=1, col=1, ylab="Amount of Donation", xlab="Contact percentile(%)", main="Parameters: MinSplit, MinBucket, MaxSurrogate, MaxDepth")
grid(col="gray", lty="dotted")
#2)其它结果
for(i in 2:paraNum) {
results <- read.csv(paste("F:\\R\\Rworkspace/evaluation-TotalDonation-", parameters[i], ".csv", sep=""))
avgResult <- results$Average - cost*(1:testSize)
lines(percentile, avgResult, type="l", lty=i, col=i)
}
legend("bottomrigth", col=1:paraNum, lty=1:paraNum, legend = parameters)
#上上图显示深度为8和10获得的结果比深度为5和6的结果更好,上图MinBucket和Minsp的3中不同设置获得了相似的结果。我们选择参数“1000-400-4-5”来创建最后的模型,因为这一组参数设置了最少的存储桶和分裂点,并且相对于其他模型的过度拟合而言拟合度较低。
对验证数据cup98val.txt评分,预测捐款数额大于0.68的客户,将向其发送邮件以便筹集捐款,评估的标准是总的捐款数额扣除所有的邮件成本
#1)读取所需要预测的数据
cup98val <- read.csv("F:\\R\\Rworkspace\\cup98lrn/cup98val.txt")
#对预测数据做属性选择
cup98val <- cup98val[, c("CONTROLN", varSet2)]
dim(cup98val)
## [1] 96367 30
dim(cup98)
## [1] 95412 30
#2)查看预测数据在训练数据中没有的属性
trainNames <- names(cup98)
scoreNames <- names(cup98val)
(idx <- which(trainNames %in% scoreNames))
## [1] 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## [24] 25 26 27 28 29 30
print(trainNames[-idx])
## [1] "TARGET_D"
print(scoreNames[-idx])
## [1] "CONTROLN"
#3)因子转换:把预测数据与训练数据类型不一样的属性,修改为训练数据的类型(因子类型)
scoreData <- cup98val
vars <- intersect(trainNames, scoreNames)
for(i in 1:length(vars)) {
varname <- vars[i]
trainLevels <- levels(cup98[, varname])
scorelevels <- levels(scoreData[, varname])
if(is.factor(cup98[, varname]) & setequal(trainLevels, scorelevels)==F) {
cat("Warning:new values found in score data, and they will be changed to NA!\n")
cat(varname, "\n")
#把预测数据修改为训练数据类型
scoreData[, varname] <- factor(scoreData[, varname], levels=trainLevels)
}
}
## Warning:new values found in score data, and they will be changed to NA!
## GENDER
## Warning:new values found in score data, and they will be changed to NA!
## STATE
rm(cup98val)
#4)预测:加载训练好的模型并预测
load("F:\\R\\Rworkspace\\cup98-ctree1000-400-4-10-run-7.rdata")
pred <- predict(myCtree, newdata=scoreData)
pred <- round(pred, digits=3)
table(pred, useNA="ifany")
## pred
## 0.304 0.362 0.565 0.653 0.678 0.709 0.788 0.799 0.856 0.885 0.95 1.219
## 3294 4366 12418 26276 5614 8965 10715 3417 4494 2406 2006 4342
## 1.359 1.587 1.594 1.715 1.784 1.912 2.339 2.499
## 1012 1792 581 1233 709 746 694 1287
result <- data.frame(scoreData$CONTROLN, pred)
names(result) <- c("CONTROLN", "pred")
valTarget <- read.csv("F:\\R\\Rworkspace\\cup98lrn/valtargt.txt")
merged <- merge(result, valTarget, by="CONTROLN")
str(valTarget)
## 'data.frame': 29288 obs. of 3 variables:
## $ CONTROLN: int 3 6 9 11 16 19 20 23 24 25 ...
## $ TARGET_B: int 0 0 0 0 0 0 0 0 0 1 ...
## $ TARGET_D: num 0 0 0 0 0 0 0 0 0 25 ...
str(result)
## 'data.frame': 96367 obs. of 2 variables:
## $ CONTROLN: int 188946 126296 155244 123985 119118 10120 59465 80803 2824 145014 ...
## $ pred : num 0.709 0.709 0.678 0.304 0.653 0.678 0.653 0.565 0.788 0.788 ...
str(merged)
## 'data.frame': 29287 obs. of 4 variables:
## $ CONTROLN: int 3 6 9 11 16 19 20 23 24 25 ...
## $ pred : num 2.339 0.653 0.653 2.339 2.339 ...
## $ TARGET_B: int 0 0 0 0 0 0 0 0 0 1 ...
## $ TARGET_D: num 0 0 0 0 0 0 0 0 0 25 ...
#删除数值为NA的数据
ind <- which(is.na(valTarget$TARGET_D))
valTarget1 <- valTarget[-ind, ]
sum(valTarget1$TARGET_D - cost)
## [1] 2364.84
#预测捐款大于0.68的总捐款数
idx <- (merged$pred > cost)
sum(merged$TARGET_D[idx] - cost)
## [1] 3235.14
#5)客户排名
merged <- merged[order(merged$pred, decreasing = T), ]
x <- 100*(1:nrow(merged))/nrow(merged)
y <- cumsum(merged$TARGET_D) - cost*(1:nrow(valTarget))
#每隔10点绘制一次,减少绘制文件的大小
idx.pos <- c(seq(1, length(x), by=10), length(x))
plot(x[idx.pos], y[idx.pos], type="l", xlab="Contab Percentile(%)", ylab="Amount of Donation")
grid()
总结:本实例演示在真实的应用场景下如何建立决策树,并且所建立的模型不是最优模型。可以参考以下两种方法: 第一种:使用两阶段模型,第一阶段模型是预测捐款的可能性,第二阶段模型是预测有条件的捐款数额,结合两个阶段模型预测的结果得到一个无条的捐款数额。平衡数据:捐款的客户数量所占的百分比只有5.1%,而其他大部分的客户是没有捐款的。通过消减样本中未捐款的数量或者增加捐款客户的数量来平衡数据,这样有助于更简单地创建一个更好的预测模型。 第二种:基于日期和历史捐款记录来提取新的变量。在本例中,建模的过程没有日期变量和历史捐款数据。实际上,可以从这些数据中抽取一些潜在的有用信息,如自最后一次捐款后的天数,在过去一年、两年、三年中的捐款数额。新提取的变量有助于提高预测模型的性能。
注意:在本例中,建模的过程还排除一些具有很多等级水平的分类变量,因为加入这些分类变量将会消耗很多的内存空间和时间。但是,也可以通过分组的方式来减少等级数量,特别是一些不常用的等级水平,例如人口数量较少的州名和邮政编码。还可以通过删除离群点和填补缺失值来推定数据。