如何生成一组随机数用于测试?sample函数:sample(x, size, replace, prob)
说明:prob表示按照给定的概率抽取,由一个向量组成
例子:
# Draw 15 integers from 1..100, sampling with replacement.
t1 <- sample(x = seq_len(100), size = 15L, replace = TRUE)
set.seed()函数
说明:设置随机种子,如果设置了随机种子每次的随机数是相同的。目的是方便以后他人进行测试
runif()函数:随机生成均匀分布的小数
runif(n,min = 0, max = 1)
例子
# Fix the seed so the 15 draws below are reproducible across runs.
set.seed(1234)
# Scale the uniform draws by 100 and truncate them to integers.
as.integer(100 * runif(15))
R 常用数据包:
1.数据并行处理
# Attach with library() rather than require(): library() stops with an error
# when a package is missing, while require() only returns FALSE.
library(doMC)
library(glmnet) # provides cv.glmnet(), which was called without being attached
# Register a doMC backend with 2 cores for the parallel = TRUE call below.
registerDoMC(cores = 2)
# NOTE(review): `x` and `y` are not defined in this snippet; presumably the
# predictor matrix and the response vector -- confirm against the caller.
cv.glmnet(x, y, parallel = TRUE)
说明:lambda.1se(cv.glmnet 返回结果中的字段): which gives the most regularized model such that error is within one standard error of the minimum.
L1正则化可以产生稀疏权值矩阵,即产生一个稀疏模型,可以用于特征选择
L2正则化可以防止模型过拟合(overfitting);一定程度上,L1也可以防止过拟合
2.VLM package(otvPlots)
使用说明:https://github.com/capitalone/otvPlots/blob/master/README.md
library(otvPlots)
# Read the comma-separated input table. read.table() already returns a
# data.frame, so no extra data.frame() wrapper is needed.
# `na.strings` is spelled out in full instead of relying on partial argument
# matching; the NULL that used to sit inside c() was a no-op and is dropped.
# NOTE(review): "misssing" is kept exactly as written -- confirm whether the
# data really uses this spelling or whether "missing" was intended.
data_vars <- read.table(
  "mergeapplyUsr_filter.matrix+apptm",
  header = TRUE,
  sep = ",",
  na.strings = c("-9999", "misssing")
)
## Prepare data and labels
applyData <- PrepData(
  data_vars,
  dateNm = "date",
  dateGp = "weeks",
  dateGpBp = "quarters"
)
#bankLabels<- PrepLabels(bankLabels)
## Generate a pdf file of vlm plots, and csv files of summary statistics
vlm(
  dataFl = applyData,
  dateNm = "date",
  dateGp = "weeks",
  dateGpBp = "quarters",
  outFl = "test"
)
3.ctree
library(party)
# Inspect the structure and the first rows of the modelling data.
# NOTE(review): `df` is not defined in this snippet -- it must be created
# before these lines are run.
str(df)
head(df)
# Seed the RNG before fitting so the run is repeatable.
set.seed(12345)
# Fit a conditional inference tree for dpdTag on all other columns,
# limited to a depth of 3.
modelT1 <- ctree(
  formula = dpdTag ~ .,
  data = df,
  controls = ctree_control(maxdepth = 3)
)
plot(modelT1, type = "simple")
4.R 多个文件merge
# Left-join each feature table onto `passid` in turn, keyed on "passid".
# `sort = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
all.feat <- Reduce(
  function(x, y) merge(x, y, by = "passid", all.x = TRUE, sort = TRUE),
  list(
    passid,
    raw.feat0, raw.feat1, raw.feat2,
    raw.feat3, raw.feat4, raw.feat5
  )
)
5.R 筛选缺失率在90%以上(即 NA_percentage > 0.9)的变量名称
# Names of the rows whose NA_percentage exceeds 0.9.
# which() drops NA comparisons, so an NA in NA_percentage no longer produces
# an NA entry in the result.
union_allNA <- row.names(data_vars_NA)[which(data_vars_NA$NA_percentage > 0.9)]
6.按照固定顺序排列
# NOTE(review): `contain_vars` is not defined in the visible part of this file
# and is not a base R function -- confirm where it comes from. Per the heading
# above, it presumably arranges `bb` in the order given by names(aa); verify.
contain_vars(bb,names(aa))
7. R function
# Compute, for each distinct value of `stat_dt`, the mean of every other
# column of `data`.
#
# Args:
#   data: a data.frame with a `stat_dt` column. Every column except the first
#         is averaged. NOTE(review): this assumes `stat_dt` is the first
#         column, as the original code did -- confirm against callers.
#
# Returns:
#   A data.frame with one row per distinct `stat_dt` value (row names are the
#   values as character, in order of first appearance) and one column per
#   averaged column. NAs are ignored when averaging.
mean_cal <- function(data) {
  stat_dates <- unique(data$stat_dt)
  # names(data)[-1] still works when only one value column is left, whereas
  # names(data[, -1]) collapses to NULL in that case.
  value_cols <- names(data)[-1]

  mean_list <- data.frame(
    matrix(NA_real_, nrow = length(stat_dates), ncol = length(value_cols))
  )
  names(mean_list) <- value_cols
  row.names(mean_list) <- as.character(stat_dates)

  # Index rows by position: using a numeric or Date `stat_dt` value directly
  # as a row subscript would address the wrong row.
  for (k in seq_along(stat_dates)) {
    in_group <- which(data$stat_dt == stat_dates[k])
    for (col in value_cols) {
      mean_list[k, col] <- mean(data[in_group, col], na.rm = TRUE)
    }
  }
  mean_list
}
8. R cal_Ks
# Plot TPR and FPR against the score threshold and report the KS statistic
# (the largest gap TPR - FPR over all thresholds).
#
# Args:
#   pre:   numeric vector of predicted scores; every observed score is used
#          as a threshold, and a case counts as predicted positive when its
#          score is >= the threshold.
#   label: class labels, same length as `pre`; 1/TRUE marks a positive and
#          0/FALSE a negative.
#
# Side effects:
#   Draws TPR (green) and FPR (red) against the sorted thresholds, marks the
#   KS cutpoint with a blue vertical segment, and prints "KS-value: <value>".
#
# Returns:
#   The KS value, invisibly.
myKS <- function(pre, label) {
  stopifnot(length(pre) == length(label), !anyNA(pre), !anyNA(label))
  is_pos <- as.logical(label)
  n_pos <- sum(is_pos)
  n_neg <- length(is_pos) - n_pos
  if (n_pos == 0 || n_neg == 0) {
    # Without both classes the rates would divide by zero.
    stop("myKS needs at least one positive and one negative label",
         call. = FALSE)
  }

  # Thresholds in increasing order.
  o_pre <- sort(pre)
  tpr <- vapply(
    o_pre,
    function(thr) sum(pre >= thr & is_pos) / n_pos,
    numeric(1)
  )
  fpr <- vapply(
    o_pre,
    function(thr) sum(pre >= thr & !is_pos) / n_neg,
    numeric(1)
  )

  plot(o_pre, tpr, type = "l", col = "green",
       xlab = "threshold", ylab = "tpr,fpr")
  lines(o_pre, fpr, type = "l", col = "red")

  # which.max() returns a single index (the first maximum), so tied maxima
  # no longer yield several cutpoints and a garbled marker line.
  gap <- tpr - fpr
  cutpoint <- which.max(gap)
  KSvalue <- gap[cutpoint]
  title(sub = paste("KS value =", KSvalue))

  thre <- o_pre[cutpoint]
  lines(c(thre, thre), c(fpr[cutpoint], tpr[cutpoint]), col = "blue")
  cat("KS-value:", KSvalue)
  invisible(KSvalue)
}