开始使用R

如何生成一组随机数用于测试？sample()函数：sample(x, size, replace, prob)

说明：prob表示按照给定的概率抽取，由一个向量组成

例子：

# Draw 15 integers from 1..100 with replacement.
t1 <- sample(x = 1:100, size = 15, replace = TRUE)

set.seed()函数

说明：设置随机种子。如果设置了随机种子，每次生成的随机数都是相同的，目的是方便以后他人复现和测试。

runif()函数：随机生成均匀分布的小数

runif(n,min = 0, max = 1)

例子

# Fix the RNG state so the 15 draws below are reproducible.
set.seed(1234)

# Scale 15 uniform draws from [0, 1) up by 100 and truncate to integers.
as.integer(100 * runif(n = 15))

R 常用数据包：

1.数据并行处理

# Load dependencies with library(): it stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script continue.
library(doMC)
library(glmnet) # provides cv.glmnet()

# Register two worker cores as the parallel backend.
registerDoMC(cores = 2)

# Cross-validated glmnet fit with parallel = TRUE.
# `x` and `y` are not defined in this snippet and must already exist.
cv.glmnet(x, y, parallel = TRUE)

说明：lambda.1se（cv.glmnet 返回结果中的字段）：gives the most regularized model such that the cross-validated error is within one standard error of the minimum.

L1正则化可以产生稀疏权值矩阵，即产生一个稀疏模型，可以用于特征选择

L2正则化可以防止模型过拟合（overfitting）；一定程度上，L1也可以防止过拟合

2. VLM package（otvPlots）

使用说明：https://github.com/capitalone/otvPlots/blob/master/README.md

library(otvPlots)

# Read the comma-separated input file, which has a header row.
# - `na.strings` is the full argument name; the original `na.string` only
#   worked through partial argument matching.
# - A NULL inside c() is silently dropped, so it has been removed.
# - read.table() already returns a data.frame, so the data.frame() wrapper
#   was redundant.
# NOTE(review): "misssing" is kept exactly as written; if the file marks
# missing values as "missing", correct the spelling here.
data_vars <- read.table(
  "mergeapplyUsr_filter.matrix+apptm",
  header = TRUE,
  sep = ",",
  na.strings = c("-9999", "misssing")
)

## Prepare data and labels
applyData <- PrepData(
  data_vars,
  dateNm = "date",
  dateGp = "weeks",
  dateGpBp = "quarters"
)

# bankLabels <- PrepLabels(bankLabels)

## Generate a pdf file of vlm plots, and csv files of summary statistics
vlm(
  dataFl = applyData,
  dateNm = "date",
  dateGp = "weeks",
  dateGpBp = "quarters",
  outFl = "test"
)

3.ctree

library(party)

# Inspect the structure and the first rows of the input before modelling.
str(df)
head(df)

# Fix the RNG state so the run is reproducible.
set.seed(12345)

# Fit a tree of dpdTag on all other columns, with depth capped at 3.
modelT1 <- ctree(
  dpdTag ~ .,
  data = df,
  controls = ctree_control(maxdepth = 3)
)

plot(modelT1, type = "simple")

4.R 多个文件merge

# Fold the list left to right, merging each table onto the accumulated result
# by "passid". all.x = TRUE keeps every row of the left-hand side, so the
# first element (`passid`) determines which keys survive.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
all.feat <- Reduce(
  function(x, y) merge(x, y, by = "passid", all.x = TRUE, sort = TRUE),
  list(passid, raw.feat0, raw.feat1, raw.feat2, raw.feat3, raw.feat4, raw.feat5)
)

5. R 筛选缺失率在90%以上的变量名称

# Row names of data_vars_NA whose NA_percentage exceeds 0.9.
# which() drops NA comparison results; a bare logical subscript would turn
# each NA percentage into an NA entry in the output.
union_allNA <- row.names(data_vars_NA)[which(data_vars_NA$NA_percentage > 0.9)]

6.按照固定顺序排列

# Calls contain_vars() with `bb` and the column/element names of `aa`.
# NOTE(review): contain_vars() is not defined in this snippet and is not a
# base R function; presumably a user-defined helper that arranges `bb` in the
# order of names(aa) - confirm before relying on it.
contain_vars(bb,names(aa))

7. R function

#' Per-group column means keyed by `stat_dt`.
#'
#' @param data A data.frame whose first column is the grouping column
#'   `stat_dt`; every remaining column is averaged within each group.
#' @return A data.frame with one row per distinct non-NA `stat_dt` value
#'   (used as row names, in order of first appearance) and one column per
#'   non-first column of `data`, holding `mean(..., na.rm = TRUE)`.
mean_cal <- function(data) {
  groups <- unique(data$stat_dt)
  # NA cannot be used as a row name, so an NA group is skipped.
  groups <- groups[!is.na(groups)]
  # names(data)[-1] stays a character vector even when only one value column
  # remains; names(data[, -1]) would be NULL in that case.
  value_cols <- names(data)[-1]

  mean_list <- data.frame(matrix(NA, length(groups), length(value_cols)))
  names(mean_list) <- value_cols
  row.names(mean_list) <- groups

  # Index result rows by position: indexing by the raw group value would be
  # treated as a row number whenever stat_dt is numeric.
  for (k in seq_along(groups)) {
    rows <- which(data$stat_dt == groups[k])
    for (j in value_cols) {
      mean_list[k, j] <- mean(data[rows, j], na.rm = TRUE)
    }
  }

  mean_list
}

8. R cal_Ks

#' Compute, plot and report the KS statistic of a binary classifier score.
#'
#' Each score in `pre` is used as a threshold. The true-positive rate and the
#' false-positive rate of the rule `pre >= threshold` are drawn against the
#' sorted thresholds, and the largest gap `tpr - fpr` is the KS value.
#'
#' @param pre Numeric vector of predicted scores.
#' @param label Vector of 0/1 (or logical) labels, same length as `pre`.
#' @return Invisibly, the KS value. As side effects the function draws the
#'   tpr (green) and fpr (red) curves, marks the first threshold that reaches
#'   the KS value with a blue segment, and prints the value with cat().
myKS <- function(pre, label) {
  stopifnot(length(pre) == length(label), length(pre) > 0)

  true <- sum(label)
  false <- length(label) - true
  # Both rates divide by a class count, so each class must be present.
  stopifnot(true > 0, false > 0)

  # Thresholds in ascending order.
  o_pre <- pre[order(pre)]

  # One rate per threshold, computed without growing vectors in a loop.
  tpr <- vapply(
    o_pre,
    function(th) sum((pre >= th) & label) / true,
    numeric(1),
    USE.NAMES = FALSE
  )
  fpr <- vapply(
    o_pre,
    function(th) sum((pre >= th) & (1 - label)) / false,
    numeric(1),
    USE.NAMES = FALSE
  )

  plot(o_pre, tpr, type = "l", col = "green", xlab = "threshold", ylab = "tpr,fpr")
  lines(o_pre, fpr, type = "l", col = "red")

  gap <- tpr - fpr
  KSvalue <- max(gap)
  title(sub = paste("KS value =", KSvalue))

  # which.max() returns the first maximum only; matching with `==` can return
  # several indices on ties and depends on exact floating-point equality.
  cutpoint <- which.max(gap)
  thre <- o_pre[cutpoint]
  lines(c(thre, thre), c(fpr[cutpoint], tpr[cutpoint]), col = "blue")

  cat("KS-value:", KSvalue)
  invisible(KSvalue)
}

开始使用R

你可能感兴趣的:(开始使用R)