开始使用R

如何生成一组随机数用于测试?sample()函数:sample(x, size, replace, prob)

说明:x为待抽样的向量,size为抽取的个数,replace表示是否有放回抽样;prob表示按照给定的概率抽取,是一个与x等长的概率权重向量

例子:

# Draw 15 integers from 1..100 with replacement.
t1 <- sample(x = 1:100, size = 15, replace = TRUE)

set.seed()函数

说明:设置随机种子。使用相同的随机种子时,每次生成的随机数序列是相同的,目的是方便以后他人复现结果、进行测试

runif()函数:随机生成均匀分布的小数

runif(n,min = 0, max = 1)

例子

# Seed the generator so the draw below is repeatable.
set.seed(seed = 1234)

# 15 uniform draws on [0, 1), scaled to 0..99 and truncated to integers.
as.integer(100 * runif(n = 15))

R 常用数据包:

1.数据并行处理

# Load with library() rather than require(): require() only returns FALSE
# when a package is missing, so the failure would surface later and obscurely.
library(doMC)
library(glmnet) # provides cv.glmnet()

# Register a 2-core backend for cv.glmnet(parallel = TRUE) to use.
registerDoMC(cores = 2)

# NOTE(review): `x` and `y` are not created in this snippet; presumably the
# predictor matrix and the response. They must exist before this call.
cv.glmnet(x, y, parallel = TRUE)

说明:lambda.1se: the value of lambda which gives the most regularized model such that the cross-validated error is within one standard error of the minimum.

L1正则化可以产生稀疏权值矩阵,即产生一个稀疏模型,可以用于特征选择

L2正则化可以防止模型过拟合(overfitting);一定程度上,L1也可以防止过拟合

2. VLM package (otvPlots)

使用说明:https://github.com/capitalone/otvPlots/blob/master/README.md

library(otvPlots)

# Read the comma-separated input file; the listed sentinel values become NA.
# The argument is spelled out as `na.strings` (the original relied on partial
# matching of `na.string`), and the no-op NULL entry is dropped.
# NOTE(review): "misssing" (three s) looks like a typo, so the correctly
# spelled "missing" is accepted too. Confirm which spelling the data uses.
data_vars <- read.table(
  "mergeapplyUsr_filter.matrix+apptm",
  header = TRUE,
  sep = ",",
  na.strings = c("-9999", "misssing", "missing")
)

## Prepare data and labels
applyData <- PrepData(
  data_vars,
  dateNm = "date",
  dateGp = "weeks",
  dateGpBp = "quarters"
)

# bankLabels <- PrepLabels(bankLabels)

## Generate a PDF file of vlm plots, and CSV files of summary statistics
vlm(
  dataFl = applyData,
  dateNm = "date",
  dateGp = "weeks",
  dateGpBp = "quarters",
  outFl = "test"
)

3.ctree

library(party)

# Inspect the structure and the first rows of the modelling data.
# NOTE(review): `df` is not created in this snippet; it must already exist.
str(df)
head(df)

# Fix the RNG seed before fitting.
set.seed(12345)

# Tree of dpdTag on all other columns of df, with maxdepth set to 3.
modelT1 <- ctree(
  formula = dpdTag ~ .,
  data = df,
  controls = ctree_control(maxdepth = 3)
)

plot(modelT1, type = "simple")


4.R 多个文件merge

# Fold the feature tables into one: starting from `passid`, each table is
# merged in turn on the "passid" key, keeping every row of the left side
# (all.x = TRUE). `TRUE` replaces `T`, which is an ordinary variable that
# can be reassigned.
all.feat <- Reduce(
  function(x, y) merge(x, y, by = "passid", all.x = TRUE, sort = TRUE),
  list(passid, raw.feat0, raw.feat1, raw.feat2, raw.feat3, raw.feat4, raw.feat5)
)

5. R 筛选缺失率在90%以上的变量名称

# Row names of data_vars_NA whose NA_percentage is greater than 0.9.
union_allNA <- rownames(data_vars_NA)[data_vars_NA$NA_percentage > 0.9]

6.按照固定顺序排列

# NOTE(review): `contain_vars` is not defined in the visible part of this file.
# Presumably it arranges `bb` according to the column-name order of `aa`;
# confirm against its definition before relying on this.
contain_vars(bb,names(aa))

7. R function(自定义函数:按 stat_dt 分组计算各列均值)

#' Per-group column means keyed by `stat_dt`
#'
#' For every distinct value of `data$stat_dt`, computes the mean (ignoring
#' `NA`) of every column of `data` except the first one.
#'
#' Fixes relative to the previous version:
#' * `inunique(...)` was a syntax error (missing space after `in`), and
#'   `order = FALSE` is not an argument of `unique()`.
#' * Group keys are compared as character strings and rows are filled by
#'   position, so a numeric `stat_dt` (e.g. 20200101) is no longer used as a
#'   row index.
#' * Works when there is only one value column (`data[, -1]` used to collapse
#'   to an unnamed vector).
#'
#' @param data A data frame with a `stat_dt` column. As before, the first
#'   column is excluded from the averages (it is expected to be `stat_dt`).
#' @return A data frame with one row per distinct non-missing `stat_dt` value
#'   (as row names, in order of first appearance) and one column per value
#'   column of `data`.
mean_cal <- function(data) {
  value_cols <- names(data)[-1]
  keys <- as.character(data$stat_dt)
  # Row names cannot be NA, so rows with a missing key are left out.
  groups <- unique(keys[!is.na(keys)])

  mean_list <- data.frame(
    matrix(NA_real_, nrow = length(groups), ncol = length(value_cols))
  )
  names(mean_list) <- value_cols
  row.names(mean_list) <- groups

  for (r in seq_along(groups)) {
    rows <- which(keys == groups[r])
    for (j in value_cols) {
      mean_list[r, j] <- mean(data[[j]][rows], na.rm = TRUE)
    }
  }

  mean_list
}

8. R 计算KS值(cal_KS)

#' Kolmogorov-Smirnov (KS) statistic with a TPR/FPR-vs-threshold plot
#'
#' Every score in `pre` is used as a threshold `t`; a case is predicted
#' positive when `pre >= t`. The KS value is the largest gap `TPR - FPR` over
#' all thresholds. The function draws TPR (green) and FPR (red) against the
#' threshold, marks the KS cutpoint with a blue vertical segment, and prints
#' the KS value.
#'
#' Fixes relative to the previous version:
#' * `co  l = "red"` was a syntax error; it is now `col = "red"`.
#' * When several thresholds tie for the maximum gap, only the first one is
#'   marked (the old `which(... == KSvalue)` returned all of them and drew
#'   a zig-zag instead of one segment).
#' * TPR/FPR are no longer grown element by element inside a loop.
#' * Inputs are validated, and the KS value is returned invisibly.
#'
#' @param pre Numeric vector of predicted scores, without `NA`.
#' @param label Vector of true labels, same length as `pre`; 1/`TRUE` for
#'   positives and 0/`FALSE` for negatives. Both classes must be present.
#' @return The KS value, invisibly.
myKS <- function(pre, label) {
  stopifnot(
    length(pre) == length(label),
    length(pre) > 0,
    !anyNA(pre),
    !anyNA(label)
  )

  is_pos <- as.logical(label)
  n_pos <- sum(is_pos)
  n_neg <- length(is_pos) - n_pos
  if (n_pos == 0 || n_neg == 0) {
    stop("`label` must contain both positive and negative cases.",
         call. = FALSE)
  }

  # Thresholds in increasing order.
  o_pre <- pre[order(pre)]

  tpr <- vapply(
    o_pre, function(thr) sum(pre >= thr & is_pos), numeric(1)
  ) / n_pos
  fpr <- vapply(
    o_pre, function(thr) sum(pre >= thr & !is_pos), numeric(1)
  ) / n_neg

  plot(o_pre, tpr, type = "l", col = "green",
       xlab = "threshold", ylab = "tpr,fpr")
  lines(o_pre, fpr, type = "l", col = "red")

  gap <- tpr - fpr
  cutpoint <- which.max(gap) # first threshold attaining the maximum gap
  KSvalue <- gap[cutpoint]
  title(sub = paste("KS value =", KSvalue))

  thre <- o_pre[cutpoint]
  lines(c(thre, thre), c(fpr[cutpoint], tpr[cutpoint]), col = "blue")

  cat("KS-value: ", KSvalue, "\n", sep = "")
  invisible(KSvalue)
}

你可能感兴趣的:(开始使用R)