R语言中多线程运行程序

R 代码的运行效率不高,因此有时候可以考虑并行运行程序。

#apply系列函数

  • 实际不是并行
apply(X, MARGIN, FUN, ...)

lapply(X, FUN, ...)

sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)

vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)

##lapply()

  • 接受一个向量或者列表作为处理对象,返回结果是与输入等长的列表:
lapply(1:5, function(x) x^2) 
[[1]]
[1] 1
[[2]]
[1] 4
[[3]]
[1] 9
[[4]]
[1] 16
[[5]]
[1] 25

lapply(1:5, function(x) c(x^2,x^3)) 
[[1]]
[1] 1 1
[[2]]
[1] 4 8
[[3]]
[1]  9 27
[[4]]
[1] 16 64
[[5]]
[1]  25 125

##sapply()

sapply(1:5, function(x) x^2) #This output is a vector
[1]  1  4  9 16 25
sapply(1:5, function(x) c(x^2,x^3)) #This outputs a matrix
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    4    9   16   25
[2,]    1    8   27   64  125

注: 当设置simplify = FALSE, USE.NAMES = FALSE时,sapply() 返回结果与lapply()一样

sapply(1:5, function(x) x^2, simplify = FALSE, USE.NAMES = FALSE) 
#Output is same as for lapply()
[[1]]
[1] 1
[[2]]
[1] 4
[[3]]
[1] 9
[[4]]
[1] 16
[[5]]
[1] 25

#parallel包

parallel包可以将本地计算机的核分配给R程序,从而并行运行程序。

parallel的工作原理:

  • 查找系统中的内核数量;
  • 分配一部分核创建集群;
  • 程序并行运行时,只需要添加创建好的集群(作为一个参数);
  • 程序运行结束,关闭集群,释放内存。
# parallel 随 R 一起发布,无需执行 install.packages("parallel"),直接加载即可
library(parallel)
 
#检测系统核的数目
no_cores <- detectCores()
 
#创建集群
clust <- makeCluster(no_cores) 
#lapply()的并行版本是parLapply(),只需要一个额外的集群参数。
parLapply(clust,1:5, function(x) c(x^2,x^3))
[[1]]
[1] 1 1
[[2]]
[1] 4 8
[[3]]
[1]  9 27
[[4]]
[1] 16 64
[[5]]
[1]  25 125
stopCluster(clust)

#sapply()的并行版本是parSapply()
library(parallel)
no_cores <- detectCores()
clust <- makeCluster(no_cores) #This line will take time
base <- 4
clusterExport(clust, "base")
parSapply(clust, 1:5, function(exponent) base^exponent)
[1]    4   16   64  256 1024
stopCluster(clust)

#foreach

  • foreach包需要配合doParallel包使用,通过registerDoParallel()函数注册并行后端,使进程并行。
library(foreach)
library(doParallel)

#registerDoParallel(no_cores)也可以
registerDoParallel(makeCluster(no_cores))
  • foreach()函数需要%dopar%命令并行化程序
#输出向量设置.combine = c
foreach(exponent = 1:5, .combine = c)  %dopar%  base^exponent

[1]    4   16   64  256 1024

#输出矩阵设置.combine = rbind
foreach(exponent = 1:5, .combine = rbind)  %dopar%  base^exponent

         [,1]
result.1    4
result.2   16
result.3   64
result.4  256
result.5 1024


#输出列表设置.combine = list
foreach(exponent = 1:5, .combine = list, .multicombine=TRUE)  %dopar%  base^exponent

[[1]]
[1] 4

[[2]]
[1] 16

[[3]]
[1] 64

[[4]]
[1] 256

[[5]]
[1] 1024

#输出数据框设置.combine = data.frame
foreach(exponent = 1:5, .combine = data.frame)  %dopar%  base^exponent
  result.1 result.2 result.3 result.4 result.5
1        4       16       64      256     1024

#关闭集群
stopImplicitCluster()

#总结

# Recap of the apply-family examples. These calls run sequentially; they are
# the baseline that the cluster-based versions further down are compared with.

lapply(1:5, function(x) x^2) # list of 5 elements, each the square of one input

lapply(1:5, function(x) c(x^2,x^3)) # list of 5 vectors holding square and cube

sapply(1:5, function(x) x^2) # one value per input, simplified to a vector

sapply(1:5, function(x) c(x^2,x^3)) # two values per input, simplified to a matrix

sapply(1:5, function(x) x^2, simplify = FALSE, USE.NAMES = FALSE) # simplification off: same list as lapply()

# parLapply() example: the cluster-based counterpart of lapply().
# The parallel package is distributed with R itself, so attaching it is enough;
# it does not have to be installed separately.
library(parallel)

# Use the detectCores() function to find the number of cores in the system
no_cores <- detectCores()

# Set up the cluster
clust <- makeCluster(no_cores) # starting the worker processes takes a moment

# The parallel version of lapply() is parLapply(); it takes the cluster as an additional first argument.
parLapply(clust,1:5, function(x) c(x^2,x^3))
# Shut the worker processes down once the cluster is no longer needed
stopCluster(clust)

# parSapply() example: the cluster-based counterpart of sapply().
# The parallel package is distributed with R itself, so attaching it is enough.
library(parallel)

# Use the detectCores() function to find the number of cores in the system
no_cores <- detectCores()

# Set up the cluster (starting the worker processes takes a moment)
clust <- makeCluster(no_cores)

# Variable defined in the master session and read by the worker function below
base <- 4
# Workers do not share the master's variables, so base has to be exported
# to every worker before it can be used there
clusterExport(clust, "base")

# Raise base to the powers 1..5, one exponent per task
parSapply(clust, 1:5, function(exponent) base^exponent)

# Packages are attached on the workers with clusterEvalQ(). This must happen
# while the cluster is still running, i.e. before stopCluster(); the package
# (randomForest here) has to be installed for the call to succeed.
clusterEvalQ(clust, library(randomForest))

# Shut the workers down only after every call that uses the cluster is done
stopCluster(clust)

# foreach examples: the same computation combined into different result
# shapes via the .combine argument. Relies on no_cores and base being
# defined earlier in the script.
library(foreach)
library(doParallel)

# Keep a handle on the explicitly created cluster: stopImplicitCluster() only
# stops clusters that doParallel created itself, so this one has to be shut
# down with stopCluster() once the examples are done.
foreach_cluster <- makeCluster(no_cores)
registerDoParallel(foreach_cluster)

# Vector output
foreach(exponent = 1:5, .combine = c) %dopar% base^exponent

# Matrix output
foreach(exponent = 1:5, .combine = rbind) %dopar% base^exponent

# List output
foreach(exponent = 1:5, .combine = list, .multicombine = TRUE) %dopar% base^exponent

# Data frame output
foreach(exponent = 1:5, .combine = data.frame) %dopar% base^exponent

# Release the workers of the explicitly created cluster
stopCluster(foreach_cluster)

# This also works: given a core count, doParallel manages an implicit cluster
registerDoParallel(no_cores)

# Stops the implicit cluster registered on the previous line
stopImplicitCluster()

# Using the .export argument of foreach()
registerDoParallel(no_cores)

base <- 2 # defined at top level, outside the function that calls foreach()

# Runs base^exponent for exponents 1..5 in parallel and combines the results
# into a vector with c(). The exponent parameter is not read: the foreach()
# iteration variable of the same name supplies the values, and the function
# is called without arguments below.
sample_func <- function (exponent) {
  # .export names variables to be made available to the workers; base is
  # listed because it is defined outside this function
  foreach(exponent = 1:5, .combine = c,.export = "base")  %dopar%  base^exponent
}
sample_func()
# Stop the implicit cluster created by registerDoParallel(no_cores)
stopImplicitCluster()

# Using the .packages argument of foreach()
library(dplyr)
registerDoParallel(no_cores)
# .packages names the packages to load on each worker so that select() and
# %>% are available there. Each iteration takes row i of iris, drops the
# Species column and sums the remaining values; c() combines the five sums.
foreach(i = 1:5, .combine=c, .packages="dplyr") %dopar% {
  iris[i, ] %>% select(-Species) %>% sum
}
# Stop the implicit cluster created by registerDoParallel(no_cores)
stopImplicitCluster()

# Create a cluster of forked workers instead of the default worker type.
# NOTE(review): FORK clusters are documented as available on Unix-alikes only,
# and nothing visible here calls stopCluster(clust) afterwards — confirm the
# cluster is released once it is no longer needed.
clust<-makeCluster(no_cores, type="FORK")

# Debugging aid 1: send everything the workers print to a log file.
# outfile redirects worker output, which is otherwise not shown in the
# master session. The cluster handle is kept so the workers can be stopped.
debug_cluster <- makeCluster(no_cores, outfile = "debug_file.txt")
registerDoParallel(debug_cluster)
foreach(x = list(1:5, "a")) %dopar% print(x)
stopCluster(debug_cluster)

# Debugging aid 2: write one file per task, named after the task's input,
# so the output of each iteration can be inspected separately.
debug_cluster <- makeCluster(no_cores, outfile = "debug_file.txt")
registerDoParallel(debug_cluster)
foreach(x = list(1, 2, 3, 4, 5, "a")) %dopar%
  cat(dput(x), file = paste0("debug_file_", x, ".txt"))
stopCluster(debug_cluster)

# Debugging aid 3: catch errors inside each task so that one failing input
# does not abort the whole foreach() call. The cluster handle is kept so the
# workers can be stopped afterwards.
trycatch_cluster <- makeCluster(no_cores)
registerDoParallel(trycatch_cluster)
foreach(x = list(1, 2, "a")) %dopar%
{
  tryCatch({
    c(1/x) # should give an error when x is "a"
  }, error = function(e) {
    # The failing task returns a description of the error instead of a value
    return(paste0("Error occurred for '", x, "'",
                  " The error is '", e, "'"))
  })
}
stopCluster(trycatch_cluster)

# Memory hygiene: objects that are no longer needed can be removed with rm()
base <- 4          # create a variable base whose value is 4
base_copy <- base  # make a copy of the variable
rm(base)           # base can now be removed; base_copy keeps the value

# Remove every object that ls() lists in the current environment in one call.
# NOTE(review): this discards all variables created above; run it only when
# none of them are needed any more.
rm(list=ls())

#原文

R bloggers Implementing Parallel Processing in R

你可能感兴趣的:(R语言中多线程运行程序)