R 代码的运行效率不高,因此有时候可以考虑并行运行程序。
#apply系列函数
- 实际不是并行
apply(X, MARGIN, FUN, ...)
lapply(X, FUN, ...)
sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
##lapply()
- 接受一个向量或者列表作为处理对象,返回结果是与输入等长的列表:
lapply(1:5, function(x) x^2)
[[1]]
[1] 1
[[2]]
[1] 4
[[3]]
[1] 9
[[4]]
[1] 16
[[5]]
[1] 25
lapply(1:5, function(x) c(x^2,x^3))
[[1]]
[1] 1 1
[[2]]
[1] 4 8
[[3]]
[1] 9 27
[[4]]
[1] 16 64
[[5]]
[1] 25 125
##sapply()
sapply(1:5, function(x) x^2) #This output is a vector
[1] 1 4 9 16 25
sapply(1:5, function(x) c(x^2,x^3)) #This outputs a matrix
[,1] [,2] [,3] [,4] [,5]
[1,] 1 4 9 16 25
[2,] 1 8 27 64 125
注: 当设置simplify = FALSE, USE.NAMES = FALSE时,sapply() 返回结果与lapply()一样
sapply(1:5, function(x) x^2, simplify = FALSE, USE.NAMES = FALSE)
#Output is same as for lapply()
[[1]]
[1] 1
[[2]]
[1] 4
[[3]]
[1] 9
[[4]]
[1] 16
[[5]]
[1] 25
#parallel包
parallel包可以将本地计算机的核分配给R程序,从而并行运行程序。
parallel的工作原理:
- 查找系统中的内核数量;
- 分配一部分核创建集群;
- 程序并行运行时,只需要添加创建好的集群(作为一个参数);
- 程序运行结束,关闭集群,释放内存。
# parallel ships with R itself (since R 2.14.0), so it only has to be
# attached; it cannot be installed with install.packages().
library(parallel)
# Detect the number of cores on this machine. detectCores() may return NA
# when the count cannot be determined, so fall back to a single worker.
no_cores <- detectCores()
if (is.na(no_cores)) {
  no_cores <- 1L
}
# Create a cluster with `no_cores` workers.
clust <- makeCluster(no_cores)
#lapply()的并行版本是parLapply(),只需要一个额外的集群参数。
parLapply(clust,1:5, function(x) c(x^2,x^3))
[[1]]
[1] 1 1
[[2]]
[1] 4 8
[[3]]
[1] 9 27
[[4]]
[1] 16 64
[[5]]
[1] 25 125
stopCluster(clust)
#sapply()的并行版本是parSapply()
library(parallel)
no_cores <- detectCores()
clust <- makeCluster(no_cores) #This line will take time
base <- 4
clusterExport(clust, "base")
parSapply(clust, 1:5, function(exponent) base^exponent)
[1] 4 16 64 256 1024
stopCluster(clust)
#foreach包
- foreach包需要调用doParallel 包,使用registerDoParallel()函数使进程并行。
library(foreach)
library(doParallel)
# Register the backend with a core count so that doParallel manages the
# workers itself; the stopImplicitCluster() call at the end of this section
# then releases them. A cluster created inline, as in
# registerDoParallel(makeCluster(no_cores)), leaves no handle to stop it.
# To use an explicit cluster, store it first (cl <- makeCluster(no_cores)),
# register it with registerDoParallel(cl), and finish with stopCluster(cl).
registerDoParallel(no_cores)
- foreach()函数需要%dopar%命令并行化程序
#输出向量设置.combine = c
foreach(exponent = 1:5, .combine = c) %dopar% base^exponent
[1] 4 16 64 256 1024
#输出矩阵设置.combine = rbind
foreach(exponent = 1:5, .combine = rbind) %dopar% base^exponent
[,1]
result.1 4
result.2 16
result.3 64
result.4 256
result.5 1024
#输出列表设置.combine = list
foreach(exponent = 1:5, .combine = list, .multicombine=TRUE) %dopar% base^exponent
[[1]]
[1] 4
[[2]]
[1] 16
[[3]]
[1] 64
[[4]]
[1] 256
[[5]]
[1] 1024
#输出数据框设置.combine = data.frame
foreach(exponent = 1:5, .combine = data.frame) %dopar% base^exponent
result.1 result.2 result.3 result.4 result.5
1 4 16 64 256 1024
#关闭集群
stopImplicitCluster()
#总结
# lapply() always returns a list: here, the squares of 1..5.
lapply(seq_len(5), function(v) v^2)
# Each list element may itself be a vector: c(square, cube) per input.
lapply(seq_len(5), function(v) c(v^2, v^3))
# sapply() simplifies scalar results into a plain vector.
sapply(seq_len(5), function(v) v^2)
# Length-2 results are simplified into a matrix with one column per input.
sapply(seq_len(5), function(v) c(v^2, v^3))
# With simplification and naming switched off, the result is the same list
# that lapply() returns.
sapply(seq_len(5), function(v) v^2, simplify = FALSE, USE.NAMES = FALSE)
# Attach the parallel package. It ships with R itself (since R 2.14.0), so
# there is nothing to install.
library(parallel)
# Ask how many cores the machine has. detectCores() may return NA when the
# count cannot be determined, so fall back to a single worker in that case.
no_cores <- detectCores()
if (is.na(no_cores)) {
  no_cores <- 1L
}
# Start the workers (this can take a moment).
clust <- makeCluster(no_cores)
# parLapply() is the parallel counterpart of lapply(); it takes the cluster
# as an additional first argument.
parLapply(clust, 1:5, function(x) c(x^2, x^3))
# Shut the workers down.
stopCluster(clust)
# Attach the parallel package (it ships with R itself; nothing to install).
library(parallel)
# Number of cores; detectCores() may return NA, so fall back to one worker.
no_cores <- detectCores()
if (is.na(no_cores)) {
  no_cores <- 1L
}
# Start the workers (this can take a moment).
clust <- makeCluster(no_cores)
# A variable defined in the master session only.
base <- 4
# Copy `base` to every worker so the function below can see it there.
clusterExport(clust, "base")
# Packages are attached on the workers with clusterEvalQ(). This has to
# happen while the cluster is still running, i.e. before stopCluster().
# The call is skipped when randomForest is not installed.
if (requireNamespace("randomForest", quietly = TRUE)) {
  clusterEvalQ(clust, library(randomForest))
}
# parSapply() is the parallel counterpart of sapply().
parSapply(clust, 1:5, function(exponent) base^exponent)
# Shut the workers down only after every cluster call has finished.
stopCluster(clust)
library(foreach)
library(doParallel)
# Keep a handle on the cluster so that it can be shut down afterwards; a
# cluster created inline inside registerDoParallel() cannot be stopped.
clust <- makeCluster(no_cores)
registerDoParallel(clust)
# Vector output: .combine = c concatenates the per-iteration results.
foreach(exponent = 1:5, .combine = c) %dopar% base^exponent
# Matrix output: .combine = rbind stacks the results as rows.
foreach(exponent = 1:5, .combine = rbind) %dopar% base^exponent
# List output: .multicombine = TRUE passes all results to list() at once.
foreach(exponent = 1:5, .combine = list, .multicombine = TRUE) %dopar% base^exponent
# Data frame output: .combine = data.frame binds the results as columns.
foreach(exponent = 1:5, .combine = data.frame) %dopar% base^exponent
# stopImplicitCluster() does not stop an explicitly created cluster, so
# release it directly.
stopCluster(clust)
# Alternative registration: pass a core count and let doParallel manage the
# workers itself; stopImplicitCluster() releases them.
registerDoParallel(no_cores)
stopImplicitCluster()
# Demonstrates the .export argument of foreach().
registerDoParallel(no_cores)
# `base` is defined here, outside the function that calls foreach().
base <- 2
# Returns base^1 ... base^5 as a vector, computed with %dopar%.
# The `exponent` parameter is never read: the exponents come from the
# foreach() iteration variable (named `k` here).
sample_func <- function(exponent) {
  # .export names the variables to make available inside the loop body.
  powers <- foreach(k = 1:5, .combine = c, .export = "base") %dopar% base^k
  powers
}
sample_func()
stopImplicitCluster()
# Demonstrates the .packages argument of foreach().
library(dplyr)
registerDoParallel(no_cores)
# .packages names the packages the loop body needs (here dplyr, for select()).
foreach(i = 1:5, .combine = c, .packages = "dplyr") %dopar% {
  # Sum all columns of row i of iris except Species.
  numeric_part <- select(iris[i, ], -Species)
  sum(numeric_part)
}
stopImplicitCluster()
# type = "FORK" creates the workers by forking the current R process, which
# is not available on Windows, so the demonstration is skipped there. The
# cluster is not used any further, so it is released straight away.
if (.Platform$OS.type != "windows") {
  clust <- makeCluster(no_cores, type = "FORK")
  stopCluster(clust)
}
# Debugging aid 1: outfile redirects what the workers print to a log file.
# The cluster handle is kept so the workers can be stopped afterwards.
clust <- makeCluster(no_cores, outfile = "debug_file.txt")
registerDoParallel(clust)
foreach(x = list(1:5, "a")) %dopar% print(x)
stopCluster(clust)
# Debugging aid 2: each task writes its input to a file of its own, named
# after the input value.
clust <- makeCluster(no_cores, outfile = "debug_file.txt")
registerDoParallel(clust)
foreach(x = list(1, 2, 3, 4, 5, "a")) %dopar%
  cat(dput(x), file = paste0("debug_file_", x, ".txt"))
stopCluster(clust)
# Register an explicitly created cluster, keeping the handle so that the
# workers can be shut down once the loop has finished.
clust <- makeCluster(no_cores)
registerDoParallel(clust)
# Catch errors per task so that one bad input yields a message for that
# element instead of aborting the whole loop.
foreach(x = list(1, 2, "a")) %dopar% {
  tryCatch(
    {
      1 / x # Signals an error when x is "a"
    },
    error = function(e) {
      paste0(
        "Error occurred for '", x, "'",
        " The error is '", e, "'"
      )
    }
  )
}
stopCluster(clust)
# Memory housekeeping: create a variable, copy it, then drop the original.
base <- 4
base_copy <- base
rm(base) # the value is still available through base_copy
# Finally remove every object from the workspace.
rm(list = ls())
#原文
R bloggers Implementing Parallel Processing in R