R语言并行计算
2020-02-03 本文已影响0人
kittybaby
R包
parallel
doparallel
foreach
parallel包
1.检测本机的核数
# Attach the parallel package, which provides detectCores()
library(parallel)
# Query the number of CPU cores and keep the answer in no_of_cores
no_of_cores <- detectCores()
# Show the detected core count
print(no_of_cores)
2.parApply
3.parSapply
变量作用域
在 Mac / Linux 上,您可以选择使用 makeCluster(no_of_cores, type = "FORK"),它会自动包含所有环境变量(详见下文)。在 Windows 上,您必须使用并行套接字集群(PSOCK),它启动时只加载了基础包(请注意,PSOCK 在所有系统上都是默认类型)。因此,您应该始终明确指定并行函数所需的变量和库,例如,以下代码会失败:
# Failing example: the worker function refers to `base`, which is assigned
# only in the master session and is never exported to the cluster nodes.
# The error printed below is the localized form of "object 'base' not found".
> cl<-makeCluster(4)
> base <- 2
>
> parLapply(cl,
+ 2:4,
+ function(exponent)
+ base^exponent)
Error in checkForRemoteErrors(val) :
3 nodes produced errors; first error: 找不到对象'base'
>
> stopCluster(cl)
# Working example: clusterExport() sends `base` to every node before
# parLapply() runs, so each worker can evaluate base^exponent (2^2, 2^3, 2^4).
# NOTE(review): no stopCluster(cl) appears in this snippet -- confirm the
# cluster is released before another one is created.
> cl<-makeCluster(4)
>
> base <- 2
> clusterExport(cl, "base")
> parLapply(cl,
+ 2:4,
+ function(exponent)
+ base^exponent)
[[1]]
[1] 4
[[2]]
[1] 8
[[3]]
[1] 16
您需要使用 clusterExport(cl, "base") 才能使该函数看到 base 变量。如果您正在使用某些特殊的软件包,同样需要通过 clusterEvalQ 来加载它们。例如我经常使用 rms 包,因此会使用 clusterEvalQ(cl, library(rms))。请注意,调用 clusterExport 之后对变量所做的任何更改都不会同步到各节点:
# Shows that workers keep the value that was exported: `base` is changed to 4
# on the master after clusterExport(), yet the results below are still powers
# of 2.
# Relies on `base` (= 2) and `no_of_cores` (from detectCores()) already being
# defined earlier in the session.
> cl<-makeCluster(no_of_cores)
> clusterExport(cl, "base")
> base <- 4
> # Run
> parLapply(cl,
+ 2:4,
+ function(exponent)
+ base^exponent)
[[1]]
[1] 4
[[2]]
[1] 8
[[3]]
[1] 16
>
> # Finish
> stopCluster(cl)
方法一
library(parallel)

# Vector that every task adds to its own input value.
y <- 1:10

# Serial baseline: `y` is found in the calling session.
sapply(1:5, function(x) x + y)

# Start two worker processes; they do not see the master's `y` automatically.
cl <- makeCluster(2)

# Option A: add y to the function definition and to the parSapply call,
# so it is passed to the workers as an extra argument.
parSapply(cl, 1:5, function(x, y) x + y, y)

# Option B: export y to the global environment of each node,
# then call the original code unchanged.
clusterExport(cl, "y")
parSapply(cl, 1:5, function(x) x + y)

# Release the worker processes; without this they stay alive after the
# example finishes.
stopCluster(cl)
方法二
library(parallel)
# Adds `y` to each of 1:5 on the cluster `cl`.
# `y` is an argument of fun(), so it sits in the environment of the worker
# function defined here and travels to the nodes together with that function;
# no clusterExport() call is needed.
fun <- function(cl, y) {
  add_y <- function(x) x + y
  parSapply(cl, 1:5, add_y)
}
cl <- makeCluster(2)
fun(cl, 1:10)
stopCluster(cl)
4.mclapply(Windows 上不支持多核)
# Squares a single number; applied to every element of `values` below.
workerFunc <- function(n) {
  n^2
}
values <- 1:100
library(parallel)
## Number of workers (R processes) to use:
numWorkers <- 8
## mclapply() relies on forking, which Windows does not provide. There a call
## with mc.cores > 1 stops with
##   Error in mclapply(values, workerFunc, mc.cores = numWorkers) :
##     'mc.cores' > 1 is not supported on Windows
## so fall back to a single core on Windows instead of aborting.
mcCores <- if (.Platform$OS.type == "windows") 1L else numWorkers
## Parallel calculation (mclapply):
res <- mclapply(values, workerFunc, mc.cores = mcCores)
print(unlist(res))
5.parLapply
# Squares a single number; applied to every element of `values` below.
workerFunc <- function(n) {
  n^2
}
values <- 1:100
library(parallel)
## Number of workers (R processes) to use: at most 8, but never more than the
## core count detectCores() reports (an NA result is ignored via na.rm = TRUE).
numWorkers <- max(1L, min(8L, detectCores(), na.rm = TRUE))
## Set up the 'cluster'
cl <- makeCluster(numWorkers, type = "PSOCK")
## Parallel calculation (parLapply). The `finally` clause shuts the cluster
## down even when a worker signals an error, so no R processes are left behind.
res <- tryCatch(
  parLapply(cl, values, workerFunc),
  finally = stopCluster(cl)
)
print(unlist(res))
foreach包
# foreach/%dopar% examples showing how .combine shapes the result:
#   c     -> plain vector
#   rbind -> one row per iteration
#   list with .multicombine = TRUE -> flat list
#   list alone -> nested list, because results are combined pairwise
# Relies on `base` (= 4, hence 16 64 256) and `no_of_cores` already being
# defined earlier in the session.
> library(foreach)
> library(doParallel)
载入需要的程辑包:iterators
>
> cl<-makeCluster(no_of_cores)
> registerDoParallel(cl)
> foreach(exponent = 2:4,
+ .combine = c) %dopar%
+ base^exponent
[1] 16 64 256
> foreach(exponent = 2:4,
+ .combine = rbind) %dopar%
+ base^exponent
[,1]
result.1 16
result.2 64
result.3 256
> foreach(exponent = 2:4,
+ .combine = list,
+ .multicombine = TRUE) %dopar%
+ base^exponent
[[1]]
[1] 16
[[2]]
[1] 64
[[3]]
[1] 256
> foreach(exponent = 2:4,
+ .combine = list) %dopar%
+ base^exponent
[[1]]
[[1]][[1]]
[1] 16
[[1]][[2]]
[1] 64
[[2]]
[1] 256
> # The cluster was created explicitly with makeCluster(), so it is released
> # with stopCluster(); stopImplicitCluster() only applies to clusters that
> # registerDoParallel() creates on its own.
> stopCluster(cl)
变量作用域
默认情况下,同一本地环境中的变量可以直接使用:
# `base` is assigned in the same environment that calls foreach(), so the
# loop body can use it on the workers without listing it in .export.
base <- 2
cl <- makeCluster(2)
registerDoParallel(cl)
foreach(exponent = 2:4, .combine = c) %dopar% {
  base^exponent
}
stopCluster(cl)
# Failing example: the foreach() loop now runs inside test(), whose body
# refers to `base` without assigning it or listing it in .export. The call
# ends with the localized error "object 'base' not found".
# NOTE(review): no registerDoParallel(cl) call appears in this snippet, and the
# `exponent` argument of test() is never supplied (foreach defines its own
# `exponent`) -- confirm against the session this was recorded in.
> cl <- makeCluster(2)
> test <- function (exponent) {
+ foreach(exponent = 2:4,
+ .combine = c) %dopar%
+ base^exponent
+ }
> test()
Show Traceback
Rerun with Debug
Error in base^exponent : task 1 failed - "找不到对象'base'"
# Same loop with .export = "base": the variable is named explicitly for export
# to the workers, so test() succeeds. `base` was reassigned to 4 before test()
# is called, hence the result 4^2, 4^3, 4^4.
> base <- 2
> cl<-makeCluster(2)
> registerDoParallel(cl)
>
> base <- 4
> test <- function (exponent) {
+ foreach(exponent = 2:4,
+ .combine = c,
+ .export = "base") %dopar%
+ base^exponent
+ }
> test()
[1] 16 64 256
>
> stopCluster(cl)
同样,您可以使用 .packages 选项加载软件包,例如 .packages = c("rms", "mice")。我强烈建议您始终显式导出所需的变量,因为这样可以减少将代码封装进函数时出现的问题。
# Timing comparison: the same 10000 glm fits on rows resampled with
# replacement are run once with %dopar% (ptime) and once with %do% (stime).
# Element [3] of system.time() is the elapsed time printed below.
# `x` keeps columns 1 and 5 of the iris rows whose 5th column is not "setosa".
# NOTE(review): icount() is presumably provided by the iterators package that
# is attached together with foreach -- confirm.
cl <- makeCluster(4)
> registerDoParallel(cl)
> x <- iris[which(iris[,5] != "setosa"), c(1,5)]
> trials <- 10000
> ptime <- system.time({
+ r <- foreach(icount(trials), .combine=cbind) %dopar% {
+ ind <- sample(100, 100, replace=TRUE)
+ result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
+ coefficients(result1)
+ }
+ })[3]
> ptime
elapsed
20.01
> stime <- system.time({
+ r <- foreach(icount(trials), .combine=cbind) %do% {
+ ind <- sample(100, 100, replace=TRUE)
+ result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
+ coefficients(result1)
+ }
+ })[3]
> stime
elapsed
39.17
stopCluster(cl)
参考资料
http://gforge.se/2015/02/how-to-go-parallel-in-r-basics-tips/
https://stackoverflow.com/questions/24040280/parallel-computation-of-multiple-imputation-by-using-mice-r-package/27087791#27087791