R语言并行计算

2020-02-03 本文已影响0人 kittybaby

R包

parallel
doParallel
foreach

parallel包

1.检测本机的核数

# Load the parallel package
library(parallel)

# Store the number of cores in the object no_of_cores.
# detectCores() returns NA when the core count cannot be determined, so fall
# back to a single core to keep the value usable as a worker count.
no_of_cores <- detectCores()
if (is.na(no_of_cores)) {
  no_of_cores <- 1L
}

# Print no_of_cores
print(no_of_cores)

2.parApply

3.parSapply

变量作用域
在 Mac / Linux 上，您可以选择使用 makeCluster(no_of_cores, type = "FORK")，它会自动包含所有环境变量（详见下文）。在 Windows 上，您必须使用并行套接字集群（PSOCK），它启动时只加载了基础包（请注意，PSOCK 在所有系统上都是默认值）。因此，您应该始终指明并行函数需要哪些变量和库，例如下面的代码会失败：

> cl<-makeCluster(4)
> base <- 2
>  
> parLapply(cl, 
+           2:4, 
+           function(exponent) 
+             base^exponent)
Error in checkForRemoteErrors(val) : 
  3 nodes produced errors; first error: 找不到对象'base'
>  
> stopCluster(cl)

> cl<-makeCluster(4)
>  
> base <- 2
> clusterExport(cl, "base")
> parLapply(cl, 
+           2:4, 
+           function(exponent) 
+             base^exponent)
[[1]]
[1] 4

[[2]]
[1] 8

[[3]]
[1] 16

您需要使用 clusterExport(cl, "base") 才能使该函数看到变量 base。如果您用到了某些特殊的软件包，同样需要通过 clusterEvalQ 来加载它们。我经常使用 rms 包，因此我使用 clusterEvalQ(cl, library(rms))。请注意，调用 clusterExport 之后对变量所做的任何更改都将被忽略：

> cl<-makeCluster(no_of_cores)
> clusterExport(cl, "base")
> base <- 4
> # Run
> parLapply(cl, 
+           2:4, 
+           function(exponent) 
+             base^exponent)
[[1]]
[1] 4

[[2]]
[1] 8

[[3]]
[1] 16

>  
> # Finish
> stopCluster(cl)

方法一

# Serial baseline: for every x in 1:5, add x to the vector y that the
# anonymous function finds in the enclosing (global) environment.
y <- 1:10
sapply(1:5, function(x) x + y)

library(parallel)
cl <- makeCluster(2)
y <- 1:10

# Option A: add y to the function definition and to the parSapply call, so it
# is shipped to the workers as an explicit argument.
parSapply(cl, 1:5, function(x, y) x + y, y)

# Option B: export y to the global environment of each node,
# then call the original code unchanged.
clusterExport(cl, "y")
parSapply(cl, 1:5, function(x) x + y)

# Shut the workers down; without this the worker R processes stay alive.
stopCluster(cl)

方法二

library(parallel)

# Add each of 1:5 to the vector `y` on the workers of cluster `cl`.
#
# cl: a cluster object as created by makeCluster().
# y:  the vector added to every element of 1:5.
#
# The worker function is defined inside fun(), so `y` lives in its enclosing
# environment and no clusterExport() call is made here.
fun <- function(cl, y) {
  add_y <- function(x) x + y
  parSapply(cl = cl, X = 1:5, FUN = add_y)
}

cl <- makeCluster(spec = 2)
fun(cl, 1:10)
stopCluster(cl = cl)

4.mclapply(Windows 上不能使用)

library(parallel)

# Square a single number.
workerFunc <- function(n) n^2

# Inputs to process
values <- 1:100

# Number of workers (R processes) to use
numWorkers <- 8

# Parallel calculation with mclapply: one list element per input value
res <- mclapply(X = values, FUN = workerFunc, mc.cores = numWorkers)
print(unlist(res))
# On Windows this call fails with:
# Error in mclapply(values, workerFunc, mc.cores = numWorkers) :
#   'mc.cores' > 1 is not supported on Windows

5.parLapply

library(parallel)

# Square a single number.
workerFunc <- function(n) n^2

# Inputs to process
values <- 1:100

# Number of workers (R processes) to use
numWorkers <- 8

# Set up the PSOCK cluster
cl <- makeCluster(spec = numWorkers, type = "PSOCK")

# Parallel calculation with parLapply: one list element per input value
res <- parLapply(cl = cl, X = values, fun = workerFunc)

# Shut down the cluster before reporting the result
stopCluster(cl = cl)
print(unlist(res))

foreach包

> library(foreach)
> library(doParallel)
载入需要的程辑包：iterators
>  
> cl<-makeCluster(no_of_cores)
> registerDoParallel(cl)
> foreach(exponent = 2:4, 
+         .combine = c)  %dopar%  
+   base^exponent
[1]  16  64 256


> foreach(exponent = 2:4, 
+         .combine = rbind)  %dopar%  
+   base^exponent
         [,1]
result.1   16
result.2   64
result.3  256


> foreach(exponent = 2:4, 
+         .combine = list,
+         .multicombine = TRUE)  %dopar%  
+   base^exponent
[[1]]
[1] 16

[[2]]
[1] 64

[[3]]
[1] 256


> foreach(exponent = 2:4, 
+         .combine = list)  %dopar%  
+   base^exponent
[[1]]
[[1]][[1]]
[1] 16

[[1]][[2]]
[1] 64


[[2]]
[1] 256
#stopImplicitCluster()

变量的作用域
默认情况下，同一局部环境中的变量可以直接使用：

# `base` is defined in the same environment that calls foreach(), and the
# loop body below uses it without any explicit export.
base <- 2

# Start two workers and register them as the %dopar% backend
cl <- makeCluster(2)
registerDoParallel(cl)

# Raise `base` to each exponent and combine the results into one vector
foreach(exponent = 2:4, .combine = c) %dopar% {
  base^exponent
}

stopCluster(cl)

> cl <- makeCluster(2)
> registerDoParallel(cl)
> test <- function (exponent) {
+   foreach(exponent = 2:4, 
+           .combine = c)  %dopar%  
+     base^exponent
+ }
> test()
Error in base^exponent : task 1 failed - "找不到对象'base'" 

> base <- 2
> cl<-makeCluster(2)
> registerDoParallel(cl)
>  
> base <- 4
> test <- function (exponent) {
+   foreach(exponent = 2:4, 
+           .combine = c,
+           .export = "base")  %dopar%  
+     base^exponent
+ }
> test()
[1]  16  64 256
>  
> stopCluster(cl)

同样，您可以使用 .packages 选项加载软件包，例如 .packages = c("rms", "mice")。我强烈建议您始终显式导出所需的变量，因为这样可以减少把代码封装进函数时出现的问题。

> cl <- makeCluster(4)
> registerDoParallel(cl)
> x <- iris[which(iris[,5] != "setosa"), c(1,5)]
> trials <- 10000
> ptime <- system.time({
+    r <- foreach(icount(trials), .combine=cbind) %dopar% {
+      ind <- sample(100, 100, replace=TRUE)
+      result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
+      coefficients(result1)
+      }
+    })[3]
> ptime
elapsed 
  20.01 
  
  
> stime <- system.time({
+    r <- foreach(icount(trials), .combine=cbind) %do% {
+      ind <- sample(100, 100, replace=TRUE)
+      result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
+      coefficients(result1)
+      }
+    })[3]
> stime
elapsed 
  39.17 
> stopCluster(cl)

参考资料

http://gforge.se/2015/02/how-to-go-parallel-in-r-basics-tips/
https://stackoverflow.com/questions/24040280/parallel-computation-of-multiple-imputation-by-using-mice-r-package/27087791#27087791