The motivation for the parglm
package is a parallel version of the glm
function. It solves the iteratively re-weighted least squares using a QR decomposition with column pivoting with DGEQP3
function from LAPACK. The computation is done in parallel as in the bam
function in the mgcv
package. The cost is an additional \(O(Mp^2 + p^3)\) where \(p\) is the number of coefficients and \(M\) is the number chunks to be computed in parallel. The advantage is that you do not need to compile the package with an optimized BLAS or LAPACK which supports multithreading. The package also includes a method that computes the Fisher information and then solves the normal equation as in the speedglm
package. This is faster but less numerically stable.
Below, we perform estimate a logistic regression with 1000000 observations and 50 covariates. We vary the number of cores being used with the nthreads
argument to parglm.control
. The method
arguments sets which method is used. LINPACK
yields the QR method and FAST
yields a similar method as in the speedglm
package.
#####
# simulate
n # number of observations
#> [1] 1000000
p # number of covariates
#> [1] 50
set.seed(68024947)
X <- matrix(rnorm(n * p, 1/p, 1/sqrt(p)), n, ncol = p)
df <- data.frame(y = 1/(1 + exp(-(rowSums(X) - 1))) > runif(n), X)
#####
# compute and measure time. Setup call to make
library(microbenchmark)
library(speedglm)
library(parglm)
cl <- list(
quote(microbenchmark),
glm = quote(glm (y ~ ., binomial(), df)),
speedglm = quote(speedglm(y ~ ., family = binomial(), data = df)),
times = 11L)
tfunc <- function(method = "LINPACK", nthreads)
parglm(y ~ ., binomial(), df, control = parglm.control(method = method,
nthreads = nthreads))
cl <- c(
cl, lapply(1:n_threads, function(i) bquote(tfunc(nthreads = .(i)))))
names(cl)[5:(5L + n_threads - 1L)] <- paste0("parglm-LINPACK-", 1:n_threads)
cl <- c(
cl, lapply(1:n_threads, function(i) bquote(tfunc(
nthreads = .(i), method = "FAST"))))
names(cl)[(5L + n_threads):(5L + 2L * n_threads - 1L)] <-
paste0("parglm-FAST-", 1:n_threads)
cl <- as.call(cl)
cl # the call we make
#> microbenchmark(glm = glm(y ~ ., binomial(), df), speedglm = speedglm(y ~
#> ., family = binomial(), data = df), times = 11L, `parglm-LINPACK-1` = tfunc(nthreads = 1L),
#> `parglm-LINPACK-2` = tfunc(nthreads = 2L), `parglm-LINPACK-3` = tfunc(nthreads = 3L),
#> `parglm-LINPACK-4` = tfunc(nthreads = 4L), `parglm-LINPACK-5` = tfunc(nthreads = 5L),
#> `parglm-LINPACK-6` = tfunc(nthreads = 6L), `parglm-LINPACK-7` = tfunc(nthreads = 7L),
#> `parglm-LINPACK-8` = tfunc(nthreads = 8L), `parglm-LINPACK-9` = tfunc(nthreads = 9L),
#> `parglm-LINPACK-10` = tfunc(nthreads = 10L), `parglm-LINPACK-11` = tfunc(nthreads = 11L),
#> `parglm-LINPACK-12` = tfunc(nthreads = 12L), `parglm-LINPACK-13` = tfunc(nthreads = 13L),
#> `parglm-LINPACK-14` = tfunc(nthreads = 14L), `parglm-LINPACK-15` = tfunc(nthreads = 15L),
#> `parglm-LINPACK-16` = tfunc(nthreads = 16L), `parglm-LINPACK-17` = tfunc(nthreads = 17L),
#> `parglm-LINPACK-18` = tfunc(nthreads = 18L), `parglm-FAST-1` = tfunc(nthreads = 1L,
#> method = "FAST"), `parglm-FAST-2` = tfunc(nthreads = 2L,
#> method = "FAST"), `parglm-FAST-3` = tfunc(nthreads = 3L,
#> method = "FAST"), `parglm-FAST-4` = tfunc(nthreads = 4L,
#> method = "FAST"), `parglm-FAST-5` = tfunc(nthreads = 5L,
#> method = "FAST"), `parglm-FAST-6` = tfunc(nthreads = 6L,
#> method = "FAST"), `parglm-FAST-7` = tfunc(nthreads = 7L,
#> method = "FAST"), `parglm-FAST-8` = tfunc(nthreads = 8L,
#> method = "FAST"), `parglm-FAST-9` = tfunc(nthreads = 9L,
#> method = "FAST"), `parglm-FAST-10` = tfunc(nthreads = 10L,
#> method = "FAST"), `parglm-FAST-11` = tfunc(nthreads = 11L,
#> method = "FAST"), `parglm-FAST-12` = tfunc(nthreads = 12L,
#> method = "FAST"), `parglm-FAST-13` = tfunc(nthreads = 13L,
#> method = "FAST"), `parglm-FAST-14` = tfunc(nthreads = 14L,
#> method = "FAST"), `parglm-FAST-15` = tfunc(nthreads = 15L,
#> method = "FAST"), `parglm-FAST-16` = tfunc(nthreads = 16L,
#> method = "FAST"), `parglm-FAST-17` = tfunc(nthreads = 17L,
#> method = "FAST"), `parglm-FAST-18` = tfunc(nthreads = 18L,
#> method = "FAST"))
out <- eval(cl)
s <- summary(out) # result from `microbenchmark`
print(s[, c("expr", "min", "mean", "median", "max")], digits = 3,
row.names = FALSE)
#> expr min mean median max
#> glm 14.87 15.35 15.28 16.20
#> speedglm 4.13 4.57 4.62 4.75
#> parglm-LINPACK-1 13.23 13.42 13.39 13.67
#> parglm-LINPACK-2 7.78 7.93 7.97 8.03
#> parglm-LINPACK-3 5.97 6.12 6.09 6.35
#> parglm-LINPACK-4 5.13 5.29 5.28 5.51
#> parglm-LINPACK-5 4.61 4.80 4.84 4.90
#> parglm-LINPACK-6 4.26 4.44 4.43 4.60
#> parglm-LINPACK-7 4.02 4.26 4.32 4.41
#> parglm-LINPACK-8 3.89 4.04 4.01 4.26
#> parglm-LINPACK-9 3.71 3.92 4.00 4.11
#> parglm-LINPACK-10 3.58 3.77 3.72 3.99
#> parglm-LINPACK-11 3.60 3.70 3.68 3.96
#> parglm-LINPACK-12 3.54 3.68 3.66 3.83
#> parglm-LINPACK-13 3.57 3.73 3.72 3.87
#> parglm-LINPACK-14 3.47 3.57 3.57 3.67
#> parglm-LINPACK-15 3.31 3.64 3.60 4.38
#> parglm-LINPACK-16 3.14 3.57 3.58 3.85
#> parglm-LINPACK-17 3.38 3.60 3.56 3.80
#> parglm-LINPACK-18 3.19 3.48 3.50 3.70
#> parglm-FAST-1 4.03 4.16 4.15 4.30
#> parglm-FAST-2 2.88 3.08 2.97 3.35
#> parglm-FAST-3 2.33 2.62 2.65 2.84
#> parglm-FAST-4 2.32 2.51 2.49 2.67
#> parglm-FAST-5 2.36 2.50 2.51 2.58
#> parglm-FAST-6 2.00 2.32 2.31 2.56
#> parglm-FAST-7 2.19 2.36 2.40 2.44
#> parglm-FAST-8 1.97 2.23 2.26 2.36
#> parglm-FAST-9 1.96 2.06 2.04 2.19
#> parglm-FAST-10 1.94 2.14 2.18 2.34
#> parglm-FAST-11 1.90 2.12 2.08 2.30
#> parglm-FAST-12 1.99 2.13 2.07 2.32
#> parglm-FAST-13 1.92 2.12 2.16 2.24
#> parglm-FAST-14 1.92 2.08 2.04 2.24
#> parglm-FAST-15 1.88 2.07 2.03 2.26
#> parglm-FAST-16 1.88 2.04 2.04 2.19
#> parglm-FAST-17 1.91 2.08 2.08 2.30
#> parglm-FAST-18 1.97 2.15 2.19 2.30
The plot below shows median run times versus the number of cores. The dashed line is the median run time of glm
and the dotted line is the median run time of speedglm
. We could have used glm.fit
and parglm.fit
. This would make the relative difference bigger as both call e.g., model.matrix
and model.frame
which do take some time. To show this point, we first compute how much times this takes and then we make the plot. The continuous line is the computation time of model.matrix
and model.frame
.
modmat_time <- microbenchmark(
modmat_time = {
mf <- model.frame(y ~ ., df); model.matrix(terms(mf), mf)
}, times = 10)
modmat_time # time taken by `model.matrix` and `model.frame`
#> Unit: milliseconds
#> expr min lq mean median uq
#> modmat_time 976.246588 1177.963766 1183.842476 1182.462718 1196.209505
#> max neval
#> 1356.438417 10
par(mar = c(4.5, 4.5, .5, .5))
o <- aggregate(time ~ expr, out, median)[, 2] / 10^9
ylim <- range(o, 0); ylim[2] <- ylim[2] + .04 * diff(ylim)
o_linpack <- o[-c(1:2, (n_threads + 3L):length(o))]
o_fast <- o[-(1:(n_threads + 2L))]
plot(1:n_threads, o_linpack, xlab = "Number of cores", yaxs = "i",
ylim = ylim, ylab = "Run time", pch = 16)
points(1:n_threads, o_fast, pch = 1)
abline(h = o[1], lty = 2)
abline(h = o[2], lty = 3)
abline(h = median(modmat_time$time) / 10^9, lty = 1)
The open circles are the FAST
method and the other circles are the LINPACK
method. Again, the FAST
method and the speedglm
package compute the Fisher information and then solves the normal equation. This is advantages in terms of computation cost but may lead to unstable solutions. You can alter the number of observations in each parallel chunk with the block_size
argument of parglm.control
.
The single threaded performance of parglm
may be slower when there are more coefficients. The cause seems to be the difference between the LAPACK and LINPACK implementation. This presumably due to either the QR decomposition method and/or the qr.qty
method. On Windows, the parglm
do seems slower when build with Rtools
and the reason seems so be the qr.qty
method in LAPACK, dormqr
, which is slower then the LINPACK method, dqrsl
. Below is an illustration of the computation times on this machine.
qr1 <- qr(X)
qr2 <- qr(X, LAPACK = TRUE)
microbenchmark::microbenchmark(
`qr LINPACK` = qr(X),
`qr LAPACK` = qr(X, LAPACK = TRUE),
`qr.qty LINPACK` = qr.qty(qr1, df$y),
`qr.qty LAPACK` = qr.qty(qr2, df$y),
times = 11)
#> Unit: milliseconds
#> expr min lq mean median
#> qr LINPACK 3110.641142 3115.4115140 3139.2611351 3133.540966
#> qr LAPACK 2493.754534 2497.5766770 2516.8324145 2498.076688
#> qr.qty LINPACK 474.270743 478.8815830 494.5624567 492.309339
#> qr.qty LAPACK 529.304587 530.3935825 530.7427405 530.972688
#> uq max neval
#> 3138.1808365 3263.890541 11
#> 2512.5776900 2644.825526 11
#> 507.2853105 526.827080 11
#> 531.3701745 531.790984 11
sessionInfo()
#> R version 3.4.1 (2017-06-30)
#> Platform: x86_64-redhat-linux-gnu (64-bit)
#> Running under: Amazon Linux AMI 2018.03
#>
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] speedglm_0.3-2 MASS_7.3-47 Matrix_1.2-10
#> [4] microbenchmark_1.4-6 parglm_0.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_0.12.18 codetools_0.2-15 lattice_0.20-35 digest_0.6.15
#> [5] rprojroot_1.3-2 grid_3.4.1 backports_1.1.2 magrittr_1.5
#> [9] evaluate_0.11 stringi_1.2.4 rmarkdown_1.10 tools_3.4.1
#> [13] stringr_1.3.1 yaml_2.2.0 compiler_3.4.1 htmltools_0.3.6
#> [17] knitr_1.20