Introduction to the parglm package

The motivation for the parglm package is a parallel version of the glm function. It solves the iteratively re-weighted least squares using a QR decomposition with column pivoting with DGEQP3 function from LAPACK. The computation is done in parallel as in the bam function in the mgcv package. The cost is an additional \(O(Mp^2 + p^3)\) where \(p\) is the number of coefficients and \(M\) is the number chunks to be computed in parallel. The advantage is that you do not need to compile the package with an optimized BLAS or LAPACK which supports multithreading. The package also includes a method that computes the Fisher information and then solves the normal equation as in the speedglm package. This is faster but less numerically stable.

Example of computation time

Below, we perform estimate a logistic regression with 1000000 observations and 50 covariates. We vary the number of cores being used with the nthreads argument to parglm.control. The method arguments sets which method is used. LINPACK yields the QR method and FAST yields a similar method as in the speedglm package.

#####
# simulate
n # number of observations
#> [1] 1000000
p # number of covariates
#> [1] 50
set.seed(68024947)
X <- matrix(rnorm(n * p, 1/p, 1/sqrt(p)), n, ncol = p)
df <- data.frame(y = 1/(1 + exp(-(rowSums(X) - 1))) > runif(n), X)

#####
# compute and measure time. Setup call to make
library(microbenchmark)
library(speedglm)
library(parglm)
cl <- list(
  quote(microbenchmark), 
  glm      = quote(glm     (y ~ ., binomial(), df)), 
  speedglm = quote(speedglm(y ~ ., family = binomial(), data = df)), 
  times = 11L)
tfunc <- function(method = "LINPACK", nthreads)
  parglm(y ~ ., binomial(), df, control = parglm.control(method = method,
                                                         nthreads = nthreads))
cl <- c(
  cl, lapply(1:n_threads, function(i) bquote(tfunc(nthreads = .(i)))))
names(cl)[5:(5L + n_threads - 1L)] <- paste0("parglm-LINPACK-", 1:n_threads)

cl <- c(
  cl, lapply(1:n_threads, function(i) bquote(tfunc(
    nthreads = .(i), method = "FAST"))))
names(cl)[(5L + n_threads):(5L + 2L * n_threads - 1L)] <- 
  paste0("parglm-FAST-", 1:n_threads)

cl <- as.call(cl)
cl # the call we make
#> microbenchmark(glm = glm(y ~ ., binomial(), df), speedglm = speedglm(y ~ 
#>     ., family = binomial(), data = df), times = 11L, `parglm-LINPACK-1` = tfunc(nthreads = 1L), 
#>     `parglm-LINPACK-2` = tfunc(nthreads = 2L), `parglm-LINPACK-3` = tfunc(nthreads = 3L), 
#>     `parglm-LINPACK-4` = tfunc(nthreads = 4L), `parglm-LINPACK-5` = tfunc(nthreads = 5L), 
#>     `parglm-LINPACK-6` = tfunc(nthreads = 6L), `parglm-LINPACK-7` = tfunc(nthreads = 7L), 
#>     `parglm-LINPACK-8` = tfunc(nthreads = 8L), `parglm-LINPACK-9` = tfunc(nthreads = 9L), 
#>     `parglm-LINPACK-10` = tfunc(nthreads = 10L), `parglm-LINPACK-11` = tfunc(nthreads = 11L), 
#>     `parglm-LINPACK-12` = tfunc(nthreads = 12L), `parglm-LINPACK-13` = tfunc(nthreads = 13L), 
#>     `parglm-LINPACK-14` = tfunc(nthreads = 14L), `parglm-LINPACK-15` = tfunc(nthreads = 15L), 
#>     `parglm-LINPACK-16` = tfunc(nthreads = 16L), `parglm-LINPACK-17` = tfunc(nthreads = 17L), 
#>     `parglm-LINPACK-18` = tfunc(nthreads = 18L), `parglm-FAST-1` = tfunc(nthreads = 1L, 
#>         method = "FAST"), `parglm-FAST-2` = tfunc(nthreads = 2L, 
#>         method = "FAST"), `parglm-FAST-3` = tfunc(nthreads = 3L, 
#>         method = "FAST"), `parglm-FAST-4` = tfunc(nthreads = 4L, 
#>         method = "FAST"), `parglm-FAST-5` = tfunc(nthreads = 5L, 
#>         method = "FAST"), `parglm-FAST-6` = tfunc(nthreads = 6L, 
#>         method = "FAST"), `parglm-FAST-7` = tfunc(nthreads = 7L, 
#>         method = "FAST"), `parglm-FAST-8` = tfunc(nthreads = 8L, 
#>         method = "FAST"), `parglm-FAST-9` = tfunc(nthreads = 9L, 
#>         method = "FAST"), `parglm-FAST-10` = tfunc(nthreads = 10L, 
#>         method = "FAST"), `parglm-FAST-11` = tfunc(nthreads = 11L, 
#>         method = "FAST"), `parglm-FAST-12` = tfunc(nthreads = 12L, 
#>         method = "FAST"), `parglm-FAST-13` = tfunc(nthreads = 13L, 
#>         method = "FAST"), `parglm-FAST-14` = tfunc(nthreads = 14L, 
#>         method = "FAST"), `parglm-FAST-15` = tfunc(nthreads = 15L, 
#>         method = "FAST"), `parglm-FAST-16` = tfunc(nthreads = 16L, 
#>         method = "FAST"), `parglm-FAST-17` = tfunc(nthreads = 17L, 
#>         method = "FAST"), `parglm-FAST-18` = tfunc(nthreads = 18L, 
#>         method = "FAST"))

out <- eval(cl)

s <- summary(out) # result from `microbenchmark`
print(s[, c("expr", "min", "mean", "median", "max")], digits  = 3, 
      row.names = FALSE) 
#>               expr   min  mean median   max
#>                glm 14.87 15.35  15.28 16.20
#>           speedglm  4.13  4.57   4.62  4.75
#>   parglm-LINPACK-1 13.23 13.42  13.39 13.67
#>   parglm-LINPACK-2  7.78  7.93   7.97  8.03
#>   parglm-LINPACK-3  5.97  6.12   6.09  6.35
#>   parglm-LINPACK-4  5.13  5.29   5.28  5.51
#>   parglm-LINPACK-5  4.61  4.80   4.84  4.90
#>   parglm-LINPACK-6  4.26  4.44   4.43  4.60
#>   parglm-LINPACK-7  4.02  4.26   4.32  4.41
#>   parglm-LINPACK-8  3.89  4.04   4.01  4.26
#>   parglm-LINPACK-9  3.71  3.92   4.00  4.11
#>  parglm-LINPACK-10  3.58  3.77   3.72  3.99
#>  parglm-LINPACK-11  3.60  3.70   3.68  3.96
#>  parglm-LINPACK-12  3.54  3.68   3.66  3.83
#>  parglm-LINPACK-13  3.57  3.73   3.72  3.87
#>  parglm-LINPACK-14  3.47  3.57   3.57  3.67
#>  parglm-LINPACK-15  3.31  3.64   3.60  4.38
#>  parglm-LINPACK-16  3.14  3.57   3.58  3.85
#>  parglm-LINPACK-17  3.38  3.60   3.56  3.80
#>  parglm-LINPACK-18  3.19  3.48   3.50  3.70
#>      parglm-FAST-1  4.03  4.16   4.15  4.30
#>      parglm-FAST-2  2.88  3.08   2.97  3.35
#>      parglm-FAST-3  2.33  2.62   2.65  2.84
#>      parglm-FAST-4  2.32  2.51   2.49  2.67
#>      parglm-FAST-5  2.36  2.50   2.51  2.58
#>      parglm-FAST-6  2.00  2.32   2.31  2.56
#>      parglm-FAST-7  2.19  2.36   2.40  2.44
#>      parglm-FAST-8  1.97  2.23   2.26  2.36
#>      parglm-FAST-9  1.96  2.06   2.04  2.19
#>     parglm-FAST-10  1.94  2.14   2.18  2.34
#>     parglm-FAST-11  1.90  2.12   2.08  2.30
#>     parglm-FAST-12  1.99  2.13   2.07  2.32
#>     parglm-FAST-13  1.92  2.12   2.16  2.24
#>     parglm-FAST-14  1.92  2.08   2.04  2.24
#>     parglm-FAST-15  1.88  2.07   2.03  2.26
#>     parglm-FAST-16  1.88  2.04   2.04  2.19
#>     parglm-FAST-17  1.91  2.08   2.08  2.30
#>     parglm-FAST-18  1.97  2.15   2.19  2.30

The plot below shows median run times versus the number of cores. The dashed line is the median run time of glm and the dotted line is the median run time of speedglm. We could have used glm.fit and parglm.fit. This would make the relative difference bigger as both call e.g., model.matrix and model.frame which do take some time. To show this point, we first compute how much times this takes and then we make the plot. The continuous line is the computation time of model.matrix and model.frame.

modmat_time <- microbenchmark(
  modmat_time = {
    mf <- model.frame(y ~ ., df); model.matrix(terms(mf), mf)
  }, times = 10)
modmat_time # time taken by `model.matrix` and `model.frame`
#> Unit: milliseconds
#>         expr        min          lq        mean      median          uq
#>  modmat_time 976.246588 1177.963766 1183.842476 1182.462718 1196.209505
#>          max neval
#>  1356.438417    10

par(mar = c(4.5, 4.5, .5, .5))
o <- aggregate(time ~ expr, out, median)[, 2] / 10^9
ylim <- range(o, 0); ylim[2] <- ylim[2] + .04 * diff(ylim)

o_linpack <- o[-c(1:2, (n_threads + 3L):length(o))]
o_fast    <- o[-(1:(n_threads + 2L))]

plot(1:n_threads, o_linpack, xlab = "Number of cores", yaxs = "i",
     ylim = ylim, ylab = "Run time", pch = 16)
points(1:n_threads, o_fast, pch = 1)
abline(h = o[1], lty = 2)
abline(h = o[2], lty = 3)
abline(h = median(modmat_time$time) / 10^9, lty = 1)

The open circles are the FAST method and the other circles are the LINPACK method. Again, the FAST method and the speedglm package compute the Fisher information and then solves the normal equation. This is advantages in terms of computation cost but may lead to unstable solutions. You can alter the number of observations in each parallel chunk with the block_size argument of parglm.control.

The single threaded performance of parglm may be slower when there are more coefficients. The cause seems to be the difference between the LAPACK and LINPACK implementation. This presumably due to either the QR decomposition method and/or the qr.qty method. On Windows, the parglm do seems slower when build with Rtools and the reason seems so be the qr.qty method in LAPACK, dormqr, which is slower then the LINPACK method, dqrsl. Below is an illustration of the computation times on this machine.

qr1 <- qr(X)
qr2 <- qr(X, LAPACK = TRUE)
microbenchmark::microbenchmark(
  `qr LINPACK`     = qr(X), 
  `qr LAPACK`      = qr(X, LAPACK = TRUE), 
  `qr.qty LINPACK` = qr.qty(qr1, df$y), 
  `qr.qty LAPACK`  = qr.qty(qr2, df$y), 
  times = 11)
#> Unit: milliseconds
#>            expr         min           lq         mean      median
#>      qr LINPACK 3110.641142 3115.4115140 3139.2611351 3133.540966
#>       qr LAPACK 2493.754534 2497.5766770 2516.8324145 2498.076688
#>  qr.qty LINPACK  474.270743  478.8815830  494.5624567  492.309339
#>   qr.qty LAPACK  529.304587  530.3935825  530.7427405  530.972688
#>            uq         max neval
#>  3138.1808365 3263.890541    11
#>  2512.5776900 2644.825526    11
#>   507.2853105  526.827080    11
#>   531.3701745  531.790984    11

Introduction to the parglm package

Benjamin Christoffersen

2019-01-19

Example of computation time

Session info