This vignette introduces the SpaCOAP workflow for the analysis of spatial multi-omics dataset.
We demonstrate the use of SpaCOAP to one mouse spleen data collected from the SPOTS platform that are here, which can be downloaded to the current working path by the following command:
url1 <- "https://github.com/feiyoung/SpaCOAP/raw/master/vignettes_data/"
rna_object <- "seu_rna_over_Spleen.RDS?raw=true"
download.file(paste0(url1, rna_object),"seu_rna_over_Spleen.RDS",mode='wb')
protein_object <- "seu_adt_over_Spleen.RDS?raw=true"
download.file(paste0(url1,protein_object), 'seu_adt_over_Spleen.RDS', mode='wb')
## download annotation
download.file(paste0(url1, "cell_clusters_anno.rds?raw=true"), "cell_clusters_anno.rds", mode="wb")
Then we load the data to R.
seu_rna_over <- readRDS("./seu_rna_over_Spleen.RDS")
seu_adt_over <- readRDS("./seu_adt_over_Spleen.RDS")
load("./cell_clusters_anno.rds")
The package can be loaded with the command:
Define some functions for use.
searchRadius <- function(pos, lower.med=8, upper.med=10, radius.upper= NULL){
if (!inherits(pos, "matrix"))
stop("method is only for matrix object!")
## Automatically determine the upper radius
n_spots <- nrow(pos)
idx <- sample(n_spots, min(100, n_spots))
dis <- dist(pos[idx,])
if(is.null(radius.upper)){
#radius.upper <- max(dis)
radius.upper <- sort(dis)[20] ## select the nearest 20 spots.
}
radius.lower <- min(dis[dis>0])
Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius=radius.upper)
Med <- summary(Matrix::rowSums(Adj_sp))['Median']
if(Med < lower.med) stop("The radius.upper is too smaller that cannot find median neighbors greater than 4.")
start.radius <- 1
Med <- 0
message("Find the adjacency matrix by bisection method...")
maxIter <- 30
k <- 1
while(!(Med >= lower.med && Med <=upper.med)){ # ensure that each spot has about 4~6 neighborhoods in median.
Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius=start.radius)
Med <- summary(Matrix::rowSums(Adj_sp))['Median']
if(Med < lower.med){
radius.lower <- start.radius
start.radius <- (radius.lower + radius.upper)/2
}else if(Med >upper.med){
radius.upper <- start.radius
start.radius <- (radius.lower + radius.upper)/2
}
message("Current radius is ", round(start.radius, 2))
message("Median of neighborhoods is ", Med)
if(k > maxIter) {
message("Reach the maximum iteration but can not find a proper radius!")
break;
}
k <- k + 1
}
return(start.radius)
}
acc_fun <- function(y1, y2){
n1 <- length(unique(y1))
n2 <- length(unique(y2))
if(n1<n2){ ## ensure n1> n2
a <- y1
y1 <- y2
y2 <- a
n1 <- length(unique(y1))
n2 <- length(unique(y2))
}
cm <- as.matrix(table(Actual = y1, Predicted = y2))
rnames <-row.names(cm)
cnames <- colnames(cm)
union_names <- union(rnames, cnames)
n <- length(union_names)
cm_new <- matrix(0, n, n)
row.names(cm_new) <- colnames(cm_new) <- union_names
for(r in 1:n2){
cm_new[rnames,cnames[r]] <- cm[rnames,cnames[r]]
}
sum(diag(cm_new)) / length(y1)
}
kappa_fun <- function(y1, y2){
require(irr)
dat <- data.frame(y1, y2)
k_res <- kappa2(dat)
k_res$value
}
Wrap the data matrix from the SeuratObject, including the RNA expression count matrix X_count
, covraite matrix H
associated with protein markters, control variables (here only an intercept term) Z
, and spatial coordinate matrix pos
.
X_count <- Matrix::t(seu_rna_over[["RNA"]][seu_rna_over[['RNA']]@var.features,])
X_count <- as.matrix(X_count)
H <- t(as.matrix(seu_adt_over[["ADT"]]@data))
Z <- matrix(1, nrow(H), 1)
pos <- cbind(seu_rna_over$X0, seu_rna_over$X1)
radius_use <- searchRadius(pos, radius.upper = NULL)
set.seed(1)
n_spots <- nrow(pos)
idx <- sample(n_spots, min(100, n_spots))
dis <- dist(pos[idx,])
Adj_sp <- SpaCOAP:::getneighbor_weightmat(pos, radius = radius_use, width=median(dis))
Next, we select the number of factors and the rank of regreesion coefficient matrix using the proposed criterion.
q_max <- 20
d <- ncol(H)
rank_max <- d
tic <- proc.time()
reslist_max <- SpaCOAP(X_count, Adj_sp, H, Z,
rank_use = rank_max, q=q_max, epsELBO = 1e-8, maxIter = 30)
toc <- proc.time()
time_spacoap_max <- toc[3] - tic[3]
We apply the criterion, taking into account that the real data does not necessarily originate from our model. Therefore, we opt for a conservative approach that retains more information.
threshold=c(1e-15, 1e-20)
thre1 <- threshold[1]
beta_svalues <- svd(reslist_max$bbeta)$d
beta_svalues <- beta_svalues[beta_svalues>thre1]
ratio1 <- beta_svalues[-length(beta_svalues)] / beta_svalues[-1]
ratio1[1:10]
## Here, we set hr = 9 rather 1 to retain more information
thre2 <- threshold[2]
B_svalues <- svd(reslist_max$B)$d
B_svalues <- B_svalues[B_svalues>thre2]
ratio_fac <- B_svalues[-length(B_svalues)] / B_svalues[-1]
ratio_fac[1:6]
# [1] 6.123909e+00 2.749414e+00 1.376498e+00 1.314487e+00 3.310699e+14
# Here, we choose q=5 since huge decrease of singular values happen in q=5.
hq <- 5
First, we run the proposed SpaCOAP method.
hr <- 9;hq <- 5
featureList <- list()
tic <- proc.time()
reslist <- SpaCOAP(X_count, Adj_sp, H=H, Z= Z,
rank_use = hr, q=hq, epsELBO = 1e-8)
toc <- proc.time()
time_spacoap <- toc[3] - tic[3]
Matrix::rankMatrix(reslist$bbeta)
svd_x <- svd(reslist$bbeta, nu = hr, nv=hr)
H_spacoap <- H %*% svd_x$v
(R2_spacoap <- ProFAST::get_r2_mcfadden(embeds= cbind(H_spacoap, reslist$F), y=as.factor(cell_clusters)))
featureList[['SpaCOAP']] <- cbind(H_spacoap, reslist$F)
First, we run COAP and calculate the MacFadden’s R-square.
##COAP
library(COAP)
tic <- proc.time()
res_coap <- RR_COAP(X_count, Z = cbind(Z, H), rank_use= 2+hr, q=hq,
epsELBO = 1e-7, maxIter = 30)
toc <- proc.time()
time_coap <- toc[3] - tic[3]
svd_x <- svd(res_coap$bbeta[,-c(1)], nu = hr, nv=hr)
H_coap <- H %*% svd_x$v
(R2_coap <- ProFAST::get_r2_mcfadden(embeds= cbind(res_coap$H, H_coap), y=as.factor(cell_clusters)))
featureList[['COAP']] <- cbind(res_coap$H, H_coap)
Second, we run MRRR and calculate the MacFadden’s R-square.
## MRRR
mrrr_run <- function(Y, X, rank0, q=NULL, family=list(poisson()),
familygroup=rep(1,ncol(Y)), epsilon = 1e-4, sv.tol = 1e-2,
maxIter = 2000, trace=TRUE, truncflag=FALSE, trunc=500){
# epsilon = 1e-4; sv.tol = 1e-2; maxIter = 30; trace=TRUE
# Y <- X_count; X <- cbind(Z, H); rank0 = r + ncol(Z)
require(rrpack)
n <- nrow(Y); p <- ncol(Y)
if(!is.null(q)){
rank0 <- rank0+q
X <- cbind(X, diag(n))
}
if(truncflag){
## Trunction
Y[Y>trunc] <- trunc
}
svdX0d1 <- svd(X)$d[1]
init1 = list(kappaC0 = svdX0d1 * 5)
offset = NULL
control = list(epsilon = epsilon, sv.tol = sv.tol, maxit = maxIter,
trace = trace, gammaC0 = 1.1, plot.cv = TRUE,
conv.obj = TRUE)
fit.mrrr <- mrrr(Y=Y, X=X[,-1], family = family, familygroup = familygroup,
penstr = list(penaltySVD = "rankCon", lambdaSVD = 1),
control = control, init = init1, maxrank = rank0)
return(fit.mrrr)
}
tic <- proc.time()
res_mrrr <- mrrr_run(X_count, cbind(Z,H), rank0=hr+ncol(Z), q=hq)
str(res_mrrr)
toc <- proc.time()
time_mrrr <- toc[3] - tic[3]
hbbeta_mrrr <-t(res_mrrr$coef[1:ncol(cbind(Z,H)), ])
svd_x <- svd(hbbeta_mrrr[,-c(1)], nu = hr, nv=hr)
H_mrrr <- H %*% svd_x$v
Theta_hb <- (res_mrrr$coef[(ncol(cbind(Z,H))+1): (nrow(cbind(Z,H))+ncol(cbind(Z,H))), ])
svdTheta <- svd(Theta_hb, nu=hq, nv=hq)
F_mrrr <- svdTheta$u
(R2_mrrr <- ProFAST::get_r2_mcfadden(embeds= cbind(H_mrrr, F_mrrr), y=as.factor(cell_clusters)))
featureList[['MRRR']] <- cbind(H_mrrr, F_mrrr)
Finally, we execute FAST and compute the MacFadden’s R-squared. It is worth noting that we refrain from comparing with PLNPCA in this demo due to its time-consuming nature.
fast_run <- function(X_count, Adj_sp, q, verbose=TRUE, epsELBO=1e-8){
require(ProFAST)
reslist <- FAST_run(XList = list(X_count),
AdjList = list(Adj_sp), q = q, fit.model = 'poisson',
verbose=verbose, epsLogLik=epsELBO)
reslist$hV <- reslist$hV[[1]]
return(reslist)
}
tic <- proc.time()
res_fast <- fast_run(X_count, Adj_sp, q=hq, verbose=TRUE, epsELBO=1e-8)
toc <- proc.time()
time_fast <- toc - tic
(R2_fast <- ProFAST::get_r2_mcfadden(embeds= res_fast$hV, y=as.factor(cell_clusters)))
featureList[['FAST']] <- res_fast$hV
We evaluate the correlation between the low-dimensional representations learned by SpaCOAP and other methods with the annotated cell clusters using two metrics. The first metric is the adjusted MacFadden’s \(\mathrm{R}^2\), referred to as \(\mathrm{MacR}^2\). This measure quantifies the amount of biological information captured by the representations, where a higher value signifies superior performance in representation learning.
R2List <- list()
cell_label <- cell_clusters
for(im in 1: length(featureList)){
message("im = ", im)
R2List[[im]] <- ProFAST::get_r2_mcfadden(embeds= featureList[[im]], y=cell_label)
}
names(R2List) <- names(featureList)
Finally, we employ a classification model trained through randomForest, leveraging the cell clusters and the learned low-dimensional representations by SpaCOAP and other methods. We then compare the prediction performance of this model. To achieve this, we randomly divide the samples into training and testing data with a ratio of \(7:3\). We evaluate the model’s performance using prediction accuracy (ACC) and Cohen’s Kappa on the testing data. In contrast of ACC, Kappa offers a more robust measure of accuracy, as it takes into account the possibility that the agreement occurs by chance. We repeat this division process ten times to ensure the reliability of our findings.
N <- 10
n <- length(cell_label)
methodNames <- c("SpaCOAP", "COAP", "MRRR", "FAST")
n_methods <- length(methodNames)
metricList <- list(ACC = matrix(NA,N, n_methods), Kappa = matrix(NA,N, n_methods))
for(ii in 1: length(metricList)) colnames(metricList[[ii]]) <- methodNames
library(randomForest)
for(i in 1:N){
# i <- 1
message("i = ", i)
set.seed(i)
idx_train <- sort(sample(n, round(n*0.7)))
idx_test <- sort(setdiff(1:n, idx_train))
rf_spacoap <- randomForest(featureList[['SpaCOAP']][idx_train,], y=cell_label[idx_train])
hy_spacoap <- predict(rf_spacoap, newdata=featureList[['SpaCOAP']][idx_test,])
metricList$ACC[i,1] <- acc_fun(hy_spacoap, cell_label[idx_test])
metricList$Kappa[i,1] <- kappa_fun(hy_spacoap, cell_label[idx_test])
rf_coap <- randomForest(featureList[['COAP']][idx_train,], y=cell_label[idx_train])
hy_coap <- predict(rf_coap, newdata=featureList[['COAP']][idx_test,])
metricList$ACC[i,2] <- acc_fun(hy_coap, cell_label[idx_test])
metricList$Kappa[i,2] <- kappa_fun(hy_coap, cell_label[idx_test])
colnames(featureList[['MRRR']]) <- paste0("MRRR", 1:ncol(featureList[['MRRR']]))
rf_MRRR <- randomForest(featureList[['MRRR']][idx_train,], y=cell_label[idx_train])
hy_MRRR <- predict(rf_MRRR, newdata=featureList[['MRRR']][idx_test,])
metricList$ACC[i,3] <- acc_fun(hy_MRRR, cell_label[idx_test])
metricList$Kappa[i,3] <- kappa_fun(hy_MRRR, cell_label[idx_test])
colnames(featureList[['FAST']]) <- paste0("FAST", 1:ncol(featureList[['FAST']]))
rf_FAST <- randomForest(featureList[['FAST']][idx_train,], y=cell_label[idx_train])
hy_FAST <- predict(rf_FAST, newdata=featureList[['FAST']][idx_test,])
metricList$ACC[i,4] <- acc_fun(hy_FAST, cell_label[idx_test])
metricList$Kappa[i,4] <- kappa_fun(hy_FAST, cell_label[idx_test])
}
DT::datatable(t(round(sapply(metricList, colMeans, na.rm=T),2) ))
Session Info
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 22631)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=C
#> [2] LC_CTYPE=Chinese (Simplified)_China.936
#> [3] LC_MONETARY=Chinese (Simplified)_China.936
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=Chinese (Simplified)_China.936
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.29 R6_2.5.1 jsonlite_1.8.0 magrittr_2.0.3
#> [5] evaluate_0.15 stringi_1.7.6 rlang_1.1.0 cli_3.2.0
#> [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1 rmarkdown_2.11
#> [13] tools_4.1.2 stringr_1.4.0 xfun_0.29 yaml_2.3.6
#> [17] fastmap_1.1.0 compiler_4.1.2 htmltools_0.5.2 knitr_1.37
#> [21] sass_0.4.1