SpaCOAP: mouse spleen dataset

Wei Liu

2024-05-26

This vignette introduces the SpaCOAP workflow for the analysis of spatial multi-omics datasets.

We demonstrate the use of SpaCOAP on a mouse spleen dataset collected from the SPOTS platform. The data are available here and can be downloaded to the current working directory with the following commands:

# Base URL of the folder that holds the vignette data on GitHub.
url1 <- "https://github.com/feiyoung/SpaCOAP/raw/master/vignettes_data/"
# Remote file names of the RNA and protein (ADT) objects.
rna_object <- "seu_rna_over_Spleen.RDS?raw=true"
protein_object <- "seu_adt_over_Spleen.RDS?raw=true"
# Local destination for each remote file, in download order:
# RNA object, protein object, cell-cluster annotation.
remote_to_local <- c(
  "seu_rna_over_Spleen.RDS",
  "seu_adt_over_Spleen.RDS",
  "cell_clusters_anno.rds"
)
names(remote_to_local) <- c(
  rna_object,
  protein_object,
  "cell_clusters_anno.rds?raw=true"
)
# Binary mode keeps the serialized R files intact on every platform.
for (remote_name in names(remote_to_local)) {
  download.file(
    paste0(url1, remote_name),
    remote_to_local[[remote_name]],
    mode = "wb"
  )
}

Data preparation

Then we load the data to R.

# All three files are read from the current working directory.
data_dir <- "."
# RNA and ADT objects saved as single-object RDS files.
seu_rna_over <- readRDS(file.path(data_dir, "seu_rna_over_Spleen.RDS"))
seu_adt_over <- readRDS(file.path(data_dir, "seu_adt_over_Spleen.RDS"))
# NOTE(review): this file is restored with load(), so despite its ".rds"
# extension it is presumably a save()-format workspace file -- confirm.
load(file.path(data_dir, "cell_clusters_anno.rds"))

The package can be loaded with the command:

library(SpaCOAP) #

Define some functions for use.

## Search, by bisection, for a neighbourhood radius such that the median
## number of neighbours per spot (row sums of the adjacency matrix returned by
## DR.SC:::getneighborhood_fast) lies within [lower.med, upper.med].
##
## Arguments:
##   pos          matrix of spatial coordinates, one row per spot.
##   lower.med    smallest acceptable median number of neighbours.
##   upper.med    largest acceptable median number of neighbours.
##   radius.upper upper end of the search interval; when NULL it is set to the
##                20th smallest pairwise distance among up to 100 randomly
##                sampled spots (or the largest one if fewer are available).
##
## Value:
##   The radius at which the search stopped. If the iteration limit is hit
##   before the median falls inside the target interval, a message is emitted
##   and the next candidate radius is returned.
##
## Note: spots are sampled with sample(), so call set.seed() beforehand for a
## reproducible result.
searchRadius <- function(pos, lower.med=8, upper.med=10, radius.upper= NULL){
  if (!inherits(pos, "matrix"))
    stop("method is only for  matrix object!")

  n_spots <- nrow(pos)
  if (n_spots < 2)
    stop("pos must contain at least two spots to search a radius.")

  ## Pairwise distances among a random subset of at most 100 spots.
  idx <- sample(n_spots, min(100, n_spots))
  dis <- dist(pos[idx, , drop = FALSE])

  ## Automatically determine the upper radius.
  if (is.null(radius.upper)) {
    sorted_dis <- sort(as.vector(dis))
    ## 20th smallest sampled distance; fall back to the largest one when
    ## fewer than 20 distances are available (avoids an NA radius).
    radius.upper <- sorted_dis[min(20L, length(sorted_dis))]
  }
  radius.lower <- min(dis[dis > 0])

  ## The upper radius must already give enough neighbours, otherwise no
  ## radius inside the search interval can reach the target.
  Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius = radius.upper)
  Med <- stats::median(Matrix::rowSums(Adj_sp))
  if (Med < lower.med) {
    stop("The radius.upper is too small: the median number of neighbors (",
         Med, ") is below lower.med (", lower.med, ").")
  }

  start.radius <- 1
  Med <- 0
  message("Find the adjacency matrix by bisection method...")
  maxIter <- 30
  k <- 1
  ## Iterate until the median number of neighbours is within
  ## [lower.med, upper.med].
  while (!(Med >= lower.med && Med <= upper.med)) {
    Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius = start.radius)
    Med <- stats::median(Matrix::rowSums(Adj_sp))
    if (Med < lower.med) {
      ## Too few neighbours: move the lower end up.
      radius.lower <- start.radius
      start.radius <- (radius.lower + radius.upper) / 2
    } else if (Med > upper.med) {
      ## Too many neighbours: move the upper end down.
      radius.upper <- start.radius
      start.radius <- (radius.lower + radius.upper) / 2
    }
    message("Current radius is ", round(start.radius, 2))
    message("Median of neighborhoods is ", Med)
    if (k > maxIter) {
      message("Reach the maximum iteration but can not find a proper radius!")
      break
    }
    k <- k + 1
  }

  return(start.radius)
}

## Classification accuracy between two label vectors.
##
## Arguments:
##   y1, y2  label vectors (factor, character or numeric) of equal length;
##           the order of the two arguments does not matter.
##
## Value:
##   The proportion of positions at which the two labels are identical.
##   Positions where either label is NA count as mismatches.
##
## Labels are compared by name rather than by factor code, so factors with
## different or unused levels are handled correctly.
acc_fun <- function(y1, y2){
  if (length(y1) != length(y2))
    stop("y1 and y2 must have the same length.")

  labels1 <- as.character(y1)
  labels2 <- as.character(y2)
  ## na.rm = TRUE drops NA comparisons from the numerator only, so missing
  ## labels still count in the denominator.
  sum(labels1 == labels2, na.rm = TRUE) / length(y1)
}

## Cohen's Kappa between two label vectors, computed with irr::kappa2().
##
## Arguments:
##   y1, y2  label vectors of equal length (one rating per subject each).
##
## Value:
##   The `value` element of the object returned by irr::kappa2().
##
## Stops with an informative error when the 'irr' package is not installed.
kappa_fun <- function(y1, y2){
  if (!requireNamespace("irr", quietly = TRUE)) {
    stop("Package 'irr' is required to compute Cohen's Kappa; please install it.",
         call. = FALSE)
  }
  ## kappa2() expects one row per subject and one column per rater.
  ratings <- data.frame(y1, y2)
  irr::kappa2(ratings)$value
}

Obtain data matrices and tuning parameters

Extract the data matrices from the SeuratObject, including the RNA expression count matrix X_count, the covariate matrix H associated with protein markers, the control variables Z (here only an intercept term), and the spatial coordinate matrix pos.


# RNA matrix restricted to the variable features of the RNA assay, transposed
# and converted to a dense matrix.
X_count <- Matrix::t(seu_rna_over[["RNA"]][seu_rna_over[['RNA']]@var.features,])
X_count <-  as.matrix(X_count)
# Covariate matrix: transposed `data` slot of the ADT assay.
H <- t(as.matrix(seu_adt_over[["ADT"]]@data))
# Control variables: a single column of ones (intercept only).
Z <- matrix(1, nrow(H), 1)
# Spatial coordinates taken from the X0 and X1 fields of the RNA object.
pos <- cbind(seu_rna_over$X0, seu_rna_over$X1)


# searchRadius() samples spots at random, so seed the generator first to make
# the selected radius reproducible.
set.seed(1)
radius_use <- searchRadius(pos, radius.upper = NULL)
# Re-seed so that the spots sampled below do not depend on how many random
# numbers searchRadius() consumed.
set.seed(1)
n_spots <- nrow(pos)
idx <- sample(n_spots, min(100, n_spots))
dis <- dist(pos[idx,])
# Weighted adjacency matrix; the width is the median pairwise distance among
# the sampled spots.
Adj_sp <-  SpaCOAP:::getneighbor_weightmat(pos, radius = radius_use, width=median(dis))

Determine the structure dimension

Next, we select the number of factors and the rank of the regression coefficient matrix using the proposed criterion.

We apply the criterion, taking into account that the real data does not necessarily originate from our model. Therefore, we opt for a conservative approach that retains more information.

Fitting SpaCOAP

First, we run the proposed SpaCOAP method.

Compare SpaCOAP and other methods

First, we run COAP and calculate McFadden’s R-squared.

Second, we run MRRR and calculate McFadden’s R-squared.

Finally, we run FAST and compute McFadden’s R-squared. It is worth noting that we refrain from comparing with PLNPCA in this demo because it is time-consuming.

Summarize the metrics

We evaluate the association between the low-dimensional representations learned by SpaCOAP and the other methods and the annotated cell clusters using two kinds of metrics. The first metric is the adjusted McFadden’s \(\mathrm{R}^2\), referred to as \(\mathrm{MacR}^2\). This measure quantifies the amount of biological information captured by the representations, where a higher value signifies superior performance in representation learning.

Finally, we train a classification model with randomForest, using the cell clusters as labels and the low-dimensional representations learned by SpaCOAP and the other methods as features, and then compare the prediction performance of these models. To achieve this, we randomly divide the samples into training and testing data with a ratio of \(7:3\). We evaluate each model’s performance using prediction accuracy (ACC) and Cohen’s Kappa on the testing data. In contrast to ACC, Kappa offers a more robust measure of accuracy, as it takes into account the possibility that the agreement occurs by chance. We repeat this division process ten times to ensure the reliability of our findings.

# Number of random train/test splits and number of labelled samples.
N <- 10
n <- length(cell_label)
methodNames <- c("SpaCOAP", "COAP",  "MRRR", "FAST")
n_methods <- length(methodNames)
# One N x n_methods matrix per metric; rows are splits, columns are methods.
metricList <- list(
  ACC = matrix(NA, N, n_methods, dimnames = list(NULL, methodNames)),
  Kappa = matrix(NA, N, n_methods, dimnames = list(NULL, methodNames))
)
library(randomForest)

# Name the feature columns of MRRR and FAST once, outside the loop.
for (method in c("MRRR", "FAST")) {
  colnames(featureList[[method]]) <-
    paste0(method, seq_len(ncol(featureList[[method]])))
}

for (i in seq_len(N)) {
  message("i = ", i)
  # Seed per split so that every split (and the forests fitted on it) is
  # reproducible.
  set.seed(i)
  idx_train <- sort(sample(n, round(n * 0.7)))
  idx_test <- sort(setdiff(seq_len(n), idx_train))

  # Methods are fitted in the order of methodNames, so the random number
  # stream is consumed in a fixed order.
  for (method in methodNames) {
    features <- featureList[[method]]
    rf_fit <- randomForest(features[idx_train, ], y = cell_label[idx_train])
    hy <- predict(rf_fit, newdata = features[idx_test, ])
    metricList$ACC[i, method] <- acc_fun(hy, cell_label[idx_test])
    metricList$Kappa[i, method] <- kappa_fun(hy, cell_label[idx_test])
  }
}

# Average each metric over the N splits and show a metric-by-method table.
DT::datatable(
  t(round(vapply(metricList, colMeans, numeric(n_methods), na.rm = TRUE), 2))
)

Session Info