title: "SinCellTE 2022: Statistical models and analyses" output:

html_document: default

Initializing the practical session

user.id <- Sys.getenv('USER')

Setting Dataset

input.rda <-paste0("/shared/projects/sincellte_2022/", user.id, "/Statistical_models_and_analyses/input/PBMC_CTRL_GE_FILTERED_without.rda")
output.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Statistical_models_and_analyses/finalresult/")

Setting Paramaters

# Computational Parameters
nthreads <- 2
the.seed <- 1337L
# Analysis Parameters
features.n <- 3000
norm.method <- 'LogNormalize' # SCTransform or LogNormalize
dimred.method <- 'scbfa' # pca or scbfa
vtr.biases <- NULL #c("percent_mt", "percent_rb") # variables to regress (for SCTransform or scbfa)
vtr.scale <- FALSE # recommended to TRUE with scbfa if !is.null(vtr.biases)
dim.max <- 50

Setting seed and parallel instance up, loading R packages

set.seed(the.seed)
doParallel::registerDoParallel(nthreads)
cl <- BiocParallel::DoparParam()
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggraph))
suppressPackageStartupMessages(library(patchwork))
suppressPackageStartupMessages(library(dplyr))

Loading data

load(input.rda)
sample.name <- sobj@misc$params$sample.name
print(paste0("There are ", ncol(sobj), " cells and ", nrow(sobj), " genes."))

Normalization

# Normalization
if (norm.method == "SCTransform") {
  sobj <- suppressWarnings(Seurat::SCTransform(object = sobj, assay = "RNA", seed.use = the.seed, variable.features.n = features.n, vars.to.regress = vtr.biases, return.only.var.genes = TRUE))
} else if (norm.method == "LogNormalize") {
  sobj <- Seurat::NormalizeData(sobj, normalization.method = 'LogNormalize', assay = "RNA")
  sobj <- Seurat::FindVariableFeatures(sobj, assay = "RNA", nfeatures = features.n)
  sobj <- Seurat::ScaleData(sobj, assay = "RNA", vars.to.regress = vtr.biases)
} else stop("Unknown normalization method !")

# Save assay name
assay.name <- Seurat::DefaultAssay(sobj)

Check variable genes

# Highly Variable Genes (HVG)
HVG <- Seurat::VariableFeatures(sobj)

# Number of HVG
print(paste0("There are ", length(HVG), " highly variable genes (features.n = ", features.n,")."))

# Identify the top 20 HVGs
top20 <- head(HVG, 20)
top20

# Quantify mitochondrial genes in HVGs:
nb_mito_in_HVG <- length(grep("^MT-", HVG, value = TRUE))
print(paste0("There are ", nb_mito_in_HVG, " mitochondrial genes in the highly variable genes (", round(x = (nb_mito_in_HVG/length(HVG)*100), 2), "%)."))

# Plot variable features with and without labels
HVG_plot <- Seurat::VariableFeaturePlot(sobj, assay = assay.name,selection.method = "vst")
Seurat::LabelPoints(plot = HVG_plot, points = top20, repel = TRUE)

# Clean
rm(HVG, HVG_plot, top20, nb_mito_in_HVG)

Dimension reduction

Note: scbfa take a while (~40-50min).

# Save dimension reduction name
dimred.name <- paste0(assay.name, dimred.method)

if (dimred.method == 'pca') {
  # Dimension reduction
  sobj <- Seurat::RunPCA(object = sobj, verbose = FALSE, npcs = dim.max, reduction.name = dimred.name, reduction.key = paste0(dimred.name, '_'), seed.use = the.seed)
  
} else if (dimred.method == 'scbfa') {
  # Formatting covariates matrix named X
  if (is.null(vtr.biases)) {
    X <- NULL
  }else{
    minimetadata <- sobj@meta.data[,colnames(sobj@meta.data) %in% vtr.biases, drop = FALSE] #drop = FALSE to keep minimetadata as a dataframe
    X <- matrix(ncol = 0, nrow = nrow(sobj@meta.data)) # make an empty data.frame which will be filled afterwards
    for(v in vtr.biases) {
      if(is.character(minimetadata[[v]])) {
        minimetadata[[v]] <- as.factor(minimetadata[[v]])
        print(paste0("Converting '", v, "' factor into model matrix, adding it to the regression..."))
        mm <- model.matrix(~minimetadata[[v]])[,-1, drop = FALSE]
        X <- cbind(X, mm)
      } else {
        print(paste0("Adding '", v, "' covariate (scaled) to the regression ..."))
        X <- cbind(X, scale(minimetadata[[v]]))
      }
    }
    rm(minimetadata)
  }
  # Dimension reduction
  bfa.res <- scBFA::scBFA(scData = as.matrix(sobj@assays[[assay.name]]@counts[sobj@assays[[assay.name]]@var.features,]), numFactors = dim.max, X = X)
  dimnames(bfa.res$ZZ) <- list(colnames(sobj@assays[[assay.name]]@counts), paste0('SCBFA_', 1L:dim.max))
  dimnames(bfa.res$AA) <- list(sobj@assays[[assay.name]]@var.features, paste0('SCBFA_', 1L:dim.max))
  sobj@reductions[[dimred.name]] <- Seurat::CreateDimReducObject(embeddings = bfa.res$ZZ, loadings = bfa.res$AA, assay = assay.name, stdev = matrixStats::colSds(bfa.res$ZZ), key = paste0(dimred.name, '_'), misc = list())
  rm(bfa.res)

} else stop("Unknown reduction method !")

Bias correction ?

The goal is to evaluate the contribution of the known putative bias sources to the loadings of the dimension reduction through correlation.

Be careful : run the chunk in a single shot, due to plot structure.

# Parameters
meta.names = c('nCount_RNA', 'nFeature_RNA', 'percent_mt', 'percent_rb', "Cyclone.S.Score", "Cyclone.G1.Score", "Cyclone.G2M.Score", "Cyclone.SmG2M.Score", "Seurat.S.Score", "Seurat.G1.Score", "Seurat.G2M.Score", "Seurat.SmG2M.Score") # selected in colnames(sobj@meta.data)
eval.markers = c('GAPDH') # GAPDH used as a mock, you should use genes known to be markers of your cell subpopulations (when you know it).

# Draw an empty plot at the right scale
suppressWarnings(plot(0, 0, log = 'x', xlim = c(1L, dim.max), ylim = c(0,1), type = "n", xlab = paste0(dimred.name, " dimensions"), ylab = "spearman correlation", xaxs = "i", yaxs = "i", cex.axis = 0.7))

# meta.names correlations: calculation and plotting 
meta.names <- meta.names[meta.names %in% colnames(sobj@meta.data)]
meta.cols <- if (length(meta.names) > 12) scales::hue_pal()(length(meta.names)) else RColorBrewer::brewer.pal(length(meta.names), name = "Paired")
for(myf in seq_along(meta.names)) {
  corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], sobj[[meta.names[myf]]], method = "spearman"))) }, .1)
  lines(corvec, type = "l", ylim = c(0,1), col = meta.cols[myf], lwd = 2)
}
# eval.markers correlations: calculation and plotting
if (!is.null(eval.markers)) {
  eval.markers <- sort(eval.markers[eval.markers %in% rownames(sobj@assays[[assay.name]]@data)])
  mydata <- slot(sobj@assays[[assay.name]], "data")
  for(mym in seq_along(eval.markers)) {
    corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], mydata[rownames(mydata) == eval.markers[mym],], method = "spearman"))) }, .1)
    lines(corvec, type = "l", ylim = c(0,1), col = 1, lwd = 2, lty = mym+1L)
  }
}
# Adding legend
legend("topright", legend = c(meta.names, eval.markers), col = c(meta.cols, rep(1, length(eval.markers))), lty = c(rep(1, length(meta.names)), seq_along(eval.markers)), lwd = 2, xjust = 1, yjust = 1, cex=0.7)

# Clean
rm(meta.names, eval.markers, meta.cols, corvec)

How to choose the right amount of reduction dimensions to keep ?

Strategy 1 : the elbow plot

Seurat::ElbowPlot(sobj, ndims = dim.max, reduction = dimred.name)

Strategy 2 : algorithm method (Jackstraw: only LogNormalize follow by PCA is currently supported!)

JackStraw randomly permutes a subset of the data, then calculates projected PCA scores for these randomly selected genes. It then compares the PCA scores obtained for this random selection with the observed PCA scores from actual data, in order to determine their statistical significance. End result is a p-value for each association of a gene with a principal component.

if (norm.method == "LogNormalize" & dimred.method == "pca") {
  # Create a temporary reduction named "pca" (mandatory for JackStraw)
  sobj@reductions$pca <- sobj@reductions$RNApca
  # Compute JackStraw
  sobj <- Seurat::JackStraw(sobj, assay = assay.name,   reduction = "pca", dims = dim.max, num.replicate = dim.max)
  sobj <- Seurat::ScoreJackStraw(sobj, dims = 1:dim.max)
  Seurat::JackStrawPlot(sobj, dims = 1:dim.max, reduction = dimred.name)
  # Clean the temporary reduction
  sobj@reductions$pca <- NULL
  invisible(gc())
}

Strategy 3 : Genes contribution

The aim is to observe the contribution of genes to each component, and select dimensions that include variable genes (and even better : genes of interest expected to differ between expected cell types).

# Genes of interest?
print(sobj[[dimred.name]], dims = 1:dim.max, nfeatures = 15)
# Good separation according to genes?
for (i in 1:dim.max){
  Seurat::DimHeatmap(sobj, dims = i, nfeatures = 15, cells = ncol(sobj), reduction = dimred.name, balanced = TRUE)
}

Strategy 4 : Evaluate and compare effect on clustering (using Louvain)

The aim is to generate several combinations of the number of selected dimensions with Louvain clustering at different levels of resolution, to cover a large (but humanly bearable) chunk of the possible cases. We will make a double loop thanks to foreach package: one on dimensions, and another on resolutions. Actually, we will look at all possible uMAPs and assess the relationship between them (change in cell distribution)

# Parameters
eval.dim.max <- 50
eval.dim.min <- 3
eval.dim.steps <- 2
eval.res.max <- 1.2
eval.res.min <- 0.1
eval.res.steps <- 0.1

# Check and set automatic parameters
if(eval.dim.max > dim.max) stop("eval.dim.max can't be greater than dim.max!")
dimsvec <- seq.int(eval.dim.min, eval.dim.max, eval.dim.steps)
resvec <- seq(eval.res.min,eval.res.max,eval.res.steps)
`%dims.do%` <- if (BiocParallel::bpworkers(cl) > 1) foreach::"%dopar%" else foreach::"%do%"
`%res.do%` <- foreach::"%do%"

# Building a minimal Seurat object (for memory sake)
mini_sobj <- sobj
mini_sobj@assays[names(mini_sobj@assays)[names(mini_sobj@assays) != assay.name]] <- NULL #removing other assays
mini_sobj@assays[[assay.name]]@scale.data <- matrix(nrow = 0, ncol = 0) #removing scale.data slot
mini_sobj@assays[[assay.name]]@misc <- list() #removing misc
mini_sobj@reductions[names(mini_sobj@reductions)[names(mini_sobj@reductions) != dimred.name]] <- NULL #removing other reductions
suppressPackageStartupMessages(require(Matrix))
mini_sobj@assays[[assay.name]]@counts <- new("dgCMatrix") #slimming other slots

# Loop on dimension (dimsvec)
results.all <- foreach::foreach(the.dim = dimsvec, .combine = "cbind", .inorder = FALSE, .errorhandling = "stop", .noexport = objects(), .packages = c("Seurat", "ggplot2")) %dims.do% {
  print(paste0(" >>>>>> Testing dimensions 1 to ", the.dim))
  
  ## Compute umap
  umap.name <- paste0(dimred.name, the.dim,'dims_umap')
  mini_sobj <- Seurat::RunUMAP(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name, seed.use = the.seed, reduction.name = umap.name)
  ## Find Neighbors
  mini_sobj <- Seurat::FindNeighbors(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name)

  ## Loop on resolutions (resvec)
  res.results <- foreach::foreach(the.res = resvec, .inorder = FALSE, .errorhandling = "stop", .noexport = objects()) %res.do% {
    print(paste0(">>> Testing resolution ", the.res))
    
    ### Compute clustering
    mini_sobj <- Seurat::FindClusters(object = mini_sobj, random.seed = the.seed, resolution = the.res)
    
    ### Compute umap
    resdim.plot <- Seurat::LabelClusters(plot = Seurat::DimPlot(object = mini_sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle(paste0(dimred.name, " dims =  ", the.dim, "; res = ", the.res)) + Seurat::DarkTheme(), id = "ident", size = 3, repel = FALSE, color = "white", fontface = "bold")

    ### Return results (umap + clustering)
    return(list(resplot = resdim.plot, clusters = mini_sobj$seurat_clusters))
  }

  ## Decomposing res.results
  plots.results <- unlist(res.results, recursive = FALSE)[seq.int(1, length(res.results)*2, 2)] #get all plots
  dims.res <- as.data.frame(unlist(res.results, recursive = FALSE)[seq.int(2, length(res.results)*2, 2)]) #get all clusterings
  colnames(dims.res) <- c(paste0("clusters_", dimred.name, the.dim, "_res", resvec)) #with names begining by dimensions
  res.dims <- dims.res #get all clusterings
  colnames(res.dims) <- c(paste0("clusters_res", resvec, "_", dimred.name, the.dim)) #with names begining by resolutions
  
  ## Cleaning
  rm(res.results)
  mini_sobj@graphs <- list()
  invisible(gc())
  
  ## Return results (umap + clustering with names begining by dimensions + clustering with names begining by resolutions)
  return(list(plots.results.all = plots.results, dims.res.all = dims.res, res.dims.all = res.dims))
}

## Decomposing results.all
plots.results <- unlist(results.all["plots.results.all",], recursive = FALSE) #get all plots
dims.res.all <- as.data.frame(unlist(results.all["dims.res.all",], recursive = FALSE)) #get all clusterings
colnames(dims.res.all) <- sub("result\\.[0-9]+\\.", "", colnames(dims.res.all))
res.dims.all <- as.data.frame(unlist(results.all["res.dims.all",], recursive = FALSE)) #get all clusterings
colnames(res.dims.all) <- sub("result\\.[0-9]+\\.", "", colnames(res.dims.all))

## Print all umap for each dimensions range (all resolutions)
for (id.dim in seq(1,length(dimsvec)*length(resvec),length(resvec))) {
  to_plot <- plots.results[id.dim:(id.dim+length(resvec)-1)]
  print(patchwork::wrap_plots(to_plot) + patchwork::plot_layout(ncol = 2))
}
## Make clustree for all res of each dimensions range
for (the.dim in dimsvec) {
  if (length(resvec) > 1) {
    cres <- clustree::clustree(dims.res.all, prefix = paste0("clusters_", dimred.name, the.dim, "_res"))
    print(cres + ggplot2::ggtitle(paste0("dims = ", the.dim)))
  }
}
# Make clustree for all dimensions at each resolution
for (the.res in resvec) {
  if (length(dimsvec) > 1) {
    cres <- clustree::clustree(res.dims.all, prefix = paste0("clusters_res", the.res, "_", dimred.name))
    print(cres + ggplot2::ggtitle(paste0("res = ", the.res)))
  }
}

# Cleaning
rm(mini_sobj, results.all, plots.results, dims.res.all, res.dims.all, dimsvec, resvec, to_plot, cres)
invisible(gc())

Final clustering at selected parameters

# Parameters
chosen.dims <- 19
chosen.resolution <- 1.0

# Find Neighbors
sobj <- Seurat::FindNeighbors(object = sobj, assay = assay.name, reduction = dimred.name, compute.SNN = TRUE, dims = 1L:chosen.dims)

# Find clusters
sobj <- Seurat::FindClusters(object = sobj, resolution = chosen.resolution, random.seed = the.seed, algorithm = 1)

# Run UMAP
umap.name <- paste0(dimred.name, chosen.dims, 'dims_umap')
sobj <- Seurat::RunUMAP(object = sobj, assay = assay.name, dims = 1L:chosen.dims, reduction = dimred.name, reduction.name = umap.name, seed.use = the.seed)

# Print clusters
Seurat::LabelClusters(plot = Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle("Louvain clusters)") + Seurat::DarkTheme(), id = "ident", size = 4, repel = FALSE, color = "white", fontface = "bold")
print("Number of cells for each cluster:")
table(Seurat::Idents(sobj))

Checking bias on final clustering

Clustering seems to be linked to a known source of bias ? If so, you probably should correct it.

# Metrics
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_mt", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_rb", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nCount_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nFeature_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()

# Cell cycle (cyclone)
Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Cyclone.Phase")  + ggplot2::ggtitle("Cell Phase (cyclone)") + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.S.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G1.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()

# Cell cycle (Seurat)
Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Seurat.Phase")  + ggplot2::ggtitle("Cell Phase (Seurat)") + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = c("Seurat.S.Score"), cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()

Differential Gene Expression Analyses: Marker genes

The goal is to find marker genes specific of each cluster. If there is no markers for a cluster, maybe it is very close to another cell type, or maybe it is not a cell type (noise).

sobj.markers <- Seurat::FindAllMarkers(sobj, only.pos = TRUE, min.pct = 0.75, logfc.threshold = 0.25, test.use = "wilcox")
head(sobj.markers, n = 5)
top.markers <- sobj.markers %>% group_by(cluster) %>% top_n(n = 6, wt = avg_log2FC)
for (clust_id in unique(sort(top.markers$cluster))){
  genes_clust_id <- top.markers[as.character(top.markers$cluster) == clust_id, "gene"]$gene
  print(Seurat::VlnPlot(sobj, features = genes_clust_id) + plot_annotation(title = paste0('Top Markers for cluster ',clust_id)))
  print(Seurat::FeaturePlot(sobj, reduction = umap.name, features = genes_clust_id) + plot_annotation(title = paste0('Top Markers for cluster ',clust_id)))
}
Seurat::DoHeatmap(sobj, features = top.markers$gene)

Plot some genes on umap

Markers Cell Type
IL7R, CCR7 Naive CD4+ T
CD14, LYZ CD14+ Mono
IL7R, S100A4 Memory CD4+
MS4A1 B
CD8A CD8+ T
FCGR3A, MS4A7 FCGR3A+ Mono
GNLY, NKG7 NK
FCER1A, CST3 DC
PPBP Platelet
Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("IL7R", "CCR7","CD14", "LYZ","IL7R", "S100A4"))
Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("MS4A1","CD8A","FCGR3A", "MS4A7","GNLY", "NKG7"))
Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("FCER1A", "CST3","PPBP"))

Save

# Save parameters
sobj@misc$params$NDRC$seed <- the.seed
sobj@misc$params$NDRC$normalization <- list(method = norm.method, features.used = features.n)
sobj@misc$params$NDRC$reductions <- list(method = dimred.method, dim.max = dim.max)
sobj@misc$params$NDRC$vtr.biases <- if(is.null(vtr.biases)) NA else vtr.biases
sobj@misc$params$NDRC$clustering <- list(method = "louvain", umap = umap.name, dimensions = chosen.dims, resolution = chosen.resolution)

# Save R session
sobj@misc$params$NDRC$Rsession <- utils::capture.output(devtools::session_info())

# Save seurat object
dir.create(output.dir, recursive = TRUE)
save(sobj, file = paste0(output.dir, paste(c(sample.name, 'without', norm.method, dimred.method, if(!is.null(vtr.biases)) vtr.biases), collapse = '_'), '.rda'), compress = "bzip2")
# Print R session info
utils::capture.output(devtools::session_info())