Statistical_models_and

# Save dimension reduction name
dimred.name <- paste0(assay.name, dimred.method)

if (dimred.method == 'pca') {
  # Dimension reduction
  sobj <- Seurat::RunPCA(object = sobj, verbose = FALSE, npcs = dim.max, reduction.name = dimred.name, reduction.key = paste0(dimred.name, '_'), seed.use = the.seed)
  
} else if (dimred.method == 'scbfa') {
  # Formatting covariates matrix named X
  if (is.null(vtr.biases)) {
    X <- NULL
  }else{
    minimetadata <- sobj@meta.data[,colnames(sobj@meta.data) %in% vtr.biases, drop = FALSE] #drop = FALSE to keep minimetadata as a dataframe
    X <- matrix(ncol = 0, nrow = nrow(sobj@meta.data)) # make an empty data.frame which will be filled afterwards
    for(v in vtr.biases) {
      if(is.character(minimetadata[[v]])) {
        minimetadata[[v]] <- as.factor(minimetadata[[v]])
        print(paste0("Converting '", v, "' factor into model matrix, adding it to the regression..."))
        mm <- model.matrix(~minimetadata[[v]])[,-1, drop = FALSE]
        X <- cbind(X, mm)
      } else {
        print(paste0("Adding '", v, "' covariate (scaled) to the regression ..."))
        X <- cbind(X, scale(minimetadata[[v]]))
      }
    }
    rm(minimetadata)
  }
  # Dimension reduction
  bfa.res <- scBFA::scBFA(scData = as.matrix(sobj@assays[[assay.name]]@counts[sobj@assays[[assay.name]]@var.features,]), numFactors = dim.max, X = X)
  dimnames(bfa.res$ZZ) <- list(colnames(sobj@assays[[assay.name]]@counts), paste0('SCBFA_', 1L:dim.max))
  dimnames(bfa.res$AA) <- list(sobj@assays[[assay.name]]@var.features, paste0('SCBFA_', 1L:dim.max))
  sobj@reductions[[dimred.name]] <- Seurat::CreateDimReducObject(embeddings = bfa.res$ZZ, loadings = bfa.res$AA, assay = assay.name, stdev = matrixStats::colSds(bfa.res$ZZ), key = paste0(dimred.name, '_'), misc = list())
  rm(bfa.res)

} else stop("Unknown reduction method !")

Bias correction ?

The goal is to evaluate the contribution of the known putative bias sources to the loadings of the dimension reduction through correlation.

# Parameters
meta.names = c('nCount_RNA', 'nFeature_RNA', 'percent_mt', 'percent_rb', "Cyclone.S.Score", "Cyclone.G1.Score", "Cyclone.G2M.Score", "Cyclone.SmG2M.Score", "Seurat.S.Score", "Seurat.G1.Score", "Seurat.G2M.Score", "Seurat.SmG2M.Score") # selected in colnames(sobj@meta.data)
eval.markers = c('GAPDH') # GAPDH used as a mock, you should use genes known to be markers of your cell subpopulations (when you know it).

# Draw an empty plot at the right scale
suppressWarnings(plot(0, 0, log = 'x', xlim = c(1L, dim.max), ylim = c(0,1), type = "n", xlab = paste0(dimred.name, " dimensions"), ylab = "spearman correlation", xaxs = "i", yaxs = "i", cex.axis = 0.7))

# meta.names correlations: calculation and plotting 
meta.names <- meta.names[meta.names %in% colnames(sobj@meta.data)]
meta.cols <- if (length(meta.names) > 12) scales::hue_pal()(length(meta.names)) else RColorBrewer::brewer.pal(length(meta.names), name = "Paired")
for(myf in seq_along(meta.names)) {
  corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], sobj[[meta.names[myf]]], method = "spearman"))) }, .1)
  lines(corvec, type = "l", ylim = c(0,1), col = meta.cols[myf], lwd = 2)
}
# eval.markers correlations: calculation and plotting
if (!is.null(eval.markers)) {
  eval.markers <- sort(eval.markers[eval.markers %in% rownames(sobj@assays[[assay.name]]@data)])
  mydata <- slot(sobj@assays[[assay.name]], "data")
  for(mym in seq_along(eval.markers)) {
    corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], mydata[rownames(mydata) == eval.markers[mym],], method = "spearman"))) }, .1)
    lines(corvec, type = "l", ylim = c(0,1), col = 1, lwd = 2, lty = mym+1L)
  }
}
# Adding legend
legend("topright", legend = c(meta.names, eval.markers), col = c(meta.cols, rep(1, length(eval.markers))), lty = c(rep(1, length(meta.names)), seq_along(eval.markers)), lwd = 2, xjust = 1, yjust = 1, cex=0.7)

# Clean
rm(meta.names, eval.markers, meta.cols, corvec)

How to choose the right amount of reduction dimensions to keep ?

Strategy 1 : the elbow plot

Strategy 2 : algorithm method (Jackstraw: only LogNormalize follow by PCA is currently supported!)

JackStraw randomly permutes a subset of the data, then calculates projected PCA scores for these randomly selected genes. It then compares the PCA scores obtained for this random selection with the observed PCA scores from actual data, in order to determine their statistical significance. End result is a p-value for each association of a gene with a principal component.

Strategy 3 : Genes contribution

The aim is to observe the contribution of genes to each component, and select dimensions that include variable genes (and even better : genes of interest expected to differ between expected cell types).

Strategy 4 : Evaluate and compare effect on clustering (using Louvain)

The aim is to generate several combinations of the number of selected dimensions with Louvain clustering at different levels of resolution, to cover a large (but humanly bearable) chunk of the possible cases. We will make a double loop thanks to foreach package: one on dimensions, and another on resolutions. Actually, we will look at all possible uMAPs and assess the relationship between them (change in cell distribution)

# Parameters
eval.dim.max <- 50
eval.dim.min <- 3
eval.dim.steps <- 2
eval.res.max <- 1.2
eval.res.min <- 0.1
eval.res.steps <- 0.1

# Check and set automatic parameters
if(eval.dim.max > dim.max) stop("eval.dim.max can't be greater than dim.max!")
dimsvec <- seq.int(eval.dim.min, eval.dim.max, eval.dim.steps)
resvec <- seq(eval.res.min,eval.res.max,eval.res.steps)
`%dims.do%` <- if (BiocParallel::bpworkers(cl) > 1) foreach::"%dopar%" else foreach::"%do%"
`%res.do%` <- foreach::"%do%"

# Building a minimal Seurat object (for memory sake)
mini_sobj <- sobj
mini_sobj@assays[names(mini_sobj@assays)[names(mini_sobj@assays) != assay.name]] <- NULL #removing other assays
mini_sobj@assays[[assay.name]]@scale.data <- matrix(nrow = 0, ncol = 0) #removing scale.data slot
mini_sobj@assays[[assay.name]]@misc <- list() #removing misc
mini_sobj@reductions[names(mini_sobj@reductions)[names(mini_sobj@reductions) != dimred.name]] <- NULL #removing other reductions
suppressPackageStartupMessages(require(Matrix))
mini_sobj@assays[[assay.name]]@counts <- new("dgCMatrix") #slimming other slots

# Loop on dimension (dimsvec)
results.all <- foreach::foreach(the.dim = dimsvec, .combine = "cbind", .inorder = FALSE, .errorhandling = "stop", .noexport = objects(), .packages = c("Seurat", "ggplot2")) %dims.do% {
  print(paste0(" >>>>>> Testing dimensions 1 to ", the.dim))
  
  ## Compute umap
  umap.name <- paste0(dimred.name, the.dim,'dims_umap')
  mini_sobj <- Seurat::RunUMAP(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name, seed.use = the.seed, reduction.name = umap.name)
  ## Find Neighbors
  mini_sobj <- Seurat::FindNeighbors(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name)

  ## Loop on resolutions (resvec)
  res.results <- foreach::foreach(the.res = resvec, .inorder = FALSE, .errorhandling = "stop", .noexport = objects()) %res.do% {
    print(paste0(">>> Testing resolution ", the.res))
    
    ### Compute clustering
    mini_sobj <- Seurat::FindClusters(object = mini_sobj, random.seed = the.seed, resolution = the.res)
    
    ### Compute umap
    resdim.plot <- Seurat::LabelClusters(plot = Seurat::DimPlot(object = mini_sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle(paste0(dimred.name, " dims =  ", the.dim, "; res = ", the.res)) + Seurat::DarkTheme(), id = "ident", size = 3, repel = FALSE, color = "white", fontface = "bold")

    ### Return results (umap + clustering)
    return(list(resplot = resdim.plot, clusters = mini_sobj$seurat_clusters))
  }

  ## Decomposing res.results
  plots.results <- unlist(res.results, recursive = FALSE)[seq.int(1, length(res.results)*2, 2)] #get all plots
  dims.res <- as.data.frame(unlist(res.results, recursive = FALSE)[seq.int(2, length(res.results)*2, 2)]) #get all clusterings
  colnames(dims.res) <- c(paste0("clusters_", dimred.name, the.dim, "_res", resvec)) #with names begining by dimensions
  res.dims <- dims.res #get all clusterings
  colnames(res.dims) <- c(paste0("clusters_res", resvec, "_", dimred.name, the.dim)) #with names begining by resolutions
  
  ## Cleaning
  rm(res.results)
  mini_sobj@graphs <- list()
  invisible(gc())
  
  ## Return results (umap + clustering with names begining by dimensions + clustering with names begining by resolutions)
  return(list(plots.results.all = plots.results, dims.res.all = dims.res, res.dims.all = res.dims))
}

## Decomposing results.all
plots.results <- unlist(results.all["plots.results.all",], recursive = FALSE) #get all plots
dims.res.all <- as.data.frame(unlist(results.all["dims.res.all",], recursive = FALSE)) #get all clusterings
colnames(dims.res.all) <- sub("result\\.[0-9]+\\.", "", colnames(dims.res.all))
res.dims.all <- as.data.frame(unlist(results.all["res.dims.all",], recursive = FALSE)) #get all clusterings
colnames(res.dims.all) <- sub("result\\.[0-9]+\\.", "", colnames(res.dims.all))

## Print all umap for each dimensions range (all resolutions)
for (id.dim in seq(1,length(dimsvec)*length(resvec),length(resvec))) {
  to_plot <- plots.results[id.dim:(id.dim+length(resvec)-1)]
  print(patchwork::wrap_plots(to_plot) + patchwork::plot_layout(ncol = 2))
}
## Make clustree for all res of each dimensions range
for (the.dim in dimsvec) {
  if (length(resvec) > 1) {
    cres <- clustree::clustree(dims.res.all, prefix = paste0("clusters_", dimred.name, the.dim, "_res"))
    print(cres + ggplot2::ggtitle(paste0("dims = ", the.dim)))
  }
}
# Make clustree for all dimensions at each resolution
for (the.res in resvec) {
  if (length(dimsvec) > 1) {
    cres <- clustree::clustree(res.dims.all, prefix = paste0("clusters_res", the.res, "_", dimred.name))
    print(cres + ggplot2::ggtitle(paste0("res = ", the.res)))
  }
}

# Cleaning
rm(mini_sobj, results.all, plots.results, dims.res.all, res.dims.all, dimsvec, resvec, to_plot, cres)
invisible(gc())

Final clustering at selected parameters

# Parameters
chosen.dims <- 19
chosen.resolution <- 1.0

# Find Neighbors
sobj <- Seurat::FindNeighbors(object = sobj, assay = assay.name, reduction = dimred.name, compute.SNN = TRUE, dims = 1L:chosen.dims)

# Find clusters
sobj <- Seurat::FindClusters(object = sobj, resolution = chosen.resolution, random.seed = the.seed, algorithm = 1)

# Run UMAP
umap.name <- paste0(dimred.name, chosen.dims, 'dims_umap')
sobj <- Seurat::RunUMAP(object = sobj, assay = assay.name, dims = 1L:chosen.dims, reduction = dimred.name, reduction.name = umap.name, seed.use = the.seed)

# Print clusters
Seurat::LabelClusters(plot = Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle("Louvain clusters)") + Seurat::DarkTheme(), id = "ident", size = 4, repel = FALSE, color = "white", fontface = "bold")
print("Number of cells for each cluster:")
table(Seurat::Idents(sobj))

Checking bias on final clustering

Clustering seems to be linked to a known source of bias ? If so, you probably should correct it.

# Metrics
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_mt", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_rb", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nCount_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nFeature_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()

# Cell cycle (cyclone)
Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Cyclone.Phase")  + ggplot2::ggtitle("Cell Phase (cyclone)") + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.S.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G1.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()

# Cell cycle (Seurat)
Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Seurat.Phase")  + ggplot2::ggtitle("Cell Phase (Seurat)") + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = c("Seurat.S.Score"), cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()

Differential Gene Expression Analyses: Marker genes

The goal is to find marker genes specific of each cluster. If there is no markers for a cluster, maybe it is very close to another cell type, or maybe it is not a cell type (noise).

Plot some genes on umap

Save

Markers	Cell Type
IL7R, CCR7	Naive CD4+ T
CD14, LYZ	CD14+ Mono
IL7R, S100A4	Memory CD4+
MS4A1	B
CD8A	CD8+ T
FCGR3A, MS4A7	FCGR3A+ Mono
GNLY, NKG7	NK
FCER1A, CST3	DC
PPBP	Platelet

# Save parameters
sobj@misc$params$NDRC$seed <- the.seed
sobj@misc$params$NDRC$normalization <- list(method = norm.method, features.used = features.n)
sobj@misc$params$NDRC$reductions <- list(method = dimred.method, dim.max = dim.max)
sobj@misc$params$NDRC$vtr.biases <- if(is.null(vtr.biases)) NA else vtr.biases
sobj@misc$params$NDRC$clustering <- list(method = "louvain", umap = umap.name, dimensions = chosen.dims, resolution = chosen.resolution)

# Save R session
sobj@misc$params$NDRC$Rsession <- utils::capture.output(devtools::session_info())

# Save seurat object
dir.create(output.dir, recursive = TRUE)
save(sobj, file = paste0(output.dir, paste(c(sample.name, 'without', norm.method, dimred.method, if(!is.null(vtr.biases)) vtr.biases), collapse = '_'), '.rda'), compress = "bzip2")

html_document: default

Initializing the practical session

Setting Dataset

Setting Paramaters

Setting seed and parallel instance up, loading R packages

Loading data

Normalization

Check variable genes

Dimension reduction