title: "SinCellTE 2022: Statistical models and analyses" output:
user.id <- Sys.getenv('USER')
input.rda <-paste0("/shared/projects/sincellte_2022/", user.id, "/Statistical_models_and_analyses/input/PBMC_CTRL_GE_FILTERED_without.rda")
output.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Statistical_models_and_analyses/finalresult/")
# Computational Parameters
nthreads <- 2
the.seed <- 1337L
# Analysis Parameters
features.n <- 3000
norm.method <- 'LogNormalize' # SCTransform or LogNormalize
dimred.method <- 'scbfa' # pca or scbfa
vtr.biases <- NULL #c("percent_mt", "percent_rb") # variables to regress (for SCTransform or scbfa)
vtr.scale <- FALSE # recommended to TRUE with scbfa if !is.null(vtr.biases)
dim.max <- 50
set.seed(the.seed)
doParallel::registerDoParallel(nthreads)
cl <- BiocParallel::DoparParam()
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggraph))
suppressPackageStartupMessages(library(patchwork))
suppressPackageStartupMessages(library(dplyr))
load(input.rda)
sample.name <- sobj@misc$params$sample.name
print(paste0("There are ", ncol(sobj), " cells and ", nrow(sobj), " genes."))
# Normalization
if (norm.method == "SCTransform") {
sobj <- suppressWarnings(Seurat::SCTransform(object = sobj, assay = "RNA", seed.use = the.seed, variable.features.n = features.n, vars.to.regress = vtr.biases, return.only.var.genes = TRUE))
} else if (norm.method == "LogNormalize") {
sobj <- Seurat::NormalizeData(sobj, normalization.method = 'LogNormalize', assay = "RNA")
sobj <- Seurat::FindVariableFeatures(sobj, assay = "RNA", nfeatures = features.n)
sobj <- Seurat::ScaleData(sobj, assay = "RNA", vars.to.regress = vtr.biases)
} else stop("Unknown normalization method !")
# Save assay name
assay.name <- Seurat::DefaultAssay(sobj)
# Highly Variable Genes (HVG)
HVG <- Seurat::VariableFeatures(sobj)
# Number of HVG
print(paste0("There are ", length(HVG), " highly variable genes (features.n = ", features.n,")."))
# Identify the top 20 HVGs
top20 <- head(HVG, 20)
top20
# Quantify mitochondrial genes in HVGs:
nb_mito_in_HVG <- length(grep("^MT-", HVG, value = TRUE))
print(paste0("There are ", nb_mito_in_HVG, " mitochondrial genes in the highly variable genes (", round(x = (nb_mito_in_HVG/length(HVG)*100), 2), "%)."))
# Plot variable features with and without labels
HVG_plot <- Seurat::VariableFeaturePlot(sobj, assay = assay.name,selection.method = "vst")
Seurat::LabelPoints(plot = HVG_plot, points = top20, repel = TRUE)
# Clean
rm(HVG, HVG_plot, top20, nb_mito_in_HVG)
Note: scbfa take a while (~40-50min).
# Save dimension reduction name
dimred.name <- paste0(assay.name, dimred.method)
if (dimred.method == 'pca') {
# Dimension reduction
sobj <- Seurat::RunPCA(object = sobj, verbose = FALSE, npcs = dim.max, reduction.name = dimred.name, reduction.key = paste0(dimred.name, '_'), seed.use = the.seed)
} else if (dimred.method == 'scbfa') {
# Formatting covariates matrix named X
if (is.null(vtr.biases)) {
X <- NULL
}else{
minimetadata <- sobj@meta.data[,colnames(sobj@meta.data) %in% vtr.biases, drop = FALSE] #drop = FALSE to keep minimetadata as a dataframe
X <- matrix(ncol = 0, nrow = nrow(sobj@meta.data)) # make an empty data.frame which will be filled afterwards
for(v in vtr.biases) {
if(is.character(minimetadata[[v]])) {
minimetadata[[v]] <- as.factor(minimetadata[[v]])
print(paste0("Converting '", v, "' factor into model matrix, adding it to the regression..."))
mm <- model.matrix(~minimetadata[[v]])[,-1, drop = FALSE]
X <- cbind(X, mm)
} else {
print(paste0("Adding '", v, "' covariate (scaled) to the regression ..."))
X <- cbind(X, scale(minimetadata[[v]]))
}
}
rm(minimetadata)
}
# Dimension reduction
bfa.res <- scBFA::scBFA(scData = as.matrix(sobj@assays[[assay.name]]@counts[sobj@assays[[assay.name]]@var.features,]), numFactors = dim.max, X = X)
dimnames(bfa.res$ZZ) <- list(colnames(sobj@assays[[assay.name]]@counts), paste0('SCBFA_', 1L:dim.max))
dimnames(bfa.res$AA) <- list(sobj@assays[[assay.name]]@var.features, paste0('SCBFA_', 1L:dim.max))
sobj@reductions[[dimred.name]] <- Seurat::CreateDimReducObject(embeddings = bfa.res$ZZ, loadings = bfa.res$AA, assay = assay.name, stdev = matrixStats::colSds(bfa.res$ZZ), key = paste0(dimred.name, '_'), misc = list())
rm(bfa.res)
} else stop("Unknown reduction method !")
The goal is to evaluate the contribution of the known putative bias sources to the loadings of the dimension reduction through correlation.
Be careful : run the chunk in a single shot, due to plot structure.
# Parameters
meta.names = c('nCount_RNA', 'nFeature_RNA', 'percent_mt', 'percent_rb', "Cyclone.S.Score", "Cyclone.G1.Score", "Cyclone.G2M.Score", "Cyclone.SmG2M.Score", "Seurat.S.Score", "Seurat.G1.Score", "Seurat.G2M.Score", "Seurat.SmG2M.Score") # selected in colnames(sobj@meta.data)
eval.markers = c('GAPDH') # GAPDH used as a mock, you should use genes known to be markers of your cell subpopulations (when you know it).
# Draw an empty plot at the right scale
suppressWarnings(plot(0, 0, log = 'x', xlim = c(1L, dim.max), ylim = c(0,1), type = "n", xlab = paste0(dimred.name, " dimensions"), ylab = "spearman correlation", xaxs = "i", yaxs = "i", cex.axis = 0.7))
# meta.names correlations: calculation and plotting
meta.names <- meta.names[meta.names %in% colnames(sobj@meta.data)]
meta.cols <- if (length(meta.names) > 12) scales::hue_pal()(length(meta.names)) else RColorBrewer::brewer.pal(length(meta.names), name = "Paired")
for(myf in seq_along(meta.names)) {
corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], sobj[[meta.names[myf]]], method = "spearman"))) }, .1)
lines(corvec, type = "l", ylim = c(0,1), col = meta.cols[myf], lwd = 2)
}
# eval.markers correlations: calculation and plotting
if (!is.null(eval.markers)) {
eval.markers <- sort(eval.markers[eval.markers %in% rownames(sobj@assays[[assay.name]]@data)])
mydata <- slot(sobj@assays[[assay.name]], "data")
for(mym in seq_along(eval.markers)) {
corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], mydata[rownames(mydata) == eval.markers[mym],], method = "spearman"))) }, .1)
lines(corvec, type = "l", ylim = c(0,1), col = 1, lwd = 2, lty = mym+1L)
}
}
# Adding legend
legend("topright", legend = c(meta.names, eval.markers), col = c(meta.cols, rep(1, length(eval.markers))), lty = c(rep(1, length(meta.names)), seq_along(eval.markers)), lwd = 2, xjust = 1, yjust = 1, cex=0.7)
# Clean
rm(meta.names, eval.markers, meta.cols, corvec)
Seurat::ElbowPlot(sobj, ndims = dim.max, reduction = dimred.name)
JackStraw randomly permutes a subset of the data, then calculates projected PCA scores for these randomly selected genes. It then compares the PCA scores obtained for this random selection with the observed PCA scores from actual data, in order to determine their statistical significance. End result is a p-value for each association of a gene with a principal component.
if (norm.method == "LogNormalize" & dimred.method == "pca") {
# Create a temporary reduction named "pca" (mandatory for JackStraw)
sobj@reductions$pca <- sobj@reductions$RNApca
# Compute JackStraw
sobj <- Seurat::JackStraw(sobj, assay = assay.name, reduction = "pca", dims = dim.max, num.replicate = dim.max)
sobj <- Seurat::ScoreJackStraw(sobj, dims = 1:dim.max)
Seurat::JackStrawPlot(sobj, dims = 1:dim.max, reduction = dimred.name)
# Clean the temporary reduction
sobj@reductions$pca <- NULL
invisible(gc())
}
The aim is to observe the contribution of genes to each component, and select dimensions that include variable genes (and even better : genes of interest expected to differ between expected cell types).
# Genes of interest?
print(sobj[[dimred.name]], dims = 1:dim.max, nfeatures = 15)
# Good separation according to genes?
for (i in 1:dim.max){
Seurat::DimHeatmap(sobj, dims = i, nfeatures = 15, cells = ncol(sobj), reduction = dimred.name, balanced = TRUE)
}
The aim is to generate several combinations of the number of selected dimensions with Louvain clustering at different levels of resolution, to cover a large (but humanly bearable) chunk of the possible cases. We will make a double loop thanks to foreach package: one on dimensions, and another on resolutions. Actually, we will look at all possible uMAPs and assess the relationship between them (change in cell distribution)
# Parameters
eval.dim.max <- 50
eval.dim.min <- 3
eval.dim.steps <- 2
eval.res.max <- 1.2
eval.res.min <- 0.1
eval.res.steps <- 0.1
# Check and set automatic parameters
if(eval.dim.max > dim.max) stop("eval.dim.max can't be greater than dim.max!")
dimsvec <- seq.int(eval.dim.min, eval.dim.max, eval.dim.steps)
resvec <- seq(eval.res.min,eval.res.max,eval.res.steps)
`%dims.do%` <- if (BiocParallel::bpworkers(cl) > 1) foreach::"%dopar%" else foreach::"%do%"
`%res.do%` <- foreach::"%do%"
# Building a minimal Seurat object (for memory sake)
mini_sobj <- sobj
mini_sobj@assays[names(mini_sobj@assays)[names(mini_sobj@assays) != assay.name]] <- NULL #removing other assays
mini_sobj@assays[[assay.name]]@scale.data <- matrix(nrow = 0, ncol = 0) #removing scale.data slot
mini_sobj@assays[[assay.name]]@misc <- list() #removing misc
mini_sobj@reductions[names(mini_sobj@reductions)[names(mini_sobj@reductions) != dimred.name]] <- NULL #removing other reductions
suppressPackageStartupMessages(require(Matrix))
mini_sobj@assays[[assay.name]]@counts <- new("dgCMatrix") #slimming other slots
# Loop on dimension (dimsvec)
results.all <- foreach::foreach(the.dim = dimsvec, .combine = "cbind", .inorder = FALSE, .errorhandling = "stop", .noexport = objects(), .packages = c("Seurat", "ggplot2")) %dims.do% {
print(paste0(" >>>>>> Testing dimensions 1 to ", the.dim))
## Compute umap
umap.name <- paste0(dimred.name, the.dim,'dims_umap')
mini_sobj <- Seurat::RunUMAP(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name, seed.use = the.seed, reduction.name = umap.name)
## Find Neighbors
mini_sobj <- Seurat::FindNeighbors(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name)
## Loop on resolutions (resvec)
res.results <- foreach::foreach(the.res = resvec, .inorder = FALSE, .errorhandling = "stop", .noexport = objects()) %res.do% {
print(paste0(">>> Testing resolution ", the.res))
### Compute clustering
mini_sobj <- Seurat::FindClusters(object = mini_sobj, random.seed = the.seed, resolution = the.res)
### Compute umap
resdim.plot <- Seurat::LabelClusters(plot = Seurat::DimPlot(object = mini_sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle(paste0(dimred.name, " dims = ", the.dim, "; res = ", the.res)) + Seurat::DarkTheme(), id = "ident", size = 3, repel = FALSE, color = "white", fontface = "bold")
### Return results (umap + clustering)
return(list(resplot = resdim.plot, clusters = mini_sobj$seurat_clusters))
}
## Decomposing res.results
plots.results <- unlist(res.results, recursive = FALSE)[seq.int(1, length(res.results)*2, 2)] #get all plots
dims.res <- as.data.frame(unlist(res.results, recursive = FALSE)[seq.int(2, length(res.results)*2, 2)]) #get all clusterings
colnames(dims.res) <- c(paste0("clusters_", dimred.name, the.dim, "_res", resvec)) #with names begining by dimensions
res.dims <- dims.res #get all clusterings
colnames(res.dims) <- c(paste0("clusters_res", resvec, "_", dimred.name, the.dim)) #with names begining by resolutions
## Cleaning
rm(res.results)
mini_sobj@graphs <- list()
invisible(gc())
## Return results (umap + clustering with names begining by dimensions + clustering with names begining by resolutions)
return(list(plots.results.all = plots.results, dims.res.all = dims.res, res.dims.all = res.dims))
}
## Decomposing results.all
plots.results <- unlist(results.all["plots.results.all",], recursive = FALSE) #get all plots
dims.res.all <- as.data.frame(unlist(results.all["dims.res.all",], recursive = FALSE)) #get all clusterings
colnames(dims.res.all) <- sub("result\\.[0-9]+\\.", "", colnames(dims.res.all))
res.dims.all <- as.data.frame(unlist(results.all["res.dims.all",], recursive = FALSE)) #get all clusterings
colnames(res.dims.all) <- sub("result\\.[0-9]+\\.", "", colnames(res.dims.all))
## Print all umap for each dimensions range (all resolutions)
for (id.dim in seq(1,length(dimsvec)*length(resvec),length(resvec))) {
to_plot <- plots.results[id.dim:(id.dim+length(resvec)-1)]
print(patchwork::wrap_plots(to_plot) + patchwork::plot_layout(ncol = 2))
}
## Make clustree for all res of each dimensions range
for (the.dim in dimsvec) {
if (length(resvec) > 1) {
cres <- clustree::clustree(dims.res.all, prefix = paste0("clusters_", dimred.name, the.dim, "_res"))
print(cres + ggplot2::ggtitle(paste0("dims = ", the.dim)))
}
}
# Make clustree for all dimensions at each resolution
for (the.res in resvec) {
if (length(dimsvec) > 1) {
cres <- clustree::clustree(res.dims.all, prefix = paste0("clusters_res", the.res, "_", dimred.name))
print(cres + ggplot2::ggtitle(paste0("res = ", the.res)))
}
}
# Cleaning
rm(mini_sobj, results.all, plots.results, dims.res.all, res.dims.all, dimsvec, resvec, to_plot, cres)
invisible(gc())
# Parameters
chosen.dims <- 19
chosen.resolution <- 1.0
# Find Neighbors
sobj <- Seurat::FindNeighbors(object = sobj, assay = assay.name, reduction = dimred.name, compute.SNN = TRUE, dims = 1L:chosen.dims)
# Find clusters
sobj <- Seurat::FindClusters(object = sobj, resolution = chosen.resolution, random.seed = the.seed, algorithm = 1)
# Run UMAP
umap.name <- paste0(dimred.name, chosen.dims, 'dims_umap')
sobj <- Seurat::RunUMAP(object = sobj, assay = assay.name, dims = 1L:chosen.dims, reduction = dimred.name, reduction.name = umap.name, seed.use = the.seed)
# Print clusters
Seurat::LabelClusters(plot = Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle("Louvain clusters)") + Seurat::DarkTheme(), id = "ident", size = 4, repel = FALSE, color = "white", fontface = "bold")
print("Number of cells for each cluster:")
table(Seurat::Idents(sobj))
Clustering seems to be linked to a known source of bias ? If so, you probably should correct it.
# Metrics
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_mt", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_rb", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nCount_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nFeature_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
# Cell cycle (cyclone)
Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Cyclone.Phase") + ggplot2::ggtitle("Cell Phase (cyclone)") + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.S.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G1.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
# Cell cycle (Seurat)
Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Seurat.Phase") + ggplot2::ggtitle("Cell Phase (Seurat)") + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = c("Seurat.S.Score"), cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme()
The goal is to find marker genes specific of each cluster. If there is no markers for a cluster, maybe it is very close to another cell type, or maybe it is not a cell type (noise).
sobj.markers <- Seurat::FindAllMarkers(sobj, only.pos = TRUE, min.pct = 0.75, logfc.threshold = 0.25, test.use = "wilcox")
head(sobj.markers, n = 5)
top.markers <- sobj.markers %>% group_by(cluster) %>% top_n(n = 6, wt = avg_log2FC)
for (clust_id in unique(sort(top.markers$cluster))){
genes_clust_id <- top.markers[as.character(top.markers$cluster) == clust_id, "gene"]$gene
print(Seurat::VlnPlot(sobj, features = genes_clust_id) + plot_annotation(title = paste0('Top Markers for cluster ',clust_id)))
print(Seurat::FeaturePlot(sobj, reduction = umap.name, features = genes_clust_id) + plot_annotation(title = paste0('Top Markers for cluster ',clust_id)))
}
Seurat::DoHeatmap(sobj, features = top.markers$gene)
Markers | Cell Type |
---|---|
IL7R, CCR7 | Naive CD4+ T |
CD14, LYZ | CD14+ Mono |
IL7R, S100A4 | Memory CD4+ |
MS4A1 | B |
CD8A | CD8+ T |
FCGR3A, MS4A7 | FCGR3A+ Mono |
GNLY, NKG7 | NK |
FCER1A, CST3 | DC |
PPBP | Platelet |
Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("IL7R", "CCR7","CD14", "LYZ","IL7R", "S100A4"))
Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("MS4A1","CD8A","FCGR3A", "MS4A7","GNLY", "NKG7"))
Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("FCER1A", "CST3","PPBP"))
# Save parameters
sobj@misc$params$NDRC$seed <- the.seed
sobj@misc$params$NDRC$normalization <- list(method = norm.method, features.used = features.n)
sobj@misc$params$NDRC$reductions <- list(method = dimred.method, dim.max = dim.max)
sobj@misc$params$NDRC$vtr.biases <- if(is.null(vtr.biases)) NA else vtr.biases
sobj@misc$params$NDRC$clustering <- list(method = "louvain", umap = umap.name, dimensions = chosen.dims, resolution = chosen.resolution)
# Save R session
sobj@misc$params$NDRC$Rsession <- utils::capture.output(devtools::session_info())
# Save seurat object
dir.create(output.dir, recursive = TRUE)
save(sobj, file = paste0(output.dir, paste(c(sample.name, 'without', norm.method, dimred.method, if(!is.null(vtr.biases)) vtr.biases), collapse = '_'), '.rda'), compress = "bzip2")
# Print R session info
utils::capture.output(devtools::session_info())