--- title: "SinCellTE 2022: Statistical models and analyses" output: html_document: default --- ## Initializing the practical session ```{r uid} user.id <- Sys.getenv('USER') ``` ## Setting Dataset ```{r dset} input.rda <-paste0("/shared/projects/sincellte_2022/", user.id, "/Statistical_models_and_analyses/input/PBMC_CTRL_GE_FILTERED_without.rda") output.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Statistical_models_and_analyses/finalresult/") ``` ## Setting Paramaters ```{r params} # Computational Parameters nthreads <- 2 the.seed <- 1337L # Analysis Parameters features.n <- 3000 norm.method <- 'LogNormalize' # SCTransform or LogNormalize dimred.method <- 'scbfa' # pca or scbfa vtr.biases <- NULL #c("percent_mt", "percent_rb") # variables to regress (for SCTransform or scbfa) vtr.scale <- FALSE # recommended to TRUE with scbfa if !is.null(vtr.biases) dim.max <- 50 ``` ## Setting seed and parallel instance up, loading R packages ```{r paral} set.seed(the.seed) doParallel::registerDoParallel(nthreads) cl <- BiocParallel::DoparParam() suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(ggraph)) suppressPackageStartupMessages(library(patchwork)) suppressPackageStartupMessages(library(dplyr)) ``` ## Loading data ```{r loading} load(input.rda) sample.name <- sobj@misc$params$sample.name print(paste0("There are ", ncol(sobj), " cells and ", nrow(sobj), " genes.")) ``` ## Normalization ```{r norm} # Normalization if (norm.method == "SCTransform") { sobj <- suppressWarnings(Seurat::SCTransform(object = sobj, assay = "RNA", seed.use = the.seed, variable.features.n = features.n, vars.to.regress = vtr.biases, return.only.var.genes = TRUE)) } else if (norm.method == "LogNormalize") { sobj <- Seurat::NormalizeData(sobj, normalization.method = 'LogNormalize', assay = "RNA") sobj <- Seurat::FindVariableFeatures(sobj, assay = "RNA", nfeatures = features.n) sobj <- Seurat::ScaleData(sobj, assay = "RNA", vars.to.regress = vtr.biases) } else stop("Unknown normalization method !") # Save assay name assay.name <- Seurat::DefaultAssay(sobj) ``` ## Check variable genes ```{r var_genes} # Highly Variable Genes (HVG) HVG <- Seurat::VariableFeatures(sobj) # Number of HVG print(paste0("There are ", length(HVG), " highly variable genes (features.n = ", features.n,").")) # Identify the top 20 HVGs top20 <- head(HVG, 20) top20 # Quantify mitochondrial genes in HVGs: nb_mito_in_HVG <- length(grep("^MT-", HVG, value = TRUE)) print(paste0("There are ", nb_mito_in_HVG, " mitochondrial genes in the highly variable genes (", round(x = (nb_mito_in_HVG/length(HVG)*100), 2), "%).")) # Plot variable features with and without labels HVG_plot <- Seurat::VariableFeaturePlot(sobj, assay = assay.name,selection.method = "vst") Seurat::LabelPoints(plot = HVG_plot, points = top20, repel = TRUE) # Clean rm(HVG, HVG_plot, top20, nb_mito_in_HVG) ``` ## Dimension reduction Note: scbfa take a while (~40-50min). ```{r dim_red} # Save dimension reduction name dimred.name <- paste0(assay.name, dimred.method) if (dimred.method == 'pca') { # Dimension reduction sobj <- Seurat::RunPCA(object = sobj, verbose = FALSE, npcs = dim.max, reduction.name = dimred.name, reduction.key = paste0(dimred.name, '_'), seed.use = the.seed) } else if (dimred.method == 'scbfa') { # Formatting covariates matrix named X if (is.null(vtr.biases)) { X <- NULL }else{ minimetadata <- sobj@meta.data[,colnames(sobj@meta.data) %in% vtr.biases, drop = FALSE] #drop = FALSE to keep minimetadata as a dataframe X <- matrix(ncol = 0, nrow = nrow(sobj@meta.data)) # make an empty data.frame which will be filled afterwards for(v in vtr.biases) { if(is.character(minimetadata[[v]])) { minimetadata[[v]] <- as.factor(minimetadata[[v]]) print(paste0("Converting '", v, "' factor into model matrix, adding it to the regression...")) mm <- model.matrix(~minimetadata[[v]])[,-1, drop = FALSE] X <- cbind(X, mm) } else { print(paste0("Adding '", v, "' covariate (scaled) to the regression ...")) X <- cbind(X, scale(minimetadata[[v]])) } } rm(minimetadata) } # Dimension reduction bfa.res <- scBFA::scBFA(scData = as.matrix(sobj@assays[[assay.name]]@counts[sobj@assays[[assay.name]]@var.features,]), numFactors = dim.max, X = X) dimnames(bfa.res$ZZ) <- list(colnames(sobj@assays[[assay.name]]@counts), paste0('SCBFA_', 1L:dim.max)) dimnames(bfa.res$AA) <- list(sobj@assays[[assay.name]]@var.features, paste0('SCBFA_', 1L:dim.max)) sobj@reductions[[dimred.name]] <- Seurat::CreateDimReducObject(embeddings = bfa.res$ZZ, loadings = bfa.res$AA, assay = assay.name, stdev = matrixStats::colSds(bfa.res$ZZ), key = paste0(dimred.name, '_'), misc = list()) rm(bfa.res) } else stop("Unknown reduction method !") ``` ## Bias correction ? The goal is to evaluate the contribution of the known putative bias sources to the loadings of the dimension reduction through correlation. **Be careful :** run the chunk in a single shot, due to plot structure. ```{r bias_corr} # Parameters meta.names = c('nCount_RNA', 'nFeature_RNA', 'percent_mt', 'percent_rb', "Cyclone.S.Score", "Cyclone.G1.Score", "Cyclone.G2M.Score", "Cyclone.SmG2M.Score", "Seurat.S.Score", "Seurat.G1.Score", "Seurat.G2M.Score", "Seurat.SmG2M.Score") # selected in colnames(sobj@meta.data) eval.markers = c('GAPDH') # GAPDH used as a mock, you should use genes known to be markers of your cell subpopulations (when you know it). # Draw an empty plot at the right scale suppressWarnings(plot(0, 0, log = 'x', xlim = c(1L, dim.max), ylim = c(0,1), type = "n", xlab = paste0(dimred.name, " dimensions"), ylab = "spearman correlation", xaxs = "i", yaxs = "i", cex.axis = 0.7)) # meta.names correlations: calculation and plotting meta.names <- meta.names[meta.names %in% colnames(sobj@meta.data)] meta.cols <- if (length(meta.names) > 12) scales::hue_pal()(length(meta.names)) else RColorBrewer::brewer.pal(length(meta.names), name = "Paired") for(myf in seq_along(meta.names)) { corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], sobj[[meta.names[myf]]], method = "spearman"))) }, .1) lines(corvec, type = "l", ylim = c(0,1), col = meta.cols[myf], lwd = 2) } # eval.markers correlations: calculation and plotting if (!is.null(eval.markers)) { eval.markers <- sort(eval.markers[eval.markers %in% rownames(sobj@assays[[assay.name]]@data)]) mydata <- slot(sobj@assays[[assay.name]], "data") for(mym in seq_along(eval.markers)) { corvec <- vapply(seq_len(dim.max), function(k) { (abs(cor(sobj@reductions[[dimred.name]]@cell.embeddings[,k], mydata[rownames(mydata) == eval.markers[mym],], method = "spearman"))) }, .1) lines(corvec, type = "l", ylim = c(0,1), col = 1, lwd = 2, lty = mym+1L) } } # Adding legend legend("topright", legend = c(meta.names, eval.markers), col = c(meta.cols, rep(1, length(eval.markers))), lty = c(rep(1, length(meta.names)), seq_along(eval.markers)), lwd = 2, xjust = 1, yjust = 1, cex=0.7) # Clean rm(meta.names, eval.markers, meta.cols, corvec) ``` ## How to choose the right amount of reduction dimensions to keep ? # Strategy 1 : the elbow plot ```{r elbow} Seurat::ElbowPlot(sobj, ndims = dim.max, reduction = dimred.name) ``` # Strategy 2 : algorithm method (Jackstraw: only LogNormalize follow by PCA is currently supported!) JackStraw randomly permutes a subset of the data, then calculates projected PCA scores for these randomly selected genes. It then compares the PCA scores obtained for this random selection with the observed PCA scores from actual data, in order to determine their statistical significance. End result is a p-value for each association of a gene with a principal component. ```{r jackstraw} if (norm.method == "LogNormalize" & dimred.method == "pca") { # Create a temporary reduction named "pca" (mandatory for JackStraw) sobj@reductions$pca <- sobj@reductions$RNApca # Compute JackStraw sobj <- Seurat::JackStraw(sobj, assay = assay.name, reduction = "pca", dims = dim.max, num.replicate = dim.max) sobj <- Seurat::ScoreJackStraw(sobj, dims = 1:dim.max) Seurat::JackStrawPlot(sobj, dims = 1:dim.max, reduction = dimred.name) # Clean the temporary reduction sobj@reductions$pca <- NULL invisible(gc()) } ``` # Strategy 3 : Genes contribution The aim is to observe the contribution of genes to each component, and select dimensions that include variable genes (and even better : genes of interest expected to differ between expected cell types). ```{r gene_cor, fig.height = 5, fig.width = 5} # Genes of interest? print(sobj[[dimred.name]], dims = 1:dim.max, nfeatures = 15) # Good separation according to genes? for (i in 1:dim.max){ Seurat::DimHeatmap(sobj, dims = i, nfeatures = 15, cells = ncol(sobj), reduction = dimred.name, balanced = TRUE) } ``` # Strategy 4 : Evaluate and compare effect on clustering (using Louvain) The aim is to generate several combinations of the number of selected dimensions with Louvain clustering at different levels of resolution, to cover a large (but humanly bearable) chunk of the possible cases. We will make a double loop thanks to foreach package: one on dimensions, and another on resolutions. Actually, we will look at all possible uMAPs and assess the relationship between them (change in cell distribution) ```{r eval, fig.height = 15, fig.width = 10} # Parameters eval.dim.max <- 50 eval.dim.min <- 3 eval.dim.steps <- 2 eval.res.max <- 1.2 eval.res.min <- 0.1 eval.res.steps <- 0.1 # Check and set automatic parameters if(eval.dim.max > dim.max) stop("eval.dim.max can't be greater than dim.max!") dimsvec <- seq.int(eval.dim.min, eval.dim.max, eval.dim.steps) resvec <- seq(eval.res.min,eval.res.max,eval.res.steps) `%dims.do%` <- if (BiocParallel::bpworkers(cl) > 1) foreach::"%dopar%" else foreach::"%do%" `%res.do%` <- foreach::"%do%" # Building a minimal Seurat object (for memory sake) mini_sobj <- sobj mini_sobj@assays[names(mini_sobj@assays)[names(mini_sobj@assays) != assay.name]] <- NULL #removing other assays mini_sobj@assays[[assay.name]]@scale.data <- matrix(nrow = 0, ncol = 0) #removing scale.data slot mini_sobj@assays[[assay.name]]@misc <- list() #removing misc mini_sobj@reductions[names(mini_sobj@reductions)[names(mini_sobj@reductions) != dimred.name]] <- NULL #removing other reductions suppressPackageStartupMessages(require(Matrix)) mini_sobj@assays[[assay.name]]@counts <- new("dgCMatrix") #slimming other slots # Loop on dimension (dimsvec) results.all <- foreach::foreach(the.dim = dimsvec, .combine = "cbind", .inorder = FALSE, .errorhandling = "stop", .noexport = objects(), .packages = c("Seurat", "ggplot2")) %dims.do% { print(paste0(" >>>>>> Testing dimensions 1 to ", the.dim)) ## Compute umap umap.name <- paste0(dimred.name, the.dim,'dims_umap') mini_sobj <- Seurat::RunUMAP(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name, seed.use = the.seed, reduction.name = umap.name) ## Find Neighbors mini_sobj <- Seurat::FindNeighbors(object = mini_sobj, assay = assay.name, dims = 1L:the.dim, reduction = dimred.name) ## Loop on resolutions (resvec) res.results <- foreach::foreach(the.res = resvec, .inorder = FALSE, .errorhandling = "stop", .noexport = objects()) %res.do% { print(paste0(">>> Testing resolution ", the.res)) ### Compute clustering mini_sobj <- Seurat::FindClusters(object = mini_sobj, random.seed = the.seed, resolution = the.res) ### Compute umap resdim.plot <- Seurat::LabelClusters(plot = Seurat::DimPlot(object = mini_sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle(paste0(dimred.name, " dims = ", the.dim, "; res = ", the.res)) + Seurat::DarkTheme(), id = "ident", size = 3, repel = FALSE, color = "white", fontface = "bold") ### Return results (umap + clustering) return(list(resplot = resdim.plot, clusters = mini_sobj$seurat_clusters)) } ## Decomposing res.results plots.results <- unlist(res.results, recursive = FALSE)[seq.int(1, length(res.results)*2, 2)] #get all plots dims.res <- as.data.frame(unlist(res.results, recursive = FALSE)[seq.int(2, length(res.results)*2, 2)]) #get all clusterings colnames(dims.res) <- c(paste0("clusters_", dimred.name, the.dim, "_res", resvec)) #with names begining by dimensions res.dims <- dims.res #get all clusterings colnames(res.dims) <- c(paste0("clusters_res", resvec, "_", dimred.name, the.dim)) #with names begining by resolutions ## Cleaning rm(res.results) mini_sobj@graphs <- list() invisible(gc()) ## Return results (umap + clustering with names begining by dimensions + clustering with names begining by resolutions) return(list(plots.results.all = plots.results, dims.res.all = dims.res, res.dims.all = res.dims)) } ## Decomposing results.all plots.results <- unlist(results.all["plots.results.all",], recursive = FALSE) #get all plots dims.res.all <- as.data.frame(unlist(results.all["dims.res.all",], recursive = FALSE)) #get all clusterings colnames(dims.res.all) <- sub("result\\.[0-9]+\\.", "", colnames(dims.res.all)) res.dims.all <- as.data.frame(unlist(results.all["res.dims.all",], recursive = FALSE)) #get all clusterings colnames(res.dims.all) <- sub("result\\.[0-9]+\\.", "", colnames(res.dims.all)) ## Print all umap for each dimensions range (all resolutions) for (id.dim in seq(1,length(dimsvec)*length(resvec),length(resvec))) { to_plot <- plots.results[id.dim:(id.dim+length(resvec)-1)] print(patchwork::wrap_plots(to_plot) + patchwork::plot_layout(ncol = 2)) } ## Make clustree for all res of each dimensions range for (the.dim in dimsvec) { if (length(resvec) > 1) { cres <- clustree::clustree(dims.res.all, prefix = paste0("clusters_", dimred.name, the.dim, "_res")) print(cres + ggplot2::ggtitle(paste0("dims = ", the.dim))) } } # Make clustree for all dimensions at each resolution for (the.res in resvec) { if (length(dimsvec) > 1) { cres <- clustree::clustree(res.dims.all, prefix = paste0("clusters_res", the.res, "_", dimred.name)) print(cres + ggplot2::ggtitle(paste0("res = ", the.res))) } } # Cleaning rm(mini_sobj, results.all, plots.results, dims.res.all, res.dims.all, dimsvec, resvec, to_plot, cres) invisible(gc()) ``` ## Final clustering at selected parameters ```{r clust, fig.height = 5, fig.width = 5} # Parameters chosen.dims <- 19 chosen.resolution <- 1.0 # Find Neighbors sobj <- Seurat::FindNeighbors(object = sobj, assay = assay.name, reduction = dimred.name, compute.SNN = TRUE, dims = 1L:chosen.dims) # Find clusters sobj <- Seurat::FindClusters(object = sobj, resolution = chosen.resolution, random.seed = the.seed, algorithm = 1) # Run UMAP umap.name <- paste0(dimred.name, chosen.dims, 'dims_umap') sobj <- Seurat::RunUMAP(object = sobj, assay = assay.name, dims = 1L:chosen.dims, reduction = dimred.name, reduction.name = umap.name, seed.use = the.seed) # Print clusters Seurat::LabelClusters(plot = Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 1) + ggplot2::ggtitle("Louvain clusters)") + Seurat::DarkTheme(), id = "ident", size = 4, repel = FALSE, color = "white", fontface = "bold") print("Number of cells for each cluster:") table(Seurat::Idents(sobj)) ``` ## Checking bias on final clustering Clustering seems to be linked to a known source of bias ? If so, you probably should correct it. ```{r bias_clust, fig.height = 5, fig.width = 5} # Metrics Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_mt", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "percent_rb", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nCount_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "nFeature_RNA", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() # Cell cycle (cyclone) Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Cyclone.Phase") + ggplot2::ggtitle("Cell Phase (cyclone)") + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.S.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Cyclone.G1.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() # Cell cycle (Seurat) Seurat::DimPlot(object = sobj, reduction = umap.name, pt.size = 2, group.by = "Seurat.Phase") + ggplot2::ggtitle("Cell Phase (Seurat)") + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = c("Seurat.S.Score"), cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.G2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() Seurat::FeaturePlot(object = sobj, reduction = umap.name, pt.size = 2, features = "Seurat.SmG2M.Score", cols = c("gold", "blue"), order = TRUE) + Seurat::DarkTheme() ``` ## Differential Gene Expression Analyses: Marker genes The goal is to find marker genes specific of each cluster. If there is no markers for a cluster, maybe it is very close to another cell type, or maybe it is not a cell type (noise). ```{r diff} sobj.markers <- Seurat::FindAllMarkers(sobj, only.pos = TRUE, min.pct = 0.75, logfc.threshold = 0.25, test.use = "wilcox") head(sobj.markers, n = 5) top.markers <- sobj.markers %>% group_by(cluster) %>% top_n(n = 6, wt = avg_log2FC) for (clust_id in unique(sort(top.markers$cluster))){ genes_clust_id <- top.markers[as.character(top.markers$cluster) == clust_id, "gene"]$gene print(Seurat::VlnPlot(sobj, features = genes_clust_id) + plot_annotation(title = paste0('Top Markers for cluster ',clust_id))) print(Seurat::FeaturePlot(sobj, reduction = umap.name, features = genes_clust_id) + plot_annotation(title = paste0('Top Markers for cluster ',clust_id))) } Seurat::DoHeatmap(sobj, features = top.markers$gene) ``` ## Plot some genes on umap | Markers | Cell Type | |:-------------:|:------------:| | IL7R, CCR7 | Naive CD4+ T | | CD14, LYZ | CD14+ Mono | | IL7R, S100A4 | Memory CD4+ | | MS4A1 | B | | CD8A | CD8+ T | | FCGR3A, MS4A7 | FCGR3A+ Mono | | GNLY, NKG7 | NK | | FCER1A, CST3 | DC | | PPBP | Platelet | ```{r plot_genes} Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("IL7R", "CCR7","CD14", "LYZ","IL7R", "S100A4")) Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("MS4A1","CD8A","FCGR3A", "MS4A7","GNLY", "NKG7")) Seurat::FeaturePlot(sobj, reduction = umap.name, features = c("FCER1A", "CST3","PPBP")) ``` ## Save ```{r save} # Save parameters sobj@misc$params$NDRC$seed <- the.seed sobj@misc$params$NDRC$normalization <- list(method = norm.method, features.used = features.n) sobj@misc$params$NDRC$reductions <- list(method = dimred.method, dim.max = dim.max) sobj@misc$params$NDRC$vtr.biases <- if(is.null(vtr.biases)) NA else vtr.biases sobj@misc$params$NDRC$clustering <- list(method = "louvain", umap = umap.name, dimensions = chosen.dims, resolution = chosen.resolution) # Save R session sobj@misc$params$NDRC$Rsession <- utils::capture.output(devtools::session_info()) # Save seurat object dir.create(output.dir, recursive = TRUE) save(sobj, file = paste0(output.dir, paste(c(sample.name, 'without', norm.method, dimred.method, if(!is.null(vtr.biases)) vtr.biases), collapse = '_'), '.rda'), compress = "bzip2") ``` ```{r Rsession} # Print R session info utils::capture.output(devtools::session_info()) ```