--- title: "SinCellTE 2022: Primary analyses" output: html_document: default --- ## Initializing the practical session ```{r uid} user.id <- Sys.getenv('USER') ``` ## Setting Dataset ```{r dset} input.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Primary_analyses/input/matrix/") output.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Primary_analyses/finalresult/") sample.name <- "PBMC_CTRL_GE" species <- "human" aligment <- "10X" #kallisto or 10X ``` ## Setting Paramaters ```{r params} # Computational Parameters the.seed <- 1337L # Analysis Parameters ## Empty droplets emptydrops.retain <- NULL ## Barcode-level QC cell pcmito.min <- 0 pcmito.max <- 20 pcribo.min <- 0 pcribo.max <- 100 min.features <- 200 min.counts <- 1000 ## Feature-level QC min.cells <- 5 ``` ## Setting seed and loading R packages ```{r seed_packages} set.seed(the.seed) suppressPackageStartupMessages(library(dplyr)) ``` ## Loading raw count matrix ```{r loading} # Loading raw count matrix if(aligment == "kallisto"){ scmat <- BUSpaRse::read_count_output(dir = input.dir, name = sample.name) # from BUStools }else if(aligment == "10X"){ scmat <- Seurat::Read10X(input.dir) # from 10X } else stop("Alignment technology unknown!") # Some metrics about raw count matrix: print('Number of barodes (droplets) in raw count matrix:') droplets.total.nb <- ncol(scmat) droplets.total.nb print('Number of features (genes) in raw count matrix:') genes.total.nb <- nrow(scmat) genes.total.nb print('Number of UMIs in raw count matrix:') umi.total.nb <- sum(scmat) umi.total.nb ``` ## Removing empty droplets ### Automatically : emptyDrops() ```{r emptydrops} # Removing empty droplets bc_rank <- DropletUtils::emptyDrops(m = scmat, retain = emptydrops.retain) scmat_filtered <- scmat[, which(bc_rank$FDR < 1E-03)] # Kneeplot ## Make the dataframe with all droplets (before filtering) and the number of UMI for each droplet, and if the droplets are filtered or not by emptydrops nb_umi_by_barcode <- data.frame(nb_umi = Matrix::colSums(scmat), barcodes = colnames(scmat)) nb_umi_by_barcode <- nb_umi_by_barcode %>% arrange(desc(nb_umi)) %>% dplyr::mutate(num_barcode = seq.int(ncol(scmat))) nb_umi_by_barcode$droplets_state <- "Empty Droplets" nb_umi_by_barcode[nb_umi_by_barcode$barcodes %in% colnames(scmat_filtered), "droplets_state"] <- "Full Droplets" ## Draw kneeplot ggplot2::ggplot(nb_umi_by_barcode, ggplot2::aes(y = nb_umi, x = num_barcode, color = droplets_state)) + ggplot2::geom_point() + ggplot2::ggtitle(paste0("Kneeplot of ", sample.name)) + ggplot2::theme(legend.title = ggplot2::element_blank()) + ggplot2::scale_y_log10(name = "Number of UMI by droplet (log scale)") + ggplot2::scale_x_log10(name = "Droplet rank (log scale)") + ggplot2::expand_limits(x = 0, y = 0) + ggplot2::scale_colour_manual(values = c("cyan3","royalblue4"), guide = 'legend') + ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(linetype = c(0, 0), shape = c(16, 16)))) # Creation of the Seurat object sobj <- Seurat::CreateSeuratObject(counts = scmat_filtered, project = sample.name) # Cleaning rm(bc_rank,nb_umi_by_barcode,scmat,scmat_filtered) invisible(gc()) ``` ### Manually ```{r lowcells} # Quantification ## Nb features by cell sobj$min_features <- sobj$nFeature_RNA >= min.features print(paste0('Number of droplets with >= ', min.features, ' features :')) print(table(sobj$min_features)) ## Nb counts by cell sobj$min_counts <- sobj$nCount_RNA >= min.counts print(paste0('Number of droplets with >= ', min.counts, ' of total counts:')) print(table(sobj$min_counts)) # Plots ## Histograms ggplot2::qplot(sobj[["nFeature_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nFeature_RNA (>= ", min.features, " : ", length(which(sobj$min_features)), " droplets)"), xlab = "nFeature_RNA") + ggplot2::geom_vline(xintercept = min.features, col = "red", linetype = "dashed", size = 1.5) ggplot2::qplot(sobj[["nCount_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nCount_RNA (>= ", min.counts, " : ", length(which(sobj$min_counts)), " droplets)"), xlab = "nCount_RNA") + ggplot2::geom_vline(xintercept = min.counts, col = "red", linetype = "dashed", size = 1.5) ## Violinplot Seurat::VlnPlot(sobj, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2) ## Scatterplot Seurat::FeatureScatter(sobj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") # Filtering cells on min.features and min.counts sobj <- sobj[, sobj$nFeature_RNA >= min.features & sobj$nCount_RNA >= min.counts] # Some metrics about filtered count matrix: print('Number of droplets in filtered data:') droplets.kept.nb <- ncol(sobj) droplets.kept.nb print('Number of genes in filtered data:') genes.kept.nb <- nrow(sobj) genes.kept.nb print('Number of UMIs in filtered data:') umi.kept.nb <- sum(sobj@assays$RNA@counts) umi.kept.nb print('Fraction of UMIs in droplets :') fraction.umi <- umi.kept.nb / umi.total.nb fraction.umi print('Mean of UMIs in droplets :') mean.umi <- round((umi.kept.nb / droplets.kept.nb), 0) mean.umi ``` ## Computing basic metrics : Percentages of mito + Percentages of ribo + Cell cycle prediction + Identification of doublets ### Percentages of mito + ribo ```{r mitoribo} # Pattern if(species == "human"){ mito_patern <- "^MT-" ribo_patern <- "^RP[LS][[:digit:]]" }else if (species == "mouse"){ mito_patern <- "^mt-" ribo_patern <- "^Rp[ls][[:digit:]]" } # Percentage MITO ## Percent sobj$percent_mt <- Seurat::PercentageFeatureSet(sobj, pattern = mito_patern) print('% Mitochondrial expression :') print(summary(sobj$percent_mt)) ## Number df_percent_mt <- data.frame(percent_mt = sobj$percent_mt, droplets_state = "in") df_percent_mt[df_percent_mt$percent_mt <= pcmito.min, "droplets_state"] <- "out.left" df_percent_mt[df_percent_mt$percent_mt >= pcmito.max, "droplets_state"] <- "out.right" print(paste0(pcmito.min, ' <= % mito <= ', pcmito.max, ' :')) print(table(df_percent_mt$droplets_state)) # Percentage RIBO ## Percent sobj$percent_rb <- Seurat::PercentageFeatureSet(sobj, pattern = ribo_patern) print('% Ribosomal expression :') print(summary(sobj$percent_rb)) ## Number df_percent_rb <- data.frame(percent_rb = sobj$percent_rb, droplets_state = "in") df_percent_rb[df_percent_rb$percent_rb <= pcribo.min, "droplets_state"] <- "out.left" df_percent_rb[df_percent_rb$percent_rb >= pcribo.max, "droplets_state"] <- "out.right" print(paste0(pcribo.min, ' <= % ribo <= ', pcribo.max, ' :')) print(table(df_percent_rb$droplets_state)) # Plots ## Histograms ggplot2::qplot(sobj$percent_mt, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%mito (in [", pcmito.min, ';', pcmito.max, "] : ", length(which(df_percent_mt$droplets_state == "in")), " droplets)"), xlab = "%mito") + ggplot2::geom_vline(xintercept = c(5,10,15,20), col = "cyan4", linetype = "dashed", size = 1) + ggplot2::geom_vline(xintercept = c(pcmito.min,pcmito.max), col = "red", linetype = "dashed", size = 1.5) ggplot2::qplot(sobj$percent_rb, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%ribo (in [", pcribo.min, ';', pcribo.max, "] : ", length(which(df_percent_rb$droplets_state == "in")), " droplets)"), xlab = "%ribo") + ggplot2::geom_vline(xintercept = c(pcribo.min,pcribo.max), col = "red", linetype = "dashed", size = 1.5) ## Violinplot Seurat::VlnPlot(sobj, features = c("percent_mt", "percent_rb"), ncol = 2) ``` ### Identification of background genes ```{r bkg} # Nb counts per gene df_cells_by_gene <- data.frame(nb_cells = apply(sobj@assays$RNA@counts,1,function(x){length(which(x>0))}), gene_state = "in", stringsAsFactors = FALSE) df_cells_by_gene[df_cells_by_gene$nb_cells <= min.cells, "gene_state"] <- "out" df_cells_by_gene_zoom <- df_cells_by_gene[df_cells_by_gene$nb_cells <= min.cells*4, ] print(paste0('Number of genes expressed in >= ', min.cells, ' cells:')) print(table(df_cells_by_gene$gene_state)) # Plots ## Histograms ggplot2::qplot(df_cells_by_gene$nb_cells, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nb genes (>= ", min.cells, " : ", length(which(df_cells_by_gene$gene_state == "in")), " genes)"), xlab = "nb cells") + ggplot2::geom_vline(xintercept = min.cells, col = "red", linetype = "dashed", size = 1.5) ggplot2::qplot(df_cells_by_gene_zoom$nb_cells, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nb genes (>= ", min.cells, " : ", length(which(df_cells_by_gene$gene_state == "in")), " genes) (Zoom)"),xlab = "nb cells") + ggplot2::geom_vline(xintercept = min.cells, col = "red", linetype = "dashed", size = 1.5) ``` ### Cell cycle prediction Note: Cyclone take a while (~10-15min). ```{r ccycle} # Seurat CC ## Load database if(species == "human"){ cc_seurat <- Seurat::cc.genes.updated.2019 }else if (species == "mouse"){ ### Seurat only provides cell cycle genes for homo sapiens (as this list was developed solely using data from this species). To use it on mus musculus, one can simply convert these human HUGO genes to to mus MGI orthologues (using biomaRt). See [this github issue](https://github.com/satijalab/seurat/issues/2493) cc_seurat <- Seurat::cc.genes.updated.2019 human_mart = biomaRt::useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl") mouse_mart = biomaRt::useEnsembl("ensembl", dataset = "mmusculus_gene_ensembl") cc_seurat$s.genes = biomaRt::getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = cc_seurat$s.genes , mart = human_mart, attributesL = c("mgi_symbol"), martL = mouse_mart, uniqueRows=T)[, 2] cc_seurat$g2m.genes = biomaRt::getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = cc_seurat$g2m.genes , mart = human_mart, attributesL = c("mgi_symbol"), martL = mouse_mart, uniqueRows=T)[, 2] } ## Prediction sobj <- Seurat::CellCycleScoring(object = sobj, s.features = cc_seurat$s.genes, g2m.features = cc_seurat$g2m.genes, assay = "RNA", nbin = 10, seed = the.seed) sobj$Seurat.SmG2M.Score <- sobj$S.Score - sobj$G2M.Score ## Rename results columns with 'Seurat.' prefix sobj$Seurat.S.Score <- sobj$S.Score sobj$Seurat.G2M.Score <- sobj$G2M.Score sobj$Seurat.Phase <- sobj$Phase sobj$S.Score <- sobj$G2M.Score <- sobj$Phase <- NULL ## Print results print("Cell cycle phase according to Seurat: ") print(table(sobj$Seurat.Phase)) # Scran: Cyclone ## Load database if(species == "human"){ # Load cc_cyclone <- readRDS(system.file("exdata", "human_cycle_markers.rds", package = "scran")) # Translate database from ENSEMBL to gene SYMBOL suppressPackageStartupMessages(library(org.Hs.eg.db)) for (phase in names(cc_cyclone)) { for (fs in names(cc_cyclone[[phase]])) { cc_cyclone[[phase]][[fs]] <- suppressMessages(AnnotationDbi::mapIds(org.Hs.eg.db, keys=cc_cyclone[[phase]][[fs]], column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first")) } } }else if (species == "mouse"){ # Load cc_cyclone <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package = "scran")) # Translate database from ENSEMBL to gene SYMBOL suppressPackageStartupMessages(library(org.Mm.eg.db)) for (phase in names(cc_cyclone)) { for (fs in names(cc_cyclone[[phase]])) { cc_cyclone[[phase]][[fs]] <- suppressMessages(AnnotationDbi::mapIds(org.Mm.eg.db, keys=cc_cyclone[[phase]][[fs]], column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first")) } } } ## Prediction cycres <- scran::cyclone(Seurat::as.SingleCellExperiment(sobj), pairs = cc_cyclone, verbose = FALSE) sobj$Cyclone.SmG2M.Score <- cycres$normalized.scores$S - cycres$normalized.scores$G2M ## Save results in the seurat object sobj$Cyclone.Phase <- as.factor(cycres$phases) sobj$Cyclone.G1.Score <- cycres$normalized.scores$G1 sobj$Cyclone.S.Score <- cycres$normalized.scores$S sobj$Cyclone.G2M.Score <- cycres$normalized.scores$G2M rm(cycres) ## Print results print("Cell cycle phase according to Cyclone: ") print(table(sobj$Cyclone.Phase)) # Crossing results : Seurat -VS- Cyclone df_cycle <- data.frame(Seurat.Phase = sobj$Seurat.Phase, Cyclone.Phase = sobj$Cyclone.Phase) print(table(df_cycle)) print(paste("Seurat and Cyclone agreed for ",round((sum(diag(table(df_cycle))) / sum(table(df_cycle)) *100),2),"% of droplets.")) ``` ### Identification of cell doublets ```{r doublets} # scDblFinder sobj$scDblFinder.class <- Seurat::as.Seurat(scDblFinder::scDblFinder(Seurat::as.SingleCellExperiment(sobj)))$scDblFinder.class sobj$scDblFinder.class <- unname(sobj$scDblFinder.class == "doublet") print('scDblFinder doublets :') print(table(sobj$scDblFinder.class)) # scds sobj$scds.score <- scds::cxds_bcds_hybrid(Seurat::as.SingleCellExperiment(sobj))$hybrid_score sobj$scds.class <- unname(sobj$scds.score > 1) print('scds-hybrid doublets :') print(table(sobj$scds.class)) # union of scDblFinder & scds sobj$doublets_consensus.class <- sobj$scDblFinder.class | sobj$scds.class print('Consensus doublets :') print(table(sobj$doublets_consensus.class)) # Correlation between doublets and cell cycle ## Overlap cells df_S.cycle_doublets <- data.frame(Seurat.Phase = sobj$Seurat.Phase, doublets = sobj$doublets_consensus.class) print(table(df_S.cycle_doublets)) print(paste0(round((nrow(df_S.cycle_doublets[df_S.cycle_doublets$Seurat.Phase == "G2M" & df_S.cycle_doublets$doublets == TRUE,])/nrow(df_S.cycle_doublets[df_S.cycle_doublets$doublets == TRUE,])*100),2),"% of doublets are in G2M phase by Seurat.")) df_C.cycle_doublets <- data.frame(Cyclone.Phase = sobj$Cyclone.Phase, doublets = sobj$doublets_consensus.class) print(table(df_C.cycle_doublets)) print(paste0(round((nrow(df_C.cycle_doublets[df_C.cycle_doublets$Cyclone.Phase == "G2M" & df_C.cycle_doublets$doublets == TRUE,])/nrow(df_C.cycle_doublets[df_S.cycle_doublets$doublets == TRUE,])*100),2),"% of doublets are in G2M phase by Cyclone.")) ## Quick 'n dirty uMAP ## /!\ Warning: this umap will not represent the cell types present! Many parameters will have to be adjusted for this (tomorrow's course on Statistical models and analyses). Here, we just want to be able to represent all the cells on the same graph and plot the identification of doublets and the phases of the cell cycle /!\ tmp_sobj <- sobj tmp_sobj <- suppressWarnings(Seurat::SCTransform(tmp_sobj, seed.use = the.seed, verbose = FALSE)) #normalization tmp_sobj <- Seurat::RunPCA(tmp_sobj, verbose = FALSE,seed.use = the.seed) #dimension reduction tmp_sobj <- Seurat::RunUMAP(tmp_sobj, dims = 1:25, verbose = FALSE, seed.use = the.seed) Seurat::DimPlot(tmp_sobj, group.by = "Cyclone.Phase") + ggplot2::ggtitle("Cell Phase (cyclone)") + Seurat::DarkTheme() Seurat::DimPlot(tmp_sobj, group.by = "Seurat.Phase") + ggplot2::ggtitle("Cell Phase (Seurat)") + Seurat::DarkTheme() Seurat::DimPlot(tmp_sobj, group.by = "scDblFinder.class") + ggplot2::ggtitle("Cell doublets (scDblFinder)") + Seurat::DarkTheme() Seurat::DimPlot(tmp_sobj, group.by = "scds.class") + ggplot2::ggtitle("Cell doublets (scds)") + Seurat::DarkTheme() Seurat::DimPlot(tmp_sobj, group.by = "doublets_consensus.class") + ggplot2::ggtitle("Cell doublets (union)") + Seurat::DarkTheme() # Cleaning rm(tmp_sobj) invisible(gc()) ``` ## Filtering ```{r filter} # Filtering cell doublets sobj <- sobj[, !sobj$doublets_consensus.class] # Filtering non-qualified cells cellskeep <- sobj$percent_mt >= pcmito.min & sobj$percent_mt <= pcmito.max & sobj$percent_rb >= pcribo.min & sobj$percent_rb <= pcribo.max sobj <- sobj[, cellskeep] # Filtering features with ultra-low expression features_to_keep <- which(apply(sobj@assays$RNA@counts,1,function(x){ length(which(x>0)) }) >= min.cells) sobj <- sobj[features_to_keep,] ``` ## Checking the effect of filtering ```{r filter_effect} # Re-computing basic metrics : percentage of mito + ribo + nb features + nb counts ## Nb features by cell sobj$nFeature_RNA <- apply(sobj@assays$RNA@counts,2,function(x){ length(which(x>0)) }) sobj$min_features <- sobj$nFeature_RNA >= min.features ## Nb counts by cell sobj$nCount_RNA <- apply(sobj@assays$RNA@counts,2,function(x){ sum(x) }) sobj$min_counts <- sobj$nCount_RNA >= min.counts ## Percent mito sobj$percent_mt <- Seurat::PercentageFeatureSet(sobj, pattern = mito_patern) ## Percent ribo sobj$percent_rb <- Seurat::PercentageFeatureSet(sobj, pattern = ribo_patern) # Some metrics about total filtered count matrix: print('Number of cells in filtered data:') print(ncol(sobj)) print('Number of genes in filtered data:') print(nrow(sobj)) print('Number of UMIs in filtered data:') print(sum(sobj@assays$RNA@counts)) print(paste0('Number of cells with >= ', min.features, ' features :')) print(table(sobj$min_features)) print('Mean features by cell :') print(summary(sobj$nFeature_RNA)) print(paste0('Number of cells with >= ', min.counts, ' of total counts:')) print(table(sobj$min_counts)) print('Mean Count by cell:') print(summary(sobj$nCount_RNA)) print('% Mitochondrial expression :') print(summary(sobj$percent_mt)) print('% Ribosomal expression :') print(summary(sobj$percent_rb)) print("Cell cycle phases according to Seurat: ") print(table(sobj$Seurat.Phase)) print("Cell cycle phases according to Cyclone: ") print(table(sobj$Cyclone.Phase)) print('Consensus doublets :') print(table(sobj$doublets_consensus.class)) # Plots ## Histograms ggplot2::qplot(sobj[["nFeature_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nFeature_RNA (>= ", min.features, " : ", length(which(sobj$min_features)), " cells)"), xlab = "nFeature_RNA") + ggplot2::geom_vline(xintercept = min.features, col = "red", linetype = "dashed", size = 1.5) ggplot2::qplot(sobj[["nCount_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nCount_RNA (>= ", min.counts, " : ", length(which(sobj$min_counts)), " cells)"), xlab = "nCount_RNA") + ggplot2::geom_vline(xintercept = min.counts, col = "red", linetype = "dashed", size = 1.5) ggplot2::qplot(sobj$percent_mt, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%mito (in [", pcmito.min, ';', pcmito.max, "] : ", length(which(df_percent_mt$droplets_state == "in")), " cells)"), xlab = "%mito") + ggplot2::geom_vline(xintercept = c(5,10,15,20), col = "cyan4", linetype = "dashed", size = 1) + ggplot2::geom_vline(xintercept = c(pcmito.min,pcmito.max), col = "red", linetype = "dashed", size = 1.5) ggplot2::qplot(sobj$percent_rb, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%ribo (in [", pcribo.min, ';', pcribo.max, "] : ", length(which(df_percent_rb$droplets_state == "in")), " cells)"), xlab = "%ribo") + ggplot2::geom_vline(xintercept = c(pcribo.min,pcribo.max), col = "red", linetype = "dashed", size = 1.5) ## Violinplot Seurat::VlnPlot(sobj, features = c("nFeature_RNA", "nCount_RNA","percent_mt","percent_rb"), ncol = 4) ## Scatterplot Seurat::FeatureScatter(sobj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") ``` ## Save ```{r save} # Save parameters sobj@misc$params$sample.name <- sample.name sobj@misc$params$species <- species sobj@misc$params$QC$seed <- the.seed sobj@misc$params$QC$emptydrops$emptydrops.retain <- emptydrops.retain sobj@misc$params$QC$pcmito.range <- c(pcmito.min,pcmito.max) sobj@misc$params$QC$pcribo.range <- c(pcribo.min,pcribo.max) sobj@misc$params$QC$min.features <- min.features sobj@misc$params$QC$min.counts <- min.counts sobj@misc$params$QC$min.cells <- min.cells # Save R session sobj@misc$params$QC$Rsession <- utils::capture.output(devtools::session_info()) # Save seurat object dir.create(output.dir, recursive = TRUE, showWarnings = FALSE) save(sobj, file = paste0(output.dir, sample.name, '_FILTERED_with.rda'), compress = "bzip2") ``` ```{r Rsession} # Print R session info utils::capture.output(devtools::session_info()) ```