Initializing the practical session

user.id <- Sys.getenv('USER')

Setting Dataset

input.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Primary_analyses/input/matrix/")
output.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Primary_analyses/finalresult/")
sample.name <- "PBMC_CTRL_GE"
species <- "human"
aligment <- "10X" #kallisto or 10X

Setting Paramaters

# Computational Parameters
the.seed <- 1337L

# Analysis Parameters
## Empty droplets
emptydrops.retain <- NULL
## Barcode-level QC cell
pcmito.min <- 0
pcmito.max <- 20
pcribo.min <- 0
pcribo.max <- 100
min.features <- 3
min.counts <- 100
## Feature-level QC
min.cells <- 5

Setting seed and loading R packages

set.seed(the.seed)
suppressPackageStartupMessages(library(dplyr))

Loading raw count matrix

# Loading raw count matrix
if(aligment == "kallisto"){
  scmat <- BUSpaRse::read_count_output(dir = input.dir, name = sample.name) # from BUStools
}else if(aligment == "10X"){
  scmat <- Seurat::Read10X(input.dir) # from 10X
} else stop("Alignment technology unknown!")

# Some metrics about raw count matrix:
print('Number of barodes (droplets) in raw count matrix:')
## [1] "Number of barodes (droplets) in raw count matrix:"
droplets.total.nb <- ncol(scmat)
droplets.total.nb
## [1] 343461
print('Number of features (genes) in raw count matrix:')
## [1] "Number of features (genes) in raw count matrix:"
genes.total.nb <- nrow(scmat)
genes.total.nb
## [1] 33077
print('Number of UMIs in raw count matrix:')
## [1] "Number of UMIs in raw count matrix:"
umi.total.nb <- sum(scmat)
umi.total.nb
## [1] 12925546

Removing empty droplets

Automatically : emptyDrops()

# Removing empty droplets
bc_rank <- DropletUtils::emptyDrops(m = scmat, retain = emptydrops.retain)
scmat_filtered <- scmat[, which(bc_rank$FDR < 1E-03)]

# Kneeplot
## Make the dataframe with all droplets (before filtering) and the number of UMI for each droplet, and if the droplets are filtered or not by emptydrops
nb_umi_by_barcode <- data.frame(nb_umi = Matrix::colSums(scmat), barcodes = colnames(scmat))
nb_umi_by_barcode <- nb_umi_by_barcode %>% arrange(desc(nb_umi)) %>% dplyr::mutate(num_barcode = seq.int(ncol(scmat)))
nb_umi_by_barcode$droplets_state <- "Empty Droplets"
nb_umi_by_barcode[nb_umi_by_barcode$barcodes %in% colnames(scmat_filtered), "droplets_state"] <- "Full Droplets"
## Draw kneeplot
ggplot2::ggplot(nb_umi_by_barcode, ggplot2::aes(y = nb_umi, x = num_barcode, color = droplets_state)) +
  ggplot2::geom_point() + ggplot2::ggtitle(paste0("Kneeplot of ", sample.name)) + 
  ggplot2::theme(legend.title = ggplot2::element_blank()) +
  ggplot2::scale_y_log10(name = "Number of UMI by droplet (log scale)") + 
  ggplot2::scale_x_log10(name = "Droplet rank (log scale)") +
  ggplot2::expand_limits(x = 0, y = 0) +
  ggplot2::scale_colour_manual(values = c("cyan3","royalblue4"), guide = 'legend') +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(linetype = c(0, 0), shape = c(16, 16))))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis

# Creation of the Seurat object
sobj <- Seurat::CreateSeuratObject(counts = scmat_filtered, project = sample.name)

# Cleaning
rm(bc_rank,nb_umi_by_barcode,scmat,scmat_filtered)
invisible(gc())

Manually

# Quantification
## Nb features by cell
sobj$min_features <- sobj$nFeature_RNA >= min.features
print(paste0('Number of droplets with >= ', min.features, ' features :'))
## [1] "Number of droplets with >= 3 features :"
print(table(sobj$min_features))
## 
## TRUE 
## 3967
## Nb counts by cell
sobj$min_counts <- sobj$nCount_RNA >= min.counts
print(paste0('Number of droplets with >= ', min.counts, ' of total counts:'))
## [1] "Number of droplets with >= 100 of total counts:"
print(table(sobj$min_counts))
## 
## TRUE 
## 3967
# Plots
## Histograms
ggplot2::qplot(sobj[["nFeature_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nFeature_RNA (>= ", min.features, " : ", length(which(sobj$min_features)), " droplets)"), xlab = "nFeature_RNA") + ggplot2::geom_vline(xintercept = min.features, col = "red", linetype = "dashed", size = 1.5)

ggplot2::qplot(sobj[["nCount_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nCount_RNA (>= ", min.counts, " : ", length(which(sobj$min_counts)), " droplets)"), xlab = "nCount_RNA") + ggplot2::geom_vline(xintercept = min.counts, col = "red", linetype = "dashed", size = 1.5)

## Violinplot
Seurat::VlnPlot(sobj, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)