---
title: "SinCellTE 2022: Primary analyses"
output:
  html_document: default
---

## Initializing the practical session

```{r uid}
user.id <- Sys.getenv('USER')
```

## Setting Dataset

```{r dset}
input.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Primary_analyses/input/matrix/")
output.dir <- paste0("/shared/projects/sincellte_2022/", user.id, "/Primary_analyses/finalresult/")
sample.name <- "PBMC_CTRL_GE"
species <- "human"
aligment <- "10X" #kallisto or 10X
```

## Setting Paramaters

```{r params}
# Computational Parameters
the.seed <- 1337L

# Analysis Parameters
## Empty droplets
emptydrops.retain <- NULL
## Barcode-level QC cell
pcmito.min <- 0
pcmito.max <- 20
pcribo.min <- 0
pcribo.max <- 100
min.features <- 200
min.counts <- 1000
## Feature-level QC
min.cells <- 5
```

## Setting seed and loading R packages

```{r seed_packages}
set.seed(the.seed)
suppressPackageStartupMessages(library(dplyr))
```

## Loading raw count matrix

```{r loading}
# Loading raw count matrix
if(aligment == "kallisto"){
  scmat <- BUSpaRse::read_count_output(dir = input.dir, name = sample.name) # from BUStools
}else if(aligment == "10X"){
  scmat <- Seurat::Read10X(input.dir) # from 10X
} else stop("Alignment technology unknown!")

# Some metrics about raw count matrix:
print('Number of barodes (droplets) in raw count matrix:')
droplets.total.nb <- ncol(scmat)
droplets.total.nb
print('Number of features (genes) in raw count matrix:')
genes.total.nb <- nrow(scmat)
genes.total.nb
print('Number of UMIs in raw count matrix:')
umi.total.nb <- sum(scmat)
umi.total.nb
```

## Removing empty droplets

### Automatically : emptyDrops()

```{r emptydrops}
# Removing empty droplets
bc_rank <- DropletUtils::emptyDrops(m = scmat, retain = emptydrops.retain)
scmat_filtered <- scmat[, which(bc_rank$FDR < 1E-03)]

# Kneeplot
## Make the dataframe with all droplets (before filtering) and the number of UMI for each droplet, and if the droplets are filtered or not by emptydrops
nb_umi_by_barcode <- data.frame(nb_umi = Matrix::colSums(scmat), barcodes = colnames(scmat))
nb_umi_by_barcode <- nb_umi_by_barcode %>% arrange(desc(nb_umi)) %>% dplyr::mutate(num_barcode = seq.int(ncol(scmat)))
nb_umi_by_barcode$droplets_state <- "Empty Droplets"
nb_umi_by_barcode[nb_umi_by_barcode$barcodes %in% colnames(scmat_filtered), "droplets_state"] <- "Full Droplets"
## Draw kneeplot
ggplot2::ggplot(nb_umi_by_barcode, ggplot2::aes(y = nb_umi, x = num_barcode, color = droplets_state)) +
  ggplot2::geom_point() + ggplot2::ggtitle(paste0("Kneeplot of ", sample.name)) + 
  ggplot2::theme(legend.title = ggplot2::element_blank()) +
  ggplot2::scale_y_log10(name = "Number of UMI by droplet (log scale)") + 
  ggplot2::scale_x_log10(name = "Droplet rank (log scale)") +
  ggplot2::expand_limits(x = 0, y = 0) +
  ggplot2::scale_colour_manual(values = c("cyan3","royalblue4"), guide = 'legend') +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(linetype = c(0, 0), shape = c(16, 16))))

# Creation of the Seurat object
sobj <- Seurat::CreateSeuratObject(counts = scmat_filtered, project = sample.name)

# Cleaning
rm(bc_rank,nb_umi_by_barcode,scmat,scmat_filtered)
invisible(gc())
```

### Manually

```{r lowcells}
# Quantification
## Nb features by cell
sobj$min_features <- sobj$nFeature_RNA >= min.features
print(paste0('Number of droplets with >= ', min.features, ' features :'))
print(table(sobj$min_features))
## Nb counts by cell
sobj$min_counts <- sobj$nCount_RNA >= min.counts
print(paste0('Number of droplets with >= ', min.counts, ' of total counts:'))
print(table(sobj$min_counts))

# Plots
## Histograms
ggplot2::qplot(sobj[["nFeature_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nFeature_RNA (>= ", min.features, " : ", length(which(sobj$min_features)), " droplets)"), xlab = "nFeature_RNA") + ggplot2::geom_vline(xintercept = min.features, col = "red", linetype = "dashed", size = 1.5)
ggplot2::qplot(sobj[["nCount_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nCount_RNA (>= ", min.counts, " : ", length(which(sobj$min_counts)), " droplets)"), xlab = "nCount_RNA") + ggplot2::geom_vline(xintercept = min.counts, col = "red", linetype = "dashed", size = 1.5)
## Violinplot
Seurat::VlnPlot(sobj, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
## Scatterplot
Seurat::FeatureScatter(sobj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

# Filtering cells on min.features and min.counts
sobj <- sobj[, sobj$nFeature_RNA >= min.features & sobj$nCount_RNA >= min.counts]

# Some metrics about filtered count matrix:
print('Number of droplets  in filtered data:')
droplets.kept.nb <- ncol(sobj)
droplets.kept.nb
print('Number of genes in filtered data:')
genes.kept.nb <- nrow(sobj)
genes.kept.nb
print('Number of UMIs in filtered data:')
umi.kept.nb <- sum(sobj@assays$RNA@counts)
umi.kept.nb
print('Fraction of UMIs in droplets :')
fraction.umi <- umi.kept.nb / umi.total.nb
fraction.umi
print('Mean of UMIs in droplets :')
mean.umi <- round((umi.kept.nb / droplets.kept.nb), 0)
mean.umi

```

## Computing basic metrics : Percentages of mito + Percentages of ribo + Cell cycle prediction + Identification of doublets

### Percentages of mito + ribo

```{r mitoribo}
# Pattern
if(species == "human"){
  mito_patern <- "^MT-"
  ribo_patern <- "^RP[LS][[:digit:]]"
}else if (species == "mouse"){
  mito_patern <- "^mt-"
  ribo_patern <- "^Rp[ls][[:digit:]]"
}

# Percentage MITO
## Percent
sobj$percent_mt <- Seurat::PercentageFeatureSet(sobj, pattern = mito_patern)
print('% Mitochondrial expression :')
print(summary(sobj$percent_mt))
## Number
df_percent_mt <- data.frame(percent_mt = sobj$percent_mt, droplets_state = "in")
df_percent_mt[df_percent_mt$percent_mt <= pcmito.min, "droplets_state"] <- "out.left"
df_percent_mt[df_percent_mt$percent_mt >= pcmito.max, "droplets_state"] <- "out.right"
print(paste0(pcmito.min, ' <= % mito <= ', pcmito.max, ' :'))
print(table(df_percent_mt$droplets_state))

# Percentage RIBO
## Percent
sobj$percent_rb <- Seurat::PercentageFeatureSet(sobj, pattern = ribo_patern)
print('% Ribosomal expression :')
print(summary(sobj$percent_rb))
## Number
df_percent_rb <- data.frame(percent_rb = sobj$percent_rb, droplets_state = "in")
df_percent_rb[df_percent_rb$percent_rb <= pcribo.min, "droplets_state"] <- "out.left"
df_percent_rb[df_percent_rb$percent_rb >= pcribo.max, "droplets_state"] <- "out.right"
print(paste0(pcribo.min, ' <= % ribo <= ', pcribo.max, ' :'))
print(table(df_percent_rb$droplets_state))

# Plots
## Histograms
ggplot2::qplot(sobj$percent_mt, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%mito (in [", pcmito.min, ';', pcmito.max, "] : ", length(which(df_percent_mt$droplets_state == "in")), " droplets)"), xlab = "%mito") + ggplot2::geom_vline(xintercept = c(5,10,15,20), col = "cyan4", linetype = "dashed", size = 1) + ggplot2::geom_vline(xintercept = c(pcmito.min,pcmito.max), col = "red", linetype = "dashed", size = 1.5)
ggplot2::qplot(sobj$percent_rb, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%ribo (in [", pcribo.min, ';', pcribo.max, "] : ", length(which(df_percent_rb$droplets_state == "in")), " droplets)"), xlab = "%ribo") + ggplot2::geom_vline(xintercept = c(pcribo.min,pcribo.max), col = "red", linetype = "dashed", size = 1.5)
## Violinplot
Seurat::VlnPlot(sobj, features = c("percent_mt", "percent_rb"), ncol = 2)

```

### Identification of background genes

```{r bkg}
# Nb counts per gene
df_cells_by_gene <- data.frame(nb_cells = apply(sobj@assays$RNA@counts,1,function(x){length(which(x>0))}), gene_state = "in", stringsAsFactors = FALSE)
df_cells_by_gene[df_cells_by_gene$nb_cells <= min.cells, "gene_state"] <- "out"
df_cells_by_gene_zoom <- df_cells_by_gene[df_cells_by_gene$nb_cells <= min.cells*4, ]
print(paste0('Number of genes expressed in >= ', min.cells, ' cells:'))
print(table(df_cells_by_gene$gene_state))
# Plots
## Histograms
ggplot2::qplot(df_cells_by_gene$nb_cells, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nb genes (>= ", min.cells, " : ", length(which(df_cells_by_gene$gene_state == "in")), " genes)"), xlab = "nb cells") + ggplot2::geom_vline(xintercept = min.cells, col = "red", linetype = "dashed", size = 1.5)
ggplot2::qplot(df_cells_by_gene_zoom$nb_cells, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nb genes (>= ", min.cells, " : ", length(which(df_cells_by_gene$gene_state == "in")), " genes) (Zoom)"),xlab = "nb cells") + ggplot2::geom_vline(xintercept = min.cells, col = "red", linetype = "dashed", size = 1.5)

```

### Cell cycle prediction

Note: Cyclone take a while (~10-15min).

```{r ccycle}
# Seurat CC
## Load database
if(species == "human"){
  cc_seurat <- Seurat::cc.genes.updated.2019
}else if (species == "mouse"){
  ### Seurat only provides cell cycle genes for homo sapiens (as this list was developed solely using data from this species). To use it on mus musculus, one can simply convert these human HUGO genes to to mus MGI orthologues (using biomaRt). See [this github issue](https://github.com/satijalab/seurat/issues/2493) 
  cc_seurat <- Seurat::cc.genes.updated.2019
  human_mart = biomaRt::useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl")
  mouse_mart = biomaRt::useEnsembl("ensembl", dataset = "mmusculus_gene_ensembl")
  cc_seurat$s.genes = biomaRt::getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = cc_seurat$s.genes , mart = human_mart, attributesL = c("mgi_symbol"), martL = mouse_mart, uniqueRows=T)[, 2]
  cc_seurat$g2m.genes = biomaRt::getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = cc_seurat$g2m.genes , mart = human_mart, attributesL = c("mgi_symbol"), martL = mouse_mart, uniqueRows=T)[, 2]
}
## Prediction
sobj <- Seurat::CellCycleScoring(object = sobj, s.features = cc_seurat$s.genes, g2m.features = cc_seurat$g2m.genes, assay = "RNA", nbin = 10, seed = the.seed)
sobj$Seurat.SmG2M.Score <- sobj$S.Score - sobj$G2M.Score
## Rename results columns with 'Seurat.' prefix
sobj$Seurat.S.Score <- sobj$S.Score
sobj$Seurat.G2M.Score <- sobj$G2M.Score
sobj$Seurat.Phase <- sobj$Phase
sobj$S.Score <- sobj$G2M.Score <- sobj$Phase <- NULL
## Print results
print("Cell cycle phase according to Seurat: ")
print(table(sobj$Seurat.Phase))

# Scran: Cyclone
## Load database
if(species == "human"){
  # Load
  cc_cyclone <- readRDS(system.file("exdata", "human_cycle_markers.rds", package = "scran"))
  # Translate database from ENSEMBL to gene SYMBOL
  suppressPackageStartupMessages(library(org.Hs.eg.db))
  for (phase in names(cc_cyclone)) {
    for (fs in names(cc_cyclone[[phase]])) {
      cc_cyclone[[phase]][[fs]] <- suppressMessages(AnnotationDbi::mapIds(org.Hs.eg.db, keys=cc_cyclone[[phase]][[fs]], column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first"))
    }
  }
}else if (species == "mouse"){
  # Load
  cc_cyclone <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package = "scran"))
  # Translate database from ENSEMBL to gene SYMBOL
  suppressPackageStartupMessages(library(org.Mm.eg.db))
  for (phase in names(cc_cyclone)) {
    for (fs in names(cc_cyclone[[phase]])) {
      cc_cyclone[[phase]][[fs]] <- suppressMessages(AnnotationDbi::mapIds(org.Mm.eg.db, keys=cc_cyclone[[phase]][[fs]], column = "SYMBOL", keytype = "ENSEMBL", multiVals = "first"))
    }
  }
}
## Prediction
cycres <- scran::cyclone(Seurat::as.SingleCellExperiment(sobj), pairs = cc_cyclone, verbose = FALSE)
sobj$Cyclone.SmG2M.Score <- cycres$normalized.scores$S - cycres$normalized.scores$G2M
## Save results in the seurat object
sobj$Cyclone.Phase <- as.factor(cycres$phases)
sobj$Cyclone.G1.Score <- cycres$normalized.scores$G1
sobj$Cyclone.S.Score <- cycres$normalized.scores$S
sobj$Cyclone.G2M.Score <- cycres$normalized.scores$G2M
rm(cycres)
## Print results
print("Cell cycle phase according to Cyclone: ")
print(table(sobj$Cyclone.Phase))

# Crossing results : Seurat -VS- Cyclone
df_cycle <- data.frame(Seurat.Phase = sobj$Seurat.Phase, Cyclone.Phase = sobj$Cyclone.Phase)
print(table(df_cycle))
print(paste("Seurat and Cyclone agreed for ",round((sum(diag(table(df_cycle))) / sum(table(df_cycle)) *100),2),"% of droplets."))

```

### Identification of cell doublets

```{r doublets}
# scDblFinder
sobj$scDblFinder.class <- Seurat::as.Seurat(scDblFinder::scDblFinder(Seurat::as.SingleCellExperiment(sobj)))$scDblFinder.class
sobj$scDblFinder.class <- unname(sobj$scDblFinder.class == "doublet")
print('scDblFinder doublets :')
print(table(sobj$scDblFinder.class))

# scds
sobj$scds.score <- scds::cxds_bcds_hybrid(Seurat::as.SingleCellExperiment(sobj))$hybrid_score
sobj$scds.class <- unname(sobj$scds.score > 1)
print('scds-hybrid doublets :')
print(table(sobj$scds.class))

# union of scDblFinder & scds
sobj$doublets_consensus.class <- sobj$scDblFinder.class | sobj$scds.class
print('Consensus doublets :')
print(table(sobj$doublets_consensus.class))

# Correlation between doublets and cell cycle
## Overlap cells
df_S.cycle_doublets <- data.frame(Seurat.Phase = sobj$Seurat.Phase, doublets = sobj$doublets_consensus.class)
print(table(df_S.cycle_doublets))
print(paste0(round((nrow(df_S.cycle_doublets[df_S.cycle_doublets$Seurat.Phase == "G2M" & df_S.cycle_doublets$doublets == TRUE,])/nrow(df_S.cycle_doublets[df_S.cycle_doublets$doublets == TRUE,])*100),2),"% of doublets are in G2M phase by Seurat."))
df_C.cycle_doublets <- data.frame(Cyclone.Phase = sobj$Cyclone.Phase, doublets = sobj$doublets_consensus.class)
print(table(df_C.cycle_doublets))
print(paste0(round((nrow(df_C.cycle_doublets[df_C.cycle_doublets$Cyclone.Phase == "G2M" & df_C.cycle_doublets$doublets == TRUE,])/nrow(df_C.cycle_doublets[df_S.cycle_doublets$doublets == TRUE,])*100),2),"% of doublets are in G2M phase by Cyclone."))
## Quick 'n dirty uMAP 
## /!\ Warning: this umap will not represent the cell types present! Many parameters will have to be adjusted for this (tomorrow's course on Statistical models and analyses). Here, we just want to be able to represent all the cells on the same graph and plot the identification of doublets and the phases of the cell cycle /!\
tmp_sobj <- sobj
tmp_sobj <- suppressWarnings(Seurat::SCTransform(tmp_sobj, seed.use = the.seed, verbose = FALSE)) #normalization
tmp_sobj <- Seurat::RunPCA(tmp_sobj, verbose = FALSE,seed.use = the.seed) #dimension reduction
tmp_sobj <- Seurat::RunUMAP(tmp_sobj, dims = 1:25, verbose = FALSE, seed.use = the.seed)
Seurat::DimPlot(tmp_sobj, group.by = "Cyclone.Phase") + ggplot2::ggtitle("Cell Phase (cyclone)") + Seurat::DarkTheme()
Seurat::DimPlot(tmp_sobj, group.by = "Seurat.Phase") + ggplot2::ggtitle("Cell Phase (Seurat)") + Seurat::DarkTheme()
Seurat::DimPlot(tmp_sobj, group.by = "scDblFinder.class") + ggplot2::ggtitle("Cell doublets (scDblFinder)") + Seurat::DarkTheme()
Seurat::DimPlot(tmp_sobj, group.by = "scds.class") + ggplot2::ggtitle("Cell doublets (scds)") + Seurat::DarkTheme()
Seurat::DimPlot(tmp_sobj, group.by = "doublets_consensus.class") + ggplot2::ggtitle("Cell doublets (union)") + Seurat::DarkTheme()

# Cleaning
rm(tmp_sobj)
invisible(gc())
```

## Filtering

```{r filter}
# Filtering cell doublets
sobj <- sobj[, !sobj$doublets_consensus.class]

# Filtering non-qualified cells
cellskeep <- sobj$percent_mt >= pcmito.min & sobj$percent_mt <= pcmito.max & sobj$percent_rb >= pcribo.min & sobj$percent_rb <= pcribo.max
sobj <- sobj[, cellskeep]

# Filtering features with ultra-low expression
features_to_keep <- which(apply(sobj@assays$RNA@counts,1,function(x){ length(which(x>0)) }) >= min.cells)
sobj <- sobj[features_to_keep,]

```

## Checking the effect of filtering

```{r filter_effect}
# Re-computing basic metrics : percentage of mito + ribo + nb features + nb counts
## Nb features by cell
sobj$nFeature_RNA <- apply(sobj@assays$RNA@counts,2,function(x){ length(which(x>0)) })
sobj$min_features <- sobj$nFeature_RNA >= min.features
## Nb counts by cell
sobj$nCount_RNA <- apply(sobj@assays$RNA@counts,2,function(x){ sum(x) })
sobj$min_counts <- sobj$nCount_RNA >= min.counts
## Percent mito
sobj$percent_mt <- Seurat::PercentageFeatureSet(sobj, pattern = mito_patern)
## Percent ribo
sobj$percent_rb <- Seurat::PercentageFeatureSet(sobj, pattern = ribo_patern)

# Some metrics about total filtered count matrix:
print('Number of cells in filtered data:')
print(ncol(sobj))
print('Number of genes in filtered data:')
print(nrow(sobj))
print('Number of UMIs in filtered data:')
print(sum(sobj@assays$RNA@counts))
print(paste0('Number of cells with >= ', min.features, ' features :'))
print(table(sobj$min_features))
print('Mean features by cell :')
print(summary(sobj$nFeature_RNA))
print(paste0('Number of cells with >= ', min.counts, ' of total counts:'))
print(table(sobj$min_counts))
print('Mean Count by cell:')
print(summary(sobj$nCount_RNA))
print('% Mitochondrial expression :')
print(summary(sobj$percent_mt))
print('% Ribosomal expression :')
print(summary(sobj$percent_rb))
print("Cell cycle phases according to Seurat: ")
print(table(sobj$Seurat.Phase))
print("Cell cycle phases according to Cyclone: ")
print(table(sobj$Cyclone.Phase))
print('Consensus doublets :')
print(table(sobj$doublets_consensus.class))

# Plots
## Histograms
ggplot2::qplot(sobj[["nFeature_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nFeature_RNA (>= ", min.features, " : ", length(which(sobj$min_features)), " cells)"), xlab = "nFeature_RNA") + ggplot2::geom_vline(xintercept = min.features, col = "red", linetype = "dashed", size = 1.5)
ggplot2::qplot(sobj[["nCount_RNA", drop = TRUE]], geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("nCount_RNA (>= ", min.counts, " : ", length(which(sobj$min_counts)), " cells)"), xlab = "nCount_RNA") + ggplot2::geom_vline(xintercept = min.counts, col = "red", linetype = "dashed", size = 1.5)
ggplot2::qplot(sobj$percent_mt, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%mito (in [", pcmito.min, ';', pcmito.max, "] : ", length(which(df_percent_mt$droplets_state == "in")), " cells)"), xlab = "%mito") + ggplot2::geom_vline(xintercept = c(5,10,15,20), col = "cyan4", linetype = "dashed", size = 1) + ggplot2::geom_vline(xintercept = c(pcmito.min,pcmito.max), col = "red", linetype = "dashed", size = 1.5)
ggplot2::qplot(sobj$percent_rb, geom = "histogram", bins = 101, fill = I("white"), col = I("black"), main = paste0("%ribo (in [", pcribo.min, ';', pcribo.max, "] : ", length(which(df_percent_rb$droplets_state == "in")), " cells)"), xlab = "%ribo") + ggplot2::geom_vline(xintercept = c(pcribo.min,pcribo.max), col = "red", linetype = "dashed", size = 1.5)
## Violinplot
Seurat::VlnPlot(sobj, features = c("nFeature_RNA", "nCount_RNA","percent_mt","percent_rb"), ncol = 4)
## Scatterplot
Seurat::FeatureScatter(sobj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

```

## Save

```{r save}
# Save parameters
sobj@misc$params$sample.name <- sample.name
sobj@misc$params$species <- species
sobj@misc$params$QC$seed <- the.seed
sobj@misc$params$QC$emptydrops$emptydrops.retain <- emptydrops.retain
sobj@misc$params$QC$pcmito.range <- c(pcmito.min,pcmito.max)
sobj@misc$params$QC$pcribo.range <- c(pcribo.min,pcribo.max)
sobj@misc$params$QC$min.features <- min.features
sobj@misc$params$QC$min.counts <- min.counts
sobj@misc$params$QC$min.cells <- min.cells

# Save R session
sobj@misc$params$QC$Rsession <- utils::capture.output(devtools::session_info())

# Save seurat object
dir.create(output.dir, recursive = TRUE, showWarnings = FALSE)
save(sobj, file = paste0(output.dir, sample.name, '_FILTERED_with.rda'), compress = "bzip2")

```

```{r Rsession}
# Print R session info
utils::capture.output(devtools::session_info())
```