GSVA is a method to identify activity of genesets (from a bank of terms, like MSigDb), in the distribution of gene expression from a sample. Enrichment results are expressed as “NES”, for “Normalized Enrichment Score”.
scGSVA is the adaptation of “classical” GSVA to scRNAseq data (ie, cells instead of samples)
The R packages needed for this tutorial are :
Additional packages needed to build this HTML report :
Copy this Rmd file (and its dependencies) from the common ‘Courses’ dir to your working directory :
## Working directory
user.id <- Sys.getenv('USER')
root.dir <- '/shared/projects/sincellte_2022'
script.dir <- paste(c(root.dir, 'Courses', 'Secondary_analyses',
'scripts'), collapse = '/')
work.dir <- paste(c(root.dir, user.id, 'Courses', 'Secondary_analysis',
'scGSVA'), collapse = '/')
dir.create(path = paste(c(work.dir, 'scGSVA_rmd_resources'), collapse = '/'),
recursive = TRUE, showWarnings = FALSE)
## Copying the Rmd
file.copy(from = paste0(script.dir, '/scGSVA_TD_deliverable.Rmd'),
to = work.dir, overwrite = TRUE)
## Copying the dependencies
file.copy(from = list.files(
path = paste0(script.dir, '/scGSVA_rmd_resources/'), full.names = TRUE),
to = paste0(work.dir, '/scGSVA_rmd_resources/'), overwrite = TRUE)
## Moving to your working directory
setwd(work.dir)
## If you want to visualize your files in Rstudio, you can click the little greyish/white arrow on the right of the path written at the top of your Rstudio Console.
data.dir <- paste(c(root.dir, 'Courses', 'Secondary_analyses', 'input',
'trajectory_analysis'), collapse = '/')
## Local input file
local.Seurat.file <- paste0(data.dir, '/Thymus_neonat_mm_Seurat_ENSEMBL.RDS')
## VARIABLES
my.species <- 'mouse' ## Current species
my.keytype <- 'ENSEMBL' ## Genes keytype
my.annot.db <- 'KEGG' ## Here we want to confront to KEGG db
my.annot <- 'cell_type' ## Which cell annotation will be used for term differential
my.ref <- 'DN' ## Which cell type will be used as reference for term diferential
my.red <- 'pca.30_umap'
my.terms <- c('Tryptophan metabolism', 'Metabolic pathways',
'ABC transporters', 'Huntington disease',
'Complement and coagulation cascades') ## Reduced list of terms to limit CPU time
my.seed <- 1337L
Time for this code chunk to run : 1.213 s
## Preparing annotation object for KEGG db
my.kbannot <- scGSVA::buildAnnot(species = my.species, keytype = my.keytype, anntype = my.annot.db)
## 'select()' returned 1:many mapping between keys and columns
## 'data.frame': 16857 obs. of 3 variables:
## $ GeneID: chr "ENSMUSG00000020804" "ENSMUSG00000020804" "ENSMUSG00000015243" "ENSMUSG00000015243" ...
## $ PATH : chr "00380" "01100" "02010" "04975" ...
## $ Annot : chr "Tryptophan metabolism" "Metabolic pathways" "ABC transporters" "Fat digestion and absorption" ...
## - attr(*, "na.action")= 'omit' Named int [1:26841] 1 4 12 22 41 46 47 68 82 83 ...
## ..- attr(*, "names")= chr [1:26841] "1" "4" "12" "22" ...
## Limiting the knowledge base (to speed computation for this training course)
my.kbannot <- my.kbannot[my.kbannot$Annot %in% my.terms]
str(my.kbannot)
## 'data.frame': 1509 obs. of 3 variables:
## $ GeneID: chr "ENSMUSG00000020804" "ENSMUSG00000020804" "ENSMUSG00000015243" "ENSMUSG00000028125" ...
## $ PATH : chr "00380" "01100" "02010" "02010" ...
## $ Annot : chr "Tryptophan metabolism" "Metabolic pathways" "ABC transporters" "ABC transporters" ...
## - attr(*, "na.action")= 'omit' Named int [1:26841] 1 4 12 22 41 46 47 68 82 83 ...
## ..- attr(*, "names")= chr [1:26841] "1" "4" "12" "22" ...
Time for this code chunk to run : 10.744 s
## Running on KEGG with method ssgsea (long for real data, even with 4 cores, and memory ogre)
set.seed(my.seed)
scg.res <- scGSVA::scgsva(obj = sobj, annot = my.kbannot, cores = 1)
Time for this code chunk to run : 1.388 s
## No id variables; using all as measure variables
colnames(nes.df) <- c('Term', 'NES')
ggplot2::ggplot(
nes.df, ggplot2::aes(x = NES, fill = Term)) + ggplot2::geom_density(
alpha = .4) + ggplot2::geom_vline(
xintercept = 0, linetype = 'dashed')
## Differential analysis of term enrichment (through linear regression), taking DN cells as reference
scg.fp <- scGSVA::findPathway(object = scg.res, group = my.annot, ref = my.ref)
summary(scg.fp$P.Value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.062724 0.006171 0.735183
## Focus on some pathways (those with adj.p == 0)
fp.best.logical <- scg.fp$adj.P.Val == 0
fp.bests <- scg.fp$term[fp.best.logical]
message(paste0('Best enriched terms : ', paste(fp.bests, collapse = ', ')))
## Best enriched terms : Huntington.disease, Complement.and.coagulation.cascades
Time for this code chunk to run : 0.049 s
## logFC AveExpr t P.Value adj.P.Val B
## 11 0.2866668 0.1460130 41.65256 0 0 774.3397
## 12 -0.1032886 0.3586951 -40.83998 0 0 746.8366
## term comparision
## 11 Huntington.disease DP-M4_vs_DN
## 12 Complement.and.coagulation.cascades DP-M4_vs_DN
for (fpb in fp.bests) {
fplot <- scGSVA::featurePlot(object = scg.res, features = fpb, reduction = my.red, group_by = my.annot)
rplot <- scGSVA::ridgePlot(object = scg.res, features = fpb, group_by = my.annot)
print(patchwork::wrap_plots(list(fplot, rplot)))
}
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Picking joint bandwidth of 0.0216
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Picking joint bandwidth of 0.00745
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=fr_FR.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=fr_FR.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=fr_FR.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] org.Mm.eg.db_3.13.0 AnnotationDbi_1.56.2 IRanges_2.28.0
## [4] S4Vectors_0.32.3 Biobase_2.54.0 BiocGenerics_0.40.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 reticulate_1.22
## [3] tidyselect_1.1.1 RSQLite_2.2.9
## [5] htmlwidgets_1.5.4 grid_4.1.2
## [7] BiocParallel_1.28.3 Rtsne_0.15
## [9] munsell_0.5.0 ScaledMatrix_1.2.0
## [11] codetools_0.2-18 ica_1.0-2
## [13] future_1.23.0 miniUI_0.1.1.1
## [15] colorspace_2.0-2 highr_0.9
## [17] knitr_1.37 rstudioapi_0.13
## [19] Seurat_4.0.6 SingleCellExperiment_1.16.0
## [21] ROCR_1.0-11 tensor_1.5
## [23] listenv_0.8.0 labeling_0.4.2
## [25] MatrixGenerics_1.6.0 GenomeInfoDbData_1.2.7
## [27] polyclip_1.10-0 farver_2.1.0
## [29] bit64_4.0.5 pheatmap_1.0.12
## [31] rhdf5_2.38.0 parallelly_1.30.0
## [33] vctrs_0.3.8 generics_0.1.1
## [35] xfun_0.29 R6_2.5.1
## [37] GenomeInfoDb_1.30.0 rsvd_1.0.5
## [39] msigdbr_7.4.1 bitops_1.0-7
## [41] rhdf5filters_1.6.0 spatstat.utils_2.3-0
## [43] cachem_1.0.6 DelayedArray_0.20.0
## [45] assertthat_0.2.1 promises_1.2.0.1
## [47] scales_1.1.1 nnet_7.3-16
## [49] gtable_0.3.0 beachmat_2.10.0
## [51] globals_0.14.0 goftest_1.2-3
## [53] rlang_0.4.12 splines_4.1.2
## [55] rstatix_0.7.0 lazyeval_0.2.2
## [57] broom_0.7.11 spatstat.geom_2.3-1
## [59] checkmate_2.0.0 yaml_2.2.1
## [61] reshape2_1.4.4 abind_1.4-5
## [63] backports_1.4.1 httpuv_1.6.5
## [65] Hmisc_4.6-0 tools_4.1.2
## [67] ggplot2_3.3.5 ellipsis_0.3.2
## [69] spatstat.core_2.3-2 scGSVA_0.0.11
## [71] jquerylib_0.1.4 RColorBrewer_1.1-2
## [73] ggridges_0.5.3 Rcpp_1.0.7
## [75] plyr_1.8.6 base64enc_0.1-3
## [77] sparseMatrixStats_1.6.0 zlibbioc_1.40.0
## [79] purrr_0.3.4 RCurl_1.98-1.5
## [81] rpart_4.1-15 deldir_1.0-6
## [83] viridis_0.6.2 pbapply_1.5-0
## [85] cowplot_1.1.1 zoo_1.8-9
## [87] SeuratObject_4.0.4 SummarizedExperiment_1.24.0
## [89] ggrepel_0.9.1 cluster_2.1.2
## [91] magrittr_2.0.1 data.table_1.14.2
## [93] scattermore_0.7 lmtest_0.9-39
## [95] RANN_2.6.1 fitdistrplus_1.1-6
## [97] matrixStats_0.61.0 patchwork_1.1.1
## [99] mime_0.12 evaluate_0.14
## [101] GSVA_1.42.0 xtable_1.8-4
## [103] XML_3.99-0.8 jpeg_0.1-9
## [105] gridExtra_2.3 compiler_4.1.2
## [107] tibble_3.1.6 KernSmooth_2.23-20
## [109] crayon_1.4.2 htmltools_0.5.2
## [111] mgcv_1.8-38 later_1.3.0
## [113] Formula_1.2-4 tidyr_1.1.4
## [115] DBI_1.1.2 MASS_7.3-54
## [117] babelgene_21.4 car_3.0-12
## [119] Matrix_1.4-0 parallel_4.1.2
## [121] igraph_1.2.11 GenomicRanges_1.46.1
## [123] pkgconfig_2.0.3 foreign_0.8-81
## [125] plotly_4.10.0 spatstat.sparse_2.1-0
## [127] annotate_1.72.0 bslib_0.3.1
## [129] XVector_0.34.0 stringr_1.4.0
## [131] digest_0.6.29 sctransform_0.3.2
## [133] RcppAnnoy_0.0.19 graph_1.72.0
## [135] spatstat.data_2.1-2 Biostrings_2.62.0
## [137] rmarkdown_2.11 leiden_0.3.9
## [139] htmlTable_2.4.0 uwot_0.1.11
## [141] DelayedMatrixStats_1.16.0 GSEABase_1.56.0
## [143] curl_4.3.2 shiny_1.7.1
## [145] lifecycle_1.0.1 nlme_3.1-153
## [147] jsonlite_1.7.2 Rhdf5lib_1.16.0
## [149] carData_3.0-5 viridisLite_0.4.0
## [151] limma_3.50.0 fansi_1.0.0
## [153] pillar_1.6.4 lattice_0.20-45
## [155] KEGGREST_1.34.0 fastmap_1.1.0
## [157] httr_1.4.2 survival_3.2-13
## [159] glue_1.6.0 png_0.1-7
## [161] bit_4.0.4 stringi_1.7.6
## [163] sass_0.4.0 HDF5Array_1.22.1
## [165] blob_1.2.2 BiocSingular_1.10.0
## [167] latticeExtra_0.6-29 memoise_2.0.1
## [169] dplyr_1.0.7 irlba_2.3.5
## [171] future.apply_1.8.1