This file is associated with the course on Functional Analysis, accessible at this link: https://moodle.france-bioinformatique.fr/course/view.php?id=28. This file shows the code to perform over-representation analysis and gene set enrichment analysis. The analyses are based on the clusterProfiler package:

citation("clusterProfiler")
## Please cite S. Xu (2024) for using clusterProfiler. In addition, please
## cite G. Yu (2010) when using GOSemSim, G. Yu (2015) when using DOSE and
## G. Yu (2015) when using ChIPseeker.
## 
##   S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R
##   Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
##   multiomics data. Nature Protocols. 2024,
##   doi:10.1038/s41596-024-01020-z
## 
##   T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L
##   Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
##   enrichment tool for interpreting omics data. The Innovation. 2021,
##   2(3):100141
## 
##   Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
##   clusterProfiler: an R package for comparing biological themes among
##   gene clusters. OMICS: A Journal of Integrative Biology 2012,
##   16(5):284-287
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.

Environment

We load packages of interest:

library(clusterProfiler)  # Make enrichment analysis
library(enrichplot)       # Awesome graphs
library(org.At.tair.db)   # A. Thaliana annotation

.libPaths()
## [1] "/shared/home/aonfroy/R/x86_64-conda-linux-gnu-library/4.4"     
## [2] "/shared/ifbstor1/software/miniconda/envs/r-4.4.1/lib/R/library"

Data

We load the data from the differential expression analysis.

deseq_genes = read.table(
  file = "./tables/KOvsWT.complete.txt",
  sep = "\t",
  header = TRUE
)

We assess the dimensions of the data

colnames(deseq_genes)
##  [1] "Id"             "WT1"            "WT2"            "WT3"           
##  [5] "KO1"            "KO2"            "KO3"            "norm.WT1"      
##  [9] "norm.WT2"       "norm.WT3"       "norm.KO1"       "norm.KO2"      
## [13] "norm.KO3"       "baseMean"       "WT"             "KO"            
## [17] "FoldChange"     "log2FoldChange" "stat"           "pvalue"        
## [21] "padj"           "dispGeneEst"    "dispFit"        "dispMAP"       
## [25] "dispersion"     "betaConv"       "maxCooks"
nrow(deseq_genes)
## [1] 27655
head(deseq_genes$Id)
## [1] "gene:AT1G01010" "gene:AT1G01020" "gene:AT1G01030" "gene:AT1G01040"
## [5] "gene:AT1G01050" "gene:AT1G01060"

We explore the data:

deseq_genes[deseq_genes$Id == "gene:AT1G61580", ]
##                  Id WT1 WT2 WT3 KO1 KO2 KO3 norm.WT1 norm.WT2 norm.WT3 norm.KO1
## 5120 gene:AT1G61580 248 231 205 119 131 125      229      210      215      131
##      norm.KO2 norm.KO3 baseMean  WT  KO FoldChange log2FoldChange  stat
## 5120      131      123   173.19 218 128      0.588         -0.766 -4.48
##            pvalue         padj dispGeneEst dispFit dispMAP dispersion betaConv
## 5120 7.465947e-06 0.0001156724           0  0.0311  0.0149     0.0149     TRUE
##      maxCooks
## 5120   0.0222

Gene identifiers

For a computer, gene:AT1G01010 is not AT1G01010. We fix identifiers for the computer:

head(deseq_genes$Id)
## [1] "gene:AT1G01010" "gene:AT1G01020" "gene:AT1G01030" "gene:AT1G01040"
## [5] "gene:AT1G01050" "gene:AT1G01060"
deseq_genes$Id = sub(pattern = "gene:",
                     replacement = "",
                     x = deseq_genes$Id)

head(deseq_genes$Id)
## [1] "AT1G01010" "AT1G01020" "AT1G01030" "AT1G01040" "AT1G01050" "AT1G01060"

Gene symbol

For a human, AT1G61580 is horrible to remember. We can add human-readable names. The latter are called “symbol”.

annotation = clusterProfiler::bitr(
  geneID   = deseq_genes$Id,          # Our gene list
  fromType = "TAIR",                  # We have TAIR ID
  toType   = c("ENTREZID", "SYMBOL"), # What we want
  OrgDb    = org.At.tair.db,          # Our annotation
  drop = FALSE
)
## 'select()' returned 1:many mapping between keys and columns
## Warning in clusterProfiler::bitr(geneID = deseq_genes$Id, fromType = "TAIR", :
## 3.42% of input gene IDs are fail to map...
head(annotation)
##        TAIR ENTREZID  SYMBOL
## 1 AT1G01010   839580 ANAC001
## 2 AT1G01010   839580  NAC001
## 3 AT1G01010   839580   NTL10
## 4 AT1G01020   839569    ARV1
## 5 AT1G01030   839321    NGA3
## 6 AT1G01040   839574    ASU1
dim(annotation)
## [1] 38947     3

We add the translation to the result table

deseq_genes_with_symbol = merge(
  x = deseq_genes,
  y = annotation,
  by.x = "Id",   # In deseq_genes, TAIR IDs are stored in the Id column
  by.y = "TAIR"  # In annotation, TAIR IDs are stored in the TAIR column
)

dim(deseq_genes)
## [1] 27655    27
dim(deseq_genes_with_symbol)
## [1] 38947    29
head(deseq_genes_with_symbol[, c("Id", "SYMBOL", "ENTREZID")])
##          Id  SYMBOL ENTREZID
## 1 AT1G01010 ANAC001   839580
## 2 AT1G01010  NAC001   839580
## 3 AT1G01010   NTL10   839580
## 4 AT1G01020    ARV1   839569
## 5 AT1G01030    NGA3   839321
## 6 AT1G01040    ASU1   839574

Adding symbol changes the number of dimensions of the table:

dim(deseq_genes)
## [1] 27655    27
dim(deseq_genes_with_symbol)
## [1] 38947    29

We are going to use the original table, so we clean the annotation and annotated table:

rm(annotation, deseq_genes_with_symbol)

Over-representation analysis

We need to filter differentially expressed genes in order to perform ORA.

dim(deseq_genes)
## [1] 27655    27
de_genes = deseq_genes[deseq_genes[, "padj"] <= 0.001, ]
de_genes = de_genes[!is.na(de_genes[, "log2FoldChange"]), ]
dim(de_genes)
## [1] 1807   27

Cellular components

We perform the ORA using the gene ontology for cellular components:

ego = clusterProfiler::enrichGO(
  gene = de_genes$Id,                # gene list
  universe = deseq_genes$Id,         # all genes
  OrgDb = org.At.tair.db,            # annotation
  keyType = "TAIR",                  # nature of the genes ID
  ont = "CC",                        # Cellular Components
  pvalueCutoff = 1,                  # significance threshold (take all)
  pAdjustMethod = "BH",              # p-value adjustment method
  readable = TRUE                    # For human beings
)

# View(ego)

How it looks like ?

head(ego@result, 3)
##                    ID                    Description GeneRatio   BgRatio
## GO:0055035 GO:0055035     plastid thylakoid membrane   74/1785 357/26909
## GO:0009535 GO:0009535 chloroplast thylakoid membrane   73/1785 349/26909
## GO:0019867 GO:0019867                 outer membrane   85/1785 477/26909
##                  pvalue     p.adjust       qvalue
## GO:0055035 1.033262e-18 1.443340e-16 1.310881e-16
## GO:0009535 1.042123e-18 1.443340e-16 1.310881e-16
## GO:0019867 5.207312e-17 3.606064e-15 3.275125e-15
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        geneID
## GO:0055035                                                             PSAD-2/EMB2784/PnsL2/NDF1/NDF6/LHCA2*1/ATLFNR2/DRT112/COR413-TM1/AB180/AB165/AB140/PSAF/PRK/CP22/PSAH-2/PSAG/RBCS1A/cS23z/CRR23/LIL8/SPPA/NDH-O/CRR3/LHCB2/LHCB2/NA/NA/PSBW/DEG11/DRN1/ATBCA1/EMB3119/LHCB4.2/ATNCED3/ATHM4/AOC1/ATPRX/CCL/DEG13/bL12cz/ATLOX2/PTAC16/CAB4/CHL/LHCA1/CYP20-3/OEP6/ATPC1/ATPD/PSAL/ATRAB8D/PSBQ/PSAE-1/PDE334/NDH-M/CURT1D/FKBP16-2/ATPLAT1/LHCB4.1/ATFER1/BETA/NA/PHGAP2/ATHMA8/NA/ATGSL1/NDH18/PAM68L/LHCB3/PSAN/PSAB/PSAA/AthCF1beta
## GO:0009535                                                                    PSAD-2/EMB2784/PnsL2/NDF1/NDF6/LHCA2*1/ATLFNR2/COR413-TM1/AB180/AB165/AB140/PSAF/PRK/CP22/PSAH-2/PSAG/RBCS1A/cS23z/CRR23/LIL8/SPPA/NDH-O/CRR3/LHCB2/LHCB2/NA/NA/PSBW/DEG11/DRN1/ATBCA1/EMB3119/LHCB4.2/ATNCED3/ATHM4/AOC1/ATPRX/CCL/DEG13/bL12cz/ATLOX2/PTAC16/CAB4/CHL/LHCA1/CYP20-3/OEP6/ATPC1/ATPD/PSAL/ATRAB8D/PSBQ/PSAE-1/PDE334/NDH-M/CURT1D/FKBP16-2/ATPLAT1/LHCB4.1/ATFER1/BETA/NA/PHGAP2/ATHMA8/NA/ATGSL1/NDH18/PAM68L/LHCB3/PSAN/PSAB/PSAA/AthCF1beta
## GO:0019867 PSAD-2/FZL/EMB2784/PnsL2/NDF1/NDF6/LHCA2*1/ATLFNR2/AtTTM2/COR413-TM1/AB180/AB165/AB140/PSAF/PRK/CP22/NA/PSAH-2/PSAG/RBCS1A/cS23z/CRR23/LIL8/AtTTM1/SPPA/NDH-O/ARC3/CRR3/LHCB2/LHCB2/NA/NA/PSBW/DEG11/DRN1/ATBCA1/EMB3119/LHCB4.2/ATNCED3/ATHM4/AOC1/ATPRX/CCL/DEG13/bL12cz/AtOM47/ATLOX2/PTAC16/CAB4/CHL/AtBCS1/LHCA1/CYP20-3/OEP6/ATPC1/ATPD/AZI3/PSAL/ATRAB8D/PSBQ/PSAE-1/KOC1/PDE334/NDH-M/CURT1D/FKBP16-2/ATPLAT1/LHCB4.1/ATFER1/BETA/NA/PHGAP2/ATHMA8/ATKO1/NA/ATGSL1/NDH18/At-NEET/PAM68L/NA/LHCB3/PSAN/PSAB/PSAA/AthCF1beta
##            Count
## GO:0055035    74
## GO:0009535    73
## GO:0019867    85

We visualize results:

graphics::barplot(ego,
                  showCategory = 15)