This file is associated with the course on Functional Analysis,
accessible at this link: https://moodle.france-bioinformatique.fr/course/view.php?id=28.
This file shows the code to perform over-representation analysis and
gene set enrichment analysis. The analyses are based on the
clusterProfiler
package:
citation("clusterProfiler")
## Please cite S. Xu (2024) for using clusterProfiler. In addition, please
## cite G. Yu (2010) when using GOSemSim, G. Yu (2015) when using DOSE and
## G. Yu (2015) when using ChIPseeker.
##
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R
## Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024,
## doi:10.1038/s41596-024-01020-z
##
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L
## Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
## enrichment tool for interpreting omics data. The Innovation. 2021,
## 2(3):100141
##
## Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
## clusterProfiler: an R package for comparing biological themes among
## gene clusters. OMICS: A Journal of Integrative Biology 2012,
## 16(5):284-287
##
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
We load packages of interest:
library(clusterProfiler) # Make enrichment analysis
library(enrichplot) # Awesome graphs
library(org.At.tair.db) # A. Thaliana annotation
.libPaths()
## [1] "/shared/home/aonfroy/R/x86_64-conda-linux-gnu-library/4.4"
## [2] "/shared/ifbstor1/software/miniconda/envs/r-4.4.1/lib/R/library"
We load the data from the differential expression analysis.
deseq_genes = read.table(
file = "./tables/KOvsWT.complete.txt",
sep = "\t",
header = TRUE
)
We assess the dimensions of the data
colnames(deseq_genes)
## [1] "Id" "WT1" "WT2" "WT3"
## [5] "KO1" "KO2" "KO3" "norm.WT1"
## [9] "norm.WT2" "norm.WT3" "norm.KO1" "norm.KO2"
## [13] "norm.KO3" "baseMean" "WT" "KO"
## [17] "FoldChange" "log2FoldChange" "stat" "pvalue"
## [21] "padj" "dispGeneEst" "dispFit" "dispMAP"
## [25] "dispersion" "betaConv" "maxCooks"
nrow(deseq_genes)
## [1] 27655
head(deseq_genes$Id)
## [1] "gene:AT1G01010" "gene:AT1G01020" "gene:AT1G01030" "gene:AT1G01040"
## [5] "gene:AT1G01050" "gene:AT1G01060"
We explore the data:
deseq_genes[deseq_genes$Id == "gene:AT1G61580", ]
## Id WT1 WT2 WT3 KO1 KO2 KO3 norm.WT1 norm.WT2 norm.WT3 norm.KO1
## 5120 gene:AT1G61580 248 231 205 119 131 125 229 210 215 131
## norm.KO2 norm.KO3 baseMean WT KO FoldChange log2FoldChange stat
## 5120 131 123 173.19 218 128 0.588 -0.766 -4.48
## pvalue padj dispGeneEst dispFit dispMAP dispersion betaConv
## 5120 7.465947e-06 0.0001156724 0 0.0311 0.0149 0.0149 TRUE
## maxCooks
## 5120 0.0222
For a computer, gene:AT1G01010
is not
AT1G01010
. We fix identifiers for the computer:
head(deseq_genes$Id)
## [1] "gene:AT1G01010" "gene:AT1G01020" "gene:AT1G01030" "gene:AT1G01040"
## [5] "gene:AT1G01050" "gene:AT1G01060"
deseq_genes$Id = sub(pattern = "gene:",
replacement = "",
x = deseq_genes$Id)
head(deseq_genes$Id)
## [1] "AT1G01010" "AT1G01020" "AT1G01030" "AT1G01040" "AT1G01050" "AT1G01060"
For a human, AT1G61580
is horrible to remember. We can
add human-readable names. The latter are called “symbol”.
annotation = clusterProfiler::bitr(
geneID = deseq_genes$Id, # Our gene list
fromType = "TAIR", # We have TAIR ID
toType = c("ENTREZID", "SYMBOL"), # What we want
OrgDb = org.At.tair.db, # Our annotation
drop = FALSE
)
## 'select()' returned 1:many mapping between keys and columns
## Warning in clusterProfiler::bitr(geneID = deseq_genes$Id, fromType = "TAIR", :
## 3.42% of input gene IDs are fail to map...
head(annotation)
## TAIR ENTREZID SYMBOL
## 1 AT1G01010 839580 ANAC001
## 2 AT1G01010 839580 NAC001
## 3 AT1G01010 839580 NTL10
## 4 AT1G01020 839569 ARV1
## 5 AT1G01030 839321 NGA3
## 6 AT1G01040 839574 ASU1
dim(annotation)
## [1] 38947 3
We add the translation to the result table
deseq_genes_with_symbol = merge(
x = deseq_genes,
y = annotation,
by.x = "Id", # In deseq_genes, TAIR IDs are stored in the Id column
by.y = "TAIR" # In annotation, TAIR IDs are stored in the TAIR column
)
dim(deseq_genes)
## [1] 27655 27
dim(deseq_genes_with_symbol)
## [1] 38947 29
head(deseq_genes_with_symbol[, c("Id", "SYMBOL", "ENTREZID")])
## Id SYMBOL ENTREZID
## 1 AT1G01010 ANAC001 839580
## 2 AT1G01010 NAC001 839580
## 3 AT1G01010 NTL10 839580
## 4 AT1G01020 ARV1 839569
## 5 AT1G01030 NGA3 839321
## 6 AT1G01040 ASU1 839574
Adding symbol changes the number of dimensions of the table:
dim(deseq_genes)
## [1] 27655 27
dim(deseq_genes_with_symbol)
## [1] 38947 29
We are going to use the original table, so we clean the annotation and annotated table:
rm(annotation, deseq_genes_with_symbol)
We need to filter differentially expressed genes in order to perform ORA.
dim(deseq_genes)
## [1] 27655 27
de_genes = deseq_genes[deseq_genes[, "padj"] <= 0.001, ]
de_genes = de_genes[!is.na(de_genes[, "log2FoldChange"]), ]
dim(de_genes)
## [1] 1807 27
We perform the ORA using the gene ontology for cellular components:
ego = clusterProfiler::enrichGO(
gene = de_genes$Id, # gene list
universe = deseq_genes$Id, # all genes
OrgDb = org.At.tair.db, # annotation
keyType = "TAIR", # nature of the genes ID
ont = "CC", # Cellular Components
pvalueCutoff = 1, # significance threshold (take all)
pAdjustMethod = "BH", # p-value adjustment method
readable = TRUE # For human beings
)
# View(ego)
How it looks like ?
head(ego@result, 3)
## ID Description GeneRatio BgRatio
## GO:0055035 GO:0055035 plastid thylakoid membrane 74/1785 357/26909
## GO:0009535 GO:0009535 chloroplast thylakoid membrane 73/1785 349/26909
## GO:0019867 GO:0019867 outer membrane 85/1785 477/26909
## pvalue p.adjust qvalue
## GO:0055035 1.033262e-18 1.443340e-16 1.310881e-16
## GO:0009535 1.042123e-18 1.443340e-16 1.310881e-16
## GO:0019867 5.207312e-17 3.606064e-15 3.275125e-15
## geneID
## GO:0055035 PSAD-2/EMB2784/PnsL2/NDF1/NDF6/LHCA2*1/ATLFNR2/DRT112/COR413-TM1/AB180/AB165/AB140/PSAF/PRK/CP22/PSAH-2/PSAG/RBCS1A/cS23z/CRR23/LIL8/SPPA/NDH-O/CRR3/LHCB2/LHCB2/NA/NA/PSBW/DEG11/DRN1/ATBCA1/EMB3119/LHCB4.2/ATNCED3/ATHM4/AOC1/ATPRX/CCL/DEG13/bL12cz/ATLOX2/PTAC16/CAB4/CHL/LHCA1/CYP20-3/OEP6/ATPC1/ATPD/PSAL/ATRAB8D/PSBQ/PSAE-1/PDE334/NDH-M/CURT1D/FKBP16-2/ATPLAT1/LHCB4.1/ATFER1/BETA/NA/PHGAP2/ATHMA8/NA/ATGSL1/NDH18/PAM68L/LHCB3/PSAN/PSAB/PSAA/AthCF1beta
## GO:0009535 PSAD-2/EMB2784/PnsL2/NDF1/NDF6/LHCA2*1/ATLFNR2/COR413-TM1/AB180/AB165/AB140/PSAF/PRK/CP22/PSAH-2/PSAG/RBCS1A/cS23z/CRR23/LIL8/SPPA/NDH-O/CRR3/LHCB2/LHCB2/NA/NA/PSBW/DEG11/DRN1/ATBCA1/EMB3119/LHCB4.2/ATNCED3/ATHM4/AOC1/ATPRX/CCL/DEG13/bL12cz/ATLOX2/PTAC16/CAB4/CHL/LHCA1/CYP20-3/OEP6/ATPC1/ATPD/PSAL/ATRAB8D/PSBQ/PSAE-1/PDE334/NDH-M/CURT1D/FKBP16-2/ATPLAT1/LHCB4.1/ATFER1/BETA/NA/PHGAP2/ATHMA8/NA/ATGSL1/NDH18/PAM68L/LHCB3/PSAN/PSAB/PSAA/AthCF1beta
## GO:0019867 PSAD-2/FZL/EMB2784/PnsL2/NDF1/NDF6/LHCA2*1/ATLFNR2/AtTTM2/COR413-TM1/AB180/AB165/AB140/PSAF/PRK/CP22/NA/PSAH-2/PSAG/RBCS1A/cS23z/CRR23/LIL8/AtTTM1/SPPA/NDH-O/ARC3/CRR3/LHCB2/LHCB2/NA/NA/PSBW/DEG11/DRN1/ATBCA1/EMB3119/LHCB4.2/ATNCED3/ATHM4/AOC1/ATPRX/CCL/DEG13/bL12cz/AtOM47/ATLOX2/PTAC16/CAB4/CHL/AtBCS1/LHCA1/CYP20-3/OEP6/ATPC1/ATPD/AZI3/PSAL/ATRAB8D/PSBQ/PSAE-1/KOC1/PDE334/NDH-M/CURT1D/FKBP16-2/ATPLAT1/LHCB4.1/ATFER1/BETA/NA/PHGAP2/ATHMA8/ATKO1/NA/ATGSL1/NDH18/At-NEET/PAM68L/NA/LHCB3/PSAN/PSAB/PSAA/AthCF1beta
## Count
## GO:0055035 74
## GO:0009535 73
## GO:0019867 85
We visualize results:
graphics::barplot(ego,
showCategory = 15)