#!/usr/bin/env R library(clusterProfiler) # Make enrichment analysis library(enrichplot) # Awesome graphs library(org.At.tair.db) # A. Thaliana annotation ################################## # PRE-REQUISITES # !! Copy the folder to your directory ################################## # Set working directory setwd("/shared/projects//TP_GSEA/") ################################## # PART 1 : Introduction # From DESeq2, we have a large table with many columns ################################## # Load data deseq_genes = read.table( file = "tables/KOvsWT.complete.txt", sep = "\t", header = TRUE ) colnames(deseq_genes) nrow(deseq_genes) head(deseq_genes$Id) # Explore data deseq_genes[deseq_genes$Id == "gene:AT1G61580", ] ################################################### # PART 2: Gene Identifiers # For a computer, gene:AT1G01010 != AT1G01010 # For a human, AT1G01010 is horrible to remember # We need to clean identifiers, # and add human-readable names ##################################################### # Fix identifiers for the computer head(deseq_genes$Id) deseq_genes$Id = sub(pattern = "gene:", replacement = "", x = deseq_genes$Id) head(deseq_genes$Id) # Translate TAIR ID to gene symbol annotation = clusterProfiler::bitr( geneID = deseq_genes$Id, # Our gene list fromType = "TAIR", # We have TAIR ID toType = c("SYMBOL", "ENTREZID"), # What we want OrgDb = org.At.tair.db, # Our annotation drop = FALSE ) head(annotation) dim(annotation) # Add the translation to the result table deseq_genes_with_symbol = merge( x = deseq_genes, y = annotation, by.x = "Id", by.y = "TAIR" ) dim(deseq_genes_with_symbol) dim(deseq_genes) head(deseq_genes_with_symbol[, c("Id", "SYMBOL", "ENTREZID")]) ####################################################### # PART 3: Gene sets # We need to filter differentially expressed genes # in order to perform ORA. ####################################################### # Filter DEG dim(deseq_genes) de_genes = deseq_genes[deseq_genes[, "padj"] <= 0.001, ] de_genes = de_genes[!is.na(de_genes[, "log2FoldChange"]), ] dim(de_genes) # Perform ORA ego = clusterProfiler::enrichGO( gene = de_genes$Id, # gene list universe = deseq_genes$Id, # all genes OrgDb = org.At.tair.db, # annotation keyType = "TAIR", # nature of the genes ID ont = "CC", # Cellular Components pvalueCutoff = 1, # significance threshold (take all) pAdjustMethod = "BH", # p-value adjustment method readable = TRUE # For human beings ) View(ego) head(ego@result, 3) # Plot ORA results graphics::barplot(ego, showCategory = 15) enrichplot::dotplot(ego, showCategory = 15) # Search for enriched terms that related to roots grep(ego@result$Description, pattern = "root", value = TRUE) ego@result[ego@result$Description == "root hair", ] # Correct the previous ORA to include biological processes # instead of cellular components ego = clusterProfiler::enrichGO( gene = de_genes$Id, # gene list universe = deseq_genes$Id, # all genes OrgDb = org.At.tair.db, # annotation keyType = "TAIR", # nature of the genes ID ont = "BP", # Biological Processes pvalueCutoff = 1, # significance threshold (take all) pAdjustMethod = "BH", # p-value adjustment method readable = TRUE # For human beings ) # Plot ORA results graphics::barplot(ego, showCategory = 15) enrichplot::dotplot(ego, showCategory = 15) # Search for roots root_names = grep(ego@result$Description, pattern = "root", value = TRUE) root_names ego@result[ego@result$Description %in% root_names, ] # Plot ORA results graphics::barplot(ego, showCategory = root_names) enrichplot::dotplot(ego, showCategory = root_names) ################################################################## # PART 4: GSEA # We need to build a named vector which contains sorted numbers # Then we can run a Gene set enrichment analysis ################################################################### # Explore results to guess the right column to extract colnames(deseq_genes) # We choose to use the 'stat' column geneList = as.numeric(de_genes$stat) names(geneList) = de_genes$Id geneList = sort(geneList, decreasing = TRUE) head(geneList) # Run GSEA on Gene Ontology gsea = clusterProfiler::gseGO( geneList = geneList, # ranked gene list ont = "BP", # Biological Processes OrgDb = org.At.tair.db, # annotation keyType = "TAIR", # nature of the genes ID pAdjustMethod = "BH", # p-value adjustment method pvalueCutoff = 1, # significance threshold (take all) seed = 1 # fix randomness for permutations ) # Explore results gsea@result %>% dplyr::filter(p.adjust < 0.05) %>% dplyr::top_n(., n = 8, wt = abs(NES)) %>% dplyr::select(Description, NES, p.adjust, setSize) # Explore results for roots root_names = grep(gsea@result$Description, pattern = "root", value = TRUE) root_names gsea@result %>% dplyr::filter(p.adjust < 0.05) %>% dplyr::filter(Description %in% root_names) %>% dplyr::top_n(., n = 8, wt = abs(NES)) %>% dplyr::select(Description, NES, p.adjust, setSize) gene_set_name = "root hair cell differentiation" gene_set_id = which(gsea@result$Description == gene_set_name) gene_set_id enrichplot::gseaplot2( x = gsea, geneSetID = gene_set_id, title = gene_set_name ) ################################################################## # BONUS PART: VISUALIZATION ################################################################### # Plot multiple pathways on the same graph enrichplot::gseaplot2( x = gsea, geneSetID = c(1:3), title = "Most enriched terms" ) # Use a Heatmap enrichplot::heatplot( x = ego, # Our ORA showCategory = root_names, # Gene sets of interest foldChange = setNames(nm = de_genes$Id, de_genes$log2FoldChange) # Our fold changes ) # Use an upsetplot ego = enrichplot::pairwise_termsim(ego) enrichplot::upsetplot(x = ego, # Our ORA n = 10) # Nb of terms to display # Use an enrichment map enrichplot::emapplot(ego, showCategory = root_names) # Use an gene-concept network enrichplot::cnetplot(ego, showCategory = root_names, foldChange = setNames(nm = de_genes$Id, de_genes$log2FoldChange)) # Plot complete pathways library(pathview) # With this command, the picture is saved in the working directory # pv_out = pathview::pathview( # gene.data = geneList, # Our gene list # pathway.id = "ath00630", # Our pathway # species = "ath", # Our organism # limit = list(gene = max(abs(geneList))), # The color limits # gene.idtype = "TAIR" # The genes identifiers # ) # pv_out # https://stackoverflow.com/questions/60141841/how-to-get-pathview-plot-displayed-directly-rather-than-saving-as-a-file-in-r see_pathview = function(..., save_image = FALSE) { msg = capture.output(pathview::pathview(...), type = "message") msg = grep("image file", msg, value = TRUE) filename = sapply(strsplit(msg, " "), function(x) x[length(x)]) img = png::readPNG(filename) grid::grid.raster(img) if(!save_image) invisible(file.remove(filename)) } see_pathview(gene.data = geneList, pathway.id = "ath00630", species = "ath", limit = list(gene = max(abs(geneList))), gene.idtype = "TAIR")