Initialize the R environment and define two directory paths - you may need to update those for your system!
library( dplyr )
library( tidyr )
library( readr )
library( igraph )
library( diffuStats )
library( visNetwork )
library( kableExtra )
# FIXME: update the following two variables to match your system
script_path = '~/github/summer-school-multiomics-data-analysis-and-integration/topic_3/2-Diffusion-Statistics/'
data_path = '~/github/summer-school-multiomics-data-analysis-and-integration/topic_2/datasets/TaraOceans/'
setwd( script_path )
TARAoceans_metadata
contains three columns with string
codes for sample, ocean and depth.
TARAoceans_metadata <-
read_csv(
paste0( data_path, 'TARAoceans_metadata.csv' ),
show_col_types = F
) %>%
rename( sample = 1 )
TARAoceans_metadata %>%
slice_head( n = 5 ) %>%
kable() %>%
kable_minimal( full_width = F )
sample | ocean | depth |
---|---|---|
TARA_109_SRF | SPO | SRF |
TARA_149_MES | NAO | MES |
TARA_110_MES | SPO | MES |
TARA_102_MES | SPO | MES |
TARA_142_SRF | NAO | SRF |
TARAoceans_specfreq
contains a table with the raw read
counts for the different “species” in the different samples. Nota Bene:
(i) we use the wording “species” here for what should be called
Operational Taxonomic Unit (OTU) (ii) the source file name contains the
wording “Phylo” which would make more sense for the next file.
TARAoceans_specfreq <-
read_csv(
paste0( data_path, 'TARAoceans_proPhylo.csv' ),
show_col_types = F
) %>%
rename( sample = 1 )
TARAoceans_specfreq %>%
slice_head( n = 5 ) %>%
select( 1:6 ) %>%
kable() %>%
kable_minimal( full_width = F )
sample | EU638706.1.1353 | JN537192.1.1500 | CAFJ01000195.23.1515 | AJ567555.1.1514 | JQ452773.1.1411 |
---|---|---|---|---|---|
TARA_109_SRF | 0 | 0 | 0 | 0 | 0 |
TARA_149_MES | 0 | 0 | 0 | 0 | 0 |
TARA_110_MES | 0 | 4 | 0 | 0 | 0 |
TARA_102_MES | 0 | 1 | 0 | 0 | 0 |
TARA_142_SRF | 0 | 3 | 0 | 0 | 0 |
TARAoceans_lineage
contains a table with the lineage of
every “species”. Some lineages were corrected with respect to
NA
, uncultured
,
Family Incertae Sedis
, … taxa. Indeed, these correspond to
different entities despite a shared designation and must be
distinguished by introducing placeholder taxa. This also permit to
preserve branch lengths in the final phylogenetic tree. It might be
difficult to detect such shared designation in large dataset. Graphical
visualisation might help. The cleanup_uppertaxon
is totally
adhoc, but fix the problem.
cleanup_uppertaxon <- function( species, uppertaxon, rank ){
return(
ifelse(
is.na( uppertaxon ) | ( uppertaxon %in% c(
'uncultured',
'Family Incertae Sedis',
'Family XII Incertae Sedis',
'FamilyI',
'Incertae Sedis',
'Order Incertae Sedis',
'Marinicella',
'Prochlorococcus'
)),
paste0( rank, '_of_', species ),
paste0( uppertaxon, '_', rank )
)
);
}
TARAoceans_lineage <-
read_csv(
paste0( data_path, 'TARAoceans_taxonomy.csv' ),
show_col_types = F
) %>%
rename( species = 1 ) %>%
mutate(
domain = cleanup_uppertaxon( species, Domain, 'domain' ),
phylum = cleanup_uppertaxon( species, Phylum, 'phylum' ),
class = cleanup_uppertaxon( species, Class, 'class' ),
order = cleanup_uppertaxon( species, Order, 'order' ),
family = cleanup_uppertaxon( species, Family, 'family' ),
genus = cleanup_uppertaxon( species, Genus, 'genus' )
) %>%
select( species, genus, family, order, class, phylum, domain )
TARAoceans_lineage %>%
slice_head( n = 5 ) %>%
select( 1:4 ) %>%
kable() %>%
kable_minimal( full_width = F )
species | genus | family | order |
---|---|---|---|
EU638706.1.1353 | Rhodococcus_genus | Nocardiaceae_family | Corynebacteriales_order |
JN537192.1.1500 | genus_of_JN537192.1.1500 | family_of_JN537192.1.1500 | Sh765B-TzT-29_order |
CAFJ01000195.23.1515 | Bradyrhizobium_genus | Bradyrhizobiaceae_family | Rhizobiales_order |
AJ567555.1.1514 | genus_of_AJ567555.1.1514 | family_of_AJ567555.1.1514 | PAUC43f marine benthic group_order |
JQ452773.1.1411 | Actinomyces_genus | Actinomycetaceae_family | Actinomycetales_order |
We are going to use both visNetwork
and
igraph
to handle graphs. Both package can be fed with
tibbles describing vertices (AKA nodes) and edges. The same tibbles can
be used for both packages provided the first column of the vertices is
named id
and the two first columns for edges are named
from
and to
. First stats with edges:
e_phylogeny <- bind_rows(
TARAoceans_lineage %>% select( from = species, to = genus ),
TARAoceans_lineage %>% select( from = genus, to = family ),
TARAoceans_lineage %>% select( from = family, to = order ),
TARAoceans_lineage %>% select( from = order, to = class ),
TARAoceans_lineage %>% select( from = class, to = phylum ),
TARAoceans_lineage %>% select( from = phylum, to = domain )
) %>% distinct()
e_phylogeny %>%
slice_head( n = 5 ) %>%
kable() %>%
kable_minimal( full_width = F )
from | to |
---|---|
EU638706.1.1353 | Rhodococcus_genus |
JN537192.1.1500 | genus_of_JN537192.1.1500 |
CAFJ01000195.23.1515 | Bradyrhizobium_genus |
AJ567555.1.1514 | genus_of_AJ567555.1.1514 |
JQ452773.1.1411 | Actinomyces_genus |
Nota Bene: The following statement permitted to detect the
problematic taxa, e.g. uncultured
and to write the
exception list in cleanup_uppertaxon()
. Currently it return
an empty tibble.
e_phylogeny %>% count( from ) %>% filter( n > 1 )
Secondly let’s build the vertex tibbles. For practical reason, we are separating the “species” taxa from all the upper taxa:
v_species <- TARAoceans_lineage %>%
select( id = species ) %>%
mutate(
color = 'darkgreen',
label = id,
size = 25
) %>%
distinct()
v_uppertaxon <- bind_rows(
TARAoceans_lineage %>% select( id = genus ),
TARAoceans_lineage %>% select( id = family ),
TARAoceans_lineage %>% select( id = order ),
TARAoceans_lineage %>% select( id = class ),
TARAoceans_lineage %>% select( id = phylum ),
TARAoceans_lineage %>% select( id = domain )
) %>%
distinct() %>%
mutate(
color = 'lightgreen',
label = id,
size = 25,
)
v_phylogeny <- bind_rows( v_species , v_uppertaxon )
v_phylogeny %>%
slice_head( n = 5 ) %>%
kable() %>%
kable_minimal( full_width = F )
id | color | label | size |
---|---|---|---|
EU638706.1.1353 | darkgreen | EU638706.1.1353 | 25 |
JN537192.1.1500 | darkgreen | JN537192.1.1500 | 25 |
CAFJ01000195.23.1515 | darkgreen | CAFJ01000195.23.1515 | 25 |
AJ567555.1.1514 | darkgreen | AJ567555.1.1514 | 25 |
JQ452773.1.1411 | darkgreen | JQ452773.1.1411 | 25 |
Build the phylogeny in an igraph
object and count the
number of connected component.
igraph_phylogeny <- graph_from_data_frame(
e_phylogeny,
vertices = v_phylogeny
)
count_components( igraph_phylogeny )
## [1] 2
Aha! There are two phylogenetic trees here: one for bacteria and another one for Archea.
The Phylogeny can be visualized using igraph
. This is
very fast, but less easy to parametrize than visNetwork
.
This plot below reveals two things: the two trees are not properly
separated and the tree roots are placed at the bottom, which is unusual.
They would be better placed on the left).
plot(
igraph_phylogeny,
layout=layout_as_tree,
vertex.label=NA,
vertex.size=v_phylogeny$size/10, # the size were setup for visNetwork
edge.length = 1,
edge.arrow.mode = '-'
)
By using the decompose
function of igraph
,
one can separate each component of a graph. Here below are plotted the
large bacterial tree and the much smaller archeal one.
components <- decompose( igraph_phylogeny )
lapply(
components,
function( g ){
lay = layout_as_tree( g )
# plot(m, layout=-lay[, 2:1]) or maybe this is more what you want: lay[, 2] = -lay[,2] ; plot(m, layout=lay[,2:1])
plot(
g,
layout = lay[, 2:1],
vertex.label = NA,
vertex.size = v_phylogeny$size/5, # the size were setup for visNetwork
edge.length = 1,
edge.arrow.mode = '-' # no arrow head
)
}
)
## [[1]]
## NULL
##
## [[2]]
## NULL
The Phylogeny can also be visualized using visNetwork
.
The simulation takes a while to stabilize, hence for the
stabilization = FALSE
that prevent locking the display. The
labels become visible after zooming enough.
visNetwork(
edges = e_phylogeny,
nodes = v_phylogeny
) %>%
visPhysics( stabilization = FALSE ) %>%
visEdges( smooth = FALSE)
The idea is to calculate the correlations between sampling depth and “species” distribution. These correlation values will be used as fixed temperature, defined between -a and 1 for the diffusion algorithm.
First, let encode numerically the depth codes. The code below was inspired by Fig X of Y, with the depth expressed on a log-like scale from 0 (surface ) to 5. It would be better to use the real depth measures.
depth_code <- setNames( c( 0, 2, 3, 5 ), c( 'SRF', 'MIX', 'DCM', 'MES' ))
depth <- setNames(
unlist( depth_code[ TARAoceans_metadata$depth ] ),
unlist( TARAoceans_metadata$sample )
)
hist( depth )
The raw read counts are nomalized on a log scale after adding a one pseudocount. The correlations with depth is computed for each “species”. This measue:
species_score <- log10( 1 + as.matrix( TARAoceans_specfreq[ , -1 ] ))
rownames( species_score ) <- unlist( TARAoceans_specfreq[ , 1 ] )
cor_species_depth <- setNames(
as.vector( t( cor( depth, species_score ))),
colnames( species_score )
)
hist( cor_species_depth )
The diffuStats
package expects the graph as an
igraph
object, that we have already have produced. It must
be converted on-the-fly into an undircected graph. The method
regularisedLaplacianKernel
return a (possibly huge) matrix
that might takes some times to compute, although it is not the case here
with our “small” graph of about 1,000 vertices. The function
diffuse
with method z
is extremely fast
(mc
would be much slower).
ker_phylogeny <- regularisedLaplacianKernel(
graph = as.undirected( igraph_phylogeny ),
add_diag = 1
);
diffuse_phylogeny <- diffuse(
K = ker_phylogeny,
scores = cor_species_depth,
method = "z"
)
Print the twenty most enriched nodes based on the absolute value of the Z-score. Note that most of them are upper taxa, not “species” and they include a few placeholder taxa. ;-)
tibble(
id = names( diffuse_phylogeny ),
z_score = diffuse_phylogeny
) %>%
arrange( desc( abs( z_score ))) %>%
slice_head( n = 20 ) %>%
kable() %>%
kable_minimal( full_width = F )
id | z_score |
---|---|
SAR11 clade_order | -5.022086 |
SAR406 clade(Marine group A)_family | 4.528399 |
Deferribacterales_order | 4.526148 |
Nitrospinaceae_family | 4.513036 |
Deferribacteres_class | 4.510217 |
Deferribacteres_phylum | 4.372659 |
family_of_EU802926.1.1368 | -4.194378 |
Nitrospina_genus | 4.108841 |
Archaea_domain | 4.106533 |
Chesapeake-Delaware Bay_family | -3.853019 |
genus_of_EU800201.1.1303 | -3.766122 |
Surface 1_family | -3.761349 |
genus_of_EU802926.1.1368 | -3.747952 |
family_of_JX945345.1.1443 | -3.721350 |
genus_of_EF574663.1.1445 | -3.702239 |
EF574663.1.1445 | -3.699120 |
SAR116 clade_family | -3.692733 |
EU802926.1.1368 | -3.684021 |
EU800201.1.1303 | -3.610615 |
Chloroplast_class | -3.517676 |
Show the full graph colored by Z-score. The vertex with a border are those given a fixed temperature. It takes a while for the graph to stabilize. Thereafter one can identify blue and clades that correspond to taxonomic groups which abundances are correlated with depth positively (blue) or neagtively, respectively (red).
# A helper function to map -csoe to color
ramp_for_z <- colorRamp( c("red", "white", "blue") )
col_for_z <- function( z_score, thresh = 2.5 ){
return(
ifelse(
is.na( z_score ),
"white",
rgb(
ramp_for_z(
pmin( pmax(( z_score + 3 * thresh )/( 6 * thresh ), 0 ), 1 )
)
, maxColorValue = 255 )
)
)
}
w_phylogeny = v_phylogeny # local copy
w_phylogeny$color <- col_for_z( diffuse_phylogeny )
w_phylogeny$borderWidth <- ifelse( v_phylogeny$id %in% names( cor_species_depth ), 2, 0 )
visNetwork(
edges = e_phylogeny,
nodes = w_phylogeny
) %>%
visPhysics( stabilization = FALSE ) %>%
visEdges(smooth = FALSE)