Data
I have the following (simplified) dataset, which we will call df from now on:
species rank value
1 Pseudomonas putida family Pseudomonadaceae
2 Pseudomonas aeruginosa family Pseudomonadaceae
3 Enterobacter xiangfangensis family Enterobacteriaceae
4 Salmonella enterica family Enterobacteriaceae
5 Klebsiella pneumoniae family Enterobacteriaceae
6 Pseudomonas putida genus Pseudomonas
7 Pseudomonas aeruginosa genus Pseudomonas
8 Enterobacter xiangfangensis genus Enterobacter
9 Salmonella enterica genus Salmonella
10 Klebsiella pneumoniae genus Klebsiella
11 Pseudomonas putida species Pseudomonas putida
12 Pseudomonas aeruginosa species Pseudomonas aeruginosa
13 Enterobacter xiangfangensis species Enterobacter hormaechei
14 Salmonella enterica species Salmonella enterica
15 Klebsiella pneumoniae species Klebsiella pneumoniae
What I want to achieve
This is taxonomy data that shows the species classification, where the ranks are ordered family > genus > species. Because of its hierarchical nature, I want to show it as a tree, preferably in ggplot2, like so:
What I tried
I found a package, taxize, written to convert this (actually the full classification, which is only partially shown here) to a tree using class2tree:
# Look up the full NCBI classification of each species and convert the result
# to a taxonomy tree. Both calls are namespace-qualified so the snippet works
# without attaching taxize first. Requires network access to NCBI.
class.dat <- taxize::classification(
  c(
    "Pseudomonas putida", "Pseudomonas aeruginosa",
    "Enterobacter xiangfangensis", "Salmonella enterica",
    "Klebsiella pneumoniae"
  ),
  db = "ncbi"
)
taxize::class2tree(class.dat)
However, this does not show the ranks that I need in my visualization, as my hand-made graph does:
EDIT: dput of data
# dput() output of the example data frame: 15 rows, one per (species, rank)
# combination. Columns:
#   species - the queried species name
#   rank    - taxonomic rank of the row ("family", "genus" or "species")
#   value   - the taxon name at that rank
# The expression is not assigned to a name here; assign it (e.g. to `df`) to
# reuse it.
structure(list(species = c("Pseudomonas putida", "Pseudomonas putida",
"Pseudomonas putida", "Pseudomonas aeruginosa", "Pseudomonas aeruginosa",
"Pseudomonas aeruginosa", "Enterobacter xiangfangensis", "Enterobacter xiangfangensis",
"Enterobacter xiangfangensis", "Salmonella enterica", "Salmonella enterica",
"Salmonella enterica", "Klebsiella pneumoniae", "Klebsiella pneumoniae",
"Klebsiella pneumoniae"), rank = c("family", "genus", "species",
"family", "genus", "species", "family", "genus", "species", "family",
"genus", "species", "family", "genus", "species"), value = c("Pseudomonadaceae",
"Pseudomonas", "Pseudomonas putida", "Pseudomonadaceae", "Pseudomonas",
"Pseudomonas aeruginosa", "Enterobacteriaceae", "Enterobacter",
"Enterobacter hormaechei", "Enterobacteriaceae", "Salmonella",
"Salmonella enterica", "Enterobacteriaceae", "Klebsiella", "Klebsiella pneumoniae"
)), row.names = c(NA, -15L), class = "data.frame", .Names = c("species",
"rank", "value"))
EDIT: Response to @StupidWolf
I was able to convert class.dat to a data frame and then into a parent-child data frame to use as input for ggraph. The only thing left is the x-axis labels, in this case the interest vector. However, I'm not sure whether that's possible in ggraph:
# Packages used by this snippet
library(taxize)
library(dplyr)
library(tidyr)
library(igraph)
library(ggraph)

# Retrieve the NCBI classification of every species (requires network access)
class.dat <- classification(
  c(
    "Pseudomonas putida", "Pseudomonas aeruginosa",
    "Enterobacter xiangfangensis", "Salmonella enterica",
    "Klebsiella pneumoniae"
  ),
  db = "ncbi"
)

# Ranks to display, ordered from the root of the tree to the tips
interest <- c("superkingdom", "phylum", "class", "order", "genus", "species")

# Convert to a wide character matrix: one row per queried species and one
# column per rank, with the columns in the order given by `interest`
df2 <- bind_rows(class.dat, .id = "column_label") %>%
  dplyr::select(-id) %>%
  filter(rank %in% interest) %>%
  spread(rank, name) %>%
  dplyr::select(-column_label) %>%
  dplyr::select(all_of(interest)) %>% # all_of(): select by external vector, keeps the order
  as.matrix()

# Build the parent-child edge list by pairing every rank column with the next
# one. The pieces are collected in a list and bound once instead of growing a
# matrix inside a loop; seq_len() yields no iterations when there is only one
# rank column, and the leading empty matrix keeps the result a two-column
# matrix in that case.
rank_pairs <- lapply(
  seq_len(ncol(df2) - 1L),
  function(i) df2[, c(i, i + 1L), drop = FALSE]
)
parent.child <- do.call(
  rbind,
  c(list(matrix(character(0), nrow = 0, ncol = 2)), rank_pairs)
)

# Species that share a lineage produce the same parent-child pair several
# times; keep each edge once so the graph is a tree rather than a multigraph
parent.child <- unique(parent.child)

# To data frame and add column names
parent.child <- as.data.frame(parent.child, stringsAsFactors = FALSE)
colnames(parent.child) <- c("from", "to")

# Convert this to a graph and draw it as a left-to-right dendrogram
g <- graph_from_data_frame(parent.child)
ggraph(g, layout = "dendrogram", circular = FALSE) +
  geom_edge_link() +
  # `name` is the vertex-name column of the layout data
  geom_node_label(aes(label = name), size = 3, nudge_y = -0.1) +
  # One explicit break per rank so `labels` always has a matching length.
  # NOTE(review): assumes the dendrogram layout puts the tips at y = 0 and each
  # higher rank one unit further up - confirm on the rendered plot.
  scale_y_reverse(breaks = rev(seq_along(interest)) - 1, labels = interest) +
  coord_flip() +
  theme_classic()
Then we create a hierarchical bundling
# Edge list of the taxonomy, one data frame per level:
#   d1: artificial root ("origin") -> family
#   d2: family -> genus
#   d3: genus  -> species
d1 <- data.frame(
  from = "origin",
  to = c("Pseudomonadaceae", "Enterobacteriaceae")
)
d2 <- data.frame(
  from = c(
    "Pseudomonadaceae", "Pseudomonadaceae", "Enterobacteriaceae",
    "Enterobacteriaceae", "Enterobacteriaceae"
  ),
  to = c("Pseudomonas", "Pseudomonas", "Enterobacter", "Salmonella", "Klebsiella")
)
d3 <- data.frame(
  from = c("Pseudomonas", "Pseudomonas", "Enterobacter", "Salmonella", "Klebsiella"),
  to = c(
    "Pseudomonas putida", "Pseudomonas aeruginosa", "Enterobacter hormaechei",
    "Salmonella enterica", "Klebsiella pneumoniae"
  )
)

# d2 lists Pseudomonadaceae -> Pseudomonas twice (once per Pseudomonas
# species). Drop repeated edges so every non-root node has exactly one parent.
hierarchy <- unique(rbind(d1, d2, d3))
rownames(hierarchy) <- NULL

# One row per distinct node, in order of first appearance (parents first)
vertices <- data.frame(
  name = unique(c(as.character(hierarchy$from), as.character(hierarchy$to)))
)
Then we either plot them using igraph:
library(igraph)

# Build a graph from the edge list, with the node set given explicitly
g <- graph_from_data_frame(hierarchy, vertices = vertices)

# layout_as_tree() is the current name of the deprecated
# layout.reingold.tilford()
lay <- layout_as_tree(g)

# Remove the plot margins for this figure only and restore them afterwards
old_par <- par(mar = c(0, 0, 0, 0))

# The layout columns are swapped and negated before plotting, which turns the
# top-down tree layout on its side
plot(
  g,
  layout = -lay[, 2:1],
  vertex.label.cex = 0.7,
  vertex.size = 1,
  edge.arrow.size = 0.4
)
par(old_par)
Or something like this in ggraph:
library(ggraph)

# Draw the graph `g` as a non-circular dendrogram: straight edges, one label
# per node taken from the vertex names of `g`, then a reversed y scale and
# flipped coordinates on a blank theme.
ggraph(g, layout = "dendrogram", circular = FALSE) +
  geom_edge_link() +
  geom_node_label(
    aes(label = names(V(g))),
    size = 2,
    nudge_y = -0.1
  ) +
  scale_y_reverse() +
  coord_flip() +
  theme_void()
Here's a graph based approach.
library(igraph)

# NOTE(review): `d` is not defined in this snippet; presumably it is the
# long-format taxonomy table with `species`, `rank` and `value` columns -
# confirm before running.

# For every species build two edges: family -> genus (flagged "root" in the
# third column) and genus -> species (third column NA), then stack the
# per-species pieces into one data frame.
df <- do.call(rbind, lapply(split(d, d$species), function(x) {
  data.frame(
    rbind(
      c(x$value[match(c("family", "genus"), x$rank)], "root"),
      c(x$value[match(c("genus", "species"), x$rank)], NA)
    ),
    stringsAsFactors = FALSE
  )
}))

# Species sharing a genus repeat the same family -> genus edge; keep each once
df <- unique(df)
rownames(df) <- NULL
df

# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame()
g <- graph_from_data_frame(df, directed = FALSE)

# The tree roots are the parents of the edges flagged "root". `%in%` never
# yields NA, so the NA flags of the other edges cannot leak into the subset.
root_names <- unique(df[[1]][df[[3]] %in% "root"])
plot(g, layout = layout_as_tree(g, root = which(V(g)$name %in% root_names)))
and ggplot
# Build plotting coordinates for a hand-drawn tree: one line per species
# (`grp`) running through x = 1 (family), x = 2 (genus) and x = 3 (species).
# NOTE(review): `d` is not defined in this snippet; presumably it is the
# long-format taxonomy table with `rank` and `value` columns. If it also has a
# `species` column, spread() will produce a second `species` column from the
# rank values - confirm the input shape before running.
# NOTE(review): spread()/gather() are superseded by pivot_wider()/
# pivot_longer(); they are kept here so the reshaping is unchanged.
d2 <- d %>%
  spread(rank, value) %>%
  arrange(family, genus, species) %>%
  mutate(
    # Keep only the second word of the species name (the epithet); vapply()
    # guarantees a character vector even for empty input
    species = vapply(strsplit(species, " "), `[`, character(1), 2),
    # y3: one row per species; grp ties the three points of a species together
    y3 = row_number(),
    grp = row_number(),
    # y2 / y1: mean position of the rows within each genus / family
    y2 = ave(y3, genus, FUN = mean),
    y1 = ave(y2, family, FUN = mean)
  ) %>%
  # Long format: one row per (species line, level) with its y position
  gather(key, y, -family, -genus, -species, -grp) %>%
  mutate(
    # Level index: y1 -> 1, y2 -> 2, y3 -> 3
    x = as.numeric(factor(key, c("y1", "y2", "y3"))),
    # Text shown at each point: the taxon name of the corresponding level
    lbl = case_when(
      key == "y1" ~ family,
      key == "y2" ~ genus,
      key == "y3" ~ species,
      TRUE ~ NA_character_
    )
  ) %>%
  arrange(x, y)

# Closes every open graphics device before drawing the new plot
graphics.off()

ggplot(d2, aes(x, y, group = grp, label = lbl)) +
  geom_point(size = 2, shape = 21) +
  geom_line() +
  geom_text(hjust = "inward", vjust = "inward")
A great resource on phylogenetic trees with R, by Prof. Guangchuang Yu:
https://yulab-smu.top/treedata-book/index.html
Here's my solution using ggtree:
# Packages:
# Install BiocManager and ggtree only when they are missing, so that running
# the script again does not trigger a reinstall every time.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("ggtree", quietly = TRUE)) {
  BiocManager::install("ggtree")
}
library(ggplot2)
library(ggtree)
library(treeio)
library(ape)
library(tidytree)
It's better to use a dedicated format when computing phylogenies (for example, NEXUS).
# New Hampshire eXtended (NHX) format:
# The tree is written as nested parentheses; each node may carry
# [&&NHX:key=value] annotations. The plotting code further down uses the `S`
# and `D` tags as label text. Spelling of the displayed names has been
# corrected ("P.aeruginosa", "Pseudomonadaceae").
treetext <- "(((P.putida:1[&&NHX:S=S],P.aeruginosa:1[&&NHX:S=S:B=])
:1.3[&&NHX:D=Pseudomonas:S=G]):1[&&NHX:D=Pseudomonadaceae:S=F],
((K.pneumoniae:1[&&NHX:]):1.3[&&NHX:D=Klebsiella],(S.enterica:1[&&NHX:])
:1.3[&&NHX:D=Salmonella],(E.xiangfangensis:1[&&NHX:]):1.3[&&NHX:D=Enterobacter])
:1[&&NHX:D=Enterobacteriaceae])
:1[&&NHX:D=Gammaproteobacteria];"

# Parse the text through an explicit connection and close it afterwards, so no
# open text connection is left behind
tree_con <- textConnection(treetext)
tree <- read.nhx(tree_con)
close(tree_con)
# Plot stuff
# Rank captions drawn below the tree: one label per x position 0..3, all at
# y = -2
d <- data.frame(
  .panel = c("Tree", "Tree", "Tree", "Tree"),
  lab = c("Class", "Family", "Genus", "Species"),
  x = c(0, 1, 2, 3),
  y = -2
)

# Tree with tip labels, the `S` annotation drawn on the branches (green) and
# the `D` annotation drawn at the nodes (blue). Clipping is switched off and a
# wide margin added so labels placed outside the panel stay visible.
p <- ggtree(tree) +
  geom_tiplab(offset = 0) + # numeric zero instead of the reassignable `F`
  geom_label(aes(x = branch, label = S), fill = "lightgreen") +
  geom_label(aes(label = D), fill = "lightblue") +
  coord_cartesian(clip = "off") +
  theme_tree2(
    plot.margin = margin(3, 3, 3, 3, "cm"),
    axis.ticks = element_blank(),
    axis.text.x = element_blank()
  )

# Add the rank captions from `d`
p + geom_text(aes(label = lab), data = d)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With