I've got data for several experiments that gives the proportion of a species' DNA in a sample that I want to represent in stacked bar charts. Example data is
# Example data in dput() form: a 40-row tibble with one row per sample/species
# pair. Columns are `sample` (sample identifier), `name` (species name) and
# `fraction_total_reads` (that species' share of the reads in the sample).
sample_df <- structure(list(sample = c("R17108_BSSE_QGF_132757_HHWVVDRXX_1_G44937_CCGTGAAG_CAGTGGAT_S29_L001",
"R17108_BSSE_QGF_132757_HHWVVDRXX_1_G44937_CCGTGAAG_CAGTGGAT_S29_L001",
"R17108_BSSE_QGF_132757_HHWVVDRXX_1_G44937_CCGTGAAG_CAGTGGAT_S29_L001",
"R18676", "R18676", "R23399_COW6673A59_S33", "R23399_COW6673A59_S33",
"R23464_COW5599A32_S33", "R23464_COW5599A32_S33", "R23464_COW5599A32_S33",
"R24033_concatreseq", "R24033_concatreseq", "R24033_concatreseq",
"R24033_concatreseq", "R24033_concatreseq", "R24033_concatreseq",
"R24033_concatreseq", "R24033", "R24033", "R24033", "R24033",
"R24033", "R24033", "R30216_concatreseq", "R30216_concatreseq",
"R30216", "R30216", "R31417_concatreseq", "R31417", "R32064",
"R32064", "R32064", "R32064", "R4752_BSSE_QGF_132888_HHWVVDRXX_1_G45072_CAGGAGCC_GTCCAATC_S159_L001",
"R4752_BSSE_QGF_132888_HHWVVDRXX_1_G45072_CAGGAGCC_GTCCAATC_S159_L001",
"R4752_BSSE_QGF_132888_HHWVVDRXX_1_G45072_CAGGAGCC_GTCCAATC_S159_L001",
"R4752_BSSE_QGF_132888_HHWVVDRXX_1_G45072_CAGGAGCC_GTCCAATC_S159_L001",
"R4752_BSSE_QGF_132888_HHWVVDRXX_1_G45072_CAGGAGCC_GTCCAATC_S159_L001",
"R4775_LFO46Pool105_3311__L5_ACCACTGT_L005", "R4775_LFO46Pool105_3311__L5_ACCACTGT_L005"
), name = c("Microbacterium sp. LKL04", "Microbacterium oleivorans",
"Mycobacterium tuberculosis", "Staphylococcus cohnii", "Mycobacterium tuberculosis",
"Paraburkholderia fungorum", "Paraburkholderia xenovorans", "Paraburkholderia fungorum",
"Paraburkholderia aromaticivorans", "Paraburkholderia xenovorans",
"Bacillus safensis", "Bacillus sp. WP8", "Bacillus sp. PAMC28571",
"Bacillus sp. PAMC22265", "Bacillus pumilus", "Bacillus altitudinis",
"Bacillus subtilis", "Bacillus safensis", "Bacillus sp. WP8",
"Bacillus sp. PAMC28571", "Bacillus sp. PAMC22265", "Bacillus pumilus",
"Bacillus altitudinis", "Mycobacterium avium", "Mycobacterium tuberculosis",
"Mycobacterium avium", "Mycobacterium tuberculosis", "Mycobacterium avium",
"Mycobacterium avium", "Staphylococcus aureus", "Paenibacillus sp. 32O-W",
"Mycobacterium tuberculosis", "Homo sapiens", "Ralstonia pickettii",
"Ralstonia mannitolilytica", "Ralstonia insidiosa", "Ralstonia solanacearum",
"Mycobacterium tuberculosis", "Paenibacillus naphthalenovorans",
"Paenibacillus sp. B01"), fraction_total_reads = c(0.29347, 0.09071,
0.46242, 0.6525, 0.32403, 0.92541, 0.01772, 0.8842, 0.04011,
0.01561, 0.72733, 0.02744, 0.11121, 0.02673, 0.03845, 0.02282,
0.01176, 0.73674, 0.02711, 0.12122, 0.01858, 0.03677, 0.02115,
0.97964, 0.01579, 0.98397, 0.01227, 0.9907, 0.99348, 0.43967,
0.01337, 0.4288, 0.02825, 0.54439, 0.08077, 0.05916, 0.01978,
0.23135, 0.70247, 0.02424)), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame"))
And then plot it like this
# One stacked bar per sample; each segment is a species, outlined in black.
# geom_col() draws the supplied y values as-is and stacks them by default.
ggplot(sample_df, aes(x = sample, y = fraction_total_reads, fill = name)) +
  geom_col(colour = "black") +
  # Rotate the long sample identifiers so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
To get
The problem is that I'm mainly interested in Mycobacterium tuberculosis, but the color for it often gets lost in the other Mycobacterium species so it doesn't stand out. Is there a way to assign a specific distinct color or background or border to one species without having to assign colors for all? My real data has >1000 species so that would be very labor intensive. In other places I've seen it suggested that I create a new column like so
# Add a logical flag marking the rows of the species of interest.
# The comparison already yields TRUE/FALSE, so no ifelse() wrapper is needed.
sample_df2 <- mutate(
  sample_df,
  is_mtb = name == "Mycobacterium tuberculosis"
)
And then plot with
# Stacked bars filled only by the is_mtb flag (two fill colours in total).
ggplot(sample_df2, aes(x = sample, y = fraction_total_reads, fill = is_mtb)) +
  geom_col() +
  # Rotate the long sample identifiers so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
But this loses too much information, because I can no longer see what the other species are.
It would be best for me if I had a plot like the first one, where Mycobacterium tuberculosis was highlighted in some way and was also placed at the bottom of each bar.
You can approach it this way:
library(ggplot2)
# Build a named palette: one default ggplot2 hue per distinct species,
# keyed by species name so scale_fill_manual() can match on it.
mynames <- unique(sample_df$name)
mycolours <- setNames(scales::hue_pal()(length(mynames)), mynames)

# Override only the species that should stand out; all others keep their hue.
mycolours[["Mycobacterium tuberculosis"]] <- "red"

# Same stacked bar chart as before, now using the named palette for the fill.
ggplot(sample_df, aes(x = sample, y = fraction_total_reads, fill = name)) +
  geom_col(colour = "black") +
  scale_fill_manual(values = mycolours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
You can highlight it even more with this:
library(ggplot2)
# Three named lookup vectors, each keyed by species name, drive the fill,
# outline colour and transparency of every bar segment.
mynames <- unique(sample_df$name)

# Fill: default ggplot2 hues, with the highlighted species forced to red.
myfills <- setNames(scales::hue_pal()(length(mynames)), mynames)
myfills[["Mycobacterium tuberculosis"]] <- "red"

# Outline: black for every species except the highlighted one.
mycolours <- setNames(rep("black", length(mynames)), mynames)
mycolours[["Mycobacterium tuberculosis"]] <- "red"

# Alpha: other species are faded to 0.6; the highlighted one stays opaque.
myalphas <- setNames(rep(0.6, length(mynames)), mynames)
myalphas[["Mycobacterium tuberculosis"]] <- 1

# Map the species name to all three aesthetics and apply the manual scales.
ggplot(
  sample_df,
  aes(
    x = sample,
    y = fraction_total_reads,
    fill = name,
    colour = name,
    alpha = name
  )
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_fill_manual(values = myfills) +
  scale_colour_manual(values = mycolours) +
  scale_alpha_manual(values = myalphas)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With