EDIT: For anyone searching later, it may be useful to know that this data is the output I got from BioMart when I exported sequence attributes.
I have the following genomic data:
# Example input: a data frame with 10 rows and a single factor column,
# `Sequences`. Rows whose value starts with ">" are gene ID headers; the rows
# that follow each header are the lines of sequence belonging to that gene.
# The integer codes index into `.Label`, so the row order is:
# header (Pebp1), 1 sequence line, header (Pfkl), 2 sequence lines,
# header (Hsph1), 4 sequence lines.
# NOTE(review): this looks like dput() output and is not assigned to a name;
# the answer snippets refer to it as `dat` / `df` -- assign it before running.
structure(list(Sequences = structure(c(2L, 10L, 3L, 8L, 9L, 1L,
5L, 4L, 6L, 7L), .Label = c(">ENSRNOG00000000902|Hsph1", ">ENSRNOG00000001136|Pebp1",
">ENSRNOG00000001214|Pfkl", "AGAGAGGCGAGCGGCGGAGAGCGGTGGCAAATACTGAACGCAGTCTCGCAGGGTAAGCCC",
"GAGCGATTGGGACCTCCCCTTTTGGATTGGTAGCTGAGCGGCAGTGGCGGCGGCTGCGTG",
"GAGGCATCTTCCCGGCCGGTCGGGAGCAGGAGGAGCACGCAGCGGATCCCAGGCAGAGGC",
"GGACCGGGCCAGCC", "GGCGGGGACAGGCGACAGCCGCGCGGAACGCAGAGCGGCGGGAGAGGAGCTCGGGCTCCT",
"GGTCTCTGCTGCCGTC", "GTTTAACTGCACTCGGGACTCGGCGCGCGCGTGTGTCTGTTCTCTCCATCGTC"
), class = "factor")), .Names = "Sequences", class = "data.frame", row.names = c(NA,
-10L))
I would like to rearrange the data so the first column shows the gene ID info (e.g. for the first case it would be: ">ENSRNOG00000001136|Pebp1") and then the lines of genomic code underneath it would appear together in the column next to it. Note that rows 7-10 have multiple lines of genetic code. Here, all the strings below the gene ID info would be concatenated together into one line instead of spread over 4 separate rows. Lastly, I also want to remove the ">" symbol that comes before each of the gene IDs.
The final output would thus be:
ID Sequence
ENSRNOG00000001136|Pebp1 GTTTAACTGCACTCGGGACTCGGCGCGCGCGTGTGTCTGTTCTCTCCATCGTC
ENSRNOG00000001214|Pfkl GGCGGGGACAGGCGACAGCCGCGCGGAACGCAGAGCGGCGGGAGAGGAGCTCGGGCTCCTGGTCTCTGCTGCCGTC
ENSRNOG00000000902|Hsph1 GAGCGATTGGGACCTCCCCTTTTGGATTGGTAGCTGAGCGGCAGTGGCGGCGGCTGCGTGAGAGAGGCGAGCGGCGGAGAGCGGTGGCAAATACTGAACGCAGTCTCGCAGGGTAAGCCCGAGGCATCTTCCCGGCCGGTCGGGAGCAGGAGGAGCACGCAGCGGATCCCAGGCAGAGGCGGACCGGGCCAGCC
Note that this is just the first few lines of ~2500 row data frame. The solution needs to be general enough that it can parse situations where the number of rows taken up by the sequence may be more than the 4 rows shown in the example above.
After labeling each row by whether it is a gene name (based on whether it starts with ">"), you can use `cumsum` to assign the rows to groups. Then you can use `substr` to remove the leading ">" and `tapply` with `paste` to combine all the sequence lines in each group.
# Collapse FASTA-style rows (a ">ID" header followed by its sequence lines)
# into one row per gene. Expects `dat` to have a `Sequences` column.
dat$Sequences <- as.character(dat$Sequences)
# A header is a row that STARTS with ">"; anchoring the pattern keeps a ">"
# elsewhere in a line from being mistaken for the start of a new record.
(is.gene <- grepl("^>", dat$Sequences))
# [1] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
# Every header bumps the running count, so each row carries its gene's index.
(gene.groups <- cumsum(is.gene))
# [1] 1 1 2 2 2 3 3 3 3 3
# Group the sequence lines with one factor level per header. This way a header
# that has no sequence lines still gets a cell (instead of being dropped and
# breaking the length match with ID), and any rows before the first header
# (group 0) become NA in the factor and are ignored by tapply().
seq.groups <- factor(gene.groups[!is.gene], levels = seq_len(sum(is.gene)))
Sequence <- tapply(dat$Sequences[!is.gene], seq.groups, paste, collapse = "")
# tapply() fills cells of empty groups with NA; report those as empty sequences.
Sequence[is.na(Sequence)] <- ""
data.frame(ID = substr(dat$Sequences[is.gene], 2, nchar(dat$Sequences[is.gene])),
           Sequence = Sequence)
# ID
# 1 ENSRNOG00000001136|Pebp1
# 2 ENSRNOG00000001214|Pfkl
# 3 ENSRNOG00000000902|Hsph1
# Sequence
# 1 GTTTAACTGCACTCGGGACTCGGCGCGCGCGTGTGTCTGTTCTCTCCATCGTC
# 2 GGCGGGGACAGGCGACAGCCGCGCGGAACGCAGAGCGGCGGGAGAGGAGCTCGGGCTCCTGGTCTCTGCTGCCGTC
# 3 GAGCGATTGGGACCTCCCCTTTTGGATTGGTAGCTGAGCGGCAGTGGCGGCGGCTGCGTGAGAGAGGCGAGCGGCGGAGAGCGGTGGCAAATACTGAACGCAGTCTCGCAGGGTAAGCCCGAGGCATCTTCCCGGCCGGTCGGGAGCAGGAGGAGCACGCAGCGGATCCCAGGCAGAGGCGGACCGGGCCAGCC
Here's a possible `data.table` solution (though you will have an additional column called `indx` that you can remove later if you wish):
library(data.table)
# Convert `df` to a data.table by reference, then build one row per record.
# A row starting with ">" opens a new record, so cumsum() over that flag gives
# every row its record index (`indx`). The pattern is anchored, like the sub()
# below, so only a leading ">" marks a header.
setDT(df)[, list(
  # First line of each group is the header; strip its leading ">".
  ID = sub("^>", "", Sequences[1L]),
  # All remaining lines of the group are sequence; join them into one string.
  Sequences = paste(Sequences[-1L], collapse = "")
),
by = list(indx = cumsum(grepl("^>", Sequences)))]
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With