I am computing haplotypes from multiple sequence alignments and am getting stretches of repeats such as RNNNNNNNT and RNNNT. There are many variations, making it hard to make sense of the data.
The data is listed below, and I'm interested in generating the column haplotypes_2 from haplotypes_1 as indicated:
hap_code haplotypes_1 haplotypes_2
1 SKNNNRNNNNNKNNNNNNNKF SK(N3)R(N5)K(N7)KF
2 SKNNNNNNNNNKNNNNNNNNKF SK(N9)K(N8)KF
3 SKNNNNNNNNNNNNNNNNKF SK(N16)KF
Using stringr and a custom function:
library(stringr)
#' Format a matched string as "(<first character><number of characters>)".
#'
#' Intended as the replacement function for a run of one repeated letter,
#' e.g. "NNN" is rendered as "(N3)".
#'
#' @param x Character vector of matched substrings.
#' @return Character vector with one formatted label per element of `x`.
replace_string <- function(x) {
  first_letter <- str_sub(x, end = 1L)
  n_letters <- str_length(x)
  sprintf("(%s%i)", first_letter, n_letters)
}
# Replace every run of one or more "N" in haplotypes_1 with the label built
# by replace_string() and store the result in a new column.
df1$hapnew <- str_replace_all(
  string = df1$haplotypes_1,
  pattern = "N+",
  replacement = replace_string
)
hap_code haplotypes_1 haplotypes_2 hapnew
1 1 SKNNNRNNNNNKNNNNNNNKF SK(N3)R(N5)K(N7)KF SK(N3)R(N5)K(N7)KF
2 2 SKNNNNNNNNNKNNNNNNNNKF SK(N9)K(N8)KF SK(N9)K(N8)KF
3 3 SKNNNNNNNNNNNNNNNNKF SK(N16)KF SK(N16)KF
# Example haplotype strings containing long runs of a repeated letter.
x <- c(
  "SKNNNRNNNNNKNNNNNNNKF",
  "SKNNNNNNNNNKNNNNNNNNKF",
  "SKNNNNNNNNNNNNNNNNKF"
)

# Run-length encode each string: a letter repeated n > 1 times becomes
# "(<letter><n>)", while single letters are kept unchanged.
# vapply() pins the result to a character vector, so empty input gives
# character(0) instead of the list() that sapply() would return.
vapply(
  strsplit(x, ""),
  function(mystr) {
    runs <- rle(mystr)
    repeated <- runs$lengths > 1
    paste0(
      # Opening part: "(<letter>" for repeated runs, the bare letter otherwise.
      ifelse(repeated, paste0("(", runs$values), runs$values),
      # Closing part: "<n>)" for repeated runs, nothing otherwise.
      ifelse(repeated, paste0(runs$lengths, ")"), ""),
      collapse = ""
    )
  },
  character(1)
)
#[1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF" "SK(N16)KF"
Here is an option using gsubfn and str_count:
library(stringr)
library(gsubfn)
# For every run of "N", build "(<first letter><character count>)"; the
# formula is turned into a function of the matched text `x` by gsubfn.
gsubfn(
  pattern = "N+",
  replacement = ~ paste0("(", substr(x, 1, 1), str_count(x), ")"),
  x = df1$haplotypes_1
)
#[1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF" "SK(N16)KF"
Or, as @G.Grothendieck suggested, str_count can be replaced with nchar:
# Same replacement using base R only inside the formula: "%0.1s" prints just
# the first character of the match and "%d" its length from nchar().
gsubfn(
  pattern = "N+",
  replacement = ~ sprintf("(%0.1s%d)", x, nchar(x)),
  x = df1$haplotypes_1
)
#[1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF" "SK(N16)KF"
Almost exactly the same as @d.b's answer, but split into a couple of functions so that it's reusable and reader-friendly:
#' Collapse runs of identical letters into a compact label.
#'
#' Each run of length one is kept as the letter itself; each longer run is
#' written as "(<letter><run length>)". All labels are then concatenated.
#'
#' @param type_letters Vector of single letters, in sequence order.
#' @return A single string with the run-length-encoded sequence.
abbreviate_letters <- function(type_letters) {
  encoded <- rle(type_letters)
  run_values <- encoded$values
  run_lengths <- encoded$lengths

  # Start from the bare letters and overwrite only the repeated runs.
  is_repeat <- run_lengths != 1
  labels <- run_values
  labels[is_repeat] <- paste0(
    "(", run_values[is_repeat], run_lengths[is_repeat], ")"
  )

  paste0(labels, collapse = "")
}
#' Condense haplotype strings by run-length encoding repeated letters.
#'
#' Every string is split into single characters and passed to
#' abbreviate_letters(), so "SKNNNRK" becomes "SK(N3)RK".
#'
#' @param haplotype_long Character vector of haplotype strings.
#' @return Character vector the same length as `haplotype_long`; missing
#'   inputs stay `NA`.
condense_haplotype <- function(haplotype_long) {
  split_terms <- strsplit(haplotype_long, split = "")
  condensed <- vapply(
    X = split_terms,
    FUN = abbreviate_letters,
    FUN.VALUE = character(1)
  )
  # A missing haplotype is split into a single NA, which paste0() would turn
  # into the literal string "NA"; restore the missing value instead.
  condensed[is.na(haplotype_long)] <- NA_character_
  condensed
}
# Example haplotypes with long runs of "N" to be condensed.
haplotypes <- c(
  "SKNNNRNNNNNKNNNNNNNKF",
  "SKNNNNNNNNNKNNNNNNNNKF",
  "SKNNNNNNNNNNNNNNNNKF"
)

# Print the condensed form of each example haplotype.
condense_haplotype(haplotype_long = haplotypes)
# [1] "SK(N3)R(N5)K(N7)KF" "SK(N9)K(N8)KF" "SK(N16)KF"
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With