I'm trying to figure out an efficient way to go about splitting a string like
"111110000011110000111000"
into a vector
[1] "11111" "00000" "1111" "0000" "111" "000"
where "0" and "1" can be any alternating characters.
Try
strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000"
A modification of @rawr's solution with stri_extract_all_regex
library(stringi) stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000" stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" #[10] "000" stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]] #[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" #[8] "00000" "222" "aaa" "bb" "cc" "d" "11" #[15] "D" "aa" "BB"
library(stringi) set.seed(24) x3 <- stri_rand_strings(1, 1e4) akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]] #modified @thelatemail's function to make it bit more general thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, perl=TRUE))[[1]] rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]] ananda <- function() unlist(read.fwf(textConnection(x3), rle(strsplit(x3, "")[[1]])$lengths, colClasses = "character")) Colonel <- function() with(rle(strsplit(x3,'')[[1]]), mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values)) Cryo <- function(){ res_vector=rep(NA_character_,nchar(x3)) res_vector[1]=substr(x3,1,1) counter=1 old_tmp='' for (i in 2:nchar(x3)) { tmp=substr(x3,i,i) if (tmp==old_tmp) { res_vector[counter]=paste0(res_vector[counter],tmp) } else { res_vector[counter+1]=tmp counter=counter+1 } old_tmp=tmp } res_vector[!is.na(res_vector)] } richard <- function(){ cs <- cumsum( rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths ) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } nicola<-function(x) { indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x)) substring(x,indices[-length(indices)]+1,indices[-1]) } richard2 <- function() { cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]]) stri_sub(x3, c(1, head(cs + 1, -1)), cs) } system.time(akrun()) # user system elapsed # 0.003 0.000 0.003 system.time(thelate()) # user system elapsed # 0.272 0.001 0.274 system.time(rawr()) # user system elapsed # 0.397 0.001 0.398 system.time(ananda()) # user system elapsed # 3.744 0.204 3.949 system.time(Colonel()) # user system elapsed # 0.154 0.001 0.154 system.time(Cryo()) # user system elapsed # 0.220 0.005 0.226 system.time(richard()) # user system elapsed # 0.007 0.000 0.006 system.time(nicola(x3)) # user system elapsed # 0.190 0.001 0.191
On a slightly bigger string,
set.seed(24) x3 <- stri_rand_strings(1, 1e6) system.time(akrun()) #user system elapsed #0.166 0.000 0.155 system.time(richard()) # user system elapsed # 0.606 0.000 0.569 system.time(richard2()) # user system elapsed # 0.518 0.000 0.487 system.time(Colonel()) # user system elapsed # 9.631 0.000 9.358 library(microbenchmark) microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative') #Unit: relative # expr min lq mean median uq max neval cld # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581 20 b #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861 20 b # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
NOTE: Tried to run the other methods, but it takes a long time
str1 <- "111110000011110000111000" x1 <- "1111100000222000333300011110000111000" x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
Variation on a theme:
x <- "111110000011110000111000" regmatches(x,gregexpr("1+|0+",x))[[1]] #[1] "11111" "00000" "1111" "0000" "111" "000"
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With