Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Split string based on alternating character in R

Tags:

r

I'm trying to figure out an efficient way to go about splitting a string like

"111110000011110000111000" 

into a vector

[1] "11111" "00000" "1111" "0000" "111" "000" 

where "0" and "1" can be any alternating characters.

like image 991
CodeShaman Avatar asked Apr 01 '15 05:04

CodeShaman


Video Answer


2 Answers

Try

strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]] #[1] "11111" "00000" "1111"  "0000"  "111"   "000"   

Update

A modification of @rawr's solution with stri_extract_all_regex

library(stringi) stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "1111"  "0000"  "111"   "000"     stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]] #[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   #[10] "000"    stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]] #[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   #[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"      #[15] "D"       "aa"      "BB"      

Benchmarks

library(stringi)  set.seed(24) x3 <- stri_rand_strings(1, 1e4)  akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]] #modified @thelatemail's function to make it bit more general thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3,              perl=TRUE))[[1]] rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]] ananda <- function() unlist(read.fwf(textConnection(x3),                  rle(strsplit(x3, "")[[1]])$lengths,                  colClasses = "character")) Colonel <- function() with(rle(strsplit(x3,'')[[1]]),     mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))  Cryo <- function(){    res_vector=rep(NA_character_,nchar(x3))   res_vector[1]=substr(x3,1,1)   counter=1   old_tmp=''     for (i in 2:nchar(x3)) {     tmp=substr(x3,i,i)     if (tmp==old_tmp) {     res_vector[counter]=paste0(res_vector[counter],tmp)     } else {     res_vector[counter+1]=tmp     counter=counter+1     }   old_tmp=tmp    }   res_vector[!is.na(res_vector)]   }    richard <- function(){      cs <- cumsum(      rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths    )    stri_sub(x3, c(1, head(cs + 1, -1)), cs)   }   nicola<-function(x) {    indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))    substring(x,indices[-length(indices)]+1,indices[-1])  }   richard2 <- function() {   cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])   stri_sub(x3, c(1, head(cs + 1, -1)), cs)  }  system.time(akrun()) # user  system elapsed  # 0.003   0.000   0.003   system.time(thelate()) #   user  system elapsed  #  0.272   0.001   0.274   system.time(rawr()) # user  system elapsed  #  0.397   0.001   0.398   system.time(ananda()) #  user  system elapsed  # 3.744   0.204   3.949   system.time(Colonel()) #   user  system elapsed  #  0.154   0.001   0.154   system.time(Cryo()) #  user  system elapsed  # 0.220   0.005   0.226   system.time(richard()) #  user  system elapsed  # 0.007   0.000   0.006   system.time(nicola(x3)) # user  system elapsed  # 0.190   0.001   0.191  

On a slightly bigger string,

set.seed(24) x3 <- stri_rand_strings(1, 1e6)  system.time(akrun()) #user  system elapsed  #0.166   0.000   0.155  system.time(richard()) #  user  system elapsed  # 0.606   0.000   0.569  system.time(richard2()) #  user  system elapsed  # 0.518   0.000   0.487   system.time(Colonel()) #  user  system elapsed  # 9.631   0.000   9.358    library(microbenchmark)  microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')  #Unit: relative  #     expr      min       lq     mean   median       uq      max neval cld  # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b  #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b  # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a  

NOTE: Tried to run the other methods, but it takes a long time

data

str1 <- "111110000011110000111000" x1 <- "1111100000222000333300011110000111000" x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB" 
like image 136
akrun Avatar answered Sep 23 '22 06:09

akrun


Variation on a theme:

x <- "111110000011110000111000" regmatches(x,gregexpr("1+|0+",x))[[1]] #[1] "11111" "00000" "1111"  "0000"  "111"   "000" 
like image 22
thelatemail Avatar answered Sep 22 '22 06:09

thelatemail