Incremental sequences with interruptions

Question

I have a dataset with repeating sequences of TRUE that I would like to label based on some conditions - by id, and by the sequence's incremental value. A FALSE breaks the sequence of TRUEs and the first FALSE that breaks any given sequence of TRUE should be included in that sequence. Consecutive FALSEs in between TRUEs are irrelevant and are labeled 0.

For example:

> test
   id logical sequence
1   1    TRUE        1
2   1    TRUE        1
3   1   FALSE        1
4   1    TRUE        2
5   1    TRUE        2
6   1   FALSE        2
7   1    TRUE        3
8   2    TRUE        1
9   2    TRUE        1
10  2    TRUE        1
11  2   FALSE        1
12  2    TRUE        2
13  2    TRUE        2
14  2    TRUE        2
15  3   FALSE        0
16  3   FALSE        0
17  3   FALSE        0
18  3    TRUE        1
19  3   FALSE        1
20  3    TRUE        2
21  3   FALSE        2
22  3   FALSE        0
23  3   FALSE        0
24  3   FALSE        0
25  3    TRUE        3

And so on. I have considered using rle() which produces

> rle(test$logical)
Run Length Encoding
  lengths: int [1:13] 2 1 2 1 4 1 3 3 1 1 ...
  values : logi [1:13] TRUE FALSE TRUE FALSE TRUE FALSE ...

But I am not sure how to map this back on the data frame. Any suggestions on how to approach this problem?

Here are the sample data:

> dput(test)
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 
2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), logical = c(TRUE, TRUE, 
FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, 
TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, 
FALSE, FALSE, TRUE)), .Names = c("id", "logical"), class = "data.frame", row.names = c(NA, 
-25L))

Jaap · Accepted Answer

A pure data.table solution:

# load the 'data.table'-package & convert 'test' to a data.table with 'setDT'
library(data.table)
setDT(test)

# calculate the new sequence
test[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id
     ][new_seq != 0, new_seq := rleid(new_seq), by = id][]

which gives:

    id logical new_seq
 1:  1    TRUE       1
 2:  1    TRUE       1
 3:  1   FALSE       1
 4:  1    TRUE       2
 5:  1    TRUE       2
 6:  1   FALSE       2
 7:  1    TRUE       3
 8:  2    TRUE       1
 9:  2    TRUE       1
10:  2    TRUE       1
11:  2   FALSE       1
12:  2    TRUE       2
13:  2    TRUE       2
14:  2    TRUE       2
15:  3   FALSE       0
16:  3   FALSE       0
17:  3   FALSE       0
18:  3    TRUE       1
19:  3   FALSE       1
20:  3    TRUE       2
21:  3   FALSE       2
22:  3   FALSE       0
23:  3   FALSE       0
24:  3   FALSE       0
25:  3    TRUE       3

What this does:

rleid(logical) - !logical creates a numeric run length id and substracts 1 for where logical is equal to FALSE
The result of the previous step is then multiplied with the result of !(!logical & !shift(logical, fill = FALSE)), which is a TRUE/FALSE vector for consequtive FALSE values except the first one of a FALSE-sequence.
Finally, we create a new run length id for only the rows where new_seq is not equal to 0 and have your desired result.

A slightly improved alternative (as suggested by @jogo in the comments):

test[, new_seq := (rleid(logical) - !logical) * (logical | shift(logical, fill = FALSE)), by = id
     ][new_seq != 0, new_seq := rleid(new_seq), by = id][]

Andre Elrico · Answer

There is for sure a better implementation of makeSeq function but this works.

This one uses libraries data.table, magrittr and dplyr

Function

makeSeq <- function(x) {
    res  <- ifelse(!x&!lag(x,default = F),T,x) %>% {!.} %>% lag(default=T) %>% cumsum
    IND2F<- ifelse(!x&!lag(x,default = F),T,x) != x
    res[IND2F]  <- 0
    res[!IND2F] <- rleidv(res[!IND2F])
    return(res)
}

data.table solution

setDT(df)[,yourSEQ:=makeSeq(logical),by="id"]
df

tidyverse fans use

df %>% group_by(id) %>% mutate(yourSEQ = makeSeq(logical)) %>% ungroup

Result

> df
    id logical yourSEQ
 1:  1    TRUE       1
 2:  1    TRUE       1
 3:  1   FALSE       1
 4:  1    TRUE       2
 5:  1    TRUE       2
 6:  1   FALSE       2
 7:  1    TRUE       3
 8:  2    TRUE       1
 9:  2    TRUE       1
10:  2    TRUE       1
11:  2   FALSE       1
12:  2    TRUE       2
13:  2    TRUE       2
14:  2    TRUE       2
15:  3   FALSE       0
16:  3   FALSE       0
17:  3   FALSE       0
18:  3    TRUE       1
19:  3   FALSE       1
20:  3    TRUE       2
21:  3   FALSE       2
22:  3   FALSE       0
23:  3   FALSE       0
24:  3   FALSE       0
25:  3    TRUE       3
    id logical yourSEQ

chinsoon12 · Answer

without using rle in dtmtd2 and also some timings:

dplyrmtd0 <- function() {
    test %>%
        group_by(id) %>%
        mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>% 
        mutate(sequence2 = if_else(logical == F & lag(logical) == F, 0L, sum_rle, missing = 0L))
}

setDT(test)    
makeSeq <- function(x) {
    res  <- ifelse(!x&!lag(x,default = F),T,x) %>% {!.} %>% lag(default=T) %>% cumsum
    IND2F<- ifelse(!x&!lag(x,default = F),T,x) != x
    res[IND2F]  <- 0
    res[!IND2F] <- rleidv(res[!IND2F])
    return(res)
}
dt0 <- copy(test)
dtmtd0 <- function() {
    dt0[,yourSEQ:=makeSeq(logical),by="id"]   
}

dt1 <- copy(test)
dtmtd1 <- function() {
    dt1[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id
        ][new_seq != 0, new_seq := rleid(new_seq), by = id][]   
}

dt4 <- copy(test)
dtmtd2 <- function() {
    dt4[, sequence := {
            idx <- cumsum(diff(c(FALSE, logical))==1L)
            mask <- shift(logical, fill=FALSE) | logical
            idx * mask
        }, by=id]
}

microbenchmark(dplyrmtd0(), dtmtd0(), dtmtd1(), dtmtd2(), times=5L)

timings:

Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
 dplyrmtd0() 375.6089 376.7271 433.1885 380.7428 443.8844 588.9791     5
    dtmtd0() 481.5189 487.1245 492.9527 495.6855 500.1588 500.2759     5
    dtmtd1() 146.0376 147.0163 154.7501 152.7157 154.2976 173.6831     5
    dtmtd2() 106.3401 107.7728 112.7580 108.5239 119.4398 121.7131     5

data:

library(data.table)
library(dplyr)
library(microbenchmark)
M <- 1e6
test <- data.frame(id=sample(LETTERS, M, replace=TRUE) ,
    logical=sample(c(TRUE, FALSE), M, replace=TRUE))
test <- test[order(test$id),]

CJ Yetman · Answer

You could use the cumsum for your rle values, then you have to go back and fix the sequential FALSE values.

library(dplyr)

test %>%
  group_by(id) %>%
  mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>% 
  mutate(sequence2 = if_else(logical == F & lag(logical) == F, 0L, sum_rle, missing = 0L)) %>% 
  print(n = 25)

# # A tibble: 25 x 5
# # Groups:   id [3]
#       id logical sequence sum_rle sequence2
#    <int> <lgl>      <int>   <int>     <int>
#  1     1 TRUE           1       1         1
#  2     1 TRUE           1       1         1
#  3     1 FALSE          1       1         1
#  4     1 TRUE           2       2         2
#  5     1 TRUE           2       2         2
#  6     1 FALSE          2       2         2
#  7     1 TRUE           3       3         3
#  8     2 TRUE           1       1         1
#  9     2 TRUE           1       1         1
# 10     2 TRUE           1       1         1
# 11     2 FALSE          1       1         1
# 12     2 TRUE           2       2         2
# 13     2 TRUE           2       2         2
# 14     2 TRUE           2       2         2
# 15     3 FALSE          0       0         0
# 16     3 FALSE          0       0         0
# 17     3 FALSE          0       0         0
# 18     3 TRUE           1       1         1
# 19     3 FALSE          1       1         1
# 20     3 TRUE           2       2         2
# 21     3 FALSE          2       2         2
# 22     3 FALSE          0       2         0
# 23     3 FALSE          0       2         0
# 24     3 FALSE          0       2         0
# 25     3 TRUE           3       3         3

if you prefer a really concise version of the same thing...

library(dplyr)

group_by(test, id) %>%
  mutate(sequence = if_else(!logical & !lag(logical), 0L, 
                            with(rle(logical), rep(cumsum(values), lengths)), 
                            missing = 0L))

Incremental sequences with interruptions

Tags:

r

data.table

dplyr

run-length-encoding

the_darkside

4 Answers

Jaap

Andre Elrico

chinsoon12

CJ Yetman

Recent Activity

Donate For Us

Incremental sequences with interruptions

Tags:

r

data.table

dplyr

run-length-encoding

the_darkside

4 Answers

Jaap

Andre Elrico

chinsoon12

CJ Yetman

Related questions

Recent Activity

Donate For Us