Grouping n or more observations in data table without interrputing sequences of consecutive values

Question

In R, I have the following sample data table:

library(data.table)
type <- c("d1", "d1", "d2", "d3", "d3", "d3", "d4", "d4", "d4", "d4", "d4", "d5", "d5", "d6", "d6")
DT <- data.table(type)
DT[, id := seq(.N), by = .(type)]

Which looks like this:

# Input:
#
    type id
 1:   d1  1
 2:   d1  2
 3:   d2  1
 4:   d3  1
 5:   d3  2
 6:   d3  3
 7:   d4  1
 8:   d4  2
 9:   d4  3
10:   d4  4
11:   d4  5
12:   d5  1
13:   d5  2
14:   d6  1
15:   d6  2

I would like to group observations from the type column into chunks of five by adding a new column that contains a unique ID for each group of five. However, sequences of identical values in the type column are not to be assigned to different group IDs, which means that a chunk may contain more than five elements. In other words, what I try to achieve is to add a column chunk with a counter that increases by +1 once five elements have been counted and the last sequence of consecutive identical values from the type column is completed. The desired output is hence:

# Desired output
    type id chunk
 1:   d1  1     1
 2:   d1  2     1
 3:   d2  1     1
 4:   d3  1     1
 5:   d3  2     1
 6:   d3  3     1
 7:   d4  1     2
 8:   d4  2     2
 9:   d4  3     2
10:   d4  4     2
11:   d4  5     2
12:   d5  1     3
13:   d5  2     3
14:   d6  1     3
15:   d6  2     3

Any suggestions and help are welcome, especially vectorised solutions. Thank you very much in advance.

IceCreamToucan · Accepted Answer

DT[, grp := .GRP, type]


i <- 1
DT[1:5, chunk := i] # set chunk = i for first five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type

while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
  i <- i + 1
  DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
  DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
}
DT[, grp := NULL][]

#     type id chunk
#  1:   d1  1     1
#  2:   d1  2     1
#  3:   d2  1     1
#  4:   d3  1     1
#  5:   d3  2     1
#  6:   d3  3     1
#  7:   d4  1     2
#  8:   d4  2     2
#  9:   d4  3     2
# 10:   d4  4     2
# 11:   d4  5     2
# 12:   d5  1     3
# 13:   d5  2     3
# 14:   d6  1     3
# 15:   d6  2     3

@Frank posted a simpler solution in the comments

gDT = DT[, .N, by=type][, g := 1L]
s = first(gDT$N)
gg = 1L
for (ii in 1:nrow(gDT)){
  if (s >= 5){ 
    s = 0 
    gg = gg + 1L
    gDT[ii:.N, g := gg][]
  } 
  else s = s + gDT$N[ii]
}
DT[gDT, on=.(type), g := i.g]

chinsoon12 · Answer

Adding a Rcpp method and some timings:

sample data:

library(Rcpp)
library(data.table)
library(microbenchmark)

set.seed(0L)
ntypes <- 1e4L
x <- unlist(mapply(rep, 1:ntypes, sample(1:10, ntypes, replace=TRUE)))
DT <- data.table(type=x)
DT1 <- copy(DT)

functions defn:

system.time(cppFunction(
    'NumericVector lumpGrp(NumericVector x) {
        int counter = 1, grp = 1;
        NumericVector ret(x.size());
        ret[0] = grp;
        for (int n = 1; n < x.size(); n++) {
            if (counter >= 5 && x[n] != x[n-1]) {
                counter = 1;
                grp += 1;
            } else {
                counter += 1;
            }
            ret[n] = grp;
        }
        return ret;
    }'))


mtd0 <- function() {
    gDT = DT1[, .N, by=type][, g := 1L]
    s = first(gDT$N)
    gg = 1L
    for (ii in 1:nrow(gDT)){
        if (s >= 5){
            s = 0
            gg = gg + 1L
            gDT[ii:.N, g := gg][]
        }
        else s = s + gDT$N[ii]
    }
    DT1[gDT, on=.(type), g := i.g]
    DT1
}

mtd1 <- function() {
    DT[, grp := .GRP, type][]

    i <- 1
    DT[1:5, chunk := i] # set chunk = i for first five rows
    DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type

    while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
        i <- i + 1
        DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
        DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
    }
    DT[, grp := NULL][]
}

checks:

#identical(mtd0()$g, mtd1()$chunk)
#identical(mtd0()$g, lumpGrp(x))

head(x, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 3 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 7 7 7 7 7 7
#[42] 7 7 7 8 8 8 8 8 8



head(mtd0()$g, 50)
# [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4
#[42] 4 4 4 5 5 5 5 5 5

head(mtd1()$chunk, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6

head(lumpGrp(x), 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6



tail(mtd1()$chunk, 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809    1    1    1    1    1    1

tail(lumpGrp(x), 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809 6810 6810 6810 6810 6810 6810

timing code:

mtd0 <- compiler::cmpfun(mtd0)
mtd1 <- compiler::cmpfun(mtd1)
microbenchmark(mtd0(), mtd1(), lumpGrp(x), times=3L)

timings:

Unit: microseconds
       expr         min           lq         mean      median          uq         max neval
     mtd0() 2930021.679 2952683.7490 2972758.5293 2975345.819 2994126.954 3012908.090     3
     mtd1()    7306.673    7573.0050    7763.6367    7839.337    7992.119    8144.900     3
 lumpGrp(x)     431.032     431.3635     476.6073     431.695     499.395     567.095     3

akrun · Answer

One option would be

DT[, chunk := cumsum(shift(!cumsum(id != shift(id, fill = id[2])) %% 5,
                    fill = TRUE))]
DT
#    type id chunk
# 1:   d1  1     1
# 2:   d1  2     1
# 3:   d2  1     1
# 4:   d3  1     1
# 5:   d3  2     1
# 6:   d3  3     1
# 7:   d4  1     2
# 8:   d4  2     2
# 9:   d4  3     2
#10:   d4  4     2
#11:   d4  5     2
#12:   d5  1     3
#13:   d5  2     3
#14:   d6  1     3
#15:   d6  2     3

Grouping n or more observations in data table without interrputing sequences of consecutive values

Tags:

r

data.table

MichaelU

3 Answers

IceCreamToucan

chinsoon12

akrun

Recent Activity

Donate For Us

Grouping n or more observations in data table without interrputing sequences of consecutive values

Tags:

r

data.table

MichaelU

3 Answers

IceCreamToucan

chinsoon12

akrun

Related questions

Recent Activity

Donate For Us