In R, I have the following sample data table:
library(data.table)
type <- c("d1", "d1", "d2", "d3", "d3", "d3", "d4", "d4", "d4", "d4", "d4", "d5", "d5", "d6", "d6")
DT <- data.table(type)
DT[, id := seq(.N), by = .(type)]
Which looks like this:
# Input:
#
type id
1: d1 1
2: d1 2
3: d2 1
4: d3 1
5: d3 2
6: d3 3
7: d4 1
8: d4 2
9: d4 3
10: d4 4
11: d4 5
12: d5 1
13: d5 2
14: d6 1
15: d6 2
I would like to group observations from the type
column into chunks of five by adding a new column that contains a unique ID for each group of five. However, sequences of identical values in the type
column are not to be assigned to different group IDs, which means that a chunk may contain more than five elements. In other words, what I try to achieve is to add a column chunk
with a counter that increases by +1 once five elements have been counted and the last sequence of consecutive identical values from the type
column is completed. The desired output is hence:
# Desired output
type id chunk
1: d1 1 1
2: d1 2 1
3: d2 1 1
4: d3 1 1
5: d3 2 1
6: d3 3 1
7: d4 1 2
8: d4 2 2
9: d4 3 2
10: d4 4 2
11: d4 5 2
12: d5 1 3
13: d5 2 3
14: d6 1 3
15: d6 2 3
Any suggestions and help are welcome, especially vectorised solutions. Thank you very much in advance.
DT[, grp := .GRP, type]
i <- 1
DT[1:5, chunk := i] # set chunk = i for first five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
i <- i + 1
DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
}
DT[, grp := NULL][]
# type id chunk
# 1: d1 1 1
# 2: d1 2 1
# 3: d2 1 1
# 4: d3 1 1
# 5: d3 2 1
# 6: d3 3 1
# 7: d4 1 2
# 8: d4 2 2
# 9: d4 3 2
# 10: d4 4 2
# 11: d4 5 2
# 12: d5 1 3
# 13: d5 2 3
# 14: d6 1 3
# 15: d6 2 3
@Frank posted a simpler solution in the comments
gDT = DT[, .N, by=type][, g := 1L]
s = first(gDT$N)
gg = 1L
for (ii in 1:nrow(gDT)){
if (s >= 5){
s = 0
gg = gg + 1L
gDT[ii:.N, g := gg][]
}
else s = s + gDT$N[ii]
}
DT[gDT, on=.(type), g := i.g]
Adding a Rcpp
method and some timings:
sample data:
library(Rcpp)
library(data.table)
library(microbenchmark)
set.seed(0L)
ntypes <- 1e4L
x <- unlist(mapply(rep, 1:ntypes, sample(1:10, ntypes, replace=TRUE)))
DT <- data.table(type=x)
DT1 <- copy(DT)
functions defn:
system.time(cppFunction(
'NumericVector lumpGrp(NumericVector x) {
int counter = 1, grp = 1;
NumericVector ret(x.size());
ret[0] = grp;
for (int n = 1; n < x.size(); n++) {
if (counter >= 5 && x[n] != x[n-1]) {
counter = 1;
grp += 1;
} else {
counter += 1;
}
ret[n] = grp;
}
return ret;
}'))
mtd0 <- function() {
gDT = DT1[, .N, by=type][, g := 1L]
s = first(gDT$N)
gg = 1L
for (ii in 1:nrow(gDT)){
if (s >= 5){
s = 0
gg = gg + 1L
gDT[ii:.N, g := gg][]
}
else s = s + gDT$N[ii]
}
DT1[gDT, on=.(type), g := i.g]
DT1
}
mtd1 <- function() {
DT[, grp := .GRP, type][]
i <- 1
DT[1:5, chunk := i] # set chunk = i for first five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
i <- i + 1
DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
}
DT[, grp := NULL][]
}
checks:
#identical(mtd0()$g, mtd1()$chunk)
#identical(mtd0()$g, lumpGrp(x))
head(x, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 3 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 7 7 7 7 7 7
#[42] 7 7 7 8 8 8 8 8 8
head(mtd0()$g, 50)
# [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4
#[42] 4 4 4 5 5 5 5 5 5
head(mtd1()$chunk, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6
head(lumpGrp(x), 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6
tail(mtd1()$chunk, 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809 1 1 1 1 1 1
tail(lumpGrp(x), 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809 6810 6810 6810 6810 6810 6810
timing code:
mtd0 <- compiler::cmpfun(mtd0)
mtd1 <- compiler::cmpfun(mtd1)
microbenchmark(mtd0(), mtd1(), lumpGrp(x), times=3L)
timings:
Unit: microseconds
expr min lq mean median uq max neval
mtd0() 2930021.679 2952683.7490 2972758.5293 2975345.819 2994126.954 3012908.090 3
mtd1() 7306.673 7573.0050 7763.6367 7839.337 7992.119 8144.900 3
lumpGrp(x) 431.032 431.3635 476.6073 431.695 499.395 567.095 3
One option would be
DT[, chunk := cumsum(shift(!cumsum(id != shift(id, fill = id[2])) %% 5,
fill = TRUE))]
DT
# type id chunk
# 1: d1 1 1
# 2: d1 2 1
# 3: d2 1 1
# 4: d3 1 1
# 5: d3 2 1
# 6: d3 3 1
# 7: d4 1 2
# 8: d4 2 2
# 9: d4 3 2
#10: d4 4 2
#11: d4 5 2
#12: d5 1 3
#13: d5 2 3
#14: d6 1 3
#15: d6 2 3
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With