I have data with a grouping variable "ID" and some values:
ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8
Within each group, I want to append the first row after the last row. Desired result:
ID, Value
1, 1
1, 2
1, 3
1, 4
1, 1 # First row of ID 1 inserted as last row in the group
2, 5
2, 6
2, 7
2, 8
2, 5 # First row of ID 2 inserted as last row in the group
I have 5000 row of this.
And with data.table:
library(data.table)
dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
dt[,.(Value = c(Value, first(Value))), ID]
#> ID Value
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
#> 4: 1 4
#> 5: 1 1
#> 6: 2 5
#> 7: 2 6
#> 8: 2 7
#> 9: 2 8
#> 10: 2 5
Benchmarking with a 5000-row table:
library(dplyr)
dt <- data.table(ID = rep(1:1250, each = 4), Value = 1:5e3)
f1 <- function(dt) dt[,.(Value = c(Value, first(Value))), ID]
# base R
f2 <- function(dt) do.call(rbind, lapply(split(dt, by = "ID"), function(x) rbind(x, x[1,])))
# tidyverse
f3 <- function(dt) {
dt %>%
group_by(ID) %>%
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) %>%
ungroup()
}
f4 <- function(dt) {
dt %>%
group_by(ID) %>%
group_modify(~ add_row(., Value = first(.$Value))) %>%
ungroup()
}
microbenchmark::microbenchmark(data.table = f1(dt),
"base R" = f2(dt),
tidyverse1 = f3(dt),
tidyverse2 = f4(dt),
times = 10)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> data.table 3.4989 3.6844 4.16619 4.3085 4.4623 5.3020 10
#> base R 245.1397 263.1636 283.61131 284.8053 307.7105 310.3901 10
#> tidyverse1 761.7097 773.3705 791.05115 787.9463 808.5416 821.5321 10
#> tidyverse2 711.9593 716.4959 752.20273 728.2170 782.6474 837.1926 10
If speed is really important, this simple Rcpp function provides a very fast solution:
Rcpp::cppFunction(
"IntegerVector firstLast(const IntegerVector& x) {
const int n = x.size();
IntegerVector idxOut(2*n);
int i0 = 1;
int idx = 0;
idxOut(0) = 1;
for (int i = 1; i < n; i++) {
if (x(i) != x(i - 1)) {
idxOut(++idx) = i0;
i0 = i + 1;
}
idxOut(++idx) = i + 1;
}
idxOut(++idx) = i0;
return idxOut[Rcpp::Range(0, idx)];
}"
)
Benchmarking against the fastest solution from this answer (on a much larger dataset):
dt = data.table(ID = rep(1:125e4, each = 4), Value = 1:5e6)
microbenchmark::microbenchmark(
f_uniq_dt = setorder(rbindlist(list(dt, unique(dt, by = "ID"))), ID),
f_Rcpp = dt[firstLast(dt$ID)],
check = "equal"
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f_uniq_dt 78.6056 83.71345 95.42876 85.80720 90.03685 175.8867 100
#> f_Rcpp 49.1485 53.38275 60.96322 55.44925 58.01485 121.3637 100
Within tidyverse, you could use add_row with do (now deprecated) or group_modify (experimental):
dat |>
group_by(ID) |>
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) |>
ungroup()
dat |>
group_by(ID) |>
group_modify(~ add_row(., Value = first(.$Value))) |>
ungroup()
Or bind_rows with summarize (my variation of @Gregor Thomas, thanks):
dat |>
group_by(ID) |>
summarize(bind_rows(cur_data(), head(cur_data(), 1))) |>
ungroup()
Or by applying the same logic of @Henrik using bind_rows, filter, and arrange:
dat |>
bind_rows(dat |> filter(!duplicated(ID))) |>
arrange(ID)
Output:
# A tibble: 10 × 2
ID Value
<int> <dbl>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 1
6 2 5
7 2 6
8 2 7
9 2 8
10 2 5
Thanks to @SamR for the data.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With