using tidyverse; counting after and before change in value, within groups, generating new variables for each unique shift

I am looking for a tidyverse solution that counts occurrences of each run of identical TF values within groups (defined by id) in the data tbl. Whenever TF changes, I want to count both forwards and backwards from that point. The counts should be stored in new variables PM##, one per shift, so that each PM## holds both the positive (after) and negative (before) counts around one unique shift in TF.

This question is similar to a question I previously asked, but here I am specifically looking for a solution using tidyverse tools. Uwe provided an elegant answer to the initial question using data.table here.

If this question violates any SO policies, please let me know and I'll be happy to reopen my initial question or append this to it as a bounty issue.

To illustrate my question with a minimal working example, I have data like this:

# install.packages(c("tidyverse"), dependencies = TRUE)

# Sample input: three groups (id 0, 1 and 7) holding 11, 8 and 11 rows,
# each with a TF flag that is 0, 1 or missing.
tbl <- tibble(
  id = rep(c(0, 1, 7), times = c(11, 8, 11)),
  TF = c(
    NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0,   # id 0
    NA, 0, 0, 0, 1, 1, 1, NA,             # id 1
    NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1      # id 7
  )
)
#> # A tibble: 30 x 2
#>       id    TF
#>    <dbl> <dbl>
#>  1     0    NA
#>  2     0     0
#>  3     0    NA
#>  4     0     0
#>  5     0     0
#>  6     0     1
#>  7     0     1
#>  8     0     1
#>  9     0    NA
#> 10     0     0
#> # ... with 20 more rows

and this is what I am trying to obtain,

# Desired result: the input columns plus one PM## column per shift in TF.
# Within a PM## column, negative values count down to the shift (-1 is the
# last row before it) and positive values count up from it (1 is the first
# row after it); every other row is NA.
dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                     1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
              TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
                     0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
              PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
                       1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
              PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
              PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
              PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
              PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3))

#> # A tibble: 30 x 7
#>       id    TF  PM01  PM02  PM03  PM04  PM05
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1     0    NA    NA    NA    NA    NA    NA
#>  2     0     0    -3    NA    NA    NA    NA
#>  3     0    NA    NA    NA    NA    NA    NA
#>  4     0     0    -2    NA    NA    NA    NA
#>  5     0     0    -1    NA    NA    NA    NA
#>  6     0     1     1    -3    NA    NA    NA
#>  7     0     1     2    -2    NA    NA    NA
#>  8     0     1     3    -1    NA    NA    NA
#>  9     0    NA    NA    NA    NA    NA    NA
#> 10     0     0    NA     1    NA    NA    NA
#> # ... with 20 more rows
Here is another tidyverse approach that uses the dplyr, tidyr and zoo packages (zoo is used for its na.locf function):

Firstly, instead of dropping the NAs in the TF column and joining them back afterwards, as all the other suggested approaches do (including the data.table approach), I wrote a helper function that counts forward within chunks while ignoring NAs:

#' Count forward within runs of equal values, skipping NAs
#'
#' Numbers the non-missing elements of `v` 1, 2, 3, ... and restarts the
#' count at 1 every time the non-missing value differs from the previous
#' non-missing value. Missing elements are left as `NA` and do not
#' interrupt a run.
#'
#' @param v A vector of values, possibly containing `NA`.
#' @return A vector the same length as `v`: the running position inside the
#'   current run for non-missing elements, `NA` elsewhere.
forward_count <- function(v) {
    valid <- !is.na(v)
    valid_v <- v[valid]
    # Lengths of every run except the last; each marks where a restart occurs.
    chunk_size <- head(rle(valid_v)$lengths, -1)
    # Index (within the non-missing values) of the first element of each new run.
    idx <- cumsum(chunk_size) + 1
    # Every step adds 1, except at a run start, where the step is chosen so the
    # cumulative sum drops back to exactly 1.
    ones <- rep(1, length(valid_v))
    ones[idx] <- 1 - chunk_size
    v[valid] <- cumsum(ones)
    v
}

It works as required for the count after the change:

# Random test vector; the printed values below are from one particular draw.
v <- sample(c(NA, 0, 1), 15, replace = TRUE)
v
# [1] NA NA NA  0  1 NA  1 NA  1  1  0  1  0  0  0
forward_count(v)
# [1] NA NA NA  1  1 NA  2 NA  3  4  1  1  1  2  3

The count before the change can be implemented by reversing the vector twice around this exact same function:

# Reverse, count forward, reverse back, then negate to count down to each change.
-rev(forward_count(rev(v)))
# [1] NA NA NA -1 -4 NA -3 NA -2 -1 -1 -1 -3 -2 -1

Now define the headers, count forward column as fd, count backward column as bd using dplyr package:

library(dplyr); library(tidyr); library(zoo);

# Build the PM## columns with dplyr/tidyr/zoo.
#
# df: a data frame with columns `id` (group) and `TF` (flag, may be NA).
# Returns `df` (still grouped by `id`) with `id`, `TF` and one PM## column
# per shift, holding the forward count after the shift and the negated
# backward count before it. Relies on forward_count() for the counting.
tidy_method <- function(df) {
    df %>% 
        group_by(id) %>% 
        mutate(
            # Fill NAs with the last seen value (seeded with a leading 0) so
            # that NAs do not start a new chunk, then number the chunks.
            rle_id = cumsum(diff(na.locf(c(0, TF))) != 0),   # chunk id for constant TF
            PM_fd = if_else(                 # PM count after change headers
                rle_id == head(rle_id, 1), 
                "head", sprintf('PM%02d', rle_id)
            ),
            PM_bd = if_else(                 # shift the header up as before change headers
                rle_id == tail(rle_id, 1), 
                "tail", sprintf('PM%02d', rle_id+1)
            ),
            fd = forward_count(TF),             # after change count
            bd = -rev(forward_count(rev(TF))),  # before change count
            rn = seq_along(id)) %>%             # row number
        gather(key, value, PM_fd, PM_bd) %>%    # align headers with the count
        mutate(count_ = if_else(key == "PM_fd", fd, bd)) %>%
        select(-key) %>% spread(value, count_) %>%    # reshape PM values into column headers
        # matches('PM') keeps only the PM## columns, dropping the placeholder
        # "head"/"tail" columns together with the helper columns.
        select(id, TF, rn, matches('PM')) %>%  # drop no longer needed columns
        arrange(id, rn) %>% select(-rn)
}

Timing compared with the data.table method:

Define the data.table method as:

# data.table implementation used as the benchmark reference.
#
# df: a data frame with columns `id` and `TF`.
# Returns a data.table with `id`, `TF` and one PM## column per shift.
# NOTE: setDT() converts `df` by reference and `rn := .I` adds a row-number
# column to it, so the caller's object is modified as a side effect.
dt_method <- function(df) {
    # Non-missing rows only: run id per group (rl), plus the count up from the
    # start of each run (up) and the negative count down to its end (dn).
    tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
        , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]

    # Pair every row with every shift number V1 of its group; keep `dn` for the
    # run before the shift and `up` for the run after it, spread to wide format,
    # then join back onto `df` so rows with missing TF are restored.
    res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
        rl == V1, PM := dn][rl == V1 + 1L, PM := up][
            , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
                df, on = .(rn, id, TF)][, -"rn"]
    res_dt
}

Data: a medium-sized data set, built by repeating the sample data frame 200 times:

# Stack 200 copies of the sample data frame row-wise to get a larger test set.
# NOTE(review): `df` is not defined in the visible code; presumably it is the
# sample tibble (`tbl`) from the question. Confirm before running.
df_test <- bind_rows(rep(list(df), 200))

# Time both implementations on the same input, 10 runs each.
# NOTE(review): dt_method() calls setDT() on its argument, so df_test is
# presumably converted and given an `rn` column by reference on the first
# call. Verify this does not affect the later calls.
microbenchmark::microbenchmark(dt_method(df_test), tidy_method(df_test), times = 10)
#Unit: milliseconds
#                 expr       min        lq      mean    median        uq       max neval
#   dt_method(df_test) 2321.5852 2439.8393 2490.8583 2456.1118 2557.4423 2834.2399    10
# tidy_method(df_test)  402.3624  412.2838  437.0801  414.5655  418.6564  540.9667    10

Order the data.table method result by id and convert all column data types to numeric; the results from data.table approach and tidyverse are identical:

    # Compare both results as plain data frames after sorting the data.table
    # output by id and coercing every column to numeric.
    identical(
        as.data.frame(dt_method(df_test)[order(id), lapply(.SD, as.numeric)]), 
        as.data.frame(tidy_method(df_test))
    )
# [1] TRUE
