Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Run length sequence by time and ID

Tags:

r

This problem does not seem to have been put out here before.

I want to find the number of subjects that score 1 for 6 consecutive hours. The subjects have not been scored every hour so if an hour is missing the hours are not consecutive and the output for that 6-hour period should be NA. The reason for assigning NA would be that we do not know how the subject has scored on the missing hour. This problem can be used to count consecutive hits, but only count it if a subject has participated.

My dataframe looks like this:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)

I have tried to use the rle function (I am sure its possible) but I cannot get it to condition on both hour and ID. The output would be like this:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

Thank you in advance.

I believe the original data set I gave was too small to make the solutions more generalizable.
I just tried the codes with this dataset and found that this will result in a wrong result.

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)

For the new dataset the output should be:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
like image 783
User LG Avatar asked Feb 14 '18 13:02

User LG


2 Answers

Here is an approach in tidyverse with the updated data set:

library(tidyverse)

df %>%
  group_by(ID) %>%
  expand(hour = seq(min(hour), max(hour))) %>%
  left_join(df) %>%
  mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  filter(!is.na(A)) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() ->  df_out2

check requested output:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

all.equal(df, df_out2)
#output
TRUE

The old answer:

df %>%
  mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() ->  df_out2

Lets check if the result is like requested:

ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)

df1 is requested output

all.equal(df1, df_out2)
#output
TRUE   

some benchmarking:

library(microbenchmark)
library(data.table)

akrun <- function(df){
  setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
                               ][,  sixnew :=if(sum(A)>=5)  rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
  i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
  df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
  }

missuse <- function(df){
  df %>%
    mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
    group_by(ID, rle) %>%
    mutate(sum = cumsum(A),
           six = ifelse(rle >= 6 & A == 1, 0, NA),
           six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
    ungroup() %>%
    select(ID, hour, A, six)
}


Mike <- function(df){
  ave(df$A, 
      cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
      FUN = function(x) {
        if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
        else return(rep(NA, length(x)))})
}

microbenchmark(Mike(df),
               akrun(df),
               missuse(df))

#output
Unit: microseconds
        expr       min         lq       mean     median         uq       max neval
    Mike(df)   491.291   575.7115   704.2213   597.7155   629.0295  9578.684   100
   akrun(df)  6568.313  6725.5175  7867.4059  6843.5790  7279.2240 69790.755   100
 missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935   100

way to go Mike H.!

like image 77
missuse Avatar answered Oct 25 '22 18:10

missuse


To get the groupings you can compare the current hour to the lagged hour to see if they are "consecutive" or 1 integer apart and then take the cumsum of that. Once you have the groupings you can use a simple ave to get the output you want.

library(data.table)
df$six <-  ave(df$A, 
               cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
               FUN = function(x) {
                    if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
                    else return(rep(NA, length(x)))}
               )

df
#   ID hour A six
#1   1    1 0  NA
#2   1    2 1  NA
#3   1    3 0  NA
#4   1    7 1   0
#5   1    8 1   0
#6   1    9 1   0
#7   1   10 1   0
#8   1   11 1   0
#9   1   12 1   1
#10  1   17 0  NA
#11  1   18 0  NA
#12  1   19 0  NA
#13  2    1 1   0
#14  2    2 1   0
#15  2    3 1   0
#16  2    4 1   0
#17  2    5 1   0
#18  2    6 1   1
#19  2    8 1  NA
#20  2    9 1  NA
#21  2   15 1  NA

If you only want to select a patient at most once, this will select the last time period:

df$six_adj <- ave(df$six, df$ID, df$six, FUN = function(x) {
                                                if(all(x==1)) return(c(rep(0, length(x) - 1), 1))
                                                else return(x)}
  )
like image 45
Mike H. Avatar answered Oct 25 '22 17:10

Mike H.