This problem does not seem to have been put out here before.
I want to find the number of subjects that score 1 for 6 consecutive hours. The subjects have not been scored every hour so if an hour is missing the hours are not consecutive and the output for that 6-hour period should be NA. The reason for assigning NA would be that we do not know how the subject has scored on the missing hour. This problem can be used to count consecutive hits, but only count it if a subject has participated.
My dataframe looks like this:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)
I have tried to use the rle function (I am sure its possible) but I cannot get it to condition on both hour and ID. The output would be like this:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
Thank you in advance.
I believe the original data set I gave was too small to make the solutions more generalizable.
I just tried the codes with this dataset and found that this will result in a wrong result.
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)
For the new dataset the output should be:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
Here is an approach in tidyverse with the updated data set:
library(tidyverse)
df %>%
group_by(ID) %>%
expand(hour = seq(min(hour), max(hour))) %>%
left_join(df) %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
filter(!is.na(A)) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2
check requested output:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
all.equal(df, df_out2)
#output
TRUE
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2
Lets check if the result is like requested:
ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)
df1 is requested output
all.equal(df1, df_out2)
#output
TRUE
some benchmarking:
library(microbenchmark)
library(data.table)
akrun <- function(df){
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}
missuse <- function(df){
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six)
}
Mike <- function(df){
ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))})
}
microbenchmark(Mike(df),
akrun(df),
missuse(df))
#output
Unit: microseconds
expr min lq mean median uq max neval
Mike(df) 491.291 575.7115 704.2213 597.7155 629.0295 9578.684 100
akrun(df) 6568.313 6725.5175 7867.4059 6843.5790 7279.2240 69790.755 100
missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935 100
way to go Mike H.!
To get the groupings you can compare the current hour to the lagged hour to see if they are "consecutive" or 1 integer apart and then take the cumsum
of that. Once you have the groupings you can use a simple ave
to get the output you want.
library(data.table)
df$six <- ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))}
)
df
# ID hour A six
#1 1 1 0 NA
#2 1 2 1 NA
#3 1 3 0 NA
#4 1 7 1 0
#5 1 8 1 0
#6 1 9 1 0
#7 1 10 1 0
#8 1 11 1 0
#9 1 12 1 1
#10 1 17 0 NA
#11 1 18 0 NA
#12 1 19 0 NA
#13 2 1 1 0
#14 2 2 1 0
#15 2 3 1 0
#16 2 4 1 0
#17 2 5 1 0
#18 2 6 1 1
#19 2 8 1 NA
#20 2 9 1 NA
#21 2 15 1 NA
If you only want to select a patient at most once, this will select the last time period:
df$six_adj <- ave(df$six, df$ID, df$six, FUN = function(x) {
if(all(x==1)) return(c(rep(0, length(x) - 1), 1))
else return(x)}
)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With