I have data that looks like this (note dates are in DD-MM-YYYY format):
ID date drug score
A 28/08/2016 2 3
A 29/08/2016 1 4
A 30/08/2016 2 4
A 2/09/2016 2 4
A 3/09/2016 1 4
A 4/09/2016 2 4
B 8/08/2016 1 3
B 9/08/2016 2 4
B 10/08/2016 2 3
B 11/08/2016 1 3
C 30/11/2016 2 4
C 2/12/2016 1 5
C 3/12/2016 2 1
C 5/12/2016 1 4
C 6/12/2016 2 4
C 8/12/2016 1 2
C 9/12/2016 1 2
For 'drug': 1=drug taken, 2=no drug taken.
I need to summarise for each ID:
If a drug was taken 2 days in a row (eg the last 2 rows of the example) then these scores should not be counted in the -1day or +1day calculations (i.e., each of the last two rows would contribute to the 0day score but would not contribute to the other metrics).
So for this example data, I would need an output table like this:
-1day 0day +1day
A 3.5 4 4
B 3 3 4
C 3.25 2.5
Note that there is not a record for all dates and that the -1day and +1day calculations need to be based on the actual dates and not just the records in the dataset.
I have no idea how to do this.
I also have two additional bonus questions:
I will most likely also need to calculate -2day and +2day scores, so need to be able to adapt an answer to do that.
How could I calculate a NoDrug score, which is the mean of all days that are not within 5 days of a drug taking day.
Here is code to generate a dataframe with this example data:
data<-data.frame(ID=c("A","A","A","A","A","A","B","B","B","B","C","C","C","C","C","C","C"),
date=as.Date(c("28/08/2016","29/08/2016","30/08/2016","2/09/2016","3/09/2016","4/09/2016","8/08/2016","9/08/2016","10/08/2016","11/08/2016","30/11/2016","2/12/2016","3/12/2016","5/12/2016","6/12/2016","8/12/2016","9/12/2016"),format= "%d/%m/%Y"),
drug=c(2,1,2,2,1,2,1,2,2,1,2,1,2,1,2,1,1),
score=c(3,4,4,4,4,4,3,4,3,3,4,5,1,4,4,2,2))
You can use dplyr to get this:
df <- data.frame(
ID=c("A","A","A","A","A","A","B","B","B","B","C","C","C","C","C","C","C"),
date=as.Date(c("28/08/2016","29/08/2016","30/08/2016","2/09/2016","3/09/2016","4/09/2016","8/08/2016","9/08/2016","10/08/2016","11/08/2016","30/11/2016","2/12/2016","3/12/2016","5/12/2016","6/12/2016","8/12/2016","9/12/2016"),format= "%d/%m/%Y"),
drug=c(2,1,2,2,1,2,1,2,2,1,2,1,2,1,2,1,1),
score=c(3,4,4,4,4,4,3,4,3,3,4,5,1,4,4,2,2)
)
df
#> ID date drug score
#> 1 A 2016-08-28 2 3
#> 2 A 2016-08-29 1 4
#> 3 A 2016-08-30 2 4
#> 4 A 2016-09-02 2 4
#> 5 A 2016-09-03 1 4
#> 6 A 2016-09-04 2 4
#> 7 B 2016-08-08 1 3
#> 8 B 2016-08-09 2 4
#> 9 B 2016-08-10 2 3
#> 10 B 2016-08-11 1 3
#> 11 C 2016-11-30 2 4
#> 12 C 2016-12-02 1 5
#> 13 C 2016-12-03 2 1
#> 14 C 2016-12-05 1 4
#> 15 C 2016-12-06 2 4
#> 16 C 2016-12-08 1 2
#> 17 C 2016-12-09 1 2
A nice way to solve these sorts of problems, making rows implicitly missing observations explicitly missing, is to use tidyr::complete
library(dplyr)
library(tidyr)
df1 <- df %>%
group_by(ID) %>%
complete(date = seq(min(date), max(date), by = "day"))
df1
#> Source: local data frame [22 x 4]
#> Groups: ID [3]
#>
#> # A tibble: 22 x 4
#> ID date drug score
#> <fctr> <date> <dbl> <dbl>
#> 1 A 2016-08-28 2 3
#> 2 A 2016-08-29 1 4
#> 3 A 2016-08-30 2 4
#> 4 A 2016-08-31 NA NA
#> 5 A 2016-09-01 NA NA
#> 6 A 2016-09-02 2 4
#> 7 A 2016-09-03 1 4
#> 8 A 2016-09-04 2 4
#> 9 B 2016-08-08 1 3
#> 10 B 2016-08-09 2 4
#> # ... with 12 more rows
df2 <- df1 %>%
group_by(ID) %>%
mutate(day_of = drug == 1,
day_before = (lead(drug) == 1 & day_of == FALSE),
day_after = (lag(drug) == 1 & day_of == FALSE))
df2
#> Source: local data frame [22 x 7]
#> Groups: ID [3]
#>
#> # A tibble: 22 x 7
#> ID date drug score day_of day_before day_after
#> <fctr> <date> <dbl> <dbl> <lgl> <lgl> <lgl>
#> 1 A 2016-08-28 2 3 FALSE TRUE NA
#> 2 A 2016-08-29 1 4 TRUE FALSE FALSE
#> 3 A 2016-08-30 2 4 FALSE NA TRUE
#> 4 A 2016-08-31 NA NA NA NA FALSE
#> 5 A 2016-09-01 NA NA NA FALSE NA
#> 6 A 2016-09-02 2 4 FALSE TRUE NA
#> 7 A 2016-09-03 1 4 TRUE FALSE FALSE
#> 8 A 2016-09-04 2 4 FALSE NA TRUE
#> 9 B 2016-08-08 1 3 TRUE FALSE FALSE
#> 10 B 2016-08-09 2 4 FALSE FALSE TRUE
#> # ... with 12 more rows
dplyr::mutate_at applies a function (in funs()) to all the columns selected in vars(). summarise_at operates the same way in terms of operating on a some selected columns, but instead of changing the values of the full dataset it reduces it done to one row per group. Can can read more about mmutate, summarise, and the special *_at versions.
df3 <- df2 %>%
mutate_at(vars(starts_with("day_")), funs(if_else(. == TRUE, score, NA_real_))) %>%
summarise_at(vars(starts_with("day_")), mean, na.rm = TRUE)
df3
#> # A tibble: 3 x 4
#> ID day_of day_before day_after
#> <fctr> <dbl> <dbl> <dbl>
#> 1 A 4.00 3.5 4.0
#> 2 B 3.00 3.0 4.0
#> 3 C 3.25 NaN 2.5
Here is a possibility using dplyr and its lead and lagfunctions:
library(tidyverse)
data %>% group_by(ID) %>%
arrange(date) %>%
mutate(
# use ifelse for cases of drugs being take twice or more in a row
`-1 day` = ifelse(dplyr::lag(drug) != 1, dplyr::lag(score, 1), NA),
`+1 day` = ifelse(dplyr::lead(drug) != 1, dplyr::lead(score, 1), NA)
) %>%
filter(drug == 1) %>%
summarise_all(mean, na.rm = TRUE) %>%
select(
`-1 day`,
`0 day` = score,
`+1 day`,
-date,
-drug
)
# A tibble: 3 × 3
`-1 day` `0 day` `+1 day`
<dbl> <dbl> <dbl>
1 3.5 4.00 4.0
2 3.0 3.00 4.0
3 3.0 3.25 2.5
I prefer to use time series packages (like zoo) for such tasks.
library(zoo)
#function that handles conversion to zoo time series
my_zoo=function(x,idx) {
date_range=seq(min(idx),max(idx),by="day")
#add missing dates
dummy_zoo=merge(zoo(x,idx),zoo(NA,date_range),all=TRUE)[,1]
#add NA entry at top/bottom
rbind(dummy_zoo,rbind(zoo(NA,max(idx)+1),zoo(NA,min(idx)-1)))
}
#split by ID, handle cases where drug is NA
split_data=lapply(split(data,df$ID),function(x) {
list(score=my_zoo(x$score,x$date),
taken=(my_zoo(x$drug,x$date)==1)&
!is.na(my_zoo(x$drug,x$date)))})
#calculate stats
#your requirement that subsequent days with drug taken...
#... are completely omitted is a bit tricky to handle
res=data.frame(
mean_m1=sapply(split_data,function(x) {
mean(x$score[diff(x$taken,-1)>0&
lag(diff(x$taken),+1)],
na.rm=TRUE)}),
mean_0=sapply(split_data,function(x) {
mean(x$score[x$taken],
na.rm=TRUE)}),
mean_p1=sapply(split_data,function(x) {
mean(x$score[diff(x$taken,+1)<0&
lag(diff(x$taken),-1)],
na.rm=TRUE)}))
res
# mean_m1 mean_0 mean_p1
# A 3.5 4.00 4.0
# B 3.0 3.00 4.0
# C NaN 3.25 2.5
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With