Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Increase efficiency in finding first occurrence of events

I am working with a large time series data.table, 60 *B*illion rows X 50 Columns

For three specific columns, I would like to add a corresponding T/F column indicating, by idCol, the first time each event occurs

In other words, for ColumnA, the new column would be

DT[, flag.ColumnA :=  dateCol==min(dateCol)
   , by=list(idCol, ColumnA)]

HOWEVER: There are often ties for min(dateCol) and the resolution for ties is that only one element be marked TRUE, the remaining FALSE. Which leads to the following approach

## Set key to {idCol, dateCol} so that the first row in each group
##   is the unique element in that group that should be set to TRUE
setkey(DT, idCol, dateCol)
DT[, flag.ColumnA := FALSE]
DT[, { DT[ .I[[1L]], flag.ColumnA := TRUE] }  # braces here are just for easier reading
   , by=list(idCol, ColumnA)]

The problem is that this second method increases run time over 3x, while the first method already takes over an hour per column (on a relatively fast box)

I also considered manually resolving the ties in method 1, but that was slower than the above two methods.

Any suggestions on how to accomplish this task more efficiently? Sample Data below


Sample of Expected Output

DT["ID_01"] [ColumnA %in% c("BT", "CK", "MH")] [order(ColumnA, dateCol)]

    idCol    dateCol ColumnA ColumnB flag.ColumnA.M1 flag.ColumnA.M2
 1: ID_01 2013-06-01      BT     xxx            TRUE            TRUE <~~ M1 is WRONG, M2 is correct
 2: ID_01 2013-06-01      BT     www            TRUE           FALSE <~~ M1 is WRONG, M2 is correct
 3: ID_01 2013-06-01      BT     yyy            TRUE           FALSE <~~ M1 is WRONG, M2 is correct
 4: ID_01 2013-06-22      BT     xxx           FALSE           FALSE
 5: ID_01 2013-11-23      BT     yyy           FALSE           FALSE
 6: ID_01 2013-11-30      BT     zzz           FALSE           FALSE
 7: ID_01 2013-06-15      CK     www            TRUE            TRUE
 8: ID_01 2013-06-15      CK     uuu            TRUE           FALSE
 9: ID_01 2013-06-15      CK     www            TRUE           FALSE
10: ID_01 2013-06-29      CK     zzz           FALSE           FALSE
11: ID_01 2013-10-12      CK     vvv           FALSE           FALSE
12: ID_01 2013-11-02      CK     uuu           FALSE           FALSE
13: ID_01 2013-06-22      MH     uuu            TRUE            TRUE
14: ID_01 2013-06-22      MH     xxx            TRUE           FALSE
15: ID_01 2013-06-22      MH     zzz            TRUE           FALSE
16: ID_01 2013-08-24      MH     ttt           FALSE           FALSE
17: ID_01 2013-09-07      MH     xxx           FALSE           FALSE
18: ID_01 2013-09-14      MH     zzz           FALSE           FALSE
19: ID_01 2013-09-21      MH     vvv           FALSE           FALSE
20: ID_01 2013-11-30      MH     ttt           FALSE           FALSE

Sample Data

# increase N for realistic test    
N <- 2e4  # N should be large, as certain methods will be seemingly fast but wont scale

ids   <- sprintf("ID_%02d", seq(5))
A     <- apply(expand.grid(LETTERS, LETTERS), 1, paste0, collapse="")
B     <- paste0(letters, letters, letters)[20:26]
dates <- seq.Date(as.Date("2013-06-01"), as.Date("2013-12-01"), by=7)

set.seed(1)
DT <- data.table( dateCol=sample(dates, N, TRUE)
                , idCol  =sample(ids,   N, TRUE)
                , ColumnA=sample(A,     N, TRUE)
                , ColumnB=sample(B,     N, TRUE)
                , key="idCol")


{
  cat("\n==========\nMETHOD ONE:\n")
  print(system.time({
       DT[, flag.ColumnA.M1 :=  dateCol==min(dateCol)
          , by=list(idCol, ColumnA)]}))
  cat("\n\n==========\nMETHOD TWO:\n")
  print(system.time({
      setkey(DT, idCol, dateCol)
      DT[, flag.ColumnA.M2 := FALSE]
      DT[, { DT[ .I[[1L]], flag.ColumnA.M2 := TRUE] }  # braces here are just for easier reading
         , by=list(idCol, ColumnA)]}))
}

## For Example, looking at ID_01, at a few select values of ColumnA: 
DT["ID_01"] [ColumnA %in% c("BT", "CK", "MH")] [order(ColumnA, dateCol)]
like image 578
Ricardo Saporta Avatar asked Nov 19 '13 22:11

Ricardo Saporta


2 Answers

Just use which.min to resolve the ties:

DT[, flag := FALSE]
DT[DT[, .I[which.min(dateCol)], by = list(idCol, ColumnA)]$V1, flag := TRUE]

For your small data sample this is instantaneous for me, as in unmeasurable by system.time, and it's 1.5x faster at N=1e7 than your method 1. I didn't test larger N's.

like image 97
eddi Avatar answered Oct 14 '22 05:10

eddi


I would use set instead of := within [.data.table. When set can create columns, this will be faster again.

something like (keying by id and date to ensure that ordering is correct)

system.time({
set.seed(1)
DT <- data.table( dateCol=sample(dates, N, TRUE)
                  , idCol  =sample(ids,   N, TRUE)
                  , ColumnA=sample(A,     N, TRUE)
                  , ColumnB=sample(B,     N, TRUE)
                  , key=c("idCol", "dateCol"))
ll <- lapply(c('ColumnA','ColumnB'), function(cc) DT[,.I[1],by = c('idCol',cc)][['V1']])

flags <- c('flagA','flagB')
DT[, (flags) := FALSE]

jflag <- match(flags, names(DT), nomatch=0)

for(jj in seq_along(jflag)){
  set(DT, i = ll[[jj]], j = jflag[jj], value = TRUE)

}
})


# See this is lightening fast (even incorporating the creation of the data.table)
##   user  system elapsed 
##   0.02    0.00    0.02 




DT["ID_01"] [ColumnA %in% c("BT", "CK", "MH")] [order(ColumnA, dateCol)]
    idCol    dateCol ColumnA ColumnB flagA flagB
 1: ID_01 2013-06-01      BT     xxx  TRUE  TRUE
 2: ID_01 2013-06-01      BT     www FALSE FALSE
 3: ID_01 2013-06-01      BT     yyy FALSE FALSE
 4: ID_01 2013-06-22      BT     xxx FALSE FALSE
 5: ID_01 2013-11-23      BT     yyy FALSE FALSE
 6: ID_01 2013-11-30      BT     zzz FALSE FALSE
 7: ID_01 2013-06-15      CK     www  TRUE FALSE
 8: ID_01 2013-06-15      CK     uuu FALSE FALSE
 9: ID_01 2013-06-15      CK     www FALSE FALSE
10: ID_01 2013-06-29      CK     zzz FALSE FALSE
11: ID_01 2013-10-12      CK     vvv FALSE FALSE
12: ID_01 2013-11-02      CK     uuu FALSE FALSE
13: ID_01 2013-06-22      MH     uuu  TRUE FALSE
14: ID_01 2013-06-22      MH     xxx FALSE FALSE
15: ID_01 2013-06-22      MH     zzz FALSE FALSE
16: ID_01 2013-08-24      MH     ttt FALSE FALSE
17: ID_01 2013-09-07      MH     xxx FALSE FALSE
18: ID_01 2013-09-14      MH     zzz FALSE FALSE
19: ID_01 2013-09-21      MH     vvv FALSE FALSE
20: ID_01 2013-11-30      MH     ttt FALSE FALSE

Other possibilities would include doing everything in a single sweep through

for example (this would be faster if you knew the column number of the added column each time (which you should in your situation, there would be no need to match the flag columns to the names of the data.table hten)

lapply(c('A','B'), function(LL){
    cn <- sprintf('Column%s',LL)
    fl <- sprintf('flag%s',LL)
    DT[, (fl) :=FALSE]
    is <- DT[,.I[1],by =c('idCol',cn)][['V1']]
    jm <- match(fl, names(DT), nomatch=0)
    set(DT, i=is, j=jm, value=TRUE)
    invisible()
  })

or

for(ff in seq_along(flags)){

  is <- DT2[,.I[1], by =c('idCol',cols[1])][['V1']]
  set(DT2, i = is, j = jflag[ff], value = TRUE)
}
like image 32
mnel Avatar answered Oct 14 '22 04:10

mnel