Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Efficient comparison of POSIXct in data.table

Hello I am looking for an efficient way of selecting POSIXct rows from a data.table such that the time of day is less than say 12:00:00 (NOTE that millisecond is NOT required, so we can use ITime for example)

set.seed(1); N = 1e7;
DT = data.table(dts = .POSIXct(1e5*rnorm(N), tz="GMT"))
DT
                               dts
#       1: 1969-12-31 06:35:54.618925
#       2: 1970-01-01 05:06:04.332422
#     ---                           
# 9999999: 1970-01-03 00:37:00.035565
#10000000: 1969-12-30 08:30:23.624506

One solution (the problem here is that the cast could be costly if N is big)

f <- function(t, st, et) {time <- as.ITime(t); return(time>=as.ITime(st) & time<=as.ITime(et))}
P <- function(t, s) { #geekTrader solution
    ep <- .parseISO8601(s) 
    if(grepl('T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}', s)){
        first.time <- as.double(ep$first.time)
        last.time <- as.double(ep$last.time)-31449600
        SecOfDay <- as.double(t) %% 86400
        return(SecOfDay >= first.time & SecOfDay <= last.time )
    } else {
        return(t >= ep$first.time & t <= ep$last.time)    
    }
}

Quick look about the perf

system.time(resf <- DT[f(dts,'00:00:00','11:59:59')])
   user  system elapsed 
   1.01    0.28    1.29
system.time(resP <- DT[P(dts,'T00:00:00/T11:59:59')])
   user  system elapsed 
   0.64    0.13    0.76 

identical(resf,resP)
[1] TRUE
like image 606
statquant Avatar asked Apr 05 '13 09:04

statquant


3 Answers

 P <- function(t, s) {
  ep <- .parseISO8601(s)

  if(grepl('T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}', s)){
    first.time <- as.double(ep$first.time)
    last.time <- as.double(ep$last.time)-31449600
    SecOfDay <- as.double(t) %% 86400
    return(SecOfDay >= first.time & SecOfDay <= last.time )

  } else {
    return(t >= ep$first.time & t <= ep$last.time)    
  }

}

F <- function(t, st, et) {
  time <- as.ITime(t) 
  return(time>=as.ITime(st) & time<=as.ITime(et))
}


 Sys.setenv(TZ='GMT')
 N = 1e7;
 set.seed(1);

 DT <- data.table(dts = .POSIXct(1e5*rnorm(N), tz="GMT"))


 system.time(resP <- DT[P(dts, 'T00:00:00/T12:00:00'), ])
##   user  system elapsed 
##   1.11    0.11    1.22 
 system.time(resF <- DT[F(dts,'00:00:00','12:00:00')])
##   user  system elapsed 
##   2.22    0.29    2.51 

 resP
##                         dts
##      1: 1969-12-31 06:35:54
##      2: 1970-01-01 05:06:04
##      3: 1969-12-31 00:47:17
##      4: 1970-01-01 09:09:10
##      5: 1969-12-31 01:12:33
##     ---                    
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23
 resF
##                         dts
##      1: 1969-12-31 06:35:54
##      2: 1970-01-01 05:06:04
##      3: 1969-12-31 00:47:17
##      4: 1970-01-01 09:09:10
##      5: 1969-12-31 01:12:33
##     ---                    
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23

 #Check the correctness
 resP[,list(mindts=max(dts)),by=list(as.Date(dts))]
##       as.Date              mindts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55
 resF[,list(mindts=max(dts)),by=list(as.Date(dts))]
##       as.Date              mindts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55

Now some demo of nice xts style subsetting

 DT[P(dts, '1970')]
##                         dts
##      1: 1970-01-01 05:06:04
##      2: 1970-01-02 20:18:48
##      3: 1970-01-01 09:09:10
##      4: 1970-01-01 13:32:22
##      5: 1970-01-01 20:30:32
##     ---                    
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
 DT[P(dts, '197001')]
##                         dts
##      1: 1970-01-01 05:06:04
##      2: 1970-01-02 20:18:48
##      3: 1970-01-01 09:09:10
##      4: 1970-01-01 13:32:22
##      5: 1970-01-01 20:30:32
##     ---                    
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
 DT[P(dts, '19700102')]
##                         dts
##      1: 1970-01-02 20:18:48
##      2: 1970-01-02 17:59:38
##      3: 1970-01-02 07:14:53
##      4: 1970-01-02 02:13:03
##      5: 1970-01-02 01:31:37
##     ---                    
##1519426: 1970-01-02 11:25:24
##1519427: 1970-01-02 10:00:21
##1519428: 1970-01-02 05:21:25
##1519429: 1970-01-02 05:11:26
##1519430: 1970-01-02 15:51:12
 DT[P(dts, '19700102 00:00:00/19700103 12:00:00')]
##                         dts
##      1: 1970-01-02 20:18:48
##      2: 1970-01-02 17:59:38
##      3: 1970-01-02 07:14:53
##      4: 1970-01-02 02:13:03
##      5: 1970-01-02 01:31:37
##     ---                    
##1785762: 1970-01-02 05:21:25
##1785763: 1970-01-02 05:11:26
##1785764: 1970-01-02 15:51:12
##1785765: 1970-01-03 01:41:31
##1785766: 1970-01-03 00:37:00

 #Check the correctness again
 DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), max(dts)]
##[1] "1970-01-03 12:00:00 GMT"
 DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), min(dts)]
##[1] "1970-01-02 00:00:00 GMT"
like image 170
CHP Avatar answered Oct 22 '22 07:10

CHP


The canonical way of doing this is to convert to POSIXlt and extract the hour component.

hour(as.POSIXlt(DT$dts, "GMT")) < 12

This seems to be comparable in performance to the other techniques discussed (and is easier to understand).

like image 4
Richie Cotton Avatar answered Oct 22 '22 07:10

Richie Cotton


Here's a way that uses some of the functionality from xts to accomplish what you want. This is not a great solution because xts objects must be ordered by time, but data.table objects do not have to be. Also, it may not be terribly fast since there is some redundant work being done by xts and data.table. Nonetheless, I thought it might be interesting.

library(data.table)
library(xts)
set.seed(1); N = 1e5;
# I tweaked the following line to make this reproducible in other timezones.
DT = data.table(dts = .POSIXct(1e5*rnorm(N), tz="GMT"))
setkey(DT, dts)  # must sort on time first so that the `xts` object we're about 
                 # to create has the same order
DT[, XTS:=xts(rep(NA, .N), dts)]  # add a dummy xts object as a column
DT[XTS["T00:00:00/T11:59:59.999999", which=TRUE]][, list(dts)] 
                       dts
    1: 1969-12-27 00:28:41
    2: 1969-12-27 00:34:00
    3: 1969-12-27 03:11:21
    4: 1969-12-27 04:20:27
    5: 1969-12-28 00:00:21
   ---                    
49825: 1970-01-05 08:05:22
49826: 1970-01-05 09:35:32
49827: 1970-01-05 09:49:49
49828: 1970-01-05 09:50:27
49829: 1970-01-05 11:07:32

The above uses an xts-style subsetting string to get the rows where the time is between 00:00:00 and 12:00:00 for every day. Using which=TRUE returns the row number instead of the data from that row, so that we can subset the data.table by those rows.

You could use a string like "1970-01-01" to get all data from that day, or "1970-01" to get all data from January 1970, or "1970-01-01/1970-01-02" to get all rows from those two days.

like image 3
GSee Avatar answered Oct 22 '22 07:10

GSee