Hello I am looking for an efficient way of selecting POSIXct
rows from a data.table
such that the time of day is less than say 12:00:00
(NOTE that millisecond is NOT required, so we can use ITime
for example)
set.seed(1); N = 1e7;
DT = data.table(dts = .POSIXct(1e5*rnorm(N), tz="GMT"))
DT
dts
# 1: 1969-12-31 06:35:54.618925
# 2: 1970-01-01 05:06:04.332422
# ---
# 9999999: 1970-01-03 00:37:00.035565
#10000000: 1969-12-30 08:30:23.624506
One solution (the problem here is that the cast could be costly if N is big)
f <- function(t, st, et) {time <- as.ITime(t); return(time>=as.ITime(st) & time<=as.ITime(et))}
P <- function(t, s) { #geekTrader solution
ep <- .parseISO8601(s)
if(grepl('T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}', s)){
first.time <- as.double(ep$first.time)
last.time <- as.double(ep$last.time)-31449600
SecOfDay <- as.double(t) %% 86400
return(SecOfDay >= first.time & SecOfDay <= last.time )
} else {
return(t >= ep$first.time & t <= ep$last.time)
}
}
Quick look about the perf
system.time(resf <- DT[f(dts,'00:00:00','11:59:59')])
user system elapsed
1.01 0.28 1.29
system.time(resP <- DT[P(dts,'T00:00:00/T11:59:59')])
user system elapsed
0.64 0.13 0.76
identical(resf,resP)
[1] TRUE
P <- function(t, s) {
ep <- .parseISO8601(s)
if(grepl('T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}', s)){
first.time <- as.double(ep$first.time)
last.time <- as.double(ep$last.time)-31449600
SecOfDay <- as.double(t) %% 86400
return(SecOfDay >= first.time & SecOfDay <= last.time )
} else {
return(t >= ep$first.time & t <= ep$last.time)
}
}
F <- function(t, st, et) {
time <- as.ITime(t)
return(time>=as.ITime(st) & time<=as.ITime(et))
}
Sys.setenv(TZ='GMT')
N = 1e7;
set.seed(1);
DT <- data.table(dts = .POSIXct(1e5*rnorm(N), tz="GMT"))
system.time(resP <- DT[P(dts, 'T00:00:00/T12:00:00'), ])
## user system elapsed
## 1.11 0.11 1.22
system.time(resF <- DT[F(dts,'00:00:00','12:00:00')])
## user system elapsed
## 2.22 0.29 2.51
resP
## dts
## 1: 1969-12-31 06:35:54
## 2: 1970-01-01 05:06:04
## 3: 1969-12-31 00:47:17
## 4: 1970-01-01 09:09:10
## 5: 1969-12-31 01:12:33
## ---
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23
resF
## dts
## 1: 1969-12-31 06:35:54
## 2: 1970-01-01 05:06:04
## 3: 1969-12-31 00:47:17
## 4: 1970-01-01 09:09:10
## 5: 1969-12-31 01:12:33
## ---
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23
#Check the correctness
resP[,list(mindts=max(dts)),by=list(as.Date(dts))]
## as.Date mindts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55
resF[,list(mindts=max(dts)),by=list(as.Date(dts))]
## as.Date mindts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55
Now some demo of nice xts
style subsetting
DT[P(dts, '1970')]
## dts
## 1: 1970-01-01 05:06:04
## 2: 1970-01-02 20:18:48
## 3: 1970-01-01 09:09:10
## 4: 1970-01-01 13:32:22
## 5: 1970-01-01 20:30:32
## ---
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
DT[P(dts, '197001')]
## dts
## 1: 1970-01-01 05:06:04
## 2: 1970-01-02 20:18:48
## 3: 1970-01-01 09:09:10
## 4: 1970-01-01 13:32:22
## 5: 1970-01-01 20:30:32
## ---
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
DT[P(dts, '19700102')]
## dts
## 1: 1970-01-02 20:18:48
## 2: 1970-01-02 17:59:38
## 3: 1970-01-02 07:14:53
## 4: 1970-01-02 02:13:03
## 5: 1970-01-02 01:31:37
## ---
##1519426: 1970-01-02 11:25:24
##1519427: 1970-01-02 10:00:21
##1519428: 1970-01-02 05:21:25
##1519429: 1970-01-02 05:11:26
##1519430: 1970-01-02 15:51:12
DT[P(dts, '19700102 00:00:00/19700103 12:00:00')]
## dts
## 1: 1970-01-02 20:18:48
## 2: 1970-01-02 17:59:38
## 3: 1970-01-02 07:14:53
## 4: 1970-01-02 02:13:03
## 5: 1970-01-02 01:31:37
## ---
##1785762: 1970-01-02 05:21:25
##1785763: 1970-01-02 05:11:26
##1785764: 1970-01-02 15:51:12
##1785765: 1970-01-03 01:41:31
##1785766: 1970-01-03 00:37:00
#Check the correctness again
DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), max(dts)]
##[1] "1970-01-03 12:00:00 GMT"
DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), min(dts)]
##[1] "1970-01-02 00:00:00 GMT"
The canonical way of doing this is to convert to POSIXlt
and extract the hour component.
hour(as.POSIXlt(DT$dts, "GMT")) < 12
This seems to be comparable in performance to the other techniques discussed (and is easier to understand).
Here's a way that uses some of the functionality from xts to accomplish what you want. This is not a great solution because xts
objects must be ordered by time, but data.table
objects do not have to be. Also, it may not be terribly fast since there is some redundant work being done by xts
and data.table
. Nonetheless, I thought it might be interesting.
library(data.table)
library(xts)
set.seed(1); N = 1e5;
# I tweaked the following line to make this reproducible in other timezones.
DT = data.table(dts = .POSIXct(1e5*rnorm(N), tz="GMT"))
setkey(DT, dts) # must sort on time first so that the `xts` object we're about
# to create has the same order
DT[, XTS:=xts(rep(NA, .N), dts)] # add a dummy xts object as a column
DT[XTS["T00:00:00/T11:59:59.999999", which=TRUE]][, list(dts)]
dts
1: 1969-12-27 00:28:41
2: 1969-12-27 00:34:00
3: 1969-12-27 03:11:21
4: 1969-12-27 04:20:27
5: 1969-12-28 00:00:21
---
49825: 1970-01-05 08:05:22
49826: 1970-01-05 09:35:32
49827: 1970-01-05 09:49:49
49828: 1970-01-05 09:50:27
49829: 1970-01-05 11:07:32
The above uses an xts
-style subsetting string to get the rows where the time is between 00:00:00 and 12:00:00 for every day. Using which=TRUE
returns the row number instead of the data from that row, so that we can subset the data.table
by those rows.
You could use a string like "1970-01-01" to get all data from that day, or "1970-01" to get all data from January 1970, or "1970-01-01/1970-01-02" to get all rows from those two days.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With