I am working on a project where I have few million rows and each contains timestamp. Each row also includes the unique ID of that event. Current condition is that rows with the same event ID can have timestamp difference of 1 minute (there cant be two events with delta timestamp less than 1minute).
What I want to simulate is the situation which would happen if minimal timestamp difference would be 3 minutes.
TIME_STAMP PREV_TIME_STAMP Unique ID
06-27-2021 07:07:22 06-27-2021 06:30:00 1
06-27-2021 07:18:26 06-27-2021 07:07:22 1
06-27-2021 07:20:26 06-27-2021 07:18:26 1
06-27-2021 07:22:26 06-27-2021 07:20:26 1
06-27-2021 07:22:26 06-27-2021 07:22:26 1
06-27-2021 15:18:05 06-27-2021 15:11:00 2
06-27-2021 15:19:05 06-27-2021 15:18:05 2
06-27-2021 12:31:37 06-27-2021 12:30:00 2
06-27-2021 12:35:05 06-27-2021 12:30:00 2
The problem is that I cant only make new column with difference between messages, I do need a loop for this - why? see below:
From the table the situation is following:
So there is need to define a referent timestamp (it is the previous ACCEPTED time), and delta between new time and previous time WHICH IS ACCEPTED has to be 3 minutes or higher.
I hope that I managed to explain it well enough. If not, please answer and I will provide as much as information I can.
Thanks in advance!
EDIT:
df <- data.frame(TIME_STAMP = as.POSIXct(strptime(
c("06-27-2021 07:07:22",
"06-27-2021 07:18:26",
"06-27-2021 07:20:26",
"06-27-2021 07:22:26",
"06-27-2021 07:22:26",
"06-27-2021 15:18:05",
"06-27-2021 15:19:05",
"06-27-2021 12:31:37",
"06-27-2021 12:35:05"), "%m-%d-%Y %H:%M:%S")),
PREV_TIME_STAMP = as.POSIXct(strptime(
c("06-27-2021 06:30:00",
"06-27-2021 07:07:22",
"06-27-2021 07:18:26",
"06-27-2021 07:20:26",
"06-27-2021 07:22:26",
"06-27-2021 15:11:00",
"06-27-2021 15:18:05",
"06-27-2021 12:30:00",
"06-27-2021 12:30:00"), "%m-%d-%Y %H:%M:%S")),
ID = c(1,1,1,1,1,2,2,2,2))
First you should rearrange the data and remove the redundancy of your two time columns:
library(data.table)
DT <- fread(" TIME_STAMP, Unique ID
06-27-2021 06:30:00, 1
06-27-2021 07:07:22, 1
06-27-2021 07:18:26, 1
06-27-2021 07:20:26, 1
06-27-2021 07:22:26, 1
06-27-2021 07:22:26, 1
06-27-2021 15:11:00, 2
06-27-2021 15:18:05, 2
06-27-2021 15:19:05, 2
06-27-2021 12:31:37, 2
06-27-2021 12:35:05, 2")
Then you can do this easily with Rcpp:
library(Rcpp)
cppFunction(
'LogicalVector deleteRow(const NumericVector x) {
const double n = x.size();
double j = 0;
LogicalVector res = LogicalVector(n);
for (double i = 1; i < n; i++) {
if (x(i) - x(j) < 180) {
res[i] = true;
} else {
j = i;
}
}
return res;
}')
DT[, TIME_STAMP := as.POSIXct(TIME_STAMP, format = "%m-%d-%Y %H:%M:%S", tz = "GMT")]
setkey(DT, `Unique ID`, TIME_STAMP) #ensure sorting
DT[, delete := deleteRow(TIME_STAMP), by = `Unique ID`]
# TIME_STAMP Unique ID delete
# 1: 2021-06-27 06:30:00 1 FALSE
# 2: 2021-06-27 07:07:22 1 FALSE
# 3: 2021-06-27 07:18:26 1 FALSE
# 4: 2021-06-27 07:20:26 1 TRUE
# 5: 2021-06-27 07:22:26 1 FALSE
# 6: 2021-06-27 07:22:26 1 TRUE
# 7: 2021-06-27 12:31:37 2 FALSE
# 8: 2021-06-27 12:35:05 2 FALSE
# 9: 2021-06-27 15:11:00 2 FALSE
#10: 2021-06-27 15:18:05 2 FALSE
#11: 2021-06-27 15:19:05 2 TRUE
Assuming that it is just typo in last 2 entries of PREV_TIME_STAMP for ID=2, here is another method using Reduce
using Ronald's dataset.
#sort by TIME_STAMP to make sure older entries come up first
DT[order(TIME_STAMP),
#convert numeric to POSIX
as.POSIXct(
#get a distinct set of timestamp that is greater than 3 minutes
unique(
#use curr if more than 3 mins from prev, else keep the prev value
Reduce(function(x,y) if(as.double(y-x,units="mins") >= 3) y else x,
TIME_STAMP,
accumulate=TRUE)
),
origin="1970-01-01", tz="GMT"),
by=`Unique ID`]
edit: share timings. tl;dr Roland method is way faster
library(data.table)
set.seed(0L)
M <- 2e6
nIDs <- M/1e3
DT <- data.table(
ID=sample(nIDs, M, replace=TRUE),
TIME_STAMP=as.POSIXct(as.numeric(Sys.time())+sample(60*(0:4), M, replace=TRUE), origin="1970-01-01", tz="GMT"))
setorder(DT, ID, TIME_STAMP)
DT2 <- copy(DT)
library(Rcpp)
cppFunction(
'LogicalVector deleteRow(const NumericVector x) {
const double n = x.size();
double j = 0;
LogicalVector res = LogicalVector(n);
for (double i = 1; i < n; i++) {
if (x(i) - x(j) < 180) {
res[i] = true;
} else {
j = i;
}
}
return res;
}')
filter1 <- function(start, width) {
end <- start + width - 1L # closed interval
o <- order(c(start, end))
is_start <- rep(c(TRUE, FALSE), each = length(start))[o]
event <- rep(c(1, -1), each = length(start))[o] # 1 == open, -1 == close
cvg <- cumsum(event) # number of open intervals
must <- (event == 1 & cvg == 1)[is_start]
open <- start[must] # non-overlapping events
close <- end[must] + 1L
might <- findInterval(start, sort(c(open, close))) %% 2 == 0
must | might
}
filter_all <- function(start, width) {
idx <- !logical(length(start))
repeat {
idx0 <- filter1(start[idx], width)
if (sum(idx0) == sum(idx))
break
idx[idx] <- idx0
}
idx
}
basemtd <- function() {
DT[, filter_all(TIME_STAMP, 3), by=ID]
}
rcppmtd <- function() {
DT[, delete := deleteRow(TIME_STAMP), by=ID]
}
dtmtd2 <- function() {
DT2[,
as.POSIXct(
unique(
Reduce(function(x,y) if(as.double(y-x,units="mins") >= 3) y else x,
TIME_STAMP,
accumulate=TRUE)
),
origin="1970-01-01", tz="GMT"),
by=ID]
}
library(microbenchmark)
microbenchmark(basemtd(), rcppmtd(), dtmtd2(), times=3L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval
basemtd() 3579.0786 3601.19295 3608.667733333333 3623.3073 3623.46230000000014 3623.6173 3
rcppmtd() 37.0085 37.53650 39.001500000000 38.0645 39.99800000000000 41.9315 3
dtmtd2() 210238.1842 210901.39020 211303.247133333323 211564.5962 211835.77860000001965 212106.9610 3
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With