Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Replace NA with last non-NA in data.table by using only data.table

Tags:

r

na

data.table

I want to replace NA values with last non-NA values in data.table and using data.table. I have one solution, but it's considerably slower than na.locf:

library(data.table)
library(zoo)
library(microbenchmark)

f1 <- function(x) {
    x[, X := na.locf(X, na.rm = F)]
    x
}

f2 <- function(x) {
    cond <- !is.na(x[, X])
    x[, X := .SD[, X][1L], by = cumsum(cond)]
    x
}

m1 <- data.table(X = rep(c(NA,NA,1,2,NA,NA,NA,6,7,8), 100))
m2 <- data.table(X = rep(c(NA,NA,1,2,NA,NA,NA,6,7,8), 100))

microbenchmark(f1(m1), f2(m2), times = 10)

#Unit: milliseconds
#   expr        min          lq      median          uq         max neval
# f1(m1)   2.648938    2.770792    2.959156    3.894635    6.032533    10
# f2(m2) 994.267610 1916.250440 1926.420436 1941.401077 2008.929024    10

I want to know, why it's so slow and whether a faster solution exists or not.

like image 705
Eldar Agalarov Avatar asked Jun 17 '14 03:06

Eldar Agalarov


2 Answers

Here's a data.table-only solution, but it's slightly slower than na.locf:

m1[, X := X[1], by = cumsum(!is.na(X))]
m1
#       X
#   1: NA
#   2: NA
#   3:  1
#   4:  2
#   5:  2
#  ---   
# 996:  2
# 997:  2
# 998:  6
# 999:  7
#1000:  8

Speed test:

m1 <- data.table(X = rep(c(NA,NA,1,2,NA,NA,NA,6,7,8), 1e6))
f3 = function(x) x[, X := X[1], by = cumsum(!is.na(X))]

system.time(f1(copy(m1)))
# user  system elapsed 
# 3.84    0.58    4.62 
system.time(f3(copy(m1)))
# user  system elapsed 
# 5.56    0.19    6.04 

And here's a perverse way of making it faster, but I think one that makes it considerably less readable:

f4 = function(x) {
  x[, tmp := cumsum(!is.na(X))]
  setattr(x, "sorted", "tmp") # set the key without any checks
  x[x[!is.na(X)], X := i.X][, tmp := NULL]
}

system.time(f4(copy(m1)))
# user  system elapsed 
# 3.32    0.51    4.00 
like image 169
eddi Avatar answered Oct 01 '22 09:10

eddi


As I mentioned in my comment, Rcpp is pretty fast for this. Below I compare the zoo::na.locf approach, @eddi's f3 and f4, and the Rcpp approach posted here by @RomainFrancois.

First, the benchmark results:

microbenchmark(f.zoo(m1), eddi.f3(m2), eddi.f4(m3), f.Rcpp(m4), times = 10)

## Unit: milliseconds
##         expr      min         lq    median        uq       max neval
##    f.zoo(m1) 1297.969 1403.67418 1443.5441 1527.7644 1597.9724    10
##  eddi.f3(m2) 2982.103 2998.48809 3039.6543 3068.9303 3078.3963    10
##  eddi.f4(m3) 1970.650 2017.55740 2061.6599 2074.1497 2099.8892    10
##   f.Rcpp(m4)   95.411   98.44505  107.6925  119.2838  171.7855    10

And the function definitions:

library(data.table)
library(zoo)
library(microbenchmark)
library(Rcpp)

m1 <- m2 <- m3 <- m4 <- 
  data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), 1e6))

f.zoo <- function(x) {
  x[, X := na.locf(X, na.rm = F)]
  x
}

eddi.f3 = function(x) x[, X := X[1], by = cumsum(!is.na(X))]

eddi.f4 = function(x) {
  x[, tmp := cumsum(!is.na(X))]
  setattr(x, "sorted", "tmp")
  x[x[!is.na(X)], X := i.X][, tmp := NULL]
}

# Make the Cpp function available
cppFunction('
NumericVector naLocfCpp(NumericVector x) {
    double *p=x.begin(), *end = x.end() ;
    double v = *p ; p++ ;

    while( p < end ){
        while( p<end && !NumericVector::is_na(*p) ) p++ ;
        v = *(p-1) ;
        while( p<end && NumericVector::is_na(*p) ) {
            *p = v ;
            p++ ;
        }
    }

    return x;
}')

f.Rcpp <- function(x) {
  naLocfCpp(x$X)
  x
}

And all produce identical results:

out1 <- f.zoo(m1)
out2 <- eddi.f3(m2)
out3 <- eddi.f4(m3)
out4 <- f.Rcpp(m4)

all(identical(out1, out2), identical(out1, out3), identical(out1, out4))

## TRUE
like image 21
jbaums Avatar answered Oct 01 '22 08:10

jbaums