I would like to fill in row NAs in a data.table, using 'locf', but to treat every row separately. I can't seem to get a result from the following:
library(data.table)  # library() errors if the package is missing; require() only warns
set.seed(456)
# Some dummy data: 6 rows, 11 integer columns drawn from 1:4.
dt <- data.table(a = sample(1:4, 6, replace = TRUE), b = sample(1:4, 6, replace = TRUE), c = sample(1:4, 6, replace = TRUE),
d = sample(1:4, 6, replace = TRUE), e = sample(1:4, 6, replace = TRUE), f = sample(1:4, 6, replace = TRUE),
g = sample(1:4, 6, replace = TRUE), h = sample(1:4, 6, replace = TRUE), i = sample(1:4, 6, replace = TRUE),
j = sample(1:4, 6, replace = TRUE), xx = sample(1:4, 6, replace = TRUE))
# Punch a few holes to fill.
dt[4, c:=NA]
dt[1, g:=NA]
dt[1, h:=NA]
# set colnames: fill every column except 'xx'
cols <- setdiff(names(dt),"xx")
# use nafill over rows -- this is the question's bug: nafill() fills *within*
# each column, and grouping by seq_len(nrow(dt)) makes every group a single
# row, so there is never a prior value to carry forward; dt is unchanged.
dt[, (cols) := nafill(.SD, type="locf"), seq_len(nrow(dt)), .SDcols = cols]
The result is no different from the original table — what have I missed?
a b c d e f g h i j xx
1: 1 3 3 2 3 1 NA NA 4 3 1
2: 1 1 2 2 1 2 2 1 2 4 1
3: 3 2 3 1 1 4 3 3 2 1 2
4: 2 3 NA 1 2 2 1 4 3 4 2
5: 1 2 3 4 4 3 2 2 2 4 3
6: 4 1 4 2 1 4 4 3 3 4 3
(N.B. actual data is 12 million rows, if this has any bearing on performance)
One good method uses a for loop. It's not row-by-row: it operates on "all rows with an NA in column 'X'" at one time, once for each column in cols.
# Walk the columns left-to-right; for each column, rows where it is NA take
# the value already present (or previously filled) in the column to its left.
# Each iteration is one vectorized, subsetted assignment over all such rows.
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]  # column immediately to the left (already processed)
thiscol <- cols[i]    # column currently being filled
dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
dt
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 3 3 2 3 1 1 1 4 3 1
# 2: 1 1 2 2 1 2 2 1 2 4 1
# 3: 3 2 3 1 1 4 3 3 2 1 2
# 4: 2 3 3 1 2 2 1 4 3 4 2
# 5: 1 2 3 4 4 3 2 2 2 4 3
# 6: 4 1 4 2 1 4 4 3 3 4 3
Admittedly the use of get(.)
is not perfect, but I think it'll generally be okay.
Another method, about as fast (depending on the size of data):
# Same fill in one expression: Reduce() with accumulate=TRUE sweeps across the
# columns of .SD, coalescing each column's NAs with the (already-filled) result
# for the column to its left, and assigns all filled columns back at once.
dt[, (cols) := Reduce(function(prev,this) fcoalesce(this, prev), .SD, accumulate = TRUE), .SDcols = cols]
# same results
Benchmarking, since you said that with 12 million rows, performance is important. I'll go with 2M rows here and update the method for randomizing the NAs.
library(data.table)
set.seed(456)
# 2M rows for the benchmark (use 6e5 to reproduce the smaller comparison below)
n <- 2e6 # 6e5
dt <- data.table(a = sample(1:4,n, replace=T), b = sample(1:4,n, replace=T), c = sample(1:4,n, replace=T), d = sample(1:4,n, replace=T), e = sample(1:4,n, replace=T), f = sample(1:4,n, replace=T), g = sample(1:4,n, replace=T), h = sample(1:4,n, replace=T), i = sample(1:4,n, replace=T), j = sample(1:4,n, replace=T), xx = sample(1:4,n, replace=T))
# Random (row, column) index pairs -- roughly n*11/20 positions to blank out.
mtx <- cbind(sample(nrow(dt), ceiling(n*11/20), replace=TRUE), sample(ncol(dt), ceiling(n*11/20), replace=TRUE))
mtx <- mtx[!duplicated(mtx),]  # drop repeated positions so each cell is hit once
# Matrix-indexed assignment punches the NA holes scattered across the table.
dt[mtx] <- NA
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 NA 1
# 4: 2 1 4 1 2 3 NA 4 4 4 3
# 5: 1 2 3 3 4 3 3 NA 1 4 1
# 6: 4 3 4 2 2 NA 4 1 2 4 2
Unfortunately, the transpose method fails:
# Transpose approach: flip so original rows become columns, fill down, flip
# back. At 2M rows the transposed table has 2M columns and the attempt runs
# out of memory on this machine.
system.time({
dt2 = transpose(dt)
setnafill(dt2, type = 'locf')  # fills within each column, i.e. along original rows
dt2 = transpose(dt2)
setnames(dt2, names(dt))       # transpose() loses names; restore them
})
# Error: cannot allocate vector of size 30.6 Gb
but the for loop (and Reduce, incidentally) works fine:
# Columns to fill: everything except 'xx'. (The original had setdiff(names(dt), "N");
# "N" is not a column, so 'xx' was unintentionally included in the fill.)
cols <- setdiff(names(dt), "xx")
# Time the column-sweep for loop on the 2M-row table.
system.time({
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]  # column to the left, already filled
thiscol <- cols[i]    # column being filled
dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
})
# user system elapsed
# 0.14 0.00 0.11
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 2 3 2 1 2 3 3 2 2
# 2: 1 3 4 1 4 4 3 2 4 3 3
# 3: 3 4 2 2 3 4 2 2 1 1 1
# 4: 2 1 4 1 2 3 3 4 4 4 3
# 5: 1 2 3 3 4 3 3 3 1 4 1
# 6: 4 3 4 2 2 2 4 1 2 4 2
If I simplify the problem-set to 600K rows, then I can get both to work. (I don't know the tipover point for my system ... it might be 1M, who knows, I just wanted to compare them side-by-side.) With n <- 6e5
and generating dt
, I see the following data and simple timing:
# With n <- 6e5 the transpose approach fits in memory; compare it directly.
head(dt)
# a b c d e f g h i j xx
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1: 1 2 3 1 3 4 NA 3 3 3 3
# 2: 1 3 2 2 4 3 1 2 2 4 1
# 3: 3 4 2 1 1 1 1 4 2 4 2
# 4: 2 4 1 NA 1 4 3 1 4 1 1
# 5: 1 NA 4 2 NA NA 4 4 2 2 NA
# 6: 4 1 4 4 1 2 3 3 1 1 2
# Count the holes before filling, for comparison afterwards.
sum(is.na(dt))
# [1] 321782
system.time({
dt2 = transpose(dt)
setnafill(dt2, type = 'locf')  # locf down the transposed columns = along original rows
dt2 = transpose(dt2)
setnames(dt2, names(dt))
})
# user system elapsed
# 4.27 4.50 7.74
sum(is.na(dt)) # 'dt' is unchanged, only important here to compare the 'for' loop
# [1] 321782
sum(is.na(dt2)) # rows with leading columns having 'NA', nothing to coalesce, not surprising
# [1] 30738
# Columns to fill: everything except 'xx'. (The original had setdiff(names(dt), "N");
# "N" is not a column, so 'xx' was unintentionally included in the fill.)
cols <- setdiff(names(dt), "xx")
# Same column-sweep for loop, timed on the 600K-row table.
system.time({
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]  # column to the left, already filled
thiscol <- cols[i]    # column being filled
dt[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
})
# user system elapsed
# 0.10 0.03 0.06
# The loop produces exactly the transpose result.
identical(dt, dt2)
# [1] TRUE
### regenerate `dt` so it has `NA`s again
# Reduce() variant, timed on the regenerated table.
system.time({
dt[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
})
# user system elapsed
# 0.03 0.00 0.03
identical(dt, dt2)
# [1] TRUE
A more robust benchmark such as bench::mark
is going to be encumbered a little by the need to copy(dt)
every pass. Though this overhead is not huge,
# Measure the cost of copy(dt) alone, since the benchmark below must copy
# the table on every pass to restore the NAs.
bench::mark(copy(dt))
# # A tibble: 1 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 copy(dt) 7.77ms 20.9ms 45.1 25.2MB 0 23 0 510ms <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [14 x 3]> <bch:tm [23]> <tibble [23 x 3]>
it is still extra. As such, I'll compare the transpose
code twice, once with and once without, in order to better compare it to the for
and reduce
answers more honestly. (Note that bench::mark
's default action is to verify that all outputs are identical
. This can be disabled, but I have not done that, so all code blocks return the same results.)
# Four-way comparison. transpose1 has no copy() (it never mutates dt);
# transpose2 adds a copy() so its overhead matches forloop/reduce, which must
# copy because := mutates in place. bench::mark verifies all four outputs
# are identical.
bench::mark(
transpose1 = {
dt2 = transpose(dt)
setnafill(dt2, type = 'locf')
dt2 = transpose(dt2)
setnames(dt2, names(dt))
dt2
},
transpose2 = {
dt0 = copy(dt)
dt2 = transpose(dt0)
setnafill(dt2, type = 'locf')
dt2 = transpose(dt2)
setnames(dt2, names(dt0))
dt2
},
forloop = {
dt0 <- copy(dt)
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]
thiscol <- cols[i]
dt0[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
dt0
},
reduce = {
dt0 <- copy(dt)
dt0[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
},
min_iterations = 10)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 transpose1 4.94s 5.48s 0.154 1.28GB 0.201 10 13 1.08m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [33,008 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 2 transpose2 5.85s 6.29s 0.130 1.3GB 0.259 10 20 1.29m <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [15,316 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 3 forloop 48.37ms 130.91ms 2.87 71.14MB 0 10 0 3.49s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [191 x 3]> <bch:tm [10]> <tibble [10 x 3]>
# 4 reduce 48.08ms 75.82ms 4.70 71MB 0.470 10 1 2.13s <data.table[,11] [600,000 x 11]> <Rprofmem[,3] [38 x 3]> <bch:tm [10]> <tibble [10 x 3]>
From this:
(Edited to increase the benchmark's minimum iterations to 10.)
Another way to solve this problem would be to use the set
function. This solution is both fast and very memory efficient. I also compared it with the @r2evans forloop
and Reduce
cases on a 12M rows data.table.
I also considered a modified version of the forloop case from @r2evans's answer (forloop1 below). The new version simply removes the expression in the data.table i argument (is.na(get(thiscol))). This change improves both memory usage and performance compared to the original.
library(data.table)
# Preview of the set()-based fill (benchmarked as 'set' below): overwrite
# column 'cl' in place with itself coalesced against the column to its left.
# (The original line mixed two tables -- set(dt, ...) reading from dt3, which
# is not defined at this point; use one table consistently.)
for (cl in seq_along(cols)[-1L]) set(dt, j = cl, value = fcoalesce(dt[[cl]], dt[[cl - 1L]]))
n <- 12e6
set.seed(0123456789)
# 7 integer columns; sampling from c(1:4, NA) with weights (5:1)/15 gives NA
# a 1/15 chance per cell, scattering the holes through the table.
d <- setDT(replicate(7, sample(c(1:4, NA), n, TRUE, (5:1)/15), simplify=FALSE))
setnames(d, c(letters[1:6], "xx"))
cols <- setdiff(names(d),"xx")
# Independent copies so each benchmarked expression mutates its own table.
dt0 <- copy(d)
dt1 <- copy(d)
dt2 <- copy(d)
dt3 <- copy(d)
# Compare: forloop without the i-subset, the original forloop, Reduce, and
# the set()-based fill, on the 12M-row table.
bench::mark(
# modified version
forloop1 = {
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]
thiscol <- cols[i]
# i not specified -- assigns the full coalesced column, skipping the
# is.na() subset pass
dt0[, (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
dt0
},
# original version
forloop2 = {
for (i in seq_along(cols)[-1]) {
prevcol <- cols[i-1]
thiscol <- cols[i]
dt1[is.na(get(thiscol)), (thiscol) := fcoalesce(get(thiscol), get(prevcol)) ]
}
dt1
},
reduce = {
dt2[, (cols) := Reduce(function(prev,this) fcoalesce(this,prev), .SD, accumulate = TRUE), .SDcols = cols]
},
set = {
# set() bypasses [.data.table overhead and modifies columns by reference
for(cl in seq_along(cols)[-1L]) set(dt3, j=cl, value=fcoalesce(dt3[[cl]], dt3[[cl-1L]]))
dt3
},
min_iterations = 5L)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 forloop1 77.1ms 87.9ms 10.9 229MB 2.74 4 1 366ms <data.table [12,000,000 x 7]> <Rprofmem [134 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 2 forloop2 192.8ms 201.3ms 5.01 460MB 3.34 3 2 599ms <data.table [12,000,000 x 7]> <Rprofmem [183 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 3 reduce 114.5ms 130.2ms 7.76 458MB 5.17 3 2 387ms <data.table [12,000,000 x 7]> <Rprofmem [21 x 3]> <bench_tm [5]> <tibble [5 x 3]>
# 4 set 65.6ms 68.5ms 14.5 229MB 9.65 3 2 207ms <data.table [12,000,000 x 7]> <Rprofmem [76 x 3]> <bench_tm [5]> <tibble [5 x 3]>
Using the set
function leads to better performance and memory usage. I personally care more about the median time (as opposed to total_time
).
What about transposing the data.table back and forth?
dt2 = transpose(dt)           # rows become columns
setnafill(dt2, type = 'locf') # fill down each transposed column = along original rows
dt2 = transpose(dt2)          # flip back to the original orientation
setnames(dt2, names(dt))      # transpose() loses names; restore them
NB: As can be seen in @r2evans answer, this solution is considerably slower.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With