I would like to replace up to n consecutive NA values in a vector with the latest non-NA value.
For example, if:
# Example input: each non-NA value is followed by a run of NAs of varying length.
a <- c(1,NA,NA,NA,NA,NA,2,NA,1,NA,NA,NA)
# Maximum number of consecutive NAs that one non-NA value may fill.
n <- 2
I would like to obtain:
c(1,1,1,NA,NA,NA,2,2,1,1,1,NA)
(n is the maximum number of NA values that can be replaced by a given element.)
I know the na.locf() function, but I don't know how to set the limit n. Is it possible to do it?
Here's an option using na.locf and rle:
library(zoo)
# Record the NA / non-NA runs before filling, so we still know afterwards
# which positions were missing and how deep into its run each one sits.
r <- rle(is.na(a))
# na.rm = FALSE keeps leading NAs in place. The default (na.rm = TRUE) drops
# them, which shortens `a` and misaligns it with the mask built from `r`.
a <- na.locf(a, na.rm = FALSE)
# Turn a filled value back into NA when it was originally NA and its position
# inside the NA run is beyond the first n.
is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
a
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
So here I first computed the run lengths of elements in a (including the NA entries), then replaced all NAs using na.locf, and finally turned those elements back to NA where the element was originally NA and its position within its NA run was greater than n.
As another idea, we can find, for each position, the index of the last non-NA element of "a":
# Position of every element of "a".
is <- seq_along(a)
# Set the positions that hold NA to 0, then carry the largest position seen so
# far forward: each entry becomes the index of the latest non-NA element.
i <- cummax(replace(is, is.na(a), 0L))
i
# [1] 1 1 1 1 1 1 7 7 9 9 9 9
Replace the last non-NA index with the current index if the last non-NA is more than "n" steps away:
# Positions that must point at themselves (and so keep their own NA):
#  - those more than n steps past the latest non-NA, and
#  - those with i == 0, i.e. leading NAs with no non-NA before them.
#    A zero index left in place would make a[i] silently drop the element.
wh <- (is - i) > n | i == 0L
i[wh] <- is[wh]
i
# [1] 1 1 1 4 5 6 7 7 9 9 9 12
And subset "a":
# Each index now points either at the latest non-NA value within n steps or at
# the position itself (an NA), so plain indexing yields the capped fill.
a[i]
# [1] 1 1 1 NA NA NA 2 2 1 1 1 NA
You could do this using split and replace in base R:
# Carry each non-NA value of `a` forward over at most `n` of the NAs that
# immediately follow it; NAs beyond that stay NA.
#
# a: vector that may contain NAs.
# n: maximum number of consecutive NAs one value may fill.
# Returns a vector of the same length as `a`.
f <- function(a, n) {
  # unlist() of an empty list is NULL, so return empty input unchanged.
  if (length(a) == 0L) {
    return(a)
  }
  # Start a new group at every non-NA value; each group is then one value
  # followed by its trailing NAs (a leading all-NA group gets id 0).
  groups <- split(a, cumsum(!is.na(a)))
  filled <- lapply(groups, function(r) {
    # Overwrite the first n + 1 elements (the value plus n NAs), but never
    # write past the end of the group.
    k <- max(0, min(length(r), n + 1))
    r[seq_len(k)] <- r[1]
    r
  })
  unlist(filled, use.names = FALSE)
}
# Fill at most two NAs after each non-NA value.
f(a, n = 2)
#[1] 1 1 1 NA NA NA 2 2 1 1 1 NA
# Fill at most three NAs after each non-NA value.
f(a, n = 3)
#[1] 1 1 1 1 NA NA 2 2 1 1 1 1
Benchmarking (randomly generated vector of size 7467)
library(microbenchmark)
library(dplyr)
library(zoo)
# Fix the RNG seed so the generated benchmark vector is reproducible.
set.seed(123)
# 1000 chunks, each made of two values sampled from 1:10 followed by a run of
# 1 to 10 NAs, flattened into a single vector.
a <- unlist(replicate(1000, c(sample(10, 2), rep(NA, sample.int(10, 1)))))
length(a)
# [1] 7467
# Maximum number of consecutive NAs to fill in the benchmark.
n <- 3
# split/replace approach: carry each non-NA value of `a` forward over at most
# `n` of the NAs that immediately follow it.
#
# a: vector that may contain NAs.
# n: maximum number of consecutive NAs one value may fill.
# Returns a vector of the same length as `a`.
f_989 <- function(a, n) {
  # unlist() of an empty list is NULL, so return empty input unchanged.
  if (length(a) == 0L) {
    return(a)
  }
  # Start a new group at every non-NA value; each group is then one value
  # followed by its trailing NAs (a leading all-NA group gets id 0).
  groups <- split(a, cumsum(!is.na(a)))
  filled <- lapply(groups, function(r) {
    # Overwrite the first n + 1 elements (the value plus n NAs), but never
    # write past the end of the group.
    k <- max(0, min(length(r), n + 1))
    r[seq_len(k)] <- r[1]
    r
  })
  unlist(filled, use.names = FALSE)
}
# dplyr approach: group each non-NA value with the NAs that follow it, then
# fill only the first n + 1 rows of every group.
#
# a: vector that may contain NAs.
# n: maximum number of consecutive NAs one value may fill.
# Returns the filled vector (the `res` column).
f_zx8754 <- function(a, n) {
  data.frame(a) %>%
    # A new group starts at every non-NA value; leading NAs form group 0.
    mutate(gr = cumsum(!is.na(a))) %>%
    group_by(gr) %>%
    # na.rm = FALSE keeps the result the same length as the group. With the
    # default (na.rm = TRUE) an all-NA leading group collapses to length 0
    # and if_else() fails on the size mismatch.
    mutate(res = if_else(row_number() <= n + 1, na.locf(a, na.rm = FALSE), a)) %>%
    .$res
}
# rle/na.locf approach: fill every NA, then put NA back wherever the position
# inside an NA run is beyond the first n.
#
# a: vector that may contain NAs.
# n: maximum number of consecutive NAs one value may fill.
# Returns a vector of the same length as `a`.
f_docendo_discimus <- function(a, n) {
  # Record the NA / non-NA runs before they are overwritten by the fill.
  r <- rle(is.na(a))
  # na.rm = FALSE keeps leading NAs; the default drops them, which would
  # shorten `a` and misalign it with the mask built from `r`.
  a <- na.locf(a, na.rm = FALSE)
  # sequence() numbers the positions within each run; rep() marks NA runs.
  is.na(a) <- sequence(r$lengths) > n & rep(r$values, r$lengths)
  a
}
# ave() approach: within each group (a non-NA value plus the NAs after it),
# overwrite the first n + 1 elements with the group's first element.
#
# a: vector that may contain NAs.
# n: maximum number of consecutive NAs one value may fill.
f_akrun <- function(a, n) {
  fill_head <- function(x) {
    # pmin() clamps the target indices so they never exceed the group length.
    idx <- pmin(length(x), seq(n + 1))
    x[idx] <- x[1]
    x
  }
  group_id <- cumsum(!is.na(a))
  ave(a, group_id, FUN = fill_head)
}
# Index approach: for every position, find the index of the latest non-NA
# value; positions too far from it point at themselves instead.
#
# a: vector that may contain NAs.
# n: maximum number of consecutive NAs one value may fill.
# Returns a vector of the same length as `a`.
f_alexis_laz <- function(a, n) {
  pos <- seq_along(a)
  # NA positions contribute 0, so the running maximum is the index of the
  # latest non-NA element (0 while none has been seen yet).
  last_obs <- cummax((!is.na(a)) * pos)
  # A position keeps its own (NA) value when it is more than n steps past the
  # latest non-NA, or when no non-NA precedes it. Without the second test a
  # zero index would remain and a[...] would silently drop leading NAs.
  keep_own <- last_obs == 0L | (pos - last_obs) > n
  last_obs[keep_own] <- pos[keep_own]
  a[last_obs]
}
# Take f_989's result as the reference and confirm that every other
# implementation returns exactly the same vector on the benchmark input.
r <- f_989(a,n)
identical(r, f_zx8754(a,n))
# [1] TRUE
identical(r, f_docendo_discimus(a,n))
# [1] TRUE
identical(r, f_akrun(a,n))
# [1] TRUE
identical(r, f_alexis_laz(a,n))
# [1] TRUE
# Time all five implementations on the same input (neval = 100 runs each,
# as reported in the output below).
res <- microbenchmark("f1"=f_989(a,n), "f2"=f_zx8754(a,n),
"f3"=f_docendo_discimus(a,n), "f4"=f_akrun(a,n), "f5"=f_alexis_laz(a,n))
# Print the timings ordered by mean run time, fastest first.
print(res, order="mean")
# Unit: microseconds
# expr min lq mean median uq max neval
# f5 129.804 137.014 161.106 141.6715 151.7375 1031.511 100
# f3 1249.351 1354.215 1471.478 1392.9750 1482.2140 2553.086 100
# f1 4736.895 5093.852 5630.367 5345.3450 6069.9260 8848.513 100
# f4 22165.601 23936.866 24660.990 24485.6725 24883.6440 29453.177 100
# f2 205854.339 215582.174 221524.448 218643.9540 224211.0435 261512.922 100
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With