I have this character vector:
dput(t$line)
c("0304", "0305", "0306", "0308", "0311", "0313", "0314", "0316",
"0318", "0321", "0322", "0323", "0324", "0326", "0327", "0330",
"0333", "0337", "0338", "0339", "0342", "0341", "0344", "0346",
"0347", "0348", "0349", "0350", "0352", "0353", "0357", "0359",
"0360", "0362", "0363", "0364", "0365", "0367", "0371", "0370",
"0373", "0375", "0378", "0380", "0381", "0385", "0386", "0387",
"0391", "0395", "0394", "0397", "0398", "0399", "0400", "0402",
"0404", "0405", "0406", "0408", "0412", "0416", "0419", "0423",
"0424", "0425", "0426", "0428", "0429", "0432", "0433", "0436",
"0435", "0439", "0437", "0440", "0441")
The numbers it contains are not completely continuous. I'd like to make them continuous, while preserving the leading zero or zeros where needed. I've come up with this solution:
# Prepends a single "0" to every number, so the result is only right when
# exactly one leading zero is needed. The run goes from the first value to
# first value + length(t$line), i.e. it has length(t$line) + 1 elements.
paste("0", seq(as.numeric(t$line[1]), as.numeric(t$line[1]) + length(t$line), 1), sep = "")
[1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314" "0315" "0316" "0317" "0318" "0319" "0320"
[18] "0321" "0322" "0323" "0324" "0325" "0326" "0327" "0328" "0329" "0330" "0331" "0332" "0333" "0334" "0335" "0336" "0337"
[35] "0338" "0339" "0340" "0341" "0342" "0343" "0344" "0345" "0346" "0347" "0348" "0349" "0350" "0351" "0352" "0353" "0354"
[52] "0355" "0356" "0357" "0358" "0359" "0360" "0361" "0362" "0363" "0364" "0365" "0366" "0367" "0368" "0369" "0370" "0371"
[69] "0372" "0373" "0374" "0375" "0376" "0377" "0378" "0379" "0380" "0381"
This works okay as long as exactly one `0` needs to be added. However, there may be more than one leading zero, or none at all. How can the sequence be made continuous with the appropriate leading zeros?
One `stringr` option could be:
# Same number of elements as x, counting up from the smallest value. The pad
# width is derived from the widest input string instead of being fixed at 4,
# so inputs with more (or no) leading zeros are handled as well.
stringr::str_pad(
  seq.int(min(as.numeric(x)), length.out = length(x)),
  width = max(nchar(x)),
  side = "left",
  pad = "0"
)
[1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314" "0315" "0316"
[14] "0317" "0318" "0319" "0320" "0321" "0322" "0323" "0324" "0325" "0326" "0327" "0328" "0329"
[27] "0330" "0331" "0332" "0333" "0334" "0335" "0336" "0337" "0338" "0339" "0340" "0341" "0342"
[40] "0343" "0344" "0345" "0346" "0347" "0348" "0349" "0350" "0351" "0352" "0353" "0354" "0355"
[53] "0356" "0357" "0358" "0359" "0360" "0361" "0362" "0363" "0364" "0365" "0366" "0367" "0368"
[66] "0369" "0370" "0371" "0372" "0373" "0374" "0375" "0376" "0377" "0378" "0379" "0380"
A more general solution that takes into account the maximum length of the entries, and therefore implicitly the number of leading zeros:
t$line2 <- c("000517","00524")
#' Fill the gaps in a vector of zero-padded numeric strings
#'
#' Generates every integer from the smallest to the largest value found in
#' `vec` and left-pads each one with zeros to the width of the longest input
#' string.
#'
#' @param vec Character vector of digit strings, e.g. `c("000517", "00524")`.
#' @return Character vector running from the minimum to the maximum of `vec`
#'   without gaps; `character(0)` when `vec` is empty.
Cont.PadZero <- function(vec) {
  # min()/max() of an empty vector give Inf/-Inf, which seq.int() rejects.
  if (length(vec) == 0L) {
    return(character(0))
  }
  width <- max(nchar(vec))
  bounds <- range(as.numeric(vec))
  sprintf(paste0("%0", width, "d"), seq.int(bounds[1], bounds[2]))
}
Cont.PadZero(t$line2)
[1] "000517" "000518" "000519" "000520" "000521" "000522" "000523" "000524"
You want a continuous sequence of `length(x)` elements starting at `min(x)`, where `nchar` of each resulting element is identical to that of the elements of `x`.
Use `sprintf` instead of `paste0` to format the leading zeros. `nchar(x)[1]` gives the width to which the numbers are padded with zeros (where needed). If you cannot be sure that all elements have the same width, use `max(nchar(x))` instead, but that's slower.
Since `x[1]` is not necessarily the minimum, you may want to use `min(as.numeric(x))` as the starting point. When you use `seq`, its end point should be `min(as.numeric(x)) + length(x) - 1` (because the minimum is already the first element). Or use `length.out = length(x)`, which appears to be faster; combined with `seq.int` it is faster still.
# As many elements as x, counting up from the smallest value; the pad width is
# taken from the first element of x.
sprintf(
  paste0("%0", nchar(x)[1], "d"),
  seq.int(from = min(as.numeric(x)), length.out = length(x))
)
# [1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314" "0315"
# [13] "0316" "0317" "0318" "0319" "0320" "0321" "0322" "0323" "0324" "0325" "0326" "0327"
# [25] "0328" "0329" "0330" "0331" "0332" "0333" "0334" "0335" "0336" "0337" "0338" "0339"
# [37] "0340" "0341" "0342" "0343" "0344" "0345" "0346" "0347" "0348" "0349" "0350" "0351"
# [49] "0352" "0353" "0354" "0355" "0356" "0357" "0358" "0359" "0360" "0361" "0362" "0363"
# [61] "0364" "0365" "0366" "0367" "0368" "0369" "0370" "0371" "0372" "0373" "0374" "0375"
# [73] "0376" "0377" "0378" "0379" "0380"
Another option is the colon operator `:`, but `seq.int` above appears to be faster (see the benchmark below).
# Colon variant: the offsets 0, 1, ..., length(x) - 1 are added to the
# smallest value; the pad width is taken from the first element of x.
sprintf(
  paste0("%0", nchar(x)[1], "d"),
  min(as.numeric(x)) + 0:(length(x) - 1)
)
NB: To complete the original vector by filling in the missing values, you may do:
# Covers the whole range of x, from its smallest to its largest value, so
# every gap in the original vector is filled; padded to the widest element.
sprintf(
  fmt = paste0("%0", max(nchar(x)), "d"),
  do.call(`:`, as.list(range(as.numeric(x))))
)
# [1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314"
# [12] "0315" "0316" "0317" "0318" "0319" "0320" "0321" "0322" "0323" "0324" "0325"
# [23] "0326" "0327" "0328" "0329" "0330" "0331" "0332" "0333" "0334" "0335" "0336"
# [34] "0337" "0338" "0339" "0340" "0341" "0342" "0343" "0344" "0345" "0346" "0347"
# [45] "0348" "0349" "0350" "0351" "0352" "0353" "0354" "0355" "0356" "0357" "0358"
# [56] "0359" "0360" "0361" "0362" "0363" "0364" "0365" "0366" "0367" "0368" "0369"
# [67] "0370" "0371" "0372" "0373" "0374" "0375" "0376" "0377" "0378" "0379" "0380"
# [78] "0381" "0382" "0383" "0384" "0385" "0386" "0387" "0388" "0389" "0390" "0391"
# [89] "0392" "0393" "0394" "0395" "0396" "0397" "0398" "0399" "0400" "0401" "0402"
# [100] "0403" "0404" "0405" "0406" "0407" "0408" "0409" "0410" "0411" "0412" "0413"
# [111] "0414" "0415" "0416" "0417" "0418" "0419" "0420" "0421" "0422" "0423" "0424"
# [122] "0425" "0426" "0427" "0428" "0429" "0430" "0431" "0432" "0433" "0434" "0435"
# [133] "0436" "0437" "0438" "0439" "0440" "0441"
# Benchmark variant "seq_to": pads to the widest element, max(nchar(x)), and
# builds the run with seq(from, to), where to = min + length(x) - 1.
# Takes no arguments; reads the global vector `x`.
f1 <- function() sprintf(paste0("%0", max(nchar(x)), "d"),
seq(min(as.numeric(x)), min(as.numeric(x)) + length(x) - 1))
# Benchmark variant "seq_len": pads to the widest element, max(nchar(x)), and
# builds the run with seq(from, length.out = length(x)).
# Takes no arguments; reads the global vector `x`.
f2 <- function() sprintf(paste0("%0", max(nchar(x)), "d"),
seq(min(as.numeric(x)), length.out=length(x)))
# Benchmark variant "seq.int": pads to the widest element, max(nchar(x)), and
# builds the run with seq.int(from, length.out = length(x)).
# Takes no arguments; reads the global vector `x`.
f3 <- function() sprintf(paste0("%0", max(nchar(x)), "d"),
seq.int(min(as.numeric(x)), length.out=length(x)))
# Benchmark variant "seq.int1": same seq.int() call as f3, but the pad width
# comes from the first element only, nchar(x[1]), instead of max(nchar(x)).
# Takes no arguments; reads the global vector `x`.
f31 <- function() sprintf(paste0("%0", nchar(x[1]), "d"),
seq.int(min(as.numeric(x)), length.out=length(x)))
# Benchmark variant "colon": offsets 0:(length(x) - 1) added to the smallest
# value; the pad width comes from the first element only, nchar(x[1]).
# Takes no arguments; reads the global vector `x`.
f4 <- function() sprintf(paste0("%0", nchar(x[1]), "d"),
0:(length(x) - 1) + min(as.numeric(x)))
# Benchmark variant "stringr": builds the run with seq.int() and left-pads it
# with "0" via stringr::str_pad() to the width of the first element, nchar(x[1]).
# Takes no arguments; reads the global vector `x`.
f5 <- function() stringr::str_pad(seq.int(min(as.numeric(x)),
length.out=length(x)),
nchar(x[1]), "left", "0")
# Fix the RNG seed so the shuffled input below is reproducible.
set.seed(5789)
# Test input: the five-character zero-padded strings for 1..99999, shuffled.
x <- sample(sprintf("%05d", 1:99999))
# Time all six variants; the argument names are the labels shown in the
# `expr` column of the results table.
microbenchmark::microbenchmark(seq_to=f1(), seq_len=f2(), seq.int=f3(),
seq.int1=f31(), colon=f4(), stringr=f5())
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# seq_to 104.22119 106.83928 108.92791 107.81301 109.68406 124.35686 100 f
# seq_len 87.14385 89.89180 92.34962 90.97192 92.09823 110.59426 100 d
# seq.int 85.72324 87.93885 89.91353 89.03327 90.32758 113.41480 100 c
# seq.int1 59.54312 61.63065 62.86618 62.47707 63.53334 76.33471 100 a
# colon 60.94867 63.16109 64.73306 63.88925 64.79997 81.63646 100 b
# stringr 99.08452 101.56649 104.01522 102.74420 104.20269 158.30948 100 e
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With