R has a handy cross platform tar() function that can tar and gzip files. It seems this function was designed to tar up entire directories. I was hoping to use this function to tar and compress a subset of a directory, or a single file. I can't seem to do this, however. I was expecting the following to tar up a single csv file in the current working directory:
tar("tst.tgz", files = "myCsv.csv", compression = "gzip")
So is it only possible to use the tar() function on directories?
I've temporarily gotten around this by creating a temp directory, copying my files, and then tarring the whole temp dir. But I was hoping for a simpler solution that would not require copying the files, which is somewhat time-consuming for large files.
I don't think that is possible as you describe, JD. The files
argument is passed to the path
argument of list.files
, and as a result it works by tarring up files in directories, not individual files.
If you are prepared to edit an internal function, tar()
can be made to do what you want, by fiddling with the call to list.files()
inside tar()
. A bit of fiddling produced the tar2()
function below, which has extra arguments to control what list.files()
returns. Using this function, we can achieve what you want via a call like this:
# The directory to search is given via `files` (tar2() has no `path`
# argument); it is handed to list.files() as its `path`.
tar2("foo.tar", files = ".", pattern = "bar.csv", recursive = FALSE,
     full.names = FALSE, all.files = FALSE)
The all.files = FALSE
is likely redundant unless you have hidden files with names containing "bar.csv"
.
The recursive = FALSE
bit just stops the function searching anywhere but the current directory, which seems to be what you want, and speeds the search up if the working dir has a lot of files and sub-folders.
The full.names = FALSE
bit is key. If this is TRUE
, list.files()
returns the matched filename as "./bar.csv"
, which tar()
would stick in a folder inside the tarball. If we set this to FALSE
, list.files()
returns "bar.csv"
, so we get a tarball with a single CSV file as requested.
If you have files with similar names and wish to only find the stated filename, peg it inside the pattern with ^
and $
, eg:
# The directory to search is given via `files` (tar2() has no `path`
# argument). The dot is escaped so that it only matches a literal ".".
tar2("foo.tar", files = ".", pattern = "^bar\\.csv$", recursive = FALSE,
     full.names = FALSE, all.files = FALSE)
Here is the modified tar()
function as tar2()
:
# Write a tar archive of the files that list.files() finds under `files`.
#
# Modified copy of utils::tar() that forwards `pattern`, `all.files`,
# `recursive` and `full.names` to list.files(), so that a single file or a
# subset of a directory can be archived.
#
# Arguments:
#   tarfile            Path of the archive to create, or a connection to
#                      write to.
#   files              Directory (or directories) handed to list.files() as
#                      its `path`; NULL means the current directory.
#   compression        One of "none", "gzip", "bzip2", "xz". Only used when
#                      `tarfile` is a path.
#   compression_level  Level passed to gzfile() / bzfile() / xzfile().
#   tar                External tar command. "" or "internal" selects the R
#                      implementation below. When an external command is
#                      used, `files` is passed to it verbatim and the
#                      list.files() arguments are ignored.
#   pattern, all.files, recursive, full.names
#                      Forwarded to list.files(). The returned names are
#                      opened as given, i.e. relative to the working
#                      directory.
#
# Returns (invisibly) 0L from the internal writer, or the value of system()
# when an external tar command is run.
tar2 <- function(tarfile, files = NULL,
                 compression = c("none", "gzip", "bzip2", "xz"),
                 compression_level = 6, tar = Sys.getenv("tar"),
                 pattern = NULL, all.files = TRUE, recursive = TRUE,
                 full.names = TRUE) {
  # Zero-padded octal digits of a non-negative whole number, as raw bytes.
  # Works on doubles, so values beyond the integer range (files of 2 GiB and
  # more) are formatted correctly instead of becoming NA.
  octal_field <- function(x, width) {
    x <- floor(as.numeric(x))
    digits <- character(width)
    for (i in rev(seq_len(width))) {
      digits[i] <- as.character(x %% 8)
      x <- x %/% 8
    }
    charToRaw(paste(digits, collapse = ""))
  }

  if (is.character(tarfile)) {
    compression <- match.arg(compression)
    if (nzchar(tar) && tar != "internal") {
      # Delegate to the external tar program; nothing below is run.
      flags <- switch(compression,
        none = "cf",
        gzip = "zcf",
        bzip2 = "jcf",
        xz = "Jcf"
      )
      cmd <- paste(tar, flags, shQuote(tarfile),
                   paste(shQuote(files), collapse = " "))
      return(invisible(system(cmd)))
    }
    con <- switch(compression,
      none = file(tarfile, "wb"),
      gzip = gzfile(tarfile, "wb", compression = compression_level),
      bzip2 = bzfile(tarfile, "wb", compression = compression_level),
      xz = xzfile(tarfile, "wb", compression = compression_level)
    )
    # Only connections opened here are closed here; a caller-supplied
    # connection is left open.
    on.exit(close(con), add = TRUE)
  } else if (inherits(tarfile, "connection")) {
    con <- tarfile
  } else {
    stop("'tarfile' must be a character string or a connection")
  }

  # list.files() needs a character path; treat the NULL default as ".".
  if (is.null(files)) {
    files <- "."
  }
  files <- list.files(files, recursive = recursive, all.files = all.files,
                      full.names = full.names, pattern = pattern)

  # Put the parent directories of the matched files ahead of the files.
  parents <- unique(dirname(files))
  files <- c(parents[!parents %in% c(".", files)], files)

  slash <- charToRaw("/")
  for (f in unique(files)) {
    info <- file.info(f)
    if (is.na(info$size)) {
      warning(gettextf("file '%s' not found", f), domain = NA)
      next
    }
    is_dir <- info$isdir
    is_link <- FALSE
    header <- raw(512L)

    # Name field (bytes 1-100); directories carry a trailing slash.
    if (is_dir && !grepl("/$", f)) {
      f <- paste0(f, "/")
    }
    name <- charToRaw(f)
    if (length(name) > 100L) {
      # Long paths are split at a "/" into prefix (bytes 346-500) and name.
      if (length(name) > 255L) {
        stop("file path is too long")
      }
      slashes <- which(name[seq_len(min(155L, length(name)))] == slash)
      if (length(slashes) == 0L) {
        stop("file path is too long")
      }
      s <- max(slashes)
      if (s + 100 < length(name)) {
        stop("file path is too long")
      }
      warning("storing paths of more than 100 bytes is not portable:\n ",
              sQuote(f), domain = NA)
      # seq_len() keeps the prefix empty when the split is at byte 1.
      prefix <- name[seq_len(s - 1L)]
      name <- name[-seq_len(s)]
      header[345L + seq_along(prefix)] <- prefix
    }
    header[seq_along(name)] <- name

    # Mode, owner and group ids, modification time.
    header[101:107] <- charToRaw(sprintf("%07o", info$mode))
    uid <- info$uid
    if (!is.null(uid) && !is.na(uid)) {
      header[109:115] <- charToRaw(sprintf("%07o", uid))
    }
    gid <- info$gid
    if (!is.null(gid) && !is.na(gid)) {
      header[117:123] <- charToRaw(sprintf("%07o", gid))
    }
    header[137:147] <- charToRaw(sprintf("%011o", as.integer(info$mtime)))

    # Type flag (byte 157) and, for symbolic links, the target (158-257).
    size <- if (is_dir) 0 else info$size
    if (is_dir) {
      header[157L] <- charToRaw("5")
    } else {
      lnk <- Sys.readlink(f)
      if (is.na(lnk)) {
        lnk <- ""
      }
      is_link <- nzchar(lnk)
      header[157L] <- charToRaw(if (is_link) "2" else "0")
      if (is_link) {
        # Measure the target in bytes: the link field holds 100 bytes.
        lnk_raw <- charToRaw(lnk)
        if (length(lnk_raw) > 100L) {
          stop("linked path is too long")
        }
        header[157L + seq_along(lnk_raw)] <- lnk_raw
        size <- 0
      }
    }

    # Size field: 11 octal digits, so anything from 8^11 bytes cannot fit.
    if (size >= 8^11) {
      stop("file size is limited to 8GB")
    }
    header[125:135] <- octal_field(size, 11L)

    # Magic "ustar", version "00", then user and group names.
    header[258:262] <- charToRaw("ustar")
    header[264:265] <- charToRaw("0")
    uname <- info$uname
    if (!is.null(uname) && !is.na(uname)) {
      uname_raw <- charToRaw(uname)
      header[265L + seq_along(uname_raw)] <- uname_raw
    }
    grname <- info$grname
    if (!is.null(grname) && !is.na(grname)) {
      grname_raw <- charToRaw(grname)
      header[297L + seq_along(grname_raw)] <- grname_raw
    }

    # The checksum is computed with its own field filled with spaces.
    header[149:156] <- charToRaw(" ")
    checksum <- sum(as.integer(header)) %% 2^24
    header[149:154] <- charToRaw(sprintf("%06o", as.integer(checksum)))
    header[155L] <- as.raw(0L)
    writeBin(header, con)

    if (is_dir || is_link) {
      next
    }

    # Copy the contents in 512-byte blocks, zero-padding the last one.
    inf <- file(f, "rb")
    tryCatch(
      {
        for (i in seq_len(ceiling(info$size / 512))) {
          block <- readBin(inf, "raw", 512L)
          writeBin(block, con)
          n <- length(block)
          if (n < 512L) {
            writeBin(raw(512L - n), con)
          }
        }
      },
      finally = close(inf)
    )
  }

  # Two all-zero blocks terminate the archive.
  block <- raw(512L)
  writeBin(block, con)
  writeBin(block, con)
  invisible(0L)
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With