I want to update join two tables within a function. Here is an example without using a function:
library(data.table)
Xtest <- data.table(a = rnorm(20), b = rnorm(20), c = 1:20)
Ytest <- data.table(c = 1:10, d = rnorm(10))
Xtest[Ytest, on = .(c), newcol := i.d]
# > Xtest[Ytest, on = .(c), newcol := i.d]
# > Xtest
# a b c newcol
# 1: -1.68473343 -0.74498296 1 0.35096663
# 2: -0.98461614 2.15317525 2 -1.33890396
# 3: -1.65427602 1.21183896 3 1.49641480
# 4: -0.65045253 -0.74609860 4 -0.03227097
# 5: 1.49058508 1.20315276 5 1.41580186
# 6: -0.31631871 0.68716871 6 -0.03671959
# 7: 1.35923085 -0.20082238 7 -2.27959124
# 8: -0.75649545 0.24058212 8 0.93770862
# 9: 0.22452260 -0.28212892 9 -0.02500419
# 10: 0.30209786 1.33697797 10 0.67729741
# 11: 0.88748221 -0.54421418 11 NA
# 12: 0.47207422 -0.28159382 12 NA
# 13: -1.17270475 0.83940750 13 NA
# 14: -2.02787820 -0.03672582 14 NA
# 15: -0.22187761 0.59137210 15 NA
# 16: 0.97750330 -0.27030756 16 NA
# 17: 0.22725940 0.54617488 17 NA
# 18: 0.94065525 -0.23482152 18 NA
# 19: 2.12049977 0.69920776 19 NA
# 20: 0.06192823 0.12262739 20 NA
# Xtest[, newcol := NULL]
I tried to recast the above code into a function, but the Ycol argument seems to be hidden:
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))
}
# > myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
# + eval(substitute(X[Y, on = joinlist, newcol:=i.Ycol]))
# + }
# > myjoinfunction(Xtest, Ytest, list(c), D, d)
# Error in eval(jsub, SDenv, parent.frame()) : object 'i.Ycol' not found
# 8.
# eval(jsub, SDenv, parent.frame())
# 7.
# eval(jsub, SDenv, parent.frame())
# 6.
# `[.data.table`(Xtest, Ytest, on = list(c), `:=`(D, i.Ycol))
# 5.
# Xtest[Ytest, on = list(c), `:=`(D, i.Ycol)]
# 4.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 3.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 2.
# eval(substitute(X[Y, on = joinlist, `:=`(newcol, i.Ycol)]))
# 1.
# myjoinfunction(Xtest, Ytest, list(c), D, d)
How can I make the Ycol argument visible within the function?
UPDATED EXAMPLE
To be more general, here is a new example of an update join I would like to write a function for:
Xtest <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4)
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10)
)
# > Xtest[Ytest, on = .(c, d), `:=`(newcol1 = i.e, newcol2 = i.f) ]
# > Xtest
# a b c d newcol1 newcol2
# 1: -2.4939743 -0.200370619 a 1 -1.4934893 -1.0288955
# 2: 1.0188321 -1.182286508 a 2 1.3811712 0.9747131
# 3: 0.5217161 -0.152117649 a 3 -0.4168069 0.1218213
# 4: -0.1584167 0.583640353 a 4 0.4644738 1.7888567
# 5: -0.4271398 0.020067301 a 5 2.5279998 2.0919953
# 6: -1.7692909 0.250129040 b 1 -1.5964246 -1.0884861
# 7: -0.8899915 0.971742055 b 2 0.3011304 1.2629524
# 8: -0.4490363 -1.540005621 b 3 -0.7992208 -0.5155775
# 9: -0.5706488 -1.037077614 b 4 1.0058213 1.9787692
# 10: -0.0922679 1.444487848 b 5 -0.2893311 -0.6095043
# 11: 0.9924810 -1.144513228 c 1 NA NA
# 12: 1.2232591 1.503649791 c 2 NA NA
# 13: 0.8751961 0.892765910 c 3 NA NA
# 14: 0.9960554 0.499310073 c 4 NA NA
# 15: -0.6184695 1.867985589 c 5 NA NA
# 16: 0.6503936 0.422683211 d 1 NA NA
# 17: -0.6160834 -1.585713893 d 2 NA NA
# 18: 1.5949931 -0.544704857 d 3 NA NA
# 19: 0.7232079 -0.006460518 d 4 NA NA
# 20: -0.2824961 0.119585859 d 5 NA NA
Using a suggestion from David Arenburg in the comments trips up when joinlist is a list of strings
myjoinfunction1 <- function(X, Y, joinlist, newcol, Ycol) X[Y, on = joinlist, newcol:= get(paste0("i.", Ycol))]
# > myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")
# Error in .parse_on(substitute(on), isnull_inames) :
# 'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.
# 5.
# stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
# 4.
# .parse_on(substitute(on), isnull_inames)
# 3.
# `[.data.table`(X, Y, on = joinlist, `:=`(newcol, get(paste0("i.",
# Ycol))))
# 2.
# X[Y, on = joinlist, `:=`(newcol, get(paste0("i.", Ycol)))]
# 1.
# myjoinfunction1(Xtest, Ytest, list("c", "d"), "newcol", "e")
For the more general second example:
f <- function(X, Y, joinlist, cols) {
X[Y, on = joinlist, names(cols) := mget(sprintf("i.%s", cols))]
}
Usage:
set.seed(1)
Xtest2 <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4)
)
Ytest2 <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10)
)
f(Xtest2, Ytest2, c("c", "d"), c(newcol1 = "e", newcol2 = "f"))
Side note: There are some edge cases to watch out for when adding columns in a data.table (running out of "column slots"; adding to a table loaded from disk)
Here is my answer based on the comments. The eval and substitute can be applied at different locations within the data.table, instead of wrapping the entire data.table in eval(substitute(DT)). Also, when using substitute with a string instead of an expression, eval is not needed.
myjoinfunction <- function(X, Y, joinlist, newcol, Ycol) {
Ycol <- paste0("i.", as.character(substitute(Ycol))[-1])
X[
Y,
on = eval(substitute(joinlist)),
as.character(substitute(newcol))[-1] := mget(Ycol)
d]
}
Usage:
set.seed(42)
Xtest <- data.table(
a = rnorm(20),
b = rnorm(20),
c = rep(letters[1:4], rep(5, 4)),
d = rep(1:5, 4),
id = 1:20
)
Ytest <- data.table(
c = rep(letters[1:2], rep(5, 2)),
d = rep(1:5, 2),
e = rnorm(10),
f = rnorm(10),
id = 1:10
)
# multiple columns supported both for joining and updating
myjoinfunction(Xtest, Ytest, list(c, d), list(newcol1, newcol2), list(e, f));
# Single columns have to be expressed in a list
myjoinfunction(Xtest, Ytest, list(id), list(newcol1), list(e));
By not having the arguments as strings, this is better for interactive use than programmatic use.
Suggestions for refinements are welcome! Especially if there is a more elegant way to express single columns in the function arguments or handle lists within the function (I don't like having to use [-1] as a workaround within the function when handling lists, I think it will lead to errors eventually.)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With