I have this code which does exactly what I want but I have to call my function three times for every group which seems very inefficient.
library(data.table)
myRegr = function(x, y) {
regr = lm.fit(cbind(1, x), y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(y)
return (c(k = k, m = m, r2 = r2))
}
dt = data.table(a = c(0, 0, 0, 1, 1, 1),
x = c(12, 21, 15, 34, 32, 31),
y = c(3, 1, 6, 4, 2, 8))
result = dt[,list(minX = min(x),
minY = min(y),
k = myRegr(x, y)["k"],
m = myRegr(x, y)["m"],
r2 = myRegr(x, y)["r2"]
),
by = list(a)]
print(result)
Outputs:
a minX minY k m r2
0 12 1 -0.3095238 8.285714 0.3176692
1 31 2 -1.0000000 37.000000 0.2500000
Any idea how I can rewrite this to only call the function once?
UPDATE: My example didn't cover the complete problem as I have a fourth column which I'm selecting on, here is a better example:
library(data.table)
myRegr = function(x, y) {
regr = lm.fit(cbind(1, x), y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(y)
return (c(k = k, m = m, r2 = r2))
}
df = data.frame(a = c(0, 0, 0, 1, 1, 1),
x = c(12, 21, 15, 34, 32, 31),
y = c(3, 1, 6, 4, 2, 8),
time = as.POSIXct(c("2019-01-01 08:12:00", "2019-01-01 08:13:00", "2019-01-01 08:14:00", "2019-01-01 08:12:00", "2019-01-01 08:13:00", "2019-01-01 08:14:00")))
dt = data.table(df)
result = dt[, list(firstX = x[time == min(time)],
firstY = y[time == min(time)],
k = myRegr(x, y)["k"],
m = myRegr(x, y)["m"],
r2 = myRegr(x, y)["r2"]
),
by = a]
print(result)
Outputs:
a firstX firstY k m r2
0 12 3 -0.3095238 8.285714 0.3176692
1 34 4 -1.0000000 37.000000 0.2500000
Tried wrapping it all in a function but it actually slowed things down:
library(data.table)
myRegrList = function(group) {
firstX = group[,x[time == min(time)]]
firstY = group[,y[time == min(time)]]
regr = lm.fit(cbind(1, group$x), group$y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(group$y)
return (list(firstX = firstX, firstY = firstY, k = k, m = m, r2 = r2))
}
result = dt[, myRegrList(.SD), by = a]
print(result)
You can modify your function to return a vector and dcast
final result:
library(data.table)
myRegr = function(x, y) {
regr <- lm.fit(cbind(1, x), y)
c(
regr$coef[[1]],
regr$coef[[2]],
1 - var(regr$residuals) / var(y)
)
}
result <- df[, .(minX = min(x), minY = min(y), myRegr(x, y), c("m", "k", "r2")), a]
dcast(result, a + minX + minY ~ V4, value.var = "V3")
This solution is not perfect as I have to create V4
(add c("m", "k", "r2")
vector). There should be a better way to do this (perhaps even not to use dcast
). Maybe more experienced data.table
users could advice on this?
Data:
df <- data.table(
a = c(0, 0, 0, 1, 1, 1),
x = c(12, 21, 15, 34, 32, 31),
y = c(3, 1, 6, 4, 2, 8)
)
If you make your function return a list you only need to call
dt[, myRegr(x, y), by = a]
# a minX minY k m r2
#1: 0 12 1 -0.3095238 8.285714 0.3176692
#2: 1 31 2 -1.0000000 37.000000 0.2500000
With
myRegr = function(x, y) {
regr = lm.fit(cbind(1, x), y)
coefs = regr$coef
k = coefs[[2]]
m = coefs[[1]]
r2 = 1 - var(regr$residuals) / var(y)
return (list(# minX = min(x),
# minY = min(y),
k = k,
m = m,
r2 = r2))
}
update
You might subset for x
and y
values and then join with the result of your function
result <- dt[dt[, .I[which.min(time)], by = a]$V1, .(a, x, y)]
result <- result[dt[, myRegr(x, y), by = a], on = .(a)]
result
# a x y k m r2
#1: 0 12 3 -0.3095238 8.285714 0.3176692
#2: 1 34 4 -1.0000000 37.000000 0.2500000
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With