I'm struggling to find an easy a fast solution to create a new data frame by multiplying all "group" of columns between them.
Data for example
a1 <- rnorm(n = 10)
b1 <- rnorm(n = 10)
c1 <- rnorm(n = 10)
a2 <- rnorm(n = 10)
b2 <- rnorm(n = 10)
c2 <- rnorm(n = 10)
For example in this in my initial datatable
Original <- data.frame(
date = seq(today()-9, today(), by = 1),
a1 = a1,
b1 = b1,
c1 = c1,
a2 = a2,
b2 = b2,
c2 = c2)
and this datatable is what I would like to achieve (e.i., columns with all the possible combination between the columns that end with a 1 and columns with all the possible combination between the columns that end with a 2)
Objective <- data.frame(
date = seq(today()-9, today(), by = 1),
b1a1 = b1*a1,
c1a1 = c1*a1,
c1b1 = c1*b1,
b2c2 = b2*c2,
b2a2 = b2*a2,
c2a2 = c2*a2)
I tried with loops but it's not a very elegant and efficient solution; or at least mine wasn't. A solution using the tidyverse would be very welcome
Thanks in advance
I.T
Here is base R option -
cbind(Original[1], do.call(cbind,
unname(lapply(split.default(Original[-1],
gsub('\\D', '', names(Original[-1]))), function(x) {
do.call(cbind, combn(names(x), 2, function(y) {
setNames(data.frame(do.call(`*`, Original[y])),
paste0(y, collapse = ''))
}, simplify = FALSE))
}))))
# date a1b1 a1c1 b1c1 a2b2 a2c2 b2c2
#1 2021-05-28 -0.06708 1.393018 -0.1213 0.1795 -1.0878 -0.0947
#2 2021-05-29 0.33234 0.045563 0.0201 0.0607 0.0247 0.9219
#3 2021-05-30 0.05043 0.160582 0.0341 0.1748 -0.3893 -0.1184
#4 2021-05-31 0.93642 0.980333 0.8156 0.0746 -1.1128 -0.1571
#5 2021-06-01 -1.21365 -0.256619 0.3268 -1.0106 -0.3542 2.1991
#6 2021-06-02 -0.09550 1.311417 -0.0754 -0.8243 -0.5532 1.1986
#7 2021-06-03 0.32514 0.373324 2.3262 -1.1904 -3.0764 0.7171
#8 2021-06-04 -0.41219 1.034527 -0.8338 -1.8588 -1.0202 2.6916
#9 2021-06-05 0.12488 -0.155639 -0.2294 0.2380 0.4288 0.3711
#10 2021-06-06 -0.00665 0.000139 -0.0105 -2.0117 -0.6363 1.0802
Explanation of the answer -
split.default
is used to split the data in groups.split.default(Original[-1], gsub('\\D', '', names(Original[-1])))
#$`1`
# a1 b1 c1
#1 -0.87773 0.0764 -1.5871
#2 0.86812 0.3828 0.0525
#3 0.48761 0.1034 0.3293
#4 -1.06095 -0.8826 -0.9240
#5 0.97625 -1.2432 -0.2629
#6 -1.28910 0.0741 -1.0173
#7 -0.22843 -1.4234 -1.6343
#8 -0.71512 0.5764 -1.4467
#9 0.29108 0.4290 -0.5347
#10 -0.00937 0.7098 -0.0149
#$`2`
# a2 b2 c2
#1 -1.4360 -0.125 0.758
#2 -0.0403 -1.507 -0.612
#3 -0.7580 -0.231 0.514
#4 0.7270 0.103 -1.531
#5 -0.4035 2.505 0.878
#6 0.6168 -1.336 -0.897
#7 2.2599 -0.527 -1.361
#8 -0.8394 2.215 1.215
#9 -0.5244 -0.454 -0.818
#10 1.0886 -1.848 -0.585
where gsub
is used to remove all non-numeric character from the column names which is used to create groups.
gsub('\\D', '', names(Original[-1]))
#[1] "1" "1" "1" "2" "2" "2"
For every group using lapply
we create every combination of column names (combn(names(x), 2.....
) taking 2 columns at a time.
Multiply each combination (do.call(
*, Original[y])
) create a one-column dataframe and give the name of the column using setNames
that is name of the combination (paste0(y, collapse = '')
)
All the combinations from step 3 are combined into one dataframe. (do.call(cbind, combn.....
).
All such groups are again combined into one dataframe (do.call(cbind, lapply...
).
First column with dates is kept as it is in the final output (cbind(Original[1], ....
).
Very good question. A tidyverse
approach. This approach will have combination of uneven number of columns per group. Explanation -
pivot_longer
gsub
. You may use str_replace
too.dplyr::group_split
tidyr::pivot_wider
inside purrr::map
nowcombn
and Reduce
. You may also use purrr::reduce
herecombn
and Reduce
purrr::reduce
in conjunction with dplyr::left_join
list is converted back to intended shapeset.seed(123)
a1 <- rnorm(n = 10)
b1 <- rnorm(n = 10)
c1 <- rnorm(n = 10)
a2 <- rnorm(n = 10)
b2 <- rnorm(n = 10)
c2 <- rnorm(n = 10)
Original <- data.frame(
date = seq(Sys.Date()-9, Sys.Date(), by = 1),
a1 = a1,
b1 = b1,
c1 = c1,
a2 = a2,
b2 = b2,
c2 = c2)
library(tidyverse)
Original %>% pivot_longer(!date) %>%
mutate(grp = gsub('^\\D*(\\d)+$', '\\1', name)) %>%
group_split(grp, .keep = F) %>%
map(~ .x %>% pivot_wider(names_from = name, values_from = value)) %>%
map(~ combn(.x[-1], 2, FUN = Reduce, f = `*`) %>% as.data.frame() %>%
setNames(combn(names(.x[-1]), 2, FUN = Reduce, f = paste0)) %>% cbind(.x[1], .)) %>%
reduce(~left_join(.x, .y, by = 'date'))
date a1b1 a1c1 b1c1 a2b2 a2c2 b2c2
1 2021-05-28 -0.68606804 0.59848918 -1.30710356 -0.29626767 0.108031283 -0.175982140
2 2021-05-29 -0.08282104 0.05017292 -0.07843039 0.06135046 0.008423333 0.005935364
3 2021-05-30 0.62468579 -1.59924166 -0.41119329 -1.13268875 -0.038374446 0.054248120
4 2021-05-31 0.00780406 -0.05139295 -0.08067566 1.90463287 1.201815497 2.968438088
5 2021-06-01 -0.07186344 -0.08080991 0.34742254 0.99243873 -0.185489171 -0.272722771
6 2021-06-02 3.06467216 -2.89278864 -3.01397443 -0.77341778 1.044302702 -1.703161152
7 2021-06-03 0.22946735 0.38614963 0.41709268 -0.22316502 -0.857881519 0.623969018
8 2021-06-04 2.48789113 -0.19402639 -0.30162620 0.02889143 -0.036194437 -0.272813136
9 2021-06-05 -0.48172830 0.78173260 -0.79823906 -0.23864021 -0.037894774 0.096601990
10 2021-06-06 0.21070515 -0.55877763 -0.59279292 0.03171951 -0.082159505 -0.018002847
Check it for this extended dataset
set.seed(123)
a1 <- rnorm(n = 10)
b1 <- rnorm(n = 10)
c1 <- rnorm(n = 10)
a2 <- rnorm(n = 10)
b2 <- rnorm(n = 10)
c2 <- rnorm(n = 10)
d2 <- rnorm(n = 10)
Original <- data.frame(
date = seq(Sys.Date()-9, Sys.Date(), by = 1),
a1 = a1,
b1 = b1,
c1 = c1,
a2 = a2,
b2 = b2,
c2 = c2,
d2 = d2)
library(tidyverse)
Original %>% pivot_longer(!date) %>%
mutate(grp = gsub('^\\D*(\\d)+$', '\\1', name)) %>%
group_split(grp, .keep = F) %>%
map(~ .x %>% pivot_wider(names_from = name, values_from = value)) %>%
map(~ combn(.x[-1], 2, FUN = Reduce, f = `*`) %>% as.data.frame() %>%
setNames(combn(names(.x[-1]), 2, FUN = Reduce, f = paste0)) %>% cbind(.x[1], .)) %>%
reduce(~left_join(.x, .y, by = 'date'))
date a1b1 a1c1 b1c1 a2b2 a2c2 a2d2 b2c2 b2d2 c2d2
1 2021-05-28 -0.68606804 0.59848918 -1.30710356 -0.29626767 0.108031283 0.161902656 -0.175982140 -0.26373820 0.09616971
2 2021-05-29 -0.08282104 0.05017292 -0.07843039 0.06135046 0.008423333 0.148221326 0.005935364 0.10444173 0.01433970
3 2021-05-30 0.62468579 -1.59924166 -0.41119329 -1.13268875 -0.038374446 -0.298262480 0.054248120 0.42163941 0.01428475
4 2021-05-31 0.00780406 -0.05139295 -0.08067566 1.90463287 1.201815497 -0.894445153 2.968438088 -2.20924515 -1.39402460
5 2021-06-01 -0.07186344 -0.08080991 0.34742254 0.99243873 -0.185489171 -0.880563395 -0.272722771 -1.29468307 0.24197936
6 2021-06-02 3.06467216 -2.89278864 -3.01397443 -0.77341778 1.044302702 0.209022041 -1.703161152 -0.34089562 0.46029226
7 2021-06-03 0.22946735 0.38614963 0.41709268 -0.22316502 -0.857881519 0.248271309 0.623969018 -0.18057692 -0.69416615
8 2021-06-04 2.48789113 -0.19402639 -0.30162620 0.02889143 -0.036194437 -0.003281582 -0.272813136 -0.02473471 0.03098700
9 2021-06-05 -0.48172830 0.78173260 -0.79823906 -0.23864021 -0.037894774 -0.282179411 0.096601990 0.71933645 0.11422674
10 2021-06-06 0.21070515 -0.55877763 -0.59279292 0.03171951 -0.082159505 -0.779997773 -0.018002847 -0.17091365 0.44269850
Created on 2021-06-06 by the reprex package (v2.0.0)
You can also use the following solution, not as concise as other answers but here is a different approach that might have some points worthy of consideration. Much of the first chunk of codes I tried to emulate combn
function with tidyverse
equivalences. So first chuck which leads to df2
data set creates all the combinations whose products you would like to calculate and the second chunk just evaluates them in the context of Original
data set. Anyway thank you for this fantastic question that pushed me to the limits.
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
library(rlang)
cols <- c("(\\w1)", "(\\w2)")
cols %>%
map_dfc(~ names(Original)[str_detect(names(Original), .x)] %>%
as_tibble() %>%
mutate(value2 = rev(value)) %>%
expand(value, value2) %>%
filter(value != value2) %>%
rowwise() %>%
mutate(comb = paste0(sort(c(value, value2)), collapse = "*")) %>%
select(comb) %>%
distinct(comb)) %>%
rename_with(~ str_remove(., "\\.\\.\\."), everything()) %>%
pivot_longer(everything(), names_to = c(".value", "id"),
names_pattern = "(\\w+)(\\d)") -> df2
df2 %>%
select(comb) %>%
rowwise() %>%
mutate(data = map(comb, ~ eval_tidy(parse_expr(.x), data = Original))) %>%
unnest(cols = c(data)) %>%
group_by(comb) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = comb, values_from = data) %>%
relocate(ends_with("1")) %>%
bind_cols(Original$date) %>%
rename_with(~ str_remove(., "\\*"), everything()) %>%
rename(Date = ...8) %>%
relocate(Date) %>%
select(-id)
# A tibble: 10 x 7
Date a1b1 a1c1 b1c1 a2b2 a2c2 b2c2
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2021-05-28 -0.129 0.0912 -0.0838 -1.55 -1.52 2.11
2 2021-05-29 -0.477 -1.58 0.352 -3.55 -0.144 0.101
3 2021-05-30 0.195 0.708 0.105 0.910 -0.356 -0.177
4 2021-05-31 -0.194 0.0219 -0.0111 -1.35 0.261 -0.200
5 2021-06-01 0.0140 0.107 0.000601 -0.0279 -0.126 0.104
6 2021-06-02 0.242 0.141 0.174 -0.0174 0.695 -0.0570
7 2021-06-03 -0.439 -0.360 0.589 0.804 -2.76 -1.79
8 2021-06-04 -1.02 -0.0349 0.0137 2.07 0.357 0.495
9 2021-06-05 -0.00670 0.550 -0.00161 -0.000907 0.00503 -0.925
10 2021-06-06 -0.287 -0.505 0.718 -0.0290 -0.00351 0.0256
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With