I couldn't find a question similar to the one that I have here. I have a very large named list of named vectors that match column names in a dataframe. I would like to use the list of named vectors to replace values in the dataframe columns that match each list element's name. That is, the name of the vector in the list matches the name of the dataframe column and the key-value pair in each vector element will be used to recode the column.
Reprex below:
library(tidyverse)
# Starting tibble
test <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
I'm able to use mutate
and specify the column and list item manually.
# Works when replacement vector is specified
test %>%
mutate(across(c("A"),
~recode(., !!!dicts$A)))
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie 1 a
#> 2 Bob delta 2 g
#> 3 Cindy bravo b 9
However, the following does not work:
# Does not work when replacement vector using column names
test %>%
mutate(across(c("A", "B", "C"),
~recode(., !!!dicts$.)))
Error: Problem with
mutate()
input..1
. x No replacements provided. i Input..1
is(function (.cols = everything(), .fns = NULL, ..., .names = NULL) ...
.
Additionally, I've found that map2_dfr
works only when all non-recoded columns are specified:
# map2_dfr Sort of works, but requires dropping some columns
map2_dfr(test %>% select(names(dicts)),
dicts,
~recode(.x, !!!.y))
#> # A tibble: 3 x 3
#> A B C
#> <chr> <chr> <chr>
#> 1 charlie yes delta
#> 2 delta no epsilon
#> 3 bravo bad beta
I'm looking to recode columns using the names from the list, without dropping columns.
You can try the base R code below
idx <- match(names(dicts), names(test))
test[idx] <- Map(`[`, dicts, test[idx])
which gives
> test
# A tibble: 3 x 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
Below are three approaches:
First, we can make it work with dplyr::across
in a custom function using dplyr::cur_column()
.
library(tidyverse)
myfun <- function(x) {
mycol <- cur_column()
dplyr::recode(x, !!! dicts[[mycol]])
}
test %>%
mutate(across(c("A", "B", "C"), myfun))
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
A second option is to transform the dicts
into a list of expression and then just splice it into mutate
using the !!!
operator:
expr_ls <- imap(dicts, ~ quo(recode(!!sym(.y), !!!.x)))
test %>%
mutate(!!! expr_ls)
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
Finally, in the larger tidyverse we could use purrr::lmap_at
, but it makes the underlying function more complex than it needs to be:
myfun2 <- function(x) {
x_nm <- names(x)
mutate(x, !! x_nm := recode(!! sym(x_nm), !!! dicts[[x_nm]]))
}
lmap_at(test,
names(dicts),
myfun2)
#> # A tibble: 3 x 4
#> Names A B C
#> <chr> <chr> <chr> <chr>
#> 1 Alice charlie yes delta
#> 2 Bob delta no epsilon
#> 3 Cindy bravo bad beta
Original data
# Starting tibble
test <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
Created on 2021-12-15 by the reprex package (v2.0.1)
Base R (should be translated easily to dplyr
)
# Helper function
look_dict <- function(col, values) dicts[[col]][values]
# lapply
test[names(dicts)] <- lapply(names(dicts), \(col) look_dict(col, test[[col]]))
# magrittr and for loop to avoid repeating code
library(magrittr)
for (col in names(dicts)) test[[col]] %<>% look_dict(col, .)
# # A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
# 1 Alice charlie yes delta
# 2 Bob delta no epsilon
# 3 Cindy bravo bad beta
One work around would be to use your map2_dfr
code, but then bind the columns that are needed to the map2_dfr
output. Though you still have to drop the names column.
library(tidyverse)
map2_dfr(test %>% select(names(dicts)),
dicts,
~ recode(.x,!!!.y)) %>%
dplyr::bind_cols(., Names = test$Names) %>%
dplyr::select(4, 1:3)
Output
# A tibble: 3 × 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
Not a full answer, but I figured a benchmark of the (at the point of writing) existing solutions might be helpful. As with every benchmark YMMV:
As we see, sindri_baldur
's base R version is actually the fastest
(code below)
bench::mark(
karl_base_r(data, dicts),
tim_across(data, dicts),
tim_lmap(data, dicts),
sotos_pivot(data, dicts),
thomas_base_r(data, dicts),
sindri_base_r(data, dicts),
check = FALSE
)
#> # A tibble: 6 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 karl_base_r(data, dicts) 825.9us 968.9us 814. 428.17KB 6.25
#> 2 tim_across(data, dicts) 5.04ms 6.44ms 147. 2.4MB 4.15
#> 3 tim_lmap(data, dicts) 7.34ms 8.49ms 108. 106.06KB 4.17
#> 4 sotos_pivot(data, dicts) 12.79ms 14.58ms 60.6 1.26MB 4.18
#> 5 thomas_base_r(data, dicts) 392us 438.6us 1891. 0B 4.07
#> 6 sindir_base_r(data, dicts) 116.8us 136.7us 5793. 0B 4.11
For a larger dataset, ThomasIsCoding
base R version is a bit faster than Sindir's solution.
set.seed(15)
data_large <- data %>% sample_n(1e6, replace = TRUE)
bench::mark(
karl_base_r(data_large, dicts),
tim_across(data_large, dicts),
tim_lmap(data_large, dicts),
thomas_base_r(data_large, dicts),
sindir_base_r(data_large, dicts),
check = FALSE
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 5 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 karl_base_r(data_large, dicts) 856ms 856ms 1.17 503.9MB 9.35
#> 2 tim_across(data_large, dicts) 647ms 647ms 1.55 504.9MB 10.8
#> 3 tim_lmap(data_large, dicts) 809ms 809ms 1.24 503.6MB 11.1
#> 4 thomas_base_r(data_large, dicts) 131ms 148ms 6.53 80.1MB 3.27
#> 5 sindir_base_r(data_large, dicts) 150ms 180ms 5.08 80.1MB 5.08
library(tidyverse)
library(magrittr)
# Starting tibble
data <- tibble(Names = c("Alice","Bob","Cindy"),
A = c(3,"q",7),
B = c(1,2,"b"),
C = c("a","g",9))
# Named vector
A <- c("5" = "alpha", "7" = "bravo", "3" = "charlie", "q" = "delta")
B <- c("1" = "yes", "2" = "no", "b" = "bad", "c" = "missing")
C <- c("9" = "beta", "8" = "gamma", "a" = "delta", "g" = "epsilon")
# Named list of named vectors
dicts <- list("A" = A, "B" = B, "C" = C) # Same names as columns
# function definitions
karl_base_r <- function(data, dicts) {
for (x in names(dicts))
{data[[x]] <- do.call(recode, c(list(data[[x]]), dicts[[x]])) }
data
}
tim_across <- function(data, dicts) {
myfun <- function(x) {
mycol <- cur_column()
dplyr::recode(x, !!! dicts[[mycol]])
}
data %>%
mutate(across(c("A", "B", "C"), myfun))
}
tim_lmap <- function(data, dicts) {
myfun2 <- function(x) {
x_nm <- names(x)
mutate(x, !! x_nm := recode(!! sym(x_nm), !!! dicts[[x_nm]]))
}
lmap_at(data,
names(dicts),
myfun2)
}
sotos_pivot <- function(data, dicts) {
data %>%
pivot_longer(-1) %>%
left_join(stack(dicts) %>%
rownames_to_column('value'),
by = c('value', 'name' = 'ind')) %>%
pivot_wider(id_cols = -value, names_from = name, values_from = values)
}
thomas_base_r <- function(data, dicts) {
idx <- match(names(dicts), names(data))
data[idx] <- Map(`[`, dicts, data[idx])
data
}
sindri_base_r <- function(data, dicts) {
look_dict <- function(col, values) dicts[[col]][values]
data[names(dicts)] <- lapply(names(dicts), \(col) look_dict(col, data[[col]]))
data
}
Created on 2021-12-15 by the reprex package (v2.0.0)
using base R and recode:
for (x in names(dicts)) { test[[x]] <- do.call(recode, c(list(test[[x]]), dicts[[x]])) }
> test
# A tibble: 3 × 4
Names A B C
<chr> <chr> <chr> <chr>
1 Alice charlie yes delta
2 Bob delta no epsilon
3 Cindy bravo bad beta
Also note that other solutions based on Map()
or str_replace_all()
only work because the test example only uses simple substitutions. If .default
or .missing
were used they would most probably fail.
A solution with merging the two can be,
library(dplyr)
library(tidyr)
test %>%
pivot_longer(-1) %>%
left_join(stack(dicts) %>%
rownames_to_column('value'),
by = c('value', 'name' = 'ind')) %>%
pivot_wider(id_cols = -value, names_from = name, values_from = values)
# A tibble: 3 x 4
# Names A B C
# <chr> <chr> <chr> <chr>
#1 Alice charlie yes delta
#2 Bob delta no epsilon
#3 Cindy bravo bad beta
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With