I am a recoding a variable based on some rather long strings, here exemplified by the strings A, B, C, D, E, and G. I was wondering if there is a way to recode this with out having to repeat the reference to df$foo 12 times using base R? Maybe there is some some smarter faster way routhe I could explore? is this is really the smartest way to do it in R?
df <- data.frame(
  foo = 1000:1010,
  bar = letters[1:11])
df  
    foo bar
1  1000   a
2  1001   b
3  1002   c
4  1003   d
5  1004   e
6  1005   f
7  1006   g
8  1007   h
9  1008   i
10 1009   j
11 1010   k
A  <- c(1002)
B  <- c(1007, 1008)
C  <- c(1001, 1003)
D  <- c(1004, 1006)
E  <- c(1000, 1005)
G  <- c(1010, 1009)
df$foo[df$foo %in% A] <- 1
df$foo[df$foo %in% B] <- 2
df$foo[df$foo %in% C] <- 3
df$foo[df$foo %in% D] <- 4
df$foo[df$foo %in% E] <- 5
df$foo[df$foo %in% G] <- 7
df
   foo bar
1    5   a
2    3   b
3    1   c
4    3   d
5    4   e
6    5   f
7    4   g
8    2   h
9    2   i
10   7   j
11   7   k
I've rewritten the five solutions to functions to be able to compare them using the microbenchmark package and the result is that Tyler Rinker and flodel's solutions is the fastest solutions (see results below), not to say this question is about speed alone. I am also looking for conciseness and smartness in the solution. Out of curiosity I also added a solution using the Recode function from the car package.  Please feel free to let me know if I could have rewritten the solutions in a more optimal way or if the the microbenchmark package is not the best way to compare these functions.
df <- data.frame(
  foo = sample(1000:1010, 1e5+22, replace = TRUE),
  bar = rep(letters, 3847))
str(df)
A  <- c(1002)
B  <- c(1007, 1008)
C  <- c(1001, 1003)
D  <- c(1004, 1006)
E  <- c(1000, 1005)
G  <- c(1010, 1009)
# juba's solution
juba <- function(df,foo) within(df, {foo[foo %in% A] <- 1; foo[foo %in% B] <- 2;foo[foo %in% C] <- 3;foo[foo %in% D] <- 4;foo[foo %in% E] <- 5;foo[foo %in% G] <- 7})
# Arun's solution
Arun <- function(df,x) factor(df[,x], levels=c(A,B,C,D,E,G), labels=c(1, rep(c(2:5, 7), each=2)))
# flodel's solution
flodel <- function(df,x) rep(c(1, 2, 3, 4, 5, 7), sapply(list(A, B, C, D, E, G), length))[match(df[,x], unlist(list(A, B, C, D, E, G)))]
# Tyler Rinker's solution
TylerRinker <- function(df,x)  data.frame(vals = unlist(list(A  = c(1002),B  = c(1007, 1008),C  = c(1001, 1003),D  = c(1004, 1006),E  = c(1000, 1005), G = c(1010, 1009))), labs = c(1, rep(c(2:5, 7), each=2)))[match(df[,x], unlist(list(A  = c(1002),B  = c(1007, 1008),C  = c(1001, 1003),D  = c(1004, 1006),E  = c(1000, 1005), G = c(1010, 1009)))), 2] 
# agstudy's solution
agstudy <- function(df,foo) merge(df,data.frame(foo=unlist(list(A, B, C, D, E, G)), val =rep((1:7)[-6],rapply(list(A, B, C, D, E, G), length))))
# Recode from the car package
ReINcar <- function(df,x) Recode(df[,x], "A='A'; B='B'; C='C'; D='D'; E='E'; G='G'")
# install.packages("microbenchmark", dependencies = TRUE)
require(microbenchmark)
# run test
res <- microbenchmark(juba(df, foo), Arun(df, 1), flodel(df, 1), TylerRinker(df,1) ,agstudy(df, foo), ReINcar(df, 1), times = 25)
There were 15 warnings (use warnings() to see them) # warning duo to x's solution
## Print results:
print(res)
the numbers,
   Unit: milliseconds
                   expr        min         lq     median         uq        max neval
          juba(df, foo)  37.944355  39.521603  41.987174  46.385974  79.559750    25
            Arun(df, 1)  23.833334  24.115776  24.648842  26.987431  55.466448    25
          flodel(df, 1)   3.586179   3.637024   3.956814   6.468735  28.404166    25
     TylerRinker(df, 1)   3.919563   4.115994   4.529926   5.532688   8.508956    25
       agstudy(df, foo) 301.487732 324.641734 334.801005 352.753496 415.421212    25
         ReINcar(df, 1)  73.655566  77.903088  81.745037 101.038791 125.158208    25
### Plot results:
boxplot(res)

Here is a general (scalable) approach, very fast too:
sets <- list(A, B, C, D, E, G)
vals <-    c(1, 2, 3, 4, 5, 7)
keys   <- unlist(sets)
values <- rep(vals, sapply(sets, length))
df$foo <- values[match(df$foo, keys)]
                        Using within can help you save some keystrokes :
df <- within(df,
       {foo[foo %in% A] <- 1;
        foo[foo %in% B] <- 2;
        foo[foo %in% C] <- 3;
        foo[foo %in% D] <- 4;
        foo[foo %in% E] <- 5;
        foo[foo %in% G] <- 7})
                        You could also do: (Edited)
> df$foo <- factor(df$foo, levels=c(A,B,C,D,E,G), labels=c(1, rep(c(2:5, 7), each=2)))
# Warning message:
# In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels,  :
#   duplicated levels will not be allowed in factors anymore
#    foo bar
# 1    5   a
# 2    3   b
# 3    1   c
# 4    3   d
# 5    4   e
# 6    5   f
# 7    4   g
# 8    2   h
# 9    2   i
# 10   7   j
# 11   7   k
                        If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With