I am a recoding a variable based on some rather long strings, here exemplified by the strings A, B, C, D, E, and G. I was wondering if there is a way to recode this with out having to repeat the reference to df$foo
12 times using base R? Maybe there is some some smarter faster way routhe I could explore? is this is really the smartest way to do it in R?
df <- data.frame(
foo = 1000:1010,
bar = letters[1:11])
df
foo bar
1 1000 a
2 1001 b
3 1002 c
4 1003 d
5 1004 e
6 1005 f
7 1006 g
8 1007 h
9 1008 i
10 1009 j
11 1010 k
A <- c(1002)
B <- c(1007, 1008)
C <- c(1001, 1003)
D <- c(1004, 1006)
E <- c(1000, 1005)
G <- c(1010, 1009)
df$foo[df$foo %in% A] <- 1
df$foo[df$foo %in% B] <- 2
df$foo[df$foo %in% C] <- 3
df$foo[df$foo %in% D] <- 4
df$foo[df$foo %in% E] <- 5
df$foo[df$foo %in% G] <- 7
df
foo bar
1 5 a
2 3 b
3 1 c
4 3 d
5 4 e
6 5 f
7 4 g
8 2 h
9 2 i
10 7 j
11 7 k
I've rewritten the five solutions to functions to be able to compare them using the microbenchmark package and the result is that Tyler Rinker and flodel's solutions is the fastest solutions (see results below), not to say this question is about speed alone. I am also looking for conciseness and smartness in the solution. Out of curiosity I also added a solution using the Recode
function from the car package. Please feel free to let me know if I could have rewritten the solutions in a more optimal way or if the the microbenchmark package is not the best way to compare these functions.
df <- data.frame(
foo = sample(1000:1010, 1e5+22, replace = TRUE),
bar = rep(letters, 3847))
str(df)
A <- c(1002)
B <- c(1007, 1008)
C <- c(1001, 1003)
D <- c(1004, 1006)
E <- c(1000, 1005)
G <- c(1010, 1009)
# juba's solution
juba <- function(df,foo) within(df, {foo[foo %in% A] <- 1; foo[foo %in% B] <- 2;foo[foo %in% C] <- 3;foo[foo %in% D] <- 4;foo[foo %in% E] <- 5;foo[foo %in% G] <- 7})
# Arun's solution
Arun <- function(df,x) factor(df[,x], levels=c(A,B,C,D,E,G), labels=c(1, rep(c(2:5, 7), each=2)))
# flodel's solution
flodel <- function(df,x) rep(c(1, 2, 3, 4, 5, 7), sapply(list(A, B, C, D, E, G), length))[match(df[,x], unlist(list(A, B, C, D, E, G)))]
# Tyler Rinker's solution
TylerRinker <- function(df,x) data.frame(vals = unlist(list(A = c(1002),B = c(1007, 1008),C = c(1001, 1003),D = c(1004, 1006),E = c(1000, 1005), G = c(1010, 1009))), labs = c(1, rep(c(2:5, 7), each=2)))[match(df[,x], unlist(list(A = c(1002),B = c(1007, 1008),C = c(1001, 1003),D = c(1004, 1006),E = c(1000, 1005), G = c(1010, 1009)))), 2]
# agstudy's solution
agstudy <- function(df,foo) merge(df,data.frame(foo=unlist(list(A, B, C, D, E, G)), val =rep((1:7)[-6],rapply(list(A, B, C, D, E, G), length))))
# Recode from the car package
ReINcar <- function(df,x) Recode(df[,x], "A='A'; B='B'; C='C'; D='D'; E='E'; G='G'")
# install.packages("microbenchmark", dependencies = TRUE)
require(microbenchmark)
# run test
res <- microbenchmark(juba(df, foo), Arun(df, 1), flodel(df, 1), TylerRinker(df,1) ,agstudy(df, foo), ReINcar(df, 1), times = 25)
There were 15 warnings (use warnings() to see them) # warning duo to x's solution
## Print results:
print(res)
the numbers,
Unit: milliseconds
expr min lq median uq max neval
juba(df, foo) 37.944355 39.521603 41.987174 46.385974 79.559750 25
Arun(df, 1) 23.833334 24.115776 24.648842 26.987431 55.466448 25
flodel(df, 1) 3.586179 3.637024 3.956814 6.468735 28.404166 25
TylerRinker(df, 1) 3.919563 4.115994 4.529926 5.532688 8.508956 25
agstudy(df, foo) 301.487732 324.641734 334.801005 352.753496 415.421212 25
ReINcar(df, 1) 73.655566 77.903088 81.745037 101.038791 125.158208 25
### Plot results:
boxplot(res)
Here is a general (scalable) approach, very fast too:
sets <- list(A, B, C, D, E, G)
vals <- c(1, 2, 3, 4, 5, 7)
keys <- unlist(sets)
values <- rep(vals, sapply(sets, length))
df$foo <- values[match(df$foo, keys)]
Using within
can help you save some keystrokes :
df <- within(df,
{foo[foo %in% A] <- 1;
foo[foo %in% B] <- 2;
foo[foo %in% C] <- 3;
foo[foo %in% D] <- 4;
foo[foo %in% E] <- 5;
foo[foo %in% G] <- 7})
You could also do: (Edited)
> df$foo <- factor(df$foo, levels=c(A,B,C,D,E,G), labels=c(1, rep(c(2:5, 7), each=2)))
# Warning message:
# In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels, :
# duplicated levels will not be allowed in factors anymore
# foo bar
# 1 5 a
# 2 3 b
# 3 1 c
# 4 3 d
# 5 4 e
# 6 5 f
# 7 4 g
# 8 2 h
# 9 2 i
# 10 7 j
# 11 7 k
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With