Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

R Replace values based on conditions (for same ID) without using for-loop

I have a df similar to this one but much bigger (100.000 rows x 100 columns)

df <-data.frame(id=c("1","2","2","3","4","4", "4", "4", "4", "4", "5"), date = c("2015-01-15", "2004-03-01", "2017-03-15", "2000-01-15", "2006-05-08", "2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11", "2015-12-19"), A =c (0,1,1,0,1,1,0,0,1,1,1), B=c(1,0,1,0,1,0,0,0,1,1,1), C = c(0,1,0,0,0,1,1,1,1,1,0), D = c(0,0,0,1,1,1,1,0,1,0,1), E = c(1,1,1,0,0,0,0,0,1,1,1), A.1 = c(0,0,0,0,0,0,0,0,0,0,0), B.1 = c(0,0,0,0,0,0,0,0,0,0,0), C.1 = c(0,0,0,0,0,0,0,0,0,0,0), D.1 = c(0,0,0,0,0,0,0,0,0,0,0), E.1 = c(0,0,0,0,0,0,0,0,0,0,0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0))

What I have to accomplish is this:

structure(list(id = structure(c(1L, 2L, 2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,5L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), date = structure(c(9L, 2L, 11L, 1L, 3L, 4L, 5L, 6L, 7L, 8L,10L), .Label = c("2000-01-15", "2004-03-01", "2006-05-08","2008-05-09", "2014-05-11", "2014-06-11", "2014-07-11", "2014-08-11","2015-01-15", "2015-12-19", "2017-03-15"), class = "factor"), A = c(0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1), B = c(1, 0, 1, 0,1, 0, 0, 0, 1, 1, 1), C = c(0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0), D = c(0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1), E = c(1, 1, 1,0, 0, 0, 0, 0, 1, 1, 1), A.1 = c(0, 0, 4762, 0, 0, 732, 2925,0, 0, 3017, 0), B.1 = c(0, 0, 0, 0, 0, 732, 0, 0, 0, 3017,0), C.1 = c(0, 0, 4762, 0, 0, 0, 2925, 2956, 2986, 3017,
0), D.1 = c(0, 0, 0, 0, 0, 732, 2925, 2956, 0, 3017, 0),E.1 = c(0, 0, 4762, 0, 0, 0, 0, 0, 0, 3017, 0), acumulativediff = c(0, 0, 4762, 0, 0, 732, 2925, 2956, 2986, 3017, 0)), .Names = c("id","date", "A", "B", "C", "D", "E", "A.1", "B.1", "C.1", "D.1", "E.1", "acumulativediff"), row.names = c(NA,-11L), class = "data.frame") 

The idea is to replace 0's from A.1, B.1, C.1 columns with the values of 'acumulativediff' column, based on two conditions:

df[i,1]  == df[i-1,1] & df[i,names] == "1" & df[i-1,names] == "1", df[i,diff]
df[i,1]  == df[i-1,1] & df[i,names] == "0" & df[i-1,names] == "1", df[i,diff]

I was able to do it, using a non-efficient loop-for which seems to work on small df but not with bigger ones (it takes almost two hours)

names <- colnames(df[3:7])
names2 <- colnames(df[8:12])
diff <- which(colnames(df)=="acumulativediff")
for (i in 2:nrow(df)){
df[i,names2] <- ifelse (df[i,1]  == df[i-1,1] & df[i,names] == "1" & 
df[i-1,names] == "1", df[i,diff],
      ifelse (df[i,1]  == df[i-1,1] & df[i,names] == "0" & df[i-1,names] == "1", df[i,diff], 0))}

Any idea or advice to omit the loop to achieve a more efficient code?

like image 493
torakxkz Avatar asked Feb 24 '26 07:02

torakxkz


1 Answers

I'll suggest to ignore A.1, B.1 etc columns. Just re-create those columns using dplyr::mutate_at and the rules specified by OP. The dplyr::lag with default = 0 will help to avoid NA in result.

library(dplyr)

df %>% select(-ends_with(".1")) %>%
  mutate_at(vars(A:E), 
       funs(l = ifelse(lag(id)==id & lag(., default=0) == "1",acumulativediff,0)))


#    id       date A B C D E acumulativediff  A_l  B_l  C_l  D_l  E_l
# 1   1 2015-01-15 0 1 0 0 1               0    0    0    0    0    0
# 2   2 2004-03-01 1 0 1 0 1               0    0    0    0    0    0
# 3   2 2017-03-15 1 1 0 0 1            4762 4762    0 4762    0 4762
# 4   3 2000-01-15 0 0 0 1 0               0    0    0    0    0    0
# 5   4 2006-05-08 1 1 0 1 0               0    0    0    0    0    0
# 6   4 2008-05-09 1 0 1 1 0             732  732  732    0  732    0
# 7   4 2014-05-11 0 0 1 1 0            2925 2925    0 2925 2925    0
# 8   4 2014-06-11 0 0 1 0 0            2956    0    0 2956 2956    0
# 9   4 2014-07-11 1 1 1 1 1            2986    0    0 2986    0    0
# 10  4 2014-08-11 1 1 1 0 1            3017 3017 3017 3017 3017 3017
# 11  5 2015-12-19 1 1 0 1 1               0    0    0    0    0    0
like image 52
MKR Avatar answered Feb 27 '26 01:02

MKR