Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Sum amount last 6 month prior to the date of transaction

This is my transaction data. It shows the transactions made from the accounts in from column to the accounts in to column with the date and the amount information

data 

id          from    to          date        amount  
<int>       <fctr>  <fctr>      <date>      <dbl>
19521       6644    6934        2005-01-01  700.0
19524       6753    8456        2005-01-01  600.0
19523       9242    9333        2005-01-01  1000.0
…           …       …           …           …
1056317     7819    7454        2010-12-31  60.2
1056318     6164    7497        2010-12-31  107.5
1056319     7533    7492        2010-12-31  164.1

I want to calculate how much transaction amount the accounts in from column received in the last 6 month prior to the date that particular transaction was made and want to save this information as a new column.

This following code works very well to accomplish this in a small dataset ,say, with 1000 rows:

library(dplyr)
library(purrr)
data %>% 
  mutate(total_trx_amount_received_in_last_sixmonth= map2_dbl(from, date, 
~sum(amount[to == .x & between(date, .y - 180, .y)])))

However, since my data has over 1 million rows, this code will take more than a couple of hours to complete. I searched the internet if I can speed up the run time of this code. I tried this suggestion on SO about how to make purrr map function run faster. So, I tried the following code and instead of mutate of dplyr I used data.table to speed up the code even faster:

library(future)
library(data.table)
library(furrr)
data[, total_trx_amount_received_in_last_sixmonth:= furrr::future_pmap_dbl(list(from, date), 
~mean(amount[to == .x & between(date, .y-180, .y)])) ]

But, the speed hasn't been improved at all.

Is there any suggestion on how can I make the code run faster?

dput() output of the data:

structure(list(id = c(18529L, 13742L, 9913L, 956L, 2557L, 1602L, 
18669L, 35900L, 48667L, 51341L, 53713L, 60126L, 60545L, 65113L, 
66783L, 83324L, 87614L, 88898L, 89874L, 94765L, 100277L, 101587L, 
103444L, 108414L, 113319L, 121516L, 126607L, 130170L, 131771L, 
135002L, 149431L, 157403L, 157645L, 158831L, 162597L, 162680L, 
163901L, 165044L, 167082L, 168562L, 168940L, 172578L, 173031L, 
173267L, 177507L, 179167L, 182612L, 183499L, 188171L, 189625L, 
193940L, 198764L, 199342L, 200134L, 203328L, 203763L, 204733L, 
205651L, 209672L, 210242L, 210979L, 214532L, 214741L, 215738L, 
216709L, 220828L, 222140L, 222905L, 226133L, 226527L, 227160L, 
228193L, 231782L, 232454L, 233774L, 237836L, 237837L, 238860L, 
240223L, 245032L, 246673L, 247561L, 251611L, 251696L, 252663L, 
254410L, 255126L, 255230L, 258484L, 258485L, 259309L, 259910L, 
260542L, 262091L, 264462L, 264887L, 264888L, 266125L, 268574L, 
272959L), from = c("5370", "5370", "5370", "8605", "5370", "6390", 
"5370", "5370", "8934", "5370", "5635", "6046", "5680", "8026", 
"9037", "5370", "7816", "8046", "5492", "8756", "5370", "9254", 
"5370", "5370", "7078", "6615", "5370", "9817", "8228", "8822", 
"5735", "7058", "5370", "8667", "9315", "6053", "7990", "8247", 
"8165", "5656", "9261", "5929", "8251", "5370", "6725", "5370", 
"6004", "7022", "7442", "5370", "8679", "6491", "7078", "5370", 
"5370", "5370", "5658", "5370", "9296", "8386", "5370", "5370", 
"5370", "9535", "5370", "7541", "5370", "9621", "5370", "7158", 
"8240", "5370", "5370", "8025", "5370", "5370", "5370", "6989", 
"5370", "7059", "5370", "5370", "5370", "9121", "5608", "5370", 
"5370", "7551", "5370", "5370", "5370", "5370", "9163", "9362", 
"6072", "5370", "5370", "5370", "5370", "5370"), to = c("9356", 
"5605", "8567", "5370", "5636", "5370", "8933", "8483", "5370", 
"7626", "5370", "5370", "5370", "5370", "5370", "9676", "5370", 
"5370", "5370", "5370", "9105", "5370", "9772", "6979", "5370", 
"5370", "7564", "5370", "5370", "5370", "5370", "5370", "8744", 
"5370", "5370", "5370", "5370", "5370", "5370", "5370", "5370", 
"5370", "5370", "7318", "5370", "8433", "5370", "5370", "5370", 
"7122", "5370", "5370", "5370", "8566", "6728", "9689", "5370", 
"8342", "5370", "5370", "5614", "5596", "5953", "5370", "7336", 
"5370", "7247", "5370", "7291", "5370", "5370", "6282", "7236", 
"5370", "8866", "8613", "9247", "5370", "6767", "5370", "9273", 
"7320", "9533", "5370", "5370", "8930", "9343", "5370", "9499", 
"7693", "7830", "5392", "5370", "5370", "5370", "7497", "8516", 
"9023", "7310", "8939"), date = structure(c(12934, 13000, 13038, 
13061, 13099, 13113, 13117, 13179, 13238, 13249, 13268, 13296, 
13299, 13309, 13314, 13391, 13400, 13404, 13409, 13428, 13452, 
13452, 13460, 13482, 13493, 13518, 13526, 13537, 13542, 13544, 
13596, 13616, 13617, 13626, 13633, 13633, 13639, 13642, 13646, 
13656, 13660, 13664, 13667, 13669, 13677, 13686, 13694, 13694, 
13707, 13716, 13725, 13738, 13739, 13746, 13756, 13756, 13756, 
13761, 13769, 13770, 13776, 13786, 13786, 13786, 13791, 13799, 
13806, 13813, 13817, 13817, 13817, 13822, 13829, 13830, 13836, 
13847, 13847, 13847, 13852, 13860, 13866, 13871, 13878, 13878, 
13878, 13882, 13883, 13883, 13887, 13887, 13888, 13889, 13890, 
13891, 13895, 13896, 13896, 13899, 13905, 13909), class = "Date"), 
    amount = c(24.4, 7618, 21971, 5245, 2921, 8000, 169.2, 71.5, 
    14.6, 4214, 14.6, 13920, 14.6, 24640, 1600, 261.1, 16400, 
    3500, 2700, 19882, 182, 14.6, 16927, 25653, 3059, 2880, 9658, 
    4500, 12480, 14.6, 1000, 3679, 34430, 12600, 14.6, 19.2, 
    4900, 826, 3679, 2100, 38000, 79, 11400, 21495, 3679, 200, 
    14.6, 100.6, 3679, 5300, 108.9, 3679, 2696, 7500, 171.6, 
    14.6, 99.2, 2452, 3679, 3218, 700, 69.7, 14.6, 91.5, 2452, 
    3679, 2900, 17572, 14.6, 14.6, 90.5, 2452, 49752, 3679, 1900, 
    14.6, 870, 85.2, 2452, 3679, 1600, 540, 14.6, 14.6, 79, 210, 
    2452, 28400, 720, 180, 420, 44289, 489, 3679, 840, 2900, 
    150, 870, 420, 14.6)), row.names = c(NA, -100L), class = "data.frame")

like image 571
rlock Avatar asked Sep 08 '20 13:09

rlock


2 Answers

This is simply a non-equi join in data.table. You can create a variable of date - 180 and limit the join between the current date and that variable. This should be fairly quick

library(data.table)
setDT(dt)[, date_minus_180 := date - 180]
dt[, amnt_6_m := .SD[dt, sum(amount, na.rm = TRUE), 
     on = .(to = from, date <= date, date >= date_minus_180), by = .EACHI]$V1]
head(dt, 10)
#        id from   to       date  amount date_minus_180 amnt_6_m
#  1: 18529 5370 9356 2005-05-31    24.4     2004-12-02      0.0
#  2: 13742 5370 5605 2005-08-05  7618.0     2005-02-06      0.0
#  3:  9913 5370 8567 2005-09-12 21971.0     2005-03-16      0.0
#  4:   956 8605 5370 2005-10-05  5245.0     2005-04-08      0.0
#  5:  2557 5370 5636 2005-11-12  2921.0     2005-05-16   5245.0
#  6:  1602 6390 5370 2005-11-26  8000.0     2005-05-30      0.0
#  7: 18669 5370 8933 2005-11-30   169.2     2005-06-03  13245.0
#  8: 35900 5370 8483 2006-01-31    71.5     2005-08-04  13245.0
#  9: 48667 8934 5370 2006-03-31    14.6     2005-10-02      0.0
# 10: 51341 5370 7626 2006-04-11  4214.0     2005-10-13   8014.6
like image 165
David Arenburg Avatar answered Nov 12 '22 16:11

David Arenburg


Here is one option using data.table:

library(data.table)
setDT(df)
setkey(df, to, date)

# Unique combination of from and date
af <- df[, unique(.SD), .SDcols = c("from", "date")]

# For each combination check sum of incoming in the last 6 months
for (i in 1:nrow(af)) {
  set(
    af, i = i, j = "am6m", 
    value = df[(date) %between% (af$date[[i]] - c(180, 0)) & to == af$from[[i]], sum(amount)]
  )
}
# Join the results into the main data.frame
df[, am6m := af[.SD, on = .(from, date), am6m]]



> tail(df)
#        id from   to       date  amount    am6m
# 1:  18529 5370 9356 2005-05-31    24.4     0.0
# 2: 258484 5370 9499 2008-01-09   720.0 74543.5
# 3: 251611 5370 9533 2007-12-31    14.6 46143.5
# 4:  83324 5370 9676 2006-08-31   261.1 40203.8
# 5: 203763 5370 9689 2007-08-31    14.6 92353.1
# 6: 103444 5370 9772 2006-11-08 16927.0 82671.2
like image 3
sindri_baldur Avatar answered Nov 12 '22 15:11

sindri_baldur