This is my transaction data. It shows the transactions made from the accounts in from
column to the accounts in to
column with the date and the amount information
data
id from to date amount
<int> <fctr> <fctr> <date> <dbl>
19521 6644 6934 2005-01-01 700.0
19524 6753 8456 2005-01-01 600.0
19523 9242 9333 2005-01-01 1000.0
… … … … …
1056317 7819 7454 2010-12-31 60.2
1056318 6164 7497 2010-12-31 107.5
1056319 7533 7492 2010-12-31 164.1
I want to calculate how much transaction amount the accounts in from
column received in the last 6 month prior to the date that particular transaction was made and want to save this information as a new column.
This following code works very well to accomplish this in a small dataset ,say, with 1000 rows:
library(dplyr)
library(purrr)
data %>%
mutate(total_trx_amount_received_in_last_sixmonth= map2_dbl(from, date,
~sum(amount[to == .x & between(date, .y - 180, .y)])))
However, since my data has over 1 million rows, this code will take more than a couple of hours to complete.
I searched the internet if I can speed up the run time of this code. I tried this suggestion on SO about how to make purrr
map
function run faster. So, I tried the following code and instead of mutate
of dplyr
I used data.table
to speed up the code even faster:
library(future)
library(data.table)
library(furrr)
data[, total_trx_amount_received_in_last_sixmonth:= furrr::future_pmap_dbl(list(from, date),
~mean(amount[to == .x & between(date, .y-180, .y)])) ]
But, the speed hasn't been improved at all.
Is there any suggestion on how can I make the code run faster?
dput() output of the data:
structure(list(id = c(18529L, 13742L, 9913L, 956L, 2557L, 1602L,
18669L, 35900L, 48667L, 51341L, 53713L, 60126L, 60545L, 65113L,
66783L, 83324L, 87614L, 88898L, 89874L, 94765L, 100277L, 101587L,
103444L, 108414L, 113319L, 121516L, 126607L, 130170L, 131771L,
135002L, 149431L, 157403L, 157645L, 158831L, 162597L, 162680L,
163901L, 165044L, 167082L, 168562L, 168940L, 172578L, 173031L,
173267L, 177507L, 179167L, 182612L, 183499L, 188171L, 189625L,
193940L, 198764L, 199342L, 200134L, 203328L, 203763L, 204733L,
205651L, 209672L, 210242L, 210979L, 214532L, 214741L, 215738L,
216709L, 220828L, 222140L, 222905L, 226133L, 226527L, 227160L,
228193L, 231782L, 232454L, 233774L, 237836L, 237837L, 238860L,
240223L, 245032L, 246673L, 247561L, 251611L, 251696L, 252663L,
254410L, 255126L, 255230L, 258484L, 258485L, 259309L, 259910L,
260542L, 262091L, 264462L, 264887L, 264888L, 266125L, 268574L,
272959L), from = c("5370", "5370", "5370", "8605", "5370", "6390",
"5370", "5370", "8934", "5370", "5635", "6046", "5680", "8026",
"9037", "5370", "7816", "8046", "5492", "8756", "5370", "9254",
"5370", "5370", "7078", "6615", "5370", "9817", "8228", "8822",
"5735", "7058", "5370", "8667", "9315", "6053", "7990", "8247",
"8165", "5656", "9261", "5929", "8251", "5370", "6725", "5370",
"6004", "7022", "7442", "5370", "8679", "6491", "7078", "5370",
"5370", "5370", "5658", "5370", "9296", "8386", "5370", "5370",
"5370", "9535", "5370", "7541", "5370", "9621", "5370", "7158",
"8240", "5370", "5370", "8025", "5370", "5370", "5370", "6989",
"5370", "7059", "5370", "5370", "5370", "9121", "5608", "5370",
"5370", "7551", "5370", "5370", "5370", "5370", "9163", "9362",
"6072", "5370", "5370", "5370", "5370", "5370"), to = c("9356",
"5605", "8567", "5370", "5636", "5370", "8933", "8483", "5370",
"7626", "5370", "5370", "5370", "5370", "5370", "9676", "5370",
"5370", "5370", "5370", "9105", "5370", "9772", "6979", "5370",
"5370", "7564", "5370", "5370", "5370", "5370", "5370", "8744",
"5370", "5370", "5370", "5370", "5370", "5370", "5370", "5370",
"5370", "5370", "7318", "5370", "8433", "5370", "5370", "5370",
"7122", "5370", "5370", "5370", "8566", "6728", "9689", "5370",
"8342", "5370", "5370", "5614", "5596", "5953", "5370", "7336",
"5370", "7247", "5370", "7291", "5370", "5370", "6282", "7236",
"5370", "8866", "8613", "9247", "5370", "6767", "5370", "9273",
"7320", "9533", "5370", "5370", "8930", "9343", "5370", "9499",
"7693", "7830", "5392", "5370", "5370", "5370", "7497", "8516",
"9023", "7310", "8939"), date = structure(c(12934, 13000, 13038,
13061, 13099, 13113, 13117, 13179, 13238, 13249, 13268, 13296,
13299, 13309, 13314, 13391, 13400, 13404, 13409, 13428, 13452,
13452, 13460, 13482, 13493, 13518, 13526, 13537, 13542, 13544,
13596, 13616, 13617, 13626, 13633, 13633, 13639, 13642, 13646,
13656, 13660, 13664, 13667, 13669, 13677, 13686, 13694, 13694,
13707, 13716, 13725, 13738, 13739, 13746, 13756, 13756, 13756,
13761, 13769, 13770, 13776, 13786, 13786, 13786, 13791, 13799,
13806, 13813, 13817, 13817, 13817, 13822, 13829, 13830, 13836,
13847, 13847, 13847, 13852, 13860, 13866, 13871, 13878, 13878,
13878, 13882, 13883, 13883, 13887, 13887, 13888, 13889, 13890,
13891, 13895, 13896, 13896, 13899, 13905, 13909), class = "Date"),
amount = c(24.4, 7618, 21971, 5245, 2921, 8000, 169.2, 71.5,
14.6, 4214, 14.6, 13920, 14.6, 24640, 1600, 261.1, 16400,
3500, 2700, 19882, 182, 14.6, 16927, 25653, 3059, 2880, 9658,
4500, 12480, 14.6, 1000, 3679, 34430, 12600, 14.6, 19.2,
4900, 826, 3679, 2100, 38000, 79, 11400, 21495, 3679, 200,
14.6, 100.6, 3679, 5300, 108.9, 3679, 2696, 7500, 171.6,
14.6, 99.2, 2452, 3679, 3218, 700, 69.7, 14.6, 91.5, 2452,
3679, 2900, 17572, 14.6, 14.6, 90.5, 2452, 49752, 3679, 1900,
14.6, 870, 85.2, 2452, 3679, 1600, 540, 14.6, 14.6, 79, 210,
2452, 28400, 720, 180, 420, 44289, 489, 3679, 840, 2900,
150, 870, 420, 14.6)), row.names = c(NA, -100L), class = "data.frame")
This is simply a non-equi join in data.table. You can create a variable of date - 180
and limit the join between the current date and that variable. This should be fairly quick
library(data.table)
setDT(dt)[, date_minus_180 := date - 180]
dt[, amnt_6_m := .SD[dt, sum(amount, na.rm = TRUE),
on = .(to = from, date <= date, date >= date_minus_180), by = .EACHI]$V1]
head(dt, 10)
# id from to date amount date_minus_180 amnt_6_m
# 1: 18529 5370 9356 2005-05-31 24.4 2004-12-02 0.0
# 2: 13742 5370 5605 2005-08-05 7618.0 2005-02-06 0.0
# 3: 9913 5370 8567 2005-09-12 21971.0 2005-03-16 0.0
# 4: 956 8605 5370 2005-10-05 5245.0 2005-04-08 0.0
# 5: 2557 5370 5636 2005-11-12 2921.0 2005-05-16 5245.0
# 6: 1602 6390 5370 2005-11-26 8000.0 2005-05-30 0.0
# 7: 18669 5370 8933 2005-11-30 169.2 2005-06-03 13245.0
# 8: 35900 5370 8483 2006-01-31 71.5 2005-08-04 13245.0
# 9: 48667 8934 5370 2006-03-31 14.6 2005-10-02 0.0
# 10: 51341 5370 7626 2006-04-11 4214.0 2005-10-13 8014.6
Here is one option using data.table
:
library(data.table)
setDT(df)
setkey(df, to, date)
# Unique combination of from and date
af <- df[, unique(.SD), .SDcols = c("from", "date")]
# For each combination check sum of incoming in the last 6 months
for (i in 1:nrow(af)) {
set(
af, i = i, j = "am6m",
value = df[(date) %between% (af$date[[i]] - c(180, 0)) & to == af$from[[i]], sum(amount)]
)
}
# Join the results into the main data.frame
df[, am6m := af[.SD, on = .(from, date), am6m]]
> tail(df)
# id from to date amount am6m
# 1: 18529 5370 9356 2005-05-31 24.4 0.0
# 2: 258484 5370 9499 2008-01-09 720.0 74543.5
# 3: 251611 5370 9533 2007-12-31 14.6 46143.5
# 4: 83324 5370 9676 2006-08-31 261.1 40203.8
# 5: 203763 5370 9689 2007-08-31 14.6 92353.1
# 6: 103444 5370 9772 2006-11-08 16927.0 82671.2
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With