Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

R: Calculate moving maximum slope by week accounting for factors

Tags:

r

data.table

I have a data.frame that includes heating degree day (HDD) below.

structure(list(WinterID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L), .Label = c("2002", "2002_2003", "2003", "2003_2004", 
"2004", "2004_2005", "2005", "2005_2006", "2006", "2006_2007", 
"2007", "2007_2008", "2008"), class = "factor"), Date = structure(c(11968, 
11969, 11970, 11971, 11972, 11973, 11974, 11975, 11976, 11977, 
11978, 11979, 11980, 11981, 11982, 11983, 11984, 11985, 11986, 
11987, 11988, 11989, 11990, 11991, 11992, 11993, 11994, 11995, 
11996, 11997, 11998, 11999, 12000, 12001, 12002, 12003, 12004, 
12005, 12006, 12007, 12008, 12009, 12010, 12011, 12012, 12013, 
12014, 12015, 12016, 12017, 12018, 12019, 12020, 12021, 12022, 
12023, 12024, 12025, 12026, 12027, 12028, 12029, 12030, 12031, 
12032, 12033, 12034, 12035, 12036, 12037, 12038, 12039, 12040, 
12041, 12042, 12043, 12044, 12045, 12046, 12047, 12048, 12049, 
12050, 12051, 12052, 12053, 12054, 12055, 12056, 12057, 12058, 
12059, 12060, 12061, 12062, 12063, 12064, 12065, 12066, 12067, 
12068, 12069, 12070, 12071, 12072, 12073, 12074, 12075, 12076, 
12077, 12078, 12079, 12080, 12081, 12082, 12083, 12084, 12085, 
12086, 12087, 12088, 12089, 12090, 12091, 12092, 12093, 12094, 
12095, 12096, 12097, 12098, 12099, 12100, 12101, 12102, 12103, 
12104, 12105, 12106, 12107, 12108, 12109, 12110, 12111, 12112, 
12113, 12114, 12115, 12116, 12117, 12118, 12119, 12120, 12121, 
12122, 12123, 12124, 12125, 12126, 12127, 12128, 12129, 12130, 
12131, 12132, 12133, 12134, 12135, 12136, 12137, 12138, 12139, 
12140, 12141, 12142, 12010, 12011, 12014, 12015, 12017, 12023, 
12024, 12025, 12026, 12027, 12028, 12029, 12030, 12042, 12070, 
12071, 12075, 12076, 12077, 12078, 12079, 12080, 12082, 12083, 
12084), class = "Date"), SiteID = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = "NW_SB", class = "factor"), SubstrateConcat = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("B_A", "B_B", "B_E"), class = "factor"), 
    HDD = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0.246666666666667, 7.12666666666667, 10.6133333333333, 
    2.96666666666667, 0, 0.0933333333333337, 7.31333333333334, 
    10.7133333333333, 6.20000000000001, 2.70666666666667, 6.20000000000001, 
    3.88666666666667, 16.5866666666667, 28.3933333333333, 12.98, 
    21.6133333333333, 19.14, 12.6666666666667, 7.52, 3.33333333333334, 
    18.2933333333333, 4.14666666666667, 2.17333333333334, 26.08, 
    1.38, 7.48000000000001, 36.5733333333333, 53.4666666666667, 
    98.4533333333333, 109.093333333333, 104.14, 80.2466666666667, 
    47.0333333333333, 14.7133333333333, 15.7266666666667, 21.1066666666667, 
    5.07333333333334, 0.613333333333334, 6.18000000000001, 29.5666666666667, 
    45.5333333333333, 59.5666666666667, 91.44, 85.38, 51.1, 25.9666666666667, 
    14.8266666666667, 34.48, 79.16, 90.08, 66.3533333333333, 
    75.14, 97.1733333333333, 83.3066666666667, 50.0133333333333, 
    37.2733333333333, 88.9133333333334, 101.926666666667, 100.56, 
    99.2933333333334, 97.66, 89.6466666666667, 110.613333333333, 
    79.1466666666667, 92.6066666666667, 71.7133333333333, 31.32, 
    27.02, 39.02, 98.14, 62.5866666666667, 46.7933333333333, 
    47.5133333333333, 48.3666666666667, 25.5333333333333, 13.6, 
    17.9133333333333, 14.16, 7.98666666666667, 3.44, 1.86666666666667, 
    12.66, 0, 7.09333333333334, 21.3266666666667, 40.52, 18.8466666666667, 
    37.8466666666667, 33.42, 33.7133333333333, 15.6133333333333, 
    0.720000000000001, 2.31333333333334, 12.3066666666667, 8.48666666666667, 
    2.86, 0, 0, 0, 6.98666666666667, 6.67333333333334, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6.58000000000001, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13.42, 30.5266666666667, 
    1.12, 28.5066666666667, 6.82666666666667, 10.3933333333333, 
    3.18, 11.0466666666667, 0, 0, 0)), .Names = c("WinterID", 
"Date", "SiteID", "SubstrateConcat", "HDD"), row.names = c(NA, 
200L), class = "data.frame")

I'm trying to calculate the moving maximum slope over 7 days beginning on 4 November of each year without using a loop. This moving maximum slope needs to account for WinterID, SiteID, and SubstrateConcat.

For clarification, the calculation I'm trying to obtain is this:

Slope=(max-min)/7, where:
Max= (i-3)+(i-2)+(i-1)+i+(i+1)+(i+2)+(i+3) 
Min= (i-3)

(((i-3)+(i-2)+(i-1)+i+(i+1)+(i+2)+(i+3)) - (i-3))/7

So, using a real example starting with 2002-11-19 as i:

(0+0.24+7.13+10.61+2.97+0+0.97) - 0)/7 = 3.13

I tried using zoo package rollmean, however, I could not figure out how to account for WinterID, SiteID, and SubstrateConcat. This gave me an "order.by" error where my Date values were not unique, since I have dates with different SubstrateConcat and WinterID criteria. As I enter more data into the database, there will eventually be dates with multiple SiteID criteria as well.

I thought maybe xts, TTR and ROC would be what I could use as in this question: Maximum slope for a given interval each day. But again, I don't understand how to specify the multiple group factors, as well as going three days forward and three days back as in align=center with rollmean.

Will someone please point me in the right direction here? Will one of the above functions combined with ddply work?

Thank you!

EDITED to include the answer after the answer supplied by @eddi.

dt <- data.table(df)
dt[, MaxSlope := if(length(HDD)<7) {rep(NA_real_, length(HDD))} else {filter(HDD, c(1,1,1,1,1,1,0)/7)}, by=list(Winter, Site, Substrate)]

This code works perfectly for dates that are continuous. Can anyone recommend how to tweak this code for data that has missing dates? For instance, I have:

   Date  Temp 
 Nov 21  14 
 Nov 23  10 
 Nov 24  12 
 Nov 27  11 
 Nov 28  7 
 Nov 29  9 
 Nov 30  10 
 Dec 01  12 
 Dec 02  8  
 Dec 03  7

I don't want the Max Slope calculated for Nov 21, Nov 23 and Nov 24 because there isn't consecutive data for the calculation. Instead, I want "NA" inserted. Can the existing code above, be modified to accommodate this?

like image 914
BgnR Avatar asked Sep 04 '13 22:09

BgnR


2 Answers

Sounds like you need filter (or you could also use one of the rolling mean/sum functions). And the grouping part is easiest to do with data.table:

library(data.table)
dt = data.table(your_df)

dt[, filter(HDD, c(1,1,1,1,1,1,0))/7,
     by = list(WinterID, SiteID, SubstrateConcat)]
like image 143
eddi Avatar answered Nov 13 '22 00:11

eddi


I couldn't get working solution with ddply, though I didn't spend much time debugging. Here's a solution using base functions (assuming your object is named hdd).

# split your object into groups
shdd <- split(hdd, hdd[,c("WinterID","SiteID","SubstrateConcat")], drop=TRUE)
# create a function to apply to each group
f <- function(d) transform(d, MaxSlopeHDD=rollmax(c(NA,diff(d$HDD)),7,fill=NA))
# apply the function to each group and rbind the results together
shdd <- do.call(rbind, lapply(shdd, f))
like image 42
Joshua Ulrich Avatar answered Nov 13 '22 00:11

Joshua Ulrich