I would like to use the R dplyr package to calculate the following interval related questions without using loops:
The interval endpoints are from the columns df_abs$interval and df_rel$interval . e.g.
The data frames with the data and intervals look like this:
library(dplyr)
# ----------{ data and interval ----------
df_data <- data.frame(varA = NA,
varB = NA,
varC = c(-81.0, -14.3, 29.6, 42.7, 46.4, 57.7, 15.3, 256.3, 20.3, -25.1, -23.1, -17.5))
df_abs <- data.frame(interval = c(-Inf, -60, -30, 0, 30, 60, 100, 200, Inf),
count = NA,
sum = NA)
df_rel <- data.frame(interval = c(0,5,15,50,75,95,100),
count = NA,
sum = NA)
# ---------- data and interval }----------
# ----------{ calculation ----------
# absolute data frame
for (i in 1 : nrow(df_abs)-1) {
# count observation between interval
df_abs$count[i+1] <- summarise(df_data, sum(df_abs$interval[i] < varC & varC <= df_abs$interval[i+1]))
# sum between interval
df_abs$sum[i+1] <- sum(df_data$varC[df_abs$interval[i] < df_data$varC & df_data$varC <= df_abs$interval[i+1]])
}
# relative data frame
df_data_arranged <- df_data %>%
arrange(varC) %>%
mutate(observationPercent = c(1:nrow(df_data)) * 100/length(df_data$varC))
for (i in 1 : nrow(df_rel)-1) {
# count observation between interval
df_rel$count[i+1] <- summarise(df_data_arranged, sum(df_rel$interval[i] < observationPercent & observationPercent <= df_rel$interval[i+1]))
# sum between interval
df_rel$sum[i+1] <- sum(df_data_arranged$varC[df_rel$interval[i] < df_data_arranged$observationPercent & df_data_arranged$observationPercent <= df_rel$interval[i+1]])
}
# ---------- calculation }----------
The answer should look like this:
df_abs <- data.frame(interval = c(-Inf, -60, -30, 0, 30, 60, 100, 200, Inf),
count = c(0,1,0,4,3,3,0,0,1),
sum = c(0,-81,0,-80,65.2,146.8,0,0,256.3))
df_rel <- data.frame(interval = c(0,5,15,50,75,95,100),
count = c(0,0,1,4,3,2,1),
sum = c(0,0,-81,-39.6,92.6,104.1,256.3))
As far as I understand the dplyr package, there should be a rather short and straight forward solution for each of the two problems without having to use loops at all.
This can be done as follows:
create a new column (mutate
) to identify which observation belongs to which interval (through base::cut
)
group your observations by intervals (group_by
)
apply your operations on the result (summarise
with dplyr
's n()
and a common sum
here)
As follows:
df_abs <- mutate(df_data, interval = cut(varC, df_abs$interval)) %>%
group_by(interval) %>%
summarise(count=n(), sum=sum(varC))
# interval count sum
#1 (-Inf,-60] 1 -81.0
#2 (-30,0] 4 -80.0
#3 (0,30] 3 65.2
#4 (30,60] 3 146.8
#5 (200, Inf] 1 256.3
df_rel <- mutate(df_data_arranged,
interval = cut(observationPercent, df_rel$interval)) %>%
group_by(interval) %>%
summarise(count=n(), sum=sum(varC))
# interval count sum
#1 (5,15] 1 -81.0
#2 (15,50] 5 -64.7
#3 (50,75] 3 92.6
#4 (75,95] 2 104.1
#5 (95,100] 1 256.3
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With