Following is a sample data frame:
> dput(df)
structure(list(Vehicle.ID = c(21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L,
45L, 45L, 45L, 45L, 45L, 45L, 45L), gap.dist = c(36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L), safept = c("no",
"no", "no", "no", "dx_safe+CC2", "no", "no", "no", "no", "dx_safe",
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "dx_safe",
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "no",
"no", "dx_safe", "no", "no", "no", "no", "no", "no", "no", "no",
"dx_safe", "no")), .Names = c("Vehicle.ID", "gap.dist", "safept"
), row.names = c(NA, -40L), class = "data.frame")
I want to create 2 columns. The first column is safetylower
which should contain the value of gap.dist
by Vehicle.ID
at the FIRST occurrence of "dx_safe"
in safept
column. The second column is safetyupper
which should contain either:
gap.dist
by Vehicle.ID
at the FIRST occurrence of"dx_safe+CC2"
AFTER the FIRST occurrence of dx_safe
(the value"dx_safe+CC2"
after the first occurrence of "dx_safe"
.gap.dist
for a given Vehicle.ID
So, the desired output is something like following:
> dput(df)
structure(list(Vehicle.ID = c(21L, 21L, 21L, 21L, 21L, 21L, 21L,
21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L,
45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L, 45L,
45L, 45L, 45L, 45L, 45L, 45L, 45L), gap.dist = c(36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L,
34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L), safept = c("no",
"no", "no", "no", "dx_safe+CC2", "no", "no", "no", "no", "dx_safe",
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "dx_safe",
"no", "no", "no", "no", "no", "dx_safe+CC2", "no", "no", "no",
"no", "dx_safe", "no", "no", "no", "no", "no", "no", "no", "no",
"dx_safe", "no"), safetylower = c(45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34,
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
34), safetyupper = c(51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 44, 44, 44, 44, 44, 44,
44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44)), .Names = c("Vehicle.ID",
"gap.dist", "safept", "safetylower", "safetyupper"), row.names = c(NA,
-40L), class = "data.frame")
I could only create the first column safetylower
by using match
. The following shows the code I tried that didn't achieve the goal. Please help.
library(plyr)
df <- ddply(df, 'Vehicle.ID', transform,
safetylower = gap.dist[match('dx_safe', safept)],
safetyupper = gap.dist[match('dx_safe+CC2', safept)])
What if there are more than one sets of dx_safe
and dx_safe+CC2
? Consider the following data frame:
df <- data.frame(Vehicle.ID=rep(c(5,6),each= 50),
gap.dist = rep(seq(from=10, to=59), 2),
safept = rep(c(rep('no', 5), 'dx_safe+CC2', rep('no', 4), 'dx_safe', rep('no', 3), 'dx_safe+CC2', rep('no', 5), 'dx_safe', rep('no', 28), 'dx_safe+CC2'), 2))
Building on the same code as provided in both answers (they both work flawlessly), how can I consider only the longer set (the one with longest number of rows in between) and get the gap.dist
values for safetylower
and safetyupper
(by Vehilce.ID
s)? The output should be:
> dput(df)
structure(list(Vehicle.ID = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6), gap.dist = c(10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,
30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L,
56L, 57L, 58L, 59L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L,
45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L,
58L, 59L), safept = structure(c(3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 2L,
3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L), .Label = c("dx_safe",
"dx_safe+CC2", "no"), class = "factor"), safetylower = c(30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30), safetyupper = c(59, 59, 59, 59, 59, 59, 59, 59,
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59)), .Names = c("Vehicle.ID",
"gap.dist", "safept", "safetylower", "safetyupper"), row.names = c(NA,
-100L), class = "data.frame")
I think your idea of using match
is right, but you need some more lines of code. I'll stick with ddply
since that's what you are using already.
ddply(df, .(Vehicle.ID), function(d) {
i <- match("dx_safe", d$safept) # first match of "dx_safe"
j <- i + match("dx_safe+CC2", d$safept[(i+1):nrow(d)])
# first match of "dx_safe+CC2" after first match of "dx_safe"
if(is.na(j)) j <- nrow(d) # if no match, set equal to the last entry
transform(d, safetylower = d$gap.dist[i],
safetyupper = d$gap.dist[j])
})
Note that this could be problematic and need adjustment if there is a chance that "dx_safe" does not appear at all in d$safept
for a certain Vehicle.ID
, but from the wording of your question I take it that it always does.
Also, +1 for the well-structured question :)
EDIT If you have many "pairs" of "dx_safe" and "dx_safe+CC2" and want to compare all the "distances" between them and select the largest one:
ddply(df, .(Vehicle.ID), function(d) {
i <- which(d$safept == "dx_safe") # matches of "dx_safe"
if (!length(i)) # if no matches of "dx_safe"
return(transform(d, safetylower = NA, safetyupper = NA))
j <- which(d$safept == "dx_safe+CC2") # matches of "dx_safe+CC2"
j <- j[j > i[1]] # discard occurences before first "dx_safe"
if (!length(j)) { # if no occurences of "dx_safe+CC2"
lower.index <- i[1]
upper.index <- nrow(d)
} else {
intervals <- findInterval(j, i)
distances <- sapply(j, function(x) x - max(i[i < x]))
max.dist <- max(distances[!duplicated(intervals)])
index <- match(max.dist, distances)
lower.index <- i[index]
upper.index <- j[index]
}
return(transform(d, safetylower = d$gap.dist[lower.index],
safetyupper = d$gap.dist[upper.index]))
})
The reason for testing for non-duplicated intervals in the above is we don't want to allow for the distance occuring between one "dx_safe" and one "dx_safe+CC2" to be selected as the maximum one, if there is another "dx_safe+CC2" between them. Is that right? i.e. if you have a vector c("dx_safe", "no", "no", "dx_safe+CC2", "no", "dx_safe+CC2")
the distance is calculated as 3 and not 5. Let me know if this is not what you had in mind. Please test carefully before using because I don't have the data and can't verify it works as expected for all the edge-cases, but I think it should cover them.
How about a divide and conquer approach with split()
unsplit(lapply(split(df, df$Vehicle.ID), function(x) {
lower <- which(x$safept=="dx_safe")[1]
upper <- Filter(function(x) x>lower, which(x$safept=="dx_safe+CC2"))[1]
if(is.na(upper)) {
upper = nrow(x)
}
cbind(x, safetylower=x$gap.dist[lower], safetyupper=x$gap.dist[upper])
}), df$Vehicle.ID)
Here we basically create a data.frame for each "Vehicle.ID" then I used your definitions to find the appropriate row index for each value of "gap.dist". Finally, I add those values back to the data.frame and then unsplit()
the data to restore the order.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With