I have below mentioned dataframe:
ID Date Status Category
TR-1 2018-01-10 Passed A
TR-2 2018-01-09 Passed B
TR-3 2018-01-09 Failed C
TR-3 2018-01-09 Failed A
TR-4 2018-01-08 Failed B
TR-5 2018-01-08 Passed C
TR-5 2018-01-08 Failed A
TR-6 2018-01-07 Passed A
By utilizing the above given dataframe I want a output format as shown below:
The Date
should be in descending order and the category sequence should be like C, A and B.
Date count distinct_count Passed Failed
2018-01-10 1 1 1 0
A 1 1 1 0
B 0 0 0 0
C 0 0 0 0
2018-01-09 3 2 1 2
A 1 1 1 0
B 1 1 1 0
C 1 1 1 0
To derive the above output, I have tried below code but it couldn't work and not able to get expected output.
Output<-DF %>%
group_by(Date=Date,A,B,C) %>%
summarise(`Count` = n(),
`Distinct_count` = n_distinct(ID),
Passed=sum(Status=='Passed'),
A=count(category='A'),
B=count(category='B'),
C=count(category='C'),
Failed=sum(Status=='Failed'))
Dput:
structure(list(ID = structure(c(1L, 2L, 3L, 3L, 4L, 5L, 5L, 6L
), .Label = c("TR-1", "TR-2", "TR-3", "TR-4", "TR-5", "TR-6"), class = "factor"),
Date = structure(c(4L, 3L, 3L, 3L, 2L, 2L, 2L, 1L), .Label = c("07/01/2018",
"08/01/2018", "09/01/2018", "10/01/2018"), class = "factor"),
Status = structure(c(2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("Failed",
"Passed"), class = "factor"), Category = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 1L), .Label = c("A", "B", "C"), class = "factor")), .Names = c("ID",
"Date", "Status", "Category"), class = "data.frame", row.names = c(NA,
-8L))
That was a tough one:
# I'm converting some variables to factors to get the "order" right and to fill in missing unobserved values later in dcast.
df1$Category <- factor(df1$Category, levels = unique(df1$Category))
date_lvls <- as.Date(df1$Date, "%Y-%m-%d") %>% unique %>% sort(decreasing = TRUE) %>% as.character
df1$Date <- factor(df1$Date, date_lvls)
# lets use data.table
library(data.table)
setDT(df1)
# make a lookup table to deal with the duplicated ID issue. Not sure how to do this elegant
tmp <- dcast.data.table(df1, Date ~ ID, fun.aggregate = length)
tmp <- structure(rowSums(tmp[,-1] == 2), .Names = as.character(unlist(tmp[, 1])))
# precaution! Boilerplate incoming in 3, 2, .. 1
dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[
,`:=`(Failed=+!is.na(Failed), Passed=+!is.na(Passed))][
, c("count","distinct_count") := rowSums(cbind(Failed,Passed))][
, Category := as.character(Category)][
, rbind(
cbind(Category = as.character(Date[1]), count = sum(count), distinct_count = sum(distinct_count) - tmp[as.character(Date[1])], Passed = sum(Passed), Failed = sum(Failed)),
.SD
, fill = TRUE), by = Date][
, Date := NULL ][]
result:
# Category count distinct_count Passed Failed
#1: 2018-01-10 1 1 1 0
#2: A 1 1 1 0
#3: B 0 0 0 0
#4: C 0 0 0 0
#5: 2018-01-09 3 2 1 2
#6: A 1 1 0 1
#7: B 1 1 1 0
#8: C 1 1 0 1
#9: 2018-01-08 3 2 1 2
#10: A 1 1 0 1
#11: B 1 1 0 1
#12: C 1 1 1 0
#13: 2018-01-07 1 1 1 0
#14: A 1 1 1 0
#15: B 0 0 0 0
#16: C 0 0 0 0
data:
df1<-
structure(list(ID = c("TR-1", "TR-2", "TR-3", "TR-3", "TR-4",
"TR-5", "TR-5", "TR-6"), Date = c("2018-01-10", "2018-01-09",
"2018-01-09", "2018-01-09", "2018-01-08", "2018-01-08", "2018-01-08",
"2018-01-07"), Status = c("Passed", "Passed", "Failed", "Failed",
"Failed", "Passed", "Failed", "Passed"), Category = c("A", "B",
"C", "A", "B", "C", "A", "A")), row.names = c(NA, -8L), class = "data.frame")
please note:
please run every line of code one after another. For this you can close every ENDING open bracket and run the line till the end: e.g.
run : dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[]
run : dcast.data.table(df1, Date + Category ~ Status, drop = FALSE)[
,
:=(Failed=+!is.na(Failed), Passed=+!is.na(Passed))][]
... till the end
if then anything is unclear please ask me about this specific thing.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With