I'm struggling to determine how to produce tabular output as shown below. (I’d love to be able to utilize conditional logic to shade cells as shown in the attached output produced in Excel, but I’d be happy simply understanding how to produce the output without the shading for starters.)
Desired Output (produced in Excel)
Overall Process Employed:
Load required packages.
library(gains)
library(plyr)
library(StatMeasures)
library(sqldf)
library(tables)
full_dataset <- MineThatData
reduced_dataset <- full_dataset[ ,
c("conversion","spend","train","logistic.score")]
reduced_dataset <- rename(reduced_dataset,c("logistic.score"="score"))
reduced_dataset$score <- round(reduced_dataset$score, 8)
summary(reduced_dataset)
trainDF <- reduced_dataset[reduced_dataset$train == 1, ]
testDF <- reduced_dataset[reduced_dataset$train == 0, ]
trainDF$Decile <- decile(trainDF$score, decreasing = TRUE)
summarize_results_by_decile <- function(Input_DF, Output_DF) {
Output_DF <- sqldf("
select
case when train = 1 then 'Train' else 'Test' end as Sample
,Decile
,count(*) as Num_Records
,sum(conversion) as Num_Converters
,sum(spend) as Sum_Spend
,min(score) as Min_Score
,max(score) as Max_Score
,round(avg(conversion),4) as Pct_Response
,round(avg(spend),2) as Spend_per_Record
from Input_DF
group by Decile
order by Decile
")
temp_df <- sqldf("
select
case when train = 1 then 'Train' else 'Test' end as Sample
,11 as Decile
,count(*) as Num_Records
,sum(conversion) as Num_Converters
,sum(Spend) as Sum_Spend
,min(score) as Min_Score
,max(score) as Max_Score
,round(avg(conversion),4) as Pct_Response
,round(avg(spend),2) as Spend_per_Record
from Input_DF
")
Output_DF <- rbind(Output_DF , temp_df)
Output_DF$Decile <- factor(Output_DF$Decile,
labels =c("1","2","3","4","5","6","7","8","9","10","Total"))
Output_DF$Pct_of_Records <- paste(format(round(Output_DF$Num_Records
/ temp_df$Num_Records * 100, 1), nsmall=1), "%", sep="")
Output_DF$Pct_of_Converters <- paste(format(round(Output_DF$Num_Converters
/ temp_df$Num_Converters * 100, 1), nsmall=1), "%", sep="")
Output_DF$Pct_of_Spend <- paste(format(round(Output_DF$Sum_Spend
/ temp_df$Sum_Spend * 100, 1), nsmall=1), '%', sep="")
Output_DF$Num_Records <- format(Output_DF$Num_Records, big.mark = ",")
Output_DF$Num_Converters <- format(Output_DF$Num_Converters,
big.mark = ",")
Output_DF$Sum_Spend <- paste("$" , sep="", format(Output_DF$Sum_Spend,
big.mark = ","))
Output_DF$Pct_Response <- paste(format(round(Output_DF$Pct_Response * 100,
2), nsmall=2), "%", sep="")
Output_DF$Spend_per_Record <- paste("$", sep="",
format(Output_DF$Spend_per_Record, nsmall=2))
return(Output_DF)
}
summary_results_train <- summarize_results_by_decile(trainDF,
summary_results_train)
Min_Decile_Scores <- t(subset(summary_results_train, select = Min_Score))
assign_decile <- function(score_var, decile_var) {
decile_var <- ifelse(score_var >= Min_Decile_Scores[1], 1,
ifelse(score_var >= Min_Decile_Scores[2], 2,
ifelse(score_var >= Min_Decile_Scores[3], 3,
ifelse(score_var >= Min_Decile_Scores[4], 4,
ifelse(score_var >= Min_Decile_Scores[5], 5,
ifelse(score_var >= Min_Decile_Scores[6], 6,
ifelse(score_var >= Min_Decile_Scores[7], 7,
ifelse(score_var >= Min_Decile_Scores[8], 8,
ifelse(score_var >= Min_Decile_Scores[9], 9, 10)))))))))
return(decile_var)
}
Validate the logic used to assign decile assignments on the Training data:
trainDF$Replicate_Decile <- assign_decile(trainDF$score,
trainDF$Replicate_Decile)
table(trainDF$Decile, trainDF$Replicate_Decile)
trainDF$Replicate_Decile <- NULL
testDF$Decile <- assign_decile(testDF$score, testDF$Decile)
summary_results_test <- summarize_results_by_decile(testDF,
summary_results_test)
summary_results <- rbind(summary_results_train, summary_results_test)
summary_results <- subset(summary_results, select = -c(Min_Score,Max_Score))
This is done to re-order columns as they are intended to be displayed:
summary_results <- summary_results[ ,c("Sample", "Decile", "Num_Records",
"Num_Converters", "Sum_Spend", "Pct_of_Records",
"Pct_of_Converters","Pct_of_Spend","Pct_Response",
"Spend_per_Record")]
This is done to affect how the column names appear:
summary_results <- rename(summary_results,
c("Num_Records" = "# Records",
"Num_Converters" = "# Converters",
"Sum_Spend" = "Total Spend",
"Pct_of_Records" = "% of Records",
"Pct_of_Converters" = "% of Converters",
"Pct_of_Spend" = "% of Spend",
"Pct_Response" = "% Conversion",
"Spend_per_Record" = "$ per Record"))
print(summary_results[summary_results$Sample == 'Train', -1], row.names = FALSE)
print(summary_results[summary_results$Sample == 'Test' , -1], row.names = FALSE)
Output I'm Able to Produce in R
Along with this being my first post ever to Stack Overflow, I'm a relatively new R user. I hope my code is understandable! Thanks in advance for any assistance.
Not so nice and consise solution:
library(gains)
library(expss)
full_dataset = MineThatData
reduced_dataset = full_dataset[ ,
c("conversion","spend","train","logistic.score")]
reduced_dataset$score = round(reduced_dataset$logistic.score, 8)
summary_fun = function(data){
calc(data,
list(
"# Records" = NROW(data),
"# Converters" = sum(conversion),
"$ Spend" = sum(spend),
"% Conversion" = round(mean(conversion),4)*100,
"$ per Record" = round(mean(spend),2)
)
)
}
reduced_dataset %>%
compute({
decile_points = quantile(score[train==1],
probs = seq(0,1,by = 0.1)
)
decile_points[length(decile_points)] = Inf
# '11 - ' is needed make reverse order
decile = 11 - as.integer(cut(score, decile_points, include.lowest = TRUE))
rm(decile_points) # we don't need this in our dataset
}) %>%
# "|" to suppress variable label
tab_rows("|" = decile, total(label = "Total")) %>%
tab_cols(total(label = "|")) %>%
tab_cells(sheet(conversion,
spend)) %>%
tab_subgroup(train==1) %>%
tab_stat_fun_df(summary_fun, label = "Train") %>%
tab_subgroup(train==0) %>%
tab_stat_fun_df(summary_fun, label = "Test") %>%
tab_pivot(stat_position = "inside_columns") %>%
# calculate percent
do_repeat(i = perl("Records|Converters|Spend"), {
..[gsub("#|\\$", "% of", .item_value, perl = TRUE)] = i/i[.N]*100
}) %>%
# move some columns to the end
keep(!fixed("% Conversion") & !fixed("$ per Record"), other()) %>%
# formating
do_repeat(i = fixed("#"), {
i = format(i, big.mark = ",")
}) %>%
do_repeat(i = fixed("$"), {
i = paste0("$", format(i, big.mark = ","))
}) %>%
do_repeat(i = fixed("%"), {
i = paste0(format(round(i, 1), nsmall=1), "%")
}) %>%
htmlTable()
This gives the following result:
Disclaimer: I am an author of the 'expss' package.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With