Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Nested Tabular Output in R

Tags:

r

I'm struggling to determine how to produce tabular output as shown below. (I’d love to be able to utilize conditional logic to shade cells as shown in the attached output produced in Excel, but I’d be happy simply understanding how to produce the output without the shading for starters.)


Desired Output (produced in Excel) image description


Overall Process Employed:

  1. Use the MineThatData dataset available with the "gains" package.
  2. The full dataset includes four model scores. Just keep the "logistic.score" for illustration.
  3. Assign each record in the Training sample (train = 1) to a decile based on model score (i.e., logistic.score).
  4. Use score ranges from the Training sample to assign records in the Test sample to a decile
  5. Report various statistics (including conversion rate and spend per record) by decile for each sampling group.

Load required packages.

library(gains)
library(plyr)
library(StatMeasures)
library(sqldf)
library(tables)

full_dataset <- MineThatData

reduced_dataset <- full_dataset[ , 
                           c("conversion","spend","train","logistic.score")]

reduced_dataset <- rename(reduced_dataset,c("logistic.score"="score"))

reduced_dataset$score <- round(reduced_dataset$score, 8)

summary(reduced_dataset)

trainDF <- reduced_dataset[reduced_dataset$train == 1, ]
testDF  <- reduced_dataset[reduced_dataset$train == 0, ]

trainDF$Decile <- decile(trainDF$score, decreasing = TRUE)

summarize_results_by_decile <- function(Input_DF, Output_DF) {
  Output_DF <- sqldf("
     select
      case when train = 1 then 'Train' else 'Test' end as Sample
     ,Decile
     ,count(*) as Num_Records
     ,sum(conversion) as Num_Converters
     ,sum(spend) as Sum_Spend
     ,min(score) as Min_Score
     ,max(score) as Max_Score
     ,round(avg(conversion),4) as Pct_Response
     ,round(avg(spend),2) as Spend_per_Record
     from Input_DF
     group by Decile
     order by Decile
     ")

  temp_df <- sqldf("
     select
      case when train = 1 then 'Train' else 'Test' end as Sample
     ,11 as Decile
     ,count(*) as Num_Records
     ,sum(conversion) as Num_Converters
     ,sum(Spend) as Sum_Spend
     ,min(score) as Min_Score
     ,max(score) as Max_Score
     ,round(avg(conversion),4) as Pct_Response
     ,round(avg(spend),2) as Spend_per_Record
     from Input_DF
     ")

  Output_DF <- rbind(Output_DF , temp_df)

  Output_DF$Decile <- factor(Output_DF$Decile, 
                labels =c("1","2","3","4","5","6","7","8","9","10","Total"))

  Output_DF$Pct_of_Records <- paste(format(round(Output_DF$Num_Records    
    / temp_df$Num_Records * 100, 1), nsmall=1), "%", sep="")

  Output_DF$Pct_of_Converters <- paste(format(round(Output_DF$Num_Converters 
    / temp_df$Num_Converters * 100, 1), nsmall=1), "%", sep="")

  Output_DF$Pct_of_Spend <- paste(format(round(Output_DF$Sum_Spend      
    / temp_df$Sum_Spend * 100, 1), nsmall=1), '%', sep="")

  Output_DF$Num_Records <- format(Output_DF$Num_Records, big.mark = ",")

  Output_DF$Num_Converters <- format(Output_DF$Num_Converters, 
                                     big.mark = ",")

  Output_DF$Sum_Spend <- paste("$" , sep="", format(Output_DF$Sum_Spend,      
                                                    big.mark = ","))

  Output_DF$Pct_Response <- paste(format(round(Output_DF$Pct_Response * 100, 
                                        2), nsmall=2), "%", sep="")

  Output_DF$Spend_per_Record <- paste("$", sep="", 
                               format(Output_DF$Spend_per_Record, nsmall=2))

return(Output_DF)
}

summary_results_train <- summarize_results_by_decile(trainDF, 
                                                     summary_results_train)

Min_Decile_Scores <- t(subset(summary_results_train, select = Min_Score))

assign_decile <- function(score_var, decile_var) {
  decile_var <- ifelse(score_var >= Min_Decile_Scores[1], 1,
                ifelse(score_var >= Min_Decile_Scores[2], 2,
                ifelse(score_var >= Min_Decile_Scores[3], 3,
                ifelse(score_var >= Min_Decile_Scores[4], 4,
                ifelse(score_var >= Min_Decile_Scores[5], 5,
                ifelse(score_var >= Min_Decile_Scores[6], 6,
                ifelse(score_var >= Min_Decile_Scores[7], 7,
                ifelse(score_var >= Min_Decile_Scores[8], 8,
                ifelse(score_var >= Min_Decile_Scores[9], 9, 10)))))))))

  return(decile_var)
}

Validate the logic used to assign decile assignments on the Training data:

trainDF$Replicate_Decile <- assign_decile(trainDF$score, 
                                                   trainDF$Replicate_Decile)
table(trainDF$Decile, trainDF$Replicate_Decile)
trainDF$Replicate_Decile <- NULL

testDF$Decile <- assign_decile(testDF$score, testDF$Decile)

summary_results_test <- summarize_results_by_decile(testDF, 
                                                    summary_results_test)

summary_results <- rbind(summary_results_train, summary_results_test)

summary_results <- subset(summary_results, select = -c(Min_Score,Max_Score))

This is done to re-order columns as they are intended to be displayed:

summary_results <- summary_results[ ,c("Sample", "Decile", "Num_Records", 
                         "Num_Converters", "Sum_Spend", "Pct_of_Records", 
                         "Pct_of_Converters","Pct_of_Spend","Pct_Response", 
                         "Spend_per_Record")]

This is done to affect how the column names appear:

summary_results <- rename(summary_results,
   c("Num_Records"       = "# Records",
     "Num_Converters"    = "# Converters",
     "Sum_Spend"         = "Total Spend",
     "Pct_of_Records"    = "% of Records",
     "Pct_of_Converters" = "% of Converters",
     "Pct_of_Spend"      = "% of Spend",
     "Pct_Response"      = "% Conversion",
     "Spend_per_Record"  = "$ per Record"))

print(summary_results[summary_results$Sample == 'Train', -1], row.names = FALSE)
print(summary_results[summary_results$Sample == 'Test' , -1], row.names = FALSE)

Output I'm Able to Produce in R image description


Along with this being my first post ever to Stack Overflow, I'm a relatively new R user. I hope my code is understandable! Thanks in advance for any assistance.

like image 715
Dean Avatar asked Feb 02 '18 00:02

Dean


1 Answers

Not so nice and consise solution:

library(gains)
library(expss)

full_dataset = MineThatData

reduced_dataset = full_dataset[ , 
                                 c("conversion","spend","train","logistic.score")]
reduced_dataset$score = round(reduced_dataset$logistic.score, 8)

summary_fun = function(data){
    calc(data, 
         list(
             "# Records" = NROW(data),
             "# Converters" = sum(conversion), 
             "$ Spend" = sum(spend), 
             "% Conversion" = round(mean(conversion),4)*100, 
             "$ per Record" = round(mean(spend),2)    
         )
    )
}

reduced_dataset %>% 
    compute({
        decile_points =  quantile(score[train==1],
                                  probs = seq(0,1,by = 0.1)
        )
        decile_points[length(decile_points)] = Inf
        # '11 - ' is needed make reverse order
        decile = 11 - as.integer(cut(score, decile_points, include.lowest = TRUE)) 
        rm(decile_points) # we don't need this in our dataset

    }) %>% 
    # "|" to suppress variable label
    tab_rows("|" = decile, total(label = "Total")) %>% 
    tab_cols(total(label = "|")) %>% 
    tab_cells(sheet(conversion, 
                    spend)) %>% 
    tab_subgroup(train==1) %>% 
    tab_stat_fun_df(summary_fun, label = "Train") %>% 
    tab_subgroup(train==0) %>% 
    tab_stat_fun_df(summary_fun, label = "Test") %>% 
    tab_pivot(stat_position = "inside_columns") %>% 
    # calculate  percent
    do_repeat(i = perl("Records|Converters|Spend"), {
        ..[gsub("#|\\$", "% of", .item_value, perl = TRUE)] = i/i[.N]*100
    }) %>%
    # move some columns to the end
    keep(!fixed("% Conversion") & !fixed("$ per Record"), other()) %>% 
    # formating
    do_repeat(i = fixed("#"), {
        i = format(i, big.mark = ",")
    }) %>% 
    do_repeat(i = fixed("$"), {
        i = paste0("$", format(i, big.mark = ","))
    }) %>%
    do_repeat(i = fixed("%"), {
        i = paste0(format(round(i, 1), nsmall=1), "%")
    }) %>% 
    htmlTable()

This gives the following result: table

Disclaimer: I am an author of the 'expss' package.

like image 184
Gregory Demin Avatar answered Oct 04 '22 02:10

Gregory Demin