Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Applying group_by and summarise(sum) but keep columns with non-relevant conflicting data?

My question is very similar to "Applying group_by and summarise on data while keeping all the columns' info", but I would like to keep the columns that get dropped because their values conflict after grouping.

# Example data: one row per (Label, Type) sample.
# Two "203c" rows followed by eight "204a" rows.
Label <- rep(c("203c", "204a"), times = c(2L, 8L))
Type <- c(
  "wholefish", "flesh", "flesh", "fleshdelip", "formula",
  "formuladelip", "formula", "formuladelip", "wholefish", "wholefishdelip"
)
Proportion <- c(
  1, 1,
  0.67714, 0.67714,
  0.32285, 0.32285, 0.32285, 0.32285,
  0.67714, 0.67714
)
# N and C are both simply the row index 1..10.
N <- seq_len(10L)
C <- seq_len(10L)
Code <- c("c", "a", "a", "b", "a", "b", "c", "d", "c", "d")

# Assemble the columns into a data frame and print it.
df <- data.frame(
  Label = Label,
  Type = Type,
  Proportion = Proportion,
  N = N,
  C = C,
  Code = Code
)
df

   Label           Type Proportion  N  C Code
1   203c      wholefish     1.0000  1  1    c
2   203c          flesh     1.0000  2  2    a
3   204a          flesh     0.6771  3  3    a
4   204a     fleshdelip     0.6771  4  4    b
5   204a        formula     0.3228  5  5    a
6   204a   formuladelip     0.3228  6  6    b
7   204a        formula     0.3228  7  7    c
8   204a   formuladelip     0.3228  8  8    d
9   204a      wholefish     0.6771  9  9    c
10  204a wholefishdelip     0.6771 10 10    d

# Collapse rows that share the same Label and Code into a single row by
# summing every numeric column (Proportion, N and C).
# Sanity check: each group's summed Proportion should come out at about 1.
total <- df %>%
  group_by(Label, Code) %>%
  summarise_if(.predicate = is.numeric, .funs = sum)

# A tibble: 6 x 5
# Groups:   Label [?]
   Label   Code Proportion     N     C
  <fctr> <fctr>      <dbl> <int> <int>
1   203c      a    1.00000     2     2
2   203c      c    1.00000     1     1
3   204a      a    0.99999     8     8
4   204a      b    0.99999    10    10
5   204a      c    0.99999    16    16
6   204a      d    0.99999    18    18

Up until here I get what I want. Now I would like to include the column Type, even though it is excluded because its values conflict within a group. This is the result I would like to obtain:

# A tibble: 6 x 6
# Groups:   Label [?]
   Label   Code Proportion     N     C    Type
  <fctr> <fctr>      <dbl> <int> <int>  <fctr>
1   203c      a    1.00000     2     2    flesh
2   203c      c    1.00000     1     1    wholefish
3   204a      a    0.99999     8     8    flesh_formula
4   204a      b    0.99999    10    10    fleshdelip_formuladelip
5   204a      c    0.99999    16    16    wholefish_formula
6   204a      d    0.99999    18    18    wholefishdelip_formuladelip

I have tried ungroup() and some variations of mutate and unite, but to no avail. Any suggestions would be greatly appreciated.

like image 981
mckisa Avatar asked Oct 03 '17 21:10

mckisa


2 Answers

Here's a tidyverse solution that keeps your group_by statement the same. The key is to use mutate_if for each variable type first (i.e., numeric, character), then get distinct rows.


library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Loading tidyverse: dplyr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag():    dplyr, stats

# Rebuild the example data from the question, this time as a tibble.
Label <- c("203c", "203c", "204a", "204a", "204a", "204a", "204a", "204a",
  "204a", "204a")
Type <- c("wholefish", "flesh", "flesh", "fleshdelip", "formula", "formuladelip",
  "formula", "formuladelip", "wholefish", "wholefishdelip")
Proportion <- c(1, 1, 0.67714, 0.67714, 0.32285, 0.32285, 0.32285, 0.32285,
  0.67714, 0.67714)
N <- 1:10
C <- 1:10
Code <- c("c", "a", "a", "b", "a", "b", "c", "d", "c", "d")

# tibble() replaces the deprecated data_frame(). The string columns stay
# character, which is what the is.character predicate used further down
# relies on.
df <- tibble(Label, Type, Proportion, N, C, Code)
df
#> # A tibble: 10 x 6
#>    Label           Type Proportion     N     C  Code
#>    <chr>          <chr>      <dbl> <int> <int> <chr>
#>  1  203c      wholefish    1.00000     1     1     c
#>  2  203c          flesh    1.00000     2     2     a
#>  3  204a          flesh    0.67714     3     3     a
#>  4  204a     fleshdelip    0.67714     4     4     b
#>  5  204a        formula    0.32285     5     5     a
#>  6  204a   formuladelip    0.32285     6     6     b
#>  7  204a        formula    0.32285     7     7     c
#>  8  204a   formuladelip    0.32285     8     8     d
#>  9  204a      wholefish    0.67714     9     9     c
#> 10  204a wholefishdelip    0.67714    10    10     d

# Collapse each (Label, Code) group to a single row while keeping every column.
df %>%
  group_by(Label, Code) %>%
  # Overwrite each numeric column with its group total, so that every row of a
  # group carries the same sums.
  mutate_if(is.numeric, sum) %>%
  # Overwrite each remaining character column (here: Type) with the group's
  # unique values joined by "_". A formula lambda is used instead of the
  # deprecated funs() helper.
  mutate_if(is.character, ~ paste(unique(.), collapse = "_")) %>%
  # All rows within a group are now identical; keep one of each.
  distinct()
#> # A tibble: 6 x 6
#> # Groups:   Label, Code [6]
#>   Label                        Type Proportion     N     C  Code
#>   <chr>                       <chr>      <dbl> <int> <int> <chr>
#> 1  203c                   wholefish    1.00000     1     1     c
#> 2  203c                       flesh    1.00000     2     2     a
#> 3  204a               flesh_formula    0.99999     8     8     a
#> 4  204a     fleshdelip_formuladelip    0.99999    10    10     b
#> 5  204a           formula_wholefish    0.99999    16    16     c
#> 6  204a formuladelip_wholefishdelip    0.99999    18    18     d
like image 77
Jake Thompson Avatar answered Sep 19 '22 12:09

Jake Thompson


Here are two other options:

1) Nest columns into one column and then customize the summary by checking the data types:

# Nest the non-grouping columns of each (Label, Code) group into a list-column
# named `data`, reduce every nested table to a single row, then unnest.
df %>%
    group_by(Label, Code) %>%
    nest() %>%
    mutate(data = map(data, function(group_rows) {
        # One value per column: numeric columns are summed, anything else is
        # pasted together with "_". as_tibble() replaces the deprecated
        # as.tibble().
        as_tibble(map(group_rows, function(col) {
            if (is.numeric(col)) sum(col) else paste(col, collapse = "_")
        }))
    })) %>%
    # Name the list-column explicitly; a bare unnest() is deprecated in
    # current tidyr.
    unnest(data)

# A tibble: 6 x 6
#   Label   Code                        Type Proportion     N     C
#  <fctr> <fctr>                       <chr>      <dbl> <int> <int>
#1   203c      c                   wholefish    1.00000     1     1
#2   203c      a                       flesh    1.00000     2     2
#3   204a      a               flesh_formula    0.99999     8     8
#4   204a      b     fleshdelip_formuladelip    0.99999    10    10
#5   204a      c           formula_wholefish    0.99999    16    16
#6   204a      d formuladelip_wholefishdelip    0.99999    18    18

2) Summarize the numeric and the character columns separately and then join the results:

# Per-group totals of every numeric column.
numeric <- df %>%
    group_by(Label, Code) %>%
    summarise_if(.predicate = is.numeric, .funs = sum)

# Per-group "_"-joined values of every character or factor column.
character <- df %>%
    group_by(Label, Code) %>%
    summarise_if(
        .predicate = function(col) is.character(col) || is.factor(col),
        .funs = function(col) paste(col, collapse = "_")
    )

# Recombine the two summaries on the grouping keys.
inner_join(x = numeric, y = character, by = c("Label", "Code"))
# A tibble: 6 x 6
# Groups:   Label [?]
#   Label   Code Proportion     N     C                        Type
#  <fctr> <fctr>      <dbl> <int> <int>                       <chr>
#1   203c      a    1.00000     2     2                       flesh
#2   203c      c    1.00000     1     1                   wholefish
#3   204a      a    0.99999     8     8               flesh_formula
#4   204a      b    0.99999    10    10     fleshdelip_formuladelip
#5   204a      c    0.99999    16    16           formula_wholefish
#6   204a      d    0.99999    18    18 formuladelip_wholefishdelip
like image 31
Psidom Avatar answered Sep 18 '22 12:09

Psidom