Add a column based on all other columns in DB

Tags:

r

dplyr

I have a data frame with one index column and a huge number of other columns for that index. I want to add a column showing whether a given value appears in any of the other columns for that index.

For example:

# Load the tidyverse for %>%, the dplyr verbs and tidyr::pivot_wider(),
# plus the package that provides the `flights` data.
library(tidyverse)
library(nycflights13)

# Build one row per tail number, with that plane's flight numbers spread
# across columns `1`, `2`, ... in row order.
flights %>%
  select(tailnum, flight) %>%
  filter(!is.na(tailnum)) %>%
  arrange(tailnum) %>%
  group_by(tailnum) %>%
  # Number each plane's flights; these ids become the wide column names.
  mutate(id = row_number()) %>%
  # The result is still grouped by tailnum.
  pivot_wider(names_from = "id", values_from = "flight")

# A tibble: 4,043 x 576
# Groups:   tailnum [4,043]
   tailnum   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  `11`  `12`  `13`  `14`  `15`  `16`
   <chr>   <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1 D942DN   2247  1685  1959   781    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
 2 N0EGMQ   4579  4584  4610  4662  4661  4610  4610  4584  4610  4669  4584  4610  4669  4584  4610  4589
 3 N10156   4560  4269  4667  4334  4298  4520  4297  4370  4352  4695  4214  4085  4370  4204  4641  4092
 4 N102UW   1125  1830  2095  2069  2069  2071  1805  1767  1767   720   720  1103  1619  1431  1435  1288
 5 N103US   1575  1427   975  1125  2095  2088  2053  2095  2095  2069   802  2017  2017  1691  1431  1435
 6 N104UW   1973  1125  2053  1767  1972  1767  1767  1963  2017  1767   975  1443  1103  1987  1081  1288
 7 N10575   4617  4352  4434  4250  4537  4572  4106  3841  4305  3817  3819  4120  4202  4393  4316  4276
 8 N105UW   2095  2095  1767  2069  1767  1834  2053  2069  1767  2069  1767  1745   978  1081  1435  1435
 9 N107US   1491  2095  2095  1830  2189  1972  1751  2069  2069  1767  1491  1125  1081  1987  1435  1697
10 N108UW   1125   840  2086  2088  2088  1767  1767  2017  1767  1767  1767  1125  1987  1895  1973  1435

I want to add a column at the end of the table showing if any of the other columns contained "1767".

988

asked Dec 29 '21 18:12

Edi Itelman

3 Answers

We can use rowwise with any.

library(tidyverse)

library(nycflights13)
# Wide table: one row per tail number, flight numbers in columns `1`, `2`, ...
df <- flights %>%
  select(tailnum, flight) %>%
  filter(!is.na(tailnum)) %>%
  arrange(tailnum) %>%
  group_by(tailnum) %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = "id", values_from = "flight")

# Evaluate each row on its own: gather that row's numeric columns into one
# vector and flag whether any entry equals 1767 (NAs are ignored).
df <- df %>%
  rowwise() %>%
  mutate(
    contains_1767 = any(
      c_across(where(is.numeric)) == 1767,
      na.rm = TRUE
    )
  )

# Show only the last column, i.e. the newly added flag.
df[, ncol(df)]
#> # A tibble: 4,043 × 1
#> # Rowwise: 
#>    contains_1767
#>    <lgl>        
#>  1 FALSE        
#>  2 FALSE        
#>  3 FALSE        
#>  4 TRUE         
#>  5 FALSE        
#>  6 TRUE         
#>  7 FALSE        
#>  8 TRUE         
#>  9 TRUE         
#> 10 TRUE         
#> # … with 4,033 more rows


# Time the row-wise approach; `df` is reassigned inside the timed expression.
system.time({
  df <- df %>%
    rowwise() %>%
    mutate(
      contains_1767 = any(
        c_across(where(is.numeric)) == 1767,
        na.rm = TRUE
      )
    )
})
#>    user  system elapsed 
#>   2.317   0.049   2.367

^{Created on 2021-12-29 by the reprex package (v2.0.1)}

148

answered Oct 08 '22 13:10

jpdugo17

This can be done with if_any

# Reshape to one row per tail number, then flag rows in which any numeric
# column holds 1767. `%in%` never returns NA, so missing cells simply count
# as "no match".
out <- flights %>%
  select(tailnum, flight) %>%
  filter(!is.na(tailnum)) %>%
  arrange(tailnum) %>%
  group_by(tailnum) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = "id", values_from = "flight") %>%
  mutate(new = if_any(where(is.numeric), ~ .x %in% 1767))

Checking the result:

> head(out$new)
[1] FALSE FALSE FALSE  TRUE FALSE  TRUE
> sort(unique(which(head(out) == 1767, arr.ind = TRUE)[,1]))
[1] 4 6

answered Oct 08 '22 13:10

akrun

Another possible solution, based on rowSums:

library(tidyverse)

library(nycflights13)
# Reshape to one row per tail number, then add `res` right after `tailnum`:
# 1 if the row contains 1767 anywhere, 0 otherwise.
flights %>%
  select(tailnum, flight) %>%
  filter(!is.na(tailnum)) %>%
  arrange(tailnum) %>%
  group_by(tailnum) %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = "id", values_from = "flight") %>%
  ungroup() %>%
  # `. == 1767` compares every column of the piped table (tailnum included);
  # rowSums() counts the matches per row and pmin() caps that count at 1.
  mutate(res = pmin(1, rowSums(. == 1767, na.rm = TRUE)), .after = tailnum)

#> # A tibble: 4,043 × 577
#>    tailnum   res   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`
#>    <chr>   <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#>  1 D942DN      0  2247  1685  1959   781    NA    NA    NA    NA    NA    NA
#>  2 N0EGMQ      0  4579  4584  4610  4662  4661  4610  4610  4584  4610  4669
#>  3 N10156      0  4560  4269  4667  4334  4298  4520  4297  4370  4352  4695
#>  4 N102UW      1  1125  1830  2095  2069  2069  2071  1805  1767  1767   720
#>  5 N103US      0  1575  1427   975  1125  2095  2088  2053  2095  2095  2069
#>  6 N104UW      1  1973  1125  2053  1767  1972  1767  1767  1963  2017  1767
#>  7 N10575      0  4617  4352  4434  4250  4537  4572  4106  3841  4305  3817
#>  8 N105UW      1  2095  2095  1767  2069  1767  1834  2053  2069  1767  2069
#>  9 N107US      1  1491  2095  2095  1830  2189  1972  1751  2069  2069  1767
#> 10 N108UW      1  1125   840  2086  2088  2088  1767  1767  2017  1767  1767
#> # … with 4,033 more rows, and 565 more variables: 11 <int>, 12 <int>, 13 <int>,
#> #   14 <int>, 15 <int>, 16 <int>, 17 <int>, 18 <int>, 19 <int>, 20 <int>,
#> #   21 <int>, 22 <int>, 23 <int>, 24 <int>, 25 <int>, 26 <int>, 27 <int>,
#> #   28 <int>, 29 <int>, 30 <int>, 31 <int>, 32 <int>, 33 <int>, 34 <int>,
#> #   35 <int>, 36 <int>, 37 <int>, 38 <int>, 39 <int>, 40 <int>, 41 <int>,
#> #   42 <int>, 43 <int>, 44 <int>, 45 <int>, 46 <int>, 47 <int>, 48 <int>,
#> #   49 <int>, 50 <int>, 51 <int>, 52 <int>, 53 <int>, 54 <int>, 55 <int>, …

answered Oct 08 '22 14:10

PaulS

Related questions
                            
                                How to find complementary rows in matrix in R
                            
                                Mutate_all except some columns
                            
                                Replace values in tibble in R 4.0
                            
                                Chunk continuous timeseries data into non-continuous time windows for multiple time periods and multiple groups
                            
                                "Error: Must subset rows with a valid subscript vector" in preProcess() when using knnImpute
                            
                                Unexpected return for NA in factor lookup
                            
                                Apply a Bayesian model (JAGS) for various iterations
                            
                                R mutate multiple columns with ifelse
                            
                                How to make continuous a discontinuous sequence of character numbers with leading zero(s)?
                            
                                creating Rd documentation files for R6 classes not in a package
                            
                                Replacing diagonal elements using dplyr pipe
                            
                                (tidyverse approach) calculating rowsum across several columns where info on columns to include comes from a different data frame
                            
                                Providing the correct match for a list in a list of lists in R
                            
                                How to write an efficient wrapper for data wrangling, allowing to turn off any wrapped part when calling the wrapper
                            
                                Very simple yet confusing R question about bind_rows()
                            
                                Can R Markdown be made to preview from anywhere other than the start of the document?
                            
                                expand_grid with identical vectors
                            
                                Transforming complete age from character to numeric in R
                            
                                !!! (splice operator) for ggplot2 geom_point() function
                            
                                Using summarize(across(..., .fns = ...)) with a multi-variate function

Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!

Donate Us With