R - more effective left_join [duplicate]

I have two data frames: one contains names and ranges of limits (only a few hundred rows, 1,000 at most) that need to be assigned to a "measurements" data frame, which can have millions of rows (or tens of millions).

Currently I am doing a left_join and then filtering on value to assign the matching limit to each measurement. This, however, is quite inefficient and costs a lot of resources, since the join first duplicates every measurement row once per limit row for its station before filtering. For larger data frames the code is unable to run at all.

Any ideas for a more efficient solution would be helpful.

library(dplyr)

## this one has only a few hundred rows
df_limits <- read.table(text="Title station_id  limit_from  limit_to
Level_3_Low 1   0   70
Level_2_Low 1   70  90
Level_1_Low 1   90  100
Optimal 1   100 110
Level_1_High    1   110 130
Level_2_High    1   130 150
Level_3_High    1   150 180
Level_3_Low 2   0   70
Level_2_Low 2   70  90
Level_1_Low 2   90  100
Optimal 2   100 110
Level_1_High    2   110 130
Level_2_High    2   130 150
Level_3_High    2   150 180
Level_3_Low 3   0   70
Level_2_Low 3   70  90
Level_1_Low 3   90  100
Optimal 3   100 110
Level_1_High    3   110 130
Level_2_High    3   130 150
Level_3_High    3   150 180
",header  = TRUE, stringsAsFactors = TRUE)


# this data frame can have millions of rows
df_measurements <- read.table(text="measurement_id  station_id  value
12121534    1   172
12121618    1   87
12121703    1   9
12121709    2   80
12121760    2   80
12121813    2   115
12121881    3   67
12121907    3   100
12121920    3   108
12121979    1   102
12121995    1   53
12122022    1   77
12122065    2   158
12122107    2   144
12122113    2   5
12122135    3   100
12122187    3   136
12122267    3   130
12122359    1   105
12122366    1   126
12122398    1   143
",header  = TRUE, stringsAsFactors = TRUE)


df_results <- left_join(df_measurements,df_limits, by = "station_id") %>% 
              filter ((value >= limit_from & value < limit_to) | is.na(Title)) %>%
              select(names(df_measurements), Title)
Asked by Petrik on Aug 06 '21

2 Answers

A data.table solution using non-equi joins:

library(data.table)

setDT(df_measurements)
setDT(df_limits) 

df_limits[df_measurements, .(station_id, measurement_id, value, Title),
          on=.(station_id = station_id, limit_from < value, limit_to >= value)]

   station_id measurement_id value        Title
 1:          1       12121534   172 Level_3_High
 2:          1       12121618    87  Level_2_Low
 3:          1       12121703     9  Level_3_Low
 4:          2       12121709    80  Level_2_Low
 5:          2       12121760    80  Level_2_Low
 6:          2       12121813   115 Level_1_High
 7:          3       12121881    67  Level_3_Low
 8:          3       12121907   100  Level_1_Low
 9:          3       12121920   108      Optimal
10:          1       12121979   102      Optimal
11:          1       12121995    53  Level_3_Low
12:          1       12122022    77  Level_2_Low
13:          2       12122065   158 Level_3_High
14:          2       12122107   144 Level_2_High
15:          2       12122113     5  Level_3_Low
16:          3       12122135   100  Level_1_Low
17:          3       12122187   136 Level_2_High
18:          3       12122267   130 Level_1_High
19:          1       12122359   105      Optimal
20:          1       12122366   126 Level_1_High
21:          1       12122398   143 Level_2_High
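
Note that in a data.table non-equi join, selecting limit_from or limit_to in j would return the measurement's value rather than the original bounds. If the original bounds are also wanted in the result, one possible variation (a sketch, not part of the original answer) retrieves them with the x. prefix:

# same non-equi join, additionally keeping the bounds from df_limits
# via the x. prefix (available in data.table >= 1.9.8)
df_limits[df_measurements,
          .(station_id, measurement_id, value, Title,
            limit_from = x.limit_from, limit_to = x.limit_to),
          on = .(station_id, limit_from < value, limit_to >= value)]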
Answered by Waldi


A simple base R option (no additional packages needed) using subset + merge:

subset(
  merge(
    df_measurements,
    df_limits,
    all = TRUE
  ),
  limit_from < value & limit_to >= value
)

gives

    station_id measurement_id value        Title limit_from limit_to
7            1       12121534   172 Level_3_High        150      180
9            1       12121618    87  Level_2_Low         70       90
15           1       12121703     9  Level_3_Low          0       70
23           1       12122022    77  Level_2_Low         70       90
34           1       12122398   143 Level_2_High        130      150
39           1       12121979   102      Optimal        100      110
43           1       12121995    53  Level_3_Low          0       70
54           1       12122366   126 Level_1_High        110      130
60           1       12122359   105      Optimal        100      110
65           2       12121760    80  Level_2_Low         70       90
75           2       12121813   115 Level_1_High        110      130
79           2       12121709    80  Level_2_Low         70       90
91           2       12122065   158 Level_3_High        150      180
97           2       12122107   144 Level_2_High        130      150
99           2       12122113     5  Level_3_Low          0       70
108          3       12121907   100  Level_1_Low         90      100
116          3       12121920   108      Optimal        100      110
124          3       12122267   130 Level_1_High        110      130
127          3       12121881    67  Level_3_Low          0       70
136          3       12122135   100  Level_1_Low         90      100
146          3       12122187   136 Level_2_High        130      150
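
If the limit_from and limit_to columns are not needed, they can be dropped in the same call via subset()'s select argument (a minimal sketch building on the code above):

subset(
  merge(
    df_measurements,
    df_limits,
    all = TRUE
  ),
  limit_from < value & limit_to >= value,
  select = c(measurement_id, station_id, value, Title)
)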

Another option uses dplyr together with findInterval:

  df_measurements %>%
    group_by(station_id) %>%
    mutate(Title = with(
      df_limits,
      Title[
        findInterval(
          value,
          unique(unlist(cbind(limit_from, limit_to)[station_id == first(.$station_id)])),
          left.open = TRUE
        )
      ]
    )) %>%
    ungroup()

which gives

# A tibble: 21 x 4
   measurement_id station_id value Title       
            <int>      <int> <int> <fct>
 1       12121534          1   172 Level_3_High
 2       12121618          1    87 Level_2_Low
 3       12121703          1     9 Level_3_Low
 4       12121709          2    80 Level_2_Low
 5       12121760          2    80 Level_2_Low
 6       12121813          2   115 Level_1_High
 7       12121881          3    67 Level_3_Low
 8       12121907          3   100 Level_1_Low
 9       12121920          3   108 Optimal
10       12121979          1   102 Optimal
# ... with 11 more rows
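
To see what the findInterval() call is doing: the unique limit boundaries of a station form a sorted breakpoint vector, and with left.open = TRUE a value x is assigned index i when brks[i] < x <= brks[i+1], which then picks the matching Title. A small illustration for station 1, using the data above (brks is just a name chosen here):

brks <- c(0, 70, 90, 100, 110, 130, 150, 180) # unique boundaries of station 1
findInterval(c(172, 87, 9), brks, left.open = TRUE)
# [1] 7 2 1   i.e. Level_3_High, Level_2_Low, Level_3_Low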

Benchmarking


f_TIC1 <- function() {
  subset(
    merge(
      df_measurements,
      df_limits,
      all = TRUE
    ),
    limit_from < value & limit_to >= value
  )
}

f_TIC2 <- function() {
  df_measurements %>%
    group_by(station_id) %>%
    mutate(Title = with(
      df_limits,
      Title[
        findInterval(
          value,
          unique(unlist(cbind(limit_from, limit_to)[station_id == first(station_id)])),
          left.open = TRUE
        )
      ]
    )) %>%
    ungroup()
}


dt_limits <- as.data.table(df_limits)
dt_measurements <- as.data.table(df_measurements)
f_Waldi <- function() {
  dt_limits[
    dt_measurements,
    .(station_id, measurement_id, value, Title),
    on = .(station_id, limit_from < value, limit_to >= value)
  ]
}

f_TimTeaFan <- function() {
  setkey(dt_limits, station_id, limit_from, limit_to)
  foverlaps(dt_measurements[, value2 := value],
    dt_limits,
    by.x = c("station_id", "value", "value2"),
    type = "within"
  )[
    value < limit_to,
    .(measurement_id, station_id, value, Title)
  ]
}
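
The benchmark call itself is not shown in the answer; a sketch along these lines, assuming the microbenchmark package, reproduces the comparison (note the timings are on the tiny example data, where fixed per-call overhead dominates, so the ordering on millions of rows can differ):

library(microbenchmark)
print(
  microbenchmark(
    f_TIC1(), f_TIC2(), f_Waldi(), f_TimTeaFan(),
    times = 100
  ),
  unit = "relative"
)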

you will see:

Unit: relative
          expr      min       lq     mean   median       uq      max neval
      f_TIC1() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100
      f_TIC2() 4.848639 4.909985 4.895588 4.942616 5.124704 2.580819   100
     f_Waldi() 3.182027 3.010615 3.069916 3.114160 3.397845 1.698386   100
 f_TimTeaFan() 5.523778 5.112872 5.226145 5.112407 5.745671 2.446987   100
Answered by ThomasIsCoding