This might be very easy but I have not yet figure it out.
This is a section of my dataset:
structure(list(Patent = c("4683202", "4683195", "4800159", "4965188",
"4994368", "5328824", "4879214", "4921794", "4983728", "4994372"
), subclass = c("435/91.2", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2"
), AppYear = c(1985L, 1986L, 1986L, 1987L, 1987L, 1987L, 1988L,
1988L, 1990L, 1990L), app = 1:10, class = "data.frame", row.names = c(NA,
-10L), .Names = c("Patent", "subclass", "AppYear", "app", "lag(AppYear)"
))
> data
# A tibble: 10 x 3
Patent subclass AppYear
<chr> <chr> <int>
1 4683202 435/91.2 1985
2 4683195 435/91.2 1986
3 4800159 435/91.2 1986
4 4965188 435/91.2 1987
5 4994368 435/91.2 1987
6 5328824 435/91.2 1987
7 4879214 435/91.2 1988
8 4921794 435/91.2 1988
9 4983728 435/91.2 1990
10 4994372 435/91.2 1990
Firstly, I need to get the rolling count of distinct years 'app'. Secondly, I need to create the lag of the distinct years 'lag(AppYear)', which in case the previous year(s) is the same will fetch the line with year-1.
Desired Output
# A tibble: 10 x 5
Patent subclass AppYear app `lag(AppYear)`
<chr> <chr> <int> <int> <int>
1 4683202 435/91.2 1985 1 NA
2 4683195 435/91.2 1986 2 1985
3 4800159 435/91.2 1986 2 1985
4 4965188 435/91.2 1987 3 1986
5 4994368 435/91.2 1987 3 1986
6 5328824 435/91.2 1987 3 1986
7 4879214 435/91.2 1988 4 1987
8 4921794 435/91.2 1988 4 1987
9 4983728 435/91.2 1990 5 1988
10 4994372 435/91.2 1990 5 1988
Edit The whole dataset includes many subclasses thus I need to group first by subclass
. The data is now sorted in this way:
data <- data %>%
select(Patent, subclass, AppYear) %>%
arrange(AppYear,Patent) %>%
group_by(subclass) %>%
mutate(app = 1:n(), lag(AppYear))
.
structure(list(Patent = c("4683202", "4683195", "4800159", "4965188",
"4994368", "5328824", "4879214", "4921794", "4983728", "4994372",
"5066584", "5075216", "5091310", "5093245", "5132215", "5185243",
"5409818", "5409818", "6107023", "4994370", "5001050", "5023171",
"5035996", "5035996", "5043272", "5045450", "5055393", "5085983",
"5106729", "5106729"), subclass = c("435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.21", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.21", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.21"),
AppYear = c(1985L, 1986L, 1986L, 1987L, 1987L, 1987L, 1988L,
1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L,
1988L, 1988L, 1988L, 1989L, 1989L, 1989L, 1989L, 1989L, 1989L,
1989L, 1989L, 1989L, 1989L, 1989L), app = c(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 1L,
17L, 18L, 19L, 20L, 21L, 22L, 2L, 23L, 24L, 25L, 26L, 27L,
3L), `lag(AppYear)` = c(NA, 1985L, 1986L, 1986L, 1987L, 1987L,
1987L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L,
1988L, NA, 1988L, 1988L, 1988L, 1989L, 1989L, 1989L, 1988L,
1989L, 1989L, 1989L, 1989L, 1989L, 1989L)), class = "data.frame", row.names = c(NA,
-30L), .Names = c("Patent", "subclass", "AppYear", "app", "lag(AppYear)"
))
I have tried to get app
using many approaches such as cumsum(1:length(AppYear))
but could not find a successful answer.
Addressing the follow-up question about a df with multiple groups of subclass
.
library(dplyr)
df1 %>%
select(Patent, subclass, AppYear) %>%
arrange(AppYear, Patent) %>%
group_by(subclass) %>%
group_map(~mutate(.,app=group_indices(.,AppYear),
lag_year = rep(lag(unique(.$AppYear)), count_(., "AppYear")$n)),
keep = T) %>%
bind_rows() %>%
arrange(AppYear, Patent)
#> # A tibble: 30 x 5
#> Patent subclass AppYear app lag_year
#> <chr> <chr> <int> <int> <int>
#> 1 4683202 435/91.2 1985 1 NA
#> 2 4683195 435/91.2 1986 2 1985
#> 3 4800159 435/91.2 1986 2 1985
#> 4 4965188 435/91.2 1987 3 1986
#> 5 4994368 435/91.2 1987 3 1986
#> 6 5328824 435/91.2 1987 3 1986
#> 7 4879214 435/91.2 1988 4 1987
#> 8 4921794 435/91.2 1988 4 1987
#> 9 4983728 435/91.2 1988 4 1987
#> 10 4994372 435/91.2 1988 4 1987
#> # ... with 20 more rows
N.B. I am using the data provided by OP under Edit section of the question.
library(dplyr)
df1 %>%
arrange(AppYear, Patent) %>%
mutate(app = group_indices(.,AppYear),
lag_year = rep(lag(unique(.$AppYear)), count_(., "AppYear")$n))
#> # A tibble: 10 x 5
#> Patent subclass AppYear app lag_year
#> <chr> <chr> <int> <int> <int>
#> 1 4683202 435/91.2 1985 1 NA
#> 2 4683195 435/91.2 1986 2 1985
#> 3 4800159 435/91.2 1986 2 1985
#> 4 4965188 435/91.2 1987 3 1986
#> 5 4994368 435/91.2 1987 3 1986
#> 6 5328824 435/91.2 1987 3 1986
#> 7 4879214 435/91.2 1988 4 1987
#> 8 4921794 435/91.2 1988 4 1987
#> 9 4983728 435/91.2 1990 5 1988
#> 10 4994372 435/91.2 1990 5 1988
Data:
df1 <- structure(list(Patent=c("4683202", "4683195", "4800159", "4965188",
"4994368", "5328824", "4879214", "4921794", "4983728", "4994372"),
subclass=c("435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2"),
AppYear=c(1985L, 1986L, 1986L, 1987L, 1987L, 1987L, 1988L,
1988L, 1990L, 1990L)),
row.names=c(NA, -10L),
class=c("tbl_df", "tbl", "data.frame"))
Here's a data.table
version. The gnarly part is a self-join to allow for a look-up table to do the lag year.
library(data.table)
dt <- as.data.table(df1)
setorder(dt, AppYear, Patent)
dt[, app := rleid(AppYear), by = .(subclass)]
dt[unique(dt[, .(lagging_year = shift(AppYear)
,lagging_app = shift(app) + 1), by = subclass])
, on = .(subclass
,app = lagging_app )
, lag_year := lagging_year]
dt
Patent subclass AppYear app lag_year
1: 4683202 435/91.2 1985 1 NA
2: 4683195 435/91.2 1986 2 1985
3: 4800159 435/91.2 1986 2 1985
4: 4965188 435/91.2 1987 3 1986
5: 4994368 435/91.2 1987 3 1986
6: 5328824 435/91.2 1987 3 1986
7: 4879214 435/91.2 1988 4 1987
8: 4921794 435/91.2 1988 4 1987
9: 4983728 435/91.2 1988 4 1987
10: 4994372 435/91.2 1988 4 1987
#total of 30 rows.
Here's largely the equivalent of @M-M's answer. Note the .GRP
keeps counting up so it's not 100% equivalent. The first subclass
that is 435/91.21 has an app
of 5 here instead of 1 in the other solutions.
library(data.table)
dt <- as.data.table(df1)
setorder(dt, AppYear, Patent)
dt[, `:=` (app = .GRP, app_cnt = .N), by = .( AppYear, subclass)]
dt[, lag_year := rep(shift(unique(AppYear)), unique(app_cnt)), by = .(subclass)]
dt[, app_cnt := NULL]
dt
And here's the performance:
Unit: milliseconds
expr min lq mean median uq max neval
M_M_dplyr 6.5839 6.85235 7.442658 6.94440 7.26040 23.0357 100
cole_dt_join 6.0260 6.27025 6.616121 6.44040 6.84965 8.3686 100
cole_dt_rep 3.0404 3.15575 3.435112 3.26355 3.76085 4.7526 100
cole_dt_rep_rleid 3.4116 3.59275 3.911844 3.68695 4.01000 10.3520 100
Data and functions:
library(data.table)
library(microbenchmark)
library(dplyr)
df1 <- structure(list(Patent = c("4683202", "4683195", "4800159", "4965188",
"4994368", "5328824", "4879214", "4921794", "4983728", "4994372",
"5066584", "5075216", "5091310", "5093245", "5132215", "5185243",
"5409818", "5409818", "6107023", "4994370", "5001050", "5023171",
"5035996", "5035996", "5043272", "5045450", "5055393", "5085983",
"5106729", "5106729"), subclass = c("435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.21", "435/91.2", "435/91.2", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.21", "435/91.2",
"435/91.2", "435/91.2", "435/91.2", "435/91.2", "435/91.21"),
AppYear = c(1985L, 1986L, 1986L, 1987L, 1987L, 1987L, 1988L,
1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L, 1988L,
1988L, 1988L, 1988L, 1989L, 1989L, 1989L, 1989L, 1989L, 1989L,
1989L, 1989L, 1989L, 1989L, 1989L)
)
, class = "data.frame"
, row.names = c(NA,
-30L)
, .Names = c("Patent", "subclass", "AppYear"))
# dt join method ----------------------------------------------------------
dt <- as.data.table(df1)
setorder(dt, AppYear, Patent)
dt[, app := rleid(AppYear), by = .(subclass)]
dt[unique(dt[, .(lagging_year = shift(AppYear)
,lagging_app = shift(app) + 1), by = subclass])
, on = .(subclass
,app = lagging_app )
, lag_year := lagging_year]
dt
# dt rep rleid method -----------------------------------------------------------
dt <- as.data.table(df1)
setorder(dt, AppYear, Patent)
dt[, app := rleid(AppYear), by = .(subclass)]
dt[, app_cnt := .N, by = .( AppYear, subclass)]
dt[, lag_year := rep(shift(unique(AppYear)), unique(app_cnt)), by = .(subclass)]
dt[, app_cnt := NULL]
dt
# dt rep and .GRP ---------------------------------------------------------
dt <- as.data.table(df1)
setorder(dt, AppYear, Patent)
dt[, `:=` (app = .GRP, app_cnt = .N), by = .( AppYear, subclass)]
dt[, lag_year := rep(shift(unique(AppYear)), unique(app_cnt)), by = .(subclass)]
dt[, app_cnt := NULL]
dt
# benchmarks --------------------------------------------------------------
microbenchmark(M_M_dplyr = {
df1 %>%
arrange(AppYear, Patent) %>%
group_by(subclass) %>%
group_map(~mutate(.,app=group_indices(.,AppYear),
lag_year = rep(lag(unique(.$AppYear)), count_(., "AppYear")$n)),
keep = T) %>%
bind_rows() %>%
arrange(AppYear, Patent)
}
, cole_dt_join = {
dt <- as.data.table(df1)
setorder(dt, AppYear, Patent)
dt[, app := rleid(AppYear), by = .(subclass)]
dt[unique(dt[, .(lagging_year = shift(AppYear)
,lagging_app = shift(app) + 1), by = subclass])
, on = .(subclass
,app = lagging_app )
, lag_year := lagging_year]
}
,cole_dt_rep = {
dt <- as.data.table(df1)
setorder(dt, AppYear, Patent)
dt[, `:=` (app = .GRP, app_cnt = .N), by = .( AppYear, subclass)]
dt[, lag_year := rep(shift(unique(AppYear)), unique(app_cnt)), by = .(subclass)]
dt[, app_cnt := NULL]
}
,cole_dt_rep_rleid = {
dt <- as.data.table(df1)
dt[, app := rleid(AppYear), by = .(subclass)]
dt[, app_cnt := .N, by = .( AppYear, subclass)]
dt[, lag_year := rep(shift(unique(AppYear)), unique(app_cnt)), by = .(subclass)]
dt[, app_cnt := NULL]
}
)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With