Delete duplicates between groups in R

Q: How to remove duplicate columns from a list in SQL Server?

To remove duplicate columns we can, again, use the duplicated () function: # Drop Duplicated Columns: ex_df.un <- example_df [!duplicated (as.list (example_df))] # Dimenesions dim (ex_df.un) # 8 Rows and 4 Columns # First five rows: head (ex_df.un) Now, to remove duplicate columns we added the as.list () function and removed the “,”.

Q: What does the R function duplicated () return?

The R function duplicated() returns a logical vector where TRUE specifies which elements of a vector or data frame are duplicates.

Q: How to find and drop duplicate elements in R?

Find and drop duplicate elements. The R function duplicated() returns a logical vector where TRUE specifies which elements of a vector or data frame are duplicates. Given the following vector: x <- c(1, 1, 4, 5, 4, 6) To find the position of duplicate elements in x, use this:

Tags:

r

Thanks in advance for any help. my Data looks like this:

|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|

The goal would be to delete any classes which occure in the previous year, so the final data looks like this:

|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|g|

I tried this code, I intendet to group the data and then delete all within group duplicates but it did not remove everything just a few rows. Instead of duplicates() I also tried unique() which did not work.

d %>% group_by(class, Group = c(0, cumsum(diff(year) != 1))) %>%
  filter(!(duplicated(class, fromLast = TRUE)| duplicated(class))) %>%
  ungroup() %>%
  select(-Group)

Is there maybe another R function which can look at group differences? Thanks for any help

Edit: Thanks too all for your very helpfull answers!

405

asked May 21 '21 11:05

Annabell

Video Answer

5 Answers

Left join DF to itself on class and a year difference of 1 and retain only those rows for which there is no such match.

library(sqldf)

sqldf("select a.*
  from DF a
  left join DF b on b.class = a.class and b.year = a.year - 1
  where  b.year is null")

giving:

  year class
1 2007     a
2 2007     b
3 2007     c
4 2007     d
5 2008     e
6 2008     f
7 2009     c
8 2009     d
9 2009     g

Note

Lines <- "|year|class|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|"
DF <- read.table(text = Lines, sep = "|", header = TRUE)[2:3]

177

answered Oct 16 '22 19:10

G. Grothendieck

using library(data.table)

setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]

#       y class
# 1: 2007     a
# 2: 2007     b
# 3: 2007     c
# 4: 2007     d
# 5: 2008     e
# 6: 2008     f
# 7: 2009     c
# 8: 2009     d
# 9: 2009     g

answered Oct 16 '22 18:10

dww

An analysis of all the current answer

df=structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L, 
                        2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b", 
                                                                             "c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                     -12L))

library(sqldf)
library(data.table)
library(dplyr)
library(purrr)
library(microbenchmark)

groth = function() {
  sqldf("select a.*
  from df a
  left join df b on b.class = a.class and b.year = a.year - 1
  where  b.year is null")
}

thomas1 = function() {
  dflst <- unname(split(df, df$year))
  for (k in seq_along(dflst)[-1]) {
    dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
  }
  dfout <- do.call(rbind, dflst)
}

thomas2 = function() {
  dfout <- subset(merge(
    df,
    transform(
      df,
      yr = year + 1
    ),
    by.x = c("year", "class"),
    by.y = c("yr", "class"),
    all.x = TRUE
  ),
  is.na(year.y),
  select = -year.y
  )
}

dww = function() {
  setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]
}

user29 = function() {
  df=df[order(df$class,df$year),]
  df$y_diff=c(0,diff(df$year))
  df$c_lag=c("x",head(df$class,-1))
  df[df$y_diff!=1 | df$class!=df$c_lag,1:2]
}

anous = function() {
  df %>%
    group_by(class) %>%
    mutate(dup = n() > 1) %>%
    group_split() %>%
    map_dfr(~ if(unique(.x$dup) & (.x$year[2] - .x$year[1]) == 1) {
      .x %>% slice_head(n = 1)
    } else {
      .x
    }) %>%
    select(-dup) %>%
    arrange(year)
}

benchmark

set.seed(1)
microbenchmark::microbenchmark(
  groth(), thomas1(), thomas2(), dww(), user29(), anous(), times=10)

Unit: microseconds
      expr       min        lq      mean    median        uq       max neval
   groth()  8864.702  9532.502 10885.691  9774.151 11628.401 14432.101    10
 thomas1()   792.801   836.001  1666.511  1024.651  1065.601  7921.401    10
 thomas2()  1758.700  2024.700  3172.011  2371.601  3348.701  8032.301    10
     dww()  3876.201  4280.400  4953.251  4383.701  5320.101  8807.501    10
  user29()   464.601   494.502  1249.081   542.951   643.300  7562.401    10
   anous() 10506.801 11091.602 12232.101 11424.801 12889.401 17279.201    10

with a much bigger dataframe, I had to remove thomas2 because it did not work

df=data.frame(
  "year"=sample(2000:2020,1e5,replace=T),
  "class"=sample(LETTERS[1:20],1e5,replace=T)
)

microbenchmark::microbenchmark(
  groth(), thomas1(), dww(), user29(), anous(), times=10)

Unit: milliseconds
      expr       min          lq       mean     median        uq       max neval
   groth() 1217.9176 1270.225702 1290.86323 1305.06580 1322.3443 1341.0451    10
 thomas1()   13.6828   14.331401   17.94286   17.76540   21.2913   23.5265    10
     dww()   31.3091   36.660201   41.31367   40.27055   44.5629   54.6295    10
  user29()    7.8137    9.481402   11.97380   11.31740   14.2235   16.9593    10
   anous()   12.7733   13.266902   14.60760   13.50610   15.1067   19.9610    10

answered Oct 16 '22 18:10

user2974951

Here are some base R solution:

`split` + `for` loop

dflst <- unname(split(df, df$year))
for (k in seq_along(dflst)[-1]) {
  dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
}
dfout <- do.call(rbind, dflst)

`merge` + `subset` + `is.na`

dfout <- subset(merge(
  df,
  transform(
    df,
    yr = year + 1
  ),
  by.x = c("year", "class"),
  by.y = c("yr", "class"),
  all.x = TRUE
),
is.na(year.y),
select = -year.y
)

which gives

   year class
1  2007     a
2  2007     b
3  2007     c
4  2007     d
7  2008     e
8  2008     f
9  2009     c
10 2009     d
12 2009     g

data

> dput(df)
structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L, 
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))

answered Oct 16 '22 19:10

ThomasIsCoding

General assumptions

Table is ordered by Year

Case one

For each group of records (grouped by Year) remove Class value if it has appeared in previous Year.

Solution

Transform the data, so for each Year in table, Class becomes a list of all Class values appeared during particular Year (chop());
For each particular Year remove Class value (setdiff) if it has appeared during previous Year (lag(Class));
Transform the Class from list of lists to atomic vector.

Code

library(tidyverse)

dat %>%
  chop(Class) %>% 
  mutate(Class = map2(Class, lag(Class), setdiff)) %>%
  unchop(Class)

Output

#  Year Class
#1 2007     a
#2 2007     b
#3 2007     c
#4 2007     d
#5 2008     e
#6 2008     f
#7 2009     c
#8 2009     d
#9 2009     g

Case two

This case is more interesting comparing to the previous one, because in order to solve it, one needs to compare current list of Class values, to all of the values of Class appeared during previous years (sic!).

Solution

Transform the data, so for each Year in table, Class becomes a list of all Class values appeared during particular Year (chop());
Create list of Class values so each entry of the list contain the unique set of Class values appeared during particular Year and all the Years before (accumulate(Class, union));
For each particular Year remove Class value (setdiff) if it has appeared during previous Years (lag(...)) as it has been calculated at step 2.
Transform the Class from list of lists to atomic vector.

Code

library(tidyverse)

dat %>%
  chop(Class) %>% 
  mutate(Class = map2(Class, lag(accumulate(Class, union)), setdiff)) %>%
  unchop(Class)

Output

#  Year Class
#1 2007     a
#2 2007     b
#3 2007     c
#4 2007     d
#5 2008     e
#6 2008     f
#7 2009     g

Data

I have changed the names of the variables, capitalizing first letter. It is against the concept of tidy data, and it bothers me a lot. However, the fact that you do use name class, which is the name of a R's primitive function bothers me even more.

dat <- structure(
  list(
    Year = c(2007, 2007, 2007, 2007, 2008, 2008, 2009, 2009, 2009),
    Class = c("a", "b", "c", "d", "e", "f", "c", "d", "g")
  ),
  class = "data.frame", row.names = c(NA,-9L)
)

answered Oct 16 '22 18:10

utubun

Related questions
                            
                                Indent without adding a bullet point or number in RMarkdown
                            
                                Convert Excel numeric to date
                            
                                wrapping long geom_text labels
                            
                                How to correctly output Plotly plots in shiny?
                            
                                Using dplyr summarize with different operations for multiple columns
                            
                                All combinations of letters/numbers under specific conditions
                            
                                r - Convert output from sf::st_within to vector
                            
                                R - ggplot2 time series x-axis to show last day of the month
                            
                                Image output in shiny app
                            
                                Convert an integer to a string in R
                            
                                R Caret Package Error - At least one of the class levels is not a valid R variable name
                            
                                Replacing zeroes with NA for values preceding non-zero
                            
                                R - Fitting a grid over a City Map and inputting data into grid squares
                            
                                ggplot scale_color_manual with breaks does not match expected order
                            
                                Generate all unique combinations from a vector with repeating elements
                            
                                R: What are dates in a dates vector: dates or numeric values? (difference between x[i] and i)
                            
                                Clear R environment of all objetcs & packages
                            
                                Separate a shopping list into multiple columns
                            
                                How to create a geom line plot with single geom point at the end with legend
                            
                                Combine: rowwise(), mutate(), across(), for multiple functions

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

Delete duplicates between groups in R

Tags:

r

Annabell

People also ask

Video Answer

5 Answers

Note

G. Grothendieck

dww

user2974951

`split` + `for` loop

`merge` + `subset` + `is.na`

data

ThomasIsCoding

General assumptions

Case one

Case two

Data

utubun

Recent Activity

Donate For Us

Delete duplicates between groups in R

Tags:

r

Annabell

People also ask

Video Answer

5 Answers

Note

G. Grothendieck

dww

user2974951

split + for loop

merge + subset + is.na

data

ThomasIsCoding

General assumptions

Case one

Case two

Data

utubun

Related questions

Recent Activity

Donate For Us

`split` + `for` loop

`merge` + `subset` + `is.na`