Thanks in advance for any help. my Data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|
The goal would be to delete any classes which occure in the previous year, so the final data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|g|
I tried this code, I intendet to group the data and then delete all within group duplicates but it did not remove everything just a few rows. Instead of duplicates() I also tried unique() which did not work.
d %>% group_by(class, Group = c(0, cumsum(diff(year) != 1))) %>%
filter(!(duplicated(class, fromLast = TRUE)| duplicated(class))) %>%
ungroup() %>%
select(-Group)
Is there maybe another R function which can look at group differences? Thanks for any help
Edit: Thanks too all for your very helpfull answers!
To delete the duplicate rows from the table in SQL Server, you follow these steps: Find duplicate rows using GROUP BY clause or ROW_NUMBER() function. Use DELETE statement to remove the duplicate rows.
GROUP BY does not remove duplicates.
How do I remove duplicate rows in R? To delete duplicate rows in R you can the duplicated () function. Here’s how to remove all the duplicates in the data frame called “study_df”, study_df.un <- study_df [!duplicated (df)].
To remove duplicate columns we can, again, use the duplicated () function: # Drop Duplicated Columns: ex_df.un <- example_df [!duplicated (as.list (example_df))] # Dimenesions dim (ex_df.un) # 8 Rows and 4 Columns # First five rows: head (ex_df.un) Now, to remove duplicate columns we added the as.list () function and removed the “,”.
The R function duplicated() returns a logical vector where TRUE specifies which elements of a vector or data frame are duplicates.
Find and drop duplicate elements. The R function duplicated() returns a logical vector where TRUE specifies which elements of a vector or data frame are duplicates. Given the following vector: x <- c(1, 1, 4, 5, 4, 6) To find the position of duplicate elements in x, use this:
Left join DF to itself on class and a year difference of 1 and retain only those rows for which there is no such match.
library(sqldf)
sqldf("select a.*
from DF a
left join DF b on b.class = a.class and b.year = a.year - 1
where b.year is null")
giving:
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
5 2008 e
6 2008 f
7 2009 c
8 2009 d
9 2009 g
Lines <- "|year|class|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|"
DF <- read.table(text = Lines, sep = "|", header = TRUE)[2:3]
using library(data.table)
setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]
# y class
# 1: 2007 a
# 2: 2007 b
# 3: 2007 c
# 4: 2007 d
# 5: 2008 e
# 6: 2008 f
# 7: 2009 c
# 8: 2009 d
# 9: 2009 g
An analysis of all the current answer
df=structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))
library(sqldf)
library(data.table)
library(dplyr)
library(purrr)
library(microbenchmark)
groth = function() {
sqldf("select a.*
from df a
left join df b on b.class = a.class and b.year = a.year - 1
where b.year is null")
}
thomas1 = function() {
dflst <- unname(split(df, df$year))
for (k in seq_along(dflst)[-1]) {
dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
}
dfout <- do.call(rbind, dflst)
}
thomas2 = function() {
dfout <- subset(merge(
df,
transform(
df,
yr = year + 1
),
by.x = c("year", "class"),
by.y = c("yr", "class"),
all.x = TRUE
),
is.na(year.y),
select = -year.y
)
}
dww = function() {
setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]
}
user29 = function() {
df=df[order(df$class,df$year),]
df$y_diff=c(0,diff(df$year))
df$c_lag=c("x",head(df$class,-1))
df[df$y_diff!=1 | df$class!=df$c_lag,1:2]
}
anous = function() {
df %>%
group_by(class) %>%
mutate(dup = n() > 1) %>%
group_split() %>%
map_dfr(~ if(unique(.x$dup) & (.x$year[2] - .x$year[1]) == 1) {
.x %>% slice_head(n = 1)
} else {
.x
}) %>%
select(-dup) %>%
arrange(year)
}
benchmark
set.seed(1)
microbenchmark::microbenchmark(
groth(), thomas1(), thomas2(), dww(), user29(), anous(), times=10)
Unit: microseconds
expr min lq mean median uq max neval
groth() 8864.702 9532.502 10885.691 9774.151 11628.401 14432.101 10
thomas1() 792.801 836.001 1666.511 1024.651 1065.601 7921.401 10
thomas2() 1758.700 2024.700 3172.011 2371.601 3348.701 8032.301 10
dww() 3876.201 4280.400 4953.251 4383.701 5320.101 8807.501 10
user29() 464.601 494.502 1249.081 542.951 643.300 7562.401 10
anous() 10506.801 11091.602 12232.101 11424.801 12889.401 17279.201 10
with a much bigger dataframe, I had to remove thomas2 because it did not work
df=data.frame(
"year"=sample(2000:2020,1e5,replace=T),
"class"=sample(LETTERS[1:20],1e5,replace=T)
)
microbenchmark::microbenchmark(
groth(), thomas1(), dww(), user29(), anous(), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
groth() 1217.9176 1270.225702 1290.86323 1305.06580 1322.3443 1341.0451 10
thomas1() 13.6828 14.331401 17.94286 17.76540 21.2913 23.5265 10
dww() 31.3091 36.660201 41.31367 40.27055 44.5629 54.6295 10
user29() 7.8137 9.481402 11.97380 11.31740 14.2235 16.9593 10
anous() 12.7733 13.266902 14.60760 13.50610 15.1067 19.9610 10
Here are some base R solution:
split
+ for
loopdflst <- unname(split(df, df$year))
for (k in seq_along(dflst)[-1]) {
dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
}
dfout <- do.call(rbind, dflst)
merge
+ subset
+ is.na
dfout <- subset(merge(
df,
transform(
df,
yr = year + 1
),
by.x = c("year", "class"),
by.y = c("yr", "class"),
all.x = TRUE
),
is.na(year.y),
select = -year.y
)
which gives
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
7 2008 e
8 2008 f
9 2009 c
10 2009 d
12 2009 g
> dput(df)
structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))
Table is ordered by Year
For each group of records (grouped by Year
) remove Class
value if it has appeared in previous Year
.
Year
in table, Class
becomes a list
of all Class
values appeared during particular Year
(chop()
);Year
remove Class
value (setdiff
) if it has appeared during previous Year
(lag(Class)
);Class
from list
of list
s to atomic vector.library(tidyverse)
dat %>%
chop(Class) %>%
mutate(Class = map2(Class, lag(Class), setdiff)) %>%
unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 c
#8 2009 d
#9 2009 g
This case is more interesting comparing to the previous one, because in order to solve it, one needs to compare current list of Class
values, to all of the values of Class
appeared during previous years (sic!).
Year
in table, Class
becomes a list
of all Class
values appeared during particular Year
(chop()
);Class
values so each entry of the list contain the unique set of Class
values appeared during particular Year
and all the Year
s before (accumulate(Class, union)
);Year
remove Class
value (setdiff
) if it has appeared during previous Year
s (lag(...)
) as it has been calculated at step 2.Class
from list
of list
s to atomic vector.library(tidyverse)
dat %>%
chop(Class) %>%
mutate(Class = map2(Class, lag(accumulate(Class, union)), setdiff)) %>%
unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 g
I have changed the names of the variables, capitalizing first letter. It is against the concept of tidy data, and it bothers me a lot. However, the fact that you do use name class
, which is the name of a R's primitive function bothers me even more.
dat <- structure(
list(
Year = c(2007, 2007, 2007, 2007, 2008, 2008, 2009, 2009, 2009),
Class = c("a", "b", "c", "d", "e", "f", "c", "d", "g")
),
class = "data.frame", row.names = c(NA,-9L)
)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With