I have a data set that has some instances where for a given location at the same date and time the value is different. I am trying to create a subset data frame showing these instances. This is an example of what I mean:
I have looked at similar questions on SO but I can't seem to get what I want. I keep getting back instances where this isn't the case.
Here's the code I am using:
# Keep only the rows whose (locid, stdate, sttime, charnam, valunit) key
# occurs more than once — i.e. repeated measurements for one sampling event.
dat1 <- data_concern_join2 %>%
  group_by(locid, stdate, sttime, charnam, valunit) %>%
  filter(n() > 1)
Sample Data:
# dput() dump of the sample data: a dplyr grouped_df of 15 USGS
# specific-conductance measurements, grouped by locid, stdate, sttime,
# charnam and valunit. The grouping metadata (vars / indices / group_sizes /
# labels) was captured by dput() along with the data itself.
structure(list(orgid = c("USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ",
"USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ",
"USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ", "USGS-NJ"), locid = c("USGS-01396030",
"USGS-01396030", "USGS-01389850", "USGS-01389850", "USGS-01391500",
"USGS-01391500", "USGS-01391500", "USGS-01391500", "USGS-01393960",
"USGS-01390610", "USGS-01391500", "USGS-01390610", "USGS-01391500",
"USGS-01378760", "USGS-01378760"), stdate = structure(c(11359,
11359, 11359, 11359, 16504, 16504, 16112, 16112, 11367, 13860,
12401, 13860, 16325, 13006, 13006), class = "Date"), sttime = structure(c(34200,
34200, 36000, 36000, 40500, 40500, 39600, 39600, 36000, 39600,
32400, 39600, 38400, 36900, 36900), class = c("hms", "difftime"
), units = "secs"), charnam = c("Specific conductance", "Specific conductance",
"Specific conductance", "Specific conductance", "Specific conductance",
"Specific conductance", "Specific conductance", "Specific conductance",
"Specific conductance", "Specific conductance", "Specific conductance",
"Specific conductance", "Specific conductance", "Specific conductance",
"Specific conductance"), val = c(7450, 7300, 3210, 3180, 1930,
1920, 1740, 1650, 1480, 1390, 1380, 1330, 1300, 1280, 1270),
valunit = c("uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C",
"uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C",
"uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C",
"uS/cm @25C"), swqs = c("FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT",
"FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT",
"FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT", "FW2-NT"), WMA = c(7L,
7L, 4L, 4L, 4L, 4L, 4L, 4L, 7L, 4L, 4L, 4L, 4L, 6L, 6L),
year = c(2001L, 2001L, 2001L, 2001L, 2015L, 2015L, 2014L,
2014L, 2001L, 2007L, 2003L, 2007L, 2014L, 2005L, 2005L),
HUC14 = c("HUC02030104050090", "HUC02030104050090", "HUC02030103120050",
"HUC02030103120050", "HUC02030103140070", "HUC02030103140070",
"HUC02030103140070", "HUC02030103140070", "HUC02030104050010",
"HUC02030103140010", "HUC02030103140070", "HUC02030103140010",
"HUC02030103140070", "HUC02030103010040", "HUC02030103010040"
)), .Names = c("orgid", "locid", "stdate", "sttime", "charnam",
"val", "valunit", "swqs", "WMA", "year", "HUC14"), row.names = c(NA,
-15L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("locid",
"stdate", "sttime", "charnam", "valunit"), drop = TRUE, indices = list(
13:14, 2:3, c(9L, 11L), 10L, 6:7, 12L, 4:5, 8L, 0:1), group_sizes = c(2L,
2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), biggest_group_size = 2L, labels = structure(list(
locid = c("USGS-01378760", "USGS-01389850", "USGS-01390610",
"USGS-01391500", "USGS-01391500", "USGS-01391500", "USGS-01391500",
"USGS-01393960", "USGS-01396030"), stdate = structure(c(13006,
11359, 13860, 12401, 16112, 16325, 16504, 11367, 11359), class = "Date"),
sttime = structure(c(36900, 36000, 39600, 32400, 39600, 38400,
40500, 36000, 34200), class = c("hms", "difftime"), units = "secs"),
charnam = c("Specific conductance", "Specific conductance",
"Specific conductance", "Specific conductance", "Specific conductance",
"Specific conductance", "Specific conductance", "Specific conductance",
"Specific conductance"), valunit = c("uS/cm @25C", "uS/cm @25C",
"uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C", "uS/cm @25C",
"uS/cm @25C", "uS/cm @25C")), row.names = c(NA, -9L), class = "data.frame", vars = c("locid",
"stdate", "sttime", "charnam", "valunit"), drop = TRUE, .Names = c("locid",
"stdate", "sttime", "charnam", "valunit")))
To keep just one row per location/date/time key (i.e. drop the repeats), we can do
# data.table: unique() keeps the first row for each combination of the key
# columns, discarding the repeated measurements.
library(data.table)
key_cols <- c("locid", "stdate", "sttime", "charnam", "valunit")
unique(setDT(data_concern_join2), by = key_cols)
dplyr's `distinct()` is designed for exactly that:
data_concern_join2 %>% distinct(locid, stdate, sttime, charnam, valunit, .keep_all = T)
First, a simpler question about removing entire rows with duplicates in one column:
# Toy example: x contains one duplicated value (1 appears twice).
library(dplyr)
df <- tibble(x = c(1, 1:4), y = letters[1:5])
df
#> # A tibble: 5 x 2
#> x y
#> <dbl> <chr>
#> 1 1 a
#> 2 1 b
#> 3 2 c
#> 4 3 d
#> 5 4 e
# Keep the first row within each x group.
# row_number() (no argument) is the idiomatic within-group row index here:
# ranking by x inside group_by(x) is redundant since x is constant per group.
df %>%
  group_by(x) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1)
# optional: %>% ungroup() %>% select(-rank)
#> # A tibble: 4 x 3
#> # Groups: x [4]
#> x y rank
#> <dbl> <fct> <int>
#> 1 1 a 1
#> 2 2 c 1
#> 3 3 d 1
#> 4 4 e 1
It’s a common pattern in SQL if your database supports window functions, but expressed with dplyr verbs. Here’s the Redshift / PostgreSQL form:
select * from df where 1 = row_number() over (partition by x)
So, if you want just the duplicates (rows where `x` is repeated), then just replace the `rank == 1` with `rank > 1`:
# Keep only the repeats: every row after the first within each x group.
df %>%
  group_by(x) %>%
  mutate(rank = row_number()) %>%  # row_number(x) is redundant inside group_by(x)
  filter(rank > 1)
#> # A tibble: 1 x 3
#> # Groups: x [1]
#> x y rank
#> <dbl> <fct> <int>
#> 1 1 b 2
The above doesn't show the original row per `x`, just its duplicates. If you want all of the rows, then I think the other answers cover it, but my version would be:
# Find the keys that occur more than once, then pull all of their rows
# (originals and repeats) back out of df with a semi join.
x_dupes <-
  df %>%
  count(x) %>%        # same result as group_by(x) %>% summarise(n = n())
  filter(n > 1)

df %>% semi_join(x_dupes, by = "x")
#> # A tibble: 2 x 2
#> x y
#> <dbl> <chr>
#> 1 1 a
#> 2 1 b
Finally, you may also care about the order of the rows, since you may have a fixed view on which row counts as the duplicate. In this example, for each duplicated `x` I keep the value of `y` that comes last in alphabetical order.
# Keep one row per x, preferring the alphabetically last y: sort descending
# by y first, then take the first row of each group.
df %>%
  group_by(x) %>%
  arrange(desc(y)) %>%
  mutate(rank = row_number()) %>%  # row_number() follows the arranged order
  filter(rank == 1)
#> # A tibble: 4 x 3
#> # Groups: x [4]
#> x y rank
#> <dbl> <fct> <int>
#> 1 4 e 1
#> 2 3 d 1
#> 3 2 c 1
#> 4 1 b 1
Note that, compared to the second example above, `y` was `a` and is now `b` in the duplicated row. The row order is also reversed.
Does this work for you?
# Keep only the first row of each (locid, stdate, sttime, charnam, valunit)
# group, discarding any repeated measurements.
# row_number() is the idiomatic form of 1:n() inside a grouped mutate().
dat1 <- data_concern_join2 %>%
  group_by(locid, stdate, sttime, charnam, valunit) %>%
  mutate(count = row_number()) %>% # 1, 2, ... within each group of replicates
  filter(count == 1)               # keep only the first, drop the others
Output:
# A tibble: 9 x 12
# Groups: locid, stdate, sttime, charnam, valunit [9]
orgid locid stdate sttime charnam val valunit swqs WMA year HUC14 count
<chr> <chr> <date> <time> <chr> <dbl> <chr> <chr> <int> <int> <chr> <int>
1 USGS-NJ USGS-01396030 2001-02-06 09:30 Specific conductance 7450 uS/cm @25C FW2-NT 7 2001 HUC02030104050090 1
2 USGS-NJ USGS-01389850 2001-02-06 10:00 Specific conductance 3210 uS/cm @25C FW2-NT 4 2001 HUC02030103120050 1
3 USGS-NJ USGS-01391500 2015-03-10 11:15 Specific conductance 1930 uS/cm @25C FW2-NT 4 2015 HUC02030103140070 1
4 USGS-NJ USGS-01391500 2014-02-11 11:00 Specific conductance 1740 uS/cm @25C FW2-NT 4 2014 HUC02030103140070 1
5 USGS-NJ USGS-01393960 2001-02-14 10:00 Specific conductance 1480 uS/cm @25C FW2-NT 7 2001 HUC02030104050010 1
6 USGS-NJ USGS-01390610 2007-12-13 11:00 Specific conductance 1390 uS/cm @25C FW2-NT 4 2007 HUC02030103140010 1
7 USGS-NJ USGS-01391500 2003-12-15 09:00 Specific conductance 1380 uS/cm @25C FW2-NT 4 2003 HUC02030103140070 1
8 USGS-NJ USGS-01391500 2014-09-12 10:40 Specific conductance 1300 uS/cm @25C FW2-NT 4 2014 HUC02030103140070 1
9 USGS-NJ USGS-01378760 2005-08-11 10:15 Specific conductance 1280 uS/cm @25C FW2-NT 6 2005 HUC02030103010040 1
If you found this helpful, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!