Imagine a df in the following format:
   ID1 ID2
1    A   1
2    A   2
3    A   3
4    A   4
5    A   5
6    B   1
7    B   2
8    B   3
9    B   4
10   B   5
11   C   1
12   C   2
13   C   3
14   C   4
15   C   5
The problem is to randomly select one row (ideally adjustable to n rows) for the first unique value in ID1, remove the corresponding ID2 value from the pool, randomly select a value from the remaining pool of ID2 values for the second ID1 value, and so on recursively.
So, for example, for the first ID1 value it would do sample(1:5, 1), with the result 2. For the second ID1 value it would do sample(c(1, 3:5), 1), with the result 3. For the third ID1 value it would do sample(c(1, 4:5), 1), with the result 5. It cannot happen that there isn't at least one unique ID2 value left to assign to a particular ID1. However, with multiple ID2 values to select (e.g. three), there may not be a sufficient number of them; in that case, select as many as possible. In the end, the results should have a similar format:
  ID1 ID2
1   A   2
2   B   3
3   C   5
It should be efficient enough to handle reasonably large datasets (tens of thousands of unique values in ID1 and hundreds of thousands of unique values in ID2).
I tried multiple ways to solve this problem, but honestly none of them are meaningful and would likely only contribute to confusion, so I'm not sharing them here.
Sample data:
df <- data.frame(ID1 = rep(LETTERS[1:3], each = 5),
                 ID2 = rep(1:5, 3))
Below are some approaches: Reduce + subset, igraph, and for loops.

You can try the code below (Reduce is applied to recursively add unvisited ID2 values):
lst <- split(df, ~ID1)
lst[[1]] <- lst[[1]][sample(1:nrow(lst[[1]]), 1), ]
Reduce(
  function(x, y) {
    y <- subset(y, !ID2 %in% x$ID2)
    rbind(x, y[sample(nrow(y), 1), ])
  },
  lst
)
which gives
   ID1 ID2
4    A   4
7    B   2
11   C   1
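The question also asks for the number of rows per ID1 to be adjustable, taking as many as possible when the pool runs short. A minimal sketch of one way to extend the Reduce idea (my variation, not part of the original answer; an empty init handles the first group uniformly):

n <- 2  # rows to draw per ID1 value
lst <- split(df, ~ID1)
Reduce(
  function(x, y) {
    y <- subset(y, !ID2 %in% x$ID2)                  # drop already-used ID2 values
    rbind(x, y[sample(nrow(y), min(n, nrow(y))), ])  # take at most n rows
  },
  lst,
  init = df[0, ]  # empty accumulator with the same columns
)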
As we can see, this problem can be interpreted as a bipartite matching problem in graph theory:
library(igraph)
library(dplyr)
g <- df %>%
  arrange(sample(n())) %>%
  graph_from_data_frame() %>%
  set_vertex_attr(
    name = "type",
    value = names(V(.)) %in% df$ID1
  )

type.convert(
  setNames(
    rev(
      stack(
        max_bipartite_match(g)$matching[unique(df$ID1)]
      )
    ),
    names(df)
  ),
  as.is = TRUE
)
and we can get
  ID1 ID2
1   A   2
2   B   5
3   C   1
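A quick sanity check (my addition; res is a name I'm introducing for the data frame printed above) that the matching assigns each ID1 a distinct ID2 drawn from its own group:

stopifnot(
  !anyDuplicated(res$ID2),                                 # all ID2 values distinct
  all(paste(res$ID1, res$ID2) %in% paste(df$ID1, df$ID2))  # every pair occurs in df
)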
A for loop (dynamic programming):

lst <- with(df, split(ID2, ID1))
v <- c()
for (k in seq_along(lst)) {
  u <- lst[[k]][!lst[[k]] %in% v]
  v <- c(v, u[sample(length(u), 1)])
}
type.convert(
  data.frame(ID1 = names(lst), ID2 = v),
  as.is = TRUE
)
which gives
  ID1 ID2
1   A   4
2   B   5
3   C   3
I think this algorithm does what you want, but it's not very efficient. It may provide others with a starting point for faster solutions.
all_ID1 <- unique(df$ID1)
available <- unique(df$ID2)
new_ID2 <- numeric(length(all_ID1))

for (i in seq_along(all_ID1)) {
  ID2_group <- df$ID2[df$ID1 == all_ID1[i]]
  sample_space <- ID2_group[ID2_group %in% available]
  # index into sample_space so a length-1 pool is not treated as 1:n by sample()
  new_ID2[i] <- sample_space[sample.int(length(sample_space), 1)]
  available <- available[available != new_ID2[i]]
}

data.frame(ID1 = all_ID1, ID2 = new_ID2)
#> ID1 ID2
#> 1 A 5
#> 2 B 1
#> 3 C 2
Note that this will not work if you run out of unique ID2 values. For example, if you had letters A:F in the ID1 column, each with ID2 values of 1:5, then by the time you get to selecting an ID2 value for the ID1 value "F", there are no unique ID2 values left, since numbers 1 to 5 have all been assigned to letters A:E. You don't state in your question what should happen when there are no unique ID2 values left to assign to a particular ID1: should they be NA, or are repeats allowed at that point?
Edit

The following modification allows an arbitrary n to be chosen. If all the available numbers run out, the sample space gets replenished:
AC_function <- function(ID1, ID2, n = 1) {
  all_ID1 <- rep(unique(ID1), each = n)
  available <- unique(ID2)
  new_ID2 <- numeric(length(all_ID1))
  for (i in seq_along(all_ID1)) {
    ID2_group <- ID2[ID1 == all_ID1[i]]
    sample_space <- ID2_group[ID2_group %in% available]
    if (length(sample_space) < 1) {
      # pool exhausted: replenish the sample space
      available <- unique(ID2)
      sample_space <- ID2_group[ID2_group %in% available]
    }
    if (length(sample_space) == 1) {
      # a single value must not be passed to sample(), which would read it as 1:n
      new_ID2[i] <- sample_space
    } else {
      new_ID2[i] <- sample(sample_space, 1)
    }
    available <- available[available != new_ID2[i]]
  }
  data.frame(ID1 = all_ID1, ID2 = new_ID2)
}
For example:
AC_function(df$ID1, df$ID2)
#> ID1 ID2
#> 1 A 2
#> 2 B 4
#> 3 C 5
AC_function(df$ID1, df$ID2, n = 2)
#> ID1 ID2
#> 1 A 1
#> 2 A 2
#> 3 B 5
#> 4 B 4
#> 5 C 3
#> 6 C 2
Created on 2021-11-03 by the reprex package (v2.0.0)
selected <- c()
for (i in unique(df[, 1])) {
  x <- df[df[, "ID1"] == i, "ID2"]
  y <- setdiff(x, selected)
  # append (rather than prepend) so the result order matches unique(df[, 1]),
  # and index into y so a length-1 pool is not treated as 1:n by sample()
  selected <- c(selected, y[sample.int(length(y), 1)])
}
data.frame(ID1 = unique(df[, 1]), ID2 = selected)
which gives

  ID1 ID2
1   A   4
2   B   2
3   C   3
You're welcome to update the benchmark!
df <- data.frame(
  ID1 = rep(LETTERS, each = 10000),
  ID2 = sample(1000, length(LETTERS) * 10000, replace = TRUE)
)
f_TIC1 <- function() {
  lst <- split(df, ~ID1)
  lst[[1]] <- lst[[1]][sample(1:nrow(lst[[1]]), 1), ]
  Reduce(
    function(x, y) {
      y <- subset(y, !ID2 %in% x$ID2)
      rbind(x, y[sample(nrow(y), 1), ])
    },
    lst
  )
}
library(igraph)
library(dplyr)

f_TIC2 <- function() {
  g <- df %>%
    arrange(sample(n())) %>%
    graph_from_data_frame() %>%
    set_vertex_attr(
      name = "type",
      value = names(V(.)) %in% df$ID1
    )
  type.convert(
    setNames(
      rev(
        stack(
          max_bipartite_match(g)$matching[unique(df$ID1)]
        )
      ),
      names(df)
    ),
    as.is = TRUE
  )
}
f_TIC3 <- function() {
  lst <- with(df, split(ID2, ID1))
  v <- c()
  for (k in seq_along(lst)) {
    u <- lst[[k]][!lst[[k]] %in% v]
    v <- c(v, u[sample(length(u), 1)])
  }
  type.convert(
    data.frame(ID1 = names(lst), ID2 = v),
    as.is = TRUE
  )
}
f_GKi1 <- function() {
  . <- split(df$ID2, df$ID1)
  data.frame(ID1 = type.convert(names(.), as.is = TRUE),
             ID2 = Reduce(function(x, y) {
               # subset y first, then sample an index, matching GKi's answer below
               y <- y[!y %in% x]
               c(x, y[sample.int(length(y), 1)])
             }, c(list(NULL), .)))
}
f_GKi2 <- function() {
  . <- split(df$ID2, df$ID1)
  x <- df$ID2[0]
  for (y in .) {
    y <- y[!y %in% x]
    x <- c(x, y[sample.int(length(y), 1)])
  }
  data.frame(ID1 = type.convert(names(.), as.is = TRUE), ID2 = x)
}
library(fastmatch)
library(dqrng)

f_GKi3 <- function() {
  . <- split(df$ID2, df$ID1)
  x <- df$ID2[0]
  for (y in .) {
    y <- y[!y %fin% x]
    x <- c(x, y[dqsample.int(length(y), 1)])
  }
  data.frame(ID1 = type.convert(names(.), as.is = TRUE), ID2 = x)
}
f_GKi4 <- function() {
  . <- split(df$ID2, df$ID1)
  x <- vector(typeof(df$ID2), length(.))
  for (i in seq_along(.)) {
    y <- .[[i]]
    y <- y[!y %fin% x[seq_len(i - 1)]]
    x[i] <- y[dqsample.int(length(y), 1)]
  }
  data.frame(ID1 = type.convert(names(.), as.is = TRUE), ID2 = x)
}
f_Onyambu <- function() {
  data <- df[order(df$ID1, df$ID2), ]  # just in case it is not sorted
  n <- 1
  st <- table(data[[1]])
  s <- min(st)
  m <- length(st)
  size <- min(m * n, s)
  samples <- sample(s, size)
  index <- rep(seq(s), each = n, length = size) * s - s + samples
  data[index, ]
}
bm <- microbenchmark::microbenchmark(
  f_TIC1(),
  f_TIC2(),
  f_TIC3(),
  f_GKi1(),
  f_GKi2(),
  f_GKi3(),
  f_GKi4(),
  f_Onyambu()
)
ggplot2::autoplot(bm)
bm
#Unit: milliseconds
# expr min lq mean median uq max neval
# f_TIC1() 43.85147 46.00637 48.77332 46.53265 48.06150 86.60333 100
# f_TIC2() 138.12085 143.15468 154.59155 146.49701 169.47343 191.70579 100
# f_TIC3() 13.30333 13.89822 15.16400 14.49575 15.57266 52.16352 100
# f_GKi1() 13.42718 13.88382 16.22395 14.31689 15.69188 52.70818 100
# f_GKi2() 13.34032 13.80074 14.70703 14.52709 15.46372 17.80398 100
# f_GKi3() 11.86203 12.09923 14.73456 12.26890 13.84257 50.41542 100
# f_GKi4() 11.86614 12.08120 13.19142 12.20973 13.74152 50.82025 100
# f_Onyambu() 201.06478 203.11184 206.04584 204.10129 205.60191 242.28008 100
Currently GKi3 and GKi4 are the fastest, followed by TIC3, GKi1 and GKi2, which are more or less equal since they use the same logic from TIC1, which was optimized in GKi1 and reused in TIC3 and GKi2.
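Since the outputs are random, exact results cannot be compared, but we can check that the candidates satisfy the same invariants. A small helper I'm adding for that purpose (not part of the original benchmark):

check <- function(res) {
  stopifnot(
    nrow(res) == length(unique(df$ID1)),                     # one row per ID1
    !anyDuplicated(res$ID2),                                 # distinct ID2 values
    all(paste(res$ID1, res$ID2) %in% paste(df$ID1, df$ID2))  # pairs come from df
  )
  TRUE
}
sapply(list(f_TIC1(), f_TIC3(), f_GKi1(), f_GKi2(), f_GKi3(), f_GKi4()), check)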
You can use sample in Reduce on the split df.
df <- data.frame(ID1 = rep(LETTERS[1:3], each = 5),
                 ID2 = rep(1:5, 3))
set.seed(42)

. <- split(df$ID2, df$ID1)
data.frame(ID1 = `storage.mode<-`(names(.), typeof(df$ID1)),
           ID2 = Reduce(function(x, y) {
             y <- y[!y %in% x]
             c(x, y[sample.int(length(y), 1)])
           }, c(list(NULL), .)))
# ID1 ID2
#1 A 1
#2 B 2
#3 C 3
Or using a for loop:
. <- split(df$ID2, df$ID1)
x <- df$ID2[0]
for (y in .) {
  y <- y[!y %in% x]
  x <- c(x, y[sample.int(length(y), 1)])
}
data.frame(ID1 = `storage.mode<-`(names(.), typeof(df$ID1)), ID2 = x)
# ID1 ID2
#1 A 1
#2 B 2
#3 C 3
Or using fastmatch and dqrng instead of base:
library(fastmatch)
library(dqrng)
. <- split(df$ID2, df$ID1)
x <- df$ID2[0]
for (y in .) {
  y <- y[!y %fin% x]
  x <- c(x, y[dqsample.int(length(y), 1)])
}
data.frame(ID1 = `storage.mode<-`(names(.), typeof(df$ID1)), ID2 = x)
# ID1 ID2
#1 A 2
#2 B 1
#3 C 5
Or pre-allocating the result vector at its final size:
library(fastmatch)
library(dqrng)
. <- split(df$ID2, df$ID1)
x <- vector(typeof(df$ID2), length(.))
for (i in seq_along(.)) {
  y <- .[[i]]
  y <- y[!y %fin% x[seq_len(i - 1)]]
  x[i] <- y[dqsample.int(length(y), 1)]
}
data.frame(ID1 = `storage.mode<-`(names(.), typeof(df$ID1)), ID2 = x)
# ID1 ID2
#1 A 3
#2 B 1
#3 C 2
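One caveat worth noting (my addition): set.seed() does not seed dqrng's generator, so to make the fastmatch/dqrng variants reproducible, use dqset.seed():

library(dqrng)
dqset.seed(42)  # seeds dqsample.int(); set.seed() only affects base R's sample.int()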
DISCLAIMER: This solution assumes that the data is arranged/ordered. If the data is not ordered, please order it by the ID1 column first, then use the function.
There is another way of doing this without using a for loop, recursion, or even higher-order functions. We need to note that the sample function in R is vectorized. Therefore, if all the groups in your data frame are of the same size, or rather increasing in size, then you can make use of the vectorized sample:
n <- 1  # number of rows to be sampled from each group
s <- 5  # size of each group - note that you have to give the minimum size
m <- length(unique(df[[1]]))  # number of groups
size <- min(m * n, s)         # total number of rows sampled from the data frame
samples <- sample(s, size)
index <- rep(seq(s), each = n, length = size) * s - s + samples
df[index, ]
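A worked instance of the index arithmetic (my illustration), with s = 5 and m = 3 as above:

rep(seq(5), each = 1, length = 3) * 5 - 5  # 0 5 10: starting offsets of the three groups
# adding samples (three distinct values from 1:5) then picks one distinct row per group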
This can be written in a function:

sub_sample <- function(data, n) {
  st <- table(data[[1]])
  s <- min(st)
  m <- length(st)
  size <- min(m * n, s)
  samples <- sample(s, size)
  st1 <- rep(c(0, cumsum(head(st, -1))), each = n, length = size)
  index <- st1 + samples
  data[index, ]
}
sub_sample(df, 1)
   ID1 ID2
1    A   1
7    B   2
13   C   3

sub_sample(df, 2)
   ID1 ID2
1    A   1
5    A   5
8    B   3
7    B   2
14   C   4
Note that when sampling with n = 2 we get only one row for group C. Why? Group C has 5 rows, but we have already used 4 samples for groups A and B, so only 1 sample remains for group C.
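The arithmetic behind that, as a quick check:

n <- 2; s <- 5; m <- 3
size <- min(m * n, s)  # min(6, 5) = 5 rows sampled in total
size - (m - 1) * n     # 5 - 4 = 1 row left over for the last group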
Speed test, when n = 1:
Unit: milliseconds
expr min lq mean median uq max neval
f_TIC1() 35.682390 41.610310 53.68912 45.618234 49.88343 227.73160 100
f_TIC2() 151.643959 166.402584 196.51770 179.098992 192.16335 401.36526 100
f_TIC3() 11.059033 12.268831 14.53906 13.278606 15.38623 23.32695 100
f_GKi1() 10.725358 11.879908 14.70369 13.108852 17.86946 26.71074 100
f_GKi2() 10.816891 11.941173 16.55455 12.989787 17.92708 198.44482 100
f_GKi3() 8.942479 9.950978 14.94726 10.857187 13.35428 171.08643 100
f_GKi4() 9.085794 9.834465 13.98820 10.666282 13.20658 191.47267 100
sub_sample(df, 1) 7.878367 8.737534 11.22173 9.508084 14.22219 19.82063 100
When n > 1, this code easily handles it. The others need to be tweaked a little, and their speed drops drastically. This method works like a charm even when n equals the group size; most of the others take too long or even fail.
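For example, the call below draws with n equal to the group size (output omitted since it is random):

sub_sample(df, 5)  # n = group size; total rows are still capped at s = min(table(df$ID1)) = 5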