# Example data: 100 rows with two grouping keys (Group1, Group2) and four
# value columns (V1-V4), each drawn at random with replacement.
# `replace = TRUE` is spelled out rather than relying on partial argument
# matching (`r =`) and the reassignable shorthand `T`.
data1 <- data.frame(
  Group1 = sample(1:2, 100, replace = TRUE),
  Group2 = sample(c("a", "b"), 100, replace = TRUE),
  V1 = sample(1:3, 100, replace = TRUE),
  V2 = sample(0:1, 100, replace = TRUE),
  V3 = sample(1:5, 100, replace = TRUE),
  V4 = sample(1:2, 100, replace = TRUE)
)
# Sampling plan: one row per (Group1, Group2) combination together with the
# number of rows (Size) wanted for it; V1-V4 are NA placeholders.
data2 <- data.frame(
  Group1 = c(1, 1, 2, 2),
  Group2 = c("a", "b", "a", "b"),
  Size = c(9, 7, 6, 10),
  V1 = NA,
  V2 = NA,
  V3 = NA,
  V4 = NA
)
I have 'data1', which contains my data, and 'data2', which has the columns 'Group1', 'Group2' and 'Size'. For each ('Group1', 'Group2') combination, I want to take a random sample of 'Size' rows from the matching group in 'data1' and use those rows to fill in V1-V4 in 'data2'.
The desired output would look like this, but with the NA values filled in from 'data1':
library(dplyr)
library(tidyr)
# Repeat each row of data2 `Size` times; uncount() drops the Size column
# from the result.
data3 <- uncount(data2, Size)
library(data.table)
setDT(data1)
setDT(data2)
# Sample row indices of data1 within each (Group1, Group2) group.
# data2[data1, ...] returns one row per row of data1, in data1's order (as
# long as each key combination occurs at most once in data2), so .I in the
# joined table doubles as a row index into data1.
# .I[sample.int(.N, Size)] is used instead of sample(.I, Size): when a group
# holds a single row, sample() would treat that lone index k as the range 1:k
# and silently return rows that belong to other groups.
i <-
  data2[data1, on = .(Group1, Group2)
  ][, .(i_samp = .I[sample.int(.N, Size)]), by = .(Group1, Group2, Size)
  ][, i_samp]
# Subset data1 to the sampled rows and attach Size from data2.
merge(data1[i], data2[, .(Group1, Group2, Size)], by = c("Group1", "Group2"))
# Group1 Group2 V1 V2 V3 V4 Size
# 1: 1 a 3 1 2 2 9
# 2: 1 a 3 1 5 1 9
# 3: 1 a 2 1 4 2 9
# 4: 1 a 3 1 1 1 9
# 5: 1 a 3 1 4 1 9
# 6: 1 a 1 0 3 1 9
# 7: 1 a 3 1 1 1 9
# 8: 1 a 1 1 1 2 9
# 9: 1 a 2 0 2 1 9
# 10: 1 b 2 0 5 2 7
# 11: 1 b 3 0 5 2 7
# 12: 1 b 3 1 4 2 7
# 13: 1 b 1 1 1 1 7
# 14: 1 b 1 1 4 1 7
# 15: 1 b 1 0 1 1 7
# 16: 1 b 1 0 3 1 7
# 17: 2 a 2 0 5 1 6
# 18: 2 a 1 0 5 1 6
# 19: 2 a 3 1 1 2 6
# 20: 2 a 1 0 2 1 6
# 21: 2 a 3 1 1 2 6
# 22: 2 a 1 1 3 2 6
# 23: 2 b 3 0 2 1 10
# 24: 2 b 2 1 5 1 10
# 25: 2 b 3 0 1 1 10
# 26: 2 b 3 1 2 1 10
# 27: 2 b 2 0 5 1 10
# 28: 2 b 2 0 2 1 10
# 29: 2 b 2 0 2 2 10
# 30: 2 b 1 0 1 1 10
# 31: 2 b 3 0 5 1 10
# 32: 2 b 3 0 5 1 10
# Group1 Group2 V1 V2 V3 V4 Size
Input data used:
# Example data: 100 rows with two grouping keys (Group1, Group2) and four
# value columns (V1-V4), each drawn at random with replacement.
# `replace = TRUE` is spelled out rather than relying on partial argument
# matching (`r =`) and the reassignable shorthand `T`.
data1 <- data.frame(
  Group1 = sample(1:2, 100, replace = TRUE),
  Group2 = sample(c("a", "b"), 100, replace = TRUE),
  V1 = sample(1:3, 100, replace = TRUE),
  V2 = sample(0:1, 100, replace = TRUE),
  V3 = sample(1:5, 100, replace = TRUE),
  V4 = sample(1:2, 100, replace = TRUE)
)
# Sampling plan: one row per (Group1, Group2) combination together with the
# number of rows (Size) wanted for it; V1-V4 are NA placeholders.
data2 <- data.frame(
  Group1 = c(1, 1, 2, 2),
  Group2 = c("a", "b", "a", "b"),
  Size = c(9, 7, 6, 10),
  V1 = NA,
  V2 = NA,
  V3 = NA,
  V4 = NA
)
Here is a more parameterized version, in which you explicitly set the columns you want to fill and the keys connecting the two tables:
# Key columns shared by data1 and data2, and the data1 columns to carry over.
fill_key <- c("Group1", "Group2")
columns_to_fill <- paste0("V", 1:4)
# Sample row indices of data1 within each key group.
# data2[data1, ...] returns one row per row of data1, in data1's order (as
# long as each key combination occurs at most once in data2), so .I in the
# joined table doubles as a row index into data1.
# .I[sample.int(.N, Size)] is used instead of sample(.I, Size): when a group
# holds a single row, sample() would treat that lone index k as the range 1:k
# and silently return rows that belong to other groups.
i <-
  data2[data1, on = (fill_key)
  ][, .(i_samp = .I[sample.int(.N, Size)]), by = c(fill_key, "Size")
  ][, i_samp]
# Subset data1 to the sampled rows (keys plus filled columns only) and attach
# Size from data2.
merge(data1[i, c(fill_key, columns_to_fill), with = FALSE],
      data2[, c(fill_key, "Size"), with = FALSE],
      by = fill_key)
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With