Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

fill a data.table based on value in another data.table

I'm very new to data.table but would like to solve my problem with it, as I have the feeling it would be 1000 times faster than with "regular" data.frames.

Here is my problem:

What I have:

2 data.tables dt1 and dt2 like so:

dt1 <- data.table(SID=paste0("S", 1:15), Chromo=rep(1:3, e=5), PP=rep(1:5, 3), P1=0, P2=0, P3=0)
set.seed(17)
dt2 <- data.table(PID=rep(paste0("P", 1:3), c(2, 6, 3)), Chr=c(1, 3, 1, 1, 2, 3, 3, 3, 2, 2, 3), start= c(1, 1, 1, 4, 2, 1, 2, 4, 2, 4, 2), end=c(3, 4, 2, 5, 4, 1, 3, 5, 3, 5, 5), val=rnorm(11))

What I want:

Fill dt1 with dt2[, val] in the right column, based on dt2[, PID] and the right lines, based on dt1[, Chromo]=dt2[, Chr] and dt1[, PP] in between dt2[, start] and dt2[, end].

What I do right now: (which doesn't make me proud, to say the least...)

# preparing the tables, computing dt1 rows indices
dt2[, numcol:=(1:ncol(dt1))[match(dt2[,PID], colnames(dt1))]]
setkey(dt2, Chr, start, end)
setkey(dt1, Chromo, PP)
ind_start <- dt1[dt2[,.(Chr, start)], which=T]
ind_end <- dt1[dt2[,.(Chr, end)], which=T]
dt2[,c("ind_start", "ind_end"):=list(ind_start, ind_end)]

# and feeling I'm that close but can't conclude with `data.table` so doing this "lame" `for` loop with `data.frames`.......................
df1 <- as.data.frame(dt1)
df2 <- as.data.frame(dt2)
nr_seg <- nrow(df2)
for(i in 1:nr_seg){
    df1[df2[i,"ind_start"]:df2[i,"ind_end"], df2[i,"numcol"]] <- df2[i, "val"]
}

The input tables and desired output (except I'd like a data.table):

dt1
    # SID Chromo PP P1 P2 P3
 # 1:  S1      1  1  0  0  0
 # 2:  S2      1  2  0  0  0
 # 3:  S3      1  3  0  0  0
 # 4:  S4      1  4  0  0  0
 # 5:  S5      1  5  0  0  0
 # 6:  S6      2  1  0  0  0
 # 7:  S7      2  2  0  0  0
 # 8:  S8      2  3  0  0  0
 # 9:  S9      2  4  0  0  0
# 10: S10      2  5  0  0  0
# 11: S11      3  1  0  0  0
# 12: S12      3  2  0  0  0
# 13: S13      3  3  0  0  0
# 14: S14      3  4  0  0  0
# 15: S15      3  5  0  0  0

dt2
  # PID Chr start end         val
 # 1:  P2   1     1   2 -0.23298702
 # 2:  P1   1     1   3 -1.01500872
 # 3:  P2   1     4   5 -0.81726793
 # 4:  P3   2     2   3  0.25523700
 # 5:  P2   2     2   4  0.77209084
 # 6:  P3   2     4   5  0.36658112
 # 7:  P2   3     1   1 -0.16561194
 # 8:  P1   3     1   4 -0.07963674
 # 9:  P2   3     2   3  0.97287443
# 10:  P3   3     2   5  1.18078924
# 11:  P2   3     4   5  1.71653398

df1
   # SID Chromo PP          P1         P2        P3
# 1   S1      1  1 -1.01500872 -0.2329870 0.0000000
# 2   S2      1  2 -1.01500872 -0.2329870 0.0000000
# 3   S3      1  3 -1.01500872  0.0000000 0.0000000
# 4   S4      1  4  0.00000000 -0.8172679 0.0000000
# 5   S5      1  5  0.00000000 -0.8172679 0.0000000
# 6   S6      2  1  0.00000000  0.0000000 0.0000000
# 7   S7      2  2  0.00000000  0.7720908 0.2552370
# 8   S8      2  3  0.00000000  0.7720908 0.2552370
# 9   S9      2  4  0.00000000  0.7720908 0.3665811
# 10 S10      2  5  0.00000000  0.0000000 0.3665811
# 11 S11      3  1 -0.07963674 -0.1656119 0.0000000
# 12 S12      3  2 -0.07963674  0.9728744 1.1807892
# 13 S13      3  3 -0.07963674  0.9728744 1.1807892
# 14 S14      3  4 -0.07963674  1.7165340 1.1807892
# 15 S15      3  5  0.00000000  1.7165340 1.1807892  
like image 507
Cath Avatar asked Jun 16 '15 08:06

Cath


1 Answers

library(data.table)
dt1 <- data.table(SID=paste0("S", 1:15), Chromo=rep(1:3, e=5), PP=rep(1:5, 3), P1=0, P2=0, P3=0)
set.seed(17)
dt2 <- data.table(PID=rep(paste0("P", 1:3), c(2, 6, 3)), Chr=c(1, 3, 1, 1, 2, 3, 3, 3, 2, 2, 3), start= c(1, 1, 1, 4, 2, 1, 2, 4, 2, 4, 2), end=c(3, 4, 2, 5, 4, 1, 3, 5, 3, 5, 5), val=rnorm(11))

dt1[, PP1 := PP]
dt1[, c("P1", "P2", "P3") := NULL]


setkey(dt2, Chr, start, end)

setkey(dt1, Chromo, PP, PP1)

res <- foverlaps(dt1, dt2, type="within")
res[is.na(PID), PID := "P1"] #to ensure that dcast works if there is no match
res <- dcast.data.table(res, SID + Chromo + PP ~ PID, value.var = "val")
setkey(res, Chromo, PP)

#    SID Chromo PP          P1         P2        P3
# 1:  S1      1  1 -1.01500872 -0.2329870        NA
# 2:  S2      1  2 -1.01500872 -0.2329870        NA
# 3:  S3      1  3 -1.01500872         NA        NA
# 4:  S4      1  4          NA -0.8172679        NA
# 5:  S5      1  5          NA -0.8172679        NA
# 6:  S6      2  1          NA         NA        NA
# 7:  S7      2  2          NA  0.7720908 0.2552370
# 8:  S8      2  3          NA  0.7720908 0.2552370
# 9:  S9      2  4          NA  0.7720908 0.3665811
#10: S10      2  5          NA         NA 0.3665811
#11: S11      3  1 -0.07963674 -0.1656119        NA
#12: S12      3  2 -0.07963674  0.9728744 1.1807892
#13: S13      3  3 -0.07963674  0.9728744 1.1807892
#14: S14      3  4 -0.07963674  1.7165340 1.1807892
#15: S15      3  5          NA  1.7165340 1.1807892
like image 115
Roland Avatar answered Oct 20 '22 23:10

Roland