Recode according to second dataset and split columns

Question

I have a matrix "mat" with 012 coded SNPs as columns and people as rows. For example:

# Genotype matrix: one row per person, one 0/1/2-coded column per SNP.
# Values are stored as character strings.
mat <- matrix(c("0", "1", "0",
                "1", "2", "0",
                "1", "1", "2"), nrow = 3, byrow = TRUE)
rownames(mat) <- c("ID1", "ID2", "ID3")
colnames(mat) <- c("rs123", "rs333", "rs9000")

> mat

    rs123 rs333 rs9000
ID1    "0"   "1"   "0"   
ID2    "1"   "2"   "0"   
ID3    "1"   "1"   "2"

In a different matrix "mat2" I have the respective alleles in two columns (i.e. minor and major allele) and the SNPs as rows.

# Allele lookup: one row per SNP, with its two alleles in
# the columns Allele_A and Allele_B.
mat2 <- matrix(c("A", "T",
                 "C", "T",
                 "T", "G"), nrow = 3, byrow = TRUE)
rownames(mat2) <- c("rs123", "rs333", "rs9000")
colnames(mat2) <- c("Allele_A", "Allele_B")

> mat2

         Allele_A Allele_B
 rs123        "A"      "T"     
 rs333        "C"      "T"     
rs9000        "T"      "G"

Now I want to recode each 012-coded SNP from the first matrix into two columns (one per allele): both new columns should contain the SNP's Allele_A if the code is 0 (A/A), Allele_A and Allele_B if the code is 1 (A/B), and Allele_B in both columns if the code is 2 (B/B). For my example I would like to get the following:

# Expected result: for every SNP a "_1" column (first allele) and a "_2"
# column (second allele), one row per person.
mat3 <- matrix(c("A", "C", "T", "A", "T", "T",
                 "A", "T", "T", "T", "T", "T",
                 "A", "C", "G", "T", "T", "G"), nrow = 3, byrow = TRUE)
rownames(mat3) <- c("ID1", "ID2", "ID3")
colnames(mat3) <- c("rs123_1", "rs333_1", "rs9000_1",
                    "rs123_2", "rs333_2", "rs9000_2")

> mat3

     rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2
ID1       "A"     "C"     "T"      "A"     "T"     "T"     
ID2       "A"     "T"     "T"      "T"     "T"     "T"     
ID3       "A"     "C"     "G"      "T"     "T"     "G"

Can you help me achieve that? Thank you in advance!

TheComeOnMan · Accepted Answer

library(reshape2)
library(data.table)

# Genotypes: one row per person, an ID column plus one 0/1/2-coded
# (character) column per SNP.
mat <- data.table(matrix(c("ID1", "0", "1", "0",
                           "ID2", "1", "2", "0",
                           "ID3", "1", "1", "2"), 3, byrow = TRUE))
setnames(mat, c("ID", "rs123", "rs333", "rs9000"))

# Allele lookup: one row per SNP with its two alleles.
mat2 <- data.table(matrix(c("rs123", "A", "T",
                            "rs333", "C", "T",
                            "rs9000", "T", "G"), 3, byrow = TRUE))
setnames(mat2, c("rs", "Allele_A", "Allele_B"))

# Long format: one row per (person, SNP) pair, then attach that SNP's alleles.
long <- data.table(melt(mat, id.vars = "ID"))
setnames(long, "variable", "rs")
long <- merge(long, mat2, by = "rs", all.x = TRUE)

# `value` is character here, so compare against strings rather than relying
# on implicit number-to-string coercion.
# First allele: Allele_A unless the genotype is B/B (code 2).
long[, a1 := Allele_A]
long[value == "2", a1 := Allele_B]
# Second allele: Allele_B unless the genotype is A/A (code 0).
long[, a2 := Allele_B]
long[value == "0", a2 := Allele_A]

# Back to wide format, one table per allele. Separate names are used so the
# allele lookup `mat2` is not overwritten.
wide_1 <- dcast(long, ID ~ rs, value.var = "a1")
wide_2 <- dcast(long, ID ~ rs, value.var = "a2")

# The SNP columns exist in both tables, so merge() disambiguates them with
# the "_1" / "_2" suffixes.
mat <- merge(wide_1, wide_2, by = "ID", suffixes = c("_1", "_2"))

Output

> mat
   ID rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2
1 ID1       A       C        T       A       T        T
2 ID2       A       T        T       T       T        T
3 ID3       A       C        G       T       T        G

Ricardo Saporta · Answer

Here's another data.table solution:

library(data.table)

# `mat` (genotypes) and `mat2` (alleles) are the character matrices defined
# in the question; the SNP names are the row names of `mat2`.
DT.mat1 <- as.data.table(mat)
DT.mat2 <- as.data.table(mat2, keep.rownames = TRUE)

## Make sure all characters
DT.mat2[, names(DT.mat2) := lapply(.SD, as.character)]

# set key to the row names
setkey(DT.mat2, rn)

snps <- names(DT.mat1)

# For each SNP, look up its allele pair by key and emit two columns:
#   first allele  -> Allele_B only when the code is 2 (B/B), else Allele_A
#   second allele -> Allele_A only when the code is 0 (A/A), else Allele_B
recoded <- DT.mat1[, setNames(
  unlist(lapply(snps, function(snp) {
    .sdx <- .SD[[snp]]
    DT.mat2[.(snp)][, list(ifelse(.sdx == 2, Allele_B, Allele_A),
                           ifelse(.sdx == 0, Allele_A, Allele_B))]
  }), recursive = FALSE, use.names = FALSE),
  # The columns come out as SNP-major pairs (rs123 first/second, rs333
  # first/second, ...), so each SNP name is repeated twice.
  nm = paste0(rep(snps, each = 2L), "_", 1:2)
)]

# Reorder to the layout asked for: all "_1" columns, then all "_2" columns.
setcolorder(recoded, c(paste0(snps, "_1"), paste0(snps, "_2")))
recoded

#>    rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2
#> 1:       A       C        T       A       T        T
#> 2:       A       T        T       T       T        T
#> 3:       A       C        G       T       T        G

You can use microbenchmark (from the microbenchmark package) to time this. I'm not sure how large your actual data is, but I would expect this approach to stay quite fast as your data grows.

Recode according to second dataset and split columns

Tags:

r

VGaertner

2 Answers

TheComeOnMan

Ricardo Saporta

Recent Activity

Donate For Us

Recode according to second dataset and split columns

Tags:

r

VGaertner

2 Answers

TheComeOnMan

Ricardo Saporta

Related questions

Recent Activity

Donate For Us