I have a dataframe (TBB) as follows
X.CHROM POS INFO
chr1 134324 SAMPLE=LNGHJ;
chr2 2333 SAMPLE=dd;GERP;.;
chr2 3334 SAMPLE=;GERP;DDS;CDC=dd;
I would like to extract everything between each semicolon and put it in its own column so that the result would look like
X.CHROM POS INFO
chr1 134324 SAMPLE=LNGHJ; SAMPLE=LNGHJ
chr2 2333 SAMPLE=dd;GERP;DDS=3; SAMPLE=dd GERP DDS=3
chr2 3334 SAMPLE=;GERP;DDS; SAMPLE= GERP DDS
I tried this
# grep() returns the indices of the rows whose INFO matches the pattern, so
# TBB3 is a row subset of TBB -- the matched text itself is not extracted.
TBB3 <- TBB[grep("SAMPLE.*?;", TBB$INFO), ]
# Place the row subset and the full data frame side by side, column-wise.
TBB4<-cbind(TBB3,TBB)
but the columns are the wrong sizes relative to each other, and I would obviously have to repeat this for each word after a semicolon, so it is not very efficient.
You could try stringi::stri_split_fixed
# Load stringi, which provides stri_split_fixed().
library(stringi)
# Split every INFO string on the literal ";" and bind the pieces to TBB as
# extra columns. simplify = TRUE requests a character matrix (one row per INFO
# value) instead of a list; omit_empty = TRUE drops empty pieces such as the
# one after the trailing ";".
# NOTE(review): per the stringi docs, shorter rows are padded with "" under
# simplify = TRUE (and with NA under simplify = NA) -- confirm if NA is wanted.
cbind(TBB, stri_split_fixed(TBB$INFO, ";", simplify = TRUE, omit_empty = TRUE))
# X.CHROM POS INFO 1 2 3
# 1 chr1 134324 SAMPLE=LNGHJ; SAMPLE=LNGHJ
# 2 chr2 2333 SAMPLE=dd;GERP;DDS=3; SAMPLE=dd GERP DDS=3
# 3 chr2 3334 SAMPLE=;GERP;DDS; SAMPLE= GERP DDS
You could also try good old plyr::rbind.fill or dplyr::rbind_list (superseded by dplyr::bind_rows in current versions of dplyr) in combination with strsplit:
# Split each INFO value on the literal ";" and turn its pieces into a one-row
# data frame (columns V1, V2, ...). bind_rows() stacks those rows and fills the
# columns a shorter row lacks with NA; the result is bound to TBB column-wise.
# dplyr::rbind_list() is deprecated/removed in current dplyr; bind_rows() is
# its replacement and accepts the list of data frames directly.
cbind(
  TBB,
  dplyr::bind_rows(
    lapply(
      strsplit(as.character(TBB$INFO), split = ";", fixed = TRUE),
      function(x) as.data.frame(t(x), stringsAsFactors = FALSE)
    )
  )
)
# X.CHROM POS INFO V1 V2 V3 V4
# 1 chr1 134324 SAMPLE=LNGHJ; SAMPLE=LNGHJ <NA> <NA> <NA>
# 2 chr2 2333 SAMPLE=dd;GERP;.; SAMPLE=dd GERP . <NA>
# 3 chr2 3334 SAMPLE=;GERP;DDS;CDC=dd; SAMPLE= GERP DDS CDC=dd
Or you can use cSplit from splitstackshape
# Load splitstackshape, which provides cSplit().
library(splitstackshape)
# Split the INFO column on ";" into separate columns; drop = FALSE keeps the
# original INFO column alongside the new ones.
cSplit(TBB, splitCols = "INFO", sep = ";", drop = FALSE)
You could also do this with base R alone, using read.table:
# Rebuild the example data frame from inline, whitespace-separated text.
dd <- read.table(header = TRUE, text = "X.CHROM POS INFO
chr1 134324 SAMPLE=LNGHJ;
chr2 2333 SAMPLE=dd;GERP;.;
chr2 3334 SAMPLE=;GERP;DDS;CDC=dd;")
# Re-parse the INFO strings as ";"-separated text, one column per field.
# fill = TRUE (the documented logical value) pads rows that have fewer fields;
# empty fields, "." and "NA" are all read as NA. The trailing ";" on every
# value yields an extra, all-NA last column.
# The outer parentheses print dd1 after assigning it.
(dd1 <- read.table(
  text = as.character(dd$INFO), sep = ";", fill = TRUE,
  na.strings = c("", ".", "NA")
))
# V1 V2 V3 V4 V5
# 1 SAMPLE=LNGHJ <NA> <NA> <NA> NA
# 2 SAMPLE=dd GERP <NA> <NA> NA
# 3 SAMPLE= GERP DDS CDC=dd NA
# Drop the last column of dd1 (the all-NA column produced by the trailing ";")
# and bind the remaining columns to dd. drop = FALSE keeps the subset a data
# frame, so its column name survives even when only one column remains.
cbind(dd, dd1[, -ncol(dd1), drop = FALSE])
# X.CHROM POS INFO V1 V2 V3 V4
# 1 chr1 134324 SAMPLE=LNGHJ; SAMPLE=LNGHJ <NA> <NA> <NA>
# 2 chr2 2333 SAMPLE=dd;GERP;.; SAMPLE=dd GERP <NA> <NA>
# 3 chr2 3334 SAMPLE=;GERP;DDS;CDC=dd; SAMPLE= GERP DDS CDC=dd
If you like our work, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With