I have a data frame like below
V1 V2 V3 V4 V5
ATP1A2 ATP1A1 220948_s_at LARS2 204016_at
CAPRIN1 ATP1A2 LARS2
ATP1A1 ATP1A2 203296_s_at MTHFD1 202309_at
CBX3 ATP1B1 201243_s_at PRKCSH 200707_at
AUP1 ATP1B1 PRKCSH 214080_x_at
LARS2 AUP1 220525_s_at PRKCSH
MTHFD1 AUP1 VDAC2 211662_s_at
VDAC2 B3GNT3 204856_at VDAC2
PRKCSH B3GNT3
ATP1B1 CAPRIN1 200722_s_at
B3GNT3 CAPRIN1 200723_s_at
CAPRIN1
CBX3 200037_s_at
I want to remove those element from V2 that V3 is empty or from V4 that V5 is empty for example in V3, the second element is empty, so both the second element of V3 and V2 should be removed. how the output will looks like is seen below.
V1 V2 V3 V4 V5
ATP1A2 ATP1A1 220948_s_at LARS2 204016_at
CAPRIN1 ATP1A2 203296_s_at MTHFD1 202309_at
ATP1A1 ATP1B1 201243_s_at PRKCSH 200707_at
CBX3 AUP1 220525_s_at PRKCSH 214080_x_at
AUP1 B3GNT3 204856_at VDAC2 211662_s_at
LARS2 CAPRIN1 200722_s_at
MTHFD1 CAPRIN1 200723_s_at
VDAC2 CBX3 200037_s_at
PRKCSH
ATP1B1
B3GNT3
I can detect how to find those empty string in each column but the problem is how to remove two columns rows based on that and do it for all data frame
# to find the empty elements
Rcolumn3 <-which(df$V3=="")
Rcolumn5 <-which(df$V5=="")
df<- structure(list(V1 = structure(c(3L, 7L, 2L, 8L, 5L, 9L, 10L,
12L, 11L, 4L, 6L, 1L, 1L), .Label = c("", "ATP1A1", "ATP1A2",
"ATP1B1", "AUP1", "B3GNT3", "CAPRIN1", "CBX3", "LARS2", "MTHFD1",
"PRKCSH", "VDAC2"), class = "factor"), V2 = structure(c(1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 6L, 7L), .Label = c("ATP1A1",
"ATP1A2", "ATP1B1", "AUP1", "B3GNT3", "CAPRIN1", "CBX3"), class = "factor"),
V3 = structure(c(9L, 1L, 6L, 5L, 1L, 8L, 1L, 7L, 1L, 3L,
4L, 1L, 2L), .Label = c("", "200037_s_at", "200722_s_at",
"200723_s_at", "201243_s_at", "203296_s_at", "204856_at",
"220525_s_at", "220948_s_at"), class = "factor"), V4 = structure(c(2L,
2L, 3L, 4L, 4L, 4L, 5L, 5L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"LARS2", "MTHFD1", "PRKCSH", "VDAC2"), class = "factor"),
V5 = structure(c(4L, 1L, 3L, 2L, 6L, 1L, 5L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "200707_at", "202309_at", "204016_at",
"211662_s_at", "214080_x_at"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-13L))
you can get the data by
This is a way:
#convert to character
df[] <- lapply(df, as.character)
#convert the equivalent V2 or V4 columns' rows into ''
a <- which(df$V3=='')
b <- which(df$V5=='')
df$V2[a] <- ''
df$V4[b] <- ''
At this point I ve written a function that pushes the blanks to the end/bottom of a vector/column:
push_up <- function(vec){
c(vec[-which(vec=='')], vec[which(vec=='')])
}
and apply that to the df:
df[] <- lapply(df, push_up)
Output:
> df
V1 V2 V3 V4 V5
1 ATP1A2 ATP1A1 220948_s_at LARS2 204016_at
2 CAPRIN1 ATP1A2 203296_s_at MTHFD1 202309_at
3 ATP1A1 ATP1B1 201243_s_at PRKCSH 200707_at
4 CBX3 AUP1 220525_s_at PRKCSH 214080_x_at
5 AUP1 B3GNT3 204856_at VDAC2 211662_s_at
6 LARS2 CAPRIN1 200722_s_at
7 MTHFD1 CAPRIN1 200723_s_at
8 VDAC2 CBX3 200037_s_at
9 PRKCSH
10 ATP1B1
11 B3GNT3
12
13
Or df[df$V1 != '',] to remove rows 12,13
We could try set from data.table as there are many columns (created 'df1' with a couple more columns). The basic approach is similar to @Lyzander's, i.e. changing the elements that are not '' in alternating columns to '' based on the '' values in the other column and then "push" the '' elements to the end of each column. (Assumption is that the modification starts for alternating columns from the 2nd column).
library(data.table)
df1 <- cbind(df, setNames(df[-1], paste0('V',6:9)))
Changed the column class from factor to character as it will be easier for changing the values.
df1[] <- lapply(df1, as.character)
Convert the 'data.frame' to 'data.table'
setDT(df1)
Create a numeric index for the alternating column
j1 <- as.integer(seq(3, ncol(df1), by=2))
Use set to replace the elements in the 'j-1th' column based on the presence of '' in the 'jth' column, order the 'jth' and 'j-1th' columns based on the presence of '' (2nd and 3rd set lines). I am not sure if this is the most efficient though (considering three lines of set), but set in general is very efficient as the overhead of [.data.table is avoided
for(j in j1){
set(df1, i=which(df1[[j]]==''), j=j-1L, value='')
set(df1, i=NULL, j=j, value= df1[[j]][order(df1[[j]]=='')])
set(df1, i=NULL, j=j-1L, value= df1[[j-1L]][order(df1[[j-1L]]=='')])
}
Remove the '' elements at the end of V1 column to get the expected result.
df1[V1!='']
# V1 V2 V3 V4 V5 V6 V7 V8
#1: ATP1A2 ATP1A1 220948_s_at LARS2 204016_at ATP1A1 220948_s_at LARS2
#2: CAPRIN1 ATP1A2 203296_s_at MTHFD1 202309_at ATP1A2 203296_s_at MTHFD1
#3: ATP1A1 ATP1B1 201243_s_at PRKCSH 200707_at ATP1B1 201243_s_at PRKCSH
#4: CBX3 AUP1 220525_s_at PRKCSH 214080_x_at AUP1 220525_s_at PRKCSH
#5: AUP1 B3GNT3 204856_at VDAC2 211662_s_at B3GNT3 204856_at VDAC2
#6: LARS2 CAPRIN1 200722_s_at CAPRIN1 200722_s_at
#7: MTHFD1 CAPRIN1 200723_s_at CAPRIN1 200723_s_at
#8: VDAC2 CBX3 200037_s_at CBX3 200037_s_at
#9: PRKCSH
#10: ATP1B1
#11: B3GNT3
# V9
#1: 204016_at
#2: 202309_at
#3: 200707_at
#4: 214080_x_at
#5: 211662_s_at
#6:
#7:
#8:
#9:
#10:
#11:
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With