I have been busy with this question since last night and I could not figure out how to do it.
What I want to do is to match df1 strings to df2 strings and get the similar ones out
what I do is like this
# a function to arrange the data to have IDs for each string
normalize <- function(x, delim) {
x <- gsub(")", "", x, fixed=TRUE)
x <- gsub("(", "", x, fixed=TRUE)
idx <- rep(seq_len(length(x)), times=nchar(gsub(sprintf("[^%s]",delim), "", as.character(x)))+1)
names <- unlist(strsplit(as.character(x), delim))
return(setNames(idx, names))
}
# a function to arrange the second df
lookup <- normalize(df2[,1], ",")
# a function to match them and give the IDs
process <- function(s) {
lookup_try <- lookup[names(s)]
found <- which(!is.na(lookup_try))
pos <- lookup_try[names(s)[found]]
return(paste(s[found], pos, sep="-"))
#change the last line to "return(as.character(pos))" to get only the result as in the comment
}
then I get the results like this
res <- lapply(colnames(df1), function(x) process(normalize(df1[,x], ";")))
This gives me the row number of each string from df1 and row number of string from df2 that matched. so the output of this data looks like this
> res
$s1
[1] "3-4" "4-1" "5-4"
$s2
[1] "2-4" "3-15" "7-16"
The first column IDs is the row number of df2 which matched with strings in df1 The second column No is the number of times it matched The third column ID-col-n is the row number of string in df1 which matched with that string + their column name the forth is string from first column of the df1 which matched with that string the fifth column is the string of second column which matched with that string and so on
We can compare two columns in R by using ifelse(). This statement is used to check the condition given and return the data accordingly.
The match() function in R is used to: Return the index position of the first element present in a vector with a specific value. Return the index position of the first matching elements of the first vector in the second vector.
To find the row corresponding to a nearest value in an R data frame, we can use which. min function after getting the absolute difference between the value and the column along with single square brackets for subsetting the row.
In this case I find it easier to switch the data to the wide format and before merging it to the lookup table.
You could try:
library(tidyr)
library(dplyr)
df1_tmp <- df1
df2_tmp <- df2
#add numerical id to df1_tmp to keep row information
df1_tmp$id <- seq_along(df1_tmp[,1])
#switch to wide and unnest rows with several strings
df1_tmp <- gather(df1_tmp,key="s_val",value="query_string",-id)
df1_tmp <- df1_tmp %>%
mutate(query_string = strsplit(as.character(query_string), ";")) %>%
unnest(query_string)
df2_tmp$IDs. <- gsub("[()]", "", df2_tmp$IDs.)
#add numerical id to df1_tmp to keep row information
df2_tmp$id <- seq_along(df2_tmp$IDs.)
#unnest rows with several strings
df2_tmp <- df2_tmp %>%
mutate(IDs. = strsplit(as.character(IDs.), ",")) %>%
unnest(IDs.)
res <- merge(df1_tmp,df2_tmp,by.x="query_string",by.y="IDs.")
res$ID_col_n <- paste(paste0(res$id.x,res$s_val))
res$total_id <- 1:nrow(res)
res <- spread(res,s_val,value=query_string,fill=NA)
res
#summarize to get required output
res <- res %>% group_by(id.y) %>%
mutate(No=n()) %>% group_by(id.y,No) %>%
summarise_each(funs(paste(.[!is.na(.)],collapse=","))) %>%
select(-id.x,-total_id)
colnames(res)[colnames(res)=="id.y"]<-"IDs"
res$df1_colMatch_counts <- rowSums(res[,-(1:3)]!="")
df2_counts <- df2_tmp %>% group_by(id) %>% summarize(df2_string_counts=n())
res <- merge(res,df2_counts,by.x="IDs",by.y="id")
res
res
IDs No ID_col_n s1 s2 df1_colMatch_counts df2_string_counts
1 1 1 4s1 P41182 1 2
2 2 1 4s1 P41182 1 2
3 3 1 4s1 P41182 1 2
4 4 3 2s2,3s1,5s1 Q9Y6Q9,Q09472 Q92831 2 4
5 15 1 3s2 P54612 1 5
6 16 1 7s2 O15143 1 7
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With