The apply function visits each row in the Dataset, and the subset function finds records that are similar to the row currently being visited by apply.
findFragment <- function(Dataset){
  df1 <<- data.frame(Col9 = character(), aid = character(), month = as.Date(character()),
                     year = as.Date(character()), Outcome = character(), ser_no = character(),
                     Similar = character(), stringsAsFactors = FALSE)
  rowind <<- 0
  start.time <- Sys.time()
  apply(Dataset, 1, function(slic){
    rowind <<- rowind + 1
    fragment <- subset(Dataset, subset = ser_no %in% slic[1] &
                         Outcome %in% slic[2] &
                         year %in% slic[3] &
                         month %in% slic[4] &
                         code %in% slic[5] &
                         name %in% slic[6] &
                         !(aid %in% slic[7]) &
                         (as.numeric(Percentage) <= as.numeric(slic[8]) + 0.01 &
                          as.numeric(Percentage) >= as.numeric(slic[8]) - 0.01))
    # Refiltering results:
    # if the result includes more than 3 rows, refilter these rows and keep only
    # those whose Percentage is within +/- 0.0001
    if(nrow(fragment) > 3){
      fragment <- subset(fragment, subset = as.numeric(Percentage) <= as.numeric(slic[8]) + 0.0001 &
                           as.numeric(Percentage) >= as.numeric(slic[8]) - 0.0001)
    }
    # Writing data the way below is extremely slow (takes 30+ minutes):
    # fragmentize$Similiar[rowind] <<- paste(as.character(unlist(fragment[7])), collapse = ",")
    # Writing data this way brings total execution time to 9 minutes:
    # df1 <<- rbind(df1, data.frame(Col9 = slic[9],
    #                               aid = slic[7],
    #                               ser_no = slic[1],
    #                               Outcome = slic[2],
    #                               month = slic[4],
    #                               year = slic[3],
    #                               Similar = paste(as.character(unlist(fragment[7])), collapse = ",")),
    #               make.row.names = FALSE)
  })
  # df1 <<- merge(x = Dataset, y = df1, by = c("Col9", "aid", "ser_no", "Outcome", "month", "year"), all = TRUE)
  cat("Completed in", Sys.time() - start.time)
}
fragmentize$Similiar<-0
findFragment(fragmentize)
It is taking 4 minutes 40 seconds just to find the subsets. Is there a more effective approach to find the subsets quickly and write the results back in the least possible time?
Test dataframe I (takes 4 minutes 10 seconds):
fragmentize<-data.frame(ser_no=rep("A1",35243),Outcome=rep("A2",35243),year=rep("A3",35243),month=rep("A4",35243),code=rep("A5",35243),name=rep("A6",35243),aid=rep(letters[1:4],35243),Percentage=rep(1,35243),col9=rep("A9",35243),col10=rep("A10",35243),col11=rep("A11",35243),col12=rep("A12",35243),col13=rep("A13",35243),col4=rep("A14",35243),col15=rep("A15",35243),col16=rep("A16",35243),col7=rep("A17",35243),col8=rep("A18",35243),col19=rep("A19",35243),col20=rep("A20",35243),col21=rep("A21",35243),col22=rep("A22",35243),col23=rep("A23",35243),col24=rep("A24",35243),col25=rep("A25",35243),col26=rep("A26",35243),col27=rep("A27",35243),col28=rep("A28",35243),col29=rep("A29",35243))
Test dataframe II: it replicates the pattern in my actual dataframe. Execution time is 21 minutes, which is longer than the 4 minutes 40 seconds for my actual dataframe.
fragmentize<-data.frame(col9=rep("A9",35243),col10=rep("A10",35243),col11=rep("A11",35243),col12=rep("A12",35243),col13=rep("A13",35243),col4=rep("A14",35243),col15=rep("A15",35243),col16=rep("A16",35243),col7=rep("A17",35243),col8=rep("A18",35243),col19=rep("A19",35243),col20=rep("A20",35243),col21=rep("A21",35243),col22=rep("A22",35243),col23=rep("A23",35243),col24=rep("A24",35243),col25=rep("A25",35243),col26=rep("A26",35243),col27=rep("A27",35243),col28=rep("A28",35243),col29=rep("A29",35243))
library(random)
ser_noVal<-rep(1:831)
OutcomeVal<-c("Aggressive","Balanced","Positive","Negative","Neutral","Conservative")
yearVal<-c(2013:2017)
monthVal<-c(1:12)
codeVal <- c("A", "B", "C")
nameVal<-randomStrings(n=33, len=2, digits=FALSE,loweralpha=TRUE, unique=TRUE, check=TRUE)
aidVal<-randomStrings(n=222, len=4, digits=TRUE,loweralpha=TRUE, unique=TRUE, check=TRUE)
percentVal<-c(1:1561)
fragmentize$ser_no[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(ser_noVal, c(6,70,4,83,1,92,1,1,6,16,8,3,376,63,735,23,28,3,24,1,84,13,119,7,5,4,1,29,1,27,7,3,9,7,4,11,7,14,2,1,1,16,5,150,31,10,1,1049,2,47,36,2,41,37,6,81,55,6,11,22,3,10,30,4,8,4,175,9,6,1,1,83,20,1,34,38,1,3,41,6,19,1,13,65,42,115,53,18,19,36,5,16,20,38,1,36,1,1,1,4,7,5,19,7,8,39,113,4,1,21,21,2,12,7,6,11,33,19,1,1,53,2,195,79,1,1,2,2,3,1,7,3,11,5,2,1,16,2,14,2,2,15,4,54,4,3,2,40,49,2,1,3,22,9,25,5,42,8,5,6,8,8,3,179,2,4,16,131,113,20,1,13,27,57,52,34,7,4,1,3,22,21,577,16,28,31,82,1,1,74,26,25,1,23,1,29,116,33,1,3,9,8,11,12,1,2,3,11,1,1,13,3,22,13,1,15,2,4,20,1,2,7,2,2,18,147,8,2,50,5,25,2,12,1,98,6,6,37,55,20,9,6,3,8,4,2,2,9,2,32,6,183,10,141,755,34,1,13,3,1,83,1,10,1,566,27,1,38,1,45,7,44,43,11,18,259,36,64,6,19,31,33,355,70,14,26,41,619,139,1,2,45,76,2,49,5,19,51,30,16,32,12,10,1,4,2,80,25,45,84,50,346,125,60,61,321,6,14,17,13,37,7,4,61,79,207,68,111,49,75,425,92,50,329,4,22,2,7,88,1265,3,22,41,10,29,1,37,3,1,13,20,35,10,33,26,5,1,1,1,1,1,2,3,6,14,2,4,2,20,921,132,9,8,114,438,57,37,10,1778,21,10,44,1,4,3,10,48,1,100,123,6,15,234,3,15,3,14,13,46,39,2,72,3,97,97,10,13,2,38,3,4,17,49,143,5,76,61,11,17,16,40,1,1,1,1,1,9,6,1,2,20,28,30,4,30,14,9,80,1,32,7,20,4,26,2,66,4,2,1,2,12,2,8,2,12,56,9,1023,33,19,1,3,46,1,6,88,40,84,85,35,28,314,3,7,61,79,34,55,2,23,1,10,1,2,77,6,70,40,1,4,93,1,48,3,5,17,2,8,1,2,1,7,27,13,23,4,4,4,7,1,2,1,1,2,18,13,44,32,1,2,2,8,103,1,6,366,4,4,5,2,6,15,6,30,10,1,3,1,2,4,20,8,1,86,3,3,3,2,4,76,3,436,4,1,10,28,17,39,1,1,896,21,12,24,1,177,29,8,3,36,14,2,6,9,1,17,5,2,113,48,2,8,15,155,34,465,23,1,222,1,22,14,23,4,11,3,18,12,17,2,5,3,7,4,2,1,1,1,2,2,9,185,22,11,1,1,14,3,3,2,11,2,4,2,1,4,17,4,213,7,62,1,210,126,38,1,391,2,6,67,44,21,19,16,98,14,4,1,1,2,197,8,31,1,48,1,10,9,36,24,54,65,1,5,5,12,224,13,41,28,7,339,50,5,9,2,3,3,1,1,1,2,7,1,35,11,25,1,2,12,23,4,14,6,2,3,20,36,7,2,6,10,22,1,2,6,2,18,14,15,10,24,11,3,78,2,1,10,236,293,25,43,5,14,4,32,29,4,1,6,6,9,1,202,173,1,12,1,18,1,55,56,3,9,4,3,12,4,2,32,3,22,7,45,15,4,5,4,3,2,1,7,7,12,4,1,2,8,166,1,10,9,15,1,1,11,8,26,67,1,288,39,3,31,4,25,6,7,4,22,5,3,1,71,19,3,5,19,4,27,21,4,22,5,1,52,1,7,70,27,277,1,4,1,80,1,141,10,4,6,3,11,5,6,15,1,1,1,6,1,2))
fragmentize$Outcome[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(OutcomeVal, c(21775,3034,126,10,10277,21))
fragmentize$year[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(yearVal, c(11,2709,8476,11308,12739))
fragmentize$month[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(monthVal, c(2536, 2535, 2780, 2616, 2902, 3190, 3274, 3553, 3623, 3515, 2339, 2380))
fragmentize$code[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(codeVal, c(7610,24718,2915))
fragmentize$name[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(nameVal, c(218, 917, 1736, 555, 42, 76, 79, 267, 1988, 116, 194, 161, 12, 353, 261, 285, 382, 6050, 2053, 45, 1, 276, 4598, 7543, 337, 14, 1, 591, 1020, 657, 139, 3995, 281))
fragmentize$aid[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(aidVal, c(310, 82, 26, 6, 493, 175, 31, 4, 19, 160, 263, 248, 68, 20, 666, 303, 6, 125, 190, 8, 108, 93, 206, 11, 278, 2, 273, 3, 3, 4, 285, 1, 555, 44, 93, 21, 94, 5309, 46, 25, 7, 249, 67, 20, 3, 15, 15, 16, 5, 12, 5, 17, 67, 44, 332, 57, 358, 25, 204, 8, 612, 108, 47, 273, 16, 20, 516, 16, 344, 33, 153, 4, 43, 73, 14, 37, 88, 7, 26, 23, 116, 33, 28, 66, 24, 21, 18, 32, 96, 6, 16, 3, 176, 121, 109, 177, 8, 30, 156, 117, 24, 90, 199, 236, 24, 25, 34, 20, 50, 14, 19, 30, 8, 20, 3, 10, 55, 24, 26, 17, 17, 29, 147, 148, 6, 2031, 65, 1135, 632, 91, 544, 1073, 11, 617, 15, 18, 2, 226, 182, 89, 513, 23, 149, 6, 398, 148, 13, 129, 323, 26, 4, 4, 155, 63, 32, 64, 23, 2, 120, 1, 2, 1, 10, 25, 120, 993, 5, 335, 40, 539, 413, 116, 78, 15, 38, 2, 15, 34, 271, 3, 604, 375, 52, 47, 459, 457, 177, 28, 293, 49, 266, 96, 1836, 18, 127, 18, 246, 5, 8, 4, 11, 102, 24, 21, 63, 57, 25, 22, 2, 1, 1, 51, 74, 56, 154, 97, 21, 31, 4, 3, 1, 11))
fragmentize$Percentage[sample(1:nrow(fragmentize), nrow(fragmentize), FALSE)] <- rep(percentVal, c(116,84,64,108,25,36,104,6,17,21,129,70,32,34,18,234,37,14,102,4,5,24,57,19,130,7,22,81,123,9,1,6,4,7,103,22,30,2,17,18,44,176,3,12,71,7,20,52,11,10,7,81,7,6,5,3,45,15,9,116,10,78,5,39,36,7,34,7,44,5,14,58,7,23,386,13,46,1,79,12,18,4,15,6,1009,6,47,55,36,18,15,2,1,2,297,39,6,18,50,33,18,37,632,5,26,28,31,187,15,26,9,1,38,27,9,25,2,4,486,49,11,104,130,6,3,9,6,3,16,5,9,392,96,9,4,7,1,39,35,8,3,12,14,94,309,59,3,15,1,18,85,277,13,6,3,4,68,204,2,7,59,5,19,189,1,440,2,44,109,151,2,45,6,3,131,18,23,17,334,1,103,27,18,2,27,2,75,98,7,19,2,72,1,10,82,17,256,20,17,1,92,2,1,13,71,3,21,13,86,1,16,1,83,103,226,1,26,20,1,63,1,7,9,10,51,2,155,70,11,4,10,2,49,152,9,2,42,9,21,53,33,11,1,101,8,49,1,3,1,2,4,141,9,17,163,44,7,36,121,17,32,6,4,2,26,9,4,72,1,22,70,3,1,4,1,74,24,41,39,30,8,1,27,4,30,1,73,4,21,10,9,8,117,9,65,3,6,24,14,2,4,89,6,2,20,49,40,266,3,4,1,23,1,28,14,17,22,28,20,1,6,58,25,10,4,6,37,168,11,8,3,58,4,99,62,20,22,15,2,20,1,32,3,3,9,4,19,1,7,33,1,18,4,1,13,13,11,38,27,1,20,176,18,10,1,1,15,20,3,21,13,4,49,6,10,22,2,1,12,10,78,7,5,4,13,7,22,5,8,10,72,2,17,1,9,1,13,14,129,21,2,12,1,13,51,12,138,3,3,9,9,6,17,11,13,4,1,6,15,11,1,102,15,2,1,4,5,48,7,12,4,2,2,6,20,9,2,28,25,1,1,12,16,30,12,10,4,3,2,88,13,1,216,13,9,1,3,11,12,9,7,1,1,48,6,2,16,2,1,4,2,12,11,16,11,1,7,67,3,2,1,6,323,23,1,25,5,2,5,57,10,50,5,97,4,4,19,5,2,1,5,5,4,7,4,2,6,4,1,1,2,1,5,2,13,13,1,7,1,6,3,43,3,1,47,8,5,1,179,97,5,10,40,1,5,4,3,11,1,4,2,8,1,1,3,7,5,1,54,1,7,2,3,43,1,1,3,3,1,191,27,1,3,1,19,51,3,3,3,33,4,41,2,15,2,2,6,114,1,1,1,2,2,13,1,1,3,1,1,3,3,1,1,107,2,7,2,10,2,1,1,30,1,42,1,1,67,1,1,11,3,48,32,1,4,2,58,1,1,149,2,17,1,40,97,1,2,6,1,20,1,1,28,127,30,1,1,14,13,5,84,5,2,4,1,86,4,13,15,18,18,11,8,3,1,12,49,92,5,3,2,100,12,81,1,6,64,7,15,6,20,13,82,46,19,26,7,67,2,6,9,1,29,3,1,10,2,64,5,18,107,203,9,2,2,101,52,2,6,1,70,7,10,86,1,1,1,7,1,15,1,1,51,4,44,5,15,2,50,79,27,12,64,1,17,32,54,44,114,1,34,4,12,96,56,1,31,5,1,3,37,4,130,5,4,3,1,26,2,20,41,9,1,37,4,18,1,2,157,30,5,5,27,6,30,1,20,42,1,51,34,7,12,16,1,57,1,1,40,29,1,37,36,32,2,14,43,3,4,10,2,2,17,36,27,10,1,53,101,111,1,10,56,2,1,43,152,8,103,2,29,2,44,2,18,44,87,49,5,43,16,13,1,53,26,30,6,17,7,2,24,36,4,41,2,17,1,24,1,7,5,8,14,1,38,45,14,38,22,10,2,11,8,13,3,28,1,19,1,18,14,15,2,26,2,15,27,1,3,22,28,49,10,2,1,20,22,77,1,2,4,122,1,1,44,1,14,15,1,70,4,4,25,54,10,34,13,17,2,2,23,30,13,1,2,10,15,1,14,30,23,1,1,21,15,12,1,13,2,1,6,26,3,12,1,62,10,15,3,21,34,14,3,10,2,35,18,6,1,90,44,1,1,172,3,7,79,13,37,2,13,23,8,2,10,59,1,12,1,107,6,11,9,25,4,1,2,26,14,18,8,322,1,19,1,6,162,1,9,19,2,9,5,3,12,50,4,16,9,5,34,14,10,2,1,46,40,15,3,13,55,20,93,8,1,2,25,8,7,58,14,17,3,1,7,2,5,3,4,1,131,3,1,2,7,18,45,6,3,12,8,11,18,5,13,7,35,7,1,8,4,5,15,49,6,1,1,80,11,2,5,1,5,19,20,179,22,1,10,1,9,48,111,3,21,1,3,17,20,2,2,2,7,3,6,1,6,8,12,5,5,1,129,1,1,2,10,6,8,16,8,2,2,9,20,1,74,5,42,5,1,1,1,2,14,4,12,9,47,12,38,1,1,3,8,34,1,5,1,4,4,21,2,1,1,14,95,23,14,2,1,90,7,7,32,8,6,1,2,19,12,1,2,7,30,4,1,10,1,2,7,1,7,3,16,1,9,4,3,5,1,76,3,17,8,1,6,70,3,1,11,3,7,27,1,2,40,2,3,7,6,8,3,1,49,14,56,1,17,2,5,5,70,5,13,3,10,2,3,10,1,2,4,5,94,1,3,2,1,5,2,6,4,4,5,6,12,1,16,68,1,4,11,4,4,1,9,1,6,3,9,5,4,50,3,1,12,4,1,5,2,24,35,2,3,2,60,1,3,2,6,3,2,2,9,7,1,11,12,5,4,3,56,7,1,5,1,1,4,1,18,5,1,1,9,159,1,11,2,8,2,3,1,1,9,3,7,2,68,2,5,43,2,4,38,1,5,2,26,1,4,2,1,5,10,1,4,2,1,8,2,6,86,2,2,1,10,3,1,4,10,3,35,17,3,4,14,1,1,17,4,6,39,3,13,50,6,3,3,38,4,1,3,2,26,1,5,28,2,5,1,1,21,1,13,6,2,4,6,13,3,5,9,3,2,1,32,1,8,20,2,2,8,2,2,30,
1,9,2,4,4,4,1,13,1,45,2,5,3,1,1,23,12,1,2,1,1,1,26,1,14,1,1,6,1,10,1,10,7,2,2,1,1,1,4,11,4,2,2,1,3,2,19,8,5,4,3,1,1,52,4,1,1,2,3,4,3,1,23,23,2,2,2,1,1,9,6,2,26,1,1,2,2,1,1,1,1,10,4,7,27,4,2,1,1,24,3,3,2,1,3,5,2,4,14,1,1,4,3,2,1,18,1,1,2,4,2,1,5,2,1,5,1,4,1,1,5,1,5,1,1,3,2,1,5,1,3,1,1,1,3,3,2,1,5,1,4,5,4,3,2,1,1,1,4,6,2,1,1,1,9,1,2,1,3,1,1,1,5,5,8,1,1,1,2,6,2,2,4,1,3,2,2,1,9,1,2,4,1,3,25))
rm(ser_noVal,OutcomeVal,yearVal,monthVal,codeVal,nameVal,aidVal,percentVal)
As far as I have understood the question, the OP wants to find similar records in his production data set which have identical values in ser_no, Outcome, year, month, code, and name, and approximately equal values in Percentage (within a given tolerance). The OP has requested to append the aid values of any matching rows except the aid value of the actual row.
One possible approach is a non-equi self-join using data.table:
library(data.table)
eps <- 0.01
system.time(
  setDT(fragmentize, key = c("ser_no", "Outcome", "year", "month", "code", "name", "aid"))[
    , Percentage := as.numeric(Percentage)][
    , similar := fragmentize[
        .(ser_no = ser_no, Outcome = Outcome, year = year, month = month,
          code = code, name = name, aid = aid,
          lb = Percentage * (1 - eps), ub = Percentage * (1 + eps)),
        on = .(ser_no, Outcome, year, month, code, name,
               Percentage >= lb, Percentage <= ub),
        by = .EACHI, toString(setdiff(unique(x.aid), i.aid))][, V1]]
)
On my system, the OP's Test dataframe II took

   user  system elapsed
   0.61    0.00    0.64

which is orders of magnitude faster than the 21 minutes the OP has reported for this sample data set.
As a result, fragmentize has gained an additional column, similar:
str(fragmentize)
Classes ‘data.table’ and 'data.frame': 35243 obs. of 30 variables:
 $ col9      : Factor w/ 1 level "A9": 1 1 1 1 1 1 1 1 1 1 ...
 $ col10     : Factor w/ 1 level "A10": 1 1 1 1 1 1 1 1 1 1 ...
 $ col11     : Factor w/ 1 level "A11": 1 1 1 1 1 1 1 1 1 1 ...
 $ col12     : Factor w/ 1 level "A12": 1 1 1 1 1 1 1 1 1 1 ...
 $ col13     : Factor w/ 1 level "A13": 1 1 1 1 1 1 1 1 1 1 ...
 $ col4      : Factor w/ 1 level "A14": 1 1 1 1 1 1 1 1 1 1 ...
 $ col15     : Factor w/ 1 level "A15": 1 1 1 1 1 1 1 1 1 1 ...
 $ col16     : Factor w/ 1 level "A16": 1 1 1 1 1 1 1 1 1 1 ...
 $ col7      : Factor w/ 1 level "A17": 1 1 1 1 1 1 1 1 1 1 ...
 $ col8      : Factor w/ 1 level "A18": 1 1 1 1 1 1 1 1 1 1 ...
 $ col19     : Factor w/ 1 level "A19": 1 1 1 1 1 1 1 1 1 1 ...
 $ col20     : Factor w/ 1 level "A20": 1 1 1 1 1 1 1 1 1 1 ...
 $ col21     : Factor w/ 1 level "A21": 1 1 1 1 1 1 1 1 1 1 ...
 $ col22     : Factor w/ 1 level "A22": 1 1 1 1 1 1 1 1 1 1 ...
 $ col23     : Factor w/ 1 level "A23": 1 1 1 1 1 1 1 1 1 1 ...
 $ col24     : Factor w/ 1 level "A24": 1 1 1 1 1 1 1 1 1 1 ...
 $ col25     : Factor w/ 1 level "A25": 1 1 1 1 1 1 1 1 1 1 ...
 $ col26     : Factor w/ 1 level "A26": 1 1 1 1 1 1 1 1 1 1 ...
 $ col27     : Factor w/ 1 level "A27": 1 1 1 1 1 1 1 1 1 1 ...
 $ col28     : Factor w/ 1 level "A28": 1 1 1 1 1 1 1 1 1 1 ...
 $ col29     : Factor w/ 1 level "A29": 1 1 1 1 1 1 1 1 1 1 ...
 $ ser_no    : int 1 1 1 1 1 1 2 2 2 2 ...
 $ Outcome   : chr "Aggressive" "Aggressive" "Aggressive" "Aggressive" ...
 $ year      : int 2015 2015 2016 2017 2015 2016 2014 2014 2015 2015 ...
 $ month     : int 11 11 5 5 2 10 5 10 2 5 ...
 $ code      : chr "A" "B" "B" "B" ...
 $ name      : chr "wt" "Ds" "UF" "Of" ...
 $ aid       : chr "UuaR" "uwIL" "9WAx" "h5eH" ...
 $ Percentage: num 255 1295 168 549 85 ...
 $ similar   : chr "" "" "" "" ...
 - attr(*, ".internal.selfref")=<externalptr>
 - attr(*, "sorted")= chr "ser_no" "Outcome" "year" "month" ...
As similar is empty for the vast majority of rows, we show only the non-empty rows and only the relevant columns. Setting the keys has already sorted fragmentize, which makes it easier to verify the result:
fragmentize[similar != "", .(ser_no, Outcome, year, month, code, name, aid,
Percentage, similar)]
    ser_no Outcome year month code name aid Percentage similar
 1: 13 Aggressive 2016 3 B gZ 21So 525 59PL
 2: 13 Aggressive 2016 3 B gZ 59PL 529 21So
 3: 15 Aggressive 2017 1 B nt C2i4 1311 uwIL
 4: 15 Aggressive 2017 1 B nt uwIL 1323 C2i4
 5: 15 Aggressive 2017 6 B Wj hMo4 308 mrDx
 6: 15 Aggressive 2017 6 B Wj mrDx 308 hMo4
 7: 48 Aggressive 2016 11 B gZ 4LVK 1216 FtSG
 8: 48 Aggressive 2016 11 B gZ FtSG 1205 4LVK
 9: 48 Aggressive 2017 5 B nt 59PL 85 f1Fh
10: 48 Aggressive 2017 5 B nt f1Fh 85 59PL
11: 48 Aggressive 2017 7 B Wj lVpw 1021 mz3h
12: 48 Aggressive 2017 7 B Wj mz3h 1021 lVpw
13: 252 Aggressive 2016 6 B gZ bkk6 75 spPd
14: 252 Aggressive 2016 6 B gZ spPd 75 bkk6
15: 255 Aggressive 2015 9 B Wj 59PL 29 dceG
16: 255 Aggressive 2015 9 B Wj dceG 29 59PL
17: 265 Aggressive 2017 9 B FB FodL 756 twvT
18: 265 Aggressive 2017 9 B FB twvT 759 FodL
19: 276 Aggressive 2016 11 A gZ 59PL 949 M6sO
20: 276 Aggressive 2016 11 A gZ M6sO 944 59PL
21: 288 Aggressive 2017 6 B gZ 21So 878 Y9gk
22: 288 Aggressive 2017 6 B gZ Y9gk 882 21So
23: 340 Aggressive 2015 7 B nt FtSG 763 kBpV
24: 340 Aggressive 2015 7 B nt kBpV 767 FtSG
25: 340 Aggressive 2016 4 B Ds 21So 731 bkk6
26: 340 Aggressive 2016 4 B Ds bkk6 727 21So
27: 340 Aggressive 2017 10 B nt B4fM 673 M6sO
28: 340 Aggressive 2017 10 B nt M6sO 678 B4fM
29: 340 Neutral 2017 8 A Oa 59PL 872 Vyl1
30: 340 Neutral 2017 8 A Oa Vyl1 872 59PL
31: 340 Neutral 2017 9 B FB 59PL 723 75iU
32: 340 Neutral 2017 9 B FB 75iU 723 59PL
33: 370 Aggressive 2015 6 A gZ 3Xre 132 DWZh
34: 370 Aggressive 2015 6 A gZ DWZh 132 3Xre
35: 370 Aggressive 2016 5 B gZ 1reu 1162 jSL1
36: 370 Aggressive 2016 5 B gZ jSL1 1158 1reu
37: 370 Aggressive 2017 3 B Wj 21So 872 spPd
38: 370 Aggressive 2017 3 B Wj spPd 867 21So
39: 370 Aggressive 2017 4 B FB 0Xza 1547 NXGE
40: 370 Aggressive 2017 4 B FB NXGE 1535 0Xza
41: 379 Aggressive 2015 2 B FB mJAy 133 zQZw
42: 379 Aggressive 2015 2 B FB zQZw 133 mJAy
43: 379 Aggressive 2015 7 B gZ FtSG 201 spPd
44: 379 Aggressive 2015 7 B gZ spPd 201 FtSG
45: 379 Aggressive 2016 8 B Wj 75iU 95 HzTb
46: 379 Aggressive 2016 8 B Wj HzTb 95 75iU
47: 379 Aggressive 2016 9 B gZ F9c3 244 LpB1
48: 379 Aggressive 2016 9 B gZ LpB1 246 F9c3
49: 379 Aggressive 2016 12 B nt 4DGD 507 zYVN
50: 379 Aggressive 2016 12 B nt zYVN 504 4DGD
51: 379 Aggressive 2017 1 B Wj LpB1 85 gzvo
52: 379 Aggressive 2017 1 B Wj gzvo 85 LpB1
53: 379 Aggressive 2017 9 B FB Xo8U 60 hSJN
54: 379 Aggressive 2017 9 B FB hSJN 60 Xo8U
55: 379 Aggressive 2017 9 B Wj 75iU 12 Puss
56: 379 Aggressive 2017 9 B Wj Puss 12 75iU
57: 379 Aggressive 2017 11 B Wj 1reu 817 N7dg, SCPN
58: 379 Aggressive 2017 11 B Wj N7dg 809 SCPN, 1reu
59: 379 Aggressive 2017 11 B Wj SCPN 809 N7dg, 1reu
60: 379 Aggressive 2017 12 B gZ B4fM 17 hMo4
61: 379 Aggressive 2017 12 B gZ hMo4 17 B4fM
62: 379 Neutral 2016 9 B Wj L58K 103 hMo4
63: 379 Neutral 2016 9 B Wj hMo4 103 L58K
64: 379 Neutral 2017 6 B gZ 21So 1016 I46B
65: 379 Neutral 2017 6 B gZ I46B 1012 21So
66: 379 Neutral 2017 9 B Wj 21So 1244 LpB1
67: 379 Neutral 2017 9 B Wj LpB1 1240 21So
68: 379 Neutral 2017 11 B gZ 3Vpo 483 spPd
69: 379 Neutral 2017 11 B gZ spPd 483 3Vpo
70: 393 Aggressive 2015 2 B FB 8SzN 323 cKuN
71: 393 Aggressive 2015 2 B FB cKuN 322 8SzN
72: 458 Aggressive 2015 1 B FB 75iU 972 GWLn
73: 458 Aggressive 2015 1 B FB GWLn 977 75iU
74: 458 Neutral 2017 1 B Wj 21So 483 59PL
75: 458 Neutral 2017 1 B Wj 59PL 483 21So
76: 458 Neutral 2017 6 B iN hMo4 802 spPd
77: 458 Neutral 2017 6 B iN spPd 807 hMo4
78: 526 Aggressive 2017 3 B Wj 4DGD 992 59PL
79: 526 Aggressive 2017 3 B Wj 59PL 991 4DGD
80: 552 Aggressive 2015 7 B Wj 9oyt 95 OWxi
81: 552 Aggressive 2015 7 B Wj OWxi 95 9oyt
82: 552 Aggressive 2017 10 B Ds 59PL 890 9WAx
83: 552 Aggressive 2017 10 B Ds 9WAx 894 59PL
84: 561 Aggressive 2015 1 B gZ f1Fh 949 spPd
85: 561 Aggressive 2015 1 B gZ spPd 952 f1Fh
86: 561 Aggressive 2016 4 B Wj I46B 776 hpRD
87: 561 Aggressive 2016 4 B Wj hpRD 771 I46B
88: 561 Aggressive 2016 8 B gZ eKpA 809 rp75
89: 561 Aggressive 2016 8 B gZ rp75 807 eKpA
90: 561 Aggressive 2016 9 B Wj 4LVK 882 CF4V, M6sO
91: 561 Aggressive 2016 9 B Wj CF4V 878 4LVK, M6sO
92: 561 Aggressive 2016 9 B Wj M6sO 882 CF4V, 4LVK
93: 651 Aggressive 2017 2 B Ds 59PL 179 SCPN
94: 651 Aggressive 2017 2 B Ds SCPN 179 59PL
95: 735 Aggressive 2017 8 B iN M6sO 760 tNgx
96: 735 Aggressive 2017 8 B iN tNgx 758 M6sO
97: 817 Neutral 2016 6 B gZ I46B 197 SCPN
98: 817 Neutral 2016 6 B gZ SCPN 198 I46B
    ser_no Outcome year month code name aid Percentage similar
As can be seen from rows 1 and 2, the detected similarities are symmetric, i.e., row 1 points to 59PL as similar while row 2 points to 21So. There are also two cases where 3 similar rows have been identified.
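For instance, one quick way to spot those triples is to count how many aid values each flagged row lists in similar. This is only a hypothetical check sketched on top of the result above (it assumes fragmentize already carries the similar column created by the join):

# rows listing two aid values in 'similar' belong to the groups where
# three mutually similar rows were found
fragmentize[similar != "",
            .N,
            by = .(n_similar = lengths(strsplit(similar, ", ", fixed = TRUE)))]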
A few notes on how the data.table code works:
- setDT() coerces fragmentize to a data.table object, thereby setting keys on some columns. This is not required for the join, but it sorts fragmentize, which helps to verify the correctness of the result. In addition, it may speed up the join.
- Percentage is coerced to type double to prevent type conversions during the join. In Test dataframe II, the OP has created Percentage as integer type, while the lower and upper bounds used for the range join are of type double. Note that Percentage is updated by reference, i.e., in place, without copying the whole data object, to save time and memory.
- similar is created using the result of the aggregating non-equi self-join.
- fragmentize is right-joined with selected columns of itself. These are specified as a list using the abbreviation .(). In addition, lb and ub are created as lower and upper bounds for the approximate match with Percentage, using a relative tolerance of eps.
- The on clause specifies the columns which should match exactly in the join as well as the non-equi join conditions. AFAIK, it is not possible to specify an anti-join on a single column, so the condition aid != aid has to be handled in another way.
- The by = .EACHI parameter requests to join and aggregate simultaneously for each group of rows which match the join conditions. This avoids the creation of a potentially large intermediate table which holds all multiple matches.
- The aggregation expression is toString(setdiff(unique(x.aid), i.aid)). In case of multiple matches, each aid value should only appear once; setdiff() then removes the aid value of the actual row from that result, which implements the OP's requirement aid != aid. Finally, the result is collapsed to a single string.
- The [, V1] expression extracts just the column with the aggregated values, which eventually becomes the new column similar.
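To make these moving parts easier to see in isolation, here is a minimal sketch of the same pattern on made-up toy data; the table dt, the column names g, id, p and the tolerance are invented for illustration, and only the join mechanics mirror the code above.

library(data.table)
# toy table: g plays the role of the exact-match key columns, p the role of Percentage
dt  <- data.table(g = c("a", "a", "a", "b"), id = 1:4, p = c(100, 100.5, 200, 100))
eps <- 0.01
# non-equi self-join: for each row of dt (by = .EACHI) collect the ids of all rows
# in the same group g whose p lies within the relative tolerance, minus the row's own id
dt[, similar := dt[.(g = g, id = id, lb = p * (1 - eps), ub = p * (1 + eps)),
                   on = .(g, p >= lb, p <= ub),
                   by = .EACHI, toString(setdiff(unique(x.id), i.id))][, V1]]
dt
# rows 1 and 2 now point at each other, while rows 3 and 4 have no match and get
# the empty string returned by toString(integer(0))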

An alternative approach uses dplyr. The first step in R optimization is to vectorize as many operations as possible. Here we vectorize all of the comparisons for the columns that should be identical, and only do row-wise operations for aid and Percentage. The latter could be vectorized by doing a self-join and filter instead of the mapply, but we're already below the target speed.
library(dplyr)
start.time <- Sys.time()
fragmentize <- fragmentize %>%
  # group by all the columns that should match
  group_by(ser_no, Outcome, year, month, code, name) %>%
  # row-wise, within-group filter for a different aid and a close Percentage
  mutate(similar = mapply(function(aid_i, Percentage_i) {
    aid[aid != aid_i & abs(Percentage_i - Percentage) <= 1]
  }, aid_i = aid, Percentage_i = Percentage, SIMPLIFY = FALSE)) %>%
  ungroup() %>%
  mutate(similar = sapply(similar, paste, collapse = ", "))
cat("Completed in", Sys.time() - start.time)
> Completed in 1.856045
Less than 2 seconds using the 35K row example dataset from your question. The trick here is that bare variable names anywhere within a mutate call on a grouped data frame evaluate to a vector of the values just for that group, so the mapply call performs a row-wise search, comparing the values of each row to the others, but within the smaller search space of just the rows that have already been identified as matching on all the grouping variables.
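A tiny, made-up illustration of that grouping behaviour (the data frame toy and its columns are invented, not taken from the question):

library(dplyr)
toy <- data.frame(grp = c("a", "a", "b"), val = 1:3)
toy %>%
  group_by(grp) %>%
  # inside a grouped mutate, the bare name 'val' evaluates to the current group's
  # values only, so wrapping it in list() stores each group's full vector per row
  mutate(group_vals = list(val)) %>%
  ungroup()
# both rows of group "a" carry c(1, 2) in group_vals; the row of group "b" carries 3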
I would recommend omitting the final mutate to keep similar as a list column rather than a collapsed string, which makes it easier to work with, but I've included that collapsing step as you did in your example code to keep the timings comparable. Also note that your code filtered on Percentage within +/- 0.01, but the example data had only integers in Percentage, so I did +/- 1 instead. You would want to replace the <= 1 with <= 0.01.
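As a rough sketch of what working with that list column could look like (this assumes you have dropped the final collapsing mutate above, so similar is still a list of character vectors):

# keep only the rows that actually have at least one similar record
# and count how many similar aids each of them has
fragmentize %>%
  mutate(n_similar = lengths(similar)) %>%
  filter(n_similar > 0) %>%
  select(ser_no, Outcome, year, month, code, name, aid, Percentage, similar, n_similar)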