R - mgsub problem: substrings being replaced not whole strings

Tags:

I have downloaded the street abbreviations from USPS. Here is the data:

dput(usps_streets)
structure(list(common_abbrev = c("allee", "alley", "ally", "aly", 
"anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave", 
"aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou", 
"bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs", 
"bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard", 
"boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk", 
"brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass", 
"byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape", 
"cpe", "causeway", "causwa", "cswy", "cen", "cent", "center", 
"centr", "centre", "cnter", "cntr", "ctr", "centers", "cir", 
"circ", "circl", "circle", "crcl", "crcle", "circles", "clf", 
"cliff", "clfs", "cliffs", "clb", "club", "common", "commons", 
"cor", "corner", "corners", "cors", "course", "crse", "court", 
"ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk", 
"crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng", 
"xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam", 
"dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv", 
"drives", "est", "estate", "estates", "ests", "exp", "expr", 
"express", "expressway", "expw", "expy", "ext", "extension", 
"extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry", 
"fry", "field", "fld", "fields", "flds", "flat", "flt", "flats", 
"flts", "ford", "frd", "fords", "forest", "forests", "frst", 
"forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks", 
"fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy", 
"garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns", 
"gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln", 
"glens", "green", "grn", "greens", "grov", "grove", "grv", "groves", 
"harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven", 
"hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway", 
"hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows", 
"holw", "holws", "inlt", "is", "island", "islnd", "islands", 
"islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction", 
"junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky", 
"keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk", 
"lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane", 
"ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock", 
"lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops", 
"mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws", 
"meadows", "medows", "mews", "mill", "mills", "missn", "mssn", 
"motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain", 
"mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck", 
"orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park", 
"prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky", 
"parkways", "pkwys", "pass", "passage", "path", "paths", "pike", 
"pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains", 
"plns", "plaza", "plz", "plza", "point", "pt", "points", "pts", 
"port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad", 
"radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch", 
"rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg", 
"rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr", 
"rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl", 
"shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars", 
"shores", "shrs", "skyway", "spg", "spng", "spring", "sprng", 
"spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq", 
"sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station", 
"statn", "stn", "stra", "strav", "straven", "stravenue", "stravn", 
"strvn", "strvnue", "stream", "streme", "strm", "street", "strt", 
"st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit", 
"ter", "terr", "terrace", "throughway", "trace", "traces", "trce", 
"track", "tracks", "trak", "trk", "trks", "trafficway", "trail", 
"trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel", 
"tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike", 
"turnpk", "underpass", "un", "union", "unions", "valley", "vally", 
"vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct", 
"view", "vw", "views", "vws", "vill", "villag", "village", "villg", 
"villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis", 
"vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy", 
"way", "ways", "well", "wells", "wls"), usps_abbrev = c("aly", 
"aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc", 
"ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu", 
"bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm", 
"btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br", 
"br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs", 
"byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn", 
"cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr", 
"ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir", 
"cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb", 
"clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse", 
"ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres", 
"cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd", 
"xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv", 
"dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests", 
"expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext", 
"ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry", 
"fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd", 
"frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs", 
"frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy", 
"fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns", 
"gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln", 
"glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr", 
"hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts", 
"hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls", 
"hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is", 
"is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct", 
"jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky", 
"kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk", 
"lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt", 
"lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg", 
"ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs", 
"mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml", 
"mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn", 
"mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch", 
"orch", "orch", "oval", "oval", "opas", "park", "park", "park", 
"pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass", 
"psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes", 
"pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt", 
"pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr", 
"pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch", 
"rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg", 
"rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd", 
"rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl", 
"shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs", 
"skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs", 
"spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta", 
"sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra", 
"stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st", 
"sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter", 
"trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak", 
"trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr", 
"tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke", 
"tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly", 
"vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws", 
"vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs", 
"vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk", 
"wall", "way", "way", "ways", "wl", "wls", "wls")), class = "data.frame", row.names = c(NA, 
-503L))

I would like to use them to work with street addresses and states. Toy data:

a <- c("10900 harper ave", "12235 davis annex", "24 van cortland parkway")

To convert common abbreviations to the usps abbreviation (standardizing the data), I built a little function:

mr_zip <- function(x){
  x <-textclean::mgsub(usps_streets$common_abbrev, usps_streets$usps_abbrev, x, fixed = T,
                   order.pattern = T)
  return(x)
}

The problem arises when I apply my function to my data:

f <- sapply(a, mr_zip)

I get the wrong results:

 "10900 harper avee"       "1235 davis anx" "24 van cortland pkway"

Because what I should be getting is:

"10900 harper ave"       "1235 davis anx" "24 van cortland pkwy"

My questions:

Why is this happening when I specified order.pattern = T and fixed = T in the mgsub function?
What can I do to fix it?
Is there an alternative approach to using vectors in multiple substitution patterns for text?

Thanks in advance, all suggestions are welcome.

EDIT: Thanks to @RichieSacramento I have found that using the word boundary does help but the function is still incredibly slow when used on a large dataframe (> 400,000 rows). Using safe = TRUE in mgsub leads to the function working properly but it's incredibly slow. Something quick would be desired--hence the bounty.

489

asked Oct 06 '21 14:10

jvalenti

Video Answer

3 Answers

Latest update for all interested

I am writing an additional answer, because my original answer couldn't hold such long text and code anymore.

Dear colleagues, below I have collected all the functions that were created here in one collective code block so that anyone who wants to can try it out and do not have to combine it with several answers.

First of all, I unified all the functions so that each accepts two arguments at the input and returns a modified tibble at the output. I also moved all internal functions outside of processing functions.

Finally, I performed benchmarks for tables with 100, 1,000, 10,000, 100,000 and 1,000,000 rows.

Here is all the code

library(tidyverse)
library(data.table)

library(tidyverse)
USPS = tibble(
 common_abbrev = c("allee", "alley", "ally", "aly",
 "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave",
 "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou",
 "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs",
 "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard",
 "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk",
 "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass",
 "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape",
 "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center",
 "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir",
 "circ", "circl", "circle", "crcl", "crcle", "circles", "clf",
 "cliff", "clfs", "cliffs", "clb", "club", "common", "commons",
 "cor", "corner", "corners", "cors", "course", "crse", "court",
 "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk",
 "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng",
 "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam",
 "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv",
 "drives", "est", "estate", "estates", "ests", "exp", "expr",
 "express", "expressway", "expw", "expy", "ext", "extension",
 "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry",
 "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats",
 "flts", "ford", "frd", "fords", "forest", "forests", "frst",
 "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks",
 "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy",
 "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns",
 "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln",
 "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves",
 "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven",
 "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway",
 "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows",
 "holw", "holws", "inlt", "is", "island", "islnd", "islands",
 "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction",
 "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky",
 "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk",
 "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane",
 "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock",
 "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops",
 "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws",
 "meadows", "medows", "mews", "mill", "mills", "missn", "mssn",
 "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain",
 "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck",
 "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park",
 "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky",
 "parkways", "pkwys", "pass", "passage", "path", "paths", "pike",
 "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains",
 "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts",
 "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad",
 "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch",
 "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg",
 "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr",
 "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl",
 "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars",
 "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng",
 "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq",
 "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station",
 "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn",
 "strvn", "strvnue", "stream", "streme", "strm", "street", "strt",
 "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit",
 "ter", "terr", "terrace", "throughway", "trace", "traces", "trce",
 "track", "tracks", "trak", "trk", "trks", "trafficway", "trail",
 "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel",
 "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike",
 "turnpk", "underpass", "un", "union", "unions", "valley", "vally",
 "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct",
 "view", "vw", "views", "vws", "vill", "villag", "village", "villg",
 "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis",
 "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy",
 "way", "ways", "well", "wells", "wls"),
 usps_abbrev = c("aly",
 "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc",
 "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu",
 "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm",
 "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br",
 "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs",
 "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn",
 "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr",
 "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir",
 "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb",
 "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse",
 "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres",
 "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd",
 "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv",
 "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests",
 "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext",
 "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry",
 "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd",
 "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs",
 "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy",
 "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns",
 "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln",
 "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr",
 "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts",
 "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls",
 "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is",
 "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct",
 "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky",
 "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk",
 "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt",
 "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg",
 "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs",
 "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml",
 "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn",
 "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch",
 "orch", "orch", "oval", "oval", "opas", "park", "park", "park",
 "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass",
 "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes",
 "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt",
 "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr",
 "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch",
 "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg",
 "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd",
 "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl",
 "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs",
 "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs",
 "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta",
 "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra",
 "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st",
 "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter",
 "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak",
 "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr",
 "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke",
 "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly",
 "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws",
 "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs",
 "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk",
 "wall", "way", "way", "ways", "wl", "wls", "wls"))

randomAddresses = function(n){
  tibble(
    addresses = paste(
      sample(10:10000, n, replace = TRUE),
      sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
      sample(USPS$common_abbrev, n, replace = TRUE)
    )
  )
}

set.seed(1111)
df = randomAddresses(10)

USPS_conv2 = function(x, y) {
  t = str_split(x, " ")
  comm = t[[1]][length(t[[1]])]
  str_replace(x, comm, y[comm])
}
USPS_conv2 = Vectorize(USPS_conv2, "x")

f_MK_conv2 <- function(x, y) {
  x %>% mutate(
    addresses = USPS_conv2(addresses, 
      array(data = y$usps_abbrev, dimnames = list(y$common_abbrev))))
}
f_MK_conv2(df, USPS)


ht.create <- function() new.env()

ht.insert <- function(ht, key, value) ht[[key]] <- value
ht.insert <- Vectorize(ht.insert, c("key", "value"))

ht.lookup <- function(ht, key) ht[[key]]
ht.lookup <- Vectorize(ht.lookup, "key")

ht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)
ht.delete <- Vectorize(ht.delete, "key")


f_MK_replaceString <- function(x, y) {
  ht <- ht.create()
  ht.insert(ht, y$common_abbrev, y$usps_abbrev)

  txt <- x$addresses
  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))
  txt <- paste0(
    str_sub(txt, end = i),
    ht.lookup(ht, str_sub(txt, start = i + 1))
  )
  x %>% mutate(addresses = txt)
}
f_MK_replaceString(df, USPS)

f_TIC1 <- function(x, y) {
  x %>% mutate(addresses = sapply(
    strsplit(x$addresses, " "),
    function(x) {
      with(y, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
               collapse = " "
        )
      })
    }
  )
  )
}
f_TIC1(df, USPS)


f_TIC2 <- function(x, y) {
  res <- c()
  for (s in x$addresses) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, y$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          y,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  x %>% mutate(addresses = res)
}
f_TIC2(df, USPS)


f_TIC3 <- function(x, y) {
  x.split <- strsplit(x$addresses, " ")
  lut <- with(y, setNames(usps_abbrev, common_abbrev))
  grp <- rep(seq_along(x.split), lengths(x.split))
  xx <- unlist(x.split)
  r <- lut[xx]
  x %>% mutate(addresses = tapply(
    replace(xx, !is.na(r), na.omit(r)),
    grp,
    function(s) paste0(s, collapse = " ")
  ))
}
f_TIC3(df, USPS)

f_TIC4 <- function(x, y) {
  xb <- gsub("^.*\\s+", "", x$addresses)
  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])
  x %>% mutate(addresses = paste0(gsub("\\w+$", "", x$addresses), replace(xb, !is.na(rp), na.omit(rp))))
}
f_TIC4(df, USPS)

f_JM <- function(x, y) {
  x$abbreviation <- gsub("^.* ", "", x$addresses)
  setDT(x)
  setDT(y)
  x[y, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]
  
  x$addresses <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")
  x$abbreviation <- NULL
  return(as_tibble(x))
}
f_JM(df, USPS)

set.seed(1111)
df = randomAddresses(100)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(1000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(10000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(100000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(1000000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

And now the result in the form of charts

enter image description here

Time for conclusions and summary

@jared_mamrot - you are absolutely right. data.table is amazing!!

@ThomasIsCoding - bravo for f_TIC4. Its simplicity is beautiful!!

@AnyoneWhoComesBy - congratulations if you've read this to the end. I believe that you, too, could learn a lot here!!

173

answered Oct 24 '22 05:10

Marek Fiołka

Update

Here is the benchmarking for the existing to OP's question (borrow test data from @Marek Fiołka but with n <- 10000)

> mb1
Unit: milliseconds
                              expr       min        lq       mean    median
          f_MK_conv2(df$addresses) 1409.0643 1470.3992 1612.09037 1631.3014
 f_MK_replaceString(df, addresses)   50.1582   54.3035   94.53149   62.5772
              f_TIC1(df$addresses)  394.5972  420.3283  461.50675  447.6186
              f_TIC2(df$addresses) 1579.1868 1852.6873 2052.28388 1964.8845
              f_TIC3(df$addresses)   65.8436   71.5448   93.36210   84.9698
        uq       max neval
 1710.3459 1898.6773    20
  116.3108  264.2616    20
  499.4052  626.9240    20
 2246.5562 2916.2253    20
  102.7689  183.5121    20

enter image description here

where the benchmark code is given as follows

f_MK_conv2 <- function(x) {
  USPSv <- array(
    data = USPS$usps_abbrev,
    dimnames = list(USPS$common_abbrev)
  )
  USPS_conv2 <- function(x) {
    t <- str_split(x, " ")
    comm <- t[[1]][length(t[[1]])]
    str_replace(x, comm, USPSv[comm])
  }
  Vectorize(USPS_conv2)(x)
}

f_MK_replaceString <- function(.data, value) {
  ht.create <- function() new.env()

  ht.insert <- function(ht, key, value) ht[[key]] <- value
  ht.insert <- Vectorize(ht.insert, c("key", "value"))

  ht.lookup <- function(ht, key) ht[[key]]
  ht.lookup <- Vectorize(ht.lookup, "key")

  ht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)
  ht.delete <- Vectorize(ht.delete, "key")

  addHashTable2 <- function(.x, .y, key, value) {
    key <- enquo(key)
    value <- enquo(value)

    if (!all(c(as_label(key), as_label(value)) %in% names(.y))) {
      stop(paste0(
        "`.y` must contain `", as_label(key),
        "` and `", as_label(value), "` columns"
      ))
    }

    if ((.y %>% distinct(!!key, !!value) %>% nrow()) !=
      (.y %>% distinct(!!key) %>% nrow())) {
      warning(paste0(
        "\nThe number of unique values of the ", as_label(key),
        " variable is different\n",
        " from the number of unique values of the ",
        as_label(key), " and ", as_label(value), " pairs!\n",
        "The dictionary will only return the last values for a given key!"
      ))
    }

    ht <- ht.create()
    ht %>% ht.insert(
      .y %>% distinct(!!key, !!value) %>% pull(!!key),
      .y %>% distinct(!!key, !!value) %>% pull(!!value)
    )
    attr(.x, "hashTab") <- ht
    .x
  }

  .data <- .data %>% addHashTable2(USPS, common_abbrev, usps_abbrev)

  value <- enquo(value)
  # Test whether the value variable is in .data
  if (!(as_label(value) %in% names(.data))) {
    stop(paste(
      "The", as_label(value),
      "variable does not exist in the .data table!"
    ))
  }

  # Dictionary attribute presence test
  if (!("hashTab" %in% names(attributes(.data)))) {
    stop(paste0(
      "\nThere is no dictionary attribute in the .data table!\n",
      "Use addHashTable or addHashTable2 to add a dictionary attribute."
    ))
  }

  txt <- .data %>% pull(!!value)
  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))
  txt <- paste0(
    str_sub(txt, end = i),
    ht.lookup(
      attr(.data, "hashTab"),
      str_sub(txt, start = i + 1)
    )
  )
  .data %>% mutate(!!value := txt)
}

f_TIC1 <- function(x) {
  sapply(
    strsplit(x, " "),
    function(x) {
      with(USPS, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
          collapse = " "
        )
      })
    }
  )
}

f_TIC2 <- function(x) {
  res <- c()
  for (s in x) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, USPS$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          USPS,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  res
}

f_TIC3 <- function(x) {
  x.split <- strsplit(x, " ")
  lut <- with(USPS, setNames(usps_abbrev, common_abbrev))
  grp <- rep(seq_along(x.split), lengths(x.split))
  xx <- unlist(x.split)
  r <- lut[xx]
  tapply(
    replace(xx, !is.na(r), na.omit(r)),
    grp,
    function(s) paste0(s, collapse = " ")
  )
}

f_TIC4 <- function(x) {
  xb <- gsub("^.*\\s+", "", x)
  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])
  paste0(gsub("\\w+$", "", x), replace(xb, !is.na(rp), na.omit(rp)))
}

f_JM <- function(x) {
  x$abbreviation <- gsub("^.* ", "", x$addresses)
  setDT(x)
  setDT(USPS)
  x[USPS, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]

  x$usps_abbreviation <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")
}

set.seed(1111)
df <- randomAddresses(10000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df$addresses),
  f_MK_replaceString(df, addresses),
  f_JM(df),
  f_TIC1(df$addresses),
  f_TIC2(df$addresses),
  f_TIC3(df$addresses),
  f_TIC4(df$addresses),
  times = 20L
)
ggplot2::autoplot(mb1)

Possible Solutions

Perhaps one of the following base R options could help

solution 1

f_TIC1 <- function(x) {
  sapply(
    strsplit(x, " "),
    function(x) {
      with(USPS, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
          collapse = " "
        )
      })
    }
  )
}

solution 2


f_TIC2 <- function(x) {
  res <- c()
  for (s in x) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, USPS$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          USPS,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  res
}

solution 3


f_TIC3 <- function(x) {
  x.split <- strsplit(x, " ")
  lut <- with(USPS, setNames(usps_abbrev, common_abbrev))
  grp <- rep(seq_along(x.split), lengths(x.split))
  xx <- unlist(x.split)
  r <- lut[xx]
  tapply(
    replace(xx, !is.na(r), na.omit(r)),
    grp,
    function(s) paste0(s, collapse = " ")
  )
}

solution 4 (this is for a special case, i.e., abbreviation for the last word only)

f_TIC4 <- function(x) {
  xb <- gsub("^.*\\s+", "", x)
  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])
  paste0(gsub("\\w+$", "", x), replace(xb, !is.na(rp), na.omit(rp)))
}

output

[1] "10900 harper ave"     "12235 davis anx"      "24 van cortland pkwy"

answered Oct 24 '22 05:10

ThomasIsCoding

Update:

I spent some time tweaking my existing answer (below) and I believe it's the fastest method. Also, it's worth noting that if you add perl = TRUE to the gsub in f_JM and TIC4 you get a noticeable increase in speed with this example (may not apply to 'real world' data). There is also a major caveat to my answer, as it is predicated on the abbreviated term being the last term in the address (TIC1, TIC2 and TIC3 for example don't rely on that assumption).

Huge thanks to @Marek and @TIC for the benchmarking code and for the constructive comments:

## Benchmarking with updated f_JM() and TIC4()
library(data.table)
library(tidyverse)

USPS = tibble(
  common_abbrev = c("allee", "alley", "ally", "aly",
                    "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave",
                    "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou",
                    "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs",
                    "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard",
                    "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk",
                    "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass",
                    "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape",
                    "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center",
                    "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir",
                    "circ", "circl", "circle", "crcl", "crcle", "circles", "clf",
                    "cliff", "clfs", "cliffs", "clb", "club", "common", "commons",
                    "cor", "corner", "corners", "cors", "course", "crse", "court",
                    "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk",
                    "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng",
                    "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam",
                    "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv",
                    "drives", "est", "estate", "estates", "ests", "exp", "expr",
                    "express", "expressway", "expw", "expy", "ext", "extension",
                    "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry",
                    "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats",
                    "flts", "ford", "frd", "fords", "forest", "forests", "frst",
                    "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks",
                    "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy",
                    "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns",
                    "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln",
                    "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves",
                    "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven",
                    "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway",
                    "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows",
                    "holw", "holws", "inlt", "is", "island", "islnd", "islands",
                    "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction",
                    "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky",
                    "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk",
                    "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane",
                    "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock",
                    "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops",
                    "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws",
                    "meadows", "medows", "mews", "mill", "mills", "missn", "mssn",
                    "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain",
                    "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck",
                    "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park",
                    "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky",
                    "parkways", "pkwys", "pass", "passage", "path", "paths", "pike",
                    "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains",
                    "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts",
                    "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad",
                    "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch",
                    "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg",
                    "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr",
                    "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl",
                    "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars",
                    "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng",
                    "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq",
                    "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station",
                    "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn",
                    "strvn", "strvnue", "stream", "streme", "strm", "street", "strt",
                    "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit",
                    "ter", "terr", "terrace", "throughway", "trace", "traces", "trce",
                    "track", "tracks", "trak", "trk", "trks", "trafficway", "trail",
                    "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel",
                    "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike",
                    "turnpk", "underpass", "un", "union", "unions", "valley", "vally",
                    "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct",
                    "view", "vw", "views", "vws", "vill", "villag", "village", "villg",
                    "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis",
                    "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy",
                    "way", "ways", "well", "wells", "wls"),
  usps_abbrev = c("aly",
                  "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc",
                  "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu",
                  "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm",
                  "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br",
                  "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs",
                  "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn",
                  "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr",
                  "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir",
                  "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb",
                  "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse",
                  "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres",
                  "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd",
                  "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv",
                  "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests",
                  "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext",
                  "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry",
                  "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd",
                  "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs",
                  "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy",
                  "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns",
                  "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln",
                  "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr",
                  "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts",
                  "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls",
                  "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is",
                  "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct",
                  "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky",
                  "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk",
                  "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt",
                  "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg",
                  "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs",
                  "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml",
                  "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn",
                  "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch",
                  "orch", "orch", "oval", "oval", "opas", "park", "park", "park",
                  "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass",
                  "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes",
                  "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt",
                  "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr",
                  "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch",
                  "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg",
                  "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd",
                  "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl",
                  "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs",
                  "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs",
                  "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta",
                  "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra",
                  "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st",
                  "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter",
                  "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak",
                  "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr",
                  "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke",
                  "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly",
                  "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws",
                  "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs",
                  "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk",
                  "wall", "way", "way", "ways", "wl", "wls", "wls"))

randomAddresses = function(n){
  tibble(
    addresses = paste(
      sample(10:10000, n, replace = TRUE),
      sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
      sample(USPS$common_abbrev, n, replace = TRUE)
    )
  )
}

set.seed(1111)
df = randomAddresses(10)

USPS_conv2 = function(x, y) {
  t = str_split(x, " ")
  comm = t[[1]][length(t[[1]])]
  str_replace(x, comm, y[comm])
}
USPS_conv2 = Vectorize(USPS_conv2, "x")

f_MK_conv2 <- function(x, y) {
  x %>% mutate(
    addresses = USPS_conv2(addresses, 
                           array(data = y$usps_abbrev, dimnames = list(y$common_abbrev))))
}
f_MK_conv2(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts


ht.create <- function() new.env()

ht.insert <- function(ht, key, value) ht[[key]] <- value
ht.insert <- Vectorize(ht.insert, c("key", "value"))

ht.lookup <- function(ht, key) ht[[key]]
ht.lookup <- Vectorize(ht.lookup, "key")

ht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)
ht.delete <- Vectorize(ht.delete, "key")


f_MK_replaceString <- function(x, y) {
  ht <- ht.create()
  ht.insert(ht, y$common_abbrev, y$usps_abbrev)
  
  txt <- x$addresses
  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))
  txt <- paste0(
    str_sub(txt, end = i),
    ht.lookup(ht, str_sub(txt, start = i + 1))
  )
  x %>% mutate(addresses = txt)
}
f_MK_replaceString(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

f_TIC1 <- function(x, y) {
  x %>% mutate(addresses = sapply(
    strsplit(x$addresses, " "),
    function(x) {
      with(y, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
               collapse = " "
        )
      })
    }
  )
  )
}
f_TIC1(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts


f_TIC2 <- function(x, y) {
  res <- c()
  for (s in x$addresses) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, y$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          y,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  x %>% mutate(addresses = res)
}
f_TIC2(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts


f_TIC3 <- function(x, y) {
  x.split <- strsplit(x$addresses, " ")
  lut <- with(y, setNames(usps_abbrev, common_abbrev))
  grp <- rep(seq_along(x.split), lengths(x.split))
  xx <- unlist(x.split)
  r <- lut[xx]
  x %>% mutate(addresses = tapply(
    replace(xx, !is.na(r), na.omit(r)),
    grp,
    function(s) paste0(s, collapse = " ")
  ))
}
f_TIC3(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

f_TIC4 <- function(x, y) {
  xb <- gsub("^.*\\s+", "", x$addresses, perl = TRUE)
  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])
  x %>% mutate(addresses = paste0(gsub("\\w+$", "", x$addresses), replace(xb, !is.na(rp), na.omit(rp))))
}
f_TIC4(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

f_JM <- function(x, y) {
  x$abbreviation <- gsub("^.* ", "", x$addresses, perl = TRUE)
  setDT(x)
  setDT(y)
  x[y, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]
  x$addresses <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")
  x$abbreviation <- NULL
  return(as_tibble(x))
}
f_JM(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

set.seed(1111)
df = randomAddresses(100)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(1000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(10000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(100000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(1000000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

^{Created on 2021-11-04 by the reprex package (v2.0.1)}

Original:

Brilliant answers @Marek and @TIC! After some tweaking and benchmarking I think this data.table 'split/lookup-replace/paste' approach might be faster:

library(tidyverse)
library(data.table)

n=1000000
set.seed(1111)
df = tibble(
  addresses = paste(
    sample(10:10000, n, replace = TRUE),
    sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
    sample(USPS$common_abbrev, n, replace = TRUE)
  )
)
df
#> # A tibble: 1,000,000 × 1
#>    addresses              
#>    <chr>                  
#>  1 8995 marry pass        
#>  2 8527 davis spng        
#>  3 7663 marry loaf        
#>  4 3043 davis common      
#>  5 9192 marry bnd         
#>  6 120 von brown corner   
#>  7 72 van cortland plains 
#>  8 8752 van cortland crcle
#>  9 7754 von brown sqrs    
#> 10 3745 marry key         
#> # … with 999,990 more rows

start_time =Sys.time()
df$abbreviation <- gsub("^.* ", "", df$addresses)
setDT(df)
setDT(USPS)
df[USPS, abbreviation:=usps_abbrev, on=.(abbreviation=common_abbrev)]

df$usps_abbreviation <- paste(str_extract(df$addresses, "^.* "), df$abbreviation, sep = "")
Sys.time()-start_time
#> Time difference of 2.804245 secs
df
#>                    addresses abbreviation usps_abbreviation
#>       1:     8995 marry pass         pass   8995 marry pass
#>       2:     8527 davis spng          spg    8527 davis spg
#>       3:     7663 marry loaf           lf     7663 marry lf
#>       4:   3043 davis common          cmn    3043 davis cmn
#>       5:      9192 marry bnd          bnd    9192 marry bnd
#>      ---                                                   
#>  999996:     1379 marry vdct          via    1379 marry via
#>  999997:    237 harper avnue          ave    237 harper ave
#>  999998:      7592 davis riv          riv    7592 davis riv
#>  999999: 4963 marry junction          jct    4963 marry jct
#> 1000000:     813 harper bluf          blf    813 harper blf

^{Created on 2021-11-03 by the reprex package (v2.0.1)}

Edit

I changed dt_func() to produce the same output as Marek's function (fairer comparison) and it's still super quick:

set.seed(1111)
df <- randomAddresses(10000)

dt_func <- function(x) {
  x$abbreviation <- gsub("^.* ", "", x$addresses)
  setDT(x)
  setDT(USPS)
  x[USPS, abbreviation:=usps_abbrev, on=.(abbreviation=common_abbrev)]
  
  x$addresses <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")
  x$abbreviation <- NULL
  return(as_tibble(x))
}

Compare output:

df2 <- f_MK_replaceString(df, addresses)
df3 <- dt_func(df)
dplyr::all_equal(df2, df3)
#> [1] TRUE

answered Oct 24 '22 04:10

jared_mamrot

Related questions
                            
                                How can I plot a 1-D plot in R?
                            
                                convert a vector to a list
                            
                                Bizzarre issue trying to make Rpy2 2.1.9 work with R 2.12.1, using Python 2.6 under Windows xp - Rpy can't find the R.dll?
                            
                                Rscript on ubuntu
                            
                                arrow() in ggplot2 no longer supported
                            
                                How to filter/subset a data.frame using values from one of its column [duplicate]
                            
                                Gram Schmidt with R
                            
                                How to prevent two labels to overlap in a barchart?
                            
                                Loop through data frame and variable names
                            
                                Customize background to highlight ranges of data in ggplot [duplicate]
                            
                                R solving hackerrank challenge
                            
                                trouble installing packages in CentOS: internet routines cannot be loaded
                            
                                Forest plot for a beginner simple example using ggplot2 [edited] [closed]
                            
                                Finding the number of values above a value and less than a value in a df column without using a loop
                            
                                How to calculate mean of all columns, by group?
                            
                                R Changing Order of Facets
                            
                                Using table() in dplyr chain
                            
                                Creating a vector in R of counts for number of times each element appears in another vector
                            
                                Fill missing values with previous values by row using dplyr
                            
                                Get column names and dataframe name from a list of dataframes into a single dataframe

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With