Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Data Cleaning (Addresses) Python

I'm looking to clean a dataset with 61k rows. I need to clean its street address column. Presently, the addresses are a nightmare. Sometimes full addresses are written out (i.e. 111 Frederick Douglass Blvd) other times the same address will be written in short hand (i.e 111 8th Ave/ 111 8th Avenue).

I'm a newer pandas/python user so i'm not sure how to tackle such massive cleaning needs. Before I've used .replace() but that seems to be the completely wrong approach. Would appreciate any of your insights.

Edit: Essentially I'm looking to standardize addresses. Addresses were inputed either incorrectly or in multiple different formats. Long term goal is to map all of these addresses but in the data's present form that's not possible

Dataset: https://data.cityofnewyork.us/City-Government/Evictions/6z8x-wfk4

like image 954
ynnad Avatar asked Oct 19 '25 01:10

ynnad


1 Answers

I've used this function to standardize addresses (and put the street type in shortened form, consistent with Google's formatting). Apologies for the code dump, but the regex is full of interesting things to consider.

I've also added digits for when Avenues are called by their ordinal:

def shorten_rd(address):
    '''Completes the road type. I.e. Rd becomes Road, st becomes Street as per Google etc.'''
    address = address.title()
    address = re.sub(r" Street(?=$| [NE(So|S$)(We|W$)])", ' St', address)
    address = re.sub(r" Road(?=$| [NE(So|S$)(We|W$)])", ' Rd', address)
    address = re.sub(r"(?<!The) Avenue(?=$| [NE(So|S$)(We|W$)])", ' Ave', address)
    address = re.sub(r" Close(?=$| [NE(So|S$)(We|W$)])", ' Cl', address)
    address = re.sub(r" Court(?=$| [NE(So|S$)(We|W$)])", ' Ct', address)
    address = re.sub(r"(?<!The) Crescent(?=$| [NE(So|S$)(We|W$)])", ' Cres', address)
    address = re.sub(r" Boulevarde?(?=$| [NE(So|S$)(We|W$)])", ' Blvd', address)
    address = re.sub(r" Drive(?=$| [NE(So|S$)(We|W$)])", ' Dr', address)
    address = re.sub(r" Lane(?=$| [NE(So|S$)(We|W$)])", ' Ln', address)
    address = re.sub(r" Place(?=$| [NE(So|S$)(We|W$)])", ' Pl', address)
    address = re.sub(r" Square(?=$| [NE(So|S$)(We|W$)])", ' Sq', address)
    address = re.sub(r"(?<!The) Parade(?=$| [NE(So|S$)(We|W$)])", ' Pde', address)
    address = re.sub(r" Circuit(?=$| [NE(So|S$)(We|W$)])", ' Cct', address)
    return address

def lengthen_rd(address):
    address = address.title()
    address = re.sub(r" St(?=$| [NE(So|S$)(We|W$)])", " Street", address)
    address = re.sub(r" Rd(?=$| [NE(So|S$)(We|W$)])", " Road", address)
    address = re.sub(r" Ave(?=$| [NE(So|S$)(We|W$)])", " Avenue", address)
    address = re.sub(r" Cl(?=$| [NE(So|S$)(We|W$)])", " Close", address)
    address = re.sub(r" Ct(?=$| [NE(So|S$)(We|W$)])", " Court", address)
    address = re.sub(r" Cres(?=$| [NE(So|S$)(We|W$)])", " Crescent", address)
    address = re.sub(r" Blvd(?=$| [NE(So|S$)(We|W$)])", " Boulevard", address)
    address = re.sub(r" Dr(?=$| [NE(So|S$)(We|W$)])", " Drive", address)
    address = re.sub(r" Ln(?=$| [NE(So|S$)(We|W$)])", " Lane", address)
    address = re.sub(r" Pl(?=$| [NE(So|S$)(We|W$)])", " Place", address)
    address = re.sub(r" Sq(?=$| [NE(So|S$)(We|W$)])", " Square", address)
    address = re.sub(r" Pde(?=$| [NE(So|S$)(We|W$)])", " Parade", address)
    address = re.sub(r" Cct(?=$| [NE(So|S$)(We|W$)])", " Circuit", address)
    return address

def standard_addr(address):
    '''Checks for unit numbers and street addresses and puts them in the standard format''' 
    #print("################################")
    #print("### Address: ", address)
    unit_nums = re.findall(r"(?<=Unit )\w?\d+\w?|(?<=U)\d+\w?|\w?\d+\w?(?=\s*/)", address)
    unit_num = unit_nums[0] if len(unit_nums)==1 else ""
    #print("Unit Number: ", unit_num)
    proc_addr = re.sub(r"Unit \w?\d+\w?/?|U\d+\w?/?|\w?\d+\w?\s*/", "", address)
    proc_addr = re.sub(r"^[,\- ]+|[,\- ]+$", "", proc_addr)
    #print("Unitless address: ", proc_addr)
    type_opts = r"Terrace|Way|Walk|St|Rd|Ave|Cl|Ct|Cres|Blvd|Dr|Ln|Pl|Sq|Pde|Cct"
    road_attrs_pattern = r"(?P<rd_no>\w?\d+(\-\d+)?\w?\s+)(?P<rd_nm>[a-zA-z \d\-]+)\s+(?P<rd_tp>" + type_opts + ")"
    #print("Road Attr Pattern: ", road_attrs_pattern)
    road_attrs = re.search(road_attrs_pattern, proc_addr)
    try:
        road_num = road_attrs.group('rd_no').strip()
    except AttributeError:
        road_num = ""
    #print("Road number: ", road_num)
    try:
        road_name = road_attrs.group('rd_nm').strip()
    except AttributeError:
        road_name = ""
    #print("Road name: ", road_name)
    try:
        road_type = road_attrs.group('rd_tp').strip()
    except AttributeError:
        road_type = ""
    #print("Road type: ", road_type)
    proc_addr = lengthen_rd(re.sub(r"^[,\- ]+|[,\- ]+$", "", re.sub(road_attrs_pattern, "", proc_addr)))
    #print("Leftover: ", proc_addr)

    unit_seg = (unit_num + "/" if unit_num!="" else "") if road_num != "" else ("Unit " + unit_num + ", " if unit_num!="" else "")
    road_seg = ((road_num + " " if road_num!="" else "") + road_name + " " + road_type).strip()
    post_road_seg = " " + proc_addr if proc_addr != "" else ""
    proc_addr = (unit_seg + road_seg) + post_road_seg
    #print("### Processed Address: ", proc_addr)
    return proc_addr
like image 159
AER Avatar answered Oct 21 '25 15:10

AER



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!