Geocoding and Census Data

Fun with maps!


Backstory

Cambridge has 13 neighborhoods. When I worked in Circulation, one of our jobs was to look up a patron's address on a list of street addresses and determine which neighborhood they were in, then code that into the system. It took forever, and was not very accurate.

At some point I noticed that some of the addresses on our list had the wrong neighborhood attached to them. Thus began a three-month, lunch-break-consuming quest to fix The List. At the time, there was no good source for what streets were in Cambridge. Maps were out of date, incomplete, or, in some cases, just plain wrong.

So I used what resources I could and I fixed The List. (Fun fact: there is one building in Cambridge that has a street number of zero!)

Also at this time, I did not know Python. So, naturally, I created a really complicated Excel macro to look at a list of addresses, clean up the data, and determine the neighborhood code.

Luckily, a few years later, I was able to work with a colleague to import the addresses into the census geocoder and write the data to a field in the patron record. It records the census tract number, not the neighborhood code, but it's still been very useful. We can see where our patrons are located (and where they are not located) and combine that information with census data to figure out who we are reaching or not reaching.

The following is a bunch of code and maps that I kept from when I was testing and playing around with geocoding.


Cleaning address data and running it through the census geocoder.

Get your addresses into a list and create regular expressions to extract the street, city, state, and zip code.

import pandas as pd
import numpy as np
import re

# Read the '^'-separated patron export, forcing record numbers and address
# fields to strings.  NOTE: the original had `PCODE4': int` — the opening
# quote was missing, which is a SyntaxError; fixed here.
df = pd.read_csv('campatrons.txt', sep='^', encoding='utf8',
                 dtype={'RECORD #(PATRON)': str, 'MA TOWN': int, 'P TYPE': int,
                        'PCODE4': int, 'ADDRESS': str, 'ADDRESS2': str})

add1List = df['ADDRESS'].tolist()
add2List = df['ADDRESS2'].tolist()
patrec = df['RECORD #(PATRON)'].tolist()


# Address fields look like "<street>$<city, ST zip>", sometimes with an extra
# leading segment (e.g. a "Mail ..." line), giving three '$'-separated parts.
pat1dollarstreet = r"(\d.*)"                                # street: from the house number on
pat1dollarcity = r"(.*)([A-Za-z][A-Za-z])(.*)(\d\d\d\d\d)"  # city / 2-letter state / sep / 5-digit zip
pat2dollarstreet = r"^((?!(Mail|mail)).)*$"                 # segment that does NOT contain "Mail"/"mail"

# Output columns for the geocoder input file, filled by the parsing loop below.
street = []
city = []
state = []
zipcode = []
patID = []

Do a big, long thing to basically get the parts of the address into different columns, and get the data ready for the geocoder.

def _parse_two(parts):
    """Parse ['<street>', '<city ST zip>'].

    Returns (street, city, state, zip) or None if either piece fails to match.
    """
    m_street = re.match(pat1dollarstreet, parts[0])
    m_city = re.match(pat1dollarcity, parts[1])
    if m_street and m_city:
        return (m_street.group(1), m_city.group(1),
                m_city.group(2), m_city.group(4))
    return None


def _parse_three(parts):
    """Parse a three-segment address.

    The street is the first of the two leading segments that is not a
    'Mail...' line; the last segment holds city/state/zip.  Returns
    (street, city, state, zip) or None.
    """
    m_city = re.match(pat1dollarcity, parts[2])
    if not m_city:
        return None
    for seg in parts[:2]:
        if re.match(pat2dollarstreet, seg):
            # BUGFIX: the original appended .group(1) of this repeated-group
            # regex, which Python resolves to the LAST repetition — i.e. only
            # the final character of the segment.  The street is the whole
            # segment.
            return (seg, m_city.group(1), m_city.group(2), m_city.group(4))
    return None


for i, rec in enumerate(patrec):
    addSplit1 = str(add1List[i]).split('$')
    addSplit2 = str(add2List[i]).split('$')

    parsed = None
    if len(addSplit1) == 2:
        parsed = _parse_two(addSplit1)
        if parsed is None:
            # Primary address didn't parse; fall back to ADDRESS2.
            if len(addSplit2) == 2:
                parsed = _parse_two(addSplit2)
            elif len(addSplit2) == 3:
                parsed = _parse_three(addSplit2)
    elif len(addSplit1) == 3:
        parsed = _parse_three(addSplit1)
    # Anything else (no '$', 4+ segments) is skipped, as before.

    if parsed is not None:
        st, ct, sta, zc = parsed
        patID.append(rec)
        street.append(st)
        city.append(ct)
        state.append(sta)
        zipcode.append(zc)

The census geocoder can only check documents of 10,000 addresses at a time, so that's why we are splitting them up.

# Assemble the geocoder input frame: one row per successfully parsed address.
dfOut = pd.DataFrame()

dfOut['ID'] = patID
dfOut['Street'] = street
dfOut['City'] = city
dfOut['State'] = state
dfOut['Zipcode'] = zipcode

#dfOut.to_csv('outfullco.csv', sep=',', index=False, header=False)

# The census geocoder accepts at most 10,000 addresses per batch, so write the
# frame out in 10,000-row slices named out1.csv, out2.csv, ...
# BUGFIX: the original hand-written slices ([:10000], [10001:20000],
# [20001:30000], ...) silently dropped one row at every seam (rows 10000,
# 20000, ..., 80000).  A stride loop covers every row exactly once and also
# handles any total row count, not just <= 9 chunks.
CHUNK = 10000
for n, start in enumerate(range(0, len(dfOut), CHUNK), start=1):
    dfOut.iloc[start:start + CHUNK].to_csv(f'out{n}.csv', sep=',',
                                           index=False, header=False)

Time to geocode! I've created a CSV with the different file names and will loop through it to run each file through the geocoder.

import censusgeocode
import pandas as pd
import time

starttime = time.perf_counter()
print(starttime)

cg = censusgeocode.CensusGeocode()

# filenames.csv: one geocoder batch file name per line, no header row.
getFile = pd.read_csv('filenames.csv', header=None)
nameList = getFile[0].tolist()

for idx, fname in enumerate(nameList):
    print(time.perf_counter())
    result = cg.addressbatch(fname)
    print(time.perf_counter())
    df = pd.DataFrame(result, columns=result[0].keys())
    # Append every batch to one output file; write the header only for the
    # first batch.  (The original compared fname == nameList[0], which would
    # emit a duplicate header row if the first file name ever appeared again
    # later in the list.)
    df.to_csv('outputPatronsFinal3.csv', mode='a', header=(idx == 0))
    print(time.perf_counter())

endtime = time.perf_counter()
print(endtime)

print(f"Total time: {endtime - starttime:0.4f} seconds")

It took 8.84 hours to geocode 81,000 lines, with a 92% success rate.


Now it's time to do fun stuff with maps.

Here is a basic map I made charting how many patrons were located in each census tract.

import json
import pandas as pd
import plotly.express as px
import geopandas as gpd
import plotly.io as pio


# 2020 TIGER/Line census-tract shapes for Massachusetts, reprojected to WGS84.
zipfile = "zip:///Users/Kate/Desktop/tl_2020_25_tract10.zip"
tracts = gpd.read_file(zipfile).to_crs("EPSG:4326")

df = pd.read_csv("/Users/Kate/Desktop/GeoPatrons.csv", dtype={'geoid':str})
# Keep only the Cambridge tracts (GEOID range 25017352101–25017355000).
df = df[(df['geoid'] >= '25017352101') & (df['geoid'] <= '25017355000')]


# Inner merge so the GeoJSON only carries tracts we actually have data for.
tracts = pd.merge(tracts, df, left_on='GEOID10', right_on='geoid', how="inner")

zipjson = json.loads(tracts.to_json())

# FIX: pass the GeoJSON dict (zipjson) to `geojson=` — the original passed the
# raw GeoDataFrame and left zipjson unused, unlike every other map cell.
fig = px.choropleth_mapbox(df, geojson=zipjson, locations='geoid', featureidkey="properties.geoid", color='total_patrons',
                           color_continuous_scale="Viridis",
                           mapbox_style="open-street-map",
                           zoom=13, center = {"lat": 42.3736, "lon": -71.1097},
                           opacity=0.75,
                           labels={'total_patrons':'Number of Patrons','tracts':'Census Tract'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

pio.write_html(fig, file='/Users/Kate/Desktop/geopatTest.html', auto_open=True)

Next I tested out Plotly's dropdown feature. You can click the image to go to the full, working map. But just know that it might take a while to load. There's a lot of info there!

import geopandas as gpd
import plotly.express as px
import plotly.io as pio
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import json


# Massachusetts ZIP-code polygons, reprojected to WGS84.
zipfile = "zip:///Users/Kate/Desktop/zipcodes_ma.zip"
zipcodes = gpd.read_file(zipfile).to_crs("EPSG:4326")

df = pd.read_csv("/Users/Kate/Desktop/zipdata.csv", dtype={'POSTCODE':str})

zipcodes = pd.merge(zipcodes, df, on='POSTCODE', how="inner")

zipjson = json.loads(zipcodes.to_json())

# One choropleth layer per language; a dropdown button switches which layer
# is visible (only the first starts visible).
cols_dd = ['Arabic', 'Armenian', 'Bengali']

visible = np.array(cols_dd)

traces = []
buttons = []

for idx, col in enumerate(cols_dd):
    hover = ("<b>" + zipcodes['Town'] + "</b><br>" +
             "Number of checkouts: " + zipcodes[col].astype(str))
    trace = go.Choropleth(
        locations=zipcodes.index,
        geojson=zipjson,
        z=zipcodes[col],
        colorbar_title=col,
        colorscale='deep_r',
        hovertemplate=hover,
        visible=(idx == 0),
    )
    traces.append(trace)
    # Each button shows exactly one layer and retitles the figure to match.
    mask = list(visible == col)
    buttons.append(dict(label=col, method='update',
                        args=[{'visible': mask}, {'title': f'<b>{col}</b>'}]))

updatemenus = [{'active': 0, 'buttons': buttons}]

fig = go.Figure(data=traces, layout=dict(updatemenus=updatemenus))

first_title = cols_dd[0]
fig.update_layout(title=f'<b>{first_title}</b>', title_x=0.5, geo=dict(scope='usa'))

fig.update_geos(fitbounds="locations", visible=True)

pio.write_html(fig, file='/Users/Kate/Desktop/Language_Maps/dropdowntest.html', auto_open=True)

Dropdown test map.

This was my basemap test. We were looking at how well our Takeout Tech (hotspots, chromebooks) circulated.

import json
import pandas as pd
import plotly.express as px
import geopandas as gpd
import plotly.io as pio

# Massachusetts census-tract polygons (2020 TIGER/Line), reprojected to WGS84.
zipfile = "zip:///Users/Kate/Desktop/tl_2020_25_tract10.zip"
tracts = gpd.read_file(zipfile).to_crs("EPSG:4326")

# Five weeks of Takeout Tech checkouts, aggregated by census tract.
df = pd.read_csv("/Users/Kate/Desktop/5weekTech.csv", dtype={'TRACT':str})

# Inner merge keeps only tracts that actually appear in the checkout data.
tracts = pd.merge(tracts, df, left_on='TRACTCE10', right_on='TRACT', how="inner")

zipjson = json.loads(tracts.to_json())

# Shade each tract by the 'All' (total checkouts) column over an
# OpenStreetMap basemap centered on Cambridge.
fig = px.choropleth_mapbox(
    df,
    geojson=zipjson,
    locations='TRACT',
    featureidkey="properties.TRACT",
    color='All',
    color_continuous_scale="Viridis",
    mapbox_style="open-street-map",
    zoom=12,
    center={"lat": 42.3736, "lon": -71.1097},
    opacity=0.75,
)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

pio.write_html(fig, file='/Users/Kate/Desktop/TakeoutTechAll.html', auto_open=True)

And finally, a test of subplots looking at Takeout Tech circulation over a few months.

import json
import pandas as pd
import geopandas as gpd
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Massachusetts census-tract polygons (2020 TIGER/Line), reprojected to WGS84.
zipfile = "zip:///Users/Kate/Desktop/tl_2020_25_tract10.zip"
tracts = gpd.read_file(zipfile).to_crs("EPSG:4326")

# Monthly Takeout Tech checkouts by tract; one column per month.
df = pd.read_csv("/Users/Kate/Desktop/CObySCATbyTract8-4-21ToT.csv", dtype={'Tract':str})


tracts = pd.merge(tracts, df, left_on='TRACTCE10', right_on='Tract', how="inner")

zipjson = json.loads(tracts.to_json())


# One mini-map per month in a 2x2 grid.  All four share zmin/zmax so the
# color scales are directly comparable.  (Refactored: the original built the
# four identical traces and twelve mapbox layout kwargs by copy-paste.)
months = ("April", "May", "June", "July")

fig = make_subplots(
    rows=2, cols=2, subplot_titles=months,
    specs=[[{"type": "choroplethmapbox"}, {"type": "choroplethmapbox"}],
           [{"type": "choroplethmapbox"}, {"type": "choroplethmapbox"}]],
    vertical_spacing=0.1)

for i, month in enumerate(months):
    trace = go.Choroplethmapbox(
        geojson=zipjson, locations=df.Tract, featureidkey="properties.Tract",
        z=df[month],
        colorscale="viridis", zmin=0, zmax=35,
        marker_opacity=0.7, marker_line_width=1,
        showlegend=False, showscale=False)
    # Row-major placement: April/May on top, June/July below.
    fig.add_trace(trace, row=i // 2 + 1, col=i % 2 + 1)

# Each subplot owns its own mapbox (mapbox, mapbox2, ...); give them all the
# same basemap, zoom, and center.
center = {"lat": 42.378418, "lon": -71.109544}
layout = {}
for n in range(1, len(months) + 1):
    key = "mapbox" if n == 1 else f"mapbox{n}"
    layout[f"{key}_style"] = "open-street-map"
    layout[f"{key}_zoom"] = 11.5
    layout[f"{key}_center"] = center
fig.update_layout(**layout)


pio.write_html(fig, file='/Users/Kate/Desktop/TakeoutTechMonths.html', auto_open=True)


Thanks for reading! If you have any comments or questions, feel free to email me at kate@kate-wolfe.com.