import numpy as np
import pandas as pd
import zipfile_deflate64 as zipfile
import csv
from io import TextIOWrapper, BytesIO
import regex as re
from tqdm import tqdm
import glob
import geopandas as gpd
import fuzzymatcher
import recordlinkage
import seaborn as sns
import gc
from numba import njit

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',30)
tqdm.pandas()


                    voters = zipfile.ZipFile('AllNYSVoters_20220711.zip')
test_df = pd.DataFrame()

def read_in_chunks(file_object, chunk_size=1024):
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    while True:
        data = file_object.readlines(chunk_size)
        if not data:
            break
        yield data
        

def process_csv_chunk(chunk, column_names, filter_col = None, filter_list = None):
    chunk_data = [row.decode('latin') for row in chunk]
    chunk_reader = csv.reader(chunk_data)
    
    interim_df = pd.DataFrame(data=[line for line in chunk_reader], columns=column_names)
    
    #Create NYC Electoral District number by appending Assembly Dist. and Electoral Dist. fields
    interim_df['FULL_ED'] = interim_df['AD']+interim_df['ED'].apply(lambda x: str(x).zfill(3))
    
    if filter_col is not None:
        interim_df = interim_df[interim_df[filter_col].apply(lambda x: str(x).replace('"','') in filter_list)]
    
    return interim_df


                    #Columns entered manually from data dictionary packaged with voter file
cols = ['LASTNAME','FIRSTNAME','MIDDLENAME','NAMESUFFIX',
        'RADDNUMBER','RHALFCODE','RPREDIRECTION','RSTREETNAME',
        'RPOSTDIRECTION', 'RAPARTMENTTYPE', 'RAPARTMENT', 'RADDRNONSTD','RCITY','RZIP5','RZIP4',
        'MAILADD1','MAILADD2','MAILADD3','MAILADD4',
        'DOB','GENDER','ENROLLMENT','OTHERPARTY',
        'COUNTYCODE','ED','LD','TOWNCITY','WARD','CD','SD','AD',
        'LASTVOTEDDATE','PREVYEARVOTED','PREVCOUNTY','PREVADDRESS','PREVNAME',
        'COUNTYVRNUMBER','REGDATE','VRSOURCE','IDREQUIRED','IDMET',
        'STATUS','REASONCODE','INACT_DATE','PURGE_DATE','SBOEID','VoterHistory']

#List of NYCHA electoral districts obtained using QGIS, by intersection of NYCHA developments layer
#with ED layer, and converted to string for use in filtering. Read in NYCHA election districts from various 
#versions of the district shapefile (21D through 22A2) to account for shifting district lines 
#post-redistricting and lag in SBOE data updates

#In dataset, field 'FULL_ED' is obtained by concatenating assembly district and local ED (zero-padded 
#to three characters)

    
filelist = glob.glob('2022_EDs/*.xlsx')

distlist = pd.DataFrame()

for file in filelist:
    newlist = pd.read_excel(file)
    distlist = distlist.append(newlist)
    
nycha_distlist_2022 = set(distlist['ElectDist'].astype(str).unique())

#If isolating all NYC registered voters for purposes of comparison, the appropriate filter column
#is COUNTYCODE, and the appropriate filter set is:
nyc_counties = set(['31', '24', '41', '03', '43'])


                    with voters.open('AllNYSVoters_20220711.txt','r') as file_handle:
    #cols_line = file_handle.readline().decode('utf-8')
    #cols = [re.sub(r'[\s\'\"]+','',col).strip() for col in cols_line.split(',')]
    #print(cols)
    base_df = pd.DataFrame(columns=cols)
    n=0
    
    for chunk in tqdm(read_in_chunks(file_handle, chunk_size=10000000)):
        n+=1
        if n%20 == 0:
            if base_df.shape[0] > 0:
                base_df.reset_index().drop(columns='index').to_json(f'clean_segments/iter_{str(n).zfill(3)}.json')
                base_df = pd.DataFrame(columns=cols)
                
        base_df = pd.concat([base_df, process_csv_chunk(chunk, cols, filter_col='FULL_ED', filter_list = nycha_distlist_2022)], axis=0)
    
    #To account for remaining rows after last output
    base_df.reset_index().drop(columns='index').to_json(f'clean_segments/iter_FINAL.json')


                    caches = glob.iglob('clean_segments/*')

full_df = pd.DataFrame(columns=cols)

for file in tqdm(caches):
    segment_df = pd.read_json(file)
    full_df = pd.concat([full_df, segment_df], axis=0)


                    county_to_borough_dict = {31: 'MANHATTAN',
                          24: 'BROOKLYN',
                          41: 'QUEENS',
                          3:  'BRONX',
                          43: 'STATEN ISLAND'}
full_df['BOROUGH'] = full_df['COUNTYCODE'].apply(lambda x: county_to_borough_dict[x])

for col in ['RADDNUMBER','RSTREETNAME','BOROUGH']:
    full_df[col] = full_df[col].apply(lambda x: str(x).strip())

def get_mod_address_from_row(row):
    address_string = ''
    for col in ['RADDNUMBER','RPREDIRECTION','RSTREETNAME','BOROUGH']:
        if row[col] is not None:
            if col == 'RPREDIRECTION':
                if str(row[col]).strip() != 'nan':
                    address_string = address_string+str(row[col]).strip()
                else:
                    pass
                
            else:
                address_string = address_string+str(row[col]).strip()
                
            if col in ['RSTREETNAME']:
                address_string = address_string+', '
            else:
                address_string = address_string+' '
    
    return(address_string.strip()+', NY')
            
full_df['MOD_ADDRESS'] = full_df.progress_apply(lambda row: get_mod_address_from_row(row), axis=1)


                    full_df.reset_index().drop(columns='index').to_csv('NYCHA_DISTRICT_RECORDS_2022.csv')


                    gc.collect()


                    #The following files and associated documentation are found at:
#https://www1.nyc.gov/site/planning/data-maps/open-data.page#pad
#If direct link fails, search "Property Address Directory" on linked page

bbl = pd.read_csv('pad22a/bobabbl.txt')
addr = pd.read_csv('pad22a/bobaadr.txt')
snd = pd.read_fwf('pad22a/snd22Acow.txt', widths=[1,1,32,1,1,1,5,2,3,2,1,1,2,32,2,20,1,92])
snd.columns = ['rectype','boro','stname','primary_flag','principal_flag','boro1','sc5','lgc','spv','filler','numeric_ind','GFT','len_full_name','full_stname','min_SNL','stn20','ht_name_type_code','filler']


                    #bl_guide: contains official NYCHA stairhall addresses and associated BBL values
bl_guide = pd.read_csv('stairhalls.csv')
bl_guide.columns = [col.upper() for col in bl_guide.columns]


                    boro_num_dict = {'MANHATTAN': 1, 'BRONX': 2, 'BROOKLYN': 3, 'QUEENS': 4, 'STATEN ISLAND': 5}
bl_guide['BORO'] = bl_guide['BOROUGH'].apply(lambda x: boro_num_dict[x])
bl_guide['TDS_NUM'] = bl_guide['LOCATION'].apply(lambda x: int(x[:3]))


                    #Match building BBLs to Property Address Directory BBL table
nycha_bbl = bbl.merge(bl_guide, how='outer', left_on=['boro','block','lot'], right_on=['BORO','BLOCK','LOT'], indicator=True)
nycha_bbl = nycha_bbl[nycha_bbl['_merge'] == 'both']
nycha_bbl = nycha_bbl[['TDS_NUM','boro','block','lot','numaddr', 'numbf']].drop_duplicates()


                    #Merge NYCHA BBLs one-to-many to PAD address table
nycha_addr = nycha_bbl.merge(addr, how='outer', on=['boro','block','lot'])
nycha_addr = nycha_addr[nycha_addr['TDS_NUM'].apply(lambda x: not pd.isna(x))]
nycha_addr = nycha_addr.merge(snd, on=['boro','sc5'], how='inner')


                    #Screen out invalid address numbers/address ranges (those with GARAGE, etc.)

def is_valid_address_number(value):
    test_value = None
    try:
        test_value = int(str(value))
    except:
        return False
    
    if isinstance(test_value, int):
        return True

nycha_addr_ranges = nycha_addr[nycha_addr['lhnd'].apply(lambda x: is_valid_address_number(x))]
nycha_addr_ranges = nycha_addr_ranges[['TDS_NUM','boro','block','lot',
                                       'lhnd','lsos','hhnd','hsos','parity', 'stname_x', 'stname_y','addrtype','GFT']]


                    #Create unique row for each address number in matching address ranges

def expand_address_range(row):
    start = int(row['lhnd'])
    end = int(row['hhnd'])+1
    parity = row['parity']
    
    if parity == 0:
        return row
    else:
        address_range = list(range(start, end, 2))
    
    expanded_df = pd.DataFrame([row]*len(address_range), index=pd.RangeIndex(len(address_range)))
    expanded_df['HOUSE_NUM'] = None
    
    for i in range(len(address_range)):
        expanded_df.at[i,'HOUSE_NUM'] = address_range[i]
        
    return(expanded_df)


                    nycha_addresses_expanded = pd.DataFrame()

for row in tqdm(nycha_addr_ranges.drop_duplicates().iterrows()):
    nycha_addresses_expanded = nycha_addresses_expanded.append(expand_address_range(row[1]))


                    #Add borough names and standardize addresses
boro_num_to_name_dict = {1: 'MANHATTAN',
                         2: 'BRONX',
                         3: 'BROOKLYN',
                         4: 'QUEENS',
                         5: 'STATEN ISLAND'}

nycha_addresses_expanded['BOROUGH'] = nycha_addresses_expanded['boro'].apply(lambda x: boro_num_to_name_dict[x])
nycha_addresses_expanded['STREET_NAME_STD'] = nycha_addresses_expanded['stname_y'].apply(lambda x: re.sub(r'(?<=[\d]+)((TH)|(RD)|(ST)|(ND))', '', x.strip())).apply(lambda x: re.sub(r'[\s]+', ' ', x))
nycha_addresses_expanded['FULL_ADD'] = nycha_addresses_expanded['HOUSE_NUM'].apply(lambda x: str(x).strip())+' '+nycha_addresses_expanded['STREET_NAME_STD']+', '+nycha_addresses_expanded['BOROUGH']+', NY'


                    nycha_addr_full = nycha_addresses_expanded[['TDS_NUM', 'HOUSE_NUM', 'STREET_NAME_STD', 'BOROUGH', 'FULL_ADD']]


                    #Supplement with NYCHA addresses from PTAD shapefile (which lacks many intermediate address nums)
nycha_addr_official = gpd.read_file('STD_NYCHA/NYCHA_Address_Points.geojson')
nycha_addr_official['STREET_NAME_STD'] = nycha_addr_official['STREET'].apply(lambda x: re.sub(r'(?<=[\d]+)((TH)|(RD)|(ST)|(ND))', '', x))
nycha_addr_official['FULL_ADD'] = nycha_addr_official['HOUSE_NUM']+' '+nycha_addr_official['STREET_NAME_STD']+', '+nycha_addr_official['BOROUGH']+', NY'


                    nycha_addr_full = nycha_addr_full.append(nycha_addr_official[['TDS_NUM', 'HOUSE_NUM', 'STREET_NAME_STD', 'BOROUGH', 'FULL_ADD']])

nycha_addr_full = nycha_addr_full.drop_duplicates(['HOUSE_NUM', 'STREET_NAME_STD', 'BOROUGH'])
nycha_addr_full = nycha_addr_full.reset_index().drop(columns='index')


                    #Add St. Nicholas Ave addresses
stn_ave_addr = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'SAINT NICHOLAS' in x)].copy()
stn_ave_addr['STREET_NAME_STD'] = 'ST NICHOLAS AVENUE'

#Add 7th Ave addresses
seventh_ave_addr = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'POWELL BOULEVARD' in x)].copy()
acp_addr = seventh_ave_addr.copy()

seventh_ave_addr['STREET_NAME_STD'] = '7 AVENUE'
acp_addr['STREET_NAME_STD'] = 'ADAM CLAYTON POWELL BOULEVARD'

#Add 8th Ave addresses
eighth_ave_addr = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'FREDERICK DOUGLASS' in x)].copy()
eighth_ave_addr['STREET_NAME_STD'] = '8 AVENUE'

#Misc...
w27 = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'WEST 27 DRIVE' in x)].copy()
w27['STREET_NAME_STD'] = 'WEST 27 STREET DRIVE'

ua1 = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'UNIVERSITY AVENUE' in x)].copy()
ua2 = ua1.copy()
ua1['STREET_NAME_STD'] = 'DR MARTIN L KING JR BOULEVARD'
ua2['STREET_NAME_STD'] ='DR M L KING JR BOULEVARD'

gb = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'GUY BREWER' in x)].copy()
gb['STREET_NAME_STD'] = 'GUY R BREWER BOULEVARD'

ftw = nycha_addr_full[nycha_addr_full['STREET_NAME_STD'].apply(lambda x: 'FORT WASHINGTON' in x)].copy()
ftw['STREET_NAME_STD'] = 'FT WASHINGTON AVENUE'


                    #Put it all together...
new_addr = pd.DataFrame()

for df in [stn_ave_addr, seventh_ave_addr, acp_addr, eighth_ave_addr, w27, ua1, ua2, gb, ftw]:
    new_addr = new_addr.append(df)
    
nycha_addr_full = nycha_addr_full.append(new_addr).reset_index().drop(columns='index')


                    nycha_addr_full.to_csv('NYCHA_addr_full_OFFICIAL_PLUS_BBL.csv')


                    #Load voter records from NYCHA electoral districts isolated above, alongside addresses
nycha_addr_full = pd.read_csv('NYCHA_addr_full_OFFICIAL_PLUS_BBL.csv', index_col=0)
for col in nycha_addr_full.columns:
    nycha_addr_full[col] = nycha_addr_full[col].apply(lambda x: re.sub(r'[\s]+', ' ', str(x)).strip())

nycha_addr_full['TDS_NUM'] = nycha_addr_full['TDS_NUM'].astype(float).astype(int)

full_df = pd.read_csv('NYCHA_DISTRICT_RECORDS_2022.csv', index_col = 0)
for col in ['RADDNUMBER', 'RSTREETNAME', 'BOROUGH']:
    full_df[col] = full_df[col].apply(lambda x: re.sub(r'[\s]+', ' ', str(x)).strip())


                    indexer = recordlinkage.Index()
indexer.block(left_on=['BOROUGH','RADDNUMBER'], right_on=['BOROUGH','HOUSE_NUM'])

candidates = indexer.index(full_df, nycha_addr_full)


                    compare = recordlinkage.Compare()

compare.string('RSTREETNAME','STREET_NAME_STD', threshold=1, label = 'STREET_NAME')


                    #Obtain matching index pairs
features = compare.compute(candidates, full_df, nycha_addr_full)
print(features.shape)


                    #Fetch information on matches for validation
match_candidates = {'FULL_DF_MOD_ADDRESS':[], 'NYCHA_ADDRESSES_FULL_ADD':[], 'FULL_DF_IND':[], 'NYCHA_ADDRESS_IND':[]}

for row in tqdm(features[features['STREET_NAME']>0].reset_index().iterrows()):
    row_values = row[1]
    full_add_ind = int(row_values['level_0'])
    nycha_add_ind = int(row_values['level_1'])
    
    match_candidates['FULL_DF_MOD_ADDRESS'].append(full_df.iloc[full_add_ind]['MOD_ADDRESS'])
    match_candidates['FULL_DF_IND'].append(full_add_ind)
    match_candidates['NYCHA_ADDRESSES_FULL_ADD'].append(nycha_addr_full.iloc[nycha_add_ind]['FULL_ADD'])
    match_candidates['NYCHA_ADDRESS_IND'].append(nycha_add_ind)


                    match_df = pd.DataFrame(match_candidates)
match_df.to_csv('_match_candidates_2022.csv')


                    match_df = pd.read_csv('_match_candidates_2022.csv')


                    #Identify matched rows in voter file
matched_voter_indices = set(match_df['FULL_DF_IND'])

full_df_ind = full_df.copy()
full_df_ind['_match'] = full_df.reset_index()['index'].progress_apply(lambda x: x in matched_voter_indices)


                    match_mapper = {}

for row in tqdm(match_df.iterrows()):
    row_data = row[1]
    match_mapper[row_data['FULL_DF_IND']] = row_data['NYCHA_ADDRESS_IND']


                    def get_matched_dev_info(base_df, mapper, supp_df):
    supp_dict = {'VOTER_IND':[], 'NYCHA_IND':[], 'TDS':[], 'NYCHA_ADD':[]}
    
    matched_df = base_df.reset_index()
    matched_df = matched_df[matched_df['_match']]
    
    matched_set = set(matched_df['index'])
    unmatched_set = set(base_df.reset_index()['index'])-matched_set
    
    for row in tqdm(matched_df.iterrows()):
        try:
            index = row[1]['index']
            supp_ind = mapper[index]
            matching_row = supp_df.iloc[supp_ind]

            supp_dict['VOTER_IND'].append(index)
            supp_dict['NYCHA_IND'].append(supp_ind)
            supp_dict['TDS'].append(matching_row['TDS_NUM'])
            supp_dict['NYCHA_ADD'].append(matching_row['FULL_ADD'])
        except:
            pass

    for index in unmatched_set:
        supp_dict['VOTER_IND'].append(index)
        for col in ['NYCHA_IND','TDS','NYCHA_ADD']:
            supp_dict[col].append(None)
            
    return supp_dict

supp_nycha_info = get_matched_dev_info(full_df_ind, match_mapper, nycha_addr_full)
supp_nycha_df = pd.DataFrame(supp_nycha_info)


                    full_df_supp = full_df_ind.merge(supp_nycha_df, left_index=True, right_on = 'VOTER_IND')


                    nycha_voter_set = full_df_supp[full_df_supp['_match']]
nycha_voter_set = nycha_voter_set.drop(columns=[col for col in nycha_voter_set.columns if 'Unnamed' in col])
nycha_voter_set.to_csv('NYCHA_voter_set_strict.csv')


                    nycha_voter_set = pd.read_csv('NYCHA_voter_set_strict.csv', index_col=0)

#Remove purged voter records
nycha_voter_set = nycha_voter_set[nycha_voter_set['STATUS'] != 'P']


                    nycha_voter_set['ElecList'] = nycha_voter_set['VoterHistory'].progress_apply(lambda x: set([re.sub(r'\(\w+\)', '', name).strip() for name in str(x).split(';')]))
#nycha_voter_set['ElecList'] = nycha_voter_set['VoterHistory'].apply(lambda x: set(name.strip() for name in str(x).split(';')))

nycha_voter_set = nycha_voter_set[['LASTNAME','FIRSTNAME','FULL_ED','LD','CD','SD','AD','BOROUGH','TDS','ElecList']]
nycha_voter_set['_counter'] = 1


                    #Add city council district info by ED
ed_to_council = pd.read_csv('ED22A1_to_Council_District.csv').set_index('ElectDist')
error_eds = []

#@njit
def get_cd(ed_num, df):
    try:
        cd_num = df.loc[ed_num]['coun_dist']
        return cd_num
    
    except:
        error_eds.append(ed_num)
        
    
nycha_voter_set['Council_Dist'] = nycha_voter_set['FULL_ED'].progress_apply(lambda x: get_cd(int(x), ed_to_council))


                    #Group election identifiers and define election types
ID_dict = {'2007GE':set(['20071106 GE', '20071105 GE', '20071106 ??', 'GENERAL 2007', 'GENERAL ELECTION 2007', '2007 GENERAL  ELECTION', '2007 General Election', '2007 GENERAL ELECTION', 'General Election, 2007', '07 GENERAL ELECTION', '20071006 GE', 'General Election 2007', '2007 GENERAL']),
           '2007PR':set(['20070918 PR','PRIMARY ELECTION 2007', '2007 Primary Election']),
           '2007SP':['Special Election 2007'],
           '2008GE':set(['20081104 GE', '2008 GENERAL ELECTION', 'GENERAL 2008', '2008 General Election', 'GENERAL ELECTION 2008','General Election, 2008', '20081104 PR', '20081114 PR','20081111 GE', '2008 General', 'General Election 2008', '20081028 GE','20080909 GE', '20081110 GE', '20081104', '20081120']),
           '2008PP':set(['200802`0 PP','08 PRESIDENTIAL PRIMARY','20080205 PP', '20080205 PR', 'PRESIDENTIAL PRIMARY 2008', '2008 PRESIDENTIAL PRIMARY', '20080205 ??', 'Presidential Primary, 2008', '2008 Presidential Primary', 'Presidential Primary 2008', '2008 Presidential Primary Election', '20080205 SP']),
           '2008PR':set(['20080909 PR', '20080909 PP', '20080911 PR','20080903 PR', '20080902 PR', '20080916 PR', '20080914 PR', '20080913 PR', '20080914', '20080909 SP', '20080912 PR']),
           '2009GE':set(['20091103 GE', 'GE 20091103', '2009 GENERAL ELECTION', 'General Election, 2009', 'GENERAL ELECTION 2009', '2009 General Election', 'GENERAL 2009', '20091103 RO', '20091103 SP']) ,
           '2009RO':set(['20090929 RO', '20090930 RO', '20090929 SP', '20090929 PR', '20090929 GE', '20090929 Run Off', '20090930 PR', '20090922 RO', '20090928 SP']),
           '2009PR':set(['2009 PRIMARY ELECTION','09 PRIMARY ELECTION', '20090915 PR', 'Primary Election, 2009', 'PRIMARY 2009', '20090915 GE', '20091015 PR','20090915 RO', '20090908 PR']),
           '2009SP_0915':set(['20090915 SP']),
           '2009SP_0606':set(['20090602 SP']),
           '2009SP_0421':set(['20090421 SP']),
           '2009SP_0224':set(['20090224 SP']),
           '2010GE':set(['20101103 GE', '10 GENERAL ELECTION', '20101102 GE', 'GE 20101102', '2010 GENERAL ELECTION', 'General Election, 2010', '2010 General Election', 'GENERAL ELECTION 2010', 'GENERAL 2010', 'General Election 2010']),
           '2010PR':set(['Primary Election, 2010', '10 PRIMARY ELECTION', '20100914 PR', '2010 Primary Election', '2010 PRIMARY ELECTION', 'PRIMARY 2010']),
           '2010SP_0323':['20100323 SP'],
           '2010SP_0316':['20100316 SP'],
           '2010SP_0209':['20100209 SP'],
           '2011GE':set(['20111105 GE', '20111108 GE', '2011 General Election', '2011 GENERAL ELECTION', 'GENERAL 2011', 'General Election, 2011', 'GENERAL ELECTION 2011', '20111106 GE', '11 GENERAL  ELECTION']),
           '2011PR':set(['PR 20110913', '2011 PRIMARY ELECTION', 'PRIMARY ELECTION 2011', '20110913 PR', 'PRIMARY 2011', '2011 Primary Election', '11 PRIMARY ELECTION']),
           '2011SP_0913':['20110913 SP', '2011 Special Election 26CD'],
           '2012GE':set(['20121106 GE','2012 General Election', 'GENERAL 2012', '2012 GENERAL ELECTION', 'GENERAL ELECTION 2012', 'General Election, 2012', '12 GENERAL  ELECTION', 'General Election 2012', 'GE 20121106']),
           '2012PR_0913':set(['20120913 PR']),
           '2012PR_0620':set(['20120626 PR', '2012 PRIMARY ELECTION FEDERAL']), 
           '2012PP':set(['20120424 PP','2012 Presidential Primary']),
           '2012SP_0320':['20120320 SP'],
           '2013GE':set(['GE 20131105', '2013 GENERAL ELECTION', 'GENERAL 2013', '2013 General Election', 'GENERAL ELECTION 2013', 'General Election, 2013', '13 GENERAL  ELECTION', 'General Election 2013', '20131105 GE']),
           '2013PR':set(['13 PRIMARY ELECTION', 'PR 20130910', '20131001 PR', 'PRIMARY 2013', '2013 PRIMARY ELECTION', 'Primary Election, 2013', '2013 Primary Election', 'PRIMARY ELECTION 2013', '20130910 PR', '20130910 RO','20130901 PR']),
           '2013RO':set(['20131001 RO', 'RO 20131001', '20131001 PP','20130110']),
           '2013SP_0219':set(['20130219 SP', '20130219 GE']),
           '2014GE':set(['2014 GENERAL ELECTION', 'General Election, 2014', '20141104 GE', 'GE 20141104', '2014  GENERAL ELECTION', '2014 General Election', 'GENERAL ELECTION 2014', 'GENERAL 2014', '14 GENERAL  ELECTION', 'General Election 2014']), 
           '2014PR_0909':set(['20140909 PR', 'PR 20140909', '2014 PRIMARY STATE and LOCAL', '2014 Primary Election (State & Local)']),
           '2014PR_0624':set(['20140624 PR', 'PR 20140624', '2014CONGRESSIONAL PRIMARY']),
           '2015GE':set(['GE 20151103', '2015 General Election', '2015  GENERAL ELECTION', 'GENERAL 2015', '20151108 GE', '2015 GENERAL ELECTION', 'General Election, 2015', '15 GENERAL ELECTION', '20151103 GE']),
           '2015PR':set(['2015 Primary Election', '20150910 PR', 'PRIMARY 2015']),
           '2015SP_0505':set(['20150505 SP', 'SP 20150505']),
           '2016GE':set(['20161108 GE', '20161108 PR', '20161108 SP', 'GE 20161108', 'GENERAL ELECTION 2016', 'GENERAL 2016', '2016 General Election', 'General Election, 2016', '2016 GENERAL ELECTION', 'General Election 2016', '16 GENERAL ELECTION', '2008 General', 'General Election 2008', '20161107 GE', '20081104', '20161103 GE']),
           '2016PR_0913':set(['PR 20160913', '20160913 PR', 'PR 20160913', ]),
           '2016PR_0628':set(['20160628 PP', '20160628 PR', 'PR 20160628', '20160628 SP', 'Federal Primary, 2016']), 
           '2016PP':set(['Presidential Primary 2016', 'Presidential Primary Election 2016', '16 PRESIDENTIAL PRIMARY', '20160419 PP', 'PP 20160419', '2016 Presidential Primary', 'PRESIDENTIAL PRIMARY ELECTION 2016',  '20160412 PR', '20160419 PR', 'PRESIDENTIAL PRIMARY 2016', 'Presidential Primary, 2016', '2016 PRESIDENTIAL PRIMARY', '2016 Presidential Primary Election', ]), 
           '2016SP_0419':set(['20160419 SP', 'SP 20160419', '2016 SD 9 SPECIAL']),
           '2016SP_0213':set(['20160223 SP']),
           '2017GE':set(['20171107 GE', 'GE 20171107','2017 General Election', 'GENERAL ELECTION 2017', 'General Election, 2017', 'GE 20171107', '17 GENERAL ELECTION', 'GENERAL 2017', '2017 GENERAL ELECTION', '20171107 GE']),
           '2017PR':set(['PR 20170912','20170912 RO', 'PR 20170912', '20170926 PR', 'PRIMARY ELECTION 2017', 'PRIMARY 2017', '17 PRIMARY ELECTION', '20170912 SP', '20170912 PR']),
           '2017SP_0524':['20170523 SP'],
           '2017SP_0214':['20170214 SP'],
           '2018GE':set(['2018 General Election', 'General Election, 2018', 'GE 20181204', 'GENERAL 2018', 'GENERAL ELECTION 2018', '18 GENERAL ELECTION', 'General Election 2018', '2018 GENERAL ELECTION', '20181106 GE', 'GE 20181106']),
           '2018PR_0913':set(['20180913 PR','PR 20180913','Primary Election, 2018', '18 PRIMARY ELECTION', 'PRIMARY 2018', 'PRIMARY ELECTION 2018', '2018 Primary Election']),
           '2018PR_0626':set(['20180626 PR','PR 20180626']),
           '2018SP_0424':set(['20180424 SP', 'SP 20180424']),
           '2019GE':set(['GENERAL 2019','GE 20191105','2019 General Election', '20191105 GE', '2019 GENERAL ELECTION', 'GENERAL ELECTION 2019']),
           '2019PR':set(['PR 20190625', '20190625 PR']),
           '2019SP_0514':set(['20190514 SP', 'SP 2019-05-']),
           '2019SP_0226':set(['20190226 SP', 'SP 20190226', '20190226 GE']),
           '2020GE':set(['GE 20201103','General Election, 2020', '2020 GENERAL ELECTION', '20201103 GE', 'GENERAL ELECTION 2020', 'GENERAL 2020', 'GENERAL ELECTION 2020', 'General Election 2020', '2020 General Election']),
           '2020PR':set(['PRIMARY ELECTION 2020', '2020 JUNE PRIMARY', '2020 Presidential Primary Election','20200623 PR', 'PP 20200623', '20200623 PP']),
           '2020SP_1222':set(['SP 20201222', 'SP 20200202', '20201222 SP']),
           '2021GE':set(['GE 20211102','20211102 GE', ]),
           '2021PR':set(['PR 20210622', 'PRIMARY 2021', '20210622 PR']),
           '2021SP_1102':set(['SP 20211102','20211102 GE']),
           '2021SP_0323':set(['20210323 SP','SP 20210323']),
           '2021SP_0223':set(['20210223 SP','SP 20210223']),
           '2021SP_0202':set(['SP 20210202','20210202 SP']),
           '2022PR':set(['20220628 PR','PR 20220628']),
           '2022SP_0524':set(['20220524 SP','SP 20220524']),
           '2022SP_0322':set(['20220322 SP']),
           '2022SP_0215':set(['20220215 SP','SP 20220215']),
           '2022SP_0118':set(['GE 20220118','20220118 SP']),
            '_AMBIG':set(['08 PRIMARY ELECTION', '14 PRIMARY ELECTION', 'PRIMARY 2014', '12 PRIMARY ELECTION', 'Primary Election, 2012', '2012 PRIMARY ELECTION', '2012 Primary Election','Primary Election, 2008', '2008 PRIMARY ELECTION', 'Special Election, 2009', 'SPECIAL ELECTION 2009', '20110502 GE'])}


                    #Standardize election identifiers
def get_common_id(x, id_dict):
    for key, value in id_dict.items():
        if x in value:
            return key
    
    return None

nycha_voter_set['Clean_Elec_List'] = nycha_voter_set['ElecList'].progress_apply(lambda lst: set([get_common_id(item, ID_dict) for item in lst]))


#Create dummy columns where 1 = Voted
for col in tqdm(ID_dict.keys()):
    nycha_voter_set[col] = nycha_voter_set['Clean_Elec_List'].apply(lambda x: 1 if col in x else 0)


                    #Write cleaned set
nycha_voter_set.reset_index().drop(columns=['index']).to_csv('Prepared_NYCHA_Voter_Set_2022.csv')


                    #Reload fully filtered and prepared voter dataset
nycha_voter_set = pd.read_csv('Prepared_NYCHA_Voter_Set_2022.csv', index_col=0)


                    #Produce average turnout at NYCHA developments by election, by borough
boro = nycha_voter_set.groupby('BOROUGH').mean()[nycha_voter_set.columns[13:]]

#...by assembly district:
ad = nycha_voter_set.groupby('AD').mean()[nycha_voter_set.columns[13:]]

#...by state senate district:
sd = nycha_voter_set.groupby('SD').mean()[nycha_voter_set.columns[13:]]

#...by congressional district:
cd = nycha_voter_set.groupby('CD').mean()[nycha_voter_set.columns[13:]]

#...by city council district:
council = nycha_voter_set.groupby('Council_Dist').mean()[nycha_voter_set.columns[13:]]

#...by development, categorized by borough:
dev = nycha_voter_set.groupby(['BOROUGH','TDS']).mean()[nycha_voter_set.columns[13:]]


                    pd.DataFrame(boro.loc['BROOKLYN']).reset_index().tail(30)


                    #Import development info for filtering and display
dev_info = pd.read_csv('overview_table_data.csv')
dev_info = dev_info[['TDS','DEV_NAME','CONS_TDS','CONS_NAME','GEO_BORO']]

#Pull in population data
def get_latest_records(df, group_col, date_col):
    new_df = pd.DataFrame(columns=df.columns)
    
    for group in df.groupby(group_col):
        new_df = pd.concat([new_df, pd.DataFrame(group[1].sort_values(date_col, ascending=False).iloc[0]).transpose()], axis=0)
        
    return new_df

pop_facts = pd.read_excel('Dev_Population_Facts.xlsx')
recent_pop = get_latest_records(pop_facts, 'TDS_NUMBER', 'MONTH_ID')
recent_pop['18_PLUS'] = recent_pop['POPULATION']-recent_pop['MINORS_UNDER_18']

dev_info = dev_info.merge(recent_pop[['TDS_NUMBER','18_PLUS','MONTH_ID']], left_on='TDS', right_on='TDS_NUMBER', how='left')


                    #Merge dev data with voter records
rates_by_dev_enriched = dev_info.merge(dev.reset_index(), how='right', left_on='TDS', right_on='TDS').reset_index(drop=True)
rates_by_dev_enriched = rates_by_dev_enriched.sort_values(['GEO_BORO','CONS_TDS','TDS']).set_index(['GEO_BORO','CONS_TDS','TDS'])


                    elec_type_dict = {'GENERAL': {'2007GE',
                              '2008GE',
                              '2009GE',
                              '2010GE',
                              '2011GE',
                              '2012GE',
                              '2013GE',
                              '2014GE',
                              '2015GE',
                              '2016GE',
                              '2017GE',
                              '2018GE',
                              '2019GE',
                              '2020GE',
                              '2021GE'},
                 'PRIMARY': {'2007PR',
                              '2008PP',
                              '2008PR',
                              '2009PR',
                              '2010PR',
                              '2011PR',
                              '2012PP',
                              '2012PR_0620',
                              '2012PR_0913',
                              '2013PR',
                              '2014PR_0624',
                              '2014PR_0909',
                              '2015PR',
                              '2016PP',
                              '2016PR_0628',
                              '2016PR_0913',
                              '2017PR',
                              '2018PR_0626',
                              '2018PR_0913',
                              '2019PR',
                              '2020PR',
                              '2021PR',
                              '2022PR'},
                 'RUNOFF': {'2009RO', '2013RO'},
                 'SPECIAL': {'2007SP',
                              '2009SP_0224',
                              '2009SP_0421',
                              '2009SP_0606',
                              '2009SP_0915',
                              '2010SP_0209',
                              '2010SP_0316',
                              '2010SP_0323',
                              '2011SP_0913',
                              '2012SP_0320',
                              '2013SP_0219',
                              '2015SP_0505',
                              '2016SP_0213',
                              '2016SP_0419',
                              '2017SP_0214',
                              '2017SP_0524',
                              '2018SP_0424',
                              '2019SP_0514',
                              '2019SP_0226',
                              '2020SP_1222',
                              '2021SP_1102',
                              '2021SP_0323',
                              '2021SP_0223',
                              '2021SP_0202',
                              '2022SP_0524',
                              '2022SP_0322',
                              '2022SP_0215',
                              '2022SP_0118'},
                 None: {'_AMBIG'}}


                    summ_table = rates_by_dev_enriched[['2016GE','2016PP','2017GE','2017PR','2018GE','2018PR_0913','2020GE','2020PR','2021GE','2021PR']].describe().loc['mean'].reset_index()
summ_table['YEAR'] = summ_table['index'].apply(lambda x: str(x)[:4])

def get_type(value, dict_name):
    for k, v in dict_name.items():
        if value in v:
            return k
        
summ_table['TYPE'] = summ_table['index'].apply(lambda x: get_type(x, elec_type_dict))
summ_table = summ_table.drop(columns='index')
summ_table.pivot_table(values='mean', index='YEAR', columns='TYPE').reset_index().sort_values('YEAR', ascending=False)


                    summ_table = rates_by_dev_enriched[['2016GE','2016PP','2017GE','2017PR','2018GE','2018PR_0913','2020GE','2020PR','2021GE','2021PR']].describe()#.loc['mean'].reset_index()
summ_table


                    #Summarize turnout by election type
rates_by_dev_summary = rates_by_dev_enriched.copy().drop(columns=rates_by_dev_enriched.columns[6:])

for col, values in elec_type_dict.items():
    rates_by_dev_summary[f'{col}_MEDIAN'] = rates_by_dev_enriched[values].median(axis=1)


                    rates_by_dev_enriched[rates_by_dev_enriched['DEV_NAME'].apply(lambda x: x is not np.nan)].to_excel('Rates_by_Development_2022.xlsx')
rates_by_dev_summary[rates_by_dev_summary['DEV_NAME'].apply(lambda x: x is not np.nan)].drop(columns='None_MEDIAN').to_excel('Rates_by_Development_Summary_2022.xlsx')


                    #Jackson [120, cons 267], Red Hook West [79], Polo Grounds [149], Soundview [71], Howard [72], Bland [54, cons 186], Nostrand [43, cons 36], Riis[18/19], Queensbridge [5/505], 
#Staten Island TBD

member_dev_list = [120, 79, 149, 71, 72, 54, 43, 18, 19, 5, 505, 114, 38]
member_data = rates_by_dev_enriched.reset_index()[rates_by_dev_enriched.reset_index()['TDS'].apply(lambda x: x in member_dev_list)]


                    member_data_summary = member_data.copy().drop(columns = member_data.columns[9:])
sum_col_list = []
for col, values in elec_type_dict.items():
    member_data_summary[f'{col}_MEDIAN'] = member_data[values].median(axis=1)
    member_data_summary[f'{col}_MAX'] = member_data[values].max(axis=1)
    member_data_summary[f'{col}_MIN'] = member_data[values].min(axis=1)
    
    sum_col_list = sum_col_list + [f'{col}_MEDIAN', f'{col}_MAX', f'{col}_MIN']


                    riis = member_data_summary.iloc[[5,6]]
riis = riis.append(pd.DataFrame(data = {key:None for key in riis.columns}, columns=riis.columns, index=[3]))

qns = member_data_summary.iloc[[8,10]]
qns = qns.append(pd.DataFrame(data = {key:None for key in qns.columns}, columns=qns.columns, index=[3]))


riis_full = member_data.iloc[[5,6]]
riis_full = riis_full.append(pd.DataFrame(data = {key:None for key in riis_full.columns}, columns=riis_full.columns, index=[3]))

qns_full = member_data.iloc[[8,10]]
qns_full = qns_full.append(pd.DataFrame(data = {key:None for key in qns_full.columns}, columns=qns_full.columns, index=[3]))


                    def combine_proportions(df, column):
    votes_list = []
    for row in df.iterrows():
        votes_list.append(row[1][column]*row[1]['_counter'])
        
    return sum(votes_list)/sum(df['_counter'])
                               
        
for col in sum_col_list:
    riis.loc[3, col] = combine_proportions(riis[:2], col)
    qns.loc[3, col] = combine_proportions(qns[:2], col)
    
riis.loc[3, 'DEV_NAME'] = 'RIIS_COMBINED'
qns.loc[3, 'DEV_NAME'] = 'QUEENSBRIDGE_COMBINED'

#Same for full data

for col in member_data.columns[9:]:
    riis_full.loc[3, col] = combine_proportions(riis_full[:2], col)
    qns_full.loc[3, col] = combine_proportions(qns_full[:2], col)
    
riis_full.loc[3, 'DEV_NAME'] = 'RIIS_COMBINED'
qns_full.loc[3, 'DEV_NAME'] = 'QUEENSBRIDGE_COMBINED'


                    member_data_summary = member_data_summary.append(riis.loc[3]).append(qns.loc[3])
member_data = member_data.append(riis_full.loc[3]).append(qns_full.loc[3])


                    member_data_tables = member_data_summary[['DEV_NAME']+sum_col_list]


                    t = member_data_tables.set_index('DEV_NAME').stack().reset_index()
t['TYPE'] = t['level_1'].apply(lambda x: x.split('_')[0])
t['MEASURE'] = t['level_1'].apply(lambda x: x.split('_')[1])
t.columns = ['DEV_NAME','_del','TURNOUT','TYPE', 'MEASURE']
t = t.pivot_table(index=['DEV_NAME','TYPE'], values='TURNOUT', columns='MEASURE')


                    writer = pd.ExcelWriter('_tables_for_members/TABLES.xlsx')

for name in t.index.levels[0]:
    t_sub = t.loc[name].transpose()
    t_sub['OTHER'] = t_sub[['None','RUNOFF','SPECIAL']].mean(axis=1)
    t_sub.drop(columns=['None','RUNOFF','SPECIAL']).to_excel(writer, sheet_name = f'{name}')

writer.save()


                    elec_by_development = member_data[['DEV_NAME']+list(member_data.columns[9:])].set_index('DEV_NAME').transpose()


                    elec_by_development.to_excel('_TEMP_wg_elections.xlsx')

Measuring Voter Turnout among NYCHA Residents¶

Last edited Thursday, October 20, 2022 by Kyle Slugg¶

Mission and Purpose¶

Representative Output¶

Data Sources¶

Overview of Methods¶

Annotated Analytical Workflow¶

Load and process voter records¶

For ease of additional processing, discard all voter records that do not correspond to electoral districts that contain, in part or whole, NYCHA developments¶

Clean and convert various fields:¶

Write out to CSV for flexible workflow¶

Locate full NYCHA address range from BBL¶

Adding official NYCHA addresses to list, if not already contained¶

Manually adjusting select addresses¶

Match voter file addresses and NYCHA addresses to isolate NYCHA voters¶

Reload datasets produced above¶

Set up index using Record Linkage Toolkit¶

Obtain matching records¶

Add NYCHA development information to matched addresses¶

Expand elections field to arrive at turnout rates by election¶

Computing Outputs¶

Developments of Voting Rule WG Members¶