Converting postcode to area#

Imports#

import os
import pandas as pd
import copy
import src.data_loading.loads_from_url as lfu
import src.utils.map_utils as mpu
from src.data_loading.loads_from_url import get_poscode_df

data_file = os.path.join(MAIN_PATH, 'src','utils', 'map_data','combined_data.csv')
df = pd.read_csv(data_file,index_col=0)
df
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[2], line 6
      4 import src.data_loading.loads_from_url as lfu
      5 import src.utils.map_utils as mpu
----> 6 from src.data_loading.loads_from_url import get_poscode_df
      8 data_file = os.path.join(MAIN_PATH, 'src','utils', 'map_data','combined_data.csv')
      9 df = pd.read_csv(data_file,index_col=0)

ImportError: cannot import name 'get_poscode_df' from 'src.data_loading.loads_from_url' (C:\Users\44781\Documents\GitHub\Pesticide\src\data_loading\loads_from_url.py)

Load the poscode data#

map_data_file = os.path.join(MAIN_PATH, "src", "utils", "map_data", "postcode_to_region.csv")
postcodes_df = lfu.get_poscode_df(path_to_csv = map_data_file)

print(len(postcodes_df))
postcodes_df.head()
1874663
Postcode mapArea
0 AL1 1AG Hertfordshire
1 AL1 1AJ Hertfordshire
2 AL1 1AR Hertfordshire
3 AL1 1AS Hertfordshire
4 AL1 1AT Hertfordshire

Convert the postcode to area#

def create_area_col(df, postcodes_df, postcode_column):
    postcode_dict = dict(postcodes_df.values)
    
    df['area_to_plot'] = df.loc[:,postcode_column].map(postcode_dict)
    return df

postcode_column = 'address_postcode'
df = create_area_col(df, postcodes_df, postcode_column)
df
sample_id date_of_sampling description country_of_origin retail_outlet address brand_name packer_/_manufacturer_/_importer pesticide_residues_found_in_mg/kg_(mrl) product address_postcode packer__postcode chem_name amount_detected mrl amount_pc area_to_plot
0 1958/2016 2016-08-08 Bramley Apples UK Asda Creechbarrow Road, Taunton TA1 2AN Asda Asda Stores Ltd Leeds, UK LS11 5AD boscalid 0.03 (MRL = 2) Apple TA1 2AN LS11 5AD boscalid 0.03 2.0 0.015000 Somerset
1 1958/2016 2016-08-08 Bramley Apples UK Asda Creechbarrow Road, Taunton TA1 2AN Asda Asda Stores Ltd Leeds, UK LS11 5AD pyraclostrobin 0.01 (MRL = 0.5) Apple TA1 2AN LS11 5AD pyraclostrobin 0.01 0.5 0.020000 Somerset
2 0230/2016 2016-08-08 Bramley Apples UK Co-op Northgate, Louth LN11 0LT Co-op Co-operative Group Ltd Manchester M60 0AG boscalid 0.05 (MRL = 2) Apple LN11 0LT M60 0AG boscalid 0.05 2.0 0.025000 Lincolnshire
3 0230/2016 2016-08-08 Bramley Apples UK Co-op Northgate, Louth LN11 0LT Co-op Co-operative Group Ltd Manchester M60 0AG flonicamid (sum) 0.02 (MRL = 0.2) Apple LN11 0LT M60 0AG flonicamid (sum) 0.02 0.2 0.100000 Lincolnshire
4 0230/2016 2016-08-08 Bramley Apples UK Co-op Northgate, Louth LN11 0LT Co-op Co-operative Group Ltd Manchester M60 0AG pyraclostrobin 0.03 (MRL = 0.5) Apple LN11 0LT M60 0AG pyraclostrobin 0.03 0.5 0.060000 Lincolnshire
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
23947 0792/2022 2022-09-08 Marfona Unwashed Potatoes UK Stoke Fruit Farm 77 Havant Road, Hayling Island, Hampshire PO11... None stated Stoke Fruit Farm 77 Havant Road, Hayling Islan... maleic hydrazide 7.7 (MRL = 60) Potatoes_(GB) PO11 0PT PO11 0PT maleic hydrazide 7.70 60.0 0.128333 Hampshire
23948 0765/2022 2022-09-14 Sagitta Potatoes UK Total Produce Ltd. Callywith Gate Industrial Estate, Launceston R... None stated CS Duston & Son Colwith Farm, Par, Cornwall maleic hydrazide 17 (MRL = 60) Potatoes_(GB) PL31 2RQ 0 maleic hydrazide 17.00 60.0 0.283333 Cornwall
23949 0938/2022 2022-09-01 Marfona Ware Potatoes UK Tulipland Potatoes Ltd Fengate, Moulton Chapel PE12 0XL None stated Garden of Elveden NaN Potatoes_(GB) PE12 0XL 0 0 0.00 0.0 0.000000 Lincolnshire
23950 0833/2022 2022-08-31 Lady Rosetta England (UK) Tyrrells Crisps Tyrrells Court, Stretford Bridge, Leominster H... None stated KP Crisps NaN Potatoes_(GB) HR6 9DQ 0 0 0.00 0.0 0.000000 Herefordshire
23951 0833/2022 2022-08-31 Lady Rosetta England (UK) Tyrrells Crisps Tyrrells Court, Stretford Bridge, Leominster H... None stated KP Crisps NaN Potatoes_(GB) HR6 9DQ 0 0 0.00 0.0 0.000000 Herefordshire

23952 rows × 17 columns

def how_many_failed(df):
    tot_len = len(df)
    bad_len = len(df.loc[df.area_to_plot.isna()])
    bad_pcode = len(df.loc[df[postcode_column]=='0'])
    bad_len = bad_len - bad_pcode
    
    print(f"Number postcodes not found {bad_len} or {100*bad_len/tot_len:.1f}%,\
 not including {bad_pcode} with no postcode")


how_many_failed(df)
Number postcodes not found 1435 or 6.0%, not including 213 with no postcode

Reduce the length of the postcode#

6% or over 1,000 postcodes were not found. Which is a lot. Maybe a better postcode database is needed.

But another way to find more postcode areas is to use less of the postcode

For example for AL1 1AG

  • try AL1 1A

  • if still lots missing try AL1 1

  • and then AL1

For these smaller post-code areas they can be in more than one plot area. A quick way to choose which area to use is the area which has the most postcodes. Maybe a better way would be to look for postcodes near in the alphabet, but this way is simpler.

So for example, in the below if a postcode AL1 1AA is unknown. Because all the other postcodes AL1 1A are Hertfordshire it will assign that unkown postcode to Hertfordshire. If the areas are split (e.g. AL1 1AS was in West Wales but the rest the same) it would assign the unknown postcode AL1 1AA to Hertfordshire.

postcodes_df.loc[postcodes_df['Postcode'].apply(lambda x: x[:-1])=='AL1 1A']
Postcode mapArea
0 AL1 1AG Hertfordshire
1 AL1 1AJ Hertfordshire
2 AL1 1AR Hertfordshire
3 AL1 1AS Hertfordshire
4 AL1 1AT Hertfordshire
5 AL1 1AU Hertfordshire
6 AL1 1AW Hertfordshire

It works but a tad convoluted#

Create a list

def get_postcode_list(postcodes_df=postcodes_df,
                    usecols=['Postcode','mapArea'],
                    reduce_p_max=6
                    ):
    """
    loads poscode data and returns that as a list of dataframes
    where postcode has has been reduced by 1:reduce_p_max-1 in length
    
    Args:   path_to_csv (path) csv file of postcode data
            usecols (list[str]) what columns to load, also includes latitude,longitude
            reduce_p_max (int) number of chars to take off end of potscode -1
    Returns: 
        list_postcodes_df (list[pd.DataFrame]) list of pandas dataframe of the postcode data
    
    """   

    postcodes_df_temp = postcodes_df.copy()
    
    list_postcodes_df = []
    list_postcodes_df.append( postcodes_df_temp.copy() )
    
    for i in range(1,reduce_p_max):
        colname = f"Postcode_m{i}"
        
        if i!=0:
            postcodes_df_temp[colname] = postcodes_df_temp['Postcode'].str[:-i]    
        
        df = _getpc(postcodes_df_temp, colname)
        df = df.rename(columns={colname:'Postcode'})
        list_postcodes_df.append( df )
    
    
    return list_postcodes_df

def _getpc(postcodes_df_temp, colname):
    """
    Helper function to identify the region when a postcode is across multiple regions
    Does this by selecting region that has most postcodes
    e.g. BH1 8L is in two areas (A and B), area A is in 3 times and B in 5 times, 
        so B is selected as the area
    Args:   postcodes_df_temp (pd.DataFrame) dataframe of postcode data
            colname (str) what column to process some variant of postcode
    Returns: 
        postcode_short (pd.DataFrame) modified dataframe with new variable mapArea
    """
    postcodes_df_temp = postcodes_df_temp.groupby(by=[colname,'mapArea'], as_index=False).count()

    postcode_short = postcodes_df_temp.sort_values('Postcode', ascending=False).drop_duplicates([colname])
    
    postcode_short = postcode_short.loc[:,[colname,'mapArea']].reset_index(drop=True)
    return postcode_short

shorter_pc_list = get_postcode_list()
print( len(shorter_pc_list) )
shorter_pc_list[3].head()
6
Postcode mapArea
0 AB2 North East Scotland
1 AB4 North East Scotland
2 NP4 South East Wales
3 CR0 Greater London
4 NP9 South East Wales
dfnew= copy.deepcopy(df)
dfnew = dfnew.loc[:,['address','address_postcode']]

for removeChar in range(0,6):
    # create a new column
   
    if removeChar!=0:
        postcode_columnNEW =postcode_column+f'{removeChar}'
        dfnew[postcode_columnNEW]=df[postcode_column].str[:-removeChar]
    else:
        postcode_columnNEW=postcode_column
    dfnew = create_area_col(dfnew, 
                     shorter_pc_list[removeChar], postcode_columnNEW)
    print()
    print(f'For postcode smaller by {removeChar} characters')
    how_many_failed(dfnew)
For postcode smaller by 0 characters
Number postcodes not found 1435 or 6.0%, not including 213 with no postcode

For postcode smaller by 1 characters
Number postcodes not found 695 or 2.9%, not including 213 with no postcode

For postcode smaller by 2 characters
Number postcodes not found 213 or 0.9%, not including 213 with no postcode

For postcode smaller by 3 characters
Number postcodes not found 95 or 0.4%, not including 213 with no postcode

For postcode smaller by 4 characters
Number postcodes not found 95 or 0.4%, not including 213 with no postcode

For postcode smaller by 5 characters
Number postcodes not found 92 or 0.4%, not including 213 with no postcode

Update postcode data#

So if we take away 3 characters (for example, AL1 1AG -> AL1) 99.6% of the postcodes are found.

Now we need to add the postcodes which didn’t appear in teh original postcode dataset to that dataset with the new areas found.

def _create_area_col(df, postcodes_df, postcode_column):
    df_ =copy.deepcopy(df)
    postcode_dict = dict(postcodes_df[['Postcode','mapArea']].values)
    
    df_['mapArea'] = df_.loc[:,postcode_column].map(postcode_dict)
    return df_['mapArea']

def dothis(df, MAIN_PATH, shorter_pc_list,postcode_column = 'address_postcode'):

    # load postcode to area df
    map_data_file = os.path.join(MAIN_PATH, "src", "utils", "map_data", "postcode_to_region.csv")
    postcodes_to_area_df = lfu.get_poscode_df(path_to_csv = map_data_file)
    postcodes_to_area_df['removeChar']=0
    
    
    df_pc= copy.deepcopy(df)

    # get the unique post-codes and ignore ones that don't exist ='0'
    df_pc =pd.DataFrame(data=df[postcode_column].unique(),columns=['Postcode'])
    df_pc = df_pc.loc[df_pc.Postcode!='0']

    postcode_column = 'Postcode'

    # check if already in the postcode to area df
    # just use the ones that are not
    df_pc['mapArea'] = _create_area_col(df_pc, postcodes_to_area_df, 'Postcode')
    df_pc = df_pc.loc[df_pc['mapArea'].isna()]

    print(f"After initial mapping\n \
The length of postcode mapper is {len(postcodes_to_area_df)} \n \
and {len(df_pc)} still unknonwn\n")
    
    for removeChar in range(1,4):
        print('>> Post-code length reduced by ',removeChar)
        # remove 1 char from the postcode
        df_pc[postcode_column]=df_pc[postcode_column].str[:-1]

        df_pc['mapArea'] = _create_area_col(df_pc, 
                         shorter_pc_list[removeChar], postcode_column)

           
        # add new values to the dictionary
        df_add = copy.deepcopy(df_pc[['Postcode','mapArea']].loc[df_pc['mapArea'].notna()])
        df_add['removeChar'] = removeChar
        
        
        postcodes_to_area_df = pd.concat([ postcodes_to_area_df, df_add ])      
        print(f"Number of new postcodes added {len(df_add)}")

        # just use postcodes not found in next iteration 
        df_pc = copy.deepcopy(df_pc.loc[df_pc['mapArea'].isna()])
        print(f"The length of postcode mapper is {len(postcodes_to_area_df)} \n\
and {len(df_pc)} still unknonwn")
        
        print()
        
    return postcodes_to_area_df, df_add, df_pc
        
newPC, df_add, df_pc = dothis(df, MAIN_PATH, shorter_pc_list)
After initial mapping
 The length of postcode mapper is 1874663 
 and 147 still unknonwn

>> Post-code length reduced by  1
Number of new postcodes added 80
The length of postcode mapper is 1874743 
and 67 still unknonwn

>> Post-code length reduced by  2
Number of new postcodes added 18
The length of postcode mapper is 1874761 
and 49 still unknonwn

>> Post-code length reduced by  3
Number of new postcodes added 27
The length of postcode mapper is 1874788 
and 22 still unknonwn

Go back

First page