Converting postcode to area#

Imports#

import os
import pandas as pd
import copy
import src.data_loading.loads_from_url as lfu
import src.utils.map_utils as mpu
from src.data_loading.loads_from_url import get_poscode_df

data_file = os.path.join(MAIN_PATH, 'src','utils', 'map_data','combined_data.csv')
df = pd.read_csv(data_file,index_col=0)
df

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[2], line 6
      4 import src.data_loading.loads_from_url as lfu
      5 import src.utils.map_utils as mpu
----> 6 from src.data_loading.loads_from_url import get_poscode_df
      8 data_file = os.path.join(MAIN_PATH, 'src','utils', 'map_data','combined_data.csv')
      9 df = pd.read_csv(data_file,index_col=0)

ImportError: cannot import name 'get_poscode_df' from 'src.data_loading.loads_from_url' (C:\Users\44781\Documents\GitHub\Pesticide\src\data_loading\loads_from_url.py)

Load the poscode data#

map_data_file = os.path.join(MAIN_PATH, "src", "utils", "map_data", "postcode_to_region.csv")
postcodes_df = lfu.get_poscode_df(path_to_csv = map_data_file)

print(len(postcodes_df))
postcodes_df.head()

	Postcode	mapArea
0	AL1 1AG	Hertfordshire
1	AL1 1AJ	Hertfordshire
2	AL1 1AR	Hertfordshire
3	AL1 1AS	Hertfordshire
4	AL1 1AT	Hertfordshire

Convert the postcode to area#

def create_area_col(df, postcodes_df, postcode_column):
    postcode_dict = dict(postcodes_df.values)
    
    df['area_to_plot'] = df.loc[:,postcode_column].map(postcode_dict)
    return df

postcode_column = 'address_postcode'
df = create_area_col(df, postcodes_df, postcode_column)
df

	sample_id	date_of_sampling	description	country_of_origin	retail_outlet	address	brand_name	packer_/_manufacturer_/_importer	pesticide_residues_found_in_mg/kg_(mrl)	product	address_postcode	packer__postcode	chem_name	amount_detected	mrl	amount_pc	area_to_plot
0	1958/2016	2016-08-08	Bramley Apples	UK	Asda	Creechbarrow Road, Taunton TA1 2AN	Asda	Asda Stores Ltd Leeds, UK LS11 5AD	boscalid 0.03 (MRL = 2)	Apple	TA1 2AN	LS11 5AD	boscalid	0.03	2.0	0.015000	Somerset
1	1958/2016	2016-08-08	Bramley Apples	UK	Asda	Creechbarrow Road, Taunton TA1 2AN	Asda	Asda Stores Ltd Leeds, UK LS11 5AD	pyraclostrobin 0.01 (MRL = 0.5)	Apple	TA1 2AN	LS11 5AD	pyraclostrobin	0.01	0.5	0.020000	Somerset
2	0230/2016	2016-08-08	Bramley Apples	UK	Co-op	Northgate, Louth LN11 0LT	Co-op	Co-operative Group Ltd Manchester M60 0AG	boscalid 0.05 (MRL = 2)	Apple	LN11 0LT	M60 0AG	boscalid	0.05	2.0	0.025000	Lincolnshire
3	0230/2016	2016-08-08	Bramley Apples	UK	Co-op	Northgate, Louth LN11 0LT	Co-op	Co-operative Group Ltd Manchester M60 0AG	flonicamid (sum) 0.02 (MRL = 0.2)	Apple	LN11 0LT	M60 0AG	flonicamid (sum)	0.02	0.2	0.100000	Lincolnshire
4	0230/2016	2016-08-08	Bramley Apples	UK	Co-op	Northgate, Louth LN11 0LT	Co-op	Co-operative Group Ltd Manchester M60 0AG	pyraclostrobin 0.03 (MRL = 0.5)	Apple	LN11 0LT	M60 0AG	pyraclostrobin	0.03	0.5	0.060000	Lincolnshire
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
23947	0792/2022	2022-09-08	Marfona Unwashed Potatoes	UK	Stoke Fruit Farm	77 Havant Road, Hayling Island, Hampshire PO11...	None stated	Stoke Fruit Farm 77 Havant Road, Hayling Islan...	maleic hydrazide 7.7 (MRL = 60)	Potatoes_(GB)	PO11 0PT	PO11 0PT	maleic hydrazide	7.70	60.0	0.128333	Hampshire
23948	0765/2022	2022-09-14	Sagitta Potatoes	UK	Total Produce Ltd.	Callywith Gate Industrial Estate, Launceston R...	None stated	CS Duston & Son Colwith Farm, Par, Cornwall	maleic hydrazide 17 (MRL = 60)	Potatoes_(GB)	PL31 2RQ	0	maleic hydrazide	17.00	60.0	0.283333	Cornwall
23949	0938/2022	2022-09-01	Marfona Ware Potatoes	UK	Tulipland Potatoes Ltd	Fengate, Moulton Chapel PE12 0XL	None stated	Garden of Elveden	NaN	Potatoes_(GB)	PE12 0XL	0	0	0.00	0.0	0.000000	Lincolnshire
23950	0833/2022	2022-08-31	Lady Rosetta	England (UK)	Tyrrells Crisps	Tyrrells Court, Stretford Bridge, Leominster H...	None stated	KP Crisps	NaN	Potatoes_(GB)	HR6 9DQ	0	0	0.00	0.0	0.000000	Herefordshire
23951	0833/2022	2022-08-31	Lady Rosetta	England (UK)	Tyrrells Crisps	Tyrrells Court, Stretford Bridge, Leominster H...	None stated	KP Crisps	NaN	Potatoes_(GB)	HR6 9DQ	0	0	0.00	0.0	0.000000	Herefordshire

23952 rows × 17 columns

def how_many_failed(df):
    tot_len = len(df)
    bad_len = len(df.loc[df.area_to_plot.isna()])
    bad_pcode = len(df.loc[df[postcode_column]=='0'])
    bad_len = bad_len - bad_pcode
    
    print(f"Number postcodes not found {bad_len} or {100*bad_len/tot_len:.1f}%,\
 not including {bad_pcode} with no postcode")


how_many_failed(df)

Number postcodes not found 1435 or 6.0%, not including 213 with no postcode

Reduce the length of the postcode#

6% or over 1,000 postcodes were not found. Which is a lot. Maybe a better postcode database is needed.

But another way to find more postcode areas is to use less of the postcode

For example for AL1 1AG

try AL1 1A
if still lots missing try AL1 1
and then AL1

For these smaller post-code areas they can be in more than one plot area. A quick way to choose which area to use is the area which has the most postcodes. Maybe a better way would be to look for postcodes near in the alphabet, but this way is simpler.

So for example, in the below if a postcode AL1 1AA is unknown. Because all the other postcodes AL1 1A are Hertfordshire it will assign that unkown postcode to Hertfordshire. If the areas are split (e.g. AL1 1AS was in West Wales but the rest the same) it would assign the unknown postcode AL1 1AA to Hertfordshire.

postcodes_df.loc[postcodes_df['Postcode'].apply(lambda x: x[:-1])=='AL1 1A']

	Postcode	mapArea
0	AL1 1AG	Hertfordshire
1	AL1 1AJ	Hertfordshire
2	AL1 1AR	Hertfordshire
3	AL1 1AS	Hertfordshire
4	AL1 1AT	Hertfordshire
5	AL1 1AU	Hertfordshire
6	AL1 1AW	Hertfordshire

It works but a tad convoluted#

Create a list

def get_postcode_list(postcodes_df=postcodes_df,
                    usecols=['Postcode','mapArea'],
                    reduce_p_max=6
                    ):
    """
    loads poscode data and returns that as a list of dataframes
    where postcode has has been reduced by 1:reduce_p_max-1 in length
    
    Args:   path_to_csv (path) csv file of postcode data
            usecols (list[str]) what columns to load, also includes latitude,longitude
            reduce_p_max (int) number of chars to take off end of potscode -1
    Returns: 
        list_postcodes_df (list[pd.DataFrame]) list of pandas dataframe of the postcode data
    
    """   

    postcodes_df_temp = postcodes_df.copy()
    
    list_postcodes_df = []
    list_postcodes_df.append( postcodes_df_temp.copy() )
    
    for i in range(1,reduce_p_max):
        colname = f"Postcode_m{i}"
        
        if i!=0:
            postcodes_df_temp[colname] = postcodes_df_temp['Postcode'].str[:-i]    
        
        df = _getpc(postcodes_df_temp, colname)
        df = df.rename(columns={colname:'Postcode'})
        list_postcodes_df.append( df )
    
    
    return list_postcodes_df

def _getpc(postcodes_df_temp, colname):
    """
    Helper function to identify the region when a postcode is across multiple regions
    Does this by selecting region that has most postcodes
    e.g. BH1 8L is in two areas (A and B), area A is in 3 times and B in 5 times, 
        so B is selected as the area
    Args:   postcodes_df_temp (pd.DataFrame) dataframe of postcode data
            colname (str) what column to process some variant of postcode
    Returns: 
        postcode_short (pd.DataFrame) modified dataframe with new variable mapArea
    """
    postcodes_df_temp = postcodes_df_temp.groupby(by=[colname,'mapArea'], as_index=False).count()

    postcode_short = postcodes_df_temp.sort_values('Postcode', ascending=False).drop_duplicates([colname])
    
    postcode_short = postcode_short.loc[:,[colname,'mapArea']].reset_index(drop=True)
    return postcode_short

shorter_pc_list = get_postcode_list()

print( len(shorter_pc_list) )
shorter_pc_list[3].head()

	Postcode	mapArea
0	AB2	North East Scotland
1	AB4	North East Scotland
2	NP4	South East Wales
3	CR0	Greater London
4	NP9	South East Wales

dfnew= copy.deepcopy(df)
dfnew = dfnew.loc[:,['address','address_postcode']]

for removeChar in range(0,6):
    # create a new column
   
    if removeChar!=0:
        postcode_columnNEW =postcode_column+f'{removeChar}'
        dfnew[postcode_columnNEW]=df[postcode_column].str[:-removeChar]
    else:
        postcode_columnNEW=postcode_column
    dfnew = create_area_col(dfnew, 
                     shorter_pc_list[removeChar], postcode_columnNEW)
    print()
    print(f'For postcode smaller by {removeChar} characters')
    how_many_failed(dfnew)

For postcode smaller by 0 characters
Number postcodes not found 1435 or 6.0%, not including 213 with no postcode

For postcode smaller by 1 characters
Number postcodes not found 695 or 2.9%, not including 213 with no postcode

For postcode smaller by 2 characters
Number postcodes not found 213 or 0.9%, not including 213 with no postcode

For postcode smaller by 3 characters
Number postcodes not found 95 or 0.4%, not including 213 with no postcode

For postcode smaller by 4 characters
Number postcodes not found 95 or 0.4%, not including 213 with no postcode

For postcode smaller by 5 characters
Number postcodes not found 92 or 0.4%, not including 213 with no postcode

Update postcode data#

So if we take away 3 characters (for example, AL1 1AG -> AL1) 99.6% of the postcodes are found.

Now we need to add the postcodes which didn’t appear in teh original postcode dataset to that dataset with the new areas found.

def _create_area_col(df, postcodes_df, postcode_column):
    df_ =copy.deepcopy(df)
    postcode_dict = dict(postcodes_df[['Postcode','mapArea']].values)
    
    df_['mapArea'] = df_.loc[:,postcode_column].map(postcode_dict)
    return df_['mapArea']

def dothis(df, MAIN_PATH, shorter_pc_list,postcode_column = 'address_postcode'):

    # load postcode to area df
    map_data_file = os.path.join(MAIN_PATH, "src", "utils", "map_data", "postcode_to_region.csv")
    postcodes_to_area_df = lfu.get_poscode_df(path_to_csv = map_data_file)
    postcodes_to_area_df['removeChar']=0
    
    
    df_pc= copy.deepcopy(df)

    # get the unique post-codes and ignore ones that don't exist ='0'
    df_pc =pd.DataFrame(data=df[postcode_column].unique(),columns=['Postcode'])
    df_pc = df_pc.loc[df_pc.Postcode!='0']

    postcode_column = 'Postcode'

    # check if already in the postcode to area df
    # just use the ones that are not
    df_pc['mapArea'] = _create_area_col(df_pc, postcodes_to_area_df, 'Postcode')
    df_pc = df_pc.loc[df_pc['mapArea'].isna()]

    print(f"After initial mapping\n \
The length of postcode mapper is {len(postcodes_to_area_df)} \n \
and {len(df_pc)} still unknonwn\n")
    
    for removeChar in range(1,4):
        print('>> Post-code length reduced by ',removeChar)
        # remove 1 char from the postcode
        df_pc[postcode_column]=df_pc[postcode_column].str[:-1]

        df_pc['mapArea'] = _create_area_col(df_pc, 
                         shorter_pc_list[removeChar], postcode_column)

           
        # add new values to the dictionary
        df_add = copy.deepcopy(df_pc[['Postcode','mapArea']].loc[df_pc['mapArea'].notna()])
        df_add['removeChar'] = removeChar
        
        
        postcodes_to_area_df = pd.concat([ postcodes_to_area_df, df_add ])      
        print(f"Number of new postcodes added {len(df_add)}")

        # just use postcodes not found in next iteration 
        df_pc = copy.deepcopy(df_pc.loc[df_pc['mapArea'].isna()])
        print(f"The length of postcode mapper is {len(postcodes_to_area_df)} \n\
and {len(df_pc)} still unknonwn")
        
        print()
        
    return postcodes_to_area_df, df_add, df_pc
        
newPC, df_add, df_pc = dothis(df, MAIN_PATH, shorter_pc_list)

After initial mapping
 The length of postcode mapper is 1874663 
 and 147 still unknonwn

>> Post-code length reduced by  1
Number of new postcodes added 80
The length of postcode mapper is 1874743 
and 67 still unknonwn

>> Post-code length reduced by  2
Number of new postcodes added 18
The length of postcode mapper is 1874761 
and 49 still unknonwn

>> Post-code length reduced by  3
Number of new postcodes added 27
The length of postcode mapper is 1874788 
and 22 still unknonwn

Go back

First page

Defra Pesticide Testing, Data Analysis

Converting postcode to area

Contents