Converting postcode to area
Contents
Converting postcode to area#
Imports#
import os
import pandas as pd
import copy
import src.data_loading.loads_from_url as lfu
import src.utils.map_utils as mpu
from src.data_loading.loads_from_url import get_poscode_df
data_file = os.path.join(MAIN_PATH, 'src','utils', 'map_data','combined_data.csv')
df = pd.read_csv(data_file,index_col=0)
df
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[2], line 6
4 import src.data_loading.loads_from_url as lfu
5 import src.utils.map_utils as mpu
----> 6 from src.data_loading.loads_from_url import get_poscode_df
8 data_file = os.path.join(MAIN_PATH, 'src','utils', 'map_data','combined_data.csv')
9 df = pd.read_csv(data_file,index_col=0)
ImportError: cannot import name 'get_poscode_df' from 'src.data_loading.loads_from_url' (C:\Users\44781\Documents\GitHub\Pesticide\src\data_loading\loads_from_url.py)
Load the poscode data#
map_data_file = os.path.join(MAIN_PATH, "src", "utils", "map_data", "postcode_to_region.csv")
postcodes_df = lfu.get_poscode_df(path_to_csv = map_data_file)
print(len(postcodes_df))
postcodes_df.head()
1874663
Postcode | mapArea | |
---|---|---|
0 | AL1 1AG | Hertfordshire |
1 | AL1 1AJ | Hertfordshire |
2 | AL1 1AR | Hertfordshire |
3 | AL1 1AS | Hertfordshire |
4 | AL1 1AT | Hertfordshire |
Convert the postcode to area#
def create_area_col(df, postcodes_df, postcode_column):
postcode_dict = dict(postcodes_df.values)
df['area_to_plot'] = df.loc[:,postcode_column].map(postcode_dict)
return df
postcode_column = 'address_postcode'
df = create_area_col(df, postcodes_df, postcode_column)
df
sample_id | date_of_sampling | description | country_of_origin | retail_outlet | address | brand_name | packer_/_manufacturer_/_importer | pesticide_residues_found_in_mg/kg_(mrl) | product | address_postcode | packer__postcode | chem_name | amount_detected | mrl | amount_pc | area_to_plot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1958/2016 | 2016-08-08 | Bramley Apples | UK | Asda | Creechbarrow Road, Taunton TA1 2AN | Asda | Asda Stores Ltd Leeds, UK LS11 5AD | boscalid 0.03 (MRL = 2) | Apple | TA1 2AN | LS11 5AD | boscalid | 0.03 | 2.0 | 0.015000 | Somerset |
1 | 1958/2016 | 2016-08-08 | Bramley Apples | UK | Asda | Creechbarrow Road, Taunton TA1 2AN | Asda | Asda Stores Ltd Leeds, UK LS11 5AD | pyraclostrobin 0.01 (MRL = 0.5) | Apple | TA1 2AN | LS11 5AD | pyraclostrobin | 0.01 | 0.5 | 0.020000 | Somerset |
2 | 0230/2016 | 2016-08-08 | Bramley Apples | UK | Co-op | Northgate, Louth LN11 0LT | Co-op | Co-operative Group Ltd Manchester M60 0AG | boscalid 0.05 (MRL = 2) | Apple | LN11 0LT | M60 0AG | boscalid | 0.05 | 2.0 | 0.025000 | Lincolnshire |
3 | 0230/2016 | 2016-08-08 | Bramley Apples | UK | Co-op | Northgate, Louth LN11 0LT | Co-op | Co-operative Group Ltd Manchester M60 0AG | flonicamid (sum) 0.02 (MRL = 0.2) | Apple | LN11 0LT | M60 0AG | flonicamid (sum) | 0.02 | 0.2 | 0.100000 | Lincolnshire |
4 | 0230/2016 | 2016-08-08 | Bramley Apples | UK | Co-op | Northgate, Louth LN11 0LT | Co-op | Co-operative Group Ltd Manchester M60 0AG | pyraclostrobin 0.03 (MRL = 0.5) | Apple | LN11 0LT | M60 0AG | pyraclostrobin | 0.03 | 0.5 | 0.060000 | Lincolnshire |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
23947 | 0792/2022 | 2022-09-08 | Marfona Unwashed Potatoes | UK | Stoke Fruit Farm | 77 Havant Road, Hayling Island, Hampshire PO11... | None stated | Stoke Fruit Farm 77 Havant Road, Hayling Islan... | maleic hydrazide 7.7 (MRL = 60) | Potatoes_(GB) | PO11 0PT | PO11 0PT | maleic hydrazide | 7.70 | 60.0 | 0.128333 | Hampshire |
23948 | 0765/2022 | 2022-09-14 | Sagitta Potatoes | UK | Total Produce Ltd. | Callywith Gate Industrial Estate, Launceston R... | None stated | CS Duston & Son Colwith Farm, Par, Cornwall | maleic hydrazide 17 (MRL = 60) | Potatoes_(GB) | PL31 2RQ | 0 | maleic hydrazide | 17.00 | 60.0 | 0.283333 | Cornwall |
23949 | 0938/2022 | 2022-09-01 | Marfona Ware Potatoes | UK | Tulipland Potatoes Ltd | Fengate, Moulton Chapel PE12 0XL | None stated | Garden of Elveden | NaN | Potatoes_(GB) | PE12 0XL | 0 | 0 | 0.00 | 0.0 | 0.000000 | Lincolnshire |
23950 | 0833/2022 | 2022-08-31 | Lady Rosetta | England (UK) | Tyrrells Crisps | Tyrrells Court, Stretford Bridge, Leominster H... | None stated | KP Crisps | NaN | Potatoes_(GB) | HR6 9DQ | 0 | 0 | 0.00 | 0.0 | 0.000000 | Herefordshire |
23951 | 0833/2022 | 2022-08-31 | Lady Rosetta | England (UK) | Tyrrells Crisps | Tyrrells Court, Stretford Bridge, Leominster H... | None stated | KP Crisps | NaN | Potatoes_(GB) | HR6 9DQ | 0 | 0 | 0.00 | 0.0 | 0.000000 | Herefordshire |
23952 rows × 17 columns
def how_many_failed(df):
tot_len = len(df)
bad_len = len(df.loc[df.area_to_plot.isna()])
bad_pcode = len(df.loc[df[postcode_column]=='0'])
bad_len = bad_len - bad_pcode
print(f"Number postcodes not found {bad_len} or {100*bad_len/tot_len:.1f}%,\
not including {bad_pcode} with no postcode")
how_many_failed(df)
Number postcodes not found 1435 or 6.0%, not including 213 with no postcode
Reduce the length of the postcode#
6% or over 1,000 postcodes were not found. Which is a lot. Maybe a better postcode database is needed.
But another way to find more postcode areas is to use less of the postcode
For example for AL1 1AG
try
AL1 1A
if still lots missing try
AL1 1
and then
AL1
For these smaller post-code areas they can be in more than one plot area. A quick way to choose which area to use is the area which has the most postcodes. Maybe a better way would be to look for postcodes near in the alphabet, but this way is simpler.
So for example, in the below if a postcode AL1 1AA is unknown. Because all the other postcodes AL1 1A are Hertfordshire it will assign that unkown postcode to Hertfordshire. If the areas are split (e.g. AL1 1AS was in West Wales but the rest the same) it would assign the unknown postcode AL1 1AA to Hertfordshire.
postcodes_df.loc[postcodes_df['Postcode'].apply(lambda x: x[:-1])=='AL1 1A']
Postcode | mapArea | |
---|---|---|
0 | AL1 1AG | Hertfordshire |
1 | AL1 1AJ | Hertfordshire |
2 | AL1 1AR | Hertfordshire |
3 | AL1 1AS | Hertfordshire |
4 | AL1 1AT | Hertfordshire |
5 | AL1 1AU | Hertfordshire |
6 | AL1 1AW | Hertfordshire |
It works but a tad convoluted#
Create a list
def get_postcode_list(postcodes_df=postcodes_df,
usecols=['Postcode','mapArea'],
reduce_p_max=6
):
"""
loads poscode data and returns that as a list of dataframes
where postcode has has been reduced by 1:reduce_p_max-1 in length
Args: path_to_csv (path) csv file of postcode data
usecols (list[str]) what columns to load, also includes latitude,longitude
reduce_p_max (int) number of chars to take off end of potscode -1
Returns:
list_postcodes_df (list[pd.DataFrame]) list of pandas dataframe of the postcode data
"""
postcodes_df_temp = postcodes_df.copy()
list_postcodes_df = []
list_postcodes_df.append( postcodes_df_temp.copy() )
for i in range(1,reduce_p_max):
colname = f"Postcode_m{i}"
if i!=0:
postcodes_df_temp[colname] = postcodes_df_temp['Postcode'].str[:-i]
df = _getpc(postcodes_df_temp, colname)
df = df.rename(columns={colname:'Postcode'})
list_postcodes_df.append( df )
return list_postcodes_df
def _getpc(postcodes_df_temp, colname):
"""
Helper function to identify the region when a postcode is across multiple regions
Does this by selecting region that has most postcodes
e.g. BH1 8L is in two areas (A and B), area A is in 3 times and B in 5 times,
so B is selected as the area
Args: postcodes_df_temp (pd.DataFrame) dataframe of postcode data
colname (str) what column to process some variant of postcode
Returns:
postcode_short (pd.DataFrame) modified dataframe with new variable mapArea
"""
postcodes_df_temp = postcodes_df_temp.groupby(by=[colname,'mapArea'], as_index=False).count()
postcode_short = postcodes_df_temp.sort_values('Postcode', ascending=False).drop_duplicates([colname])
postcode_short = postcode_short.loc[:,[colname,'mapArea']].reset_index(drop=True)
return postcode_short
shorter_pc_list = get_postcode_list()
print( len(shorter_pc_list) )
shorter_pc_list[3].head()
6
Postcode | mapArea | |
---|---|---|
0 | AB2 | North East Scotland |
1 | AB4 | North East Scotland |
2 | NP4 | South East Wales |
3 | CR0 | Greater London |
4 | NP9 | South East Wales |
dfnew= copy.deepcopy(df)
dfnew = dfnew.loc[:,['address','address_postcode']]
for removeChar in range(0,6):
# create a new column
if removeChar!=0:
postcode_columnNEW =postcode_column+f'{removeChar}'
dfnew[postcode_columnNEW]=df[postcode_column].str[:-removeChar]
else:
postcode_columnNEW=postcode_column
dfnew = create_area_col(dfnew,
shorter_pc_list[removeChar], postcode_columnNEW)
print()
print(f'For postcode smaller by {removeChar} characters')
how_many_failed(dfnew)
For postcode smaller by 0 characters
Number postcodes not found 1435 or 6.0%, not including 213 with no postcode
For postcode smaller by 1 characters
Number postcodes not found 695 or 2.9%, not including 213 with no postcode
For postcode smaller by 2 characters
Number postcodes not found 213 or 0.9%, not including 213 with no postcode
For postcode smaller by 3 characters
Number postcodes not found 95 or 0.4%, not including 213 with no postcode
For postcode smaller by 4 characters
Number postcodes not found 95 or 0.4%, not including 213 with no postcode
For postcode smaller by 5 characters
Number postcodes not found 92 or 0.4%, not including 213 with no postcode
Update postcode data#
So if we take away 3 characters (for example, AL1 1AG -> AL1) 99.6% of the postcodes are found.
Now we need to add the postcodes which didn’t appear in teh original postcode dataset to that dataset with the new areas found.
def _create_area_col(df, postcodes_df, postcode_column):
df_ =copy.deepcopy(df)
postcode_dict = dict(postcodes_df[['Postcode','mapArea']].values)
df_['mapArea'] = df_.loc[:,postcode_column].map(postcode_dict)
return df_['mapArea']
def dothis(df, MAIN_PATH, shorter_pc_list,postcode_column = 'address_postcode'):
# load postcode to area df
map_data_file = os.path.join(MAIN_PATH, "src", "utils", "map_data", "postcode_to_region.csv")
postcodes_to_area_df = lfu.get_poscode_df(path_to_csv = map_data_file)
postcodes_to_area_df['removeChar']=0
df_pc= copy.deepcopy(df)
# get the unique post-codes and ignore ones that don't exist ='0'
df_pc =pd.DataFrame(data=df[postcode_column].unique(),columns=['Postcode'])
df_pc = df_pc.loc[df_pc.Postcode!='0']
postcode_column = 'Postcode'
# check if already in the postcode to area df
# just use the ones that are not
df_pc['mapArea'] = _create_area_col(df_pc, postcodes_to_area_df, 'Postcode')
df_pc = df_pc.loc[df_pc['mapArea'].isna()]
print(f"After initial mapping\n \
The length of postcode mapper is {len(postcodes_to_area_df)} \n \
and {len(df_pc)} still unknonwn\n")
for removeChar in range(1,4):
print('>> Post-code length reduced by ',removeChar)
# remove 1 char from the postcode
df_pc[postcode_column]=df_pc[postcode_column].str[:-1]
df_pc['mapArea'] = _create_area_col(df_pc,
shorter_pc_list[removeChar], postcode_column)
# add new values to the dictionary
df_add = copy.deepcopy(df_pc[['Postcode','mapArea']].loc[df_pc['mapArea'].notna()])
df_add['removeChar'] = removeChar
postcodes_to_area_df = pd.concat([ postcodes_to_area_df, df_add ])
print(f"Number of new postcodes added {len(df_add)}")
# just use postcodes not found in next iteration
df_pc = copy.deepcopy(df_pc.loc[df_pc['mapArea'].isna()])
print(f"The length of postcode mapper is {len(postcodes_to_area_df)} \n\
and {len(df_pc)} still unknonwn")
print()
return postcodes_to_area_df, df_add, df_pc
newPC, df_add, df_pc = dothis(df, MAIN_PATH, shorter_pc_list)
After initial mapping
The length of postcode mapper is 1874663
and 147 still unknonwn
>> Post-code length reduced by 1
Number of new postcodes added 80
The length of postcode mapper is 1874743
and 67 still unknonwn
>> Post-code length reduced by 2
Number of new postcodes added 18
The length of postcode mapper is 1874761
and 49 still unknonwn
>> Post-code length reduced by 3
Number of new postcodes added 27
The length of postcode mapper is 1874788
and 22 still unknonwn