Source code for packages.geo_utils.geocode

'''
geocode
=======

Tools for geocoding.
'''

import logging
import pandas as pd
import requests
from retrying import retry

from nesta.packages.decorators.ratelimit import ratelimit


def geocode(**request_kwargs):
    '''
    Geocoder using the Open Street Map Nominatim API.

    Every match in the API response is returned; callers that want a single
    result should take the first element (per the original note, results are
    ranked by importance).

    The API usage policy allows maximum 1 request per second and no
    multithreading:
    https://operations.osmfoundation.org/policies/nominatim/

    Args:
        request_kwargs (dict): Parameters for OSM API.

    Returns:
        JSON from API response (a non-empty collection of matches).

    Raises:
        ValueError: if the API returns no matches for the request.
        requests.exceptions.RequestException: on HTTP error statuses (via
            ``raise_for_status``), connection failures or timeouts.
    '''
    # Without a timeout requests waits indefinitely on a stalled connection.
    timeout_seconds = 30

    # Explicitly require json for ease of use
    request_kwargs["format"] = "json"
    response = requests.get("https://nominatim.openstreetmap.org/search",
                            params=request_kwargs,
                            headers={'User-Agent': 'Nesta health data geocode'},
                            timeout=timeout_seconds)
    response.raise_for_status()
    geo_data = response.json()
    if len(geo_data) == 0:
        raise ValueError(f"No geocode match for {request_kwargs}")
    return geo_data
def retry_if_not_value_error(exception):
    """Decide whether a failed call should be retried.

    Intended for the 'retry_on_exception' argument of the retry decorator:
    a ValueError (or any subclass of it) stops the retrying, every other
    exception allows another attempt.

    Args:
        exception (Exception): the raised exception, to check

    Returns:
        (bool): False if a ValueError, else True
    """
    if isinstance(exception, ValueError):
        return False
    return True
@retry(stop_max_attempt_number=10, retry_on_exception=retry_if_not_value_error)
@ratelimit(max_per_second=0.5)
def _geocode(q=None, **kwargs):
    '''Validate a geocoding request, run it and reduce the result to lat/lon.

    Wraps geocode: invalid argument combinations raise a ValueError, while a
    request that yields no match is logged and reported as None. The call is
    wrapped in the retry decorator (at most 10 attempts, never retrying on a
    ValueError) and the ratelimit decorator.

    Args:
        q (str): query string, multiple words should be separated with +
        kwargs (str): name and value of any other valid query parameters

    Returns:
        dict: lat and lon of the first match, or None when nothing matched

    Raises:
        ValueError: for an unknown parameter name, when q is combined with
            other parameters, or when no parameters are given at all.
    '''
    valid_kwargs = ['street', 'city', 'county', 'state', 'country', 'postalcode']
    if any(name not in valid_kwargs for name in kwargs):
        raise ValueError(f"Invalid query parameter. Not in: {valid_kwargs}")

    has_query = bool(q)
    has_params = bool(kwargs)
    if has_query and has_params:
        raise ValueError("Supply either q OR other query parameters, they cannot be combined.")
    if not (has_query or has_params):
        raise ValueError("No query parameters supplied")

    if has_query:
        query_kwargs = {'q': q}
    else:
        query_kwargs = kwargs

    try:
        geo_data = geocode(**query_kwargs)
    except ValueError:
        logging.debug(f"Unable to geocode {query_kwargs}")
        return None  # converts to null which is accepted in elasticsearch

    best_match = geo_data[0]
    lat = best_match['lat']
    lon = best_match['lon']
    logging.debug(f"Successfully geocoded {query_kwargs} to {lat, lon}")
    return {'lat': lat, 'lon': lon}
def geocode_dataframe(df):
    '''
    A wrapper for the geocode function to process a supplied dataframe using
    the city and country.

    Each unique, non-empty city/country pair is geocoded once: first with the
    structured city and country parameters, then (for pairs that failed) with
    a free-text "{city} {country}" query.

    Args:
        df (dataframe): a dataframe containing city and country fields.

    Returns:
        a dataframe with a 'coordinates' column appended. When there is
        nothing to geocode the column is added to the supplied dataframe
        itself (filled with None); otherwise a new merged dataframe is
        returned.
    '''
    in_cols = ['city', 'country']
    out_col = 'coordinates'

    # Only geocode unique city/country combos
    _df = df[in_cols].drop_duplicates()
    # Treat empty strings as missing. A plain float NaN is used because the
    # `pd.np` alias no longer exists in current pandas releases.
    _df = _df.replace('', float('nan')).dropna()
    if _df.empty:
        df[out_col] = None
        return df

    # Attempt to geocode with city and country
    _df[out_col] = _df[in_cols].apply(lambda row: _geocode(**row), axis=1)

    # Attempt to geocode with query for those which failed
    null = pd.isnull(_df[out_col])
    if null.any():
        query = "{city} {country}"
        _df.loc[null, out_col] = _df.loc[null, in_cols].apply(
            lambda row: _geocode(query.format(**row)), axis=1)

    # Merge the results again
    return pd.merge(df, _df, how='left', left_on=in_cols, right_on=in_cols)
def geocode_batch_dataframe(df, city='city', country='country',
                            latitude='latitude', longitude='longitude',
                            query_method='both'):
    """Geocodes a dataframe, first by supplying the city and country to the api, if this
    fails a second attempt is made supplying the combination using the q= method.
    The supplied dataframe df is returned with additional columns appended, containing
    the latitude and longitude as floats.

    Args:
        df (:obj:`pandas.DataFrame`): input dataframe
        city (str): name of the input column containing the city
        country (str): name of the input column containing the country
        latitude (str): name of the output column containing the latitude
        longitude (str): name of the output column containing the longitude
        query_method (str): query methods to attempt:
            'city_country_only': city and country only
            'query_only': q method only
            'both': city, country with fallback to q method

    Returns:
        (:obj:`pandas.DataFrame`): original dataframe with lat and lon appended as floats
            (None where no location was found)

    Raises:
        ValueError: if query_method is not one of the supported values.
    """
    if query_method not in ['city_country_only', 'query_only', 'both']:
        raise ValueError("Invalid query method, must be 'city_country_only', 'query_only' or 'both'")

    df[latitude], df[longitude] = None, None
    # Results are written by position rather than by index label: with a
    # duplicated index, label-based assignment would overwrite every row
    # sharing that label.
    lat_pos = df.columns.get_loc(latitude)
    lon_pos = df.columns.get_loc(longitude)

    for pos, (_, row) in enumerate(df.iterrows()):
        location = None
        if query_method in ['city_country_only', 'both']:
            location = _geocode(city=row[city], country=row[country])
        if location is None and query_method in ['query_only', 'both']:
            query = f"{row[city]} {row[country]}"
            location = _geocode(q=query)
        if location is not None:
            df.iloc[pos, lat_pos] = float(location['lat'])
            df.iloc[pos, lon_pos] = float(location['lon'])
    return df
def generate_composite_key(city=None, country=None):
    """Generates a composite key to use as the primary key for the geographic data.

    Spaces in each name are replaced with hyphens, the names are lowercased
    and then joined with an underscore, e.g. 'new-york_united-states'.

    Args:
        city (str): name of the city
        country (str): name of the country

    Returns:
        (str): composite key

    Raises:
        ValueError: if city or country lacks string methods (e.g. is None).
    """
    try:
        # Separate names keep the caller's inputs intact, so the error message
        # below reports exactly what was supplied.
        city_part = city.replace(' ', '-').lower()
        country_part = country.replace(' ', '-').lower()
    except AttributeError as err:
        raise ValueError(
            f"Invalid city or country name. City: {city} | Country: {country}") from err
    return '_'.join([city_part, country_part])