Source code for packages.health_data.collect_nih

'''
Collect NIH
===========

Extract all of the NIH World RePORTER data via
their static data dump. :code:`N_TABS` outputs are produced
in CSV format (concatenated across all years), where
:code:`N_TABS` corresponds to the number of tabs in
the main table found at:

    https://exporter.nih.gov/ExPORTER_Catalog.aspx

The data is transferred to the Nesta intermediate data bucket.
'''

from bs4 import BeautifulSoup
import boto3
from io import BytesIO
from io import StringIO
import requests
from zipfile import ZipFile
import csv

# Some constants
BASE_URL = "https://exporter.nih.gov/"
TOP_URL = "https://exporter.nih.gov/ExPORTER_Catalog.aspx"
N_TABS = 5
S3 = boto3.resource('s3')


def get_data_urls(tab_index):
    '''Get all CSV URLs from the :code:`tab_index`th tab of the main table
    found at :code:`TOP_URL`.

    Args:
        tab_index (int): Tab number (0-indexed) of the table from which to
            extract CSV URLs.

    Returns:
        title (str): Title of the tab in the table.
        hrefs (list): List of URLs pointing to data CSVs.
    '''
    # Make the request and process the response
    r = requests.get(TOP_URL, params={'index': tab_index})
    soup = BeautifulSoup(r.text, "html.parser")
    # Get the selected tab's title
    title = soup.find('a', class_='selected')['title']
    # Extract URLs containing 'CSVs', but ignore 'DUNS' data
    hrefs = []
    for a in soup.find_all('a', href=True):
        if ('CSVs' not in a['href']) or ('DUNS' in a['href']):
            continue
        # Prepend the base URL to relative links (BASE_URL already ends in '/')
        if not a['href'].startswith('https'):
            a['href'] = '{}{}'.format(BASE_URL, a['href'])
        hrefs.append(a['href'])
    return title, hrefs

def iterrows(url):
    '''Yield rows from the CSV (found at URL :code:`url`) as JSON
    (well, :code:`dict` objects).

    Args:
        url (str): The URL at which a zipped-up CSV is found.

    Yields:
        :code:`dict` object, representing one row of the CSV.
    '''
    # Get the CSV for this URL by unzipping the file found at 'url'
    with BytesIO() as tmp_file:
        r = requests.get(url)
        tmp_file.write(r.content)
        with ZipFile(tmp_file) as tmp_zip:
            internal_file_names = tmp_zip.namelist()
            assert len(internal_file_names) == 1
            _data = tmp_zip.read(internal_file_names[0])
    # Yield JSON chunks of each CSV row
    _data = _data.decode('cp1252').split("\r\n")
    _data_it = csv.reader(_data)
    columns = next(_data_it)
    for row in _data_it:
        yield {col.lower(): val for col, val in zip(columns, row)}

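
# The module docstring describes one output per tab, concatenated across all
# years. A minimal sketch of that concatenation step is given below;
# `concat_rows_to_csv` is a hypothetical helper (not part of the original
# module) which collects the rows yielded by :code:`iterrows` across all of a
# tab's URLs into a single in-memory CSV, using the already-imported
# :code:`StringIO` and :code:`csv` modules.
def concat_rows_to_csv(urls):
    '''Concatenate the rows from every zipped CSV in :code:`urls` into one
    CSV string (a sketch; column sets are assumed to be identical across
    years for a given tab).'''
    out = StringIO()
    writer = None
    for url in urls:
        for row in iterrows(url):
            # Skip empty rows (e.g. trailing blank lines in the source CSV)
            if not row:
                continue
            if writer is None:
                # Take the field names from the first row encountered
                writer = csv.DictWriter(out, fieldnames=list(row.keys()))
                writer.writeheader()
            writer.writerow(row)
    return out.getvalue()
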
if __name__ == '__main__':
    # Iterate over the tabs on the page
    for i in range(0, N_TABS+1):
        # Get the URLs which point to the data for this tab
        title, urls = get_data_urls(i)
        print(title)
        # Iterate over URLs for this tab
        for url in urls:
            print("\t", url)
            # Iterate over rows in the CSV
            for row in iterrows(url):
                print("\t\t", row)
                break
            break
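
# The docstring states that the data is transferred to the Nesta intermediate
# data bucket, but that step is not implemented above (the module-level
# :code:`S3` resource is otherwise unused). The sketch below shows one way
# the transfer could look; the helper name and the bucket/key names are
# assumptions for illustration only, not the real Nesta configuration.
def upload_csv_to_s3(csv_text, key, bucket='nesta-intermediate-data'):
    '''Upload a CSV string to S3 (a sketch; the bucket name here is a
    placeholder).'''
    S3.Object(bucket, key).put(Body=csv_text.encode('utf-8'))

# Example usage (sketch), tying the helpers together for one tab:
#     title, urls = get_data_urls(0)
#     csv_text = concat_rows_to_csv(urls)
#     upload_csv_to_s3(csv_text, '{}.csv'.format(title))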