'''
NIH schema
==============
The schema for the World RePORTER data.
'''
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.types import INTEGER, JSON, DATETIME, FLOAT
from sqlalchemy import Column, Table, ForeignKey
from sqlalchemy.orm import relationship
from sqlalchemy.ext.associationproxy import association_proxy
from nesta.core.orms.types import VARCHAR, TEXT
[docs]def getattr_(entity, attribute):
"""Either unpack the attribute from every item in the entity
if the entity is a list, otherwise just return the attribute
from the entity. Returns None if the entity is either None
or empty."""
if entity in (None, []):
return None
if isinstance(entity, list):
return [getattr(item, attribute) for item in entity]
return getattr(entity, attribute)
Base = declarative_base()
[docs]class Projects(Base):
__tablename__ = 'nih_projects'
application_id = Column(INTEGER, primary_key=True, autoincrement=False)
activity = Column(VARCHAR(3))
administering_ic = Column(VARCHAR(2))
application_type = Column(INTEGER)
arra_funded = Column(VARCHAR(1))
award_notice_date = Column(DATETIME)
base_core_project_num = Column(VARCHAR(50), index=True)
budget_start = Column(DATETIME)
budget_end = Column(DATETIME)
cfda_code = Column(TEXT)
core_project_num = Column(VARCHAR(50), index=True)
ed_inst_type = Column(TEXT)
foa_number = Column(TEXT)
full_project_num = Column(VARCHAR(50), index=True)
funding_ics = Column(JSON)
funding_mechanism = Column(TEXT)
fy = Column(INTEGER, index=True)
ic_name = Column(VARCHAR(100), index=True)
org_city = Column(VARCHAR(50), index=True)
org_country = Column(VARCHAR(50), index=True)
org_dept = Column(VARCHAR(100), index=True)
org_district = Column(INTEGER)
org_duns = Column(JSON)
org_fips = Column(VARCHAR(2), index=True)
org_ipf_code = Column(INTEGER)
org_name = Column(VARCHAR(100), index=True)
org_state = Column(VARCHAR(2), index=True)
org_zipcode = Column(VARCHAR(10))
phr = Column(TEXT)
pi_ids = Column(JSON)
pi_names = Column(JSON)
program_officer_name = Column(TEXT)
project_start = Column(DATETIME, index=True)
project_end = Column(DATETIME, index=True)
project_terms = Column(JSON)
project_title = Column(TEXT)
serial_number = Column(VARCHAR(6))
study_section = Column(VARCHAR(4))
study_section_name = Column(TEXT)
suffix = Column(VARCHAR(6))
support_year = Column(VARCHAR(2))
direct_cost_amt = Column(INTEGER)
indirect_cost_amt = Column(INTEGER)
total_cost = Column(INTEGER)
subproject_id = Column(INTEGER, index=True)
total_cost_sub_project = Column(INTEGER)
nih_spending_cats = Column(JSON)
# Pseudo-FKs
abstract = relationship("Abstracts", uselist=False,
foreign_keys=[application_id],
primaryjoin=("Projects.application_id=="
"Abstracts.application_id"))
publications = relationship("LinkTables", uselist=True,
foreign_keys=[core_project_num],
primaryjoin=("Projects.core_project_num=="
"LinkTables.project_number"))
patents = relationship("Patents", uselist=True,
foreign_keys=[core_project_num],
primaryjoin=("Projects.core_project_num=="
"Patents.project_id"))
clinicalstudies = relationship("ClinicalStudies", uselist=True,
foreign_keys=[core_project_num],
primaryjoin=("Projects.core_project_num=="
"ClinicalStudies.core_project_number"))
# Pseudo-fields (populated from relationships)
@property
def abstract_text(self):
return getattr_(self.abstract, "abstract_text")
@property
def patent_ids(self):
return getattr_(self.patents, "patent_id")
@property
def patent_titles(self):
return getattr_(self.patents, "patent_title")
@property
def pmids(self):
return getattr_(self.publications, "pmid")
@property
def clinicaltrial_ids(self):
return getattr_(self.clinicalstudies, "clinicaltrials_gov_id")
@property
def clinicaltrial_titles(self):
return getattr_(self.clinicalstudies, "study")
[docs]class Abstracts(Base):
__tablename__ = 'nih_abstracts'
application_id = Column(INTEGER, primary_key=True, autoincrement=False)
abstract_text = Column(TEXT)
[docs]class Publications(Base):
__tablename__ = 'nih_publications'
pmid = Column(INTEGER, primary_key=True, autoincrement=False)
author_name = Column(TEXT)
affiliation = Column(TEXT)
author_list = Column(JSON)
country = Column(VARCHAR(50), index=True)
issn = Column(VARCHAR(9))
journal_issue = Column(VARCHAR(75))
journal_title = Column(VARCHAR(400), index=True)
journal_title_abbr = Column(VARCHAR(200))
journal_volume = Column(VARCHAR(100))
lang = Column(VARCHAR(3))
page_number = Column(VARCHAR(200))
pub_date = Column(DATETIME)
pub_title = Column(VARCHAR(400), index=True)
pub_year = Column(INTEGER, index=True)
pmc_id = Column(INTEGER, index=True)
[docs]class Patents(Base):
__tablename__ = 'nih_patents'
patent_id = Column(VARCHAR(20), primary_key=True)
patent_title = Column(TEXT)
project_id = Column(VARCHAR(50), index=True)
patent_org_name = Column(TEXT)
[docs]class LinkTables(Base):
__tablename__ = 'nih_linktables'
pmid = Column(INTEGER, primary_key=True, autoincrement=False)
project_number = Column(VARCHAR(50), index=True)
[docs]class ClinicalStudies(Base):
__tablename__ = "nih_clinicalstudies"
clinicaltrials_gov_id = Column(VARCHAR(20), primary_key=True)
core_project_number = Column(VARCHAR(50), index=True)
study = Column(TEXT)
study_status = Column(VARCHAR(30), index=True)
[docs]class PhrVector(Base):
"""Document vectors for NiH Public Health Relevance (PHR) statements."""
__tablename__ = 'nih_phr_vectors'
application_id = Column(INTEGER, ForeignKey(Projects.application_id),
autoincrement=False, primary_key=True)
vector = Column(JSON)
[docs]class AbstractVector(Base):
"""Document vectors for NiH abstracts."""
__tablename__ = 'nih_abstract_vectors'
application_id = Column(INTEGER, ForeignKey(Abstracts.application_id),
autoincrement=False, primary_key=True)
vector = Column(JSON)
[docs]class TextDuplicate(Base):
"""Link table to describe for NiH text-field duplicates,
which probably imply that projects are related, either
formally (if weight > 0.8 they are normally
almost exact duplicates of each other) or contextually
(if weight > 0.5 it is normally in the same general
subject area).
The cut-off for inclusion in this table is a weight of 0.5,
because the core interest for using this method is
to identify texts which are near duplicates,
since texts which are contextually similar can
also be found by other metrics (topic modelling, etc) and there
can be some weird side-effects of using BERT for this;
e.g. finding texts with a similar writing style rather than topic.
"""
__tablename__ = 'nih_duplicates'
application_id_1 = Column(INTEGER, ForeignKey(Projects.application_id),
autoincrement=False, primary_key=True)
application_id_2 = Column(INTEGER, ForeignKey(Projects.application_id),
autoincrement=False, primary_key=True)
text_field = Column(VARCHAR(8), primary_key=True,
index=True) # Either "phr" or "abstract"
weight = Column(FLOAT, index=True)