Source code for core.orms.nih_orm

'''
NIH schema
==============

The schema for the World RePORTER data.
'''

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.types import INTEGER, JSON, DATETIME, FLOAT
from sqlalchemy import Column, Table, ForeignKey
from sqlalchemy.orm import relationship
from sqlalchemy.ext.associationproxy import association_proxy

from nesta.core.orms.types import VARCHAR, TEXT


[docs]def getattr_(entity, attribute):
    """Either unpack the attribute from every item in the entity
    if the entity is a list, otherwise just return the attribute
    from the entity. Returns None if the entity is either None
    or empty."""
    if entity in (None, []):
        return None
    if isinstance(entity, list):
        return [getattr(item, attribute) for item in entity]
    return getattr(entity, attribute)


Base = declarative_base()

[docs]class Projects(Base):
    __tablename__ = 'nih_projects'

    application_id = Column(INTEGER, primary_key=True, autoincrement=False)
    activity = Column(VARCHAR(3))
    administering_ic = Column(VARCHAR(2))
    application_type = Column(INTEGER)
    arra_funded = Column(VARCHAR(1))
    award_notice_date = Column(DATETIME)
    base_core_project_num = Column(VARCHAR(50), index=True)
    budget_start = Column(DATETIME)
    budget_end = Column(DATETIME)
    cfda_code = Column(TEXT)
    core_project_num = Column(VARCHAR(50), index=True)
    ed_inst_type = Column(TEXT)
    foa_number = Column(TEXT)
    full_project_num = Column(VARCHAR(50), index=True)
    funding_ics = Column(JSON)
    funding_mechanism = Column(TEXT)
    fy = Column(INTEGER, index=True)
    ic_name = Column(VARCHAR(100), index=True)
    org_city = Column(VARCHAR(50), index=True)
    org_country = Column(VARCHAR(50), index=True)
    org_dept = Column(VARCHAR(100), index=True)
    org_district = Column(INTEGER)
    org_duns = Column(JSON)
    org_fips = Column(VARCHAR(2), index=True)
    org_ipf_code = Column(INTEGER)
    org_name = Column(VARCHAR(100), index=True)
    org_state = Column(VARCHAR(2), index=True)
    org_zipcode = Column(VARCHAR(10))
    phr = Column(TEXT)
    pi_ids = Column(JSON)
    pi_names = Column(JSON)
    program_officer_name = Column(TEXT)
    project_start = Column(DATETIME, index=True)
    project_end = Column(DATETIME, index=True)
    project_terms = Column(JSON)
    project_title = Column(TEXT)
    serial_number = Column(VARCHAR(6))
    study_section = Column(VARCHAR(4))
    study_section_name = Column(TEXT)
    suffix = Column(VARCHAR(6))
    support_year = Column(VARCHAR(2))
    direct_cost_amt = Column(INTEGER)
    indirect_cost_amt = Column(INTEGER)
    total_cost = Column(INTEGER)
    subproject_id = Column(INTEGER, index=True)
    total_cost_sub_project = Column(INTEGER)
    nih_spending_cats = Column(JSON)

    # Pseudo-FKs
    abstract = relationship("Abstracts", uselist=False, 
                            foreign_keys=[application_id],
                            primaryjoin=("Projects.application_id=="
                                         "Abstracts.application_id"))
    publications = relationship("LinkTables", uselist=True,
                                foreign_keys=[core_project_num],
                                primaryjoin=("Projects.core_project_num=="
                                             "LinkTables.project_number"))
    patents = relationship("Patents", uselist=True,
                           foreign_keys=[core_project_num],
                           primaryjoin=("Projects.core_project_num=="
                                        "Patents.project_id"))
    clinicalstudies = relationship("ClinicalStudies", uselist=True,
                                   foreign_keys=[core_project_num],
                                   primaryjoin=("Projects.core_project_num=="
                                                "ClinicalStudies.core_project_number"))

    # Pseudo-fields (populated from relationships)
    @property
    def abstract_text(self):
        return getattr_(self.abstract, "abstract_text")

    @property
    def patent_ids(self):
        return getattr_(self.patents, "patent_id")

    @property
    def patent_titles(self):
        return getattr_(self.patents, "patent_title")
        
    @property
    def pmids(self):
        return getattr_(self.publications, "pmid")

    @property
    def clinicaltrial_ids(self):
        return getattr_(self.clinicalstudies, "clinicaltrials_gov_id")

    @property
    def clinicaltrial_titles(self):
        return getattr_(self.clinicalstudies, "study")

        


[docs]class Abstracts(Base):
    __tablename__ = 'nih_abstracts'
    application_id = Column(INTEGER, primary_key=True, autoincrement=False)
    abstract_text = Column(TEXT)
    


[docs]class Publications(Base):
    __tablename__ = 'nih_publications'

    pmid = Column(INTEGER, primary_key=True, autoincrement=False)
    author_name = Column(TEXT)
    affiliation = Column(TEXT)
    author_list = Column(JSON)
    country = Column(VARCHAR(50), index=True)
    issn = Column(VARCHAR(9))
    journal_issue = Column(VARCHAR(75))
    journal_title = Column(VARCHAR(400), index=True)
    journal_title_abbr = Column(VARCHAR(200))
    journal_volume = Column(VARCHAR(100))
    lang = Column(VARCHAR(3))
    page_number = Column(VARCHAR(200))
    pub_date = Column(DATETIME)
    pub_title = Column(VARCHAR(400), index=True)
    pub_year = Column(INTEGER, index=True)
    pmc_id = Column(INTEGER, index=True)

     
[docs]class Patents(Base):
    __tablename__ = 'nih_patents'
    
    patent_id = Column(VARCHAR(20), primary_key=True)
    patent_title = Column(TEXT)
    project_id = Column(VARCHAR(50), index=True)
    patent_org_name = Column(TEXT)


[docs]class LinkTables(Base):
    __tablename__ = 'nih_linktables'

    pmid = Column(INTEGER, primary_key=True, autoincrement=False)
    project_number = Column(VARCHAR(50), index=True)


[docs]class ClinicalStudies(Base):
    __tablename__ = "nih_clinicalstudies"
    
    clinicaltrials_gov_id = Column(VARCHAR(20), primary_key=True)
    core_project_number = Column(VARCHAR(50), index=True)
    study = Column(TEXT)
    study_status = Column(VARCHAR(30), index=True)


[docs]class PhrVector(Base):
    """Document vectors for NiH Public Health Relevance (PHR) statements."""
    __tablename__ = 'nih_phr_vectors'
    application_id = Column(INTEGER, ForeignKey(Projects.application_id),
                            autoincrement=False, primary_key=True)
    vector = Column(JSON)


[docs]class AbstractVector(Base):
    """Document vectors for NiH abstracts."""
    __tablename__ = 'nih_abstract_vectors'
    application_id = Column(INTEGER, ForeignKey(Abstracts.application_id),
                            autoincrement=False, primary_key=True)
    vector = Column(JSON)


[docs]class TextDuplicate(Base):
    """Link table to describe for NiH text-field duplicates,
    which probably imply that projects are related, either
    formally (if weight > 0.8 they are normally 
    almost exact duplicates of each other) or contextually 
    (if weight > 0.5 it is normally in the same general
    subject area).
    
    The cut-off for inclusion in this table is a weight of 0.5,
    because the core interest for using this method is
    to identify texts which are near duplicates,
    since texts which are contextually similar can
    also be found by other metrics (topic modelling, etc) and there
    can be some weird side-effects of using BERT for this;
    e.g. finding texts with a similar writing style rather than topic.
    """
    __tablename__ = 'nih_duplicates'
    application_id_1 = Column(INTEGER, ForeignKey(Projects.application_id),
                              autoincrement=False, primary_key=True)
    application_id_2 = Column(INTEGER, ForeignKey(Projects.application_id),
                              autoincrement=False, primary_key=True)
    text_field = Column(VARCHAR(8), primary_key=True, 
                        index=True)  # Either "phr" or "abstract"
    weight = Column(FLOAT, index=True)