Source code for nesta.core.routines.crunchbase.crunchbase_lolvelty

"""
Novelty score (lolvelty)
========================

Apply "lolvelty" score to Crunchbase data (in Elasticsearch). Note: this is a slow
procedure that is applied on a document-by-document basis.
"""


from nesta.core.luigihacks.estask import LazyElasticsearchTask
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub as f3p
import luigi
from datetime import datetime as dt
import logging

class CrunchbaseLolveltyRootTask(luigi.WrapperTask):
    """Apply Lolvelty score to crunchbase data.

    Args:
        production (bool): Running in full production mode?
        index (str): Elasticsearch index to append Lolvelty score to.
            Only consulted when ``production`` is True; otherwise the
            ``'companies_dev'`` index is used.
        date (datetime): Date for timestamping this routine.
    """
    production = luigi.BoolParameter(default=False)
    index = luigi.Parameter(default=None)
    # NOTE: this default is evaluated once, when the module is imported,
    # not each time the task is instantiated.
    date = luigi.DateParameter(default=dt.now())

    def requires(self):
        """Build the single Elasticsearch task wrapped by this root task.

        As a side effect, the root logger level is set to INFO.

        Returns:
            LazyElasticsearchTask: the task configured with the batchable
            at ``batchables/novelty/lolvelty``.

        Raises:
            ValueError: if running in production mode without an ``index``.
        """
        logging.getLogger().setLevel(logging.INFO)

        test = not self.production
        routine_id = f"CrunchbaseLolveltyTask-{self.date}-{test}"

        # Outside of production the dev index is always used, regardless
        # of any value supplied for ``index``.
        index = self.index if self.production else 'companies_dev'
        if index is None:
            # Explicit check rather than ``assert``, so that the validation
            # still happens when Python runs with optimisations (-O).
            raise ValueError("An 'index' must be specified when running "
                             "in production mode")

        # Forwarded as-is to the task via its ``kwargs`` argument.
        kwargs = {'score_field': 'rank_rhodonite_organisation',
                  'fields': ['name_of_organisation',
                             'textBody_descriptive_organisation',
                             'terms_category_organisation']}

        return LazyElasticsearchTask(routine_id=routine_id,
                                     test=test,
                                     index=index,
                                     dataset='crunchbase',
                                     entity_type='company',
                                     kwargs=kwargs,
                                     batchable=f3p("batchables/novelty/lolvelty"),
                                     env_files=[f3p("nesta/"),
                                                f3p("config/mysqldb.config"),
                                                f3p("config/elasticsearch.config")],
                                     job_def="py36_amzn1_image",
                                     job_name=routine_id,
                                     job_queue="HighPriority",
                                     region_name="eu-west-2",
                                     poll_time=10,
                                     memory=1024,
                                     max_live_jobs=10)