Source code for packages.meetup.meetup_utils

import os
import random
import json
import numpy as np
from nesta.core.orms.orm_utils import db_session
from nesta.core.orms.meetup_orm import Group
from collections import Counter


def get_api_key():
    '''Get a random API key from those listed in the environmental variable
    :code:`MEETUP_API_KEYS`.

    The variable is read as a comma-separated list. Whitespace surrounding
    each key is stripped and empty entries (for example those produced by a
    trailing comma) are ignored, so that a blank string is never returned
    as a key.

    Returns:
        :obj:`str`: One of the configured API keys, chosen at random.

    Raises:
        KeyError: If :code:`MEETUP_API_KEYS` is not set.
        ValueError: If :code:`MEETUP_API_KEYS` contains no non-empty key.
    '''
    raw_keys = os.environ["MEETUP_API_KEYS"].split(",")
    # Tolerate "key1, key2," style values: strip padding, drop blanks
    api_keys = [key.strip() for key in raw_keys if key.strip()]
    if not api_keys:
        raise ValueError("MEETUP_API_KEYS does not contain any API keys")
    return random.choice(api_keys)

def save_sample(json_data, filename, k):
    '''Dump a sample of :code:`k` items from row-oriented JSON
    data :code:`json_data` into file with name :code:`filename`.

    The sample is drawn and serialised before the file is opened, so
    that a failure (e.g. :code:`k` larger than the data, or data which
    is not JSON serialisable) does not truncate an existing file.

    Args:
        json_data (:obj:`list`): Row-orientated JSON data to sample from.
        filename (:obj:`str`): Path of the file to write the sample to.
        k (int): Number of items to sample, without replacement.

    Raises:
        ValueError: If :code:`k` is negative or larger than the number
                    of items in :code:`json_data`.
    '''
    # Opening in 'w' mode truncates the file, so do all the work which
    # can fail before touching it.
    payload = json.dumps(random.sample(json_data, k))
    with open(filename, 'w') as fp:
        fp.write(payload)


def flatten_data(list_json_data, keys, **kwargs):
    '''Flatten nested JSON data from a list of JSON objects, by
    a list of desired keys. Each element in the :code:`keys`
    may also be an ordered iterable of keys,
    such that subsequent keys describe a path through the
    JSON to desired value. For example in order to extract
    `key1` and `key3` from:

    .. code-block:: python

        {'key1': <some_value>, 'key2' : {'key3': <some_value>}}

    one would specify :code:`keys` as:

    .. code-block:: python

        ['key1', ('key2', 'key3')]

    The output field name of a nested value is its path joined
    with underscores (:code:`'key2_key3'` in the example above).
    Fields which cannot be found in a given row are omitted from
    that row; this includes paths which pass through a value which
    cannot be indexed by key (such as :code:`None`).

    Args:
        list_json_data (:obj:`json`): Row-orientated JSON data.
        keys (:obj:`list`): Mixed list of either: individual `str` keys for data values
        which are not nested; **or** sublists of `str`, as described above.
        **kwargs: Any constants to include in every flattened row of the output.
                  An extracted field with the same name takes precedence.

    Returns:
       :obj:`json`: Flattened row-orientated JSON data.
    '''
    # Resolve every requested key into (output field name, lookup path)
    # once, rather than repeating the work for each row
    fields = []
    for k in keys:
        if isinstance(k, str):
            fields.append((k, (k,)))
        else:
            path = tuple(k)
            fields.append(("_".join(path), path))

    # Loop through groups
    output = []
    for info in list_json_data:
        row = dict(kwargs)
        for field_name, path in fields:
            value = info
            try:
                # Walk down the path of keys to the desired value
                for _k in path:
                    value = value[_k]
            # Ignore fields which aren't found (these will appear
            # as NULL in the database anyway). A TypeError means the
            # path went through a non-mapping value, e.g. a null.
            except (KeyError, TypeError):
                continue
            row[field_name] = value
        output.append(row)
    return output


def get_members_by_percentile(engine, perc=10):
    """Evaluate a percentile of the member counts of the meetup
    groups held in the database.

    Args:
        engine: A SQL alchemy connectable.
        perc (int): The percentile to evaluate.
    Returns:
        members (float): The member count found at this percentile.
    """
    with db_session(engine) as session:
        query = session.query(Group.members)
        # Pull the raw counts out of the result rows while the
        # session is still open
        member_counts = [record.members for record in query.all()]
    percentile_value = np.percentile(member_counts, perc)
    return float(percentile_value)


def get_core_topics(engine, core_categories, members_limit, perc=99):
    """Get the most frequent topics from a selection of meetup categories,
    from the database.

    Topic names are counted over every group which belongs to one of
    :code:`core_categories` and has at least :code:`members_limit` members.
    A topic is returned if its count is at or above the :code:`perc`
    percentile of all topic counts. Groups with no topics recorded
    contribute nothing to the counts.

    Args:
        engine: A SQL alchemy connectable.
        core_categories (list): A list of category_shortnames.
        members_limit (int): Minimum number of members required in a group
                             for it to be considered.
        perc (int): A percentile to evaluate the most frequent topics.
    Returns:
        topics (set): The set of most frequent topics. Empty if no
                      topics were found for the selection.
    """
    with db_session(engine) as session:
        rows = (session
                .query(Group.topics)
                .filter(Group.members >= members_limit)
                .filter(Group.category_shortname.in_(core_categories))
                .all())
        rows = [r.topics for r in rows]
    # `topics or ()` skips groups whose topics value is empty or null
    topic_counts = Counter(t['name'] for topics in rows
                           for t in (topics or ()))
    # A percentile of nothing is undefined, so short-circuit rather
    # than passing an empty list to numpy
    if not topic_counts:
        return set()
    topic_cutoff = np.percentile(list(topic_counts.values()), perc)
    return {k for k, v in topic_counts.items() if v >= topic_cutoff}