Source code for packages.nlp_utils.preprocess

import re
import string
import gensim
import nltk
from nltk.corpus import stopwords
import numpy as np
from operator import iadd
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

stop_words = set(stopwords.words('english') +
                 list(string.punctuation) +
                 ['\\n'] + ['quot'])

regex_str = [r"http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|"
             r"[!*\(\),](?:%[0-9a-f][0-9a-f]))+",
             r"(?:\w+-\w+){2}",
             r"(?:\w+-\w+)",
             r"(?:\\\+n+)",
             r"(?:@[\w_]+)",
             r"<[^>]+>",
             r"(?:\w+'\w)",
             r"(?:[\w_]+)",
             r"(?:\S)"]

# Compile the tokenizer: matching is case-insensitive, and whitespace between
# tokens is skipped rather than captured.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')',
                       re.VERBOSE | re.IGNORECASE)
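
# Illustrative sketch (added for clarity; not part of the original module):
# findall returns every non-overlapping match, so mentions and punctuation
# come back as separate tokens while whitespace is dropped, e.g.
#
#   >>> tokens_re.findall("hello @user, world!")
#   ['hello', '@user', ',', 'world', '!']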


def tokenize_document(text, remove_stops=False, keep_quasi_numeric=True):
    """Preprocess a whole raw document.

    Args:
        text (str): Raw string of text.
        remove_stops (bool): Flag to remove English stopwords.
        keep_quasi_numeric (bool): Flag to keep tokens that contain digits.

    Return:
        List of preprocessed and tokenized sentences.
    """
    return [clean_and_tokenize(sentence, remove_stops, keep_quasi_numeric)
            for sentence in nltk.sent_tokenize(text)]
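
# Illustrative example (added for clarity; not part of the original module):
# with the default flags, stopwords and digits are kept, pure punctuation is
# dropped, and one token list is returned per sentence, e.g.
#
#   >>> tokenize_document("The cat sat. The dog ran.")
#   [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]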


def clean_and_tokenize(text, remove_stops, keep_quasi_numeric=False):
    """Preprocess a raw string/sentence of text.

    Args:
        text (str): Raw string of text.
        remove_stops (bool): Flag to remove English stopwords.
        keep_quasi_numeric (bool): Flag to keep tokens that contain digits.

    Return:
        tokens (list of str): Preprocessed tokens.
    """
    tokens = tokens_re.findall(text)
    _tokens = [t.lower() for t in tokens]
    filtered_tokens = [token.replace('-', '_') for token in _tokens
                       if not (remove_stops and len(token) <= 2)
                       and (not remove_stops or token not in stop_words)
                       and (keep_quasi_numeric
                            or not any(x in token for x in string.digits))
                       and any(x in token for x in string.ascii_lowercase)]
    return filtered_tokens
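
# Illustrative example (added for clarity; not part of the original module):
# with remove_stops=True, stopwords and tokens of two characters or fewer are
# dropped; with keep_quasi_numeric=False, tokens containing digits go too, e.g.
#
#   >>> clean_and_tokenize("The cat sat on the mat in 1989", remove_stops=True)
#   ['cat', 'sat', 'mat']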


def filter_by_idf(documents, lower_idf_limit, upper_idf_limit):
    """Remove (from documents) terms which are in a range of IDF values.

    Args:
        documents (list): Either a :obj:`list` of :obj:`str` or a
            :obj:`list` of :obj:`list` of :obj:`str` to be filtered.
        lower_idf_limit (float): Percentile (between 0 and 100) below
            which to exclude terms by their IDF.
        upper_idf_limit (float): Percentile (between 0 and 100) at and
            above which to exclude terms by their IDF.

    Returns:
        Filtered documents
    """
    # Check the shape of the input documents
    docs = documents
    if type(documents[0]) is list:
        docs = [reduce(iadd, d) for d in documents]
    # Evaluate the TFIDF
    tfidf = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
    tfidf.fit(docs)
    lower_idf = np.percentile(tfidf.idf_, lower_idf_limit)
    upper_idf = np.percentile(tfidf.idf_, upper_idf_limit)
    # Pick out the vocab to be dropped
    drop_vocab = set(term for term, idx in tfidf.vocabulary_.items()
                     if tfidf.idf_[idx] < lower_idf
                     or tfidf.idf_[idx] >= upper_idf)
    # Filter the documents, dropping any sentences left empty
    new_docs = []
    for doc in documents:
        _new_doc = []
        for sent in doc:
            _new_sent = [w for w in sent if w not in drop_vocab]
            if len(_new_sent) == 0:
                continue
            _new_doc.append(_new_sent)
        new_docs.append(_new_doc)
    return new_docs
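
# Illustrative usage (a sketch, not part of the original module): documents
# are expected in the shape produced by tokenize_document, i.e. one list of
# per-sentence token lists per document. Terms whose IDF falls below the
# lower_idf_limit percentile (very common terms) or at/above the
# upper_idf_limit percentile (very rare terms) are removed from every
# sentence. `raw_texts` below is a hypothetical list of raw strings:
#
#   docs = [tokenize_document(text) for text in raw_texts]
#   docs = filter_by_idf(docs, lower_idf_limit=10, upper_idf_limit=90)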


if __name__ == '__main__':
    # Example run: tokenize the Gutenberg corpus and filter terms by IDF
    nltk.download("gutenberg")
    from nltk.corpus import gutenberg

    # Read each raw document from the corpus
    docs = []
    for fid in gutenberg.fileids():
        f = gutenberg.open(fid)
        docs.append(f.read())
        f.close()

    # Tokenize into per-sentence token lists, then drop terms outside
    # the 10th-90th IDF percentile range
    docs = [tokenize_document(d) for d in docs]
    docs = filter_by_idf(docs, 10, 90)