Source code for src.features.utils

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 15 14:13:51 2021

@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
"""

import io
import re
import gzip
import logging
import pathlib
import collections

# Get an instance of a logger
logger = logging.getLogger(__name__)


def sanitize(
        word: str,
        chars=('.', ",", "-", "/", "#"),
        check_mongoengine=True) -> str:
    """Sanitize a word by removing unwanted characters and lowercase it.

    Every character listed in ``chars`` is replaced by a space, then each run
    of whitespace is collapsed into a single underscore and the result is
    lowercased. A single leading underscore, if any, is dropped.

    Args:
        word (str): the word to sanitize
        chars (iterable): the characters to remove. Each one is matched
            literally, even when it is a regex metacharacter
        check_mongoengine (bool): true to add '_' after a mongoengine reserved
            word ('size' or 'type')

    Returns:
        str: the sanitized word
    """

    # escape the characters: they are placed inside a regex character class,
    # where an unescaped '-', '^', ']' or '\' would change the meaning of the
    # class (e.g. ['a', '-', 'z'] would otherwise match every lowercase
    # letter)
    pattern = re.escape("".join(chars))

    # an empty character class ('[]') is not a valid regex: when there is
    # nothing to remove, leave the word untouched
    if pattern:
        tmp = re.sub(r'[%s]' % (pattern), ' ', word)
    else:
        tmp = word

    # remove spaces from column name and lowercase all
    sanitized = re.sub(r"\s+", "_", tmp).lower()

    if check_mongoengine and sanitized in ('size', 'type'):
        sanitized += "_"

    # remove starting sanitized char (can't be used with namedtuple)
    if sanitized.startswith("_"):
        sanitized = sanitized[1:]

    return sanitized


def camelCase(string: str) -> str:
    """Convert a string into camel case

    Runs of '_', '-' and '.' are treated as word separators. Each word is
    title-cased, the words are joined together and the first character of the
    result is lowercased.

    Args:
        string (str): the string to convert

    Returns:
        str: the camel case version of the string. An empty string is
        returned when the input is empty or made only of separators
    """

    string = re.sub(r"(_|-|\.)+", " ", string).title().replace(" ", "")

    # nothing left to convert: indexing the first character would fail
    if not string:
        return string

    return string[0].lower() + string[1:]


class TqdmToLogger(io.StringIO):
    """
        File-like object that can be given to TQDM as output stream: the
        text written to it is sent to a logger instead of the StdOut.
    """
    logger = None
    level = None
    buf = ''

    def __init__(self, logger, level=None):
        """Store the logger to write to and the level used for messages.

        Args:
            logger: the object whose ``log(level, message)`` method receives
                the messages
            level: the logging level to use. When omitted (or falsy)
                ``logging.INFO`` is used
        """
        super().__init__()
        self.logger = logger
        self.level = level if level else logging.INFO

    def write(self, buf):
        """Keep the last written text, trimmed of surrounding '\\r', '\\n',
        '\\t' and spaces. Nothing is logged until :meth:`flush` is called."""
        self.buf = buf.strip('\r\n\t ')

    def flush(self):
        """Send the last text received by :meth:`write` to the logger."""
        self.logger.log(self.level, self.buf)


def get_project_dir() -> pathlib.PosixPath:
    """Return the smarter project base dir.

    The directory is computed from the location of this module: it is the
    third ancestor of the module file (``parents[2]``).

    Returns:
        pathlib.PosixPath: the smarter project base dir
    """
    module_path = pathlib.Path(__file__)
    return module_path.parents[2]


def get_raw_dir() -> pathlib.PosixPath:
    """Return the smarter raw data dir ("data/raw" under the project dir)

    Returns:
        pathlib.PosixPath: the smarter data raw directory
    """
    project_dir = get_project_dir()
    return project_dir.joinpath("data/raw")


def get_interim_dir() -> pathlib.PosixPath:
    """Return the smarter temporary data dir ("data/interim" under the
    project dir)

    Returns:
        pathlib.PosixPath: the smarter data temporary dir
    """
    project_dir = get_project_dir()
    return project_dir.joinpath("data/interim")


def get_processed_dir() -> pathlib.PosixPath:
    """Return the smarter final processed data dir ("data/processed" under
    the project dir)

    Returns:
        pathlib.PosixPath: the smarter data final processed dir
    """
    project_dir = get_project_dir()
    return project_dir.joinpath("data/processed")


def text_or_gzip_open(path: str, mode: str = None) -> io.TextIOWrapper:
    """Open a file which can be compressed or not. Returns file handle

    A path ending with the '.gz' suffix is opened with :func:`gzip.open`,
    any other path with the builtin :func:`open`.

    Args:
        path (str): the path of the file to open
        mode (str): the mode used to open the file. Defaults to text reading
            ('rt' for gzip files, 'r' otherwise). For gzip files, a mode
            which names neither 'b' nor 't' is opened as text, as the
            builtin :func:`open` does for uncompressed files

    Returns:
        io.TextIOWrapper: the open file handle (a binary handle when a
        binary mode is explicitly requested)
    """

    if pathlib.Path(path).suffix == '.gz':
        if not mode:
            mode = 'rt'

        elif 'b' not in mode and 't' not in mode:
            # gzip.open treats 'r', 'w', 'a' and 'x' as binary modes while
            # the builtin open treats them as text: ask explicitly for text
            # so that both branches return the same kind of handle
            mode += 't'

        logger.debug(f"Gzip detected for {path}")
        return gzip.open(path, mode=mode)

    if not mode:
        mode = 'r'

    return open(path, mode=mode)


def find_duplicates(header: list) -> list:
    """Find duplicate columns in list. Returns the indexes to remove, which
    are the ones after the first occurrence of each duplicated column

    Indexes are grouped by column, following the order in which the
    duplicated columns first appear in the header.

    Args:
        header (list): a list like the header read from a CSV file

    Returns:
        list: a list of index (numeric)
    """

    # collect the positions of every column, keeping first-seen order
    positions = {}

    for index, column in enumerate(header):
        positions.setdefault(column, []).append(index)

    # skip the first occurrence: a column seen only once contributes nothing
    return [
        index
        for indexes in positions.values()
        for index in indexes[1:]
    ]


def skip_comments(handle: io.TextIOWrapper, comment_char="#") -> (int, list):
    """
    Ignore comments lines from a open file handle. Return the stream position
    immediately after the comments and all the comment lines in a list.

    Lines are read (and stripped of surrounding whitespace) until the first
    one which doesn't start with ``comment_char``: a blank line or the end of
    the file also stops the search.

    Parameters
    ----------
    handle : io.TextIOWrapper
        An open file handle.
    comment_char : str, optional
        The comment character used in file. The default is "#".

    Returns
    -------
    (int, list)
        The stream position after the comments and the ignored lines as a list.
        When no comment line is found, the position is the one the handle had
        when this function was called and the list is empty.
    """

    # track skipped lines
    skipped = list()

    # this is the position to return when there are no comments at all
    position = handle.tell()

    # read first line
    line = handle.readline().strip()

    # search for comments in file. An empty line (blank line or EOF) is not
    # a comment and stops the search
    while line and line.startswith(comment_char):
        logger.debug(f"Skipping: {line}")
        skipped.append(line)

        # the stream is now at the beginning of the line following the comment
        position = handle.tell()

        # read another line
        line = handle.readline().strip()

    # one more line was read by the loop: the caller needs to seek to the
    # returned position to read the first non-comment line
    return position, skipped


class UnknownCountry:
    """Placeholder object for a country which is not known"""

    def __init__(self):
        """Set the placeholder values: "Unknown" as name, "UN" and "UNK" as
        alpha codes and no numeric code."""
        self.name, self.alpha_2, self.alpha_3 = "Unknown", "UN", "UNK"
        self.numeric = None