Source code for pulsar_playground.utils

""" Module for common tasks. """
from os import path
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from functools import reduce

# Directory that contains this module file; make_sets builds its output
# paths relative to it.
# NOTE(review): a module-level ``__dir__`` is also the hook that ``dir(module)``
# looks up and calls (PEP 562). Binding it to a string presumably makes
# ``dir()`` on this module fail -- consider renaming (together with its users).
__dir__ = path.dirname(__file__)


def make_sets(filename, test_size=0.3, random_state=42, stratify=True):
    """Split a CSV dataset into train and test CSV files with binarized labels.

    The last column of the input is treated as the label column. It is
    encoded with ``LabelBinarizer`` and the resulting frame is split with
    ``train_test_split``. The two parts are written, without the index, to
    ``dataset/train_set.csv`` and ``dataset/test_set.csv`` under this
    module's directory. Nothing is returned.

    Parameters
    ----------
    filename : str
        Input filename, read with ``pandas.read_csv``.
    test_size : float
        Test set ratio, forwarded to ``train_test_split``.
    random_state : int
        Random seed, forwarded to ``train_test_split``.
    stratify : bool
        If truthy, the split is stratified by the binarized label column;
        otherwise no stratification is applied.

    Raises
    ------
    ValueError
        If the label column does not binarize to a single column (i.e. it
        holds more than two classes), since the result could not be stored
        back into one column.
    """
    data = pd.read_csv(filename)

    # Binarize labels in case they are categorical.
    encoded = LabelBinarizer().fit_transform(data.iloc[:, -1])
    if encoded.ndim == 2 and encoded.shape[1] != 1:
        raise ValueError(
            "label column has more than two classes; "
            "cannot binarize it into a single column"
        )
    # fit_transform yields an (n_samples, 1) array for binary labels; flatten
    # it so it is assigned to the label column as a plain 1-D vector.
    data.iloc[:, -1] = encoded.ravel()

    # Keep the boolean parameter intact and derive the actual stratification
    # labels separately.
    strata = data.iloc[:, -1] if stratify else None

    train, test = train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
        stratify=strata,
    )

    # path.join avoids producing the absolute path "/dataset/..." when the
    # module directory is the empty string.
    out_dir = path.join(__dir__, "dataset")
    test.to_csv(path.join(out_dir, "test_set.csv"), index=False)
    train.to_csv(path.join(out_dir, "train_set.csv"), index=False)


def get_n_params(model):
    """Return the total number of combinations in a parameter grid.

    The result is the product of the lengths of all values in ``model``.

    Parameters
    ----------
    model : dict
        Mapping whose values are sized collections (anything supporting
        ``len``), e.g. parameter name -> list of candidate values.
        Presumably one entry of the 'model' dict from models.py -- confirm
        against the caller.

    Returns
    -------
    int
        Product of ``len(v)`` over all values. An empty mapping yields 1
        (the empty product) instead of raising ``TypeError``.
    """
    # The initial value 1 makes the fold well-defined for an empty grid.
    return reduce(lambda a, b: a * b, (len(v) for v in model.values()), 1)