# Source code for pulsar_playground.plots

""" Plotting module for data visualization and ML metrics  """
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.metrics import precision_recall_curve, confusion_matrix

# Apply a seaborn look to every figure created after this module is imported.
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old name, so prefer the new name whenever it is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Keyword arguments shared by every seaborn scatterplot drawn in this module.
scatterargs = {'size': 1, 'alpha': 0.4, 'legend': False, 'edgecolor': 'none'}

# Named transforms that plot_fcorr looks up by key and applies to a column.
tfs = {'none': lambda x: x*1, 'sqrt': np.sqrt, 'log': np.log}


def plot_info(data, ax=None):
    """ Summary of given dataframe, written as text on the axes.

    The last column of ``data`` is treated as the target; all other
    columns are counted and listed as features.

    Parameters
    ----------
    data : DataFrame
        Pandas dataframe.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    summary = '\n'.join([
        'Num. of examples: {}'.format(data.shape[0]),
        'Num. of features: {}'.format(data.shape[1] - 1),
        'Feature names: {}'.format(data.columns[:-1].tolist()),
        'Target name: \'{}\''.format(data.columns[-1]),
        'Classes: {}'.format(data.iloc[:, -1].unique().tolist()),
    ])

    # The panel only shows text, so remove the tick marks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.text(0.01, 0.5, summary, ha='left', va='center', size=14, linespacing=1.5)


def plot_nulls(data, ax=None):
    """ Percentage of null entries per feature (barplot).

    The last column of ``data`` is treated as the target and is left out.

    Parameters
    ----------
    data : DataFrame
        Pandas dataframe.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    # The mean of a boolean null mask is the fraction of nulls in each column.
    null_pct = data.iloc[:, :-1].isnull().mean() * 100
    null_pct.plot(kind='barh', ax=ax)

    # Fix the scale to the full 0-100 % range.
    ax.set_xlim(0, 100)
    ax.set_title('Percentage of null values per feature')


def plot_classprop(data, ax=None):
    """ Proportion of examples per class (pieplot).

    The last column of ``data`` is treated as the target variable.

    Parameters
    ----------
    data : DataFrame
        Pandas dataframe.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    class_counts = data.iloc[:, -1].value_counts(ascending=True)
    class_counts.plot(kind='pie', autopct="%1.0f%%", ax=ax)

    # Clear the y-label that the pandas pie plot would otherwise leave on the axes.
    ax.set_ylabel('')
    ax.set_title('Proportion of target variable')


def plot_fcorr(data, x_axis, y_axis, transform_x='none', transform_y='none', ax=None):
    """ Feature vs. feature plot (scatterplot).

    Points are coloured by the last column of ``data`` (the target).

    Parameters
    ----------
    data : DataFrame
        Pandas dataframe.
    x_axis : str
        Column name from dataframe.
    y_axis : str
        Column name from dataframe.
    transform_x : str
        Dictionary key from 'tfs' dict.
    transform_y : str
        Dictionary key from 'tfs' dict.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.

    Raises
    ------
    KeyError
        If a column name or a transform key does not exist.
    """
    # Resolve the default here: the label calls below need a real Axes object.
    if ax is None:
        ax = plt.gca()

    # x/y are passed by keyword; newer seaborn releases reject positional data.
    sns.scatterplot(
        x=data[x_axis].apply(tfs[transform_x]),
        y=data[y_axis].apply(tfs[transform_y]),
        hue=data.iloc[:, -1],
        ax=ax,
        **scatterargs
    )

    ax.set_xlabel(x_axis)
    ax.set_ylabel(y_axis)
    ax.set_title('Feature correlation')


def plot_hist(data, x_axis, bins=10, ax=None):
    """ Plots histograms for each class.

    One count histogram is drawn per distinct value of the last column
    of ``data`` (the target), all on the same axes.

    Parameters
    ----------
    data : DataFrame
        Pandas dataframe.
    x_axis : str
        Column name from dataframe.
    bins : int
        Number of bins.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    target = data.iloc[:, -1]
    for c in target.unique().tolist():
        values = data.loc[target == c, x_axis].dropna()
        # Drawn with Axes.hist directly instead of the deprecated sns.distplot;
        # dropping nulls and alpha=0.4 reproduce what distplot(kde=False) did.
        ax.hist(values, bins=bins, alpha=0.4)

    ax.set_title('Hist of ' + x_axis)
    ax.set_xlabel(x_axis)
    ax.set_ylabel('counts')


def plot_ecdf(data, x_axis, ax=None):
    """ Plots the empirical cumulative distribution for each class.

    One ECDF is drawn per distinct value of the last column of ``data``
    (the target), all on the same axes.

    Parameters
    ----------
    data : DataFrame
        Pandas dataframe.
    x_axis : str
        Column name from dataframe.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    target = data.iloc[:, -1]
    for c in target.unique().tolist():
        ecdf = ECDF(data.loc[target == c, x_axis])
        # x/y are passed by keyword; newer seaborn releases reject positional data.
        sns.scatterplot(x=ecdf.x, y=ecdf.y, ax=ax, **scatterargs)

    ax.set_title('Empirical CDF')
    ax.set_xlabel(x_axis)
    ax.set_ylabel('fraction of data')


def plot_prc(y_test, y_pred_proba, threshold, ax=None):
    """ Precision and recall vs. threshold curves.

    Parameters
    ----------
    y_test : array
        Classes from the test split.
    y_pred_proba : array
        Predicted probability. Column 1 is used as the positive-class score.
    threshold : float
        Decision threshold, marked with a vertical line.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba[:, 1])

    # Pad both axes by 0.02 so curves touching the limits stay visible. The lower
    # y-limit is negative: precision/recall can reach 0 and must not be clipped.
    ax.set_xlim(thresholds[0] - 0.02, thresholds[-1] + 0.02)
    ax.set_ylim(-0.02, 1.02)
    ax.set_title('PR curve for positive class')
    ax.set_xlabel('Threshold')
    ax.set_ylabel('Precision & Recall')
    # precision/recall hold one more point than thresholds; drop the trailing one.
    ax.plot(thresholds, recall[:-1], label='Recall')
    ax.plot(thresholds, precision[:-1], label='Precision')
    ax.axvline(threshold, c='black', linewidth=0.75, label='Threshold')
    ax.legend()


def plot_cm(y_test, y_pred_proba, threshold, ax=None):
    """ Confusion matrix.

    Parameters
    ----------
    y_test : array
        Classes from the test split.
    y_pred_proba : array
        Predicted probability. Column 1 is used as the positive-class score.
    threshold : float
        Decision threshold.
    ax : Axes
        Matplotlib subfigure axes. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()

    # Predict 1 where the positive-class probability is strictly above the
    # threshold and 0 everywhere else.
    y_new = np.where(y_pred_proba[:, 1] > threshold, 1.0, 0.0)

    cm = pd.DataFrame(confusion_matrix(y_test, y_new))
    cm.index.name = 'Actual values'
    cm.columns.name = 'Predicted values'

    ax.set_title('Confusion matrix')
    sns.heatmap(cm, cbar=False, cmap='Blues',
                annot=True, fmt='d', linecolor='none', linewidths=1, ax=ax)


def dump_idx(y_pred_proba, threshold, filename='candidates.csv'):
    """ Save indexes of examples predicted as positive.

    An example is positive when its column-1 probability is strictly
    greater than ``threshold``. The row indexes are written one per line.

    Parameters
    ----------
    y_pred_proba : array
        Predicted probability.
    threshold : float
        Decision threshold.
    filename : str
        Output file.
    """
    # Take the row positions straight from the boolean mask.
    candidates = np.where(y_pred_proba[:, 1] > threshold)[0]
    np.savetxt(filename, candidates, fmt='%d')

    print('Saved to \'%s\'.' % filename)