Source code for nzpyida.analytics.exploration.distribution

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
This module consists of algorithms used to describe the empirical distribution
of single attributes or the joint distribution of multiple—usually two—attributes.
"""

from typing import List
from nzpyida.frame import IdaDataFrame
from nzpyida.analytics.utils import call_proc_df_in_out, q



[docs]
def moments(in_df: IdaDataFrame, in_column: str, by_column: str=None,
    out_table: str=None) -> IdaDataFrame:
    """
    Calculate the moments of a numeric input column.

    Moments are quantities that describe certain aspects of the distribution
    of a continuous attribute; the central moments (moments around the mean)
    are of particular interest. The statistics calculated are the mean,
    variance, standard deviation, skewness and (excess) kurtosis, as well as
    the count of cases, the minimum and the maximum.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : str
        the numeric input table column

    by_column : str, optional
        the input table column which splits the data into groups for which
        the operation is to be performed

    out_table : str, optional
        the output table to write the moments into

    Returns
    -------
    IdaDataFrame
        the data frame with requested data
    """

    # Column names go through q() before being handed to the procedure.
    quoted_column = q(in_column)
    quoted_group = q(by_column)
    proc_params = {'incolumn': quoted_column, 'by': quoted_group}

    # Only the first element of the helper's result is returned.
    result = call_proc_df_in_out(
        proc='MOMENTS', in_df=in_df, params=proc_params, out_table=out_table)
    return result[0]



[docs]
def quantile(in_df: IdaDataFrame, in_column: str, quantiles: List[float],
    out_table: str=None) -> IdaDataFrame:
    """
    Calculate quantile limit(s) for a numeric column.

    Quantiles constitute a convenient and intuitive description of a
    continuous attribute distribution that allows observation of location,
    dispersion, and asymmetry. Quantiles of a continuous attribute are values
    from its range taken at regular intervals of its cumulative distribution.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : str
        the numeric input table column

    quantiles : List[float]
        the list of quantiles to be calculated.
        Quantiles are values between 0 and 1 indicating the percentage of
        sorted values to be considered in each quantile.

    out_table : str, optional
        the output table to write the quantiles into

    Returns
    -------
    IdaDataFrame
        the data frame with requested data
    """

    params = {
        'incolumn': q(in_column),
        # The quantile list is passed through unchanged; only the column
        # name goes through q().
        'quantiles': quantiles
    }
    # Only the first element of the helper's result is returned.
    return call_proc_df_in_out(proc='QUANTILE', in_df=in_df, params=params, out_table=out_table)[0]



[docs]
def outliers(in_df: IdaDataFrame, in_column: str, multiplier: float=1.5,
    out_table: str=None) -> IdaDataFrame:
    """
    Detect outliers of a numeric attribute (a column).

    Outliers are the values lying below the first quartile or above the third
    quartile by more than the inter-quartile range (IQR) multiplied by a
    coefficient. That coefficient controls the aggressiveness of outlier
    detection.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : str
        the numeric input table column

    multiplier : float, optional
        the value of the IQR multiplier

    out_table : str, optional
        the output table to write the outliers into

    Returns
    -------
    IdaDataFrame
        the data frame with requested data
    """

    quoted_column = q(in_column)
    proc_params = {'incolumn': quoted_column, 'multiplier': multiplier}

    # Only the first element of the helper's result is returned.
    result = call_proc_df_in_out(
        proc='OUTLIERS', in_df=in_df, params=proc_params, out_table=out_table)
    return result[0]



[docs]
def unitable(in_df: IdaDataFrame, in_column: str, out_table: str=None) -> IdaDataFrame:
    """
    Create a univariate frequency table for one column of the input table.

    A univariate frequency table describes the distribution of a discrete
    attribute by providing the occurrence count for each unique value.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : str
        the input table column to build the frequency table for

    out_table : str, optional
        the output table to write the frequency table into

    Returns
    -------
    IdaDataFrame
        the data frame with requested data
    """

    proc_params = {'incolumn': q(in_column)}

    # Only the first element of the helper's result is returned.
    result = call_proc_df_in_out(
        proc='UNITABLE', in_df=in_df, params=proc_params, out_table=out_table)
    return result[0]



[docs]
def bitable(in_df: IdaDataFrame, in_column: List[str], freq: bool=False, cum: bool=False,
    out_table: str=None) -> IdaDataFrame:
    """
    Create a bivariate frequency table for two columns of the input table.

    A bivariate frequency table describes the joint probability distribution
    of two discrete attributes by providing the occurrence count for each
    distinct combination of their values.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : List[str]
        the list of input table columns, must be two (if there are more,
        they will be ignored)

    freq : bool, optional
        flag indicating whether frequencies should be attached to the output
        table

    cum : bool, optional
        flag indicating whether cumulative frequencies should be attached to
        the output table (setting this flag automatically sets the freq flag,
        as frequencies have to be calculated prior to cumulative frequencies)

    out_table : str, optional
        the output table to write the frequency table into

    Returns
    -------
    IdaDataFrame
        the data frame with requested data
    """

    # The whole list of column names is handed to q() in a single call.
    quoted_columns = q(in_column)
    proc_params = {'incolumn': quoted_columns, 'freq': freq, 'cum': cum}

    # Only the first element of the helper's result is returned.
    result = call_proc_df_in_out(
        proc='BITABLE', in_df=in_df, params=proc_params, out_table=out_table)
    return result[0]



[docs]
def histogram(in_df: IdaDataFrame, in_column: str, nbreaks: int=None, right: bool=True,
    btable: str=None, bcolumn: str=None, density: bool=False, midpoints: bool=False,
    freq: bool=False, cum: bool=False, out_table: str=None) -> IdaDataFrame:
    """
    Create a histogram for a column of the input table.

    A histogram is the frequency table counterpart for continuous attributes.
    Although usually presented visually as a graph, it can be considered a
    table providing occurrence counts for a series of disjoint intervals
    covering the range of the attribute. The intervals can be of equal or
    unequal width. The number of intervals (bins) and their boundaries can be
    specified manually, to make the histogram most meaningful and readable,
    or calculated automatically from the distribution.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : str
        the input table column to build the histogram onto

    nbreaks : int, optional
        the number of bins for the histogram.
        If not specified, the number of bins is calculated automatically.

    right : bool, optional
        the flag indicating whether the histogram bins should be
        right-closed (true) or right-open (false)

    btable : str, optional
        the input table with breaks for the histogram.
        If not specified, the bins are calculated automatically, using the
        parameter <nbreaks> if specified.

    bcolumn : str, optional
        the <btable> column containing the breaks for the histogram.
        This column must be specified if the parameter <btable> is specified.

    density : bool, optional
        flag indicating whether densities should be attached to the output
        table

    midpoints : bool, optional
        flag indicating whether the midpoints of the bins should be attached
        to the output table

    freq : bool, optional
        flag indicating whether frequencies should be attached to the output
        table

    cum : bool, optional
        flag indicating whether cumulative frequencies should be attached to
        the output table (setting this flag automatically sets the freq flag,
        as frequencies have to be calculated prior to cumulative frequencies)

    out_table : str, optional
        the output table to write the histogram into

    Returns
    -------
    IdaDataFrame
        the data frame with requested data
    """

    # Only the two column names go through q(); the breaks table name and
    # the remaining options are forwarded exactly as given.
    quoted_column = q(in_column)
    quoted_breaks_column = q(bcolumn)

    proc_params = {
        'incolumn': quoted_column,
        'nbreaks': nbreaks,
        'right': right,
        'btable': btable,
        'bcolumn': quoted_breaks_column,
        'density': density,
        'midpoints': midpoints,
        'freq': freq,
        'cum': cum,
    }

    # Only the first element of the helper's result is returned.
    result = call_proc_df_in_out(
        proc='HIST', in_df=in_df, params=proc_params, out_table=out_table)
    return result[0]