Source code for nzpyida.analytics.transform.preparation

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
# -----------------------------------------------------------------------------
"""
This module contains functions that can be used to prepare an input data
frame for machine learning.
"""
from typing import List, Tuple
from nzpyida.frame import IdaDataFrame
from nzpyida.analytics.utils import materialize_df, make_temp_table_name, \
    get_auto_delete_context, call_proc_df_in_out, map_to_props, q



[docs]
def std_norm(in_df: IdaDataFrame, in_column: List[str], id_column: str = None,
             by_column: str = None, out_table: str = None) -> IdaDataFrame:
    """
    Standardize and/or normalize columns of a data frame.

    Standardization and normalization transformations use the original continuous
    attribute a to generate a new continuous attribute a' that has a different range
    or distribution than the original attribute. Common transformations modify the
    range to fit the [-1, 1] interval (normalization) or modify the distribution to
    have a mean of 0 and a standard deviation of 1 (standardization).

    This function runs the STD_NORM procedure on the input data frame and returns
    the transformed data in a new data frame.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : List[str]
        the list of input table columns to consider.
        Each column name may be followed by :L to leave it unchanged, by :S to standardize
        its values, by :N to normalize its values or by :U to make it of unit length.
        Additionally, two columns may be indicated, separated by a slash (/), followed
        by :C to make the columns be a row unit vector or by :V to divide the column
        values by the length of the longest row vector.

    id_column : str, optional
        the input table column identifying a unique instance id - if skipped,
        the input data frame indexer must be set and will be used as an instance id

    by_column : str, optional
        the input table column which splits the data into groups for which the operation
        is to be performed

    out_table : str, optional
        the output table with the modified data

    Returns
    -------
    IdaDataFrame
        the data frame with requested transformations

    Raises
    ------
    TypeError
        if id_column is not given and the input data frame has no indexer
    """
    if not id_column:
        # No explicit id: the data frame indexer is the only fallback.
        if not in_df.indexer:
            raise TypeError('Missing id column - either use id_column attribute or set '
                            'indexer column in the input data frame')
        id_column = q(in_df.indexer)

    proc_args = dict(id=q(id_column), incolumn=q(in_column), by=q(by_column))
    outputs = call_proc_df_in_out(proc='STD_NORM', in_df=in_df, params=proc_args,
                                  out_table=out_table, copy_indexer=True)
    # Only the first element of the helper's result is handed back to the caller.
    return outputs[0]




[docs]
def impute_data(in_df: IdaDataFrame, in_column: str = None, method: str = None,
                numeric_value: float = -1, nominal_value: str = 'missing', out_table: str = None) -> IdaDataFrame:
    """
    Replace missing values in a data frame.

    Many analytic algorithms require that the data set has no missing attribute values,
    yet real-world data sets frequently contain them. Missing value imputation puts
    usable attribute values in place of the missing ones so the algorithms can run.

    This function runs the IMPUTE_DATA procedure on the input data frame and returns
    the result in a new data frame.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    in_column : str, optional
        the input table column where missing values have to be replaced.
        If not specified, all input data columns are considered.

    method : str, optional
        the data imputation method. Allowed values are: mean, median, freq (most frequent
        value), replace. If not specified, the method is median for the numeric columns
        and freq for the nominal columns. The methods mean and median cannot be used
        with nominal columns.

    numeric_value : float, optional
        the numeric replacement value when method=replace

    nominal_value : str, optional
        the nominal replacement value when method=replace

    out_table : str, optional
        the output table with the modified data

    Returns
    -------
    IdaDataFrame
        the data frame with requested transformations
    """
    proc_args = dict(
        incolumn=q(in_column),
        method=method,
        numericvalue=numeric_value,
        nominalvalue=nominal_value,
    )
    outputs = call_proc_df_in_out(proc='IMPUTE_DATA', in_df=in_df, params=proc_args,
                                  out_table=out_table, copy_indexer=True)
    # Only the first element of the helper's result is handed back to the caller.
    return outputs[0]




[docs]
def random_sample(in_df: IdaDataFrame, size: int = None, fraction: float = None, by_column: str = None,
                  out_signature: str = None, rand_seed: int = None, out_table: str = None) -> IdaDataFrame:
    """
    Draw a random sample from a data frame.

    Random sampling procedures are a vital component of many analytical systems. They can
    be used to select a test sample and a training sample for a model building process
    (machine learning). They can also be used to get a smaller sample of the training
    set, for example because of learning algorithm complexity considerations.
    In both cases, you would sample without replacement.

    Another application of sampling is the learning methods based on bootstrapping.
    This requires many independent samples from the same data, which are preferentially
    applied if the available data sets are small or for other reasons where the sample
    independence is vital. Samples with replacement are usually drawn in this case.

    In application, sampling is used for promotion campaigns, for example when you want
    only a representative set of customers to be subjects of an action.
    In all cases, whether for use with scientific methods or business practices, uniform
    sampling is important.

    This function runs the RANDOM_SAMPLE procedure to create a random sample of a data
    frame, of a fixed size or with a fixed probability, and returns the result in a
    new data frame.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    size : int, optional
        the number of rows in the sample. If specified, the parameter <fraction>
        must not be specified.

    fraction : float, optional
        the probability of each row to be in the sample. If specified, the parameter
        <size> must not be specified. Otherwise, <size> must be specified.

    by_column : str, optional
        the column used to stratify the input table. If indicated, stratified sampling is
        done: it ensures that each value of the column is represented in the sample in
        about the same percentage as in the original input table.

    out_signature : str, optional
        the input table columns to keep in the sample, separated by a semi-colon (;).
        If not specified, all columns are kept in the output table.

    rand_seed : int, optional
        the seed of the random function

    out_table : str, optional
        the output table with the modified data

    Returns
    -------
    IdaDataFrame
        the data frame with requested transformations
    """
    proc_args = dict(
        size=size,
        fraction=fraction,
        by=q(by_column),
        outsignature=out_signature,
        randseed=rand_seed,
    )
    outputs = call_proc_df_in_out(proc='RANDOM_SAMPLE', in_df=in_df, params=proc_args,
                                  out_table=out_table, copy_indexer=True)
    # Only the first element of the helper's result is handed back to the caller.
    return outputs[0]




[docs]
def train_test_split(in_df: IdaDataFrame, out_table_train: str=None, out_table_test: str=None,
                     id_column: str = None, fraction: float = 0.5,
                     rand_seed: float = None) -> Tuple[IdaDataFrame, IdaDataFrame]:
    """
    Split a data frame into a train and a test data frame.

    The split is performed by the NZA..SPLIT_DATA stored procedure. Any existing
    table or view whose name matches a caller-supplied output table name is dropped
    before the procedure is called. When an output table name is omitted, a temporary
    table name is generated and that table is registered with the auto-delete context.

    Parameters
    ----------
    in_df : IdaDataFrame
        the input data frame

    out_table_train : str, optional
        the name of output dataframe that will contain the given fraction of the input records

    out_table_test : str, optional
        the name of output dataframe that will contain the rest (1-<fraction>) of the input records

    id_column: str, optional
        the input dataframe column identifying a unique instance id - if skipped,
        the input data frame indexer must be set and will be used as an instance id

    fraction : float, optional
        the fraction of the data that goes to the training dataframe

    rand_seed : int or float, optional
        the seed of the random function. Values of any other type are ignored.

    Returns
    -------
    IdaDataFrame
        the data frame with train data

    IdaDataFrame
        the data frame with test data

    Both elements are None when the procedure call succeeded but at least one of
    the output tables does not exist afterwards.

    Raises
    ------
    TypeError
        if in_df is not an IdaDataFrame

    ValueError
        if id_column is not given and the input data frame has no indexer
    """
    if not isinstance(in_df, IdaDataFrame):
        raise TypeError("Argument in_df should be an IdaDataFrame")

    if not id_column:
        if not in_df.indexer:
            raise ValueError("If dataframe has no indexer 'id_column' has to be provided")
        id_column = q(in_df.indexer)

    idadb = in_df._idadb

    # Remember which names came from the caller: only those may be dropped up
    # front, and they must never be registered for automatic deletion.
    caller_named_tables = [name for name in (out_table_train, out_table_test) if name]

    # Resolve temporary output names before touching the database, so a failure
    # here cannot drop a caller's table or leave a temporary view behind.
    # NOTE(review): get_auto_delete_context presumably fails when no context is
    # active - confirm against nzpyida.analytics.utils.
    temp_tables = []
    auto_delete_context = None
    if not out_table_train:
        auto_delete_context = get_auto_delete_context('out_table_train')
        out_table_train = make_temp_table_name()
        temp_tables.append(out_table_train)

    if not out_table_test:
        auto_delete_context = get_auto_delete_context('out_table_test')
        out_table_test = make_temp_table_name()
        temp_tables.append(out_table_test)

    for table_name in caller_named_tables:
        if idadb.exists_table_or_view(table_name):
            idadb.drop_table(table_name)

    temp_view_name, need_delete = materialize_df(in_df)

    params = {
        'intable': temp_view_name,
        'traintable': out_table_train,
        'testtable': out_table_test,
        'id': q(id_column),
        'fraction': fraction
    }

    # The seed is documented as an integer, so integers must be honoured as well
    # as floats. bool is an int subclass but not a meaningful seed, so it stays
    # ignored just like any other non-numeric value.
    if isinstance(rand_seed, (int, float)) and not isinstance(rand_seed, bool):
        params['seed'] = rand_seed

    params_s = map_to_props(params)

    try:
        in_df.ida_query(f'call NZA..SPLIT_DATA(\'{params_s}\')')
    finally:
        # The materialized view is only needed for the duration of the call.
        if need_delete:
            idadb.drop_view(temp_view_name)

    if not (idadb.exists_table_or_view(out_table_train)
            and idadb.exists_table_or_view(out_table_test)):
        # stored procedure call was successful but did not produce a table
        return None, None

    if auto_delete_context:
        # Only generated temporary tables are handed to the auto-delete context;
        # a table the caller named explicitly must outlive it.
        for table_name in temp_tables:
            auto_delete_context.add_table_to_delete(table_name)

    out_df_train = IdaDataFrame(idadb, out_table_train)
    out_df_test = IdaDataFrame(idadb, out_table_test)

    if in_df.indexer:
        out_df_train.indexer = in_df.indexer
        out_df_test.indexer = in_df.indexer
    return out_df_train, out_df_test