#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
# -----------------------------------------------------------------------------
"""
This module contains function that can be used to prepare an input data
frame for machine learning.
"""
from typing import List, Tuple
from nzpyida.frame import IdaDataFrame
from nzpyida.analytics.utils import materialize_df, make_temp_table_name, \
get_auto_delete_context, call_proc_df_in_out, map_to_props, q
[docs]
def std_norm(in_df: IdaDataFrame, in_column: List[str], id_column: str = None,
by_column: str = None, out_table: str = None) -> IdaDataFrame:
"""
Standardization and normalization transformations use the original continuous
attribute a to generate a new continuous attribute a ' that has a different range
or distribution than the original attribute. Common transformations modify the
range to fit the [-1,1 ] interval (normalization) or modify the distribution to
have a mean of 0 and a standard deviation of 1 (standardization).
This function normalize and stardardize columns of the input data frame and returns
that in a new data frame.
Parameters
----------
in_df : IdaDataFrame
the input data frame
in_column : List[str]
the list of input table columns to consider.
Each column name may be followed by :L to leave it unchanged, by :S to standardize
its values, by :N to normalize its values or by :U to make it of unit length.
Additionally, two columns may be indicated, separated by a slash (/), followed
by :C to make the columns be a row unit vector or by :V to divide the column
values by the length of the longest row vector.
id_column : str, optional
the input table column identifying a unique instance id - if skipped,
the input data frame indexer must be set and will be used as an instance id
by_column : str, optional
the input table column which splits the data into groups for which the operation
is to be performed
out_table : str, optional
the output table with the modified data
Returns
-------
IdaDataFrame
the data frame with requested transformations
"""
if not id_column:
if in_df.indexer:
id_column = q(in_df.indexer)
else:
raise TypeError('Missing id column - either use id_column attribute or set '
'indexer column in the input data frame')
params = {
'id': q(id_column),
'incolumn': q(in_column),
'by': q(by_column)
}
return call_proc_df_in_out(proc='STD_NORM', in_df=in_df, params=params,
out_table=out_table, copy_indexer=True)[0]
[docs]
def impute_data(in_df: IdaDataFrame, in_column: str = None, method: str = None,
numeric_value: float = -1, nominal_value: str = 'missing', out_table: str = None) -> IdaDataFrame:
"""
Many analytic algorithms require that the data set has no missing attribute values.
However, real-world data sets frequently suffer from missing attribute values.
Missing value imputation provides usable attribute values in place of the missing values,
allowing the algorithms to run.
This function replaces missing values in the input data frame and returns the result
in a new data frame.
Parameters
----------
in_df : IdaDataFrame
the input data frame
in_column : str, optional
the input table column where missing values have to be replaced.
If not specified, all input data columns are considered.
method : str, optional
the data imputation method. Allowed values are: mean, median, freq (most frequent value),
replace. If not specified, the method is median for the numeric columns and freq for
the nominal columns. The methods mean and median cannot be used with nominal columns.
numeric_value : float, optional
the numeric replacement value when method=replace
nominal_value : str, optional
the nominal replacement value when method=replace
out_table : str, optional
the output table with the modified data
Returns
-------
IdaDataFrame
the data frame with requested transformations
"""
params = {
'incolumn': q(in_column),
'method': method,
'numericvalue': numeric_value,
'nominalvalue': nominal_value
}
return call_proc_df_in_out(proc='IMPUTE_DATA', in_df=in_df, params=params,
out_table=out_table, copy_indexer=True)[0]
[docs]
def random_sample(in_df: IdaDataFrame, size: int = None, fraction: float = None, by_column: str = None,
out_signature: str = None, rand_seed: int = None, out_table: str = None) -> IdaDataFrame:
"""
Random sampling procedures are a vital component of many analytical systems. They can
be used to select a test sample and a training sample for a model building process
(machine learning). They can also be used to get a smaller sample of the training
set, which you may do because of learning algorithm complexity considerations.
In both cases, you would sample without replacement.
Another application of sampling is the learning methods based on bootstrapping.
This requires many independent samples from the same data, which are preferentially
applied if the available data sets are small or for other reasons where the sample
independence is vital. Samples with replacement are usually drawn in this case.
In application, sampling is used for promotion campaigns, for example when you want
only a representative set of customers to be subjects of an action.
In all cases, whether for use with scientific methods or business practices, uniform
sampling is important.
This function creates a random sample of a data frame a fixed size or a fixed
probability and returns the result in a new data frame.
Parameters
----------
in_df : IdaDataFrame
the input data frame
size : int, optional
the number of rows in the sample (alias of size). If specified,
the parameter <fraction> must not be specified. Only one of both parameters <num>
and <size> must be specified.
fraction : float, optional
the probability of each row to be in the sample. If specified, the parameters <num>
and <size> must not be specified. Otherwise, one of both parameters <num> or <size>
must be specified.
by_column : str, optional
the column used to stratify the input table. If indicated, stratified sampling is
done: it ensures that each value of the column is represented in the sample in
about the same percentage as in the original input table.
out_signature : str, optional
the input table columns to keep in the sample, separated by a semi-colon (;).
If not specified, all columns are kept in the output table.
rand_seed : int, optional
the seed of the random function
out_table : str, optional
the output table with the modified data
Returns
-------
IdaDataFrame
the data frame with requested transformations
"""
params = {
'size': size,
'fraction': fraction,
'by': q(by_column),
'outsignature': out_signature,
'randseed': rand_seed
}
return call_proc_df_in_out(proc='RANDOM_SAMPLE', in_df=in_df, params=params,
out_table=out_table, copy_indexer=True)[0]
[docs]
def train_test_split(in_df: IdaDataFrame, out_table_train: str=None, out_table_test: str=None,
id_column: str = None, fraction: float = 0.5,
rand_seed: float = None) -> Tuple[IdaDataFrame, IdaDataFrame]:
"""
Parameters
----------
in_df : IdaDataFrame
the input data frame
out_table_train : str, optional
the name of output dataframe that will contain the given fraction of the input records
out_table_test : str, optional
the name of output dataframe that will contain the rest (1-<fraction>) of the input records
id_column: str, optional
the input dataframe column identifying a unique instance id
fraction : float, optional
the fraction of the data to that goes to the training dataframe
rand_seed : int, optional
the seed of the random function
Returns
-------
IdaDataFrame
the data frame with train data
IdaDataFrame
the data frame with test data
"""
if not isinstance(in_df, IdaDataFrame):
raise TypeError("Argument in_df should be an IdaDataFrame")
if not id_column:
if not in_df.indexer:
raise ValueError("If dataframe has no indexer 'id_column' has to be provided")
else:
id_column = q(in_df.indexer)
if out_table_train and in_df._idadb.exists_table_or_view(out_table_train):
in_df._idadb.drop_table(out_table_train)
if out_table_test and in_df._idadb.exists_table_or_view(out_table_test):
in_df._idadb.drop_table(out_table_test)
temp_view_name, need_delete = materialize_df(in_df)
auto_delete_context = None
if not out_table_train:
auto_delete_context = get_auto_delete_context('out_table_train')
out_table_train = make_temp_table_name()
if not out_table_test:
auto_delete_context = get_auto_delete_context('out_table_test')
out_table_test = make_temp_table_name()
params = {
'intable': temp_view_name,
'traintable': out_table_train,
'testtable': out_table_test,
'id': q(id_column),
'fraction': fraction
}
if isinstance(rand_seed, float):
params['seed'] = rand_seed
params_s = map_to_props(params)
try:
in_df.ida_query(f'call NZA..SPLIT_DATA(\'{params_s}\')')
finally:
if need_delete:
in_df._idadb.drop_view(temp_view_name)
if not in_df._idadb.exists_table_or_view(out_table_train) or \
not in_df._idadb.exists_table_or_view(out_table_test):
# stored procedure call was successful by did not produce a table
return None, None
if auto_delete_context:
auto_delete_context.add_table_to_delete(out_table_train)
auto_delete_context.add_table_to_delete(out_table_test)
out_df_train = IdaDataFrame(in_df._idadb, out_table_train)
out_df_test = IdaDataFrame(in_df._idadb, out_table_test)
if in_df.indexer:
out_df_train.indexer = in_df.indexer
out_df_test.indexer = in_df.indexer
return out_df_train, out_df_test