Source code for nzpyida.analytics.predictive.classification

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
This module contains a class that is the base for all classification algorithms.
"""
from typing import Tuple
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from nzpyida.analytics.utils import map_to_props, materialize_df, make_temp_table_name, \
call_proc_df_in_out
from nzpyida.analytics.utils import get_auto_delete_context
from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling
from nzpyida.analytics.model_manager import ModelManager
from nzpyida.analytics.utils import q



[docs]
class Classification(PredictiveModeling):
    """
    Base class for classification algorithms.

    Builds on PredictiveModeling and exposes the operations shared by
    classifiers: predict(), score(), conf_matrix() and cross_validation().
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Sets up the state shared by all classifiers.

        Parameters
        ----------

        idadb : IdaDataBase
            database connector

        model_name : str
            model name - if it exists in the database, it will be used, otherwise
            it must be trained using fit() function before prediction or scoring is called.
        """

        super().__init__(idadb, model_name)

        # Names of the class and id columns in the prediction output, passed
        # through the connector's to_def_case() helper.
        to_def_case = idadb.to_def_case
        self.target_column_in_output = to_def_case('CLASS')
        self.id_column_in_output = to_def_case('ID')

        # Scoring settings (presumably consumed by the base class when
        # scoring - TODO confirm against PredictiveModeling).
        self.score_proc, self.score_inv = 'CERROR', True
        self.type = None


[docs]
    def predict(self, in_df: IdaDataFrame, out_table: str=None,
        id_column: str=None) -> IdaDataFrame:
        """
        Runs this model on the given data and returns the predictions.
        The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for predictions

        out_table : str, optional
            the output table where the predictions will be stored

        id_column : str, optional
            the input table column identifying a unique instance id
            Default: id column used to build the model

        Returns
        -------
        IdaDataFrame
            the result of the underlying _predict() call
        """

        # The id column name goes through q() before being handed over
        # (presumably identifier quoting - see nzpyida.analytics.utils).
        quoted_id = q(id_column)
        return self._predict(in_df=in_df, params=dict(id=quoted_id), out_table=out_table)



[docs]
    def score(self, in_df: IdaDataFrame, target_column: str, id_column: str=None) -> float:
        """
        Computes the score of this model on the given data. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column representing the class

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        Returns
        -------
        float
            the model score
        """

        # Only the id column is part of the prediction parameters; the target
        # column is forwarded separately to _score().
        quoted_id = q(id_column)
        return self._score(in_df=in_df, predict_params=dict(id=quoted_id),
            target_column=target_column)



[docs]
    def conf_matrix(self, in_df: IdaDataFrame, target_column: str, id_column: str=None,
        out_matrix_table: str=None) -> Tuple[IdaDataFrame, float, float]:
        """
        Makes a prediction for a test data set given by the user and returns a
        confusion matrix, together with other stats (ACC and WACC).

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column representing the class

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        out_matrix_table : str, optional
            the output table where the confusion matrix will be stored

        Returns
        -------
        IdaDataFrame
            the confusion matrix data frame

        float
            classification accuracy (ACC)

        float
            weighted classification accuracy (WACC)
        """

        # Both column names go through q() before being passed on; the actual
        # work is done by _conf_matrix().
        quoted_id, quoted_target = q(id_column), q(target_column)
        matrix_params = dict(id=quoted_id, target=quoted_target)
        return self._conf_matrix(in_df, out_matrix_table, matrix_params)

        
    def _conf_matrix(self, in_df: IdaDataFrame, out_matrix_table: str=None,
                     params: dict=None) -> Tuple[IdaDataFrame, float, float]:
        """
        Builds a confusion matrix for this model on the given data.

        Runs a prediction into a temporary table, calls NZA..CONFUSION_MATRIX
        on the predictions and the input data, then calls NZA..CMATRIX_ACC and
        NZA..CMATRIX_WACC on the resulting matrix table.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame holding the true classes

        out_matrix_table : str, optional
            the output table where the confusion matrix will be stored - if skipped,
            a temporary table name is generated and registered with the
            auto-delete context

        params : dict, optional
            parameters with the keys 'id' and 'target' (column names as passed
            to the stored procedures); if 'id' is missing or empty, the input
            data frame indexer is used. The dictionary is not modified.

        Returns
        -------
        IdaDataFrame
            the confusion matrix data frame

        float
            first element of the NZA..CMATRIX_ACC result

        float
            first element of the NZA..CMATRIX_WACC result

        Raises
        ------
        TypeError
            if in_df is not an IdaDataFrame or no id column can be determined

        KeyError
            if params has no 'target' entry
        """

        if not isinstance(in_df, IdaDataFrame):
            raise TypeError("Argument in_df should be an IdaDataFrame")

        # Work on a private copy: a shared default dict or the caller's dict
        # must not pick up the 'id' entry filled in below.
        params = dict(params) if params else {}

        if not params.get('id'):
            if in_df.indexer:
                params['id'] = q(in_df.indexer)
            else:
                raise TypeError('Missing id column - either use id_column attribute or set '
                    'indexer column in the input data frame')

        # Look the target up before any database work so that a missing entry
        # fails (with the same KeyError as before) without running a prediction.
        target = params['target']

        out_table = make_temp_table_name()

        pred_view_needs_delete, true_view_needs_delete = False, False
        try:
            pred_df = self._predict(in_df=in_df, out_table=out_table, params=params)
            pred_view, pred_view_needs_delete = materialize_df(pred_df)
            true_view, true_view_needs_delete = materialize_df(in_df)

            auto_delete_context = None
            if not out_matrix_table:
                auto_delete_context = get_auto_delete_context('out_matrix_table')
                out_matrix_table = make_temp_table_name()

            params_s = map_to_props({
                'resulttable': pred_view,
                'intable': true_view,
                'resultid': self.idadb.to_def_case('ID'),
                'id': params['id'],
                'resulttarget': self.idadb.to_def_case('CLASS'),
                'target': target,
                'matrixTable': out_matrix_table
            })
            self.idadb.ida_query(f'call NZA..CONFUSION_MATRIX(\'{params_s}\')')

            # Register the generated matrix table only once it has been produced.
            if auto_delete_context:
                auto_delete_context.add_table_to_delete(out_matrix_table)

            out_df = IdaDataFrame(self.idadb, out_matrix_table)

            matrix_props = map_to_props({
                'matrixTable': out_matrix_table
            })

            res_acc = self.idadb.ida_query(f'call NZA..CMATRIX_ACC(\'{matrix_props}\')')
            res_wacc = self.idadb.ida_query(f'call NZA..CMATRIX_WACC(\'{matrix_props}\')')

            return out_df, res_acc[0], res_wacc[0]

        finally:
            # Always remove the temporary prediction table and any views that
            # materialize_df() reported as needing deletion.
            self.idadb.drop_table(out_table)
            if pred_view_needs_delete:
                self.idadb.drop_view(pred_view)
            if true_view_needs_delete:
                self.idadb.drop_view(true_view)
    

[docs]
    def cross_validation(self, in_df: IdaDataFrame, target_column: str,  
                         id_column: str=None, out_table: str=None, folds: int=10, 
                         rand_seed: float=None) -> Tuple[IdaDataFrame, float]:
        """
        Performs a cross validation on <in_df> data for given model. Number of batches
        and size of train/test split is determined by parameter <folds>.

        Note: the model named <model_name> is dropped through ModelManager
        before the CROSS_VALIDATION procedure is called.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column representing the class

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        out_table : str, optional
            the output table where the predicted values will be stored

        folds : int, optional
            the number of folds passed to the procedure
            Default: 10

        rand_seed : float, optional
            the random seed passed to the procedure as 'seed'; it is only
            passed when a number (int or float) is given

        Returns
        -------
        IdaDataFrame
            the data frame with predicted values for all <in_df>

        float
            classification accuracy (ACC) for all batches

        Raises
        ------
        TypeError
            if in_df is not an IdaDataFrame or no id column can be determined
        """
        # Same input validation as _conf_matrix(): fail with a clear message
        # instead of an AttributeError on in_df.indexer further down.
        if not isinstance(in_df, IdaDataFrame):
            raise TypeError("Argument in_df should be an IdaDataFrame")

        params = {
            'modelType': self.fit_proc,
            'model': self.model_name,
            'intable': in_df,
            'id': q(id_column),
            'target': q(target_column),
            'outtable': out_table,
            'folds': folds,
        }

        if not params.get('id'):
            if in_df.indexer:
                params['id'] = q(in_df.indexer)
            else:
                raise TypeError('Missing id column - either use id_column attribute or set '
                    'indexer column in the input data frame')

        # rand_seed is declared as a float, so float seeds must be honoured
        # too rather than silently dropped.
        if isinstance(rand_seed, (int, float)):
            params['seed'] = rand_seed

        # Remove the model stored under this name before running the procedure.
        ModelManager(self.idadb).drop_model(self.model_name)

        ret_df, ret_acc = call_proc_df_in_out(proc="CROSS_VALIDATION", in_df=in_df, params=params,
                                   out_table=out_table)

        return ret_df, ret_acc[0]