Source code for nzpyida.analytics.predictive.regression

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
This module contains a class that is the base for all regression algorithms.
"""
from typing import Dict
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from nzpyida.analytics.utils import map_to_props, materialize_df, make_temp_table_name, q
from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling



[docs]
class Regression(PredictiveModeling):
    """
    Base class for regression algorithms.

    Adds the regression-specific public API (``predict``, ``score`` and
    ``score_all``) on top of ``PredictiveModeling``. ``predict`` and ``score``
    delegate to the ``_predict`` / ``_score`` helpers that this class expects
    to find on itself (not defined here).
    """

    # Names of the NZA stored procedures invoked by score_all(); they double
    # as the keys of the dictionary it returns.
    _SCORE_MEASURES = ('MSE', 'MAE', 'RSE', 'RAE')

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor class.

        Parameters
        ----------

        idadb : IdaDataBase
            database connector

        model_name : str
            model name - if it exists in the database, it will be used, otherwise
            it must be trained using fit() function before prediction or scoring is called.
        """

        super().__init__(idadb, model_name)
        # Measure name stored for scoring; its consumer is not visible in this
        # class - presumably score() / _score() - confirm in the base class.
        self.score_proc = 'MSE'
        # 'ID' converted by the connector's to_def_case(); score_all() uses it
        # as the id column of the prediction table.
        self.id_column_in_output = idadb.to_def_case('ID')

    def predict(self, in_df: IdaDataFrame, out_table: str=None,
        id_column: str=None) -> IdaDataFrame:
        """
        Makes predictions based on this model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame to predict

        out_table : str, optional
            the output table where the predictions will be stored

        id_column : str, optional
            the input table column identifying a unique instance id
            Default: id column used to build the model

        Returns
        -------
        IdaDataFrame
            the data frame containing row identifiers and predicted target values
        """

        # NOTE(review): q() receives None when id_column is omitted -
        # presumably it passes None through unchanged; confirm in utils.q.
        params = {
            'id': q(id_column)
        }

        return self._predict(in_df=in_df, params=params, out_table=out_table)

    def score(self, in_df: IdaDataFrame, target_column: str,
        id_column: str=None) -> float:
        """
        Scores the model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column holding the true target values

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        Returns
        -------
        float
            the model score
        """

        params = {
            'id': q(id_column)
        }

        return self._score(in_df=in_df, predict_params=params, target_column=target_column)

    def score_all(self, in_df: IdaDataFrame, target_column: str,
        id_column: str=None) -> Dict[str, float]:
        """
        Scores the model using MSE, MAE, RSE and RAE. The model must exist.

        Predictions are written to a temporary table, compared against the
        input data by the NZA procedures, and every temporary object created
        here is removed again before returning or raising.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column holding the true target values

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        Returns
        -------
        dict
            the model scores in a dictionary with MSE, MAE, RSE and RAE as keys

        Raises
        ------
        TypeError
            if in_df is not an IdaDataFrame, or if no id column is given and
            the input data frame has no indexer
        """
        if not isinstance(in_df, IdaDataFrame):
            raise TypeError("Argument in_df should be an IdaDataFrame")

        if not id_column:
            if in_df.indexer:
                id_column = q(in_df.indexer)
            else:
                raise TypeError('Missing id column - either use id_column attribute or set '
                    'indexer column in the input data frame')

        out_table = make_temp_table_name()

        # The flags start as False so the cleanup below never touches a view
        # name that was not assigned because an earlier step failed.
        pred_view_needs_delete, true_view_needs_delete = False, False
        try:
            pred_df = self.predict(in_df=in_df, out_table=out_table, id_column=id_column)

            pred_view, pred_view_needs_delete = materialize_df(pred_df)
            true_view, true_view_needs_delete = materialize_df(in_df)

            # NOTE(review): 'pred_column' falls back to the *unquoted*
            # target_column while every other identifier goes through q().
            # Kept as is because the naming of the prediction output column
            # cannot be confirmed from this file - verify against NZA output.
            params = map_to_props({
                'pred_table': pred_view,
                'true_table': true_view,
                'pred_id': q(id_column) if self.id_column_in_output is None
                    else q(self.id_column_in_output),
                'true_id': q(id_column),
                'pred_column': target_column if self.target_column_in_output is None
                    else q(self.target_column_in_output),
                'true_column': q(target_column)
            })

            # One stored-procedure call per measure, all with the same
            # parameter string; the first element of each result is the score.
            return {
                measure: pred_df.ida_query(f"call NZA..{measure}('{params}')")[0]
                for measure in self._SCORE_MEASURES
            }
        finally:
            # Nested try/finally so that every cleanup step is attempted even
            # when an earlier one raises; otherwise a failing drop_table would
            # leave the materialized views behind.
            try:
                self.idadb.drop_table(out_table)
            finally:
                try:
                    if pred_view_needs_delete:
                        self.idadb.drop_view(pred_view)
                finally:
                    if true_view_needs_delete:
                        self.idadb.drop_view(true_view)