Source code for nzpyida.analytics.predictive.classification

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
This module contains a class that is the base for all classification algorithms.
"""
from typing import Tuple
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from nzpyida.analytics.utils import map_to_props, materialize_df, make_temp_table_name, \
call_proc_df_in_out
from nzpyida.analytics.utils import get_auto_delete_context
from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling
from nzpyida.analytics.model_manager import ModelManager
from nzpyida.analytics.utils import q


class Classification(PredictiveModeling):
    """
    Base class for classification algorithms.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the classifier class.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name - if it exists in the database, it will be used, otherwise
            it must be trained using fit() function before prediction or scoring
            is called.
        """
        super().__init__(idadb, model_name)
        # Names of the columns in the prediction output, converted to the
        # database default identifier case.
        self.target_column_in_output = idadb.to_def_case('CLASS')
        self.id_column_in_output = idadb.to_def_case('ID')
        # NOTE(review): presumably consumed by the base class _score();
        # 'CERROR' looks like a classification error procedure and score_inv
        # like a flag to invert it into a score - confirm in PredictiveModeling.
        self.score_proc = 'CERROR'
        self.score_inv = True
        self.type = None

    def predict(self, in_df: IdaDataFrame, out_table: str=None,
        id_column: str=None) -> IdaDataFrame:
        """
        Makes predictions based on this model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for predictions

        out_table : str, optional
            the output table where the predictions will be stored

        id_column : str, optional
            the input table column identifying a unique instance id
            Default: id column used to build the model

        Returns
        -------
        IdaDataFrame
            the data frame containing the predictions
        """
        params = {
            'id': q(id_column)
        }
        return self._predict(in_df=in_df, params=params, out_table=out_table)

    def score(self, in_df: IdaDataFrame, target_column: str, id_column: str=None) -> float:
        """
        Scores the model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column representing the class

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        Returns
        -------
        float
            the model score
        """
        params = {
            'id': q(id_column)
        }
        return self._score(in_df=in_df, predict_params=params, target_column=target_column)

    def conf_matrix(self, in_df: IdaDataFrame, target_column: str, id_column: str=None,
        out_matrix_table: str=None) -> Tuple[IdaDataFrame, float, float]:
        """
        Makes a prediction for a test data set given by the user and returns
        a confusion matrix, together with other stats (ACC and WACC).

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column representing the class

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        out_matrix_table : str, optional
            the output table where the confusion matrix will be stored

        Returns
        -------
        IdaDataFrame
            the confusion matrix data frame

        float
            classification accuracy (ACC)

        float
            weighted classification accuracy (WACC)
        """
        params = {
            'id': q(id_column),
            'target': q(target_column)
        }
        return self._conf_matrix(in_df, out_matrix_table, params)

    def _conf_matrix(self, in_df: IdaDataFrame, out_matrix_table: str=None,
        params: dict=None) -> Tuple[IdaDataFrame, float, float]:
        """
        Runs a prediction on the given data frame and builds a confusion matrix
        out of the predicted and the true classes.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame with true classes

        out_matrix_table : str, optional
            the output table where the confusion matrix will be stored - if skipped,
            a temporary table name is generated and registered for auto deletion

        params : dict, optional
            prediction parameters; must contain 'target' and may contain 'id' -
            if 'id' is missing or empty, the input data frame indexer is used.
            The dictionary passed by the caller is not modified.

        Returns
        -------
        IdaDataFrame
            the confusion matrix data frame

        float
            classification accuracy (ACC)

        float
            weighted classification accuracy (WACC)

        Raises
        ------
        TypeError
            if in_df is not an IdaDataFrame or no id column can be determined
        """
        if not isinstance(in_df, IdaDataFrame):
            raise TypeError("Argument in_df should be an IdaDataFrame")

        # Work on a private copy: the id is filled in below and must neither
        # leak into the caller's dictionary nor into a shared default value.
        params = dict(params) if params else {}

        if not params.get('id'):
            if in_df.indexer:
                params['id'] = q(in_df.indexer)
            else:
                raise TypeError('Missing id column - either use id_column attribute or set '
                    'indexer column in the input data frame')

        # Temporary table for the predictions; always dropped in the finally block.
        out_table = make_temp_table_name()

        pred_view_needs_delete, true_view_needs_delete = False, False
        try:
            pred_df = self._predict(in_df=in_df, out_table=out_table, params=params)

            pred_view, pred_view_needs_delete = materialize_df(pred_df)
            true_view, true_view_needs_delete = materialize_df(in_df)

            auto_delete_context = None
            if not out_matrix_table:
                auto_delete_context = get_auto_delete_context('out_matrix_table')
                out_matrix_table = make_temp_table_name()

            params_s = map_to_props({
                'resulttable': pred_view,
                'intable': true_view,
                'resultid': self.idadb.to_def_case('ID'),
                'id': params['id'],
                'resulttarget': self.idadb.to_def_case('CLASS'),
                'target': params['target'],
                'matrixTable': out_matrix_table
            })

            self.idadb.ida_query(f'call NZA..CONFUSION_MATRIX(\'{params_s}\')')

            # Register the generated matrix table for clean up only after it
            # has been created by the procedure call above.
            if auto_delete_context:
                auto_delete_context.add_table_to_delete(out_matrix_table)

            out_df = IdaDataFrame(self.idadb, out_matrix_table)

            matrix_props = map_to_props({
                'matrixTable': out_matrix_table
            })
            res_acc = self.idadb.ida_query(f'call NZA..CMATRIX_ACC(\'{matrix_props}\')')
            res_wacc = self.idadb.ida_query(f'call NZA..CMATRIX_WACC(\'{matrix_props}\')')

            return out_df, res_acc[0], res_wacc[0]
        finally:
            self.idadb.drop_table(out_table)
            if pred_view_needs_delete:
                self.idadb.drop_view(pred_view)
            if true_view_needs_delete:
                self.idadb.drop_view(true_view)

    def cross_validation(self, in_df: IdaDataFrame, target_column: str, id_column: str=None,
        out_table: str=None, folds: int=10, rand_seed: float=None) -> Tuple[IdaDataFrame, float]:
        """
        Performs a cross validation on <in_df> data for the given model. The number
        of batches and the size of the train/test split is determined by
        the parameter <folds>.

        Any existing model with this model name is dropped before the cross
        validation procedure is called.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column representing the class

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        out_table : str, optional
            the output table where the predicted values will be stored

        folds : int, optional
            the number of folds passed to the cross validation procedure
            Default: 10

        rand_seed : int, optional
            the seed passed to the cross validation procedure - it is forwarded
            only when it is an int, any other value is ignored

        Returns
        -------
        IdaDataFrame
            the data frame with predicted values for all <in_df>

        float
            classification accuracy (ACC) for all batches

        Raises
        ------
        TypeError
            if no id column is given and the input data frame has no indexer
        """
        params = {
            'modelType': self.fit_proc,
            'model': self.model_name,
            'intable': in_df,
            'id': q(id_column),
            'target': q(target_column),
            'outtable': out_table,
            'folds': folds,
        }

        if not params.get('id'):
            if in_df.indexer:
                params['id'] = q(in_df.indexer)
            else:
                raise TypeError('Missing id column - either use id_column attribute or set '
                    'indexer column in the input data frame')

        if isinstance(rand_seed, int):
            params['seed'] = rand_seed

        ModelManager(self.idadb).drop_model(self.model_name)

        ret_df, ret_acc = call_proc_df_in_out(proc="CROSS_VALIDATION", in_df=in_df,
            params=params, out_table=out_table)
        return ret_df, ret_acc[0]