Source code for nzpyida.analytics.predictive.glm

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

from nzpyida.analytics.utils import call_proc_df_in_out, make_temp_table_name, out_str_to_df
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from typing import List
import pandas as pd
from nzpyida.analytics.predictive.regression import Regression
from nzpyida.analytics.utils import q



[docs]
class GLM(Regression):
    """ 
    Generalized Linear Model (GLM) regression
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates GLM object

        Parameters
        ----------

        idadb : IdaDataBase
            database connector

        model_name : str
            model name - if it exists in the database, it will be used, otherwise
            it must be trained using fit() function before prediction or scoring is called.
        """
        super().__init__(idadb, model_name)
        # Names of the stored procedures used for training and prediction.
        self.fit_proc = "GLM"
        self.predict_proc = "PREDICT_GLM"
        self.has_print_proc = True
        # Prediction column name, converted to the database's default case.
        self.target_column_in_output = idadb.to_def_case('PRED')
        self.id_column_in_output = None
        # fit() reads self.family, which the concrete regressor classes assign
        # after calling this constructor. Give it a default so that fit() on a
        # plain GLM does not fail with AttributeError; any value that is already
        # present (e.g. a class-level attribute) is kept.
        # NOTE(review): None is presumed to be treated like the other
        # None-valued parameters that fit() forwards (i.e. left to the stored
        # procedure's default) - confirm against Regression._fit.
        self.family = getattr(self, 'family', None)

    

[docs]
    def fit(self, in_df: IdaDataFrame, target_column: str, id_column: str=None, in_columns: List[str]=None,  
            intercept: bool=True, interaction: str='', family_param: float=-1, link: str='logit', 
            link_param: float=1, max_iter: int=20, epsilon: float=1e-3, tolerance: float=1e-7, 
            method: str='irls', trials: str='', debug: bool=False, col_def_type: str=None, 
            col_def_role: str=None, col_properties_table: str=None):
        """
        Trains the model on the given data frame. The error distribution is read
        from self.family (set by the concrete regressor classes).

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        target_column : str
            the input data frame column to predict a value for. Only numeric target
            columns are accepted

        id_column : str, optional
            the input data frame column identifying a unique instance id

        in_columns : List[str], optional
            the input data frame columns with special properties. Each column is
            followed by one or several of the following properties:
            - its type: ':nom' (for nominal), ':cont' (for continuous). By default, all
              numerical types are continuous, other types are nominal.
            - its role: ':id', ':target', ':input', ':ignore', ':objweight'.
            (Remark: ':colweight(<wgt>)' is unsupported, i.e. ':colweight(<wgt>)' is the
            same as ':colweight(1)', which is the same as ':input').
            If the parameter is undefined, all columns of the input table have default
            properties

        intercept : bool, optional
            flag indicating whether the model is built with or without an intercept value

        interaction : str, optional
            the definition of the allowed interactions between input columns. The
            interaction is a list of factors separated by a semicolon (;). A factor is
            a list of variables separated by a star (*). A variable is a column name of
            the input table. A continuous variable can be followed by a caret (^) and
            a numeric value, meaning the given power of the values of this column.
            A nominal variable can be followed by an equal sign (=) and a value, so that
            only the given value of the variable is allowed to interact with the other
            variables of this factor. If no value is indicated after a nominal variable,
            all distinct values interact independently with the other variables of the
            factor. By default, all input columns are considered independent and do not
            interact with each other. Only forwarded when non-empty

        family_param : float, optional
            additional parameter used for some distributions. If family_param is 'quasi',
            the quasi-likelihood is optimized for the Poisson and Binomial distributions.
            If family_param is -1 (or omitted), the distribution parameter is estimated
            from the data. An explicitly given value should be > 0

        link : str, optional
            the type of the link function. Allowed values are: canbinom, cangeom,
            cannegbinom, cauchit, clog, cloglog, gaussit, identity, inverse, invnegative,
            invsquare, log, logit, loglog, oddspower, power, probit, sqrt

        link_param : float, optional
            an additional parameter used for some links like: cannegbinom, oddspower,
            power. The range of values depends on the link function used

        max_iter : int, optional
            the maximum number of iterations

        epsilon : float, optional
            the maximum (relative) error used as stopping criterion

        tolerance : float, optional
            the tolerance of the linear equation solver for considering a value to be
            equal to zero

        method : str, optional
            the method used to calculate a GLM model. Allowed values are: irls, psgd

        trials : str, optional
            the input table column containing the number of trials for the binomial
            distribution. This parameter must be specified when family=binomial and
            is ignored for other distributions. Only forwarded when non-empty

        debug : bool, optional
            flag indicating to display debug information

        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are 'nom' and
            'cont'. If the parameter is undefined, all numeric columns are considered
            continuous, other columns nominal

        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are 'input' and
            'ignore'. If the parameter is undefined, all columns are considered 'input'
            columns

        col_properties_table : str, optional
            the input table where column properties for the input data frame columns are
            stored. The format of this table is the output format of stored procedure
            nza..COLUMN_PROPERTIES(). If the parameter is undefined, the input table
            column properties will be detected automatically.
            (Remark: colPropertiesTable with "COLWEIGHT" column with value '<wgt>' is
            unsupported, i.e. same as '1')

        Returns
        -------
        the value returned by the inherited _fit() call
        """
        # Keyword names below are the parameter names handed to the fit procedure;
        # column references go through q() first.
        proc_args = dict(
            family=self.family,
            target=q(target_column),
            id=q(id_column),
            incolumn=q(in_columns),
            coldefrole=col_def_role,
            coldeftype=col_def_type,
            colPropertiesTable=col_properties_table,
            intercept=intercept,
            family_param=family_param,
            link=link,
            link_param=link_param,
            maxit=max_iter,
            eps=epsilon,
            tol=tolerance,
            method=method,
            debug=debug,
        )

        # These two are left out entirely unless the caller supplied a value.
        if interaction:
            proc_args['interaction'] = interaction
        if trials:
            proc_args['trials'] = q(trials)

        return self._fit(in_df=in_df, params=proc_args)



[docs]
    def predict(self, in_df: IdaDataFrame, out_table: str=None, id_column: str=None, 
                debug: bool=False):
        """
        Predicts target values for the rows of the given data frame by delegating
        to the inherited _predict().

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        out_table : str, optional
            the output table where the predictions will be stored

        id_column : str, optional
            the input data frame column identifying a unique instance

        debug : bool, optional
            flag indicating to display debug information

        Returns
        -------
        IdaDataFrame
            the data frame containing row identifiers and predicted target values
        """
        # The id column reference goes through q(); debug is forwarded unchanged.
        proc_args = dict(id=q(id_column), debug=debug)
        return self._predict(in_df=in_df, out_table=out_table, params=proc_args)




[docs]
class BernoulliRegressor(GLM):
    """
    GLM regressor that fits with the 'bernoulli' family
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'bernoulli' family

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            name of the model
        """
        super().__init__(idadb=idadb, model_name=model_name)
        # Forwarded by fit() as the 'family' parameter.
        self.family = 'bernoulli'




[docs]
class BinomialRegressor(GLM):
    """
    GLM regressor that fits with the 'binomial' family
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'binomial' family

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            name of the model
        """
        super().__init__(idadb=idadb, model_name=model_name)
        # Forwarded by fit() as the 'family' parameter.
        self.family = 'binomial'



[docs]
class PoissonRegressor(GLM):
    """
    GLM regressor that fits with the 'poisson' family
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'poisson' family

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            name of the model
        """
        super().__init__(idadb=idadb, model_name=model_name)
        # Forwarded by fit() as the 'family' parameter.
        self.family = 'poisson'



[docs]
class NegativeBinomialRegressor(GLM):
    """
    GLM regressor that fits with the 'negativebinomial' family
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'negativebinomial' family

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            name of the model
        """
        super().__init__(idadb, model_name)
        # Forwarded by fit() as the 'family' parameter. The value must be the
        # bare family name: a trailing comma inside the literal would become
        # part of the value sent to the fit procedure.
        self.family = 'negativebinomial'



[docs]
class GaussianRegressor(GLM):
    """
    GLM regressor that fits with the 'gaussian' family
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'gaussian' family

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            name of the model
        """
        super().__init__(idadb=idadb, model_name=model_name)
        # Forwarded by fit() as the 'family' parameter.
        self.family = 'gaussian'



[docs]
class WaldRegressor(GLM):
    """
    GLM regressor that fits with the 'wald' family
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'wald' family

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            name of the model
        """
        super().__init__(idadb=idadb, model_name=model_name)
        # Forwarded by fit() as the 'family' parameter.
        self.family = 'wald'



[docs]
class GammaRegressor(GLM):
    """
    GLM regressor that fits with the 'gamma' family
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'gamma' family

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            name of the model
        """
        super().__init__(idadb=idadb, model_name=model_name)
        # Forwarded by fit() as the 'family' parameter.
        self.family = 'gamma'