Source code for nzpyida.analytics.predictive.glm

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

from nzpyida.analytics.utils import call_proc_df_in_out, make_temp_table_name, out_str_to_df
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from typing import List
import pandas as pd
from nzpyida.analytics.predictive.regression import Regression
from nzpyida.analytics.utils import q


class GLM(Regression):
    """
    Generalized linear model (GLM) regression.

    This base class does not select a distribution family by itself. fit()
    reads the ``family`` attribute of the instance, so either use one of the
    family-specific subclasses or assign ``family`` before calling fit().
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates GLM object

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name - if it exists in the database, it will be used,
            otherwise it must be trained using fit() function before
            prediction or scoring is called.
        """
        super().__init__(idadb, model_name)
        self.fit_proc = "GLM"
        self.predict_proc = "PREDICT_GLM"
        self.has_print_proc = True
        # Name of the prediction column, converted to the database default case.
        self.target_column_in_output = idadb.to_def_case('PRED')
        self.id_column_in_output = None

    def fit(self, in_df: IdaDataFrame, target_column: str, id_column: str=None,
            in_columns: List[str]=None, intercept: bool=True, interaction: str='',
            family_param: float=-1, link: str='logit', link_param: float=1,
            max_iter: int=20, epsilon: float=1e-3, tolerance: float=1e-7,
            method: str='irls', trials: str='', debug: bool=False,
            col_def_type: str=None, col_def_role: str=None,
            col_properties_table: str=None):
        """
        Builds the GLM model on the given data frame.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        target_column : str
            the input dataframe column to predict a value for. Only numeric
            type of target column is accepted

        id_column : str, optional
            the input dataframe column identifying a unique instance id

        in_columns : List[str], optional
            the list of input dataframe columns with special properties.
            Each column is followed by one or several of the following
            properties:

            - its type: ':nom' (for nominal), ':cont' (for continuous).
              Per default, all numerical types are continuous, other types
              are nominal.
            - its role: ':id', ':target', ':input', ':ignore', ':objweight'.
              (Remark: ':colweight(<wgt>)' is unsupported, i.e.
              ':colweight(<wgt>)' same as ':colweight(1)' same as ':input').

            If the parameter is undefined, all columns of the input table
            have default properties

        intercept : bool, optional
            flag indicating whether the model is built with or without an
            intercept value

        interaction : str, optional
            the definition of the allowed interactions between input columns.
            The interaction is a list of factors separated by a semicolon (;).
            A factor is a list of variables separated by a star (*).
            A variable is a column name of the input table. Continuous
            variables can be followed by a caret (^) and a numeric value, in
            this case the given power of values of this column is meant.
            Nominal variables can be followed by an equal sign (=) and a
            value, so that only the given value of the variable is allowed to
            interact with the other variables of this factor. If no value is
            indicated after a nominal variable, all distinct values interact
            independently with the other variables of the factor. By default,
            all input columns are considered independent and do not interact
            with each other

        family_param : float, optional
            additional parameter used for some distributions.
            If family_param='quasi', the quasi-likelihood is optimized in
            case of Poisson and Binomial distributions.
            If family_param=-1 (or is omitted), the distribution parameter is
            estimated from data.
            If family_param is given explicitly, it should be > 0

        link : str, optional
            the type of the link function. Allowed values are: canbinom,
            cangeom, cannegbinom, cauchit, clog, cloglog, gaussit, identity,
            inverse, invnegative, invsquare, log, logit, loglog, oddspower,
            power, probit, sqrt

        link_param : float, optional
            an additional parameter used for some links like: cannegbinom,
            oddspower, power. The range of value depends on the used link
            function

        max_iter : int, optional
            the maximum number of iterations

        epsilon : float, optional
            the maximum (relative) error used as stopping criteria

        tolerance : float, optional
            the tolerance for the linear equation solver when to consider a
            value to be equal to zero

        method : str, optional
            the method used to calculate a GLM model. Allowed values are:
            irls, psgd

        trials : str, optional
            the input table column containing the number of trials for the
            binomial distribution. This parameter must be specified when
            family=binomial. This parameter is ignored for other
            distributions

        debug : bool, optional
            flag indicating to display debug information

        col_def_type : str, optional
            default type of the input dataframe columns. Allowed values are
            'nom' and 'cont'. If the parameter is undefined, all numeric
            columns are considered continuous, other columns nominal

        col_def_role : str, optional
            default role of the input dataframe columns. Allowed values are
            'input' and 'ignore'. If the parameter is undefined, all columns
            are considered 'input' columns

        col_properties_table : str, optional
            the input table where column properties for the input dataframe
            columns are stored. The format of this table is the output format
            of stored procedure nza..COLUMN_PROPERTIES(). If the parameter is
            undefined, the input table column properties will be detected
            automatically. (Remark: colPropertiesTable with "COLWEIGHT"
            column with value '<wgt>' is unsupported, i.e. same as '1')

        Returns
        -------
        The result of the inherited _fit() call.

        Raises
        ------
        AttributeError
            if no distribution family has been set on this object, which is
            the case for a plain GLM instance.
        """
        # The base class never assigns `family`; only the family-specific
        # subclasses do. Fail with an actionable message instead of the bare
        # "object has no attribute 'family'" (same exception type as before).
        if not hasattr(self, 'family'):
            raise AttributeError(
                f"{type(self).__name__} has no distribution family set; use a "
                "family-specific subclass (e.g. GaussianRegressor) or assign "
                "the 'family' attribute before calling fit()")

        params = {
            'family': self.family,
            'target': q(target_column),
            'id': q(id_column),
            'incolumn': q(in_columns),
            'coldefrole': col_def_role,
            'coldeftype': col_def_type,
            'colPropertiesTable': col_properties_table,
            'intercept': intercept,
            'family_param': family_param,
            'link': link,
            'link_param': link_param,
            'maxit': max_iter,
            'eps': epsilon,
            'tol': tolerance,
            'method': method,
            'debug': debug
        }

        # These two are only sent when the caller provided a non-empty value.
        if interaction:
            params['interaction'] = interaction
        if trials:
            params['trials'] = q(trials)

        return self._fit(in_df=in_df, params=params)

    def predict(self, in_df: IdaDataFrame, out_table: str=None,
                id_column: str=None, debug: bool=False):
        """
        Makes predictions for the given data frame using this model.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        out_table : str, optional
            the output table where the predictions will be stored

        id_column : str, optional
            the input data frame column identifying a unique instance

        debug : bool, optional
            flag indicating to display debug information

        Returns
        -------
        IdaDataFrame
            the data frame containing row identifiers and predicted target
            values
        """
        params = {
            'id': q(id_column),
            'debug': debug
        }
        return self._predict(in_df=in_df, params=params, out_table=out_table)
class BernoulliRegressor(GLM):
    """
    GLM regressor whose distribution family is 'bernoulli'.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'bernoulli' family.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Read by fit() and sent as the 'family' parameter.
        self.family = 'bernoulli'
class BinomialRegressor(GLM):
    """
    GLM regressor whose distribution family is 'binomial'.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'binomial' family.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Read by fit() and sent as the 'family' parameter.
        self.family = 'binomial'
class PoissonRegressor(GLM):
    """
    GLM regressor whose distribution family is 'poisson'.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'poisson' family.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Read by fit() and sent as the 'family' parameter.
        self.family = 'poisson'
class NegativeBinomialRegressor(GLM):
    """
    GLM regressor whose distribution family is 'negativebinomial'.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'negativebinomial' family.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Read by fit() and sent as the 'family' parameter. The value must be
        # the bare family name: a stray trailing comma inside the literal
        # previously produced the invalid name 'negativebinomial,'.
        self.family = 'negativebinomial'
class GaussianRegressor(GLM):
    """
    GLM regressor whose distribution family is 'gaussian'.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'gaussian' family.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Read by fit() and sent as the 'family' parameter.
        self.family = 'gaussian'
class WaldRegressor(GLM):
    """
    GLM regressor whose distribution family is 'wald'.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'wald' family.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Read by fit() and sent as the 'family' parameter.
        self.family = 'wald'
class GammaRegressor(GLM):
    """
    GLM regressor whose distribution family is 'gamma'.
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the regressor and selects the 'gamma' family.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Read by fit() and sent as the 'family' parameter.
        self.family = 'gamma'