Source code for nzpyida.analytics.predictive.bayesian_networks

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
Tree-shaped Bayesian networks formally belong to the data exploration category. 
However, this algorithm is considerably more complex than other data exploration 
algorithms and not as widely known, warranting detailed description.

A Bayesian network can be considered a graphical representation of probabilistically 
described relationships within a set of attributes, allowing probabilistic inference 
to be performed. The representation is created by extracting the structural properties 
of the distribution from the data.

Creating and using general Bayesian networks is algorithmically and computationally 
complex. Tree-shaped Bayesian networks, however, constitute a simplified subclass 
of Bayesian networks with restrictions imposed on the type of attribute relationships 
that can be discovered and represented. The restrictions permit simpler and more efficient 
algorithms as well as more straightforward interpretation. Tree-shaped Bayesian networks 
may not be sufficient for highly accurate prediction, but provide an excellent 
qualitative description of the relationship structure observed in the data.
"""

from typing import Tuple, List
from nzpyida.analytics.model_manager import ModelManager
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from nzpyida.analytics.utils import map_to_props, materialize_df, make_temp_table_name
from nzpyida.analytics.utils import get_auto_delete_context, q
from nzpyida.analytics.predictive.regression import Regression
from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling



[docs]
class TreeBayesNetwork(Regression):
    """
    Tree-shaped Bayesian network model built on top of Regression.

    The constructor only records which procedure names the inherited
    fitting and prediction code should use.
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates Tree Shaped Bayesian Network class

        Parameters
        ----------

        idadb : IdaDataBase
            database connector

        model_name : str
            model name - if it exists in the database, it will be used, otherwise
            it must be trained using fit() function before prediction or scoring is called
        """
        super().__init__(idadb, model_name)
        # fit() checks fit_proc to decide whether 'sizewarn' is forwarded.
        self.fit_proc = "TBNET_GROW"
        # presumably consumed by the inherited prediction code - confirm in Regression
        self.predict_proc = "TBNET_APPLY"


[docs]
    def fit(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
            sample_size: int=None, talk: str=None, size_warning: str=None,
            edge_lab_sort: str=None, col_def_type: str=None,
            col_def_role: str=None, col_properties_table: str=None) -> None:
        """
        Builds a tree-like Bayesian Network for continuous variables.

        A spanning tree joining all variables is constructed on the grounds of the
        strongest correlations, which gives an overview of the most significant
        interrelations governing the whole set of variables.

        Parameters
        ----------

        in_df : IdaDataFrame
            the input data frame

        in_columns : List[str], optional
            input data frame columns annotated with special properties. Each column
            name may be followed by one or several of:
            - type: ':nom' (nominal) or ':cont' (continuous). By default numeric
              columns are continuous, all other columns nominal
            - role: ':id', ':target', ':input', ':ignore'
            (Remark: ':objweight' is unsupported and behaves like ':ignore'.)
            (Remark: ':colweight(<wgt>)' is unsupported and behaves like
            ':colweight(1)', i.e. like ':input'.)

            If omitted, all columns of the input data frame get default properties.
            Note that this procedure only accepts continuous columns with role 'input'.

        base_index : int, optional
            the numeric id to be assigned to the first variable

        sample_size : int, optional
            the sample size to take if the number of records is too large

        talk : str, optional
            if talk=yes then additional information on progress will be displayed

        size_warning : str, optional
            if sizewarn=yes then no exception is thrown when there are fewer records
            than 3 times the number of columns; instead a notice is displayed and the
            stored procedure returns 'sizewarn'. The value is only forwarded when
            this object's fit_proc is 'TBNET_GROW'.

        edge_lab_sort : str, optional
            if edge_lab_sort=yes then the left end of an edge gets the name that is
            lower in alphabetic order than the right one

        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are 'nom'
            and 'cont'. If omitted, numeric columns are considered continuous and
            all other columns nominal.

        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are 'input'
            and 'ignore'. If omitted, all columns are considered 'input' columns.

        col_properties_table : str, optional
            the input table where column properties for the input data frame columns
            are stored, in the output format of stored procedure
            nza..COLUMN_PROPERTIES(). If omitted, the column properties are detected
            automatically.
            (Remark: a "COLROLE" value of 'objweight' is unsupported, i.e. same as
            'ignore'.)
            (Remark: a "COLWEIGHT" value of '<wgt>' is unsupported, i.e. same as '1'.)
        """
        proc_args = dict(
            incolumn=q(in_columns),
            coldeftype=col_def_type,
            coldefrole=col_def_role,
            colPropertiesTable=col_properties_table,
            baseidx=base_index,
            samplesize=sample_size,
            talk=talk,
            edgelabsort=edge_lab_sort,
        )

        # Subclasses that switch fit_proc to another procedure do not get 'sizewarn'.
        forwards_size_warning = self.fit_proc == 'TBNET_GROW'
        if forwards_size_warning:
            proc_args['sizewarn'] = size_warning

        self._fit(in_df, proc_args, needs_id=False)

    

[docs]
    def predict(self, in_df: IdaDataFrame, target_column: str=None, id_column: str=None,
                prediction_type: str='best', out_table: str=None) -> IdaDataFrame:
        """
        Makes predictions with this model, which must already exist.

        Parameters
        ----------

        in_df : IdaDataFrame
            the input data frame

        target_column : str
            the model variable to be predicted

        id_column : str, optional
            the column of the input data frame that identifies a unique instance ID

        prediction_type : str, optional
            the type of prediction to be made. Valid values are best (most
            correlated neighbor), neighbors (weighted prediction of neighbors),
            and nn-neighbors (non null neighbors)

        out_table : str, optional
            the name of the output data frame where the predictions are to be stored

        Returns
        -------
        IdaDataFrame
            the data frame containing row identifiers and predicted target values
        """
        proc_args = dict(
            id=q(id_column),
            target=q(target_column),
            type=prediction_type,
        )
        predictions = self._predict(in_df, proc_args, out_table)
        return predictions

    


[docs]
    def score(self, in_df: IdaDataFrame, target_column: str=None, id_column: str=None,
              prediction_type: str='best') -> float:
        """
        Scores this model, which must already exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input data frame column representing the class

        id_column : str
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        prediction_type : str, optional
            the type of prediction to be made. Valid values are best (most
            correlated neighbor), neighbors (weighted prediction of neighbors),
            and nn-neighbors (non null neighbors)

        Returns
        -------
        float
            the model score
        """
        proc_args = dict(
            id=q(id_column),
            type=prediction_type,
            target=q(target_column),
        )
        # Record the output column name as "<target>_PRED" before delegating.
        predicted_column = f"{target_column}_PRED"
        self.target_column_in_output = predicted_column
        return self._score(in_df, proc_args, target_column)






[docs]
class BinaryTreeBayesNetwork(TreeBayesNetwork):
    """
    Variant of TreeBayesNetwork that only replaces the fit procedure name.
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates Binary Tree Shaped Bayesian Network class

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Because fit_proc is no longer 'TBNET_GROW', the inherited fit()
        # does not forward the 'sizewarn' parameter for this class.
        self.fit_proc = "BTBNET_GROW"





[docs]
class TreeAgumentedNetwork(TreeBayesNetwork):
    """
    Tree-augmented network variant of TreeBayesNetwork with its own
    fit and predict procedure names.
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates Tree-shaped Augmented Network object

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Override both procedure names set by TreeBayesNetwork.__init__.
        self.fit_proc = "TANET_GROW"
        self.predict_proc = "TANET_APPLY"
    

[docs]
    def fit(self, in_df: IdaDataFrame, in_model: str, class_column: str,
            edge_lab_sort: str=None) -> None:
        """
        Trains the model from an existing Bayesian Network model by delegating
        to the inherited _fit().

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        in_model : str
            the name of the input Bayesian Network model

        class_column : str
            the target class; this should be column with nominal variables

        edge_lab_sort : str, optional
            if edge_lab_sort=yes then the left end of the edge will have a name lower
            in alphabetic order than the right one
        """
        params = {
            'inmodel': in_model,
            'class': q(class_column),
            # The procedure parameter is spelled 'edgelabsort' (no underscores),
            # as in every other fit/grow call of this module; the previous
            # 'edge_lab_sort' key did not match that name.
            'edgelabsort': edge_lab_sort
        }
        self._fit(in_df, params, needs_id=False)





[docs]
class MultiTreeBayesNetwork(TreeBayesNetwork):
    """
    Multi tree-shaped Bayesian network variant of TreeBayesNetwork.
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates Multi Tree-shaped Bayesian Network object

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        self.fit_proc = "MTBNET_GROW"
        # NOTE(review): same apply procedure name as TreeAgumentedNetwork -
        # confirm this sharing is intended.
        self.predict_proc = "TANET_APPLY"
    

[docs]
    def fit(self, in_df: IdaDataFrame, class_column: str, in_columns: List[str]=None,
            base_index: int=None, sample_size: int=None, talk: str=None,
            edge_lab_sort: str=None, col_def_type: str=None, col_def_role: str=None,
            col_properties_table: str=None) -> None:
        """
        Trains the model by passing the parameters below to the inherited _fit().

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        class_column : str
            the target class; this should be column with nominal variables

        in_columns : List[str], optional
            input data frame columns annotated with special properties. Each column
            name may be followed by one or several of:
            - type: ':nom' (nominal) or ':cont' (continuous). By default numeric
              columns are continuous, all other columns nominal
            - role: ':id', ':target', ':input', ':ignore'
            (Remark: ':objweight' is unsupported and behaves like ':ignore'.)
            (Remark: ':colweight(<wgt>)' is unsupported and behaves like
            ':colweight(1)', i.e. like ':input'.)

            If omitted, all columns of the input table get default properties.
            Note that this procedure only accepts continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either X or Y
            to distinguish the two sets of variables.
            (NOTE(review): the X/Y remark reads like it belongs to the two-set
            procedures - confirm it applies here.)

        base_index : int, optional
            the numeric id to be assigned to the first variable

        sample_size : int, optional
            the sample size to take if the number of records is too large

        talk : str, optional
            if talk=yes then additional information on progress will be displayed

        edge_lab_sort : str, optional
            if edge_lab_sort=yes then the left end of an edge gets the name that is
            lower in alphabetic order than the right one

        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are 'nom'
            and 'cont'. If omitted, numeric columns are considered continuous and
            all other columns nominal.

        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are 'input'
            and 'ignore'. If omitted, all columns are considered 'input' columns.

        col_properties_table : str, optional
            the input table where column properties for the input data frame columns
            are stored, in the output format of stored procedure
            nza..COLUMN_PROPERTIES(). If omitted, the column properties are detected
            automatically.
            (Remark: a "COLROLE" value of 'objweight' is unsupported, i.e. same as
            'ignore'.)
            (Remark: a "COLWEIGHT" value of '<wgt>' is unsupported, i.e. same as '1'.)
        """
        # 'class' is a Python keyword, so it is set through a literal first.
        proc_args = {'class': q(class_column)}
        proc_args.update(
            incolumn=q(in_columns),
            coldeftype=col_def_type,
            coldefrole=col_def_role,
            colPropertiesTable=col_properties_table,
            baseidx=base_index,
            samplesize=sample_size,
            talk=talk,
            edgelabsort=edge_lab_sort,
        )
        self._fit(in_df=in_df, params=proc_args, needs_id=False)








[docs]
class TreeBayesNetworkBase(PredictiveModeling):
    """
    Shared implementation for the tree-shaped Bayesian network "grow" wrappers.

    Subclasses store the name of their NZA stored procedure in
    ``self.predict_proc``; ``_grow()`` calls that procedure.
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates base class for group of Tree Shaped Bayesian Network models

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)

    def _grow(self, in_df: IdaDataFrame, params: dict):
        """
        Grows the Tree Shaped Bayesian Network on input data and returns
        dataframe with statistics

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        params : dict
            the dictionary of attributes used for running procedure; it is
            updated in place with the 'intable' entry

        Returns
        -------
        IdaDataFrame or None
            the data frame containing statistics, or None when the procedure
            finished without producing the model table

        Raises
        ------
        TypeError
            if in_df is not an IdaDataFrame
        """
        if not isinstance(in_df, IdaDataFrame):
            raise TypeError("Argument in_df should be an IdaDataFrame")

        # Each subclass names its own stored procedure in predict_proc.
        # Previously TBNET1G was hard-coded here, so the 2G and 1G2P wrappers
        # silently ran the wrong procedure. TBNET1G stays as the fallback when
        # no name was set.
        proc_name = getattr(self, 'predict_proc', None) or 'TBNET1G'

        ModelManager(self.idadb).drop_model(self.model_name)

        temp_view_name, need_delete = materialize_df(in_df)
        params['intable'] = temp_view_name

        params_s = map_to_props(params)

        try:
            in_df.ida_query(f"call NZA..{proc_name}('{params_s}')")
        finally:
            # Remove the temporary view even if the procedure call failed.
            if need_delete:
                in_df._idadb.drop_view(temp_view_name)

        table_name = f"INZA.nza_meta_{self.model_name}_model"

        if not in_df._idadb.exists_table_or_view(table_name):
            # stored procedure call was successful but did not produce a table
            return None

        out_df = IdaDataFrame(in_df._idadb, table_name)
        return out_df




[docs]
class TreeBayesNetwork1G(TreeBayesNetworkBase):
    """
    Wrapper whose grow() collects parameters and delegates to the inherited _grow().
    """
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates Tree-shaped Bayesian Network 1G object

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Procedure name associated with this variant.
        self.predict_proc = "TBNET1G"


[docs]
    def grow(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
            sample_size: int=330000, talk: str=None, no_check: str=None,
            edge_lab_sort: str=None, col_def_type: str=None, col_def_role: str=None,
            col_properties_table: str=None) -> IdaDataFrame:
        """
        Collects the procedure parameters for this model and runs the inherited _grow().

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        in_columns : List[str], optional
            input data frame columns annotated with special properties. Each column
            name may be followed by one or several of:
            - type: ':nom' (nominal) or ':cont' (continuous). By default numeric
              columns are continuous, all other columns nominal
            - role: ':id', ':target', ':input', ':ignore'
            (Remark: ':objweight' is unsupported and behaves like ':ignore'.)
            (Remark: ':colweight(<wgt>)' is unsupported and behaves like
            ':colweight(1)', i.e. like ':input'.)

            If omitted, all columns of the input table get default properties.
            Note that this procedure only accepts continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either X or Y
            to distinguish the two sets of variables.
            (NOTE(review): the X/Y remark reads like it belongs to the two-set
            procedures - confirm it applies here.)

        base_index : int, optional
            the numeric id to be assigned to the first variable

        sample_size : int, optional
            the sample size to take if the number of records is too large

        talk : str, optional
            if talk=yes then additional information on progress will be displayed

        no_check : str, optional
            if nocheck=yes then no exception is thrown when a column in <in_columns>
            does not exist

        edge_lab_sort : str, optional
            if edge_lab_sort=yes then the left end of an edge gets the name that is
            lower in alphabetic order than the right one

        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are 'nom'
            and 'cont'. If omitted, numeric columns are considered continuous and
            all other columns nominal.

        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are 'input'
            and 'ignore'. If omitted, all columns are considered 'input' columns.

        col_properties_table : str, optional
            the input table where column properties for the input data frame columns
            are stored, in the output format of stored procedure
            nza..COLUMN_PROPERTIES(). If omitted, the column properties are detected
            automatically.
            (Remark: a "COLROLE" value of 'objweight' is unsupported, i.e. same as
            'ignore'.)
            (Remark: a "COLWEIGHT" value of '<wgt>' is unsupported, i.e. same as '1'.)

        Returns
        -------
        IdaDataFrame
            the data frame containing statistics
        """
        proc_args = dict(
            model=self.model_name,
            incolumn=q(in_columns),
            coldeftype=col_def_type,
            coldefrole=col_def_role,
            colPropertiesTable=col_properties_table,
            baseidx=base_index,
            samplesize=sample_size,
            talk=talk,
            nocheck=no_check,
            edgelabsort=edge_lab_sort,
        )
        statistics = self._grow(in_df, proc_args)
        return statistics


    


[docs]
class TreeBayesNetwork2G(TreeBayesNetworkBase):
    """
    Wrapper whose grow() collects parameters and delegates to the inherited _grow().
    """
    def __init__(self, idadb: IdaDataBase, model_name: str) -> None:
        """
        Creates Tree-shaped Bayesian Network 2G object

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Procedure name associated with this variant.
        # NOTE(review): the NZA call is issued by the inherited _grow() -
        # verify that it uses this name.
        self.predict_proc = "TBNET2G"


[docs]
    def grow(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
            talk: str=None, no_check: str=None, edge_lab_sort: str=None,
            col_def_type: str=None, col_def_role: str=None,
            col_properties_table: str=None) -> IdaDataFrame:
        """
        Builds a tree-like Bayesian Network for continuous variables.

        A spanning tree joining all variables is constructed on the grounds of the
        strongest correlations, which gives an overview of the most significant
        interrelations governing the whole set of variables.

        The stored procedure operates with two sets of variables and the resulting
        tree will be bi-partite. Correlations between variables within each set are
        not calculated. This is useful when the two sets characterize distinct
        objects and only links between the objects are of interest.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        in_columns : List[str], optional
            input data frame columns annotated with special properties. Each column
            name may be followed by one or several of:
            - type: ':nom' (nominal) or ':cont' (continuous). By default numeric
              columns are continuous, all other columns nominal
            - role: ':id', ':target', ':input', ':ignore'
            (Remark: ':objweight' is unsupported and behaves like ':ignore'.)
            (Remark: ':colweight(<wgt>)' is unsupported and behaves like
            ':colweight(1)', i.e. like ':input'.)

            If omitted, all columns of the input table get default properties.
            Note that this procedure only accepts continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either X or Y
            to distinguish the two sets of variables.

        base_index : int, optional
            the numeric id to be assigned to the first variable

        talk : str, optional
            if talk=yes then additional information on progress will be displayed

        no_check : str, optional
            if nocheck=yes then no exception is thrown when a column in <in_columns>
            does not exist

        edge_lab_sort : str, optional
            if edge_lab_sort=yes then the left end of an edge gets the name that is
            lower in alphabetic order than the right one

        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are 'nom'
            and 'cont'. If omitted, numeric columns are considered continuous and
            all other columns nominal.

        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are 'input'
            and 'ignore'. If omitted, all columns are considered 'input' columns.

        col_properties_table : str, optional
            the input table where column properties for the input data frame columns
            are stored, in the output format of stored procedure
            nza..COLUMN_PROPERTIES(). If omitted, the column properties are detected
            automatically.
            (Remark: a "COLROLE" value of 'objweight' is unsupported, i.e. same as
            'ignore'.)
            (Remark: a "COLWEIGHT" value of '<wgt>' is unsupported, i.e. same as '1'.)

        Returns
        -------
        IdaDataFrame
            the data frame containing statistics
        """
        proc_args = dict(
            model=self.model_name,
            incolumn=q(in_columns),
            coldeftype=col_def_type,
            coldefrole=col_def_role,
            colPropertiesTable=col_properties_table,
            baseidx=base_index,
            talk=talk,
            nocheck=no_check,
            edgelabsort=edge_lab_sort,
        )
        statistics = self._grow(in_df, proc_args)
        return statistics




[docs]
class TreeBayesNetwork1G2P(TreeBayesNetworkBase):
    """
    Wrapper whose grow() collects parameters and delegates to the inherited _grow().
    """
    def __init__(self, idadb: IdaDataBase, model_name: str) -> None:
        """
        Creates Tree-shaped Bayesian Network 1G2P object

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # Procedure name associated with this variant.
        # NOTE(review): the NZA call is issued by the inherited _grow() -
        # verify that it uses this name.
        self.predict_proc = "TBNET1G2P"
    

[docs]
    def grow(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
            talk: str=None, no_check: str=None, edge_lab_sort: str=None,
            col_def_type: str=None, col_def_role: str=None,
            col_properties_table: str=None) -> IdaDataFrame:
        """
        Builds a tree-like Bayesian Network for continuous variables.

        A spanning tree joining all variables is constructed on the grounds of the
        strongest correlations, which gives an overview of the most significant
        interrelations governing the whole set of variables.

        The stored procedure constructs the tree incrementally: it calculates
        correlations on one set of variables, then on the other set, then between
        variables of the two sets. The final model is obtained by joining the
        three sub-models.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        in_columns : List[str], optional
            input data frame columns annotated with special properties. Each column
            name may be followed by one or several of:
            - type: ':nom' (nominal) or ':cont' (continuous). By default numeric
              columns are continuous, all other columns nominal
            - role: ':id', ':target', ':input', ':ignore'
            (Remark: ':objweight' is unsupported and behaves like ':ignore'.)
            (Remark: ':colweight(<wgt>)' is unsupported and behaves like
            ':colweight(1)', i.e. like ':input'.)

            If omitted, all columns of the input table get default properties.
            Note that this procedure only accepts continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either X or Y
            to distinguish the two sets of variables.

        base_index : int, optional
            the numeric id to be assigned to the first variable

        talk : str, optional
            if talk=yes then additional information on progress will be displayed

        no_check : str, optional
            if nocheck=yes then no exception is thrown when a column in <in_columns>
            does not exist

        edge_lab_sort : str, optional
            if edge_lab_sort=yes then the left end of an edge gets the name that is
            lower in alphabetic order than the right one

        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are 'nom'
            and 'cont'. If omitted, numeric columns are considered continuous and
            all other columns nominal.

        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are 'input'
            and 'ignore'. If omitted, all columns are considered 'input' columns.

        col_properties_table : str, optional
            the input table where column properties for the input data frame columns
            are stored, in the output format of stored procedure
            nza..COLUMN_PROPERTIES(). If omitted, the column properties are detected
            automatically.
            (Remark: a "COLROLE" value of 'objweight' is unsupported, i.e. same as
            'ignore'.)
            (Remark: a "COLWEIGHT" value of '<wgt>' is unsupported, i.e. same as '1'.)

        Returns
        -------
        IdaDataFrame
            the data frame containing statistics
        """
        proc_args = dict(
            model=self.model_name,
            incolumn=q(in_columns),
            coldeftype=col_def_type,
            coldefrole=col_def_role,
            colPropertiesTable=col_properties_table,
            baseidx=base_index,
            talk=talk,
            nocheck=no_check,
            edgelabsort=edge_lab_sort,
        )
        statistics = self._grow(in_df, proc_args)
        return statistics