#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
Tree-shaped Bayesian networks formally belong to the data exploration category.
However, this algorithm is considerably more complex than other data exploration
algorithms and not as widely known, warranting detailed description.
A Bayesian network can be considered a graphical representation of probabilistically
described relationships within a set of attributes, allowing probabilistic inference
to be performed. The representation is created by extracting the structural properties
of the distribution from the data.
Creating and using general Bayesian networks is algorithmically and computationally
complex. Tree-shaped Bayesian networks, however, constitute a simplified subclass
of Bayesian networks with restrictions imposed on the type of attribute relationships
that can be discovered and represented. The restrictions permit simpler and more efficient
algorithms as well as more straightforward interpretation. Tree-shaped Bayesian networks
may not be sufficient for highly accurate prediction, but provide an excellent
qualitative description of the relationship structure observed in the data.
"""
from typing import Tuple, List
from nzpyida.analytics.model_manager import ModelManager
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from nzpyida.analytics.utils import map_to_props, materialize_df, make_temp_table_name
from nzpyida.analytics.utils import get_auto_delete_context, q
from nzpyida.analytics.predictive.regression import Regression
from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling
[docs]
class TreeBayesNetwork(Regression):
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates a Tree-shaped Bayesian Network model wrapper.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name - if a model with this name already exists in the database
            it will be used, otherwise it must be trained with fit() before
            prediction or scoring is called
        """
        super().__init__(idadb, model_name)
        self.fit_proc, self.predict_proc = "TBNET_GROW", "TBNET_APPLY"

    def fit(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
            sample_size: int=None, talk: str=None, size_warning: str=None,
            edge_lab_sort: str=None, col_def_type: str=None,
            col_def_role: str=None, col_properties_table: str=None) -> None:
        """
        Builds a tree-like Bayesian Network for continuous variables. A spanning
        tree joining all the variables is constructed on the grounds of the
        strongest correlations, giving an overview of the most significant
        interrelations governing the whole set of variables.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        in_columns : List[str], optional
            list of the input data frame columns with special properties.
            Each column is followed by one or several of the following properties:
            - type: ':nom' (for nominal), ':cont' (for continuous). By default,
              all numerical types are continuous, other types are nominal
            - role: ':id', ':target', ':input', ':ignore'.
              (Remark: ':objweight' is unsupported, i.e. treated like ':ignore').
              (Remark: ':colweight(<wgt>)' is unsupported, i.e. treated like
              ':colweight(1)', which is the same as ':input').
            If the parameter is undefined, all columns of the input data frame
            have default properties. Note that this procedure only accepts
            continuous columns with role 'input'.
        base_index : int, optional
            the numeric id to be assigned to the first variable
        sample_size : int, optional
            the sample size to take if the number of records is too large
        talk : str, optional
            if talk=yes then additional information on progress is displayed
        size_warning : str, optional
            if sizewarn=yes then no exception is thrown when there are fewer
            records than 3 times the number of columns. Instead, a notice is
            displayed and the stored procedure returns 'sizewarn'.
            This value is only passed on when the fit procedure is TBNET_GROW.
        edge_lab_sort : str, optional
            if edgelabsort=yes then the left end of an edge has a name lower
            in alphabetic order than the right one
        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are
            'nom' and 'cont'. If the parameter is undefined, all numeric columns
            are considered continuous, other columns nominal.
        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are
            'input' and 'ignore'. If the parameter is undefined, all columns
            are considered 'input' columns.
        col_properties_table : str, optional
            the input table where column properties for the input data frame
            columns are stored. The format of this table is the output format
            of stored procedure nza..COLUMN_PROPERTIES(). If the parameter is
            undefined, the column properties are detected automatically.
            (Remark: colPropertiesTable with "COLROLE" column with value
            'objweight' is unsupported, i.e. same as 'ignore')
            (Remark: colPropertiesTable with "COLWEIGHT" column with value
            '<wgt>' is unsupported, i.e. same as '1')
        """
        proc_args = dict(
            incolumn=q(in_columns),
            coldeftype=col_def_type,
            coldefrole=col_def_role,
            colPropertiesTable=col_properties_table,
            baseidx=base_index,
            samplesize=sample_size,
            talk=talk,
            edgelabsort=edge_lab_sort,
        )
        # sizewarn is only forwarded for TBNET_GROW; subclasses that replace
        # fit_proc with another procedure do not receive it
        if self.fit_proc == 'TBNET_GROW':
            proc_args['sizewarn'] = size_warning

        self._fit(in_df, proc_args, needs_id=False)

    def predict(self, in_df: IdaDataFrame, target_column: str=None, id_column: str=None,
                prediction_type: str='best', out_table: str=None) -> IdaDataFrame:
        """
        Makes predictions based on this model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        target_column : str
            the model variable to be predicted
        id_column : str, optional
            the column of the input data frame that identifies a unique instance ID
        prediction_type : str, optional
            the type of prediction to be made. Valid values are best (most
            correlated neighbor), neighbors (weighted prediction of neighbors),
            and nn-neighbors (non null neighbors)
        out_table : str, optional
            the name of the output table where the predictions are to be stored

        Returns
        -------
        IdaDataFrame
            the data frame containing row identifiers and predicted target values
        """
        apply_args = {
            'id': q(id_column),
            'target': q(target_column),
            'type': prediction_type,
        }
        return self._predict(in_df, apply_args, out_table)

    def score(self, in_df: IdaDataFrame, target_column: str=None, id_column: str=None,
              prediction_type: str='best') -> float:
        """
        Scores the model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring
        target_column : str
            the input data frame column representing the class
        id_column : str
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an
            instance id
        prediction_type : str, optional
            the type of prediction to be made. Valid values are best (most
            correlated neighbor), neighbors (weighted prediction of neighbors),
            and nn-neighbors (non null neighbors)

        Returns
        -------
        float
            the model score
        """
        apply_args = {
            'id': q(id_column),
            'type': prediction_type,
            'target': q(target_column),
        }
        # record the output column name derived from the target: <target>_PRED
        self.target_column_in_output = f"{target_column}_PRED"
        return self._score(in_df, apply_args, target_column)
[docs]
class BinaryTreeBayesNetwork(TreeBayesNetwork):
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates a Binary Tree-shaped Bayesian Network model wrapper.

        Behaves like TreeBayesNetwork, except that the model is grown with
        the BTBNET_GROW procedure.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        # only the grow procedure differs from the parent class
        self.fit_proc = "BTBNET_GROW"
[docs]
class TreeAgumentedNetwork(TreeBayesNetwork):
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates a Tree-shaped Augmented Network model wrapper.

        The model is grown with TANET_GROW and applied with TANET_APPLY.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        self.fit_proc = "TANET_GROW"
        self.predict_proc = "TANET_APPLY"

    def fit(self, in_df: IdaDataFrame, in_model: str, class_column: str,
            edge_lab_sort: str=None) -> None:
        """
        Builds the augmented network from the input data and an existing
        Bayesian Network model.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        in_model : str
            the name of the input Bayesian Network model
        class_column : str
            the target class; this should be a column with nominal values
        edge_lab_sort : str, optional
            if edgelabsort=yes then the left end of an edge has a name lower
            in alphabetic order than the right one
        """
        params = {
            'inmodel': in_model,
            'class': q(class_column),
            # The key must be 'edgelabsort', the spelling used by every other
            # grow wrapper in this module; 'edge_lab_sort' is the Python
            # argument name, not the procedure parameter.
            'edgelabsort': edge_lab_sort
        }
        self._fit(in_df, params, needs_id=False)
[docs]
class MultiTreeBayesNetwork(TreeBayesNetwork):
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates a Multi Tree-shaped Bayesian Network model wrapper.

        The model is grown with MTBNET_GROW and applied with TANET_APPLY.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        self.fit_proc, self.predict_proc = "MTBNET_GROW", "TANET_APPLY"

    def fit(self, in_df: IdaDataFrame, class_column: str, in_columns: List[str]=None,
            base_index: int=None, sample_size: int=None, talk: str=None,
            edge_lab_sort: str=None, col_def_type: str=None, col_def_role: str=None,
            col_properties_table: str=None) -> None:
        """
        Builds the multi-tree Bayesian Network model for the given class column.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        class_column : str
            the target class; this should be a column with nominal values
        in_columns : List[str], optional
            list of the input data frame columns with special properties.
            Each column is followed by one or several of the following properties:
            - type: ':nom' (for nominal), ':cont' (for continuous). By default,
              all numerical types are continuous, other types are nominal
            - role: ':id', ':target', ':input', ':ignore'.
              (Remark: ':objweight' is unsupported, i.e. treated like ':ignore').
              (Remark: ':colweight(<wgt>)' is unsupported, i.e. treated like
              ':colweight(1)', which is the same as ':input').
            If the parameter is undefined, all columns of the input table have
            default properties. Note that this procedure only accepts
            continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either
            X or Y to distinguish the two sets of variables.
        base_index : int, optional
            the numeric id to be assigned to the first variable
        sample_size : int, optional
            the sample size to take if the number of records is too large
        talk : str, optional
            if talk=yes then additional information on progress is displayed
        edge_lab_sort : str, optional
            if edgelabsort=yes then the left end of an edge has a name lower
            in alphabetic order than the right one
        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are
            'nom' and 'cont'. If the parameter is undefined, all numeric columns
            are considered continuous, other columns nominal.
        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are
            'input' and 'ignore'. If the parameter is undefined, all columns
            are considered 'input' columns.
        col_properties_table : str, optional
            the input table where column properties for the input data frame
            columns are stored. The format of this table is the output format
            of stored procedure nza..COLUMN_PROPERTIES(). If the parameter is
            undefined, the column properties are detected automatically.
            (Remark: colPropertiesTable with "COLROLE" column with value
            'objweight' is unsupported, i.e. same as 'ignore')
            (Remark: colPropertiesTable with "COLWEIGHT" column with value
            '<wgt>' is unsupported, i.e. same as '1')
        """
        # 'class' is a Python keyword, so the mapping is spelled as a literal
        grow_args = {
            'class': q(class_column),
            'incolumn': q(in_columns),
            'coldeftype': col_def_type,
            'coldefrole': col_def_role,
            'colPropertiesTable': col_properties_table,
            'baseidx': base_index,
            'samplesize': sample_size,
            'talk': talk,
            'edgelabsort': edge_lab_sort,
        }
        self._fit(in_df, grow_args, needs_id=False)
[docs]
class TreeBayesNetworkBase(PredictiveModeling):
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates base class for group of Tree Shaped Bayesian Network models

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name
        """
        super().__init__(idadb, model_name)

    def _grow(self, in_df: IdaDataFrame, params: dict):
        """
        Grows the Tree Shaped Bayesian Network on input data and returns
        dataframe with statistics.

        Any existing model with the same name is dropped first. The stored
        procedure that is called is the one named by ``self.predict_proc``
        (the subclasses in this module store TBNET1G, TBNET2G or TBNET1G2P
        there); when that attribute is missing or empty, TBNET1G is used.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        params : dict
            the dictionary of attributes used for running procedure;
            it is not modified, 'intable' is added to a copy

        Returns
        -------
        IdaDataFrame
            the data frame containing statistics, or None when the procedure
            finished without producing the model table

        Raises
        ------
        TypeError
            if in_df is not an IdaDataFrame
        """
        if not isinstance(in_df, IdaDataFrame):
            raise TypeError("Argument in_df should be an IdaDataFrame")

        # Previously TBNET1G was hard-coded here, so the 2G and 1G2P variants
        # silently ran the wrong procedure. Honour the name set by the subclass.
        proc_name = getattr(self, 'predict_proc', None) or 'TBNET1G'

        ModelManager(self.idadb).drop_model(self.model_name)

        temp_view_name, need_delete = materialize_df(in_df)
        try:
            # work on a copy so the caller's dictionary is left untouched;
            # kept inside the try so the temporary view is dropped on any failure
            call_params = dict(params)
            call_params['intable'] = temp_view_name
            params_s = map_to_props(call_params)
            in_df.ida_query(f'call NZA..{proc_name}(\'{params_s}\')')
        finally:
            if need_delete:
                in_df._idadb.drop_view(temp_view_name)

        table_name = f"INZA.nza_meta_{self.model_name}_model"
        if not in_df._idadb.exists_table_or_view(table_name):
            # stored procedure call was successful but did not produce a table
            return None
        return IdaDataFrame(in_df._idadb, table_name)
[docs]
class TreeBayesNetwork1G(TreeBayesNetworkBase):
    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates a Tree-shaped Bayesian Network 1G object.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        self.predict_proc = "TBNET1G"

    def grow(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
             sample_size: int=330000, talk: str=None, no_check: str=None,
             edge_lab_sort: str=None, col_def_type: str=None, col_def_role: str=None,
             col_properties_table: str=None) -> IdaDataFrame:
        """
        Grows the network on the input data and returns the resulting statistics.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        in_columns : List[str], optional
            list of the input data frame columns with special properties.
            Each column is followed by one or several of the following properties:
            - type: ':nom' (for nominal), ':cont' (for continuous). By default,
              all numerical types are continuous, other types are nominal
            - role: ':id', ':target', ':input', ':ignore'.
              (Remark: ':objweight' is unsupported, i.e. treated like ':ignore').
              (Remark: ':colweight(<wgt>)' is unsupported, i.e. treated like
              ':colweight(1)', which is the same as ':input').
            If the parameter is undefined, all columns of the input table have
            default properties. Note that this procedure only accepts
            continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either
            X or Y to distinguish the two sets of variables.
        base_index : int, optional
            the numeric id to be assigned to the first variable
        sample_size : int, optional
            the sample size to take if the number of records is too large
        talk : str, optional
            if talk=yes then additional information on progress is displayed
        no_check : str, optional
            if nocheck=yes then no exception is thrown when a column in
            <in_columns> does not exist
        edge_lab_sort : str, optional
            if edgelabsort=yes then the left end of an edge has a name lower
            in alphabetic order than the right one
        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are
            'nom' and 'cont'. If the parameter is undefined, all numeric columns
            are considered continuous, other columns nominal.
        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are
            'input' and 'ignore'. If the parameter is undefined, all columns
            are considered 'input' columns.
        col_properties_table : str, optional
            the input table where column properties for the input data frame
            columns are stored. The format of this table is the output format
            of stored procedure nza..COLUMN_PROPERTIES(). If the parameter is
            undefined, the column properties are detected automatically.
            (Remark: colPropertiesTable with "COLROLE" column with value
            'objweight' is unsupported, i.e. same as 'ignore')
            (Remark: colPropertiesTable with "COLWEIGHT" column with value
            '<wgt>' is unsupported, i.e. same as '1')

        Returns
        -------
        IdaDataFrame
            the data frame containing statistics
        """
        return self._grow(in_df, {
            'model': self.model_name,
            'incolumn': q(in_columns),
            'coldeftype': col_def_type,
            'coldefrole': col_def_role,
            'colPropertiesTable': col_properties_table,
            'baseidx': base_index,
            'samplesize': sample_size,
            'talk': talk,
            'nocheck': no_check,
            'edgelabsort': edge_lab_sort,
        })
[docs]
class TreeBayesNetwork2G(TreeBayesNetworkBase):
    def __init__(self, idadb: IdaDataBase, model_name: str) -> None:
        """
        Creates a Tree-shaped Bayesian Network 2G object.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        self.predict_proc = "TBNET2G"

    def grow(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
             talk: str=None, no_check: str=None, edge_lab_sort: str=None,
             col_def_type: str=None, col_def_role: str=None,
             col_properties_table: str=None) -> IdaDataFrame:
        """
        Builds a tree-like Bayesian Network for continuous variables.
        A spanning tree joining all the variables is constructed on the grounds
        of the strongest correlations, giving an overview of the most
        significant interrelations governing the whole set of variables.

        The stored procedure operates with two sets of variables and the
        resulting tree will be bi-partite. The correlations between variables
        within each set will not be calculated. This feature is useful when
        the two sets characterize distinct objects and only links between the
        objects are of interest.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        in_columns : List[str], optional
            list of the input data frame columns with special properties.
            Each column is followed by one or several of the following properties:
            - type: ':nom' (for nominal), ':cont' (for continuous). By default,
              all numerical types are continuous, other types are nominal
            - role: ':id', ':target', ':input', ':ignore'.
              (Remark: ':objweight' is unsupported, i.e. treated like ':ignore').
              (Remark: ':colweight(<wgt>)' is unsupported, i.e. treated like
              ':colweight(1)', which is the same as ':input').
            If the parameter is undefined, all columns of the input table have
            default properties. Note that this procedure only accepts
            continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either
            X or Y to distinguish the two sets of variables.
        base_index : int, optional
            the numeric id to be assigned to the first variable
        talk : str, optional
            if talk=yes then additional information on progress is displayed
        no_check : str, optional
            if nocheck=yes then no exception is thrown when a column in
            <in_columns> does not exist
        edge_lab_sort : str, optional
            if edgelabsort=yes then the left end of an edge has a name lower
            in alphabetic order than the right one
        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are
            'nom' and 'cont'. If the parameter is undefined, all numeric columns
            are considered continuous, other columns nominal.
        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are
            'input' and 'ignore'. If the parameter is undefined, all columns
            are considered 'input' columns.
        col_properties_table : str, optional
            the input table where column properties for the input data frame
            columns are stored. The format of this table is the output format
            of stored procedure nza..COLUMN_PROPERTIES(). If the parameter is
            undefined, the column properties are detected automatically.
            (Remark: colPropertiesTable with "COLROLE" column with value
            'objweight' is unsupported, i.e. same as 'ignore')
            (Remark: colPropertiesTable with "COLWEIGHT" column with value
            '<wgt>' is unsupported, i.e. same as '1')

        Returns
        -------
        IdaDataFrame
            the data frame containing statistics
        """
        return self._grow(in_df, {
            'model': self.model_name,
            'incolumn': q(in_columns),
            'coldeftype': col_def_type,
            'coldefrole': col_def_role,
            'colPropertiesTable': col_properties_table,
            'baseidx': base_index,
            'talk': talk,
            'nocheck': no_check,
            'edgelabsort': edge_lab_sort,
        })
[docs]
class TreeBayesNetwork1G2P(TreeBayesNetworkBase):
    def __init__(self, idadb: IdaDataBase, model_name: str) -> None:
        """
        Creates a Tree-shaped Bayesian Network 1G2P object.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector
        model_name : str
            model name
        """
        super().__init__(idadb, model_name)
        self.predict_proc = "TBNET1G2P"

    def grow(self, in_df: IdaDataFrame, in_columns: List[str]=None, base_index: int=777,
             talk: str=None, no_check: str=None, edge_lab_sort: str=None,
             col_def_type: str=None, col_def_role: str=None,
             col_properties_table: str=None) -> IdaDataFrame:
        """
        Builds a tree-like Bayesian Network for continuous variables.
        A spanning tree joining all the variables is constructed on the grounds
        of the strongest correlations, giving an overview of the most
        significant interrelations governing the whole set of variables.

        The stored procedure constructs the tree in an incremental manner.
        It calculates correlations on one set of variables, then on the other
        set of variables, then between variables of the two sets. The final
        model is obtained by joining the three sub-models.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame
        in_columns : List[str], optional
            list of the input data frame columns with special properties.
            Each column is followed by one or several of the following properties:
            - type: ':nom' (for nominal), ':cont' (for continuous). By default,
              all numerical types are continuous, other types are nominal
            - role: ':id', ':target', ':input', ':ignore'.
              (Remark: ':objweight' is unsupported, i.e. treated like ':ignore').
              (Remark: ':colweight(<wgt>)' is unsupported, i.e. treated like
              ':colweight(1)', which is the same as ':input').
            If the parameter is undefined, all columns of the input table have
            default properties. Note that this procedure only accepts
            continuous columns with role 'input'.
            Additionally, each column is followed by a colon (:) and either
            X or Y to distinguish the two sets of variables.
        base_index : int, optional
            the numeric id to be assigned to the first variable
        talk : str, optional
            if talk=yes then additional information on progress is displayed
        no_check : str, optional
            if nocheck=yes then no exception is thrown when a column in
            <in_columns> does not exist
        edge_lab_sort : str, optional
            if edgelabsort=yes then the left end of an edge has a name lower
            in alphabetic order than the right one
        col_def_type : str, optional
            default type of the input data frame columns. Allowed values are
            'nom' and 'cont'. If the parameter is undefined, all numeric columns
            are considered continuous, other columns nominal.
        col_def_role : str, optional
            default role of the input data frame columns. Allowed values are
            'input' and 'ignore'. If the parameter is undefined, all columns
            are considered 'input' columns.
        col_properties_table : str, optional
            the input table where column properties for the input data frame
            columns are stored. The format of this table is the output format
            of stored procedure nza..COLUMN_PROPERTIES(). If the parameter is
            undefined, the column properties are detected automatically.
            (Remark: colPropertiesTable with "COLROLE" column with value
            'objweight' is unsupported, i.e. same as 'ignore')
            (Remark: colPropertiesTable with "COLWEIGHT" column with value
            '<wgt>' is unsupported, i.e. same as '1')

        Returns
        -------
        IdaDataFrame
            the data frame containing statistics
        """
        return self._grow(in_df, {
            'model': self.model_name,
            'incolumn': q(in_columns),
            'coldeftype': col_def_type,
            'coldefrole': col_def_role,
            'colPropertiesTable': col_properties_table,
            'baseidx': base_index,
            'talk': talk,
            'nocheck': no_check,
            'edgelabsort': edge_lab_sort,
        })