# Source code for nzpyida.analytics.predictive.two_step_clustering

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
TwoStep clustering is a data mining algorithm for large data sets. It is faster than 
traditional methods because it typically scans a data set only once before it saves 
the data to a clustering feature (CF) tree. TwoStep clustering can make clustering 
decisions without repeated data scans, whereas other clustering methods scan all 
data points, which requires multiple iterations. Non-uniform points are not gathered, 
so each iteration requires a reinspection of each data point, regardless of the 
significance of the data point. Because TwoStep clustering treats dense areas 
as a single unit and ignores pattern outliers, it provides high-quality clustering 
results without exceeding memory constraints.

The TwoStep algorithm has the following advantages:
- It automatically determines the optimal number of clusters. You do not have to 
manually create a different clustering model for each number of clusters.
- It detects input columns that are not useful for the clustering process. 
These columns are automatically set to supplementary. Statistics are gathered 
for these columns but they do not influence the clustering algorithm.
- The configuration of the CF tree can be granular, so that you can balance between 
memory usage and model quality, according to the environment and needs.
"""

from typing import List
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from nzpyida.analytics.utils import map_to_props, make_temp_table_name
from nzpyida.analytics.utils import get_auto_delete_context
from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling
from nzpyida.analytics.utils import q



class TwoStepClustering(PredictiveModeling):
    """
    TwoStep Clustering.

    The model is built with fit() (procedure 'TWOSTEP'), clusters are assigned
    to records with predict() (procedure 'PREDICT_TWOSTEP') and the model is
    evaluated with score() (procedure 'MSE'). The actual calls are issued by
    the PredictiveModeling base class through _fit(), _predict() and _score().
    """

    def __init__(self, idadb: IdaDataBase, model_name: str):
        """
        Creates the clusterer class.

        Parameters
        ----------
        idadb : IdaDataBase
            database connector

        model_name : str
            model name - if it exists in the database, it will be used, otherwise
            it must be trained using fit() function before prediction or scoring is called.
        """

        super().__init__(idadb, model_name)

        # Procedure names used for training, prediction and scoring.
        self.fit_proc = 'TWOSTEP'
        self.predict_proc = 'PREDICT_TWOSTEP'
        self.score_proc = 'MSE'

        # Names of the result columns, normalized via idadb.to_def_case().
        self.target_column_in_output = idadb.to_def_case('CLUSTER_ID')
        self.id_column_in_output = idadb.to_def_case('ID')

        # Flag read by the base class; presumably signals that a print
        # procedure exists for this model type - confirm in PredictiveModeling.
        self.has_print_proc = True

    def fit(self, in_df: IdaDataFrame, id_column: str = None, target_column: str = None,
            in_columns: List[str] = None, col_def_type: str = None, col_def_role: str = None,
            col_properties_table: str = None, out_table: str = None, k: int = 0,
            max_k: int = 20, bins: int = 10, statistics: str = None, rand_seed: int = 12345,
            distance: str = 'loglikelihood', distance_threshold: float = None,
            distance_threshold_factor: float = 2.0, epsilon: float = 0.0,
            node_capacity: int = 6, leaf_capacity: int = 8, max_leaves: int = 1000,
            outlier_fraction: float = 0.0) -> IdaDataFrame:
        """
        Builds a TwoStep Clustering model that first distributes the input data into
        a hierarchical tree structure according to the distance between the data records,
        then reduces the tree into k clusters. A second pass over the data associates
        the input data records to the next cluster.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        id_column : str, optional
            the input table column identifying a unique instance id

        target_column : str, optional
            the input table column representing a class or a value to predict,
            this column is ignored by the TwoStep Clustering algorithm

        in_columns : List[str], optional
            the list of input table columns with special properties.
            Each column is followed by one or several of the following properties:
                its type: ':nom' (for nominal), ':cont' (for continuous).
                Per default, all numerical types are continuous, other types are nominal.
                its role: ':id', ':target', ':input', ':ignore'.
            (Remark: ':objweight' is unsupported, i.e. ':objweight' same as ':ignore').
            (Remark: ':colweight(<wgt>)' is unsupported, i.e. ':colweight(<wgt>)' same
            as ':colweight(1)' same as ':input').
            If the parameter is undefined, all columns of the input table have default properties.

        col_def_type : str, optional
            default type of the input table columns. Allowed values are 'nom' and 'cont'.
            If the parameter is undefined, all numeric columns are considered continuous,
            other columns nominal.

        col_def_role : str, optional
            default role of the input table columns.
            Allowed values are 'input' and 'ignore'.
            If the parameter is undefined, all columns are considered 'input' columns.

        col_properties_table : str, optional
            the input table where column properties for the input table columns are stored.
            The format of this table is the output format of stored procedure
            nza..COLUMN_PROPERTIES().
            If the parameter is undefined, the input table column properties will be
            detected automatically.
            (Remark: colPropertiesTable with "COLROLE" column with value 'objweight'
            is unsupported, i.e. same as 'ignore')
            (Remark: colPropertiesTable with "COLWEIGHT" column with value '<wgt>' is unsupported,
            i.e. same as '1')

        out_table : str, optional
            the output table where clusters are assigned to each input table record.
            If it is not given, a temporary table name is generated and the table is
            registered with the auto-delete context obtained from
            get_auto_delete_context()

        k : int, optional
            the number of clusters. If k is 0 or less, the procedure determines
            the optimal number of clusters

        max_k : int, optional
            the maximum number of clusters that can be determined automatically.
            If k is bigger than 0, this parameter is ignored

        bins : int, optional
            the average number of bins for numerical statistics with more than <n> values

        statistics : str, optional
            flags indicating which statistics to collect. Allowed values are: none,
            columns, values:n, all.
            Regardless of the value of the parameter statistics, all statistics are
            gathered since they are needed to call PREDICT_TWOSTEP on this model.
            If statistics=none or statistics=columns, the importance of the attributes
            is not calculated. If statistics=none, statistics=columns or statistics=all,
            up to 100 discrete values are gathered.
            If statistics=values:n with n a positive number, up to <n> column value
            statistics are collected:
                - If a nominal column contains more than <n> values, only the <n> most
                frequent column statistics are kept.
                - If a numeric column contains more than <n> values, the values will
                be discretized and the statistics will be collected on the discretized values.
            Indicating statistics=all is equal to statistics=values:100.

        rand_seed : int, optional
            the random generator seed

        distance : str, optional
            the distance function. Allowed values are: euclidean, norm_euclidean, loglikelihood

        distance_threshold : float, optional
            the threshold under which 2 data records can be merged into one cluster during
            the first pass. If not set, the distance threshold is calculated automatically

        distance_threshold_factor : float, optional
            the factor used to calculate the distance threshold automatically.
            The distance threshold is then the median distance value minus
            distance_threshold_factor times the interquartile distance
            (or the minimum distance if this value is below it). If distance_threshold is set,
            this parameter is ignored

        epsilon : float, optional
            the value to be used as global variance of all continuous fields for the loglikelihood
            distance. If the value is 0.0 or less, the global variance is calculated for each
            continuous field. If distance is not loglikelihood, this parameter is ignored

        node_capacity : int, optional
            the branching factor of the internal tree used in pass 1. Each node can have up to
            node_capacity subnodes

        leaf_capacity : int, optional
            the number of clusters per leaf node in the internal tree used in pass 1

        max_leaves : int, optional
            the maximum number of leaf nodes in the internal tree used in pass 1.
            When the tree contains max_leaves leaf nodes, the following data records are
            aggregated into the existing clusters

        outlier_fraction : float, optional
            the fraction of the records to be considered as outlier in the internal
            tree used in pass 1. Clusters containing less than outlier_fraction times
            the mean number of data records per cluster are removed

        Returns
        -------
        IdaDataFrame
            the data frame containing row identifiers, cluster_id and distance to cluster center
        """

        # Without an explicit out_table the result is written to a generated
        # temporary table, which is handed to the auto-delete context once the
        # model has been built.
        auto_delete_context = None
        if not out_table:
            auto_delete_context = get_auto_delete_context('out_table')
            out_table = make_temp_table_name()

        # Keys are the parameter names passed on to self._fit(); column
        # identifiers go through q() first (presumably identifier quoting -
        # see nzpyida.analytics.utils).
        params = {
            'id': q(id_column),
            'target': q(target_column),
            'incolumn': q(in_columns),
            'coldeftype': col_def_type,
            'coldefrole': col_def_role,
            'colpropertiestable': col_properties_table,
            'k': k,
            'maxk': max_k,
            'bins': bins,
            'statistics': statistics,
            'randseed': rand_seed,
            'distance': distance,
            'distancethreshold': distance_threshold,
            'distancethresholdfactor': distance_threshold_factor,
            'epsilon': epsilon,
            'nodecapacity': node_capacity,
            'leafcapacity': leaf_capacity,
            'maxleaves': max_leaves,
            'outlierfraction': outlier_fraction,
            'outtable': out_table
        }

        self._fit(in_df=in_df, params=params)

        # Register the temporary table only after _fit() returned.
        if auto_delete_context:
            auto_delete_context.add_table_to_delete(out_table)

        return IdaDataFrame(self.idadb, out_table)

    def predict(self, in_df: IdaDataFrame, out_table: str = None,
                id_column: str = None) -> IdaDataFrame:
        """
        Makes predictions based on this model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame

        out_table : str, optional
            the output table where the assigned clusters will be stored

        id_column : str, optional
            the input table column identifying a unique instance id
            Default: id column used to build the model

        Returns
        -------
        IdaDataFrame
            the data frame containing row identifiers and the assigned clusters
        """

        params = {
            'id': q(id_column)
        }

        # The base class performs the call and the out_table handling.
        return self._predict(in_df=in_df, params=params, out_table=out_table)

    def score(self, in_df: IdaDataFrame, target_column: str, id_column: str = None) -> float:
        """
        Scores the model. The model must exist.

        Parameters
        ----------
        in_df : IdaDataFrame
            the input data frame for scoring

        target_column : str
            the input table column representing the class

        id_column : str, optional
            the input table column identifying a unique instance id - if skipped,
            the input data frame indexer must be set and will be used as an instance id

        Returns
        -------
        float
            the model score
        """

        # These parameters are forwarded to the prediction step of _score().
        params = {
            'id': q(id_column)
        }

        return self._score(in_df=in_df, predict_params=params, target_column=target_column)