Source code for nzpyida.analytics.predictive.two_step_clustering

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
TwoStep clustering is a data mining algorithm for large data sets. It is faster than 
traditional methods because it typically scans a data set only once before it saves 
the data to a clustering feature (CF) tree. TwoStep clustering can make clustering 
decisions without repeated data scans, whereas other clustering methods scan all 
data points, which requires multiple iterations. Non- uniform points are not gathered, 
so each iteration requires a reinspection of each data point, regardless of the 
significance of the data point. Because TwoStep clustering treats dense areas 
as a single unit and ignores pattern outliers, it provides high-quality clustering 
results without exceeding memory constraints.

The TwoStep algorithm has the following advantages:
- It automatically determines the optimal number of clusters. You do not have to 
manually create a different clustering model for each number of clusters.
- It detects input columns that are not useful for the clustering process. 
These columns are automatically set to supplementary. Statistics are gathered 
for these columns but they do not influence the clustering algorithm.
- The configuration of the CF tree can be granular, so that you can balance between 
memory usage and model quality, according to the environment and needs.
"""

from typing import List
from nzpyida.frame import IdaDataFrame
from nzpyida.base import IdaDataBase
from nzpyida.analytics.utils import map_to_props, make_temp_table_name
from nzpyida.analytics.utils import get_auto_delete_context
from nzpyida.analytics.predictive.predictive_modeling import PredictiveModeling
from nzpyida.analytics.utils import q


[docs] class TwoStepClustering(PredictiveModeling): """ Divisive Clustering """ def __init__(self, idadb: IdaDataBase, model_name: str): """ Creates the clusterer class. Parameters ---------- idada : IdaDataBase database connector model_name : str model name - if it exists in the database, it will be used, otherwise it must be trained using fit() function before prediction or scoring is called. """ super().__init__(idadb, model_name) self.fit_proc = 'TWOSTEP' self.predict_proc = 'PREDICT_TWOSTEP' self.score_proc = 'MSE' self.target_column_in_output = idadb.to_def_case('CLUSTER_ID') self.id_column_in_output = idadb.to_def_case('ID') self.has_print_proc = True
[docs] def fit(self, in_df: IdaDataFrame, id_column: str=None, target_column: str=None, in_columns: List[str]=None, col_def_type: str=None, col_def_role: str=None, col_properties_table: str=None, out_table: str=None, k: int=0, max_k: int=20, bins: int=10, statistics: str=None, rand_seed: int=12345, distance: str='loglikelihood', distance_threshold: float=None, distance_threshold_factor: float=2.0, epsilon: float=0.0, node_capacity: int=6, leaf_capacity: int=8, max_leaves: int=1000, outlier_fraction: float=0.0) -> IdaDataFrame: """ Builds a TwoStep Clustering model that first distributes the input data into a hierarchical tree structure according to the distance between the data records, then reduces the tree into k clusters. A second pass over the data associates the input data records to the next cluster. Parameters ---------- in_df : IdaDataFrame the input data frame id_column : str, optional the input table column identifying a unique instance id target_column : str, optional the input table column representing a class or a value to predict, this column is ignored by the TwoStep Clustering algorithm in_columns : List[str], optional the list of input table columns with special properties. Each column is followed by one or several of the following properties: its type: ':nom' (for nominal), ':cont' (for continuous). Per default, all numerical types are con-tinuous, other types are nominal. its role: ':id', ':target', ':input', ':ignore'. (Remark: ':objweight' is unsupported, i.e. ':objweight' same as ':ignore'). (Remark: ':colweight(<wgt>)' is unsupported, i.e. ':colweight(<wgt>)' same as ':colweight(1)' same as ':input'). If the parameter is undefined, all columns of the input table have default properties. col_def_type : str, optional default type of the input table columns. Allowed values are 'nom' and 'cont'. If the parameter is undefined, all numeric columns are considered continuous, other columns nominal. col_def_role : str, optional default role of the input table columns. Allowed values are 'input' and 'ignore'. If the parameter is undefined, all columns are considered 'input' columns. col_properties_table : str, optional the input table where column properties for the input table columns are stored. The format of this table is the output format of stored procedure nza..COLUMN_PROPERTIES(). If the parameter is undefined, the input table column properties will be detected automatically. (Remark: colPropertiesTable with "COLROLE" column with value 'objweight' is unsupported, i.e. same as 'ignore') (Remark: colPropertiesTable with "COLWEIGHT" column with value '<wgt>' is unsupported, i.e. same as '1') out_table : str, optional the output table where clusters are assigned to each input table record k : int, optional the number of clusters. If k is 0 or less, the procedure determines the optimal number of clusters max_k : int, optional the maximum number of clusters that can be determined automatically. If k is bigger than 0, this parameter is ignored bins : int, optional the average number of bins for numerical statistics with more than <n> values statistics : str, optional flags indicating which statistics to collect. Allowed values are: none, columns, values:n, all. Regardless of the value of the parameter statistics, all statistics are gathered since they are needed to call PREDICT_TWOSTEP on this model. If statistics=none or statistics=columns, the importance of the attributes is not calculated. If statistics=none, statistics=columns or statistics=all, up to 100 discrete values are gathered. If statistics=values:n with n a positive number, up to <n> column value statistics are collected: - If a nominal column contains more than <n> values, only the <n> most frequent column stat-istics are kept. - If a numeric column contains more than <n> values, the values will be discretized and the stat-istics will be collected on the discretized values. Indicating statistics=all is equal to statistics=values:100. rand_seed : int, optional the random generator seed distance : str, optional the distance function. Allowed values are: euclidean, norm_euclidean, loglikelihood distance_threshold : float, optional the threshold under which 2 data records can be merged into one cluster during the first pass. If not set, the distance threshold is calculated automatically distance_threshold_factor : float, optional the factor used to calculate the distance threshold automatically. The distance threshold is then the median distance value minus distance_threshold_factor times the interquartile distance (or the minimum distance if this value is below it). If distance_threshold is set, this parameter is ignored epsilon : float, optional the value to be used as global variance of all continuous fields for the loglikelihood distance. If the value is 0.0 or less, the global variance is calculated for each continuous field. If distance is not loglikelihood, this parameter is ignored node_capacity : int, optional the branching factor of the internal tree used in pass 1. Each node can have up to node_capacity subnodes leaf_capacity : int, optional the number of clusters per leaf node in the internal tree used in pass 1 max_leaves : int, optional the maximum number of leaf nodes in the internal tree used in pass 1. When the tree contains maxleaves leaf nodes, the following data records are aggregated into the existing clusters outlier_fraction : float, optional the fraction of the records to be considered as outlier in the internal tree used in pass 1. Clusters containing less than outlierfraction times the mean number of data records per cluster are removed Returns ------- IdaDataFrame the data frame containing row identifiers, cluster_id and distance to cluster center """ auto_delete_context = None if not out_table: auto_delete_context = get_auto_delete_context('out_table') out_table = make_temp_table_name() params = { 'id': q(id_column), 'target': q(target_column), 'incolumn': q(in_columns), 'coldeftype': col_def_type, 'coldefrole': col_def_role, 'colpropertiestable': col_properties_table, 'k': k, 'maxk': max_k, 'bins': bins, 'statistics': statistics, 'randseed': rand_seed, 'distance': distance, 'distancethreshold': distance_threshold, 'distancethresholdfactor': distance_threshold_factor, 'epsilon': epsilon, 'nodecapacity': node_capacity, 'leafcapacity': leaf_capacity, 'maxleaves': max_leaves, 'outlierfraction': outlier_fraction, 'outtable': out_table } self._fit(in_df=in_df, params=params) if auto_delete_context: auto_delete_context.add_table_to_delete(out_table) return IdaDataFrame(self.idadb, out_table)
[docs] def predict(self, in_df: IdaDataFrame, out_table: str=None, id_column: str=None) -> IdaDataFrame: """ Makes predictions based on this model. The model must exist. Parameters ---------- in_df : IdaDataFrame the input data frame out_table : str, optional the output table where the assigned clusters will be stored id_column : str, optional the input table column identifying a unique instance id Default: id column used to build the model Returns ------- IdaDataFrame the data frame containing row identifiers and predicted target values """ params = { 'id': q(id_column) } return self._predict(in_df=in_df, params=params, out_table=out_table)
[docs] def score(self, in_df: IdaDataFrame, target_column: str, id_column: str=None) -> float: """ Scores the model. The model must exist. Parameters ---------- in_df : IdaDataFrame the input data frame for scoring target_column : str the input table column representing the class id_column : str, optional the input table column identifying a unique instance id - if skipped, the input data frame indexer must be set and will be used as an instance id Returns ------- float the model score """ params = { 'id': q(id_column) } return self._score(in_df=in_df, predict_params=params, target_column=target_column)