Source code for nzpyida.groupby

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------
"""
IdaDataFrameGroupBy
"""
import pandas as pd
import nzpyida


[docs]
class IdaDataFrameGroupBy(object):
    """
    Class representing groupby object, that is object created from IdaDataFrame.
    It implements Pandas-like object for grouping.

    Examples
    ---------
    >>> ida_iris.groupby("CLASS")
    <nzpyida.groupby.IdaDataFrameGroupBy at 0x11c288e10>
        
    >>> df.groupby("CLASS").mean().head()
            AVG_SEPAL_LENGTH 	AVG_SEPAL_WIDTH 	AVG_PETAL_LENGTH 	AVG_PETAL_WIDTH 	CLASS
        0 	5.006 	            3.418 	            1.464 	            0.244 	            Iris-setosa
        1 	5.936 	            2.770 	            4.260 	            1.326 	            Iris-versicolor
        2 	6.588 	            2.974 	            5.552 	            2.026 	            Iris-virginica

    >>> mean_sepal_length = ida_iris[["SEPAL_LENGTH", "CLASS"]].groupby("CLASS").mean()
    >>> mean_sepal_length.head()
            AVG_SEPAL_LENGTH 	CLASS
        0 	5.006 	            Iris-setosa
        1 	5.936 	            Iris-versicolor
        2 	6.588 	            Iris-virginica
        
    >>> iris_groupby = ida_iris[(ida_iris["SEPAL_LENGTH"]>4) & (ida_iris["SEPAL_WIDTH"]<3)][
    >>>     ["SEPAL_WIDTH", "SEPAL_LENGTH", "CLASS"]].groupby("CLASS")
    >>> iris_groupby
    <nzpyida.groupby.IdaDataFrameGroupBy object at 0x119863190>
    >>> iris_groupby.count().head()
            COUNT_SEPAL_WIDTH 	COUNT_SEPAL_LENGTH 	CLASS
        0 	2 	                2 	                Iris-setosa
        1 	21 	                21 	                Iris-virginica
        2 	34 	                34 	                Iris-versicolor
    """
    def __init__(self, in_df, columns_to_aggregate, by_column) -> None:
        """
        Parameters
        -----------
        in_df : IdaDataFrame
            DataFrame is to be grouped

        columns_to_aggregate : List[str]
            List of columns that should be run by with aggregate functions

        by_column : str
            Column to group the in_df by 
        """
        self.in_df = in_df
        self.columns_to_aggregate = columns_to_aggregate
        self.by_column = by_column
        self.name = in_df.internal_state.get_state()
    

[docs]
    def count(self):
        """
        Aggregates all columns with COUNT method. 
        Counts all non null values in column with respect to grouped classes

        Returns
        -------
        IdaDataFrame
            Object with grouped data
        """     
        self.aggregation_method = "COUNT"
        return self._execute_query()

    

[docs]
    def sum(self):
        """
        Aggregates all columns with SUM method.
        Sums all the values in column with respect to grouped classes

        Returns
        -------
        IdaDataFrame
            Object with grouped data
        """   
        self.aggregation_method = "SUM"
        return self._execute_query()



[docs]
    def min(self):
        """
        Aggregates all columns with MIN method.
        Selects the minimum value from column with respect to grouped classes

        Returns
        -------
        IdaDataFrame
            Object with grouped data
        """   
        self.aggregation_method = "MIN"
        return self._execute_query()

    

[docs]
    def max(self):
        """
        Aggregates all columns with MAX method.
        Selects the maximum value from column with respect to grouped classes

        Returns
        -------
        IdaDataFrame
            Object with grouped data
        """   
        self.aggregation_method = "MAX"
        return self._execute_query()

    

[docs]
    def mean(self):
        """
        Aggregates all columns with AVG method.
        Calculates mean of values in column with respect to grouped classes

        Returns
        -------
        IdaDataFrame
            Object with grouped data
        """   
        self.aggregation_method = "AVG"
        return self._execute_query()

    
    def _execute_query(self):
        select_string = f'{self.aggregation_method}(' + \
            f'), {self.aggregation_method}('.join(self.columns_to_aggregate) + \
                ')'
        for col in self.columns_to_aggregate:
            select_string = select_string.replace(
                f'{self.aggregation_method}({col})', 
                f'{self.aggregation_method}(\"{col}\") AS \"{self.aggregation_method}_{col}\"')
            
        groupby_string = f'GROUP BY \"{self.by_column}\"'
        query = 'SELECT ' + select_string + f', \"{self.by_column}\"' + \
            f' FROM ({self.name}) AS TEMP_GB ' + groupby_string

        idadf=nzpyida.IdaGeoDataFrame(self.in_df._idadb, self.in_df.tablename)
        idadf.internal_state._views.append(query)
        return idadf