Source code for nzpyida.frame

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------
# Copyright (c) 2015-2023, IBM Corp.
# All rights reserved.
#
# Distributed under the terms of the BSD Simplified License.
#
# The full license is in the LICENSE file, distributed with this software.
#-----------------------------------------------------------------------------

"""
idaDataFrame
---------
An efficient 2D container that looks like a panda's DataFrame and behave the
same, the only difference is that it uses only a reference to a remote database
instead of having the data loaded into memory

Also similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information
"""
import sys
import os
from copy import deepcopy
import warnings
from numbers import Number
from collections import OrderedDict
from typing import List

import numpy as np
import pandas as pd
from pandas import Index, RangeIndex

from lazy import lazy
import six

import nzpyida
import nzpyida.statistics
import nzpyida.indexing
import nzpyida.aggregation
import nzpyida.filtering
import nzpyida.utils
from nzpyida.groupby import IdaDataFrameGroupBy

from nzpyida.utils import timed, chunklist
from nzpyida.internals import InternalState
from nzpyida.exceptions import IdaDataFrameError
from nzpyida.internals import idadf_state



[docs]
class IdaDataFrame(object):
    """
    An IdaDataFrame object is a reference to a table in a remote Db2 Warehouse
    database. IDA stands for In-DataBase Analytics. IdaDataFrame copies the
    Pandas interface for DataFrame objects to ensure intuitive interaction for
    end-users.

    Examples
    --------
    >>> idadb = IdaDataBase('BLUDB') # See documentation for IdaDataBase
    >>> ida_iris = IdaDataFrame(idadb, 'IRIS')
    >>> ida_iris.cov()
                      sepal_length  sepal_width  petal_length  petal_width
    sepal_length      0.685694    -0.042434      1.274315     0.516271
    sepal_width      -0.042434     0.189979     -0.329656    -0.121639
    petal_length      1.274315    -0.329656      3.116278     1.295609
    petal_width       0.516271    -0.121639      1.295609     0.581006
    """

    # TODO: Check if everything is ok when selecting AND projecting with loc
    # TODO: BUG: Filtering after selection/projection


[docs]
    def __init__(self, idadb, tablename, indexer = None):
        """
        Constructor for IdaDataFrame objects.

        Parameters
        ----------
        idadb : IdaDataBase
            IdaDataBase instance which contains the connection to be used.
        tablename : str
            Name of the table to be opened in the database.
            It should contain only alphanumeric characters and underscores.
            All lower case characters will be converted to upper case characters.
        indexer : str, optional
            Name of the column that should be used as an index. This is
            optional. However, if no indexer is given, the order of rows issued
            by the head and tail functions is not guaranteed. Also, several
            in-database machine learning algorithms need an indexer as a
            parameter to be executed.

        Attributes
        ----------
        _idadb : IdaDataBase
            IdaDataBase object parent of the current instance.
        tablename : str
            Name of the table self references.
        name : str
            Full name of the table self references, including schema.
        schema : str
            Name of the schema the table belongs to.
        indexer : str
            Name of the column used as an index. "None" if no indexer.
        loc : str
            Indexer that enables the selection and projection of IdaDataFrame
            instances. For more information, see the loc class documentation.
        internal_state : InternalState
            Object used to internally store the state of the IdaDataFrame. It
            also allows several non-destructive manipulation methods.
        type : str
            Type of the IdaDataFrame : “Table”, “View”, or “Unknown”.
        dtypes : DataFrame
            Data type in the database for each column.
        index : pandas.core.index
            Index containing the row names in this table.
        columns : pandas.core.index
            Index containing the columns names in this table.
        axes : list
            List containing columns and index attributes.
        shape : Tuple
            Number of rows and number of columns.

        Notes
        -----
        Attributes "type", "dtypes", "index", "columns", "axes", and "shape"
        are evaluated in a lazy way to avoid an overhead when creating an
        IdaDataFrame. Sometimes the index may be too big to be downloaded.

        Examples
        --------
        >>> idadb = IdaDataBase('BLUDB')
        >>> ida_iris = IdaDataFrame(idadb, "IRIS")
        """
        #TODO: Implement equality comparision between two IdaDataFrames

        if not idadb.__class__.__name__ == "IdaDataBase":
            idadb_class = idadb.__class__.__name__
            raise TypeError("Argument 'idadb' is of type %s, expected : IdaDataBase"%idadb_class)
        tablename = nzpyida.utils.check_tablename(tablename, idadb._upper_cased)

        #idadb._reset_attributes("cache_show_tables")

        # TODO: Test what kind of error append when a table in use and cached
        # is suddently deleted

        if idadb.exists_table_or_view(tablename) is False:
            # Try again after refreshing the cache
            idadb._reset_attributes("cache_show_tables")
            # Refresh the show table cache in parent IdaDataBase, because a table
            # could have been created by other mean / user and we have to make sure
            # the lookup is done and is actual.
            if idadb.exists_table_or_view(tablename) is False:
                raise NameError("Table %s does not exist in the database %s."
                                %(tablename, idadb.data_source_name))


        self._idadb = idadb
        self._indexer = indexer

        # Initialise indexer object
        self.loc = nzpyida.indexing.Loc(self)

        if "." in tablename:
            self.schema = tablename.split('.')[0]
            #self.name = tablename.split('.')[-1]
            self._name = tablename
            self.tablename = tablename.split('.')[-1]
        else:
            self.schema = idadb.current_schema
            self._name = idadb.current_schema + '.' + tablename
            self.tablename = tablename

        # self._name is the original name, this is a "final" variable

        # Push a reference to itself in its parent IdaDataBase
        self._idadb._idadfs.append(self)
        # TODO : self.size

        # A cache for unique value of each column
        self._unique = dict()

        # Variable for storing original columns names
        self._org_columns_names = None


###############################################################################
### Attributes & Metadata computation
###############################################################################


[docs]
    @lazy
    def internal_state(self):
        """
         InternalState instances manage the state of an IdaDataFrame instance
         and allow several non-destructive data manipulation methods, such as
         the selection, projection, filtering, and aggregation of columns.

        """
        return InternalState(self)



    @property
    @idadf_state
    def name(self):
        return self.internal_state.current_state


    @property
    def indexer(self):
        """
        The indexer attribute refers to the name of a column that should be
        used to index the table. This makes sense because Db2 Warehouse is a
        column-based database, so row IDs do not make sense and are not
        deterministic. As a consequence, the only way to address a particular
        row is to refer to it by its index. If no indexer is provided, ibmdbpy
        still works but a correct row order is not guaranteed as far as the
        dataset is not sorted. Also, note that the indexer column is not taken
        into account in data mining algorithms.
        """
        if hasattr(self, "_indexer"):
            return self._indexer
        else:
            None    

        
    @indexer.setter
    def indexer(self, value):
        """
        Basic checks for indexer :
        * The column exists in the table.
        * All values are unique.
        """
        if value is None:
            return

        if value not in self.columns:
            raise IdaDataFrameError("'%s' cannot be used as indexer "%value +
                                    " because this is not a column in '%s'"%self._name)

        del self.columns
        #count = self[value].count_distinct() ## TODO: TO FIX, should return directly just a number
        count = self.levels(value)
        if count < self.shape[0]:
            raise IdaDataFrameError("'%s' cannot be used as indexer "%value +
                                    " because it contains non unique values.")
        self._indexer = value



[docs]
    @lazy
    def type(self):
        """
        Type of self: 'Table', 'View'  or 'Unknown'.

        Returns
        -------
        str
            idaDataFrame type.

        Examples
        --------
        >>> ida_iris.type
        'Table'
        """
        return self._get_type()



[docs]
    @lazy
    @idadf_state(force = True)
    def dtypes(self):
        """
        Data type in database for each column in self.

        Returns
        -------
        DataFrame
            In-Database type for each columns.

        Examples
        --------
        >>> ida_iris.dtypes
                     TYPENAME
        sepal_length   DOUBLE
        sepal_width    DOUBLE
        petal_length   DOUBLE
        petal_width    DOUBLE
        species       VARCHAR
        """
        #import pdb ; pdb.set_trace()
        return self._get_columns_dtypes()



[docs]
    @lazy
    @idadf_state
    # to deprecate
    def index(self):
        """
        Index containing the row names in self.

        Returns
        -------
        Index

        Examples
        --------
        >>> ida_iris.index
        Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            140, 141, 142, 143, 144, 145, 146, 147, 148, 149],
           dtype='int64', length=150)

        Notes
        -----
        Because indexes in a database can be only numeric, it is not that
        interesting for an IdaDataFrame but can still be useful sometimes. The
        function can break if the table is too large. Ask for the user’s
        approval before downloading an index which has more than 10000 values.
        """
        return self._get_index()


    @lazy
    def get_columns(self):
        if hasattr(self, "internal_state"):
            self.internal_state._create_view()
            cols = self._get_columns()
            self.internal_state._delete_view()
            return cols
        else:
            return self._get_columns()

    @property
    def columns(self):
        """
        Index containing the column names in self.

        Returns
        -------
        Index

        Examples
        --------
        >>> ida_iris.columns
        Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
        'species'],
        dtype='object')
        """
        if not self._org_columns_names:
            self._org_columns_names = list(self.get_columns)
        return self.get_columns
        
    
    @columns.setter
    def columns(self, new_names):
        newColumndict = {new_names[i]: list(self.internal_state.columndict.values())[i] for i in range(len(self.columns))}
        self._reset_attributes(["get_columns", "dtypes"])
        self.internal_state.columndict = newColumndict
        self.internal_state.update()


    @columns.deleter
    def columns(self):
        self._reset_attributes(['get_columns'])

    @lazy
    def org_columns_names(self):
        if not self._org_columns_names:
            return list(self.get_columns)
        return self._org_columns_names


[docs]
    @lazy
    @idadf_state
    # to deprecate (no index)
    def axes(self):
        """
        List containing IdaDataFrame.columns and IdaDataFrame.index attributes.

        Returns
        -------
        list
            List containing two indexes (indexes and column attributes).

        Examples
        --------
        >>> ida_iris.axes
        [Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
             ...
             140, 141, 142, 143, 144, 145, 146, 147, 148, 149],
            dtype='int64', length=150),
        Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
        'species'],
        dtype='object')]
        """
        return [self.index, self.columns]



[docs]
    @lazy
    @idadf_state
    def shape(self):
        """
        Tuple containing number of rows and number of columns.

        Returns
        -------
        tuple

        Examples
        --------
        >>> ida_iris.shape
        (150, 5)
        """
        return self._get_shape()


    @property
    @idadf_state
    def empty(self):
        """
        Boolean that is True if the table is empty (no rows).

        Returns
        -------
        Boolean
        """
        if self.shape[0] == 0:
            return True
        else:
            return False


[docs]
    def __len__(self):
        """
        Number of records.

        Returns
        -------
        int

        Examples
        --------
        >>> len(idadf)
        150
        """
        return self.shape[0]



[docs]
    def __iter__(self):
        """
        Iterate over columns.
        """
        return iter(self.columns)



[docs]
    def __getitem__(self, item):
        """
        Enable label based projection (selection of columns) in IdaDataFrames.

        Enable slice based selection of rows in IdaDataFrames.

        Enable row filtering.

        The syntax is similar to Pandas.

        Examples
        --------

        >>> idadf['col1'] # return an IdaSeries

        >>> idadf[['col1']] # return an IdaDataFrame with one column

        >>> idadf[['col1', 'col2', 'col3']] # return an IdaDataFrame with 3 columns

        >>> idadf[0:9] # Select the 10 first rows

        >>> idadf[idadf['col1'] = "test"]
        # select of rows for which attribute col1 is equal to "test"

        Notes
        -----
        The row order is not guaranteed if no indexer is given and the dataset
        is not sorted
        """

        if isinstance(item, nzpyida.filtering.FilterQuery):
            newidadf = self._clone()
            newidadf.internal_state.update(item)
            newidadf._reset_attributes(["shape"])
        else:
            if isinstance(item, slice):
                return self.loc[item]
            if not (isinstance(item,six.string_types)|isinstance(item, list)):
                raise KeyError(item)
            if isinstance(item, six.string_types):
                # Case when only one column was selected
                if item not in self.columns:
                    raise KeyError(item)
                newidaseries = self._clone_as_serie(item)
                # Form the new columndict
                for column in list(newidaseries.internal_state.columndict):
                    if column != item:
                        del newidaseries.internal_state.columndict[column]
                newColumndict = newidaseries.internal_state.columndict
                
                # Erase attributes
                newidaseries._reset_attributes(["get_columns", "shape", "dtypes"])
                # Set columns and columndict attributes
                newidaseries.internal_state.columns = ["\"%s\""%col for col in item]
                newidaseries.internal_state.columndict = newColumndict
                # Update, i.e. appends an entry to internal_state._cumulative
                newidaseries.internal_state.update()

                # Performance improvement
                # avoid, caused wrong dtypes for the result
                newidaseries.dtypes = self.dtypes.loc[[item]]

                return newidaseries

            # Case of multiple columns
            not_a_column = [x for x in item if x not in self.columns]
            if not_a_column:
                raise KeyError("%s"%not_a_column)

            newidadf = self._clone()

            # Form the new columndict
            newColumndict = OrderedDict()
            for col in item:
                # Column name as key, its definition as value
                newColumndict[col] = self.internal_state.columndict[col]

            # Erase attributes
            newidadf._reset_attributes(["get_columns", "shape", "dtypes"])
            # Set columns and columndict attributes
            newidadf.internal_state.columns = ["\"%s\""%col for col in item]
            newidadf.internal_state.columndict = newColumndict
            # Update, i.e. appends an entry to internal_state._cumulative
            newidadf.internal_state.update()

            # Performance improvement
            # avoid, caused wrong dtypes for the result
            newidadf.dtypes = self.dtypes.loc[item]

        return newidadf



[docs]
    def __setitem__(self, key, item):
        """
        Enable the creation and aggregation of columns.

        Examples
        --------

        >>> idadf['new'] = idadf['sepal_length'] * idadf['sepalwidth']
        # select a new column as the product of two existing columns

        >>> idadf['sepal_length'] = idadf['sepal_length'] / idadf['sepal_length'].mean()
        # modify an existing column
        """
        if not (isinstance(item, IdaDataFrame)):
            raise TypeError("Modifying columns is supported only using "+
                            "IdaDataFrames.")
        if isinstance(key, six.string_types):
            key = [key]
        if len(key) != len(item.columns):
                raise ValueError("Wrong number of items passed %s, placement implies %s"%(len(item.columns),len(key)))
        #form the new columndict
        for newname, oldname in zip(key, item.columns):
            self.internal_state.columndict[newname] = item.internal_state.columndict[oldname]
        newColumndict = self.internal_state.columndict
        #erase attributes
        self._reset_attributes(["get_columns", "shape", "dtypes"])
        #set columns and columndict attributes
        
        self.internal_state.columndict = newColumndict
        self.internal_state.columns = ["\"%s\""%col for col in newColumndict.keys()]
        #update, i.e. appends an entry to internal_state._cumulative
        self.internal_state.update()

        # Flush the "unique" cache
        for column in key:
            if column in self._unique:
                del self._unique[column]



[docs]
    def __delitem__(self, item):
        """
        Enable non-destructive deletion of columns using a Pandas style syntax.
        This happens inplace, which means that the current IdaDataFrame is
        modified.

        Examples
        --------
        >>> idadf = IdaDataFrame(idadb, "IRIS")
        >>> idadf.columns
        Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'], dtype='object')
        >>> del idadf['sepal_length']
        >>> idadf.columns
        Index(['sepal_width', 'petal_length', 'petal_width', 'species'], dtype='object')
        """
        if not (isinstance(item,six.string_types)):
            raise TypeError
        if item not in self.columns:
            raise KeyError(item)

        # Flush the "unique" cache
        if item in self._unique:
            del self._unique[item]

        self._idadb.delete_column(self, item, destructive = False)
        return


    def __enter__(self):
        """
        Allow the object to be used with a "with" statement
        """
        return self

    def __exit__(self):
        """
        Allow the object to be used with a "with" statement. Make sure that
        allow possible views related to the IdaDataFrame with be deleted when
        the object goes out of scope
        """
        while self.internal_state.viewstack:
            try :
                view = self.internal_state.viewstack.pop()
                # Just a last check to make sure not to drop user's db
                if view != self.tablename:
                    drop = "DROP VIEW \"%s\"" %view
                    self._prepare_and_execute(drop, autocommit = True)
            except: pass

    #We decided not to allow columns access idadf.columnname like this for now.
    #We could decide to allow it but for this we may have to switch all
    #existing attributes to private ("_" as first character) so to avoid
    #conflicts between attributes and columns because for example IdaDataFrame
    #has an attribute "name" -> if a column is labelled "name" this will make
    #it unavailable. Pandas do also poorly manage this issue, for example :
    #DataFrame attribute "size".

    #May be implemented in the future

    #def __getattr__(self, name):
    #    """
    #    After regular attribute access, try look up the name
    #    This allows simpler access to columns for interactive use.
    #    """
        # Note: obj.x will always call obj.__getattribute__('x') prior to
        # calling obj.__getattr__('x').
        #if hasattr(self, name):
        #    return object.__getattribute__(self,name)
        #else:
    #    if name not in self.__dict__["columns"]:
            #return self[name]
    #        return object.__getattribute__(self, name)
    #    raise AttributeError("'%s' object has no attribute '%s'" %
    #                         (type(self).__name__, name))

    #def __setattr__(self, name, value):
    #    if name not in self.__dict__["columns"]:
    #        self.__dict__[name] = value
    #    else:
    #        raise ValueError("It is not allowed to set the value of a column in an IdaDataFrame.")


[docs]
    def __lt__(self, value):
        """
        ibmdbpy.filtering.FilterQuery object when comparing self using "<".
        """
        return nzpyida.filtering.FilterQuery(self.columns, self._name, "lt", value)



[docs]
    def __le__(self, value):
        """
        ibmdbpy.filtering.FilterQuery object when comparing self using "<=".
        """
        return nzpyida.filtering.FilterQuery(self.columns, self._name, "le", value)



[docs]
    def __eq__(self, value):
        """
        ibmdbpy.filtering.FilterQuery object when comparing self using "==".
        """
        return nzpyida.filtering.FilterQuery(self.columns, self._name, "eq", value)



[docs]
    def __ne__(self, value):
        """
        ibmdbpy.filtering.FilterQuery object when comparing self using "!=".
        """
        return nzpyida.filtering.FilterQuery(self.columns, self._name, "ne", value)



[docs]
    def __ge__(self, value):
        """
        ibmdbpy.filtering.FilterQuery object when comparing self using ">=".
        """
        return nzpyida.filtering.FilterQuery(self.columns, self._name, "ge", value)



[docs]
    def __gt__(self, value):
        """
        ibmdbpy.filtering.FilterQuery object when comparing self using ">".
        """
        return nzpyida.filtering.FilterQuery(self.columns, self._name, "gt", value)


    ################ Arithmetic operations


[docs]
    def __add__(self, other):
        """
        Perform an addition between self and another IdaDataFrame or number.

        Examples
        --------
        >>> ida = idadf['sepal_length'] + 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "add", other)



[docs]
    def __radd__(self, other):
        """
        Enable the reflexivity of the addtion operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] + 3

        >>> ida = 3 + idadf['sepal_length']
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "add", self, swap = True)



[docs]
    def __div__(self, other):
        """
        Perform a division between self and another IdaDataFrame or number.

        When __future__.division is not in effect.

        Examples
        --------
        >>> ida = idadf['sepal_length'] / 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "div", other)



[docs]
    def __rdiv__(self, other):
        """
        Enable the reflexivity of the division operation.

        When __future__.division is not in effect.

        Examples
        --------
        >>> ida = idadf['sepal_length'] / 3

        >>> ida = 3 / idadf['sepal_length']
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "div", self, swap = True)



[docs]
    def __truediv__(self, other):
        """
        Perform a division between self and another IdaDataFrame or number.

        When __future__.division is in effect.

        Examples
        --------
        >>> ida = idadf['sepal_length'] / 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "div", other)



[docs]
    def __rtruediv__(self, other):
        """
        Enable the reflexivity of the division operation.

        When __future__.division is in effect.

        Examples
        --------
        >>> ida = idadf['sepal_length'] / 3

        >>> ida = 3 / idadf['sepal_length']
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "div", self, swap = True)



[docs]
    def __floordiv__(self,other):
        """
        Perform an integer division between self and another IdaDataFrame or number.

        Examples
        --------
        >>> ida = idadf['sepal_length'] // 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "floordiv", other)



[docs]
    def __rfloordiv__(self, other):
        """
        Enable the reflexivity of the integer division operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] // 3

        >>> ida = 3 // idadf['sepal_length']
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "floordiv", self, swap = True)



[docs]
    def __mod__(self,other):
        """
        Perform a modulo operation between self and another IdaDataFrame or number.

        Examples
        --------
        >>> ida = idadf['sepal_length'] % 3

        Notes
        -----
        Arithmetic operations make sense if self has only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "mod", other)



[docs]
    def __rmod__(self, other):
        """
        Enable the reflexivity of the modulo operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] % 3

        >>> ida = 3 % idadf['sepal_length']
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "mod", self, swap = True)



[docs]
    def __mul__(self,other):
        """
        Perform a multiplication between self and another IdaDataFrame or number.

        Examples
        --------
        >>> ida = idadf['sepal_length'] * 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "mul", other)



[docs]
    def __rmul__(self, other):
        """
        Enable the reflexivity of the multiplication operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] % 3

        >>> ida = 3 % idadf['sepal_length']
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "mul", self, swap = True)



[docs]
    def __neg__(self):
        """
        Calculate the absolute negative of all columns in self.

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        other = None
        return nzpyida.aggregation.aggregate_idadf(self, "neg", other)



[docs]
    def __rpos__(self,other):
        """
        Calculate the absolute positive. No operation required.
        """
        return self



[docs]
    def __pow__(self,other):
        """
        Perform a power operation between self and another IdaDataFrame or number.

        Examples
        --------
        >>> ida = idadf['sepal_length'] ** 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "pow", other)



[docs]
    def __rpow__(self, other):
        """
        Enable the reflexivity of the power operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] ** 3

        >>> ida = 3 ** idadf['sepal_length']

        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "pow", self, swap = True)



[docs]
    def __sub__(self,other):
        """
        Perform a substraction between self and another IdaDataFrame or number.

        Examples
        --------
        >>> ida = idadf['sepal_length'] - 3

        Notes
        -----
        Arithmetic operations only make sense if self contains only numeric columns.
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(self, "sub", other)



[docs]
    def __rsub__(self, other):
        """
        Enable the reflexivity of the substraction operation.

        Examples
        --------
        >>> ida = idadf['sepal_length'] - 3

        >>> ida = 3 - idadf['sepal_length']
        """
        self._combine_check(other)
        return nzpyida.aggregation.aggregate_idadf(other, "sub", self, swap = True)


    #def __truediv__(self,value): ########
    #    pass

    #def __concat__(self,value): ####
    #    pass

    #def __contains__(self,value):
    #    pass

    # TODO: Do the inplace versions (with "i" in front)

###############################################################################
### Database Features
###############################################################################

    ##############################
    ## Delegation from IdaDataBase
    ##############################


[docs]
    def exists(self):
        """
        Convenience function delegated from IdaDataBase.

        Check if the data still exists in the database.
        """
        return self._idadb.exists_table_or_view(self._name)



[docs]
    def is_view(self):
        """
        Convenience function delegated from IdaDataBase.

        Check if the IdaDataFrame corresponds to a view in the database.
        """
        if self.type == 'View':
            return True
        else:
            return False



[docs]
    def is_table(self):
        """
        Convenience function delegated from IdaDataBase.

        Check if the IdaDataFrame corresponds to a table in the database.
        """
        if self.type == 'Table':
            return True
        else:
            return False



[docs]
    def get_primary_key(self):
        # TODO: What happen if the primary key is composed of several columns ?
        """
        Get the name of the primary key of self, if there is one. Otherwise,
        this function returns 0. This function may be deprecated in future
        versions because it is not very useful.
        """
        name = self.internal_state.current_state
        pk = NULL
        # on Netezza primary keys are not supported
        if not self._idadb._is_netezza_system():
            pk = self.ida_query("SELECT NAME FROM SYSIBM.SYSCOLUMNS WHERE TBNAME = '" +
                                name + "' AND KEYSEQ > 0 ORDER BY KEYSEQ ASC", first_row_only = True)
        if pk:
            return pk[0]
        else:
            return False


    # Should we maybe allow this only in IdaDataBase object ?
    #@timed

[docs]
    def ida_query(self, query, silent = False, first_row_only = False, autocommit = False):
        """
        Convenience function delegated from IdaDataBase.

        Prepare, execute and format the result of a query in a data frame or in
        a tuple. See the IdaDataBase.ida_query documentation.
        """
        return self._idadb.ida_query(query, silent, first_row_only, autocommit)



[docs]
    def ida_scalar_query(self, query, silent = False, autocommit = False):
        """
        Convenience function delegated from IdaDataBase.

        Prepare and execute a query and return only the first element as a
        string. See the IdaDataBase.ida_scalar_query documentation.

        """
        return self._idadb.ida_scalar_query(query, silent, autocommit)




###############################################################################
### Data Exploration
###############################################################################

    def print(self):
        print(self.internal_state.get_state())


[docs]
    @idadf_state
    def head(self, nrow=5, sort=True):
        """
        Print the n first rows of the instance, n is set to 5 by default.

        Parameters
        ----------
        nrow : int > 0
            Number of rows to be included in the result.

        sort: default is True
            If set to True and no indexer is set the data will be
            sorted by the first numeric column or if no numeric column
            is available by the first column of the dataframe.
            If set to False and no indexer is set the row order is not
            guaranteed and can vary with each execution. For big tables
            this option might save query processing time.


        Returns
        -------
        DataFrame or Series
            The index of the corresponding row number and the columns are all
            columns of self. If the IdaDataFrame has only one column, it
            returns a Series.


        Examples
        --------
        >>> ida_iris.head()
           sepal_length  sepal_width  petal_length  petal_width species
        0           5.1          3.5           1.4          0.2  setosa
        1           4.9          3.0           1.4          0.2  setosa
        2           4.7          3.2           1.3          0.2  setosa
        3           4.6          3.1           1.5          0.2  setosa
        4           5.0          3.6           1.4          0.2  setosa
        """
        if (nrow < 1) | (not isinstance(nrow, int)):
            raise ValueError("Parameter nrow should be an int greater than 0.")
        else:
            name = self.internal_state.current_state
            order = self.internal_state.get_order()
            if not " ORDER BY " in self.internal_state.get_state():
                if (self.indexer is not None)&(self.indexer in self.columns):
                    order = " ORDER BY \"" + str(self.indexer) + "\" ASC"
                elif self.indexer is None:
                    if sort:
                        column = self.columns[0]
                        if self._get_numerical_columns():
                            column = self._get_numerical_columns()[0]
                        order = " ORDER BY \"" + column + "\" ASC"
                    else:
                        order = ''
            data = self.ida_query("SELECT * FROM %s %s LIMIT %s "%(name, order, nrow))

            if data.shape[0] != 0:
                if isinstance(self, nzpyida.IdaSeries):
                    if not isinstance(data, pd.Series):
                        data = data[self.column]
                elif not 'SELECT ' in name:
                    columns = self.columns
                    data.columns = columns
#                data = ibmdbpy.utils._convert_dtypes(self, data)
            return data


    # TODO : There is a warning in anaconda when there are missing values -> why ?

[docs]
    @idadf_state
    def tail(self, nrow=5, sort=True):
        """
        Print the n last rows of the instance, n is set to 5 by default.

        Parameters
        ----------
        nrow : int > 0
            The number of rows to be included in the result.

        sort: default is True
            If set to True and no indexer is set the data will be
            sorted by the first numeric column or if no numeric column
            is available by the first column of the dataframe.
            If set to False and no indexer is set the row order is not
            guaranteed and can vary with each execution. For big tables
            this option might save query processing time.

        Returns
        -------
        DataFrame
            The index of the corresponding row number and the columns are all
            columns of self.


        Examples
        --------
        >>> ida_iris.tail()
             sepal_length  sepal_width  petal_length  petal_width    species
        145           6.7          3.0           5.2          2.3  virginica
        146           6.3          2.5           5.0          1.9  virginica
        147           6.5          3.0           5.2          2.0  virginica
        148           6.2          3.4           5.4          2.3  virginica
        149           5.9          3.0           5.1          1.8  virginica
        """
        if (nrow < 1) | (not isinstance(nrow, int)):
            raise ValueError("Parameter nrow should be an int greater than 0.")
        else:
            column_string = '\"' + '\", \"'.join(self.columns) + '\"'

            name = self.internal_state.current_state

            if " ORDER BY " in self.internal_state.get_state():
                query = "SELECT * FROM %s LIMIT %s "%(name, nrow)
                data = self.ida_query(query)
                data.columns = self.columns
                data.set_index(data[self.indexer], inplace=True)
            else:
                if self._idadb._is_netezza_system():
                    order = "ORDER BY NULL"
                else:
                    order = ""
                if self.indexer:
                    sortkey = str(self.indexer)
                    order = "ORDER BY \"" + sortkey + "\""
                else:
                    if sort:
                        sortkey = self.columns[0]
                        if self._get_numerical_columns():
                            sortkey = self._get_numerical_columns()[0]
                        order = "ORDER BY \"" + sortkey + "\""
                query = ("SELECT * FROM (SELECT * FROM (SELECT " + column_string +
                         ", ((ROW_NUMBER() OVER(" + order +
                         "))-1) AS ROWNUMBER FROM " + name +
                         ") AS TEMP1 ORDER BY ROWNUMBER DESC LIMIT " + str(nrow) +
                         " ) AS TEMP2 ORDER BY ROWNUMBER ASC")
                data = self.ida_query(query)
                data.set_index(data.columns[-1], inplace=True)  # Set the index
                data.columns = self.columns

            # del data.index.name
            #            data = ibmdbpy.utils._convert_dtypes(self, data)
            if isinstance(self, nzpyida.IdaSeries):
                data = pd.Series(data[data.columns[0]])
            return data



[docs]
    @idadf_state
    def pivot_table(self, values=None, columns=None, max_entries=1000,
                    sort=None, factor_threshold=20, interactive=False,
                    aggfunc='count'):
        """
        Compute an aggregation function over all rows of each column that is
        specified as a value on the dataset. The result grouped by the columns
        defined in “columns”.

        Parameters
        ----------
        values: str or list or str optional
            List of columns on which “aggfunc” is computed.
        columns: str or list or str optional
            List of columns that is used as an index and by which the
            dataframe is grouped.
        max_entries: int, default=1000
            The maximum number of cells to be part of the output. By default,
            set to 1000.
        sort: str, optional
            Admissible values are: “alpha” and “factors”.
                * If “alpha”, the index of the output is sorted according to the alphabetical order.
                * If “factors”, the index of the output will be sorted according to increasing number of the distinct values.

            By default, the index will be sorted in the same order that is specified in “columns” argument.

        factor_threshold: int, default: 20
            Number of distinct values above which a categorical column should
            not be considered categorical anymore and under which a numerical
            column column should not be considered numerical anymore.
        interactive: bool
            If True, the user is asked if he wants to display the output, given
            its size.
        aggfunc: str
            Aggregation function to be computed on each column specified in the
            argument “values”. Admissible values are: “count”, “sum”, “avg”.
            This entry is not case-sensitive.

        Returns
        -------
        Pandas Series with Multi-index (columns)

        Examples
        --------
        >>> val = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        >>> ida_iris.pivot_table(values= val, aggfunc="avg")
                      species
        sepal_length  setosa        5.006
                      versicolor    5.936
                      virginica     6.588
        sepal_width   setosa        3.428
                      versicolor    2.770
                      virginica     2.974
        petal_length  setosa        1.462
                      versicolor    4.260
                      virginica     5.552
        petal_width   setosa        0.246
                      versicolor    1.326
                      virginica     2.026
        dtype: float64
        """
        # TODO : Support index
        from nzpyida.statistics import pivot_table
        return pivot_table(idadf=self, values=values, columns=columns,
                           max_entries=max_entries, sort=sort,
                           factor_threshold=factor_threshold,
                           interactive=interactive, aggfunc=aggfunc)



[docs]
    @idadf_state
    def groupby(self, by):
        """
        Creates the groupby object 
        
        Parameters
        ----------
        by : str
            Column to group the Data Frame by
        
        Returns
        ----------
        IdaDataFrameGroupBy

        Examples
        --------
        >>> ida_iris.groupby("CLASS")
        <nzpyida.groupby.IdaDataFrameGroupBy at 0x11c288e10>
        """
        if by not in self.columns:
            raise KeyError(by)
        
        columns_to_groupby = [col for col in self.columns if col != by]
        
        groupby_object = IdaDataFrameGroupBy(
            self, columns_to_groupby, by)

        return groupby_object



[docs]
    @idadf_state
    def join(self, other, on: str=None, how: str='left', lsuffix: str='_x', rsuffix: str='_y'):
        """
        Implement pandas-like interface to join tables

        Parameters
        ----------
        other : IdaDataFrame
            IdaDataFrame to join to this object
        on : str, optional
            name of column in both IdaDataFrames to do join on
        how : str, optional
            type of join, possible types - 'outer', 'inner', 'right', 'left', 'cross'
        lsuffix : str, optional
            suffix to add to columns in this IdaDataFrame that are present in both IdaDataFrames
        rsuffix : str, optional
            suffix to add to columns in other IdaDataFrame that are present in both IdaDataFrames
        
        Returns
        -------
        IdaDataFrame
            result of joining IdaDataFrames 
        
        Examples
        --------
        >>> from nzpyida.sampledata.iris import iris
        >>> from nzpyida.sampledata.iris import iris as iris2
        >>> iris.reset_index()
        >>> iris2.reset_index()
        >>> iris2.columns = ['id', 'sepal_length', 'sepal_width', 'petal_length', \
            'PETAL_WIDTH', 'CLASS']
        >>> iris2['id'] = iris2['id'] + 50
        >>> iris1_ida = idadb.as_idadataframe(iris, 'IRIS_TEST1', indexer='index')
        >>> iris2_ida = idadb.as_idadataframe(iris2, 'IRIS_TEST2', indexer='id')
        >>> iris_merge = iris1_ida.join(iris2_ida)
        >>> iris_merge.tail()
            index 	sepal_length_x 	sepal_width_x 	petal_length_x 	petal_width 	species     id 	sepal_length_y 	sepal_width_y 	petal_length_y 	PETAL_WIDTH     CLASS												
        95 	145 	6.7 	        3.0 	        5.2 	        2.3 	        virginica   145 5.7 	        3.0 	        4.2 	        1.2 	        versicolor
        96 	146 	6.3 	        2.5 	        5.0 	        1.9 	        virginica   146 5.7 	        2.9 	        4.2 	        1.3 	        versicolor
        97 	147 	6.5 	        3.0 	        5.2 	        2.0 	        virginica   147 6.2 	        2.9 	        4.3 	        1.3 	        versicolor
        98 	148 	6.2 	        3.4 	        5.4 	        2.3 	        virginica   148 5.1 	        2.5 	        3.0 	        1.1 	        versicolor
        99 	149 	5.9 	        3.0 	        5.1 	        1.8 	        virginica   149 5.7 	        2.8 	        4.1 	        1.3 	        versicolor
        >>> len(iris_merge)
        100
    
        """
        if not on:
            left_index = True
        else:
            left_index = False
        return nzpyida.join_tables.merge(self, other, left_index=left_index, 
                                         left_on=on, right_index=True, 
                                         how=how, suffixes=(lsuffix, rsuffix))

    

[docs]
    def merge(self, right, how: str='inner', on=None, 
          left_on=None, right_on=None, left_index: bool=False, 
          right_index: bool=False, suffixes: List[str]=["_x", "_y"],
          indicator: bool=False):
        """
        Implement pandas-like interface to merge IdaDataFrames

        Parameters
        ----------
        right : IdaDataFrame
            right IdaDataDrame to merge
        how : str, optional
            type of join, possible types - 'outer', 'inner', 'right', 'left', 'cross'
        on : str, optional
            name of column in both IdaDataFrames to do join on
        left_on : str, optional
            name of column in left IdaDataFrame to do join on
        right_on : str, optional
            name of column in right IdaDataFrame to do join on
        left_index : bool, optional
            whether to join on indexer of left IdaDataFrame
        right_index : bool, optional
            whether to join on indexer of right IdaDataFrame
        suffixes: List[str], optional
            list of suffixes to add to columns that are present in both IdaDataFrames
        indicator: bool optional
            whether to add to output IdaDataFrame, column with information
            about the source of a given row
        
        Returns
        -------
        IdaDataFrame
            result of merging IdaDataFrames

        Examples
        --------
        >>> from nzpyida.sampledata.iris import iris
        >>> from nzpyida.sampledata.iris import iris
        >>> iris.reset_index()
        >>> iris2.reset_index()
        >>> iris2.columns = ['id', 'sepal_length', 'sepal_width', 'petal_length', \
            'PETAL_WIDTH', 'CLASS']
        >>> iris2['id'] = iris2['id'] + 50
        >>> iris1_ida = idadb.as_idadataframe(iris, 'IRIS_TEST1', indexer='index')
        >>> iris2_ida = idadb.as_idadataframe(iris2, 'IRIS_TEST2', indexer='id')
        >>> iris_merge = iris1_ida.merge(iris2_ida, left_on='index', right_on='id')
        >>> iris_merge.tail()
            index 	sepal_length_x 	sepal_width_x 	petal_length_x 	petal_width 	species     id 	sepal_length_y 	sepal_width_y 	petal_length_y 	PETAL_WIDTH     CLASS												
        95 	145 	6.7 	        3.0 	        5.2 	        2.3 	        virginica   145 5.7 	        3.0 	        4.2 	        1.2 	        versicolor
        96 	146 	6.3 	        2.5 	        5.0 	        1.9 	        virginica   146 5.7 	        2.9 	        4.2 	        1.3 	        versicolor
        97 	147 	6.5 	        3.0 	        5.2 	        2.0 	        virginica   147 6.2 	        2.9 	        4.3 	        1.3 	        versicolor
        98 	148 	6.2 	        3.4 	        5.4 	        2.3 	        virginica   148 5.1 	        2.5 	        3.0 	        1.1 	        versicolor
        99 	149 	5.9 	        3.0 	        5.1 	        1.8 	        virginica   149 5.7 	        2.8 	        4.1 	        1.3 	        versicolor
        >>> len(iris_merge)
        100
        >>> iris_merge = iris1_ida.merge() iris2_ida, left_on='index', right_index=True, 
                            how='outer', indicator=True, suffixes=("_a", "_b"))
        >>> iris_merge.head()
            index 	sepal_length_a 	sepal_width_a 	petal_length_a 	petal_width 	species 	id 	sepal_length_b 	sepal_width_b 	petal_length_b 	PETAL_WIDTH CLASS 	INDICATOR
        0 	None 	None 	        None 	        None 	        None 	        None 	        152 	7.1 	        3.0 	        5.9 	        2.1         virginica 	right_only
        1 	None 	None 	        None 	        None 	        None 	        None 	        151 	5.8 	        2.7 	        5.1 	        1.9         virginica 	right_only
        2 	None 	None 	        None 	        None 	        None 	        None 	        154 	6.5 	        3.0 	        5.8 	        2.2         virginica 	right_only
        3 	None 	None 	        None 	        None 	        None 	        None 	        153 	6.3 	        2.9 	        5.6 	        1.8         virginica 	right_only
        4 	None 	None 	        None 	        None 	        None 	        None 	        150 	6.3 	        3.3 	        6.0 	        2.5         virginica 	right_only
        """
        return nzpyida.merge(self, right, how, on, left_on, right_on, 
                             left_index, right_index, suffixes, indicator)

        

    # TODO : implement NULL FIRST option

[docs]
    @idadf_state
    def sort(self, columns=None, axis=0, ascending=True, inplace=False):
        """
        Sort the IdaDataFrame row wise or column wise.

        Parameters
        ----------
        columns : str or list of str
            Columns that should be used to sort the rows in the IdaDataFrame.
            If columns is set to None and axis to 0, then the IdaDataFrame
            columns are sorted in lexicographical order.
        axis : int (0/1)
             Axis that is sorted. 0 for sorting row wise, 1 for sorting column
             wise.
        ascending : bool, default: True
            Sorting order, True : ascending, False : descending
        inplace : bool, default: False
            The current object is modified or creates a modified copy. If
            False, the function creates a modified copy of the current
            dataframe. If True,  the function modifies the current dataframe.

        Raises
        ------
        ValueError
            * When sorting by column (column not None), the axis value must be 0 (rows).
            * A column does not belong to self.
            * The axis argument has a value other than 0 or 1.

        Notes
        -----
        If columns is set to None and axis to 0, this undoes all sorting the
        IdaDataFrame and returns the original sorting in the Db2 Warehouse
        database.

        No actual changes are made in Db2 Warehouse, only the querying changes.
        Everything is registered in an InternalState object. Changes can be
        observed by using  head and tail function.
        """
        if isinstance(columns, six.string_types):
            columns = [columns]

        # Sanitiy check
        if columns:
            if axis != 0:
                raise ValueError("When sorting by column, axis must be 0 (rows)")
            for column in columns:
                if column not in self.columns:
                    raise ValueError("Column "+column+" is not in "+self._name)
        if axis not in [0,1]:
            raise ValueError("Allowed values for axis is 0 (rows) or 1 (columns)")

        if inplace:
            idadf = self
        else:
            idadf = self._clone()

        if axis:
            # Sort the columns in ascending or descending lexicographic order
            idadf.internal_state.columndict = OrderedDict(sorted(idadf.internal_state.columndict.items(), reverse = not ascending))
            idadf.internal_state.update()
        else:
            if columns:
                idadf.internal_state.set_order(columns, ascending)
                idadf.internal_state.update()
            else:
                if isinstance(idadf, nzpyida.IdaSeries):
                    idadf.internal_state.set_order([idadf.column], ascending)
                    idadf.internal_state.update()
                else:
                    idadf.internal_state.reset_order()

        if not inplace:
            return idadf


    @idadf_state
    def levels(self, columns = None):
        # TODO: Test, doc, name?
        """
        Return the numbers of distinct values
        """
        if columns is not None:
            if isinstance(columns, six.string_types):
                columns = [columns]
            for column in columns:
                message = ''
                if column not in self.columns:
                    message += "Column %s does not belong to current idadataframe. \n"%column
                if message:
                    raise ValueError(message)
        else:
            columns = self.columns

        agglist = []
        for column in columns:
            agglist.append("COUNT(DISTINCT \"%s\")"%column)

        aggstr = ",".join(agglist)

        query = "SELECT %s FROM %s"%(aggstr, self.name)
        levels_tuple = self.ida_query(query, first_row_only = True)

        if len(levels_tuple) == 1:
            return levels_tuple[0]

        result = pd.Series(levels_tuple)
        result.index = columns
        return result


    #@timed
    @idadf_state
    def count_groupby(self, columns = None, count_only = False, having = None):
        """
        Count the occurence of the values of a column or group of columns
        """
        # TODO: Document, test
        if columns is not None:
            if isinstance(columns, six.string_types):
                columns = [columns]
            for column in columns:
                message = ''
                if column not in self.columns:
                    message += "Column %s does not belong to current idadataframe. \n"
                if message:
                    raise ValueError(message)
        else:
            columns = list(self.columns)
        if having:
            if not isinstance(having, Number):
                raise TypeError("having argument should be a number")

        select_columnstr = "\"" + "\",\"".join(columns) + "\", COUNT(*)"
        if count_only:
            select_columnstr = "COUNT(*)"
        groupby_columnstr = "\"" + "\",\"".join(columns) + "\""
        if having:
            groupby_columnstr = groupby_columnstr + " HAVING count >= %s"%having
        data = self.ida_query("SELECT %s AS count FROM %s GROUP BY %s"%(select_columnstr,self.name,groupby_columnstr))

        data.columns = columns + ["count"]
        return data

    def mean_freq_of_instance(self, columns = None):
        """
        Return the average occurence of the values of a column or group of columns
        """
        # TODO: Document, test
        if columns is not None:
            if isinstance(columns, six.string_types):
                columns = [columns]
            for column in columns:
                message = ''
                if column not in self.columns:
                    message += "Column %s does not belong to current idadataframe. \n"
                if message:
                    raise ValueError(message)
        else:
            columns = list(self.columns)

        select_columnstr = "COUNT(*)"
        groupby_columnstr = "\"" + "\",\"".join(columns) + "\""

        subquery = "SELECT %s AS count FROM %s GROUP BY %s"%(select_columnstr,self.name,groupby_columnstr)
        data = self.ida_scalar_query("SELECT AVG(DISTINCT count) FROM (%s)"%subquery)

        return int(data)

#    @timed
    @idadf_state
    def unique(self, column):
        """
        Return the unique values of a column
        """
        # TODO: Document, test
        if column in self._unique:
            return self._unique[column]

        #name = self.internal_state.current_state

        if not isinstance(column, six.string_types):
            raise TypeError("column argument not of string type")

        if column not in self.columns:
            # idadf.name somewhat false in case of modification
            raise ValueError("Undefined column \"%s\" in table %s"%(column, self._name))

        result = self.ida_query("SELECT DISTINCT \"%s\" FROM %s"%(column, self.name))

        self._unique[column] = result
        return result

    # TODO: to implement
    @timed
    @idadf_state
    def info(self, buf=None):
        """Some information about current IdaDataFrame. NOTIMPLEMENTED"""
        # There is a lot more
        #from pandas.core.format import _put_lines
        raise IdaDataFrameError("NOT IMPLEMENTED")

        if buf is None:  # pragma: no cover
            buf = sys.stdout

        lines = []

        lines.append(str(type(self)))
        lines.append(self.index.summary())

        if len(self.columns) == 0:
            lines.append('Empty %s' % type(self).__name__)
            _put_lines(buf, lines)
            return

###############################################################################
### Descriptive statistics
###############################################################################

    # TODO: to implement for categorical attributes

[docs]
    @timed
    @idadf_state
    def describe(self, percentiles=[0.25, 0.50, 0.75]):
        """
        A basic statistical summary about current IdaDataFrame. If at least one
        numerical column exists, the summary includes:

            * The count of non-missing values for each numerical column.
            * The mean for each numerical column.
            * The standart deviation for each numerical column.
            * The minimum and maximum for each numerical column.
            * A list of percentiles set by the user (default : the quartiles).

        Parameters
        ----------
        idadf : IdaDataFrame
        percentiles : Float or list of floats, default: [0.25, 0.50, 0.75].
            percentiles to be computed on numerical columns.
            All values in percentiles must be > 0  and < 1.

        Returns
        -------
        summary: DataFrame, where
            * Index is the name of the computed values.
            * Columns are either numerical or categorical columns of self.

        """
        from nzpyida.statistics import describe
        return describe(idadf=self, percentiles=percentiles)



    @timed
    @idadf_state
    def summary(self):
        """
        A basic statistical summary about current IdaDataFrame.

        Parameters
        ----------
        idadf : IdaDataFrame


        Returns
        -------
        summary: DataFrame, where
            * Index is the name of the computed values.
            * Columns are either numerical or categorical columns of self.

        """
        from nzpyida.statistics import summary
        return summary(idadf=self)


[docs]
    @timed
    @idadf_state
    def cov(self):
        """
        Compute the covariance matrix, composed of covariance coefficients
        between all pairs of columns in self.
        It must have at least two numeric columns.

        Returns
        -------
        covariance matrix: DataFrame
            The axes are the columns of self and the values are the covariance
            coefficients.
        """
        from nzpyida.statistics import cov
        return cov(idadf=self)



[docs]
    @timed
    @idadf_state
    def corr(self, method="pearson", features=None, ignore_indexer=True):
        """
        Compute the correlation matrix, composed of correlation coefficients
        between all pairs of columns in self.
        It must have at least two numeric columns.

        Parameters
        ----------
        method : str, default: pearson
            Method to be used to compute the correlation. By default, compute
            the pearson correlation coefficient. The Spearman rank correlation
            is also available. Admissible values are: "pearson", "spearman".

        Returns
        -------
        correlation matrix: DataFrame
            The axes are the columns of self and the values are the correlation
            coefficients.

        Notes
        -----
        For the Spearman rank correlation, the ordinal rank of columns is
        computed. For performance reasons this is easier to compute than the
        fractional rank traditionally computed for the Spearman rank
        correlation method. This strategy has the property that the sum of the
        ranking numbers is the same as under ordinal ranking. We then apply
        the pearson correlation coefficient method to these ranks.
        """
        from nzpyida.statistics import corr
        #return corr(idadf=self, features=features, ignore_indexer=ignore_indexer)
        return corr(idadf=self)


    @timed

    def train_test_split(self, train_table, test_table, id, fraction, seed):
        """
        Split the table into train and test sets

        Parameters
        ----------
        train_table : str
            the output table that will contain the given fraction of the input records

        test_table :str
            the output table that will contain the rest (1-<fraction>) of the input records

        id : str
            the input table column identifying a unique instance id

        fraction: float
             the fraction of the data to split

        seed: float
            the seed of the random function

        Returns
        -------
        number of records in train table: float



        """
        from nzpyida.statistics import train_test_split

        return train_test_split(idadf=self, train_table=train_table, test_table=test_table, id=id, fraction=fraction, seed=seed)

    # TODO: to implement
    @timed
    @idadf_state
    def corrwith(self, other):
        """
        Compute the correlation matrix, composed of correlation coefficients
        between the columns of self and the columns of another IdaDataFrame.

        Parameters
        ----------
        other : DataFrame

        Returns
        -------
        correlation matrix: DataFrame
            The columns are the columns of self and the index the columns
            of other. The values are the covariance coefficients.
        """
        raise NotImplementedError("TODO")

    # TODO: to implement
    @timed
    @idadf_state
    def mode(self):
        """
        Compute the most common value for each non numeric column self. NOTIMPLEMENTED

        Returns
        -------
        mode: Series
        """
        raise NotImplementedError("TODO")


[docs]
    @timed
    @idadf_state
    def quantile(self, q=0.5):
        """
        Compute row wise quantiles for each numeric column.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            0 <= q <= 1, the quantile(s) to compute

        Returns
        -------
        quantiles: Series or DataFrame
            If q is an array, the function returns a DataFrame in which the
            index is q. The columns are the columns of sel, and the values are
            the quantiles. If q is a float, a Series is returned where the
            index is the columns of self and the values are the quantiles.

        """
        from nzpyida.statistics import quantile
        return quantile(idadf=self, q=q)



[docs]
    @timed
    @idadf_state
    def mad(self):
        """
        Compute the mean absolute distance for all numeric columns of self.
        It must have at least two numeric columns.

        Returns
        -------
        mad: Series
            The index consists of the columns of self and the values are the mean absolute distance.
        """
        from nzpyida.statistics import mad
        return mad(idadf=self)



[docs]
    @timed
    @idadf_state
    def min(self):
        """
        Compute the minimum value for all numerics column of self.

        Returns
        -------
        min: Series
            The index consists of the columns of self and the values are the minimum.
        """
        from nzpyida.statistics import ida_min
        return ida_min(idadf=self)



[docs]
    @timed
    @idadf_state
    def max(self):
        """
        Compute the maximum value over for all numeric columns of self.

        Returns
        -------
        max: Series.
            The index consists of the columns of self and the values are the maximum.
        """
        from nzpyida.statistics import ida_max
        return ida_max(idadf=self)



[docs]
    @timed
    @idadf_state
    def count(self):
        """
        Compute the count of non-missing values for all columns of self.

        Returns
        -------
        count: Series.
            The index consists of the columns of self and the values are the number of non-missing values.
        """
        from nzpyida.statistics import count
        return count(idadf=self)



[docs]
    @timed
    @idadf_state
    def count_distinct(self):
        # deprecated, use levels instead
        """
        Compute the count of distinct values for all numeric columns of self.

        Returns
        -------
        disctinct count: Series
            The index consists of the columns of self and values are the number of distinct values.
        """
        from nzpyida.statistics import count_distinct
        return count_distinct(idadf=self)



[docs]
    @timed
    @idadf_state
    def std(self):
        """
        Compute the standard deviation for all numeric columns of self.

        Returns
        -------
        std: Series
            The index consists of the columns of self and the values are the standard deviation.
        """
        from nzpyida.statistics import std
        return std(idadf=self)


    @timed
    @idadf_state
    def within_class_var(self, target, features = None, ignore_indexer=True):
        if features is None:
            numerical_columns = self._get_numerical_columns()
            features = [x for x in numerical_columns if x != target]
        else:
            if isinstance(features, six.string_types):
                features = [features]

        if ignore_indexer is True:
            if self.indexer:
                if self.indexer in features:
                    features.remove(self.indexer)

        result = pd.Series()

        #C = self.levels(target)
        N = len(self)
        if self._idadb._is_netezza_system():
            power_function = "POW"
        else:
            power_function = "POWER"

        if len(features) < 5:

            avglist = ["AVG(\"%s\") as \"average%s\""%(feature, index) for index, feature in enumerate(features)]
            sumlist = ["SUM(CAST(%s(\"%s\" - \"average%s\", 2) as DOUBLE))"%(power_function, feature, index) for index, feature in enumerate(features)]

            avgstr = ", ".join(avglist)
            sumstr = ", ".join(sumlist)

            subquery = "(SELECT \"%s\", %s FROM %s GROUP BY \"%s\") AS B"%(target, avgstr, self.name, target)
            condition = "ON A.\"%s\" = B.\"%s\""%(target, target)
            groupby = "GROUP BY A.\"%s\""%target
            query = "SELECT %s FROM %s AS A INNER JOIN %s %s %s"%(sumstr, self.name, subquery, condition, groupby)

            classvar = self.ida_query(query)
            if len(features) == 1:
                classvar = pd.DataFrame(classvar)

            C = len(classvar)
            if N == C:
                N += 1


            classvar.columns = pd.Index(features)
            result = pd.Series()
            for attr in classvar:
                result[attr] = classvar[attr].sum()/(N -C)

        else:
            chunkgen = chunklist(features, 5)
            result = pd.Series()
            for chunk in chunkgen:
                avglist = ["AVG(\"%s\") as \"average%s\""%(feature, index) for index, feature in enumerate(chunk)]
                sumlist = ["SUM(CAST(%s(\"%s\" - \"average%s\", 2) as DOUBLE))"%(power_function, feature, index) for index, feature in enumerate(chunk)]

                avgstr = ", ".join(avglist)
                sumstr = ", ".join(sumlist)

                subquery = "(SELECT \"%s\", %s FROM %s GROUP BY \"%s\") AS B"%(target, avgstr, self.name, target)
                condition = "ON A.\"%s\" = B.\"%s\""%(target, target)
                groupby = "GROUP BY A.\"%s\""%target
                query = "SELECT %s FROM %s AS A INNER JOIN %s %s %s"%(sumstr, self.name, subquery, condition, groupby)

                classvar = self.ida_query(query)

                if len(chunk) == 1:
                    classvar = pd.DataFrame(classvar)

                C = len(classvar)
                if N == C:
                    N += 1

                classvar.columns = pd.Index(chunk)

                for attr in classvar:
                    result[attr] = classvar[attr].sum()/(N -C)

        return result

    @timed
    @idadf_state
    def within_class_std(self, target, features = None, ignore_indexer= True):
        return np.sqrt(self.within_class_var(target, features, ignore_indexer))


[docs]
    @timed
    @idadf_state
    def var(self):
        """
        Compute the variance for all numeric columns of self.

        Returns
        -------
        var: Series
            The index consists of the columns of self and the values are the variance.
        """
        from nzpyida.statistics import var
        return var(idadf=self)



[docs]
    @timed
    @idadf_state
    def mean(self):
        """
        Compute the mean for each numeric columns of self.

        Returns
        -------
        mean: Series
            The index consists of the columns of self and the values are the mean.
        """
        from nzpyida.statistics import mean
        return mean(idadf=self)


    @timed
    @idadf_state
    def mean_groupby(self, groupby, features = None):
        if features is None:
            features = [x for x in self.columns if x != groupby]
        else:
            if isinstance(features, six.string_types):
                features = [features]

        avglist = ["CAST(AVG(\"%s\") as FLOAT) AS \"%s_AVG\""%(feature, feature) for feature in features]
        avgstr = ", ".join(avglist)

        query = "SELECT \"%s\", %s FROM %s GROUP BY \"%s\""%(groupby, avgstr, self.name, groupby)
        result = self.ida_query(query)
        result = result.pivot_table(index = result.columns[0])
        result.columns = pd.Index(features)
        return result



[docs]
    @timed
    @idadf_state
    def sum(self):
        """
        Compute the sum of values for all numeric columns of self.

        Returns
        -------
        sum: Series
            The index consists of the columns of self and the values are the sum.
        """
        # Behave like having the option "numeric only" to true
        # TODO: Implement the option
        from nzpyida.statistics import ida_sum
        return ida_sum(idadf=self)



[docs]
    @timed
    @idadf_state
    def median(self):
        """
        Compute the median for all numeric columns of self.

        Returns
        -------
        median: Series
            The index consists of the columns of self and the values are the median.
        """
        # Behave like having the option "numeric only" to true
        # TODO: Implement the option
        from nzpyida.statistics import median
        return median(idadf=self)


    # Maybe not be possible
    @idadf_state
    def rank(self):
        """Compute the rank over all entries for all columns of self."""
        raise NotImplementedError("TODO")

    # TODO : cumsum, cummean, cummcountm cummax, cumprod

###############################################################################
### Save current IdaDataFrame to Db2 Warehouse as a table
###############################################################################

    # TODO: Should this function be in IdaDataBase ?
    # To my mind, it is more intuitive to let it here, but it is "destructive"

[docs]
    @timed
    @idadf_state
    def save_as(self, tablename, clear_existing = False):
        """
        Save self as a table name in the remote database with the name
        tablename. This function might erase an existing table if tablename
        already exists and clear_existing is True.

        """
        # TODO: to test !

        with_data = '' if self._idadb._is_netezza_system() else ' WITH DATA'

        if tablename == self.tablename:
            if clear_existing is False:
                raise ValueError("Cannot overwrite current IdaDataFrame if "+
                                " clear_existing option set to False.")
            message = "Table %s already exists."%(tablename)
            warnings.warn(message, UserWarning)
            question = "Are you sure that you want to overwrite %s"%(tablename)
            display_yes = nzpyida.utils.query_yes_no(question)
            if not display_yes:
                return
            tempname = self._idadb._get_valid_tablename()
            self._prepare_and_execute("CREATE TABLE %s AS (SELECT * FROM %s)%s"%(tempname, tablename, with_data))
            try:
                self._idadb.drop_table(tablename)
            except:
                self._idadb.drop_view(tablename)

            newidadf = IdaDataFrame(self._idadb, tempname)
            self._idadb.rename(newidadf, tablename)
            self = newidadf

        if self._idadb.exists_table_or_view(tablename):
            if clear_existing:
                message = "Table %s already exists."%(tablename)
                warnings.warn(message, UserWarning)
                question = "Are you sure that you want to overwrite %s"%(tablename)
                display_yes = nzpyida.utils.query_yes_no(question)
                if not display_yes:
                    return
                try:
                    self._idadb.drop_table(tablename)
                except:
                    self._idadb.drop_view(tablename)
            else:
                raise NameError(("%s already exists, choose a different name "+
                                "or use clear_existing option.")%tablename)

        name = self.internal_state.current_state

        self._prepare_and_execute("CREATE TABLE %s AS (SELECT * FROM %s)%s"%(tablename, name, with_data))

        # Reset the cache
        self._idadb._reset_attributes("cache_show_tables")


###############################################################################
### Import as DataFrame
###############################################################################


[docs]
    @timed
    @idadf_state
    def as_dataframe(self):
        """
        Download and return an in-memory representation of the dataset as
        a Pandas DataFrame.

        Returns
        -------
        DataFrame
            Columns and records are the same as in self.


        Examples
        --------
        >>> iris = ida_iris.as_dataframe()
        >>> iris.head()
           sepal_length  sepal_width  petal_length  petal_width species
        0           5.1          3.5           1.4          0.2  setosa
        1           4.9          3.0           1.4          0.2  setosa
        2           4.7          3.2           1.3          0.2  setosa
        3           4.6          3.1           1.5          0.2  setosa
        4           5.0          3.6           1.4          0.2  setosa
        """
        if os.environ['VERBOSE'] == 'True':
            # We use an empirical estimation
            # Experimental results :
            # 1M row, 12 columns => 550 secs
            # 3.77M, 12 columns => 2256 secs
            raw_estimation = ((self.shape[0]*self.shape[1]) / 4000000) * 3
            if raw_estimation < 0.01:
                # There is a small offset, which is negligeable for higher measurements
                estimation = str(raw_estimation*60 + 0.717) + " seconds."
            elif raw_estimation > 60:
                estimation = str(raw_estimation/60) + " hours."
            else:
                estimation = "%s minutes."%raw_estimation
            print("Your IdaDataFrame has %s rows and %s columns." % (self.shape[0], self.shape[1]))
            print("Estimated download time : ~ " + estimation)
            if raw_estimation > 30:
                message = ("Estimated Download time is greater than" +
                           " %s minutes.")%raw_estimation
                warnings.warn(message, UserWarning)
                question = "Do you want to download the dataset?"
                display_yes = nzpyida.utils.query_yes_no(question)
                if not display_yes:
                    return

        data = self.ida_query(self.internal_state.get_state())
        data.columns = self.columns
        data.name = self.tablename
        # Handle datatypes
#        data = ibmdbpy.utils._convert_dtypes(self, data)
        return data


###############################################################################
### Connection Management
###############################################################################

# Connection is a bit different, this is made to allow user to work on several
# IdaDataFrame and close them, without closing the connection. This is why
# function "close" is overwritten and a function reopen is provided

    # Not sure commit and rollback should be here ?


[docs]
    def commit(self):
        """
        Commit operations in the database.

        Notes
        -----

        All changes that are made in the database after the last commit,
        including those in the child IdaDataFrames, are commited.

        If the environment variable ‘VERBOSE’ is set to True, the commit
        operations are notified in the console.
        """
        self._idadb.commit()



[docs]
    def rollback(self):
        """
        Rollback operations in the database.

        Notes
        -----

        All changes that are made in the database after the last commit,
        including those in the child IdaDataFrames, are discarded.
        """
        self._idadb.rollback()



###############################################################################
### Private functions
###############################################################################

    def _clone(self):
        """
        Clone the actual object.
        """
        newida = IdaDataFrame(self._idadb, self._name, self.indexer)
        # newida.columns = self.columns
        # newida.dtypes = self.dtypes     # avoid recomputing it
        # otherwise risk of infinite loop between
        # idadf.columns and internalstate.columndict

        # This is not possible to use deepcopy on an IdaDataFrame object
        # because the reference to the parents IdaDataBase with the connection
        # object is not pickleable. As a consequence, we create a new
        # IdaDataFrame and copy all the relevant attributes

        newida.internal_state.name = deepcopy(self.internal_state.name)
        newida.internal_state.ascending = deepcopy(self.internal_state.ascending)
        #newida.internal_state.views = deepcopy(self.internal_state.views)
        newida.internal_state._views = deepcopy(self.internal_state._views)
        newida.internal_state._cumulative = deepcopy(self.internal_state._cumulative)
        newida.internal_state.order = deepcopy(self.internal_state.order)
        newida.internal_state.columndict = deepcopy(self.internal_state.columndict)
        newida.columns = self.columns
        newida._org_columns_names = deepcopy(self._org_columns_names)
        newida.dtypes = self.dtypes
        return newida

    def _clone_as_serie(self, column):
        """
        Clone the actual object as an IdaSeries and select one of its columns.
        """
        newida = nzpyida.IdaSeries(idadb = self._idadb, tablename = self._name,
                                   indexer = self.indexer, column = column)

        newida.internal_state.name = deepcopy(self.internal_state.name)
        newida.internal_state.ascending = deepcopy(self.internal_state.ascending)
        #newida.internal_state.views = deepcopy(self.internal_state.views)
        newida.internal_state._views = deepcopy(self.internal_state._views)
        newida.internal_state._cumulative = deepcopy(self.internal_state._cumulative)
        newida.internal_state.order = deepcopy(self.internal_state.order)
        newida.internal_state.columndict = deepcopy(self.internal_state.columndict)
        newida._org_columns_names = [self.internal_state.columndict[column].strip('\"')]
        return newida

    def _get_type(self):
        """
        Type of the IdaDataFrame : "Table", "View" or "Unknown".
        """
        if self._idadb.is_table(self._name):
            return "Table"
        elif self._idadb.is_view(self._name):
            return "View"
        else:
            return "Unkown"

    def _get_columns(self):
        """
        Index containing a list of the columns in self.
        """
        tablename = self.internal_state.current_state
        if self._idadb._con_type == 'odbc':
            if '.' in tablename:
                schema = tablename.split('.')[-2]
                tablename = tablename.split('.')[-1]
            else:
                schema = self._idadb.current_schema
            columns = self._idadb._con.cursor().columns(table=tablename, schema=schema)
            columnlist = [column[3] for column in columns]
            return Index(columnlist)
        elif self._idadb._con_type == 'nzpy':
            cursor = self._idadb._con.cursor()
            cursor.execute("SELECT * FROM %s LIMIT 1"%tablename)
            columnlist = []
            if type(cursor.description[0][0]) == bytes:
                columnlist = [column[0].decode() for column in cursor.description]
            else:
                columnlist = [column[0] for column in cursor.description]
            cursor.close()
            return Index(columnlist)
        elif self._idadb._con_type == 'jdbc':
            cursor = self._idadb._con.cursor()
            cursor.execute("SELECT * FROM %s LIMIT 1"%tablename)
            columnlist = [column[0] for column in cursor.description]
            cursor.close()
            return Index(columnlist)

    def _get_all_columns_in_table(self):
        """Get all columns that exists in the physical table."""
        return self._get_columns()

    # TODO: To deprecate
    def _get_index(self, force=False):
        """
        Index containing a list of the row names in self.
        """

        rows = self.shape[0]

        return RangeIndex(0, rows, 1)

    def _get_shape(self):
        """
        Tuple containing the number of rows and the number of columns in self.
        """
        name = self.internal_state.current_state

        nrow = self.ida_scalar_query("SELECT CAST(COUNT(*) AS INTEGER) FROM %s"%name)
        ncol = len(self.columns)
        return (nrow, ncol)

    def _get_columns_dtypes(self):
        """
        DataFrame containing the column names and database types in self.
        """
        name = self.internal_state.current_state

        # In case the name is composed the following way : SCHEMA.TABLENAME
        # We need to separate it to keep only the TABLENAME for this query.
        if '.' in name :
            name = name.split('.')[-1]

        if self._idadb._is_netezza_system():
            nz_hidden_columns =  "('ROWID', 'DATASLICEID', 'CREATEXID', 'DELETEXID', '_PAGEID', '_EXTENTID') "
            query_select = ("SELECT COLUMN_NAME AS COLNAME, " +
                            "CASE WHEN strpos(TYPE_NAME, '(') = 0 THEN TYPE_NAME " +
                            "ELSE substr(TYPE_NAME, 1, strpos(TYPE_NAME,'(')-1) END AS TYPENAME " +
                            "FROM _V_SYS_COLUMNS ")
            # hidden Netezza columns are not included in the set of columns of this data frame
            query_where1 = "WHERE TABLE_NAME = \'%s\' AND COLUMN_NAME NOT IN " + nz_hidden_columns
            query_where2 = "AND SCHEMA = \'%s\' "
            query_order_by = "ORDER BY ORDINAL_POSITION "
        else:
            query_select = "SELECT COLNAME, TYPENAME FROM SYSCAT.COLUMNS "
            query_where1 = "WHERE TABNAME = \'%s\' "
            query_where2 = "AND TABSCHEMA = \'%s\' "
            query_order_by = "ORDER BY COLNO"

        if name.find("TEMP_VIEW_") == 0:
            #When the column names are going to be retrieved from a temporary
            #view that was created with the definition of the current state of
            #the IdaDataFrame, the schema name cannot be assumed as the same of
            #the IdaDataFrame. Also mind that the name of the temporary view
            #is thought to be random enough to avoid collisions
            data = self.ida_query((query_select + query_where1 + query_order_by)%(name))
        else:
            data = self.ida_query((query_select + query_where1 + query_where2 + query_order_by)%(name, self.schema))

        # Workaround for some ODBC version which does not get the entire
        # string of the column name in the cursor descriptor.
        # This is hardcoded, so be careful
        data.columns = ["COLNAME", "TYPENAME"]
        data.columns = [x.upper() for x in data.columns]
        data.set_index(keys='COLNAME', inplace=True)
        # del data.index.name
        return data

    def _reset_attributes(self, attributes):
        """
        Delete an attribute of self to force its refreshing at the next call.
        """
        if isinstance(attributes, six.string_types):
            attributes = [attributes]

        # Special case : resetting columns
        if "get_columns" in attributes or "columns" in attributes:
             try:
                 del self.internal_state.columndict
             except:
                 pass
            
        if "org_columns_names" in attributes:
            self._org_columns_names = None

        nzpyida.utils._reset_attributes(self, attributes)


###############################################################################
### DB2 Warehouse to pandas type mapping
###############################################################################

    def _table_def(self, factor_threshold=None):
        """
        Classify columns in the idaDataFrame into 4 classes: CATEGORICAL,
        STRING, NUMERIC or NONE. Use the database data type and a
        user-threshold “factor_threshold”:

            * CATEGORICAL columns that have a number of distinct values that is greater
              than the factor_threshold should be considered a STRING.
            * NUMERIC columns that have a number of distinct values that is smaller or equal
              to the factor_threshold should be considered CATEGORICAL.

        Returns
        -------
        DataFrame

            * Index is the columns of self.
            * Column "FACTORS" contains the number of distinct values.
            * Column "VALTYPE" contains the resulting class.

        Examples
        --------
        >>> ida_iris._table_def()
                     TYPENAME FACTORS      VALTYPE
        sepal_length   DOUBLE      35      NUMERIC
        sepal_width    DOUBLE      23      NUMERIC
        petal_length   DOUBLE      43      NUMERIC
        petal_width    DOUBLE      22      NUMERIC
        species       VARCHAR       3  CATEGORICAL
        """
        # We don't want to change the value of the attribute
        data = deepcopy(self.dtypes)

        def _valtype_from_dbtype(tup):
            """
            Decides if a column should be considered categorical or numerical
            """
            categorical_attributes = ['VARCHAR', 'CHARACTER VARYING', 'CHARACTER', 'VARGRAPHIC',
                                      'GRAPHIC', 'CLOB']
            numerical_attributes = ['SMALLINT', 'INTEGER', 'BIGINT', 'REAL',
                                    'DOUBLE', 'DOUBLE PRECISION', 'FLOAT', 'DECIMAL', 'NUMERIC']
            if tup[0].upper() in categorical_attributes:
                if factor_threshold is None:
                    return "CATEGORICAL"
                elif tup[1] <= factor_threshold:
                    return "CATEGORICAL"
                else:
                    return "STRING"
            elif tup[0].upper() in numerical_attributes:
                if factor_threshold is None:
                    return "NUMERIC"
                elif tup[1] > factor_threshold:
                    return "NUMERIC"
                else:
                    return "CATEGORICAL"
            else:
                return "NONE"

        data['FACTORS'] = nzpyida.statistics._count_level(self, data.index.values)
        data['VALTYPE'] = [_valtype_from_dbtype(x) for x in
                           data[['TYPENAME', 'FACTORS']].to_records(index=0)]
        return data

    def _get_numerical_columns(self):
        """
        Get the columns of self that are considered as numerical. Their data
        type in the database determines whether these columns are numerical.
        The following data types are considered numerical:

            SMALLINT, INTEGER, BIGINT, REAL, DOUBLE, FLOAT, DECIMAL, NUMERIC.

        Returns
        -------
        list
            List of numerical column names.

        Examples
        --------
        >>> ida_iris._get_numerical_columns()
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        """
        num = ['SMALLINT', 'INTEGER', 'BIGINT', 'REAL',
                            'DOUBLE', 'DOUBLE PRECISION', 'FLOAT', 'DECIMAL', 'NUMERIC']
        return list(self.dtypes.loc[self.dtypes['TYPENAME'].str.upper().isin(num)].index)

    def _get_categorical_columns(self):
        """
        Get the columns of self that are considered as categorical. Their data
        type in the database determines whether these columns are categorical.
        The following data types are considered categorical:

            VARCHAR,CHARACTER, VARGRAPHIC, GRAPHIC, CLOB.

        Returns
        -------
        list
            List of categorical column names.

        Examples
        --------
        >>> ida_iris._get_categorical_columns()
        ['species']
        """
        cat = ['VARCHAR', 'CHARACTER', 'VARGRAPHIC', 'GRAPHIC', 'CLOB']
        return list(self.dtypes.loc[self.dtypes['TYPENAME'].str.upper().isin(cat)].index)

    def _prepare_and_execute(self, query, autocommit = True, silent = False):
        """
        Prepare and execute a query.

        Parameters
        ----------
        query : str
            Query to be executed.
        autocommit : bool
             If set to true, the autocommit function is available.
        """
        return self._idadb._prepare_and_execute(query, autocommit, silent)

    def _autocommit(self):
        """
        Autocommit the connection. If the environment variable ‘AUTOCOMMIT’ is
        set to True, the function commits the changes.

        Notes
        -----

        If you commit, all changes that are made in the database after the last
        commit, including those in the child IdaDataFrames, are commited.

        If the environment variable ‘VERBOSE’ is set to True, the autocommit
        operations are notified in the console.
        """
        self._idadb._autocommit()

    def _combine_check(self, other):
        """
        Check if self and other refer to the same table and if all columns in
        self and other are numeric. This sanity check is used before performing
        aggregation operations between IdaDataFrame/IdaSeries.
        """
        def check_numeric_columns(idaobject):
            not_valid = []
            numeric_columns = idaobject._get_numerical_columns()
            for column in idaobject.columns:
                if column not in numeric_columns:
                    not_valid.append(column)
            if not_valid:
                raise TypeError("Arithmetic operation are not defined for %s"%not_valid)
        if isinstance(other, IdaDataFrame) | isinstance(other, nzpyida.IdaSeries):
            if self._name != other._name:
                raise IdaDataFrameError("It is not possible to aggregate columns using columns of a different table.")
        
        if not(isinstance(other, IdaDataFrame) | isinstance(other, nzpyida.IdaSeries) | isinstance(other, Number)):
            if other is not None:
                raise TypeError("Aggregation makes only sense with numbers, "+
                                "or IdaDataFrames refering to the same table.")
        
        check_numeric_columns(self)
        if isinstance(other, nzpyida.IdaSeries)|isinstance(other, IdaDataFrame):
            check_numeric_columns(other)

    

[docs]
    def delete_na(self, columns, logic="any", inplace=False):
        """
        Filter rows containing NULL values. Can be done in a destructive way (rows are deleted in the physical table)
        or in a non destructive way (a new IdaDataFrame is defined, original table and IdaDataFrame are preserved). 
        
        Parameters
        -----------
        columns : list of strings
            list of eligible column names
        logic : str, optional
            "any" by default. 
            If logic is set to "any" then all rows which contain a NaN value in any 
            (id est at least one) of the cited columns is deleted, this is a union condition;
            if logic is set to "all", then only rows containing null values in all the fields are deleted, this is an intersection condition.         
        inplace : bool, optional 
            False by default. 
            If True, then the underlying table is physically modified,
            if False, then the original objects remain unaffected, a copy of the IdaDataFrame is made and modified. 
        
        Returns
        --------
        If inplace is True, the original table is modified and the IdaDataFrame will be modified accordingly. No return.
        If inplace is False, a new IdaDataFrame is returned, the original table is not affected. This new IdaDataFrame points to a 
        new table which has been created according to the user defined criterions on columns.
        
        Examples
        --------
        >>> idadf.delete_na(["COL_1", "COL_2"], inplace = True)
        >>> #each row of the original table is physically deleted if there is a NULL value in one of the listed columns
        >>> new_idadf = idadf.delete_na(["COL_1", "COL_2"], logic = 'all')
        >>> # a new IdaDataFrame is created, rows will be selected only if all the listed columns have a NULL value.
        """
        
        # Check if list columns is not empty
        if len(columns)<1:
            raise IdaDataFrameError("You must specify the columns as a list of eligible names")
        # Check if column names are eligible
        for column_name in columns:
            if not isinstance(column_name, six.string_types):
                raise TypeError("%s is not of string type"%(column_name))
            if column_name not in self.columns:
                raise ValueError("%s refers to a columns that doesn't exists in self"%(column_name))
        
        idadb = self._idadb
        
        if inplace == True:
            # Explain
            print("The table %s will be physically modified." %self.tablename)
            print("Any IdaDataFrame pointing this table might be modified accordingly.")
            # Write DELETE query on original table
            tablename = self.tablename
            query = 'DELETE FROM %s WHERE "%s" IS NULL'%(tablename, columns[0])
            
            if len(columns)>1:
                if logic == "any":
                    for col in columns[1:]:
                        query = query + ' OR "%s" IS NULL'%col
                if logic == "all":
                    for col in columns[1:]:
                        query = query + ' AND "%s" IS NULL'%col                        
            idadb.ida_query(query)

        else:            
            # SELECT statement on the original table to create a view
            tablename0 = self.tablename 
            query = 'SELECT * FROM %s WHERE "%s" IS NOT NULL'%(tablename0, columns[0])
            
            if len(columns)>1:
                if logic == "any":
                    for col in columns[1:]:
                        query = query + ' AND "%s" IS NOT NULL'%col
                if logic == "all":
                    for col in columns[1:]:
                        query = query + ' OR "%s" IS NOT NULL'%col                                               
            # Create view with this select statement
            viewname = idadb._get_valid_tablename(prefix="VIEW_")
            self._prepare_and_execute("CREATE VIEW " + viewname + " AS "+ query)
            # Initiate the modified table under a random name
            tablename = idadb._get_valid_tablename(prefix="DATA_FRAME_")
            with_data = '' if self._idadb._is_netezza_system() else ' WITH DATA'
            idadb._prepare_and_execute("CREATE TABLE %s AS (SELECT * FROM %s)%s"%(tablename,viewname,with_data))
            print('A new table with filtered rows is available under the name %s.' %tablename)
            print('The newly created IdaDataFrame refers to this new table.')
            # Drop the view 
            idadb.drop_view(viewname)            
            # Define a new IdaDataFrame pointing to the new table
            idadf = IdaDataFrame(idadb, tablename, indexer = self.indexer)
            return idadf