Source code for banff.proc.proc_outlier

import logging
from pathlib import Path

import pandas
import pyarrow

from banff.io_util import (
    GensysInputDataset,
    GensysOutputDataset,
    c_argtype_input_dataset,
    c_argtype_output_dataset,
    c_argtype_parameters,
    remove_rows_where,
)
from banff.proc import BanffProcedure

#******CLASS DEFINITIONS************************************************************


[docs]
class ProcOutlier(BanffProcedure):
    """Identifies outlying observations using Hidiroglou-Berthelot or Sigma-Gap methods.

    This procedure offers two methods of univariate outlier detection. The Hidiroglou-Berthelot (HB)
    method selects outliers based on their distance from the median, relative to the interquartile
    distance. The Sigma-Gap (SG) method sorts the data in ascending order and searches for significant
    gaps (relative to the standard deviation) between consecutive values, selecting all subsequent
    values as outliers. Both methods can detect two types of outliers, which are flagged on the
    `outstatus` file:

    - Values that are extreme enough to be considered errors.
      These values are flagged as *fields to impute (FTI)* so they can be imputed in a subsequent step.
    - Values that are not extreme enough to be considered errors, but are sufficiently unusual to be
      deemed *fields to exclude (FTE)* by subsequent imputation procedures such as `donorimp` and
      `estimator`. (This flag can also be useful during weighting and robust estimation.)

    For both procedures, users must specify either an imputation or exclusion threshold; no default
    value is provided.

    Additional features of the procedure:

    - Users can run outlier detection on multiple variables (`var`) in one call.
    - Users can also run outlier detection on ratios of variables. In this case, only the numerators
      (`var`) are flagged on `outstatus`. For the denominator, users may select auxiliary variables
      (`with_var`) from the current period (`indata`) or from historical data (`indata_hist`).
    - Outlier detection can be performed to the right, left, or on both sides (`side`).
    - Outlier detection can be performed within by-groups (`by`), with a user-specified minimum number
      of observations (`min_obs`) required to perform outlier detection.
    """

    # static variables
    _proc_name = {"short": "outlier", "long": "Outlier"}
    _arg_types = [
        c_argtype_parameters(),  # parameters
        c_argtype_input_dataset(),  # indata
        c_argtype_input_dataset(),  # indata_hist

        c_argtype_output_dataset(),  # outstatus
        c_argtype_output_dataset(),  # outstatus
        c_argtype_output_dataset(),  # outstatus_detailed
        c_argtype_output_dataset(),  # outstatus_detailed
        c_argtype_output_dataset(),  # outsummary
        c_argtype_output_dataset(),  # outsummary
    ]

    def __init__(self,
            # USER C code parameters
            unit_id: str | None = None,
            weight: str | None = None,
            by: str | None = None,
            var: str | None = None,
            with_var: str | None = None,
            accept_negative: bool | None = None,
            no_by_stats: bool | None = None,
            accept_zero: bool | None = None,
            outlier_stats: bool | None = None,
            beta_e: float | None = None,
            beta_i: float | None = None,
            exponent: float | None = None,
            mdm: float | None = None,
            mei: float | None = None,
            mii: float | None = None,
            start_centile: float | None = None,
            min_obs: int | None = None,
            method: str | None = None,
            side: str | None = None,
            sigma: str | None = None,
            # USER dataset references
            indata: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            indata_hist: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            outstatus: Path | str | None = None,
            outstatus_detailed: Path | str | None = None,
            outsummary: Path | str | None = None,

            # Fancy New Options
            presort: bool | None = None,
            exclude_where_indata   = None,

            # super class options
            trace: int | bool | None = None,
            capture: bool | None = False,
            logger: logging.Logger | None = None,
            **kwargs,
        ):
        """Identifies outlying observations using Hidiroglou-Berthelot or Sigma-Gap methods.

        :param unit_id: Identify key variable (unit identifier) on indata and indata_hist. Mandatory.
        :type unit_id: str | None, optional
        :param weight: Variable to be used for weighting.
        :type weight: str | None, optional
        :param by: Variable(s) used to partition indata into by-groups for independent processing.
        :type by: str | None, optional
        :param var: Variables(s) for which to find outliers.
        :type var: str | None, optional
        :param with_var: Historical or auxiliary variables.
        :type with_var: str | None, optional
        :param accept_negative: Treat negative values as valid. Default=False.
        :type accept_negative: bool | None, optional
        :param no_by_stats: Reduces log output by suppressing by-group specific messages. Default=False.
        :type no_by_stats: bool | None, optional
        :param accept_zero: Treat zero values as valid. Default=False in the presence of historical or auxiliary variables, True otherwise.
        :type accept_zero: bool | None, optional
        :param outlier_stats: Add more information to outstatus_detailed output table, including imputation and exclusion interval bounds. Default=False.
        :type outlier_stats: bool | None, optional
        :param beta_e: SG multiplier for exclusion interval (non-negative).
        :type beta_e: float | None, optional
        :param beta_i: SG multiplier for imputation interval (non-negative).
        :type beta_i: float | None, optional
        :param exponent: HB exponent for a ratio or historical trend (between 0 and 1). Default=0.
        :type exponent: float | None, optional
        :param mdm: HB minimum distance multiplier (positive). Default=0.05.
        :type mdm: float | None, optional
        :param mei: HB Multiplier for exclusion interval (positive).
        :type mei: float | None, optional
        :param mii: HB multiplier for imputation interval (positive).
        :type mii: float | None, optional
        :param start_centile: SG centile to be used to determine the starting point (between 0 and 100). Default=75 for 'side="BOTH"', 0 otherwise.
        :type start_centile: float | None, optional
        :param min_obs: Minimum number of observations that must exist in the input table or in a by-group (positive). Default=3 for HB, 5 for SG.
        :type min_obs: int | None, optional
        :param method: Method to be used to detect outlying observations ('CURRENT', 'RATIO', 'HISTORIC' or 'SIGMAGAP'). Mandatory.
        :type method: str | None, optional
        :param side: Side ('LEFT', 'RIGHT', or 'BOTH') of the ordered data to be used for detecting outliers. Default='BOTH'.
        :type side: str | None, optional
        :param sigma: SG type of deviation ('MAD' or 'STD') to be calculated. Default='MAD'.
        :type sigma: str | None, optional
        :param indata: Input statistical data. Mandatory.
        :type indata: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param indata_hist: Input historical data.
        :type indata_hist: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param outstatus: Contains the status of the fields (FTE/FTI) identified as outliers and their values.
        :type outstatus: Path | str | None, optional
        :param outstatus_detailed: Detailed status for the outliers (ODER/ODEL/ODIR/ODIL).
        :type outstatus_detailed: Path | str | None, optional
        :param outsummary: Outlier summary information such as observation counts and acceptance interval bounds.
        :type outsummary: Path | str | None, optional
        :param presort: Sorts input tables before processing, according to procedure requirements. Default=True.
        :type presort: bool | None, optional
        :param exclude_where_indata: Expression in SQL syntax to exclude observations from the outlier detection.
        :type exclude_where_indata: _type_, optional
        :param trace: Control which log levels are included when using the default logger.
        :type trace: int | bool | None, optional
        :param capture: Configure how console output is displayed.
        :type capture: bool | None, optional
        :param logger: Custom logger to use for procedure execution.
        :type logger: logging.Logger | None, optional
        """  # noqa: D401,E501
        # USER C code parameters
        parm_dict = {}
        parm_dict["unit_id"]           = unit_id
        parm_dict["weight"]            = weight
        parm_dict["by"]                = by
        parm_dict["var"]               = var
        parm_dict["with_var"]          = with_var
        parm_dict["accept_negative"]   = accept_negative
        parm_dict["no_by_stats"]       = no_by_stats
        parm_dict["accept_zero"]       = accept_zero
        parm_dict["outlier_stats"]     = outlier_stats
        parm_dict["beta_e"]            = beta_e
        parm_dict["beta_i"]            = beta_i
        parm_dict["exponent"]          = exponent
        parm_dict["mdm"]               = mdm
        parm_dict["mei"]               = mei
        parm_dict["mii"]               = mii
        parm_dict["start_centile"]     = start_centile
        parm_dict["min_obs"]           = min_obs
        parm_dict["method"]            = method
        parm_dict["side"]              = side
        parm_dict["sigma"]             = sigma
        self.c_parms = parm_dict

        # INTERNAL dataset components (they store USER datasets/output specifications)
        self._indata             = GensysInputDataset("indata", indata)
        self._indata_hist        = GensysInputDataset("indata_hist", indata_hist)
        self._outstatus          = GensysOutputDataset("outstatus", outstatus)
        self._outstatus_detailed = GensysOutputDataset("outstatus_detailed", outstatus_detailed, mandatory=False)
        self._outsummary         = GensysOutputDataset("outsummary", outsummary, mandatory=False)

        # call super constructor
        super().__init__(
            trace=trace, capture=capture, logger=logger,
            input_datasets=[
                self._indata,
                self._indata_hist,
            ],
            output_datasets=[
                self._outstatus,
                self._outstatus_detailed,
                self._outsummary,
            ],
            presort=presort,
            prefill_by_vars=False,  # no input status dataset
            exclude_where_indata = exclude_where_indata,
            keyword_args=kwargs,
        )

    ##### property methods
    @property
    def indata(self):
        return self._get_input_dataset(self._indata)
    @indata.setter
    def indata(self, value):
        self._set_input_dataset(ds=self._indata, value=value)

    @property
    def indata_hist(self):
        return self._get_input_dataset(self._indata_hist)
    @indata_hist.setter
    def indata_hist(self, value):
        self._set_input_dataset(ds=self._indata_hist, value=value)

    @property
    def outstatus(self):
        return self._get_output_dataset(self._outstatus)
    @outstatus.setter
    def outstatus(self, value):
        self._set_output_dataset(ds=self._outstatus, value=value)

    @property
    def outstatus_detailed(self):
        return self._get_output_dataset(self._outstatus_detailed)
    @outstatus_detailed.setter
    def outstatus_detailed(self, value):
        self._set_output_dataset(ds=self._outstatus_detailed, value=value)

    @property
    def outsummary(self):
        return self._get_output_dataset(self._outsummary)
    @outsummary.setter
    def outsummary(self, value):
        self._set_output_dataset(ds=self._outsummary, value=value)

    def _call_c_code(self):
        return self._cproc_func(
            self._parm_dict,

            self._indata.c_arg,
            self._indata_hist.c_arg,


            self._outstatus.c_schema,
            self._outstatus.c_array,
            self._outstatus_detailed.c_schema,
            self._outstatus_detailed.c_array,
            self._outsummary.c_schema,
            self._outsummary.c_array,

        )

    def _pp_exclude_where_indata(self):
        """Remove rows matching user-specified criteria.

        Delete rows matching user-provided `exclude_where_indata` from indata.
        Based on Banff Processor 1.x `generateOutlier.sas`
        """
        self._indata.ds_intermediate = remove_rows_where(
            self._indata.ds_intermediate,
            where_stmt=self._exclude_where_indata,
        )