# Source code for banff.proc.proc_prorate

import logging
from pathlib import Path

import pandas
import pyarrow

from banff.io_util import (
    GensysInputDataset,
    GensysOutputDataset,
    c_argtype_input_dataset,
    c_argtype_output_dataset,
    c_argtype_parameters,
)
from banff.proc import BanffProcedure

#******CLASS DEFINITIONS************************************************************


class ProcProrate(BanffProcedure):
    """Prorates and rounds records to satisfy user-specified edits.

    Unlike other Banff procedures, the `edits` for this procedure follow specific criteria: only
    equalities are permitted, and the set of edits must form a hierarchical structure that sums
    to a grand-total. For example:

    ```plaintext
    subtotal1 + subtotal2 = grandtotal
    a + b + c = subtotal1
    d + e + f = subtotal2
    ```

    Each individual edit must consist of a set of components `x(i)` that sum to a total `y`, i.e.,
    of the form `x(1) + ... + x(n) = y`. Inequalities and constants are not permitted. For each individual
    edit equation that is not satisfied, one of the two prorating algorithms (`basic` or `scaling`) is
    applied in order to rake the components to match the total. The procedure takes a top-down approach,
    beginning with the grand-total (which is never changed) and adjusting components as necessary, until
    the full set of edits is satisfied. Missing values are not prorated; they are set to zero during the
    procedure and reset to missing afterwards. Values of zero are never altered.

    Additional features:

    * Automatic rounding to the desired number of decimal places.
    * Optional bounds to constrain the relative change of values during prorating.
    * Control over which variables are eligible for prorating.
    * Option to limit prorating to original or previously imputed values, either globally or for individual variables.
    * Weights to adjust the relative change of individual variables.
    """

    # static variables
    _proc_name = {"short": "prorate", "long": "Prorate"}
    # One entry per positional argument passed in `_call_c_code`, in the same order.
    # Each output dataset occupies two consecutive slots: its schema, then its array.
    _arg_types = [
        c_argtype_parameters(),  # parameters
        c_argtype_input_dataset(),  # indata
        c_argtype_input_dataset(),  # instatus

        c_argtype_output_dataset(),  # outdata (schema)
        c_argtype_output_dataset(),  # outdata (array)
        c_argtype_output_dataset(),  # outstatus (schema)
        c_argtype_output_dataset(),  # outstatus (array)
        c_argtype_output_dataset(),  # outreject (schema)
        c_argtype_output_dataset(),  # outreject (array)
    ]

    def __init__(self,
            # USER C code parameters
            accept_negative: bool | None = None,
            no_by_stats: bool | None = None,
            verify_edits: bool | None = None,
            lower_bound: float | None = None,
            upper_bound: float | None = None,
            decimal: int | None = None,
            edits: str | None = None,
            method: str | None = None,
            modifier: str | None = None,
            unit_id: str | None = None,
            by: str | None = None,

            # USER dataset references
            indata: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            instatus: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            outstatus: Path | str | None = None,
            outdata: Path | str | None = None,
            outreject: Path | str | None = None,

            # Fancy New Options
            presort: bool | None = None,
            prefill_by_vars: bool | None = None,

            # super class options
            trace: int | bool | None = None,
            capture: bool | None = False,
            logger: logging.Logger | None = None,
            **kwargs,
        ) -> None:
        """Prorates and rounds records to satisfy user-specified edits.

        :param accept_negative: Treat negative values as valid. Default=False.
        :type accept_negative: bool | None, optional
        :param no_by_stats: Reduce log output by suppressing by-group specific messages. Default=False.
        :type no_by_stats: bool | None, optional
        :param verify_edits: Verify the consistency of the edits without performing any prorating.
            When exactly `True`, the `outdata`, `outstatus` and `outreject` arguments are not used. Default=False.
        :type verify_edits: bool | None, optional
        :param lower_bound: Lower bound on the relative change of the variables. Default = 0.
        :type lower_bound: float | None, optional
        :param upper_bound: Upper bound on the relative change of the variables.
        :type upper_bound: float | None, optional
        :param decimal: Number of decimals used in the rounding algorithm (between 0 and 9). Default=0.
        :type decimal: int | None, optional
        :param edits: List of edits that the prorating procedure must satisfy. Mandatory.
        :type edits: str | None, optional
        :param method: Prorating method ("SCALING" or "BASIC"). Default = "BASIC".
        :type method: str | None, optional
        :param modifier: Global modifier ("ALWAYS", "IMPUTED", "ORIGINAL") to control which values are prorated. Default = "ALWAYS"
        :type modifier: str | None, optional
        :param unit_id: Identify key variable (unit identifier) on indata. Mandatory.
        :type unit_id: str | None, optional
        :param by: Variable(s) used to partition indata into by-groups for independent processing.
        :type by: str | None, optional
        :param indata: Input statistical data. Mandatory.
        :type indata: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param instatus: Input status file containing I-- status flags.
        :type instatus: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param outstatus: Output status file identifying imputed fields with IPR status flags, and their values after imputation.
        :type outstatus: Path | str | None, optional
        :param outdata: Output statistical table containing imputed data.
        :type outdata: Path | str | None, optional
        :param outreject: Output table containing records that failed prorating.
        :type outreject: Path | str | None, optional
        :param presort: Sort input tables before processing, according to procedure requirements. Default=True.
        :type presort: bool | None, optional
        :param prefill_by_vars: Add by-group variable(s) to input status file to improve performance. Default=True.
        :type prefill_by_vars: bool | None, optional
        :param trace: Control which log levels are included when using the default logger.
        :type trace: int | bool | None, optional
        :param capture: Configure how console output is displayed.
        :type capture: bool | None, optional
        :param logger: Custom logger to use for procedure execution.
        :type logger: logging.Logger | None, optional
        :param kwargs: Additional keyword arguments, forwarded to the superclass as `keyword_args`.
        """  # noqa: D401,E501
        # USER C code parameters, stored under the same keys as the constructor arguments
        self.c_parms = {
            "accept_negative": accept_negative,
            "no_by_stats": no_by_stats,
            "verify_edits": verify_edits,
            "lower_bound": lower_bound,
            "upper_bound": upper_bound,
            "decimal": decimal,
            "edits": edits,
            "method": method,
            "modifier": modifier,
            "unit_id": unit_id,
            "by": by,
        }

        # INTERNAL dataset components (they store USER datasets/output specifications)
        self._indata = GensysInputDataset("indata", indata)
        self._instatus = GensysInputDataset("instatus", instatus)

        if verify_edits is True:
            # Verify-only mode: the output datasets are created as non-mandatory with
            # `output_specification=False`; the user-supplied output arguments are not used.
            self._outdata = GensysOutputDataset("outdata", output_specification=False, mandatory=False)
            self._outreject = GensysOutputDataset("outreject", output_specification=False, mandatory=False)
            self._outstatus = GensysOutputDataset("outstatus", output_specification=False, mandatory=False)
        else:
            self._outdata = GensysOutputDataset("outdata", outdata)
            self._outreject = GensysOutputDataset("outreject", outreject)
            self._outstatus = GensysOutputDataset("outstatus", outstatus)

        # call super constructor
        super().__init__(
            trace=trace, capture=capture, logger=logger,
            input_datasets=[
                self._indata,
                self._instatus,
            ],
            output_datasets=[
                self._outdata,
                self._outstatus,
                self._outreject,
            ],
            presort=presort,
            prefill_by_vars=prefill_by_vars,
            keyword_args=kwargs,
        )

    ##### property methods
    @property
    def indata(self):
        """Return the `indata` input dataset, as resolved by `_get_input_dataset`."""
        return self._get_input_dataset(self._indata)

    @indata.setter
    def indata(self, value) -> None:
        self._set_input_dataset(ds=self._indata, value=value)

    @property
    def instatus(self):
        """Return the `instatus` input dataset, as resolved by `_get_input_dataset`."""
        return self._get_input_dataset(self._instatus)

    @instatus.setter
    def instatus(self, value) -> None:
        self._set_input_dataset(ds=self._instatus, value=value)

    @property
    def outdata(self):
        """Return the `outdata` output dataset, as resolved by `_get_output_dataset`."""
        return self._get_output_dataset(self._outdata)

    @outdata.setter
    def outdata(self, value) -> None:
        self._set_output_dataset(ds=self._outdata, value=value)

    @property
    def outstatus(self):
        """Return the `outstatus` output dataset, as resolved by `_get_output_dataset`."""
        return self._get_output_dataset(self._outstatus)

    @outstatus.setter
    def outstatus(self, value) -> None:
        self._set_output_dataset(ds=self._outstatus, value=value)

    @property
    def outreject(self):
        """Return the `outreject` output dataset, as resolved by `_get_output_dataset`."""
        return self._get_output_dataset(self._outreject)

    @outreject.setter
    def outreject(self, value) -> None:
        self._set_output_dataset(ds=self._outreject, value=value)

    def _call_c_code(self):
        """Invoke the C procedure function and return its result.

        Passes the parameters, the two input dataset arguments, and a schema/array
        pair for each output dataset, in the order described by `_arg_types`.
        """
        # NOTE(review): `_cproc_func` and `_parm_dict` are not assigned in this class;
        # presumably the base class provides them (the latter derived from `c_parms`) - confirm.
        return self._cproc_func(
            self._parm_dict,

            self._indata.c_arg,
            self._instatus.c_arg,

            self._outdata.c_schema,
            self._outdata.c_array,
            self._outstatus.c_schema,
            self._outstatus.c_array,
            self._outreject.c_schema,
            self._outreject.c_array,
        )

    def get_sort_list(self, include_by: bool = True, include_unit_id: bool = True):
        """Call superclass implementation using custom default values.

        :param include_by: Forwarded to the superclass as `include_by`. Default=True.
        :type include_by: bool, optional
        :param include_unit_id: Forwarded to the superclass as `include_unit_id`. Default=True.
        :type include_unit_id: bool, optional
        :return: Whatever the superclass `get_sort_list` returns.
        """
        return super().get_sort_list(
            include_by=include_by,
            include_unit_id=include_unit_id,
        )