Source code for banff.proc.proc_prorate

import logging
from pathlib import Path

import pandas
import pyarrow

from banff.io_util import (
    GensysInputDataset,
    GensysOutputDataset,
    c_argtype_input_dataset,
    c_argtype_output_dataset,
    c_argtype_parameters,
)
from banff.proc import BanffProcedure

#******CLASS DEFINITIONS************************************************************

class ProcProrate(BanffProcedure):
    """Prorate and round records so that they satisfy user-specified edits.

    The `edits` accepted by this procedure are more restricted than those of
    other Banff procedures: only equalities are allowed, and together the
    edits must form a hierarchy that sums to a grand-total. For example:

    ```plaintext
    subtotal1 + subtotal2 = grandtotal
    a + b + c = subtotal1
    d + e + f = subtotal2
    ```

    Every edit is a set of components `x(i)` summing to a total `y`, i.e. it
    has the form `x(1) + ... x(n) = y`. Inequalities and constants are not
    permitted.

    Whenever an individual edit equation is not satisfied, one of the two
    prorating algorithms (`basic` or `scaling`) rakes the components so they
    match the total. Processing is top-down: it starts from the grand-total
    (which is never changed) and adjusts components as needed until the full
    set of edits holds.

    Missing values are not prorated; they are set to zero during the
    procedure and reset to missing afterwards. Values of zero are never
    altered.

    Additional features:

    * Automatic rounding to the desired number of decimal places.
    * Optional bounds constraining the relative change of values.
    * Control over which variables are eligible for prorating.
    * Option to limit prorating to original or previously imputed values,
      either globally or for individual variables.
    * Weights to adjust the relative change of individual variables.
    """

    # static variables
    _proc_name = {"short": "prorate", "long": "Prorate"}

    # One C argument type per argument passed by `_call_c_code`, same order.
    _arg_types = [
        c_argtype_parameters(),  # parameters
        # indata, instatus
        *(c_argtype_input_dataset() for _ in range(2)),
        # outdata, outstatus, outreject: each one is passed as two
        # arguments (c_schema, c_array), hence 3 * 2 entries
        *(c_argtype_output_dataset() for _ in range(3 * 2)),
    ]

    def __init__(self,
            # USER C code parameters
            accept_negative: bool | None = None,
            no_by_stats: bool | None = None,
            verify_edits: bool | None = None,
            lower_bound: float | None = None,
            upper_bound: float | None = None,
            decimal: int | None = None,
            edits: str | None = None,
            method: str | None = None,
            modifier: str | None = None,
            unit_id: str | None = None,
            by: str | None = None,

            # USER dataset references
            indata: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            instatus: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            outstatus: Path | str | None = None,
            outdata: Path | str | None = None,
            outreject: Path | str | None = None,

            # Fancy New Options
            presort: bool | None = None,
            prefill_by_vars: bool | None = None,

            # super class options
            trace: int | bool | None = None,
            capture: bool | None = False,
            logger: logging.Logger | None = None,
            **kwargs,
        ):
        """Create a prorate procedure from user parameters and dataset references.

        :param accept_negative: Treat negative values as valid. Default=False.
        :type accept_negative: bool | None, optional
        :param no_by_stats: Reduce log output by suppressing by-group specific
            messages. Default=False.
        :type no_by_stats: bool | None, optional
        :param verify_edits: Verify the consistency of the edits without
            performing any prorating. Default=False.
        :type verify_edits: bool | None, optional
        :param lower_bound: Lower bound on the relative change of the
            variables. Default = 0.
        :type lower_bound: float | None, optional
        :param upper_bound: Upper bound on the relative change of the
            variables.
        :type upper_bound: float | None, optional
        :param decimal: Number of decimals used in the rounding algorithm
            (between 0 and 9). Default=0.
        :type decimal: int | None, optional
        :param edits: List of edits that the prorating procedure must
            satisfy. Mandatory.
        :type edits: str | None, optional
        :param method: Prorating method ("SCALING" or "BASIC").
            Default = "BASIC".
        :type method: str | None, optional
        :param modifier: Global modifier ("ALWAYS", "IMPUTED", "ORIGINAL") to
            control which values are prorated. Default = "ALWAYS"
        :type modifier: str | None, optional
        :param unit_id: Identify key variable (unit identifier) on indata.
            Mandatory.
        :type unit_id: str | None, optional
        :param by: Variable(s) used to partition indata into by-groups for
            independent processing.
        :type by: str | None, optional
        :param indata: Input statistical data. Mandatory.
        :type indata: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param instatus: Input status file containing I-- status flags.
        :type instatus: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param outstatus: Output status file identifying imputed fields with
            IPR status flags, and their values after imputation.
        :type outstatus: Path | str | None, optional
        :param outdata: Output statistical table containing imputed data.
        :type outdata: Path | str | None, optional
        :param outreject: Output table containing records that failed
            prorating.
        :type outreject: Path | str | None, optional
        :param presort: Sort input tables before processing, according to
            procedure requirements. Default=True.
        :type presort: bool | None, optional
        :param prefill_by_vars: Add by-group variable(s) to input status file
            to improve performance. Default=True.
        :type prefill_by_vars: bool | None, optional
        :param trace: Control which log levels are included when using the
            default logger.
        :type trace: int | bool | None, optional
        :param capture: Configure how console output is displayed.
        :type capture: bool | None, optional
        :param logger: Custom logger to use for procedure execution.
        :type logger: logging.Logger | None, optional
        """  # noqa: D401,E501
        # USER C code parameters, keyed by parameter name
        self.c_parms = {
            "accept_negative": accept_negative,
            "no_by_stats": no_by_stats,
            "verify_edits": verify_edits,
            "lower_bound": lower_bound,
            "upper_bound": upper_bound,
            "decimal": decimal,
            "edits": edits,
            "method": method,
            "modifier": modifier,
            "unit_id": unit_id,
            "by": by,
        }

        # INTERNAL dataset components (they store USER datasets/output specifications)
        self._indata = GensysInputDataset("indata", indata)
        self._instatus = GensysInputDataset("instatus", instatus)

        if verify_edits is True:
            # Verification performs no prorating, so the user's output
            # specifications are deliberately not forwarded here.
            unrequested = {"output_specification": False, "mandatory": False}
            self._outdata = GensysOutputDataset("outdata", **unrequested)
            self._outreject = GensysOutputDataset("outreject", **unrequested)
            self._outstatus = GensysOutputDataset("outstatus", **unrequested)
        else:
            self._outdata = GensysOutputDataset("outdata", outdata)
            self._outreject = GensysOutputDataset("outreject", outreject)
            self._outstatus = GensysOutputDataset("outstatus", outstatus)

        # call super constructor
        procedure_inputs = [self._indata, self._instatus]
        procedure_outputs = [self._outdata, self._outstatus, self._outreject]
        super().__init__(
            trace=trace,
            capture=capture,
            logger=logger,
            input_datasets=procedure_inputs,
            output_datasets=procedure_outputs,
            presort=presort,
            prefill_by_vars=prefill_by_vars,
            keyword_args=kwargs,
        )

    ##### property methods
    @property
    def indata(self):
        """Input statistical data, read through the `indata` dataset component."""
        return self._get_input_dataset(self._indata)

    @indata.setter
    def indata(self, value):
        self._set_input_dataset(ds=self._indata, value=value)

    @property
    def instatus(self):
        """Input status data, read through the `instatus` dataset component."""
        return self._get_input_dataset(self._instatus)

    @instatus.setter
    def instatus(self, value):
        self._set_input_dataset(ds=self._instatus, value=value)

    @property
    def outdata(self):
        """Output statistical data, read through the `outdata` dataset component."""
        return self._get_output_dataset(self._outdata)

    @outdata.setter
    def outdata(self, value):
        self._set_output_dataset(ds=self._outdata, value=value)

    @property
    def outstatus(self):
        """Output status data, read through the `outstatus` dataset component."""
        return self._get_output_dataset(self._outstatus)

    @outstatus.setter
    def outstatus(self, value):
        self._set_output_dataset(ds=self._outstatus, value=value)

    @property
    def outreject(self):
        """Rejected records, read through the `outreject` dataset component."""
        return self._get_output_dataset(self._outreject)

    @outreject.setter
    def outreject(self, value):
        self._set_output_dataset(ds=self._outreject, value=value)

    def _call_c_code(self):
        """Invoke the procedure's C function and return its result.

        Arguments are passed in the order declared by `_arg_types`: the
        parameters, the two input datasets, then a (schema, array) pair for
        each of outdata, outstatus and outreject.
        """
        c_function = self._cproc_func
        c_arguments = [
            self._parm_dict,
            self._indata.c_arg,
            self._instatus.c_arg,
        ]
        for output_ds in (self._outdata, self._outstatus, self._outreject):
            c_arguments.append(output_ds.c_schema)
            c_arguments.append(output_ds.c_array)
        return c_function(*c_arguments)

    def get_sort_list(self, include_by=True, include_unit_id=True):
        """Delegate to the superclass `get_sort_list`, both flags defaulting to True."""
        sort_options = {
            "include_by": include_by,
            "include_unit_id": include_unit_id,
        }
        return super().get_sort_list(**sort_options)