# Source code for banff.proc.proc_errorloc

import logging
from pathlib import Path

import pandas
import pyarrow

from banff.io_util import (
    GensysInputDataset,
    GensysOutputDataset,
    c_argtype_input_dataset,
    c_argtype_output_dataset,
    c_argtype_parameters,
)
from banff.proc import BanffProcedure

#******CLASS DEFINITIONS************************************************************

class ProcErrorloc(BanffProcedure):
    """For each record, selects the minimum number of variables to impute such that each observation can be made to pass all edits.

    Consistency edits specify relationships between variables that a record must
    satisfy. When a record fails to satisfy these relationships, users must
    choose which variables to change, a process known as error localization.
    The Banff error localization procedure follows the Fellegi-Holt
    minimum-change principle, and uses an algorithm to select which variables
    to change. This process is performed independently on each record.
    Selected values are saved in the outstatus file, with a status flag of FTI
    (Field to impute).

    This procedure requires a set of edits, consisting of linear equalities and
    inequalities, that must be internally consistent. The procedure will only
    perform error localization on the variables included in the list of edits.
    Any missing values from amongst the listed variables will automatically be
    selected for imputation.

    By default, the procedure will minimize the number of variables to change.
    Users may also specify variable weights, in which case the procedure will
    minimize the weighted count of variables to change. For some records, the
    error localization problem may have multiple solutions (i.e., choices of
    variables) that satisfy the minimum-change principle; in this case one of
    the solutions is selected at random.
    """

    # static variables
    _proc_name = {"short": "errorloc", "long": "Error Localization"}
    # Ordered C argument layout for the native procedure. Each output dataset
    # appears twice because it is marshalled as a (schema, array) pair — the
    # order here must mirror the call in _call_c_code.
    _arg_types = [
        c_argtype_parameters(),      # parameters
        c_argtype_input_dataset(),   # indata
        c_argtype_input_dataset(),   # instatus
        c_argtype_output_dataset(),  # outstatus
        c_argtype_output_dataset(),  # outstatus
        c_argtype_output_dataset(),  # outreject
        c_argtype_output_dataset(),  # outreject
    ]

    def __init__(
            self,
            # USER C code parameters
            unit_id: str | None = None,
            by: str | None = None,
            rand_num_var: str | None = None,
            edits: str | None = None,
            weights: str | None = None,
            cardinality: float | None = None,
            time_per_obs: float | None = None,
            seed: int | None = None,
            display_level: int | None = None,
            accept_negative: bool | None = None,
            no_by_stats: bool | None = None,
            # USER dataset references
            indata: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            instatus: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
            outstatus: Path | str | None = None,
            outreject: Path | str | None = None,
            # Fancy New Options
            presort: bool | None = None,
            prefill_by_vars: bool | None = None,
            # super class options
            trace: int | bool | None = None,
            capture: bool | None = False,
            logger: logging.Logger | None = None,
            **kwargs,
    ):
        """For each record, selects the minimum number of variables to impute such that each observation can be made to pass all edits.

        :param unit_id: Identify key variable (unit identifier) on indata. Mandatory.
        :type unit_id: str | None, optional
        :param by: Variable(s) used to partition indata into by-groups for independent processing.
        :type by: str | None, optional
        :param rand_num_var: Specify a random number variable to be used when having to make a choice during error localization.
        :type rand_num_var: str | None, optional
        :param edits: List of consistency edits. Mandatory.
        :type edits: str | None, optional
        :param weights: Specify the error localization weights.
        :type weights: str | None, optional
        :param cardinality: Specify the maximum cardinality.
        :type cardinality: float | None, optional
        :param time_per_obs: Specify the maximum processing time allowed per observation.
        :type time_per_obs: float | None, optional
        :param seed: Specify the root for the random number generator.
        :type seed: int | None, optional
        :param display_level: Value (0 or 1) to request detail output to the log in relation to the random number variable. Default=0.
        :type display_level: int | None, optional
        :param accept_negative: Treat negative values as valid. Default=False.
        :type accept_negative: bool | None, optional
        :param no_by_stats: Reduce log output by suppressing by-group specific messages. Default=False.
        :type no_by_stats: bool | None, optional
        :param indata: Input statistical data. Mandatory.
        :type indata: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param instatus: Input status file containing FTI status flags.
        :type instatus: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param outstatus: Output status file identifying selected fields with FTI status flags, and their values.
        :type outstatus: Path | str | None, optional
        :param outreject: Output table containing records that failed error localization.
        :type outreject: Path | str | None, optional
        :param presort: Sort input tables before processing, according to procedure requirements. Default=True.
        :type presort: bool | None, optional
        :param prefill_by_vars: Add by-group variable(s) to input status file(s) to improve performance. Default=True.
        :type prefill_by_vars: bool | None, optional
        :param trace: Control which log levels are included when using the default logger.
        :type trace: int | bool | None, optional
        :param capture: Configure how console output is displayed.
        :type capture: bool | None, optional
        :param logger: Custom logger to use for procedure execution.
        :type logger: logging.Logger | None, optional
        """  # noqa: D401,E501
        # USER C code parameters, keyed exactly as the C layer expects them.
        # NOTE(review): _call_c_code reads self._parm_dict — presumably the
        # c_parms setter on the superclass populates it; confirm in BanffProcedure.
        self.c_parms = {
            "unit_id": unit_id,
            "by": by,
            "rand_num_var": rand_num_var,
            "edits": edits,
            "weights": weights,
            "cardinality": cardinality,
            "time_per_obs": time_per_obs,
            "seed": seed,
            "display_level": display_level,
            "accept_negative": accept_negative,
            "no_by_stats": no_by_stats,
        }

        # INTERNAL dataset components (they store USER datasets/output specifications)
        self._indata = GensysInputDataset("indata", indata)
        self._instatus = GensysInputDataset("instatus", instatus)
        self._outreject = GensysOutputDataset("outreject", outreject)
        self._outstatus = GensysOutputDataset("outstatus", outstatus)

        # call super constructor
        super().__init__(
            trace=trace,
            capture=capture,
            logger=logger,
            input_datasets=[
                self._indata,
                self._instatus,
            ],
            output_datasets=[
                self._outstatus,
                self._outreject,
            ],
            presort=presort,
            prefill_by_vars=prefill_by_vars,
            keyword_args=kwargs,
        )

    ##### property methods

    @property
    def indata(self):
        """Input statistical data."""
        return self._get_input_dataset(self._indata)

    @indata.setter
    def indata(self, value):
        self._set_input_dataset(ds=self._indata, value=value)

    @property
    def instatus(self):
        """Input status file containing FTI status flags."""
        return self._get_input_dataset(self._instatus)

    @instatus.setter
    def instatus(self, value):
        self._set_input_dataset(ds=self._instatus, value=value)

    @property
    def outstatus(self):
        """Output status file of selected fields (FTI flags) and their values."""
        return self._get_output_dataset(self._outstatus)

    @outstatus.setter
    def outstatus(self, value):
        self._set_output_dataset(ds=self._outstatus, value=value)

    @property
    def outreject(self):
        """Output table of records that failed error localization."""
        return self._get_output_dataset(self._outreject)

    @outreject.setter
    def outreject(self, value):
        self._set_output_dataset(ds=self._outreject, value=value)

    def _call_c_code(self):
        # Argument order must mirror _arg_types: parameters, two inputs, then
        # a (schema, array) pair for each output dataset.
        return self._cproc_func(
            self._parm_dict,
            self._indata.c_arg,
            self._instatus.c_arg,
            self._outstatus.c_schema,
            self._outstatus.c_array,
            self._outreject.c_schema,
            self._outreject.c_array,
        )

    def get_sort_list(self, include_by=True, include_unit_id=True):
        """Call superclass implementation using custom default values."""
        return super().get_sort_list(
            include_by=include_by,
            include_unit_id=include_unit_id,
        )