# Source code for banff.proc.proc_massimpu

import logging
from pathlib import Path

import pandas
import pyarrow

from banff.io_util import (
    GensysInputDataset,
    GensysOutputDataset,
    c_argtype_input_dataset,
    c_argtype_output_dataset,
    c_argtype_parameters,
)
from banff.proc import BanffProcedure

#******CLASS DEFINITIONS************************************************************

class ProcMassimpu(BanffProcedure):
    """Donor imputation for a block of variables, by nearest neighbour or at random.

    The `massimp` procedure targets the situation where a large block of
    variables is missing for a set of respondents, typically because detailed
    information was collected only for a subsample (or second phase sample)
    of units.

    Whereas `donorimp` uses both system and user matching fields, mass
    imputation relies on user matching fields alone to find the valid record
    (donor) most similar to the record needing imputation (recipient).

    * A *recipient* is any record for which all of the variables to impute
      (`must_impute`) are missing on `indata`.
    * A *donor* is any record for which none of the listed variables are
      missing.

    When matching fields (`must_match`) are supplied, the nearest donor is
    located with the same distance function as `donorimp`. When they are not
    supplied, a donor is selected at random.

    Unlike `donorimp`, `massimp` does not use edits. Before running the
    procedure, users should ensure that the pool of potential donors does not
    include any errors, including outliers or consistency errors.

    By-groups may be created by specifying `by` variables; they act as
    imputation classes. The `min_donors` and `percent_donors` parameters help
    ensure an appropriate number or ratio of recipients and donors exists in
    each imputation class before imputation is performed.
    """

    # static variables
    _proc_name = {"short": "massimpu", "long": "Mass Imputation"}

    # One entry per positional argument passed in `_call_c_code`; every output
    # dataset contributes two consecutive entries (its c_schema, then its c_array).
    _arg_types = [
        c_argtype_parameters(),      # parameters
        c_argtype_input_dataset(),   # indata
        c_argtype_output_dataset(),  # outdata (schema)
        c_argtype_output_dataset(),  # outdata (array)
        c_argtype_output_dataset(),  # outstatus (schema)
        c_argtype_output_dataset(),  # outstatus (array)
        c_argtype_output_dataset(),  # outdonormap (schema)
        c_argtype_output_dataset(),  # outdonormap (array)
    ]

    def __init__(
        self,
        # USER C code parameters
        accept_negative: bool | None = None,
        no_by_stats: bool | None = None,
        random: bool | None = None,
        mrl: float | None = None,
        percent_donors: float | None = None,
        min_donors: int | None = None,
        n_limit: int | None = None,
        seed: int | None = None,
        unit_id: str | None = None,
        by: str | None = None,
        must_impute: str | None = None,
        must_match: str | None = None,
        # USER dataset references
        indata: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
        outdata: Path | str | None = None,
        outstatus: Path | str | None = None,
        outdonormap: Path | str | None = None,
        # Fancy New Options
        presort: bool | None = None,
        # super class options
        trace: int | bool | None = None,
        capture: bool | None = False,
        logger: logging.Logger | None = None,
        **kwargs,
    ):
        """Performs donor imputation for a block of variables using a nearest neighbour approach or random selection.

        :param accept_negative: Treat negative values as valid. Default=False.
        :type accept_negative: bool | None, optional
        :param no_by_stats: Reduces log output by suppressing by-group specific messages. Default=False.
        :type no_by_stats: bool | None, optional
        :param random: Random selection of donors.
        :type random: bool | None, optional
        :param mrl: Multiplier ratio limit.
        :type mrl: float | None, optional
        :param percent_donors: Minimum percentage of donors required to perform imputation. Default=30.
        :type percent_donors: float | None, optional
        :param min_donors: Minimum number of donors required to perform imputation. Default=30.
        :type min_donors: int | None, optional
        :param n_limit: Maximum number of times a donor can be used.
        :type n_limit: int | None, optional
        :param seed: Specify the root for the random number generator.
        :type seed: int | None, optional
        :param unit_id: Identify key variable (unit identifier) on indata. Mandatory.
        :type unit_id: str | None, optional
        :param by: Variable(s) used to partition indata into by-groups for independent processing.
        :type by: str | None, optional
        :param must_impute: Variable(s) to be imputed. Mandatory.
        :type must_impute: str | None, optional
        :param must_match: User defined matching field(s).
        :type must_match: str | None, optional
        :param indata: Input statistical data. Mandatory.
        :type indata: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
        :param outdata: Output statistical table containing imputed data.
        :type outdata: Path | str | None, optional
        :param outstatus: Output status file identifying imputed fields with IMAS status flags, and their values after imputation.
        :type outstatus: Path | str | None, optional
        :param outdonormap: Output table of recipient-donor pairs for successfully imputed records.
        :type outdonormap: Path | str | None, optional
        :param presort: Sorts input tables before processing, according to procedure requirements. Default=True.
        :type presort: bool | None, optional
        :param trace: Control which log levels are included when using the default logger.
        :type trace: int | bool | None, optional
        :param capture: Configure how console output is displayed.
        :type capture: bool | None, optional
        :param logger: Custom logger to use for procedure execution.
        :type logger: logging.Logger | None, optional
        """  # noqa: D401,E501
        # USER C code parameters, gathered into a single mapping
        # (insertion order matches the order of the signature).
        self.c_parms = {
            "accept_negative": accept_negative,
            "no_by_stats": no_by_stats,
            "random": random,
            "mrl": mrl,
            "percent_donors": percent_donors,
            "min_donors": min_donors,
            "n_limit": n_limit,
            "seed": seed,
            "unit_id": unit_id,
            "by": by,
            "must_impute": must_impute,
            "must_match": must_match,
        }

        # INTERNAL dataset components: wrappers holding the USER-supplied
        # input dataset and output specifications.
        self._indata = GensysInputDataset("indata", indata)
        self._outdata = GensysOutputDataset("outdata", outdata)
        self._outdonormap = GensysOutputDataset("outdonormap", outdonormap)
        self._outstatus = GensysOutputDataset("outstatus", outstatus)

        # Hand the datasets and the remaining options to the super constructor.
        super().__init__(
            trace=trace,
            capture=capture,
            logger=logger,
            input_datasets=[self._indata],
            output_datasets=[
                self._outdata,
                self._outstatus,
                self._outdonormap,
            ],
            presort=presort,
            prefill_by_vars=False,  # no input status dataset
            keyword_args=kwargs,
        )

    ##### property methods
    # Each property delegates to the dataset accessor helpers
    # (`_get_*_dataset` / `_set_*_dataset`) for its wrapped dataset.

    @property
    def indata(self):
        """Input statistical data, as returned by `_get_input_dataset`."""
        return self._get_input_dataset(self._indata)

    @indata.setter
    def indata(self, value):
        self._set_input_dataset(ds=self._indata, value=value)

    @property
    def outdata(self):
        """Output statistical table, as returned by `_get_output_dataset`."""
        return self._get_output_dataset(self._outdata)

    @outdata.setter
    def outdata(self, value):
        self._set_output_dataset(ds=self._outdata, value=value)

    @property
    def outstatus(self):
        """Output status table, as returned by `_get_output_dataset`."""
        return self._get_output_dataset(self._outstatus)

    @outstatus.setter
    def outstatus(self, value):
        self._set_output_dataset(ds=self._outstatus, value=value)

    @property
    def outdonormap(self):
        """Output recipient-donor table, as returned by `_get_output_dataset`."""
        return self._get_output_dataset(self._outdonormap)

    @outdonormap.setter
    def outdonormap(self, value):
        self._set_output_dataset(ds=self._outdonormap, value=value)

    def _call_c_code(self):
        """Call `_cproc_func` with the parameters, the input dataset and the outputs.

        Positional arguments are, in order: `_parm_dict`, the input dataset's
        `c_arg`, then `c_schema` followed by `c_array` for `outdata`,
        `outstatus` and `outdonormap` (the same layout as `_arg_types`).
        """
        c_func = self._cproc_func
        call_args = [self._parm_dict, self._indata.c_arg]
        for out_ds in (self._outdata, self._outstatus, self._outdonormap):
            call_args.append(out_ds.c_schema)
            call_args.append(out_ds.c_array)
        return c_func(*call_args)