import logging
from pathlib import Path
import pandas
import pyarrow
from banff.io_util import (
GensysInputDataset,
GensysOutputDataset,
c_argtype_input_dataset,
c_argtype_output_dataset,
c_argtype_parameters,
)
from banff.proc import BanffProcedure
#******CLASS DEFINITIONS************************************************************
class ProcMassimpu(BanffProcedure):
"""Performs donor imputation for a block of variables using a nearest neighbour approach or random selection.
The `massimp` procedure is intended for use when a large block of variables is missing for a set of
respondents, typically when detailed information is collected only for a subsample (or second phase
sample) of units. While the `donorimp` procedure uses both system and user matching fields,
mass imputation only considers user matching fields to find a valid record (donor) that is most
similar to the one which needs imputation (recipient).
Mass imputation considers a recipient any record for which all the variables to impute (`must_impute`)
are missing on `indata`, and considers a donors any record for which none of the listed variables are
missing. If matching fields (`must_match`) are provided by the user, the `massimp` procedure uses them
to find the nearest donor using the same distance function as `donorimp`. If matching fields are not
provided, a donor is selected at random.
Unlike `donorimp`, the `massimp` procedure does not use edits. Before running the procedure, users
should ensure that the pool of potential donors do not include any errors, including outliers or
consistency errors.
Users may create by-groups by specifying `by` variables. These by-groups act as imputation classes.
Use the `min_donors` and `percent_donors` parameters to ensure an appropriate number or ratio of
recipients and donors exist in each imputation class before performing imputation.
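
    A minimal sketch of typical usage (the import path, the column names, and the toy data below are
    illustrative assumptions, not part of the procedure)::

        import pandas
        from banff.proc import ProcMassimpu  # assumed import path

        indata = pandas.DataFrame({
            "ident": ["R01", "R02", "R03", "R04"],
            "staff": [12, 15, 14, 13],          # matching field, reported by every unit
            "Q1": [100.0, 110.0, None, 95.0],   # detailed block, fully missing for R03
            "Q2": [200.0, 210.0, None, 190.0],
            "Q3": [300.0, 310.0, None, 290.0],
        })

        massimp = ProcMassimpu(
            unit_id="ident",
            must_impute="Q1 Q2 Q3",   # a record is a recipient only if all three are missing
            must_match="staff",       # nearest-neighbour matching field
            min_donors=1,
            percent_donors=1,
            indata=indata,
        )
        # once the procedure has executed, results are exposed through the
        # ``outdata``, ``outstatus`` and ``outdonormap`` properties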
"""
# static variables
_proc_name = {"short": "massimpu", "long": "Mass Imputation"}
    _arg_types = [
        c_argtype_parameters(),      # parameters
        c_argtype_input_dataset(),   # indata
        c_argtype_output_dataset(),  # outdata (schema)
        c_argtype_output_dataset(),  # outdata (array)
        c_argtype_output_dataset(),  # outstatus (schema)
        c_argtype_output_dataset(),  # outstatus (array)
        c_argtype_output_dataset(),  # outdonormap (schema)
        c_argtype_output_dataset(),  # outdonormap (array)
    ]
def __init__(self,
# USER C code parameters
accept_negative: bool | None = None,
no_by_stats: bool | None = None,
random: bool | None = None,
mrl: float | None = None,
percent_donors: float | None = None,
min_donors: int | None = None,
n_limit: int | None = None,
seed: int | None = None,
unit_id: str | None = None,
by: str | None = None,
must_impute: str | None = None,
must_match: str | None = None,
# USER dataset references
indata: pyarrow.Table | pandas.DataFrame | Path | str | None = None,
outdata: Path | str | None = None,
outstatus: Path | str | None = None,
outdonormap: Path | str | None = None,
# Fancy New Options
presort: bool | None = None,
# super class options
trace: int | bool | None = None,
capture: bool | None = False,
logger: logging.Logger | None = None,
**kwargs,
):
"""Performs donor imputation for a block of variables using a nearest neighbour approach or random selection.
:param accept_negative: Treat negative values as valid. Default=False.
:type accept_negative: bool | None, optional
:param no_by_stats: Reduces log output by suppressing by-group specific messages. Default=False.
:type no_by_stats: bool | None, optional
:param random: Random selection of donors.
:type random: bool | None, optional
:param mrl: Multiplier ratio limit.
:type mrl: float | None, optional
:param percent_donors: Minimum percentage of donors required to perform imputation. Default=30.
:type percent_donors: float | None, optional
:param min_donors: Minimum number of donors required to perform imputation. Default=30.
:type min_donors: int | None, optional
:param n_limit: Maximum number of times a donor can be used.
:type n_limit: int | None, optional
        :param seed: Specify the seed for the random number generator.
:type seed: int | None, optional
:param unit_id: Identify key variable (unit identifier) on indata. Mandatory.
:type unit_id: str | None, optional
:param by: Variable(s) used to partition indata into by-groups for independent processing.
:type by: str | None, optional
        :param must_impute: Variable(s) to be imputed. Mandatory.
:type must_impute: str | None, optional
:param must_match: User defined matching field(s).
:type must_match: str | None, optional
:param indata: Input statistical data. Mandatory.
:type indata: pyarrow.Table | pandas.DataFrame | Path | str | None, optional
:param outdata: Output statistical table containing imputed data.
:type outdata: Path | str | None, optional
:param outstatus: Output status file identifying imputed fields with IMAS status flags, and their values after imputation.
:type outstatus: Path | str | None, optional
:param outdonormap: Output table of recipient-donor pairs for successfully imputed records.
:type outdonormap: Path | str | None, optional
:param presort: Sorts input tables before processing, according to procedure requirements. Default=True.
:type presort: bool | None, optional
:param trace: Control which log levels are included when using the default logger.
:type trace: int | bool | None, optional
:param capture: Configure how console output is displayed.
:type capture: bool | None, optional
:param logger: Custom logger to use for procedure execution.
:type logger: logging.Logger | None, optional
""" # noqa: D401,E501
# USER C code parameters
parm_dict = {}
parm_dict["accept_negative"] = accept_negative
parm_dict["no_by_stats"] = no_by_stats
parm_dict["random"] = random
parm_dict["mrl"] = mrl
parm_dict["percent_donors"] = percent_donors
parm_dict["min_donors"] = min_donors
parm_dict["n_limit"] = n_limit
parm_dict["seed"] = seed
parm_dict["unit_id"] = unit_id
parm_dict["by"] = by
parm_dict["must_impute"] = must_impute
parm_dict["must_match"] = must_match
self.c_parms = parm_dict
# INTERNAL dataset components (they store USER datasets/output specifications)
self._indata = GensysInputDataset("indata", indata)
self._outdata = GensysOutputDataset("outdata", outdata)
self._outdonormap = GensysOutputDataset("outdonormap", outdonormap)
self._outstatus = GensysOutputDataset("outstatus", outstatus)
# call super constructor
super().__init__(
trace=trace, capture=capture, logger=logger,
input_datasets=[
self._indata,
],
output_datasets=[
self._outdata,
self._outstatus,
self._outdonormap,
],
presort=presort,
prefill_by_vars=False, # no input status dataset
keyword_args=kwargs,
)
##### property methods
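    # Thin accessors that delegate to the BanffProcedure helpers so callers can
    # read a result (e.g. ``self.outdata``) or replace a dataset reference by name.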
@property
def indata(self):
return self._get_input_dataset(self._indata)
@indata.setter
def indata(self, value):
self._set_input_dataset(ds=self._indata, value=value)
@property
def outdata(self):
return self._get_output_dataset(self._outdata)
@outdata.setter
def outdata(self, value):
self._set_output_dataset(ds=self._outdata, value=value)
@property
def outstatus(self):
return self._get_output_dataset(self._outstatus)
@outstatus.setter
def outstatus(self, value):
self._set_output_dataset(ds=self._outstatus, value=value)
@property
def outdonormap(self):
return self._get_output_dataset(self._outdonormap)
@outdonormap.setter
def outdonormap(self, value):
self._set_output_dataset(ds=self._outdonormap, value=value)
def _call_c_code(self):
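        # Positional arguments must line up with ``_arg_types``: the parameter
        # dictionary, the input dataset, then a schema/array pair for each
        # output dataset.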
return self._cproc_func(
self._parm_dict,
self._indata.c_arg,
self._outdata.c_schema,
self._outdata.c_array,
self._outstatus.c_schema,
self._outstatus.c_array,
self._outdonormap.c_schema,
self._outdonormap.c_array,
)