Source code for banffprocessor.metadata.models.estimators

"""Metadata model for Estimators."""

import duckdb
import numpy as np

from banffprocessor.exceptions import MetadataConstraintError
from banffprocessor.metadata.models.metadataclass import MetadataClass
from banffprocessor.nls import _



[docs]
def builtin_estimators() -> dict[str, str]:
    """Return built-in estimators as a dictionary, where the key is the name and the value is the type.

    The type is either ``"EF"`` (estimator function) or ``"LR"`` (linear regression).
    A fresh dictionary is built on every call, so callers may modify the result
    without affecting later calls.
    """
    return {
        "AUXTREND": "EF",
        "AUXTREND2": "EF",
        "CURAUX": "EF",
        "CURAUXMEAN": "EF",
        "CURMEAN": "EF",
        "CURSUM2": "EF",
        "CURSUM3": "EF",
        "CURSUM4": "EF",
        "DIFTREND": "EF",
        "PREAUX": "EF",
        "PREVALUE": "EF",
        "PREMEAN": "EF",
        # PREAUXMEAN is a historical auxiliary mean, i.e. an estimator function like
        # CURAUXMEAN and PREMEAN, not a regression.
        "PREAUXMEAN": "EF",
        "CURRATIO": "LR",
        "CURRATIO2": "LR",
        "CURREG": "LR",
        "CURREG_E2": "LR",
        "CURREG2": "LR",
        "CURREG3": "LR",
        "HISTREG": "LR",
    }



[docs]
class Estimators(MetadataClass):
    """Estimators metadata class."""

    def __init__(self, estimatorid: str, seqno: float, fieldid: str, algorithmname: str,
                 randomerror: str | bool | None,
                 auxvariables: str | None = None, weightvariable: str | None = None,
                 variancevariable: str | None = None, varianceexponent: float | None = None,
                 varianceperiod: str | None = None, excludeimputed: str | bool | None = None,
                 excludeoutliers: str | bool | None = None, countcriteria: int | None = None,
                 percentcriteria: float | None = None, dbconn: duckdb.DuckDBPyConnection = duckdb) -> None:
        """Validate the metadata entry and, if validation passes, insert it into the database.

        The three flag fields accept ``"Y"``/``"N"`` in any letter case, or an actual ``bool``.
        A flag that is ``None``, empty or made only of whitespace is treated as not provided.

        :param estimatorid: Identifier of the estimator; first part of the table's primary key.
        :param seqno: Sequence number; stored on the instance as a ``float``.
        :param fieldid: Identifier of the field the estimator applies to.
        :param algorithmname: Name of the algorithm to use.
        :param randomerror: Y/N flag; ``False`` when not provided.
        :param auxvariables: Optional auxiliary variables.
        :param weightvariable: Optional weight variable.
        :param variancevariable: Optional variance variable.
        :param varianceexponent: Optional variance exponent; converted to ``float``.
        :param varianceperiod: Optional variance period.
        :param excludeimputed: Y/N flag; ``None`` when not provided.
        :param excludeoutliers: Y/N flag; ``None`` when not provided.
        :param countcriteria: Optional count criteria; converted to ``int`` and must be greater than 0.
        :param percentcriteria: Optional percent criteria; converted to ``float`` and must lie
            strictly between 0 and 100.
        :param dbconn: Connection (or the ``duckdb`` module itself) used to run the INSERT.
        :raises MetadataConstraintError: If only some of the variance fields are provided, a flag
            is not one of the accepted values, or a criteria value is out of range.
        """
        table_name = self.__class__.__name__

        def constraint_error(detail: str) -> MetadataConstraintError:
            """Build the exception for a violated constraint, prefixed with the table name."""
            msg = _("Constraint violated in {} table: ").format(table_name)
            return MetadataConstraintError(msg + detail)

        def parse_flag(value: str | bool | None, label: str, default: bool | None) -> bool | None:
            """Convert a Y/N flag to a bool, returning ``default`` when the flag is missing or blank."""
            if value is None:
                return default
            if isinstance(value, bool):
                # Already a boolean: nothing to parse, and bool has no .upper().
                return value
            if not value.strip():
                # Empty or whitespace-only counts as "no value", whatever its length.
                return default
            flag = value.upper()
            if flag == "Y":
                return True
            if flag == "N":
                return False
            raise constraint_error(_("{} value must be one of {}.").format(label, " ' ', 'Y', 'N' "))

        self.estimatorid = estimatorid
        self.seqno = float(seqno)
        self.fieldid = fieldid
        self.algorithmname = algorithmname
        self.auxvariables = auxvariables
        self.weightvariable = weightvariable

        self.variancevariable = variancevariable
        self.varianceexponent = None if varianceexponent is None else float(varianceexponent)
        self.varianceperiod = varianceperiod

        # constraint invalidVariance: the three variance fields go together
        variance_empty = [self.variancevariable is None,
                          self.varianceexponent is None,
                          self.varianceperiod is None]
        if any(variance_empty) and not all(variance_empty):
            raise constraint_error(_("variance fields must either all be empty or all have values."))

        self.excludeimputed = parse_flag(excludeimputed, "ExcludeImputed", None)
        self.excludeoutliers = parse_flag(excludeoutliers, "ExcludeOutliers", None)

        # Field is mandatory and when used in generateEstimator.sas,
        # only checked for Y value, therefore default is False
        self.randomerror = parse_flag(randomerror, "RandomError", False)

        self.countcriteria = None if countcriteria is None else int(countcriteria)
        # constraint invalidCountCriteria
        if self.countcriteria is not None and self.countcriteria <= 0:
            raise constraint_error(_("{} value must be greater than {}.").format("CountCriteria","0"))

        self.percentcriteria = None if percentcriteria is None else float(percentcriteria)
        # constraint invalidPercentCriteria
        if self.percentcriteria is not None and not 0 < self.percentcriteria < 100:
            raise constraint_error(_("PercentCriteria value must be greater than 0 and less than 100."))

        # Note that the order of attributes must match the order in the create statement
        statement = f"INSERT INTO  banff.{self.__class__.__name__} VALUES (?, ?, ?, ?, ?, ?, ?, ?,?, ?, ?, ?, ?, ?)" # noqa: S608
        dbconn.execute(statement,[self.estimatorid, self.seqno, self.fieldid, self.auxvariables,
                                  self.weightvariable, self.variancevariable, self.varianceexponent, self.varianceperiod,
                                  self.excludeimputed, self.excludeoutliers, self.countcriteria, self.percentcriteria,
                                  self.randomerror, self.algorithmname])


[docs]
    @classmethod
    def initialize(cls, dbconn: duckdb.DuckDBPyConnection = duckdb) -> None:
        """Run the class-level setup, then create the duckdb table that stores this metadata.

        The table is created in the ``banff`` schema and is named after the class.
        """
        cls.setup(dbconn=dbconn)

        # The column order below is the order the constructor's positional INSERT relies on.
        dbconn.execute(f"""CREATE TABLE banff.{cls.__name__} (
            estimatorid VARCHAR NOT NULL,
            seqno INT NOT NULL,
            fieldid VARCHAR NOT NULL,
            auxvariables VARCHAR,
            weightvariable VARCHAR,
            variancevariable VARCHAR,
            varianceexponent Real,
            varianceperiod VARCHAR,
            excludeimputed BOOLEAN,
            excludeoutliers BOOLEAN,
            countcriteria  INT,
            percentcriteria REAL,
            randomerror BOOLEAN,
            algorithmname VARCHAR NOT NULL,
            PRIMARY KEY (estimatorid, seqno)
            )
        """)



[docs]
    def to_dict(self) -> dict[str, str | int | float]:
        """Return the object as a dictionary.

        Used for creating a Dataframe from the object. Explicitly makes all fields values reflect their type, if no value was provided.
        This way there is no possiblity of an incorrect datatype (character seen as numeric or vice versa) for any empty fields when the
        constructed dataframe is passed to the Banff package c-code.
        """
        return {
            "estimatorid": self.estimatorid if self.estimatorid else "",
            "seqno": self.seqno if self.seqno is not None else np.nan,
            "fieldid": self.fieldid if self.fieldid else "",
            "algorithmname": self.algorithmname if self.algorithmname else "",
            "randomerror": "Y" if self.randomerror else "N",
            "auxvariables": self.auxvariables if self.auxvariables else "",
            "weightvariable": self.weightvariable if self.weightvariable else "",
            "variancevariable": self.variancevariable if self.variancevariable else "",
            "varianceexponent": self.varianceexponent if self.varianceexponent is not None else np.nan,
            "varianceperiod": self.varianceperiod if self.varianceperiod else "",
            # Neither excludeimputed or excludeoutliers are used by the SAS processor at all strangely
            "excludeimputed": "Y" if self.excludeimputed else (" " if self.excludeimputed is None else "N"),
            "excludeoutliers": "Y" if self.excludeoutliers else (" " if self.excludeoutliers is None else "N"),
            "countcriteria": self.countcriteria if self.countcriteria is not None else np.nan,
            "percentcriteria": self.percentcriteria if self.percentcriteria is not None else np.nan,
        }




[docs]
    @staticmethod
    def get_schema(root_element_name: str = "banffProcessor") -> str:
        """Return schema (XSD) contents as a string.

        The schema declares a root element named ``root_element_name`` holding between 0 and
        5000 ``estimators`` elements. The fields of each ``estimators`` element are grouped in
        an ``xs:all``: ``estimatorid``, ``seqno``, ``fieldid``, ``randomerror`` and
        ``algorithmname`` are required, while the remaining fields are optional
        (``minOccurs="0"``) and nillable. The maximum length of ``fieldid``, ``weightvariable``
        and ``variancevariable`` is taken from ``MetadataClass.DATA_FIELD_SCHEMA_MAX_LENGTH``.

        :param root_element_name: Name given to the root element of the schema.
        :return: The text of the XSD document.
        """
        return f"""<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="{root_element_name}">
<xs:complexType>
<xs:sequence>
<xs:element name="estimators" maxOccurs="5000" minOccurs="0">
<xs:complexType>

<xs:all>
<xs:element name="estimatorid">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:minLength value="1"/>
<xs:maxLength value="100"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element name="seqno">
<xs:simpleType>
<xs:restriction base="xs:positiveInteger"></xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element name="fieldid">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:minLength value="1"/>
<xs:maxLength value="{MetadataClass.DATA_FIELD_SCHEMA_MAX_LENGTH}"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="auxvariables" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:minLength value="1"/>
<xs:maxLength value="1000"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="weightvariable" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:minLength value="0"/>
<xs:maxLength value="{MetadataClass.DATA_FIELD_SCHEMA_MAX_LENGTH}"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="variancevariable" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:minLength value="0"/>
<xs:maxLength value="{MetadataClass.DATA_FIELD_SCHEMA_MAX_LENGTH}"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="varianceexponent" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:float"></xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="varianceperiod" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:enumeration value="C"/>
<xs:enumeration value="c"/>
<xs:enumeration value="H"/>
<xs:enumeration value="h"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="excludeimputed" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:enumeration value="Y"/>
<xs:enumeration value="N"/>
<xs:enumeration value="y"/>
<xs:enumeration value="n"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="excludeoutliers" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:enumeration value="Y"/>
<xs:enumeration value="N"/>
<xs:enumeration value="y"/>
<xs:enumeration value="n"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="countcriteria" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:integer">
<xs:minExclusive value="0"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element minOccurs="0" name="percentcriteria" nillable="true">
<xs:simpleType>
<xs:restriction base="xs:float">
<xs:minExclusive value="0"/>
<xs:maxExclusive value="100"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element name="randomerror">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:enumeration value="Y"/>
<xs:enumeration value="N"/>
<xs:enumeration value="y"/>
<xs:enumeration value="n"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
<xs:element name="algorithmname">
<xs:simpleType>
<xs:restriction base="xs:string">
<xs:minLength value="1"/>
<xs:maxLength value="100"/>
</xs:restriction>
</xs:simpleType>
</xs:element>
</xs:all>

</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>"""