Source code for banff._common.src.testing.data_helper

import contextlib
import math

import pyarrow as pa



[docs]
def PAT_from_string(foo, sep=None):
    """Generate a pyarrow table from a text string.

    This function aids in the creation of control datasets for test cases.
    It generates a pyarrow table from a specially crafted multi-line text string which
    provides the table's column names, types, and values.

    For example, a table having 1 string and 5 numeric columns with 2 rows of data,
    including an empty string in the ident column and missing value in Q4:
        '''
        s     n      n   n   n   n
        ident total  Q1  Q2  Q3  Q4
        REC04 1000  150 250 130 250
        ''    1001  151 251 131 NaN
        '''

    Empty lines are ignored
    1st line describes the column types
        s- string
        n- numeric
        NOTE: types are always separated by whitespace, regardless of `sep=`
    2nd line names the columns
    subsequent lines (if any) provide values for each row

    Whitespace is used to delimit values in each row by default.
        specify `sep` to override: example `sep=','` to use commas

    When fewer types (1st line) are provided that columns, the final type is applied
    to all remaining columns
        i.e. the type line above with "s n" would produce the same result

    Empty Strings:
        When using the default `sep=None`, empty string values should be specified as
        empty single quotes (''), which are converted to an empty string.
        Alternatively (and if the value SHOULD be two single quotes), specify `sep=` to use a
        non-whitespace character (like a comma).
    """
    col_values=[]
    # for each row in string, discarding empty lines
    for i, row in enumerate(foo.strip().splitlines()):
        if i == 0:  # extract type specifiers ('n'- numeric, 's' - string)
            type_list = row.split()
        elif i == 1:  # extract column names
            col_names = row.split(sep=sep)

            # add placeholders to values, build schema type list
            sch_type_list = []
            cur_type="<not specified>"
            for j, name in enumerate(col_names):
                # add placeholder for column's values
                col_values.append([])

                # determine column's type
                with contextlib.suppress(IndexError):  # use last extracted type if `IndexError` occurs
                    cur_type = type_list[j].lower()

                # add column to schema type list
                match cur_type:
                    case "n":
                        sch_type_list.append((name, pa.float64()))
                    case "s":
                        sch_type_list.append((name, pa.large_string()))
                    case _:
                        mesg = f"invalid value in type list: {cur_type}"
                        raise ValueError(mesg)
        else:  # extract values for each column
            for j, val in enumerate(row.split(sep=sep)):
                # determine column type
                with contextlib.suppress(IndexError):  # use last extracted type if `IndexError` occurs
                    cur_type = type_list[j].lower()

                if cur_type == "n":
                    if len(val) == 0 or math.isnan(float(val)):
                        col_values[j].append(None)  # use `None` instead of `NaN` so pyarrow marks it as missing
                    else:
                        col_values[j].append(float(val))
                else:
                    if sep is None and val == "''":
                        val=""
                    col_values[j].append(val)

    # create schema
    sch = pa.schema(sch_type_list)
    # create table
    pat = pa.table(data=col_values, schema=sch)

    return pat