Source code for banff._common.src.testing.data_helper
import contextlib
import math
import pyarrow as pa
[docs]
def PAT_from_string(foo, sep=None):
"""Generate a pyarrow table from a text string.
This function aids in the creation of control datasets for test cases.
It generates a pyarrow table from a specially crafted multi-line text string which
provides the table's column names, types, and values.
For example, a table having 1 string and 5 numeric columns with 2 rows of data,
including an empty string in the ident column and missing value in Q4:
'''
s n n n n n
ident total Q1 Q2 Q3 Q4
REC04 1000 150 250 130 250
'' 1001 151 251 131 NaN
'''
Empty lines are ignored
1st line describes the column types
s- string
n- numeric
NOTE: types are always separated by whitespace, regardless of `sep=`
2nd line names the columns
subsequent lines (if any) provide values for each row
Whitespace is used to delimit values in each row by default.
specify `sep` to override: example `sep=','` to use commas
When fewer types (1st line) are provided that columns, the final type is applied
to all remaining columns
i.e. the type line above with "s n" would produce the same result
Empty Strings:
When using the default `sep=None`, empty string values should be specified as
empty single quotes (''), which are converted to an empty string.
Alternatively (and if the value SHOULD be two single quotes), specify `sep=` to use a
non-whitespace character (like a comma).
"""
col_values=[]
# for each row in string, discarding empty lines
for i, row in enumerate(foo.strip().splitlines()):
if i == 0: # extract type specifiers ('n'- numeric, 's' - string)
type_list = row.split()
elif i == 1: # extract column names
col_names = row.split(sep=sep)
# add placeholders to values, build schema type list
sch_type_list = []
cur_type="<not specified>"
for j, name in enumerate(col_names):
# add placeholder for column's values
col_values.append([])
# determine column's type
with contextlib.suppress(IndexError): # use last extracted type if `IndexError` occurs
cur_type = type_list[j].lower()
# add column to schema type list
match cur_type:
case "n":
sch_type_list.append((name, pa.float64()))
case "s":
sch_type_list.append((name, pa.large_string()))
case _:
mesg = f"invalid value in type list: {cur_type}"
raise ValueError(mesg)
else: # extract values for each column
for j, val in enumerate(row.split(sep=sep)):
# determine column type
with contextlib.suppress(IndexError): # use last extracted type if `IndexError` occurs
cur_type = type_list[j].lower()
if cur_type == "n":
if len(val) == 0 or math.isnan(float(val)):
col_values[j].append(None) # use `None` instead of `NaN` so pyarrow marks it as missing
else:
col_values[j].append(float(val))
else:
if sep is None and val == "''":
val=""
col_values[j].append(val)
# create schema
sch = pa.schema(sch_type_list)
# create table
pat = pa.table(data=col_values, schema=sch)
return pat