Source code for abraxos.transform
"""DataFrame transformation with error isolation."""
from __future__ import annotations
import collections.abc as a
import typing as t
import pandas as pd
from abraxos import utils
__all__ = ['TransformResult', 'transform']
[docs]
class TransformResult(t.NamedTuple):
    """
    Outcome of running a transformer over a DataFrame.

    The three fields are positional, so a result can be unpacked as
    ``errors, errored_df, success_df = result``.

    Attributes
    ----------
    errors : list of Exception
        Exceptions captured while the transformation was attempted.
    errored_df : pandas.DataFrame
        Rows for which the transformation failed.
    success_df : pandas.DataFrame
        Rows for which the transformation succeeded.
    """

    errors: list[Exception]
    errored_df: pd.DataFrame
    success_df: pd.DataFrame
[docs]
def transform(
    df: pd.DataFrame,
    transformer: a.Callable[[pd.DataFrame], pd.DataFrame],
    chunks: int = 2
) -> TransformResult:
    """
    Apply a transformation function to a DataFrame with error isolation.

    The transformer is first tried on the whole DataFrame. If that raises
    and the DataFrame has more than one row, the DataFrame is split with
    ``utils.split(df, chunks)`` and each piece is processed recursively, so
    failures are narrowed down to the smallest chunks that still fail.
    A DataFrame of at most one row that raises is reported as errored,
    together with the exception it raised. The transformer is invoked
    exactly once per attempted chunk.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame to transform.
    transformer : Callable[[pd.DataFrame], pd.DataFrame]
        A function that transforms a DataFrame and returns a new DataFrame.
    chunks : int, optional
        Number of subchunks to divide the DataFrame into if transformation
        fails (default is 2). NOTE(review): the value is passed straight to
        ``utils.split`` without validation; values below 2 presumably do
        not shrink the chunk and would recurse without progress -- confirm
        against ``utils.split``.

    Returns
    -------
    TransformResult
        A named tuple with:

        - errors: A list of exceptions that occurred during transformation.
        - errored_df: A DataFrame of rows that could not be transformed.
        - success_df: A DataFrame of successfully transformed rows.

    Examples
    --------
    >>> import pandas as pd
    >>> def double_values(df): return df.assign(value=df['value'] * 2)
    >>> df = pd.DataFrame({'value': [1, 2, 3]})
    >>> result = transform(df, double_values)
    >>> result.success_df
       value
    0      2
    1      4
    2      6
    >>> result.errored_df.empty
    True
    """
    try:
        # Fast path: the whole frame transforms cleanly, nothing errored.
        # utils.clear(df) is presumably an empty frame shaped like df.
        return TransformResult([], utils.clear(df), transformer(df))
    except Exception as exc:
        if len(df) <= 1:
            # Cannot split any further: report this frame as failed, reusing
            # the exception already caught instead of calling the
            # transformer a second time just to capture it.
            return TransformResult([exc], df, utils.clear(df))

    # The frame failed as a whole but can still be divided: recurse into
    # each piece and merge the partial results.
    errors: list[Exception] = []
    errored_dfs: list[pd.DataFrame] = []
    success_dfs: list[pd.DataFrame] = []
    for df_c in utils.split(df, chunks):
        result: TransformResult = transform(df_c, transformer, chunks)
        errors.extend(result.errors)
        errored_dfs.append(result.errored_df)
        success_dfs.append(result.success_df)

    return TransformResult(
        errors,
        pd.concat(errored_dfs),
        pd.concat(success_dfs)
    )