Source code for abraxos.extract

"""CSV reading utilities with bad line recovery."""

from __future__ import annotations

import collections.abc as a
import typing as t

import pandas as pd

__all__ = ['ReadCsvResult', 'read_csv', 'read_csv_chunks']


class ReadCsvResult(t.NamedTuple):
    """
    Outcome of a CSV read: the rejected lines plus the parsed data.

    Attributes
    ----------
    bad_lines : list of list of str
        Lines that could not be parsed correctly, one inner list per line.
    dataframe : pandas.DataFrame
        The portion of the CSV file that was parsed.
    """

    # Field order is part of the tuple interface: (bad_lines, dataframe).
    bad_lines: list[list[str]]
    dataframe: pd.DataFrame


def read_csv_chunks(
    path: str,
    chunksize: int,
    **kwargs: t.Any,
) -> a.Generator[ReadCsvResult, None, None]:
    """
    Read a CSV file in chunks and capture malformed lines.

    Parameters
    ----------
    path : str
        Path to the CSV file.
    chunksize : int
        Number of rows per chunk.
    **kwargs : dict
        Additional arguments passed to `pandas.read_csv`. Any
        ``on_bad_lines`` or ``engine`` entry is overwritten, because both
        are needed to collect the malformed lines.

    Yields
    ------
    ReadCsvResult
        A named tuple containing the bad lines collected since the previous
        yield and the parsed DataFrame for the chunk.

    Notes
    -----
    The underlying pandas reader is used as a context manager, so the file
    is closed when the generator is exhausted, when it is closed early, or
    when an exception propagates out of it.

    Examples
    --------
    >>> for result in read_csv_chunks('data.csv', chunksize=100):
    ...     print(result.bad_lines)
    ...     print(result.dataframe)
    """
    bad_lines: list[list[str]] = []
    # pandas calls `on_bad_lines` once per malformed line; `list.append`
    # returns None, which tells pandas to skip that line. The python engine
    # is forced because the default C engine does not accept a callable here.
    kwargs.update({"on_bad_lines": bad_lines.append, "engine": "python"})
    with pd.read_csv(path, chunksize=chunksize, **kwargs) as reader:
        for chunk in reader:
            # The callback is bound to this exact list object, so hand the
            # caller a snapshot and empty the list in place (no rebinding).
            yield ReadCsvResult(bad_lines.copy(), chunk)
            bad_lines.clear()


def read_csv_all(
    path: str,
    **kwargs: t.Any,
) -> ReadCsvResult:
    """
    Read a whole CSV file in one pass and capture malformed lines.

    Parameters
    ----------
    path : str
        Path to the CSV file.
    **kwargs : dict
        Additional arguments passed to `pandas.read_csv`. Any
        ``on_bad_lines`` or ``engine`` entry is overwritten.

    Returns
    -------
    ReadCsvResult
        A named tuple containing the bad lines and the parsed DataFrame.

    Examples
    --------
    >>> result = read_csv_all('data.csv')
    >>> print(result.bad_lines)
    >>> print(result.dataframe)
    """
    rejected: list[list[str]] = []
    # Each malformed line is passed to the callback; appending returns None,
    # so pandas leaves the line out of the resulting frame.
    kwargs["on_bad_lines"] = rejected.append
    kwargs["engine"] = "python"
    frame: pd.DataFrame = pd.read_csv(path, **kwargs)
    return ReadCsvResult(bad_lines=rejected, dataframe=frame)


def read_csv(
    path: str,
    *,
    chunksize: int | None = None,
    **kwargs: t.Any,
) -> ReadCsvResult | a.Generator[ReadCsvResult, None, None]:
    """
    Read a CSV file, whole or in chunks, capturing malformed lines.

    Parameters
    ----------
    path : str
        Path to the CSV file.
    chunksize : int, optional
        Number of rows per chunk. If given, the file is read in chunks.
        If None (default), the entire file is read at once.
    **kwargs : dict
        Additional arguments passed to `pandas.read_csv`.

    Returns
    -------
    ReadCsvResult or Generator of ReadCsvResult
        A single ReadCsvResult when `chunksize` is None; otherwise a
        generator yielding one ReadCsvResult per chunk.

    Examples
    --------
    >>> result = read_csv('data.csv')
    >>> print(result.bad_lines)
    >>> print(result.dataframe)

    >>> for result in read_csv('data.csv', chunksize=50):
    ...     print(result.bad_lines)
    ...     print(result.dataframe)
    """
    # No chunk size requested: load everything in one pass.
    if chunksize is None:
        return read_csv_all(path, **kwargs)
    return read_csv_chunks(path, chunksize, **kwargs)