Skip to content

mecfs_bio.build_system.meta.read_spec.read_dataframe

Functions:

Attributes:

ValidBackend module-attribute

# Backend names accepted by the `parquet_backend` parameter of the scan functions below.
ValidBackend = Literal['ibis', 'polars', 'duckdb']

logger module-attribute

# Module-level logger; the scan functions emit debug messages through it.
logger = get_logger()

scan_dataframe

scan_dataframe(
    path: Path,
    spec: DataFrameReadSpec,
    parquet_backend: ValidBackend = "polars",
) -> nw.LazyFrame
Source code in mecfs_bio/build_system/meta/read_spec/read_dataframe.py
def scan_dataframe(
    path: Path, spec: DataFrameReadSpec, parquet_backend: ValidBackend = "polars"
) -> nw.LazyFrame:
    """
    Load the table stored at `path` as a lazy frame, as described by `spec`.

    The reader is selected from the type of `spec.format`:

    - `DataFrameParquetFormat`: scanned via `nw.scan_parquet` with
      `parquet_backend`.
    - `DataFrameTextFormat`: scanned via `pl.scan_csv`; any backend other
      than "polars" is rejected for this format.
    - `DataFrameWhiteSpaceSepTextFormat`: read eagerly with `pd.read_csv`
      using a whitespace regex separator, converted to polars, then made
      lazy. `parquet_backend` is not consulted in this branch.

    Raises:
        ValueError: if a `DataFrameTextFormat` is requested with a
            non-polars backend, or if `spec.format` matches none of the
            types above.
    """
    fmt = spec.format

    if isinstance(fmt, DataFrameParquetFormat):
        logger.debug(f"Scanning parquet asset at {path} with backend {parquet_backend}")
        return nw.scan_parquet(path, backend=parquet_backend)

    if isinstance(fmt, DataFrameTextFormat):
        # Callback handed to `with_column_names`: it ignores its argument and
        # always returns the names configured in the spec (None = no renaming).
        configured_names = fmt.column_names
        rename = (
            (lambda _current: configured_names)
            if configured_names is not None
            else None
        )
        if parquet_backend != "polars":
            raise ValueError("Only polars backend can be used to read text files")
        logger.debug(
            f"scanning text table asset at {path} with backend {parquet_backend}"
        )
        lazy_csv = pl.scan_csv(
            path,
            separator=fmt.separator,
            null_values=fmt.null_values,
            schema_overrides=fmt.schema_overrides,
            with_column_names=rename,
            has_header=fmt.has_header,
            skip_rows=fmt.skip_rows,
            comment_prefix=fmt.comment_char,
            infer_schema_length=10000,
        )
        return nw.from_native(lazy_csv)

    if isinstance(fmt, DataFrameWhiteSpaceSepTextFormat):
        # Only pass `names` to pandas when the spec supplies column names.
        read_kwargs: dict = {}
        if fmt.col_names is not None:
            read_kwargs["names"] = fmt.col_names
        pandas_table = pd.read_csv(
            path, sep=r"\s+", comment=fmt.comment_code, **read_kwargs
        )
        return nw.from_native(pl.from_pandas(pandas_table)).lazy()

    raise ValueError("Unknown format")

scan_dataframe_asset

scan_dataframe_asset(
    asset: Asset,
    meta: Meta,
    parquet_backend: ValidBackend = "polars",
) -> nw.LazyFrame

Use the information in an Asset's metadata to read the asset lazily as a DataFrame (returned as an `nw.LazyFrame`).

Source code in mecfs_bio/build_system/meta/read_spec/read_dataframe.py
def scan_dataframe_asset(
    asset: Asset, meta: Meta, parquet_backend: ValidBackend = "polars"
) -> nw.LazyFrame:
    """
    Use the information in an Asset's metadata to read it as a DataFrame.

    Only file-backed inputs are supported: `asset` must be a `FileAsset` and
    `meta` must be a `FileMeta`. After validation the call is delegated to
    `_scan_dataframe_asset`, forwarding `parquet_backend` unchanged.

    Raises:
        TypeError: if `asset` is not a `FileAsset` or `meta` is not a
            `FileMeta`.
    """
    # Explicit checks instead of `assert`: assertions are removed when Python
    # runs with -O, which would let unsupported inputs reach the helper.
    if not isinstance(asset, FileAsset):
        raise TypeError(
            f"asset must be a FileAsset, got {type(asset).__name__}"
        )
    if not isinstance(meta, FileMeta):
        raise TypeError(
            f"meta must be a FileMeta, got {type(meta).__name__}"
        )
    return _scan_dataframe_asset(asset, meta, parquet_backend)