Skip to content

mecfs_bio.build_system.task.extract_dataframe_from_rdata_task

Classes:

  • ExtractDataFrameFromRDataTask

    RData files can bundle together many R objects. This is a Task to extract a single dataframe from such a file.

ExtractDataFrameFromRDataTask

Bases: Task

RData files can bundle together many R objects. This Task extracts a single data frame, identified by r_dataframe_name, from such a file and writes it out as a Parquet file.

Methods:

Attributes:

deps property

deps: list[Task]

meta property

meta: Meta

r_dataframe_name instance-attribute

r_dataframe_name: str

r_package_list instance-attribute

r_package_list: Sequence[str]

rdata_file_task instance-attribute

rdata_file_task: Task

create classmethod

create(
    asset_id: str,
    rdata_file_task: Task,
    r_dataframe_name: str,
    r_package_list: Sequence[str],
)
Source code in mecfs_bio/build_system/task/extract_dataframe_from_rdata_task.py
@classmethod
def create(
    cls,
    asset_id: str,
    rdata_file_task: Task,
    r_dataframe_name: str,
    r_package_list: Sequence[str],
):
    """Build the task, deriving its output metadata from the upstream task.

    The produced asset reuses the upstream ``group`` and ``sub_group``, is
    placed under the ``processed`` sub-folder, and is declared as a Parquet
    dataframe.

    Args:
        asset_id: Identifier given to the asset this task produces.
        rdata_file_task: Task whose meta is used as the template for the
            output meta; stored on the new task.
        r_dataframe_name: Name of the R object to extract; stored on the
            new task.
        r_package_list: R package names; stored on the new task.

    Raises:
        ValueError: If the upstream task's meta is not a
            ``ReferenceFileMeta``.
    """
    upstream_meta = rdata_file_task.meta
    # Only reference-file metadata can be mapped to an output location.
    if not isinstance(upstream_meta, ReferenceFileMeta):
        raise ValueError(f"Unknown meta: {upstream_meta}")

    parquet_meta = ReferenceFileMeta(
        group=upstream_meta.group,
        sub_group=upstream_meta.sub_group,
        sub_folder=PurePath("processed"),
        id=AssetId(asset_id),
        extension=".parquet",
        read_spec=DataFrameReadSpec(DataFrameParquetFormat()),
    )
    return cls(
        meta=parquet_meta,
        rdata_file_task=rdata_file_task,
        r_dataframe_name=r_dataframe_name,
        r_package_list=r_package_list,
    )

execute

execute(scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset
Source code in mecfs_bio/build_system/task/extract_dataframe_from_rdata_task.py
def execute(self, scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset:
    """Load the RData file in R and write the named object out as Parquet.

    Args:
        scratch_dir: Directory in which ``df.parquet`` is written.
        fetch: Callable used to obtain the upstream RData asset by id.
        wf: Not used by this task.

    Returns:
        A ``FileAsset`` pointing at the Parquet file in ``scratch_dir``.
    """
    # Import the requested R packages before touching the RData file.
    for package_name in self.r_package_list:
        importr(package_name)
    converter = ro.default_converter + pandas2ri.converter + numpy2ri.converter

    source_asset = fetch(self.rdata_file_task.asset_id)
    assert isinstance(source_asset, FileAsset)

    # NOTE(review): R's `load` is called without an explicit environment, so
    # the file's objects presumably land in R's global environment and the
    # lookup below reads from there -- confirm this is intended.
    r_load: Any = robjects.r["load"]
    r_load(str(source_asset.path))
    r_object = robjects.r[self.r_dataframe_name]

    # Convert the R object to a Python object under the combined converter.
    with localconverter(converter):
        frame: pd.DataFrame = ro.conversion.get_conversion().rpy2py(r_object)

    parquet_path = scratch_dir / "df.parquet"
    frame.to_parquet(parquet_path)
    return FileAsset(parquet_path)