Skip to content

mecfs_bio.build_system.task.specificity_cepo_task

Classes:

  • PrepareSpecificityCepo

    Task to compute the specificity of genes for cell types using the CEPO specificity metric.

Attributes:

DIFFERENTIAL_STABILITY module-attribute

DIFFERENTIAL_STABILITY = 'differential_stability'

PrepareSpecificityCepo

Bases: Task

Task to compute the specificity of genes for cell types using the CEPO specificity metric.

The CEPO metric is based on differential stability: a gene is considered specific for a cell type if its expression is stable in that cell type, but not in other cell types.

see Kim, Hani Jieun, et al. "Cepo uncovers cell identity through differential stability." bioRxiv (2021): 2021-01.

Methods:

Attributes:

cell_col instance-attribute

cell_col: str

cell_type_col instance-attribute

cell_type_col: str

count_col instance-attribute

count_col: str

deps property

deps: list[Task]

epsilon class-attribute instance-attribute

epsilon: float = 0.0001

gene_col instance-attribute

gene_col: str

long_count_df_task instance-attribute

long_count_df_task: Task

meta property

meta: Meta

min_cells_per_type class-attribute instance-attribute

min_cells_per_type: int = 0

out_format class-attribute instance-attribute

out_format: OutFormat = ParquetOutFormat()

post_pipe class-attribute instance-attribute

post_pipe: DataProcessingPipe = IdentityPipe()

pre_pipe class-attribute instance-attribute

pre_pipe: DataProcessingPipe = IdentityPipe()

create classmethod

create(
    asset_id: str,
    long_count_df_task: Task,
    cell_type_col: str,
    count_col: str,
    gene_col: str,
    cell_col: str,
    min_cells_per_type: int,
    epsilon: float = 0.0001,
    out_format: OutFormat = ParquetOutFormat(),
    pre_pipe: DataProcessingPipe = IdentityPipe(),
    post_pipe: DataProcessingPipe = IdentityPipe(),
)
Source code in mecfs_bio/build_system/task/specificity_cepo_task.py
@classmethod
def create(
    cls,
    asset_id: str,
    long_count_df_task: Task,
    cell_type_col: str,
    count_col: str,
    gene_col: str,
    cell_col: str,
    min_cells_per_type: int,
    epsilon: float = 0.0001,
    out_format: OutFormat = ParquetOutFormat(),
    pre_pipe: DataProcessingPipe = IdentityPipe(),
    post_pipe: DataProcessingPipe = IdentityPipe(),
):
    """
    Build a task whose metadata is derived from the upstream count-table task.

    The new task's meta reuses the group, sub_group and sub_folder of
    ``long_count_df_task.meta``, takes ``asset_id`` as its id, and gets its
    extension and read spec from ``out_format``.

    Raises:
        ValueError: if the upstream task's meta is not a ``ReferenceFileMeta``.
    """
    extension, read_spec = get_extension_and_read_spec_from_format(
        out_format=out_format
    )
    upstream_meta = long_count_df_task.meta
    # Only reference-file metadata can be used as a template for the output.
    if not isinstance(upstream_meta, ReferenceFileMeta):
        raise ValueError(f"Unknown meta: {upstream_meta}")
    derived_meta = ReferenceFileMeta(
        group=upstream_meta.group,
        sub_group=upstream_meta.sub_group,
        sub_folder=upstream_meta.sub_folder,
        id=AssetId(asset_id),
        filename=None,
        extension=extension,
        read_spec=read_spec,
    )
    return cls(
        meta=derived_meta,
        long_count_df_task=long_count_df_task,
        cell_type_col=cell_type_col,
        cell_col=cell_col,
        gene_col=gene_col,
        count_col=count_col,
        min_cells_per_type=min_cells_per_type,
        epsilon=epsilon,
        out_format=out_format,
        pre_pipe=pre_pipe,
        post_pipe=post_pipe,
    )

execute

execute(scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset
Source code in mecfs_bio/build_system/task/specificity_cepo_task.py
def execute(self, scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset:
    """
    Compute the differential-stability table and write it to ``scratch_dir``.

    Steps, in order:
      1. Fetch and lazily scan the upstream long-format count table, then
         apply ``pre_pipe``.
      2. Apply ``filter_by_cell_count`` (with ``min_cells_per_type``) and
         ``filter_missing_genes``.
      3. Count the distinct genes remaining and run ``_check_not_sparse``.
      4. Run the CEPO helper chain: ``_compute_inv_coef_prop_zero`` ->
         ``_compute_ranks`` -> ``_compute_stability`` ->
         ``_compute_differential_stability``.
      5. Keep only the cell-type, gene and DIFFERENTIAL_STABILITY columns,
         apply ``post_pipe``, and write the result as CSV or Parquet.

    Returns:
        A ``FileAsset`` pointing at ``scratch_dir / "out"``.

    Raises:
        ValueError: if ``out_format`` is neither ``CSVOutFormat`` nor
            ``ParquetOutFormat``.
    """
    # Fail fast: previously an unsupported format fell through both branches
    # below, so no file was written yet a FileAsset for it was still returned.
    if not isinstance(self.out_format, (CSVOutFormat, ParquetOutFormat)):
        raise ValueError(f"Unsupported out_format: {self.out_format}")

    long_count_df_asset = fetch(self.long_count_df_task.asset_id)

    df = scan_dataframe_asset(
        long_count_df_asset, meta=self.long_count_df_task.meta
    )
    df = self.pre_pipe.process(df)

    df = filter_by_cell_count(
        df,
        cell_type_col=self.cell_type_col,
        cell_col=self.cell_col,
        min_cells=self.min_cells_per_type,
    )
    df = filter_missing_genes(df, gene_col=self.gene_col, count_col=self.count_col)
    # The gene count is taken after filtering and materialised here because
    # _compute_stability needs it as a plain number.
    num_genes = df.select(narwhals.col(self.gene_col).n_unique()).collect().item()
    _check_not_sparse(df=df, cell_col=self.cell_col, gene_col=self.gene_col)
    df = _compute_inv_coef_prop_zero(
        df=df,
        cell_type_col=self.cell_type_col,
        count_col=self.count_col,
        gene_col=self.gene_col,
        epsilon=self.epsilon,
    )
    df = _compute_ranks(df=df, cell_type_col=self.cell_type_col)
    df = _compute_stability(
        df=df,
        num_genes=num_genes,
    )
    df = _compute_differential_stability(
        df=df,
        cell_type_col=self.cell_type_col,
        gene_col=self.gene_col,
    )
    df = df.select([self.cell_type_col, self.gene_col, DIFFERENTIAL_STABILITY])
    df = self.post_pipe.process(df)
    out_path = scratch_dir / "out"
    if isinstance(self.out_format, CSVOutFormat):
        # CSV output goes through pandas so the configured separator is honoured.
        df.collect().to_pandas().to_csv(
            out_path, index=False, sep=self.out_format.sep
        )
    else:
        # Only ParquetOutFormat can reach here, given the check at the top.
        df.sink_parquet(out_path)
    return FileAsset(out_path)