Skip to content

mecfs_bio.build_system.task.specificity_frac_task

Classes:

Functions:

Attributes:

NORMALIZED_MEAN module-attribute

NORMALIZED_MEAN = 'normalized_mean'

PrepareSpecificityFraction

Bases: Task

Task to compute the specificity of genes for cell types using the fractional specificity metric.

In this metric, the specificity of a gene for a cell type is (mean expression of the gene in that cell type) / (sum, over all cell types, of the gene's mean expression in each cell type).

Methods:

Attributes:

cell_col instance-attribute

cell_col: str

cell_type_col instance-attribute

cell_type_col: str

count_col instance-attribute

count_col: str

deps property

deps: list[Task]

gene_col instance-attribute

gene_col: str

long_count_df_task instance-attribute

long_count_df_task: Task

meta property

meta: Meta

min_cells_per_type class-attribute instance-attribute

min_cells_per_type: int = 0

out_format class-attribute instance-attribute

out_format: OutFormat = ParquetOutFormat()

post_pipe class-attribute instance-attribute

post_pipe: DataProcessingPipe = IdentityPipe()

pre_pipe class-attribute instance-attribute

pre_pipe: DataProcessingPipe = IdentityPipe()

create classmethod

create(
    asset_id: str,
    long_count_df_task: Task,
    cell_type_col: str,
    count_col: str,
    gene_col: str,
    cell_col: str,
    min_cells_per_type: int,
    out_format: OutFormat = ParquetOutFormat(),
    pre_pipe: DataProcessingPipe = IdentityPipe(),
    post_pipe: DataProcessingPipe = IdentityPipe(),
)
Source code in mecfs_bio/build_system/task/specificity_frac_task.py
@classmethod
def create(
    cls,
    asset_id: str,
    long_count_df_task: Task,
    cell_type_col: str,
    count_col: str,
    gene_col: str,
    cell_col: str,
    min_cells_per_type: int,
    out_format: OutFormat = ParquetOutFormat(),
    pre_pipe: DataProcessingPipe = IdentityPipe(),
    post_pipe: DataProcessingPipe = IdentityPipe(),
):
    """
    Alternate constructor that derives the task's metadata from its upstream task.

    The new metadata copies ``group``, ``sub_group`` and ``sub_folder`` from the
    metadata of ``long_count_df_task``, uses ``asset_id`` as its id, and takes
    its ``extension`` and ``read_spec`` from ``out_format``.

    Raises:
        ValueError: if the upstream task's metadata is not a ``ReferenceFileMeta``.
    """
    # Resolve the output format first, before inspecting the upstream metadata.
    file_extension, reader_spec = get_extension_and_read_spec_from_format(
        out_format=out_format
    )
    upstream_meta = long_count_df_task.meta

    # Guard clause: only reference-file metadata can be mirrored.
    if not isinstance(upstream_meta, ReferenceFileMeta):
        raise ValueError(f"Unknown meta: {upstream_meta}")

    derived_meta = ReferenceFileMeta(
        group=upstream_meta.group,
        sub_group=upstream_meta.sub_group,
        sub_folder=upstream_meta.sub_folder,
        id=AssetId(asset_id),
        filename=None,
        extension=file_extension,
        read_spec=reader_spec,
    )
    return cls(
        meta=derived_meta,
        long_count_df_task=long_count_df_task,
        cell_type_col=cell_type_col,
        cell_col=cell_col,
        gene_col=gene_col,
        count_col=count_col,
        min_cells_per_type=min_cells_per_type,
        out_format=out_format,
        pre_pipe=pre_pipe,
        post_pipe=post_pipe,
    )

execute

execute(scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset
Source code in mecfs_bio/build_system/task/specificity_frac_task.py
def execute(self, scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset:
    """
    Compute the normalized-mean table and write it to ``scratch_dir / "out"``.

    Steps, in order:
      1. Fetch the upstream long-format count asset and scan it lazily.
      2. Apply ``pre_pipe``.
      3. Apply ``filter_by_cell_count`` with ``min_cells_per_type``.
      4. Apply ``filter_missing_genes``.
      5. Run ``_compute_cell_type_means`` followed by ``_compute_normalized_mean``.
      6. Keep only the cell-type, gene and ``NORMALIZED_MEAN`` columns.
      7. Apply ``post_pipe`` and write the result in ``out_format``.

    Args:
        scratch_dir: directory in which the output file is created.
        fetch: callable used to obtain the upstream asset by its asset id.
        wf: not used by this method.

    Returns:
        A ``FileAsset`` pointing at the written output file.

    Raises:
        ValueError: if ``out_format`` is neither a ``CSVOutFormat`` nor a
            ``ParquetOutFormat``.
    """
    long_count_df_asset = fetch(self.long_count_df_task.asset_id)

    df = scan_dataframe_asset(
        long_count_df_asset, meta=self.long_count_df_task.meta
    )
    df = self.pre_pipe.process(df)
    df = filter_by_cell_count(
        df,
        cell_type_col=self.cell_type_col,
        cell_col=self.cell_col,
        min_cells=self.min_cells_per_type,
    )
    df = filter_missing_genes(df, gene_col=self.gene_col, count_col=self.count_col)
    df = _compute_cell_type_means(
        df,
        cell_type_col=self.cell_type_col,
        count_col=self.count_col,
        gene_col=self.gene_col,
    )
    df = _compute_normalized_mean(
        df=df,
        gene_col=self.gene_col,
    )
    # Drop every helper column; only the final three are part of the output.
    df = df.select([self.cell_type_col, self.gene_col, NORMALIZED_MEAN])
    df = self.post_pipe.process(df)
    out_path = scratch_dir / "out"
    if isinstance(self.out_format, CSVOutFormat):
        # The CSV path materializes the lazy frame before writing.
        df.collect().to_pandas().to_csv(
            out_path, index=False, sep=self.out_format.sep
        )
    elif isinstance(self.out_format, ParquetOutFormat):
        df.sink_parquet(out_path)
    else:
        # Without this branch an unhandled format would write nothing and
        # still return an asset pointing at a file that does not exist.
        raise ValueError(f"Unknown out_format: {self.out_format}")
    return FileAsset(out_path)

filter_by_cell_count

filter_by_cell_count(
    df: LazyFrame,
    cell_type_col: str,
    cell_col: str,
    min_cells: int,
) -> narwhals.LazyFrame
Source code in mecfs_bio/build_system/task/specificity_frac_task.py
def filter_by_cell_count(
    df: narwhals.LazyFrame, cell_type_col: str, cell_col: str, min_cells: int
) -> narwhals.LazyFrame:
    """
    Keep only rows whose cell type has at least ``min_cells`` distinct cells.

    The number of unique ``cell_col`` values is counted per ``cell_type_col``
    group, groups below ``min_cells`` are discarded, and the surviving groups
    are joined back onto ``df`` on ``cell_type_col``.

    NOTE: the per-type count is stored in a helper column named
    ``__cell_count`` on the frame that is joined back onto ``df``.
    """
    distinct_cells = narwhals.col(cell_col).n_unique().alias("__cell_count")
    cells_per_type = df.group_by(cell_type_col).agg(distinct_cells)

    has_enough_cells = narwhals.col("__cell_count") >= min_cells
    retained_types = cells_per_type.filter(has_enough_cells)

    # Joining on the retained types removes rows of under-populated cell types.
    return df.join(retained_types, on=cell_type_col)

filter_missing_genes

filter_missing_genes(
    df: LazyFrame, gene_col: str, count_col: str
) -> narwhals.LazyFrame
Source code in mecfs_bio/build_system/task/specificity_frac_task.py
def filter_missing_genes(
    df: narwhals.LazyFrame,
    gene_col: str,
    count_col: str,
) -> narwhals.LazyFrame:
    """
    Keep only rows for genes with at least one strictly positive count.

    For each ``gene_col`` group, the rows with ``count_col > 0`` are tallied;
    genes whose tally is zero are discarded, and the remaining genes are
    joined back onto ``df`` on ``gene_col``.

    NOTE: the per-gene tally is stored in a helper column named
    ``nonzero_count`` on the frame that is joined back onto ``df``.
    """
    is_positive = narwhals.col(count_col) > 0
    positives_per_gene = df.group_by(gene_col).agg(
        is_positive.sum().alias("nonzero_count")
    )

    was_observed = narwhals.col("nonzero_count") > 0
    observed_genes = positives_per_gene.filter(was_observed)

    # Joining on the observed genes removes rows of never-expressed genes.
    return df.join(observed_genes, on=gene_col)