Skip to content

mecfs_bio.build_system.task.assign_rsids_via_snp151_task

Assign RSIDs to variants via joining a database file. Only works for single-nucleotide variations.

Classes:

Functions:

AssignRSIDSToSNPsViaSNP151Task

Bases: Task

Assigns RSIDs to the SNP genetic variants in a file of GWAS summary statistics. Uses the SNP151 database file. Assumes the GWASLAB naming conventions are used in the summary statistics file. Assumes that both input files are in Parquet format.

Note that non-SNP variations (e.g. insertions or deletions) are excluded. This operates exclusively on SNPs.

Methods:

Attributes:

chrom_replace_rules instance-attribute

chrom_replace_rules: Mapping[str, int]

database_id property

database_id: AssetId

database_meta property

database_meta: Meta

deps property

deps: list[Task]

meta property

meta: Meta

raw_snp_data_task instance-attribute

raw_snp_data_task: Task

snp151_database_file_task instance-attribute

snp151_database_file_task: Task

snp_data_id property

snp_data_id: AssetId

snp_data_meta property

snp_data_meta: Meta

valid_chroms instance-attribute

valid_chroms: list[str]

create classmethod

create(
    snp151_database_file_task: Task,
    raw_snp_data_task: Task,
    asset_id: str,
    valid_chroms: list[str],
    chrom_replace_rules: Mapping[str, int],
)
Source code in mecfs_bio/build_system/task/assign_rsids_via_snp151_task.py
@classmethod
def create(
    cls,
    snp151_database_file_task: Task,
    raw_snp_data_task: Task,
    asset_id: str,
    valid_chroms: list[str],
    chrom_replace_rules: Mapping[str, int],
):
    """
    Build a task instance whose metadata is derived from the summary-statistics task.

    The new task's meta is produced by ``create_new_meta`` from
    ``raw_snp_data_task.meta`` and ``asset_id``; every other argument is
    passed through to the constructor unchanged.

    Args:
        snp151_database_file_task: Task stored as ``snp151_database_file_task``.
        raw_snp_data_task: Task stored as ``raw_snp_data_task``; its ``meta``
            is the template for the new task's meta.
        asset_id: Identifier given to the new task's meta.
        valid_chroms: Stored as ``valid_chroms``.
        chrom_replace_rules: Stored as ``chrom_replace_rules``.

    Returns:
        A new instance of ``cls``.

    Raises:
        ValueError: Propagated from ``create_new_meta`` when the source meta
            is of an unsupported type.
    """
    return cls(
        meta=create_new_meta(raw_snp_data_task.meta, asset_id=asset_id),
        snp151_database_file_task=snp151_database_file_task,
        raw_snp_data_task=raw_snp_data_task,
        valid_chroms=valid_chroms,
        chrom_replace_rules=chrom_replace_rules,
    )

execute

execute(scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset
Source code in mecfs_bio/build_system/task/assign_rsids_via_snp151_task.py
def execute(self, scratch_dir: Path, fetch: Fetch, wf: WF) -> Asset:
    """
    Join the summary statistics against the database to add an RSID column.

    Steps:
      1. Build a small lookup table from ``chrom_replace_rules`` that maps
         the database's ``"chrom"`` values to ``GWASLAB_CHROM_COL`` values.
      2. Load the summary statistics and keep only rows whose effect and
         non-effect alleles are each exactly one character long.
      3. Load the database, keep rows with ``class == "single"`` whose
         ``chrom`` is in ``valid_chroms``, translate the chromosome via the
         lookup table, and derive the position and RSID columns.
      4. Join the two tables on position and chromosome and write the
         result to ``snps_with_rsids.parquet`` inside ``scratch_dir``.

    Both joins use narwhals' default join type, so rows without a match
    on either side do not appear in the output.

    Args:
        scratch_dir: Directory in which the output parquet file is written.
        fetch: Callable used to obtain the two input assets by id.
        wf: Not used by this method.

    Returns:
        A ``FileAsset`` pointing at the written parquet file.
    """
    # Lookup table: database "chrom" label -> GWASLAB chromosome value.
    chrom_lookup = nw.from_native(
        ibis.memtable(
            {
                "chrom": list(self.chrom_replace_rules),
                GWASLAB_CHROM_COL: list(self.chrom_replace_rules.values()),
            }
        )
    )

    # --- summary statistics side ---
    gwas_asset = fetch(self.snp_data_id)
    gwas_lf = scan_dataframe_asset(
        gwas_asset, meta=self.snp_data_meta, parquet_backend="ibis"
    )
    # A single-nucleotide change has one-character alleles on both sides.
    non_effect_is_one_char = (
        nw.col(GWASLAB_NON_EFFECT_ALLELE_COL).str.len_chars() == 1
    )
    effect_is_one_char = nw.col(GWASLAB_EFFECT_ALLELE_COL).str.len_chars() == 1
    gwas_lf = gwas_lf.filter(non_effect_is_one_char)
    gwas_lf = gwas_lf.filter(effect_is_one_char)

    # --- database side ---
    db_asset = fetch(self.database_id)
    db_lf = scan_dataframe_asset(
        db_asset, meta=self.database_meta, parquet_backend="ibis"
    )
    db_lf = db_lf.filter(nw.col("class") == "single")
    db_lf = db_lf.filter(nw.col("chrom").is_in(self.valid_chroms))
    # Attach the GWASLAB chromosome value for each database chromosome label.
    db_lf = db_lf.join(chrom_lookup, on="chrom")
    # The start column is named as zero-based; adding 1 yields the value
    # stored in the GWASLAB position column.
    position_expr = (nw.col("chromStart_zero_based") + 1).alias(GWASLAB_POS_COL)
    rsid_expr = (nw.col("name")).alias(GWASLAB_RSID_COL)
    db_lf = db_lf.with_columns(position_expr, rsid_expr)
    db_lf = db_lf.select(
        GWASLAB_CHROM_COL,
        GWASLAB_POS_COL,
        GWASLAB_RSID_COL,
    )

    # --- combine and write ---
    join_keys = [GWASLAB_POS_COL, GWASLAB_CHROM_COL]
    annotated: nw.LazyFrame = gwas_lf.join(db_lf, on=join_keys)
    destination = scratch_dir / "snps_with_rsids.parquet"
    annotated.sink_parquet(destination)
    return FileAsset(destination)

create_new_meta

create_new_meta(
    source_meta: Meta,
    asset_id: str,
    format: DataFrameFormat = DataFrameParquetFormat(),
    extension=".parquet",
) -> Meta
Source code in mecfs_bio/build_system/task/assign_rsids_via_snp151_task.py
def create_new_meta(
    source_meta: Meta,
    asset_id: str,
    format: DataFrameFormat = DataFrameParquetFormat(),
    extension=".parquet",
) -> Meta:
    """
    Create a meta of the same kind as ``source_meta`` under a new asset id.

    Args:
        source_meta: Meta whose concrete type decides the type of the result.
            For ``FilteredGWASDataMeta`` its ``project``, ``trait`` and
            ``sub_dir`` are copied to the new meta.
        asset_id: Identifier for the new meta (wrapped in ``AssetId``).
        format: Data frame format placed in the new meta's read spec.
        extension: File extension for the new meta. Only applied when
            ``source_meta`` is a ``FilteredGWASDataMeta``.

    Returns:
        A ``SimpleFileMeta`` or ``FilteredGWASDataMeta`` matching the type of
        ``source_meta``.

    Raises:
        ValueError: If ``source_meta`` is neither of the supported types.
    """
    if isinstance(source_meta, SimpleFileMeta):
        return SimpleFileMeta(
            id=AssetId(asset_id),
            read_spec=DataFrameReadSpec(format=format),
        )

    if isinstance(source_meta, FilteredGWASDataMeta):
        return FilteredGWASDataMeta(
            id=AssetId(asset_id),
            project=source_meta.project,
            trait=source_meta.trait,
            sub_dir=source_meta.sub_dir,
            read_spec=DataFrameReadSpec(format=format),
            extension=extension,
        )

    # Any other meta type is unsupported.
    raise ValueError("unknown source meta")