Skip to content

mecfs_bio.build_system.task.gwaslab.gwaslab_region_plots_task

Classes:

Functions:

Attributes:

logger module-attribute

logger = get_logger()

GwasLabRegionPlotsFromLeadVariantsTask

Bases: Task

A task to generate region plots near the lead variants described by GWAS summary statistics. Useful for visualizing the local significance structure around lead variants and their nearby genes. See https://cloufield.github.io/gwaslab/tutorial_3.4/#quick-regional-plot-without-ld-information for details. GWASLab can also use a VCF reference file to plot the linkage disequilibrium structure around the lead variants (vcf_name_for_ld). Doing this at a reasonable speed requires the installation of the "tabix" binary.

Methods:

Attributes:

deps property

deps: list[Task]

meta property

meta: GWASLabRegionPlotsMeta

plot_top class-attribute instance-attribute

plot_top: int | None = None

short_id class-attribute instance-attribute

short_id: AssetId = field(converter=AssetId)

vcf_name_for_ld instance-attribute

vcf_name_for_ld: GWASLabVCFRefFile | None

execute

execute(
    scratch_dir: Path, fetch: Fetch, wf: WF
) -> DirectoryAsset
Source code in mecfs_bio/build_system/task/gwaslab/gwaslab_region_plots_task.py
def execute(self, scratch_dir: Path, fetch: Fetch, wf: WF) -> DirectoryAsset:
    """
    Draw region plots for the lead variants and return the plot directory.

    Plots are written to ``scratch_dir / "plot_dir"``.  If ``self.plot_top``
    is set, the lead-variant table is first reduced via ``get_top_var_df``.
    The ``wf`` argument is not used by this task.
    """
    plot_dir = scratch_dir / "plot_dir"
    plot_dir.mkdir(parents=True, exist_ok=True)

    # The summary statistics are fetched first and shared by every plot.
    sumstats = read_sumstats(fetch(self._sumstats_id))

    # Collect the lead-variant table and convert it to a pandas dataframe.
    lead_asset = fetch(self._lead_variants_id)
    lead_scan = scan_dataframe_asset(
        asset=lead_asset, meta=self._lead_variants_task_meta
    )
    lead_df = lead_scan.collect().to_pandas()

    selected_df = get_top_var_df(lead_df, plot_top=self.plot_top)
    plot_region_around_variants(
        sumstats=sumstats,
        variants=df_to_variants(selected_df),
        output_dir=plot_dir,
        vcf_name_for_ld=self.vcf_name_for_ld,
    )
    return DirectoryAsset(path=plot_dir)

get_top_var_df

get_top_var_df(
    df: DataFrame, plot_top: int | None
) -> pd.DataFrame
Source code in mecfs_bio/build_system/task/gwaslab/gwaslab_region_plots_task.py
def get_top_var_df(df: pd.DataFrame, plot_top: int | None) -> pd.DataFrame:
    if plot_top is None:
        return df
    if "P" in df.columns:
        return df.sort_values(by="P").iloc[:plot_top]
    if "MLOG10P" in df.columns:
        return df.sort_values(by="MLOG10P", ascending=False).iloc[:plot_top]
    raise ValueError(
        f"task attempted to plot top variants, but neither P nor MLOG10P was in the variant dataframe. columns were: {df.columns}"
    )

plot_region_around_variant

plot_region_around_variant(
    sumstats: Sumstats,
    chrom: int,
    pos: int,
    buffer: int,
    output_path: Path,
    vcf_name_for_ld: GWASLabVCFRefFile | None,
) -> None
Source code in mecfs_bio/build_system/task/gwaslab/gwaslab_region_plots_task.py
def plot_region_around_variant(
    sumstats: gl.Sumstats,
    chrom: int,
    pos: int,
    buffer: int,
    output_path: Path,
    vcf_name_for_ld: GWASLabVCFRefFile | None,
) -> None:
    """
    Save a regional plot of ``sumstats`` centred on one genomic position.

    The plotted window spans ``buffer`` positions on either side of ``pos`` on
    chromosome ``chrom``, with the lower bound clamped at 0.  When
    ``vcf_name_for_ld`` is given, the reference is downloaded if missing and
    its path is handed to the plotting call; otherwise no VCF path is passed.
    The parent directory of ``output_path`` is created if needed.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Default to no VCF reference; resolve a path only when one was requested.
    vcf_path = None
    if vcf_name_for_ld is not None:
        gwaslab_download_ref_if_missing(vcf_name_for_ld)
        vcf_path = gl.get_path(vcf_name_for_ld)

    # Clamp the left edge so the window never starts below position 0.
    window_start = max(pos - buffer, 0)
    window_end = pos + buffer

    # Pass scaled=True when the sumstats already carry an MLOG10P column.
    has_mlog10p = "MLOG10P" in sumstats.data.columns

    sumstats.plot_mqq(
        mode="r",
        skip=2,
        cut=20,
        scaled=has_mlog10p,
        region_grid=True,
        region=(chrom, window_start, window_end),
        save=str(output_path),
        save_args={"dpi": 400, "facecolor": "white"},
        vcf_path=vcf_path,
    )

plot_region_around_variants

plot_region_around_variants(
    sumstats: Sumstats,
    variants: Sequence[Variant],
    output_dir: Path,
    vcf_name_for_ld: GWASLabVCFRefFile | None,
    buffer: int = 500000,
) -> None
Source code in mecfs_bio/build_system/task/gwaslab/gwaslab_region_plots_task.py
def plot_region_around_variants(
    sumstats: gl.Sumstats,
    variants: Sequence[Variant],
    output_dir: Path,
    vcf_name_for_ld: GWASLabVCFRefFile | None,
    buffer: int = 500_000,
) -> None:
    """
    Save one regional plot per variant into ``output_dir``.

    Each plot is named after the variant's ``id_normalized`` with a ``.png``
    suffix and covers ``buffer`` positions on either side of the variant's
    position.  ``vcf_name_for_ld`` is forwarded unchanged to
    ``plot_region_around_variant``.
    """
    for variant in variants:
        logger.debug(f"Creating region plot around variant {variant.id}")
        png_name = str(variant.id_normalized + ".png")
        destination = output_dir / png_name
        plot_region_around_variant(
            sumstats=sumstats,
            chrom=variant.chromosome,
            pos=variant.position,
            buffer=buffer,
            output_path=destination,
            vcf_name_for_ld=vcf_name_for_ld,
        )