Skip to content

mecfs_bio.build_system.task.gwaslab.gwaslab_create_sumstats_task

Task to read in a dataframe from disk and process it using the GWASLAB pipeline.

Classes:

Functions:

Attributes:

GenomeBuild module-attribute

GenomeBuild = Literal['19', '38']

GenomeBuildMode module-attribute

GenomeBuildMode = Literal['infer', '19', '38']

ValidGwaslabFormat module-attribute

ValidGwaslabFormat = (
    GwaslabKnownFormat | GWASLabColumnSpecifiers
)

GWASLabColumnSpecifiers

Methods:

Attributes:

OR class-attribute instance-attribute

OR: str | None = None

beta class-attribute instance-attribute

beta: str | None = None

chi_sq class-attribute instance-attribute

chi_sq: str | None = None

chrom class-attribute instance-attribute

chrom: str | None = None

ea class-attribute instance-attribute

ea: str | None = None

eaf class-attribute instance-attribute

eaf: str | None = None

info class-attribute instance-attribute

info: str | None = None

mlog10p class-attribute instance-attribute

mlog10p: str | None = None

n class-attribute instance-attribute

n: str | None = None

ncase class-attribute instance-attribute

ncase: str | None = None

ncontrol class-attribute instance-attribute

ncontrol: str | None = None

nea class-attribute instance-attribute

nea: str | None = None

neaf class-attribute instance-attribute

neaf: str | None = None

neff class-attribute instance-attribute

neff: str | None = None

or_95l class-attribute instance-attribute

or_95l: str | None = None

or_95u class-attribute instance-attribute

or_95u: str | None = None

p class-attribute instance-attribute

p: str | None = None

pos class-attribute instance-attribute

pos: str | None = None

rsid class-attribute instance-attribute

rsid: str | None = None

se class-attribute instance-attribute

se: str | None = None

snpid class-attribute instance-attribute

snpid: str | None = None

get_selection_pipe

get_selection_pipe() -> SelectColPipe
Source code in mecfs_bio/build_system/task/gwaslab/gwaslab_create_sumstats_task.py
def get_selection_pipe(self) -> SelectColPipe:
    fields = attrs.asdict(self)
    cols = []
    for v in fields.values():
        if v is not None:
            cols.append(v)
    return SelectColPipe(cols)

GWASLabCreateSumstatsTask

Bases: Task

Task that processes a DataFrame of GWAS summary statistics using the GWASLab pipeline. see: https://cloufield.github.io/gwaslab/SumstatsObject/

Methods:

Attributes:

basic_check instance-attribute

basic_check: bool

deps property

deps: list[Task]

drop_col_list class-attribute instance-attribute

drop_col_list: Sequence[str] = tuple()

exclude_hla class-attribute instance-attribute

exclude_hla: bool = False

exclude_sexchr class-attribute instance-attribute

exclude_sexchr: bool = False

filter_hapmap3 class-attribute instance-attribute

filter_hapmap3: bool = False

filter_indels class-attribute instance-attribute

filter_indels: bool = False

filter_palindromic class-attribute instance-attribute

filter_palindromic: bool = False

fmt class-attribute instance-attribute

fmt: GwaslabKnownFormat | GWASLabColumnSpecifiers = (
    "regenie"
)

genome_build instance-attribute

genome_build: GenomeBuildMode

harmonize_options class-attribute instance-attribute

harmonize_options: HarmonizationOptions | None = None

liftover_to class-attribute instance-attribute

liftover_to: GenomeBuild | None = None

meta property

meta: Meta

pre_pipe class-attribute instance-attribute

pre_pipe: DataProcessingPipe = IdentityPipe()

execute

execute(
    scratch_dir: Path, fetch: Fetch, wf: WF
) -> FileAsset
Source code in mecfs_bio/build_system/task/gwaslab/gwaslab_create_sumstats_task.py
def execute(self, scratch_dir: Path, fetch: Fetch, wf: WF) -> FileAsset:
    df = scan_dataframe_asset(asset=fetch(self._source_id), meta=self._source_meta)
    logger.debug("Applying pre-pipe")
    df = self.pre_pipe.process(df)
    logger.debug("Fetching source dataframe asset...")
    sumstats = _get_sumstats(df, self.fmt, drop_cols=self.drop_col_list)
    transform_spec = GwasLabTransformSpec(
        basic_check=self.basic_check,
        genome_build=self.genome_build,
        filter_hapmap3=self.filter_hapmap3,
        filter_indels=self.filter_indels,
        filter_palindromic=self.filter_palindromic,
        exclude_hla=self.exclude_hla,
        exclude_sexchr=self.exclude_sexchr,
        harmonize_options=self.harmonize_options,
        liftover_to=self.liftover_to,
    )
    sumstats = transform_gwaslab_sumstats(sumstats, spec=transform_spec)
    out_path = scratch_dir / "pickled_sumstats.pickle"
    gl.dump_pickle(sumstats, path=out_path)
    return FileAsset(out_path)

GWASLabVCFRef

Attributes:

extra_downloads class-attribute instance-attribute

extra_downloads: Sequence[str] = tuple()

name instance-attribute

name: str

ref_alt_freq instance-attribute

ref_alt_freq: str

GwasLabTransformSpec

Attributes:

basic_check class-attribute instance-attribute

basic_check: bool = True

exclude_hla class-attribute instance-attribute

exclude_hla: bool = False

exclude_sexchr class-attribute instance-attribute

exclude_sexchr: bool = False

filter_hapmap3 class-attribute instance-attribute

filter_hapmap3: bool = False

filter_indels class-attribute instance-attribute

filter_indels: bool = False

filter_palindromic class-attribute instance-attribute

filter_palindromic: bool = False

genome_build class-attribute instance-attribute

genome_build: GenomeBuildMode = 'infer'

harmonize_options class-attribute instance-attribute

harmonize_options: HarmonizationOptions | None = None

liftover_to class-attribute instance-attribute

liftover_to: GenomeBuild | None = None

HarmonizationOptions

Options for the call to GWASLab's harmonize function.

gwaslab's harmonization function changes the status codes in the STATUS column. These status codes are described here: https://cloufield.github.io/gwaslab/StatusCode/

Below I explain some points that were initially not clear to me from the gwaslab documentation.

Two reference files are used in harmonization:

  • A VCF file (ref_infer). This is basically a table of genetic variants.

In some cases, this table is in dbSNP VCF format. In this case, each row describes a given genetic variant. Sometimes this description includes allele frequency.

In other cases, (such as when using a thousand genomes reference data) this table is in genotype VCF format. In this case, the rows of the VCF file correspond to variants, and the columns correspond to individuals (from the thousand genomes project, for example). For each individual and each variant, the table tells us whether that individual has that variant. Variant frequency information can be calculated from this individual-level genome data.

  • A FASTA file (ref_seq). This is a consensus human genome sequence. Here is an example of some rows from the hg19 FASTA file:

    TAAGTTTTGTCTGGTAATAAAGGTATATTTTCAAAAGAGAGGTAAATAGA TCCACATACTGTGGAGGGAATAAAATACTTTTTGAAAAACAAACAACAAG TTGGATTTTTAGACACATAGAAATTGAATATGTACATTTATAAATATTTT TGGATTGAACTATTTCAAAATTATACCATAAAATAACTTGTAAAAATGTA GGCAAAATGTATATAATTATGGCATGAGGTATGCAACTTTAGGCAAGGAA GCAAAAGCAGAAACCATGAAAAAAGTCTAAATTTTACCATATTGAATTTA AATTTTCAAAAACAAAAATAAAGACAAAGTGGGAAAAATATGTATGCTTC ATGTGTGACAAGCCACTGATACCTATTAAATATGAAGAATATTATAAATC ATATCAATAACCACAACATTCAAGCTGTCAGTTTGAATAGACaatgtaaa tgacaaaactacatactcaacaagataacagcaaaccagcttcgacagca cgttaaaggggtcatacaacataatcgagtagaatttatctctgagatgc aagaatggttcaaaatatggaaaccaataaatgtgatatgccacactaac agaataaaaaataaaaatcatattatcatctcaatagatgcagaaaaagc attaacaaaagtaaacattctttcataataagacatcagataaaacaaat taggaatagaaggaatgtaccgcaacacaataaaggccatatataacaag cccacagctaacatcataatagtaaaatcatcacactggtaaaaaaaatg

gwaslab uses these two reference files to harmonize summary statistics. These two reference files each affect a different digit of the gwaslab STATUS code column.

  • Digit 7 of the status code is determined by the ability of gwaslab to find the variant in the reference VCF (ref_infer) : see here: https://github.com/Cloufield/gwaslab/blob/d639b67c5264b1ac7ec89e284e638f2c8454ac48/src/gwaslab/hm/hm_harmonize_sumstats.py#L1521-L1530 Values of 7 or 8 here mean that the variant is palindromic, and the database could not be used to disambiguate the strand of the variant, or the variant was not found in the database
  • Digit 6 of the status code is instead determined by the ability of the gwaslab to find the variant in the reference genome build FASTA file (ref_seq) see here: https://github.com/Cloufield/gwaslab/blob/d639b67c5264b1ac7ec89e284e638f2c8454ac48/src/gwaslab/hm/hm_harmonize_sumstats.py#L968-L975 a value of 8 means a failure to find the variant in the FASTA file.

Set drop_missing_from_ref_seq to drop based on digit 6. Set drop_missing_from_ref_infer to drop based on digit 7.

Attributes:

check_ref_files instance-attribute

check_ref_files: bool

cores instance-attribute

cores: int

drop_missing_from_ref_infer_or_ambiguous class-attribute instance-attribute

drop_missing_from_ref_infer_or_ambiguous: bool = True

drop_missing_from_ref_seq instance-attribute

drop_missing_from_ref_seq: bool

ref_infer instance-attribute

ref_infer: GWASLabVCFRef

ref_seq instance-attribute

ref_seq: str

transform_gwaslab_sumstats

transform_gwaslab_sumstats(
    sumstats: Sumstats, spec: GwasLabTransformSpec
) -> gl.Sumstats
Source code in mecfs_bio/build_system/task/gwaslab/gwaslab_create_sumstats_task.py
def transform_gwaslab_sumstats(
    sumstats: gl.Sumstats,
    spec: GwasLabTransformSpec,
) -> gl.Sumstats:
    logger.debug("Running gwas summary statistics through gwaslab pipelines...")
    logger.debug(f"Initial sumstats has shape {sumstats.data.shape}")
    if spec.basic_check:
        sumstats.basic_check()
    if spec.genome_build == "infer":
        sumstats.fix_chr()
        sumstats.infer_build()
        build = sumstats.meta["gwaslab"]["genome_build"]
        forced_build = None
        print(f"Build is {build}")
    else:
        build = spec.genome_build
        forced_build = build

    if spec.liftover_to is not None and (build != spec.liftover_to):
        sumstats.liftover(to_build=spec.liftover_to, from_build=forced_build)
    if spec.filter_hapmap3:
        sumstats.filter_hapmap3(inplace=True, build=forced_build)
    if spec.filter_indels:
        sumstats.filter_indel(inplace=True, mode="out")
    if spec.filter_palindromic:
        sumstats.filter_palindromic(inplace=True, mode="out")
    if spec.exclude_hla:
        sumstats.exclude_hla(inplace=True)
    if spec.exclude_sexchr:
        sumstats = _exclude_sexchr(sumstats)
    if spec.harmonize_options is not None:
        _do_harmonization(
            sumstats,
            basic_check=(not spec.basic_check),
            options=spec.harmonize_options,
        )

    _sumstats_raise_on_error(sumstats)
    logger.debug(f"Finished gwaslab pipe.  Data has shape {sumstats.data.shape}")
    return sumstats