# Source code for bioframe.io.resources

import urllib
from functools import partial
from typing import Union
from urllib.parse import urljoin

import numpy as np
import pandas as pd

from .assembly import assembly_info
from .fileops import read_chromsizes, read_table
from .schemas import SCHEMAS

# Public API of this module: the names exported by a star-import.
# The underscore-prefixed `_origins_from_*` helpers are intentionally absent.
__all__ = [
    "fetch_chromsizes",
    "fetch_centromeres",
    "UCSCClient",
]



# [docs]
def fetch_chromsizes(
    db: str,
    *,
    provider: str = "local",
    as_bed: bool = False,
    filter_chroms: bool = True,
    chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
    natsort: bool = True,
    **kwargs,
) -> Union[pd.Series, pd.DataFrame]:
    """
    Return chromosome sizes for an assembly from local storage or UCSC.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        Where to get the chromsizes from: "local" (bundled assembly
        metadata, via :func:`bioframe.assembly_info`) or "ucsc" (remote
        download, via :class:`bioframe.UCSCClient`).
    as_bed : bool, optional
        If True, return an interval DataFrame with columns
        (chrom, start, end) rather than a Series.

    The options below are ignored when provider="local"; they are only
    forwarded to :meth:`UCSCClient.fetch_chromsizes` when provider="ucsc".

    filter_chroms : bool, optional
        Keep only sequence names matching ``chrom_patterns``.
    chrom_patterns : sequence, optional
        Regular expressions selecting the desired sequence names.
    natsort : bool, optional
        Sort each captured group of names in natural order. Default is True.
    **kwargs :
        Extra keyword arguments forwarded with the UCSC request
        (documented upstream as being passed to :func:`pandas.read_csv`).

    Returns
    -------
    Series of integer bp lengths indexed by sequence name or BED3 DataFrame.

    Raises
    ------
    ValueError
        If ``provider`` is neither "local" nor "ucsc".

    Notes
    -----
    For more fine-grained control over the chromsizes from local storage,
    use :func:`bioframe.assembly_info`.

    Examples
    --------
    >>> fetch_chromsizes("hg38")
    name
    chr1     248956422
    chr2     242193529
    chr3     198295559
    ...      ...
    chrX     156040895
    chrY      57227415
    chrM         16569
    Name: length, dtype: int64

    >>> fetch_chromsizes("hg38", as_bed=True)
            chrom      start        end
    0        chr1          0  248956422
    1        chr2          0  242193529
    2        chr3          0  198295559
    ...      ...
    21       chrX          0  156040895
    22       chrY          0   57227415
    23       chrM          0      16569

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    if provider == "local":
        info = assembly_info(db)
        if not as_bed:
            return info.chromsizes
        # Copy so the caller cannot mutate the assembly's own viewframe.
        bed_columns = ["chrom", "start", "end"]
        return info.viewframe[bed_columns].copy()

    if provider == "ucsc":
        ucsc_options = {
            "filter_chroms": filter_chroms,
            "chrom_patterns": chrom_patterns,
            "natsort": natsort,
            "as_bed": as_bed,
        }
        client = UCSCClient(db)
        return client.fetch_chromsizes(**ucsc_options, **kwargs)

    raise ValueError(f"Unknown provider '{provider}'")



def _origins_from_cytoband(
    cyb: pd.DataFrame, band_col: str = "gieStain"
) -> pd.DataFrame:
    """
    Extract chromosomal origin positions separating chromosome arms from
    cytological band data. Takes the cytological origin, i.e. the boundary
    between the two bands labeled 'acen'.

    Parameters
    ----------
    cyb : pandas.DataFrame
        DataFrame with cytoband data. Must contain the columns 'chrom',
        'start', 'end' and the column named by ``band_col``.
    band_col : str, optional [default: "gieStain"]
        Name of the column holding the band label; rows whose value equals
        'acen' are used.

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid', one row per
        chromosome having 'acen' bands, in order of first appearance.
        'start' is the start of the first 'acen' band, 'end' the end of the
        second, and 'mid' the end of the first (the boundary between them).
        If no 'acen' bands are present the frame is empty but still carries
        these four columns.

    Raises
    ------
    ValueError
        If a chromosome has a number of 'acen' bands other than 2.
    """
    columns = ["chrom", "start", "end", "mid"]
    acen_bands = cyb[cyb[band_col] == "acen"]

    records = []
    # sort=False keeps chromosomes in the order they appear in the input.
    for chrom, group in acen_bands.groupby("chrom", sort=False):
        if len(group) != 2:
            raise ValueError(f"Expected 2 'acen' bands for {chrom}, found {len(group)}")
        acens = group.sort_values("start")
        left, right = acens.iloc[0], acens.iloc[1]
        records.append(
            {
                "chrom": chrom,
                "start": left["start"],
                "end": right["end"],
                "mid": left["end"],
            }
        )

    if not records:
        # from_records([]) would produce a frame with no columns at all;
        # keep the documented schema so callers can index by column name.
        return pd.DataFrame(columns=columns)
    return pd.DataFrame.from_records(records)


def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame:
    """
    Extract chromosomal origin positions from UCSC centromeres.txt table
    describing centromere model sequences. Takes the midpoint of the span
    covered by all modeled centromere sequences of each chromosome.

    Parameters
    ----------
    cens : pandas.DataFrame
        DataFrame with centromeres.txt data. Must contain the columns
        'chrom', 'start' and 'end'.

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid', one row per
        chromosome, sorted by 'chrom'. 'start' is the smallest start and
        'end' the largest end among that chromosome's rows; 'mid' is their
        floor-divided midpoint.
    """
    # Use the string aggregator names rather than np.min / np.max: passing
    # the NumPy callables to groupby.agg emits a FutureWarning on recent
    # pandas, while the strings select the same groupby reductions.
    cens = cens.groupby("chrom").agg({"start": "min", "end": "max"}).reset_index()
    cens["mid"] = (cens["start"] + cens["end"]) // 2
    cens = (
        cens[["chrom", "start", "end", "mid"]]
        .sort_values("chrom")
        .reset_index(drop=True)
    )
    return cens



# [docs]
def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame:
    """
    Extract centromere locations for a given assembly 'db' from a variety
    of file formats in UCSC (cytoband, centromeres) depending on
    availability, returning a DataFrame.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of centromere data. Either "local" for local storage
        or "ucsc".

    Returns
    -------
    DataFrame with centromere 'chrom', 'start', 'end', 'mid'.

    Raises
    ------
    ValueError
        If ``provider`` is neither "local" nor "ucsc", or if the chosen
        provider has no usable source of centromere data for ``db``.

    Notes
    -----
    When provider="local", centromeres are derived from cytoband tables
    in local storage.

    When provider="ucsc", the fallback priority goes as follows:
    - UCSC cytoBand
    - UCSC cytoBandIdeo
    - UCSC centromeres.txt

    A source is skipped only when fetching it raises an HTTP error; any
    other exception propagates.

    Note that UCSC "gap" files no longer provide centromere information.

    Currently only works for human assemblies.

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    if provider == "local":
        assembly = assembly_info(db)
        cyb = assembly.cytobands
        if cyb is None:
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )
        # Local cytoband tables name the band-label column "stain".
        return _origins_from_cytoband(cyb, band_col="stain")

    elif provider == "ucsc":
        # A bare `import urllib` does not load the `urllib.error` submodule,
        # so import the exception explicitly instead of relying on some
        # other module having imported it first.
        from urllib.error import HTTPError

        client = UCSCClient(db)
        # (schema, fetcher) pairs in fallback priority order.
        fetchers = [
            ("cytoband", client.fetch_cytoband),
            ("cytoband", partial(client.fetch_cytoband, ideo=True)),
            ("centromeres", client.fetch_centromeres),
        ]

        for schema, fetcher in fetchers:
            try:
                df = fetcher()
                break
            except HTTPError:
                # This table is not available for the assembly; try the next.
                continue
        else:
            # Loop finished without `break`: every source failed.
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )

        if schema == "centromeres":
            return _origins_from_ucsccentromeres(df)
        else:
            return _origins_from_cytoband(df)

    else:
        raise ValueError(f"Unknown provider '{provider}'")



class UCSCClient:
    """
    Thin client for downloading per-assembly tables from the UCSC
    download server.

    Each ``fetch_*`` method builds a URL below
    ``<BASE_URL>goldenPath/<db>/`` and hands it to the module's
    ``read_chromsizes`` / ``read_table`` readers.

    Parameters
    ----------
    db : str
        Assembly name, used verbatim in the URL path (e.g. "hg38").
    """

    # Root of the UCSC download server; all URLs are joined onto this.
    BASE_URL = "https://hgdownload.soe.ucsc.edu/"

    def __init__(self, db: str):
        self._db = db
        # Trailing slash matters: urljoin replaces the last path segment
        # of a base URL that does not end in "/".
        self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/")

    def fetch_chromsizes(
        self,
        filter_chroms: bool = True,
        chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
        natsort: bool = True,
        as_bed: bool = False,
        **kwargs,
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        Read ``bigZips/<db>.chrom.sizes`` for this assembly.

        All arguments, including ``**kwargs``, are forwarded unchanged to
        ``read_chromsizes`` together with the URL.
        """
        url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes")
        return read_chromsizes(
            url,
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )

    def fetch_centromeres(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/centromeres.txt.gz`` with the "centromeres" schema.

        ``**kwargs`` are forwarded to ``read_table``.
        """
        url = urljoin(self._db_url, "database/centromeres.txt.gz")
        return read_table(url, schema="centromeres", **kwargs)

    def fetch_gaps(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/gap.txt.gz`` with the "gap" schema, keeping only the
        columns chrom, start, end, length, type and bridge.

        ``**kwargs`` are forwarded to ``read_table``; ``usecols`` is already
        set here, so it cannot also be passed by the caller.
        """
        url = urljoin(self._db_url, "database/gap.txt.gz")
        return read_table(
            url,
            schema="gap",
            usecols=["chrom", "start", "end", "length", "type", "bridge"],
            **kwargs,
        )

    def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame:
        """
        Read the cytoband table with the "cytoband" schema.

        Parameters
        ----------
        ideo : bool, optional
            If True, read ``database/cytoBandIdeo.txt.gz`` instead of
            ``database/cytoBand.txt.gz``.
        **kwargs :
            Forwarded to ``read_table``.
        """
        table = "cytoBandIdeo" if ideo else "cytoBand"
        url = urljoin(self._db_url, f"database/{table}.txt.gz")
        # Forward kwargs like the sibling fetch_* methods do; they were
        # previously accepted but silently dropped.
        return read_table(url, schema="cytoband", **kwargs)

    def fetch_mrna(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/all_mrna.txt.gz``.

        Unlike the other methods, the schema is passed as the
        ``SCHEMAS["all_mrna"]`` entry itself rather than by name.
        ``**kwargs`` are forwarded to ``read_table``.
        """
        url = urljoin(self._db_url, "database/all_mrna.txt.gz")
        return read_table(
            url,
            schema=SCHEMAS["all_mrna"],
            **kwargs,
        )