Source code for bioframe.io.resources

import urllib
from functools import partial
from typing import Union
from urllib.parse import urljoin

import numpy as np
import pandas as pd

from .assembly import assembly_info
from .fileops import read_chromsizes, read_table
from .schemas import SCHEMAS

# Public names exported by this module (used by ``from ... import *``).
__all__ = [
    "fetch_chromsizes",
    "fetch_centromeres",
    "UCSCClient",
]


def fetch_chromsizes(
    db: str,
    *,
    provider: str = "local",
    as_bed: bool = False,
    filter_chroms: bool = True,
    chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
    natsort: bool = True,
    **kwargs,
) -> Union[pd.Series, pd.DataFrame]:
    """
    Return chromosome sizes for an assembly, from local storage or UCSC.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        Where to get the chromsizes from: "local" for local storage or
        "ucsc" for the UCSC database.
    as_bed : bool, optional
        If True, return an interval DataFrame (chrom, start, end) instead
        of a Series.

        The options below are only used when ``provider="ucsc"``.
    filter_chroms : bool, optional
        Keep only the sequence names matched by ``chrom_patterns``.
    chrom_patterns : sequence, optional
        Regular expressions capturing the desired sequence names.
    natsort : bool, optional
        Sort each captured group of names in natural order. Default is True.
    **kwargs :
        Passed to :func:`pandas.read_csv`

    Returns
    -------
    Series of integer bp lengths indexed by sequence name or BED3 DataFrame.

    Raises
    ------
    ValueError
        If ``provider`` is neither "local" nor "ucsc".

    Notes
    -----
    For more fine-grained control over the chromsizes from local storage,
    use :func:`bioframe.assembly_info`.

    Examples
    --------
    >>> fetch_chromsizes("hg38")
    name
    chr1     248956422
    chr2     242193529
    chr3     198295559
    ...     ...
    chrX     156040895
    chrY      57227415
    chrM         16569
    Name: length, dtype: int64

    >>> fetch_chromsizes("hg38", as_bed=True)
            chrom  start        end
    0        chr1      0  248956422
    1        chr2      0  242193529
    2        chr3      0  198295559
    ...      ...
    21       chrX      0  156040895
    22       chrY      0   57227415
    23       chrM      0      16569

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    if provider == "ucsc":
        # Remote lookup: filtering and sorting are delegated to the client.
        client = UCSCClient(db)
        return client.fetch_chromsizes(
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )

    if provider != "local":
        raise ValueError(f"Unknown provider '{provider}'")

    info = assembly_info(db)
    if not as_bed:
        return info.chromsizes
    # Copy so the caller cannot mutate the assembly's own viewframe.
    return info.viewframe[["chrom", "start", "end"]].copy()
def _origins_from_cytoband(
    cyb: pd.DataFrame, band_col: str = "gieStain"
) -> pd.DataFrame:
    """
    Extract chromosomal origin positions separating chromosome arms from
    cytological band data. Takes the cytological origin, i.e. the boundary
    between the two bands labeled 'acen'.

    Parameters
    ----------
    cyb : pandas.DataFrame
        DataFrame with cytoband data. Must provide the columns 'chrom',
        'start', 'end' and the column named by ``band_col``.
    band_col : str, optional
        Name of the column holding the band label. Default is "gieStain".

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid', one row per
        chromosome in order of first appearance. 'start' and 'end' span both
        'acen' bands and 'mid' is the boundary between them. If ``cyb`` has
        no 'acen' bands, the result is empty but keeps these four columns.

    Raises
    ------
    ValueError
        If a chromosome has a number of 'acen' bands other than two.
    """
    columns = ["chrom", "start", "end", "mid"]
    acen_bands = cyb[cyb[band_col] == "acen"]

    records = []
    # sort=False keeps chromosomes in the order they appear in the input.
    for chrom, group in acen_bands.groupby("chrom", sort=False):
        if len(group) != 2:
            raise ValueError(
                f"Expected 2 'acen' bands for {chrom}, found {len(group)}"
            )
        ordered = group.sort_values("start")
        left, right = ordered.iloc[0], ordered.iloc[1]
        records.append(
            {
                "chrom": chrom,
                "start": left["start"],
                "end": right["end"],
                # The origin is where the first band ends.
                "mid": left["end"],
            }
        )

    if not records:
        # from_records([]) would yield a frame without any columns; keep the
        # documented schema so callers can index the result unconditionally.
        return pd.DataFrame(columns=columns)
    return pd.DataFrame.from_records(records)


def _origins_from_ucsccentromeres(cens: pd.DataFrame) -> pd.DataFrame:
    """
    Extract chromosomal origin positions from UCSC centromeres.txt table
    describing centromere model sequences. For each chromosome, takes the
    midpoint of the span from the smallest 'start' to the largest 'end' of
    its modeled centromere sequences.

    Parameters
    ----------
    cens : pandas.DataFrame
        DataFrame with centromeres.txt data. Must provide the columns
        'chrom', 'start' and 'end'.

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns 'chrom', 'start', 'end', 'mid', sorted by
        'chrom' with a fresh integer index.
    """
    # String aliases instead of np.min/np.max: passing NumPy callables to a
    # groupby aggregation is deprecated in recent pandas and emits a
    # FutureWarning, while the aliases give the same result.
    spans = (
        cens.groupby("chrom")
        .agg({"start": "min", "end": "max"})
        .reset_index()
    )
    spans["mid"] = (spans["start"] + spans["end"]) // 2
    return (
        spans[["chrom", "start", "end", "mid"]]
        .sort_values("chrom")
        .reset_index(drop=True)
    )
def fetch_centromeres(db: str, provider: str = "local") -> pd.DataFrame:
    """
    Extract centromere locations for a given assembly 'db' from a variety
    of file formats in UCSC (cytoband, centromeres) depending on
    availability, returning a DataFrame.

    Parameters
    ----------
    db : str
        Assembly name.
    provider : str, optional [default: "local"]
        The provider of centromere data. Either "local" for local storage
        or "ucsc".

    Returns
    -------
    DataFrame with centromere 'chrom', 'start', 'end', 'mid'.

    Raises
    ------
    ValueError
        If ``provider`` is unknown, if the provider has no source of
        centromere data for ``db``, or if a cytoband table does not contain
        exactly two 'acen' bands for some chromosome.

    Notes
    -----
    When provider="local", centromeres are derived from cytoband tables in
    local storage.

    When provider="ucsc", the fallback priority goes as follows:

    - UCSC cytoBand
    - UCSC cytoBandIdeo
    - UCSC centromeres.txt

    Note that UCSC "gap" files no longer provide centromere information.

    Currently only works for human assemblies.

    See also
    --------
    bioframe.assembly_info
    bioframe.UCSCClient
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.error`` submodule has been loaded.
    from urllib.error import HTTPError

    if provider == "local":
        cyb = assembly_info(db).cytobands
        if cyb is None:
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )
        # Local cytoband tables store the band label in the "stain" column.
        return _origins_from_cytoband(cyb, band_col="stain")

    if provider == "ucsc":
        client = UCSCClient(db)
        # (schema, fetcher) pairs in fallback order.
        sources = [
            ("cytoband", client.fetch_cytoband),
            ("cytoband", partial(client.fetch_cytoband, ideo=True)),
            ("centromeres", client.fetch_centromeres),
        ]
        for schema, fetcher in sources:
            try:
                df = fetcher()
            except HTTPError:
                # This table could not be downloaded; try the next source.
                continue
            break
        else:
            # Every source failed with an HTTP error.
            raise ValueError(
                f"No source for centromere data found from provider '{provider}'."
            )

        if schema == "centromeres":
            return _origins_from_ucsccentromeres(df)
        return _origins_from_cytoband(df)

    raise ValueError(f"Unknown provider '{provider}'")
class UCSCClient:
    """
    Minimal client for downloading per-assembly tables from the UCSC
    download server.

    Every ``fetch_*`` method builds a URL under
    ``<BASE_URL>goldenPath/<db>/`` and hands it to a reader from
    ``.fileops``; extra keyword arguments are forwarded to that reader.

    Parameters
    ----------
    db : str
        Assembly name, used as the directory name under ``goldenPath/`` and
        as the prefix of the chrom.sizes file.
    """

    # Root of the UCSC download server; all URLs are resolved against it.
    BASE_URL = "https://hgdownload.soe.ucsc.edu/"

    def __init__(self, db: str):
        self._db = db
        self._db_url = urljoin(self.BASE_URL, f"goldenPath/{db}/")

    def fetch_chromsizes(
        self,
        filter_chroms: bool = True,
        chrom_patterns: tuple = (r"^chr[0-9]+$", r"^chr[XY]$", r"^chrM$"),
        natsort: bool = True,
        as_bed: bool = False,
        **kwargs,
    ) -> Union[pd.Series, pd.DataFrame]:
        """
        Read ``bigZips/<db>.chrom.sizes`` for this assembly.

        All arguments, including ``**kwargs``, are passed through to
        ``read_chromsizes``.
        """
        url = urljoin(self._db_url, f"bigZips/{self._db}.chrom.sizes")
        return read_chromsizes(
            url,
            filter_chroms=filter_chroms,
            chrom_patterns=chrom_patterns,
            natsort=natsort,
            as_bed=as_bed,
            **kwargs,
        )

    def fetch_centromeres(self, **kwargs) -> pd.DataFrame:
        """Read ``database/centromeres.txt.gz`` with the "centromeres" schema."""
        url = urljoin(self._db_url, "database/centromeres.txt.gz")
        return read_table(url, schema="centromeres", **kwargs)

    def fetch_gaps(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/gap.txt.gz`` with the "gap" schema, keeping only the
        chrom, start, end, length, type and bridge columns.
        """
        url = urljoin(self._db_url, "database/gap.txt.gz")
        return read_table(
            url,
            schema="gap",
            usecols=["chrom", "start", "end", "length", "type", "bridge"],
            **kwargs,
        )

    def fetch_cytoband(self, ideo: bool = False, **kwargs) -> pd.DataFrame:
        """
        Read a cytoband table with the "cytoband" schema.

        Parameters
        ----------
        ideo : bool, optional
            If True, read ``database/cytoBandIdeo.txt.gz`` instead of
            ``database/cytoBand.txt.gz``.
        **kwargs :
            Passed to ``read_table``.
        """
        table = "cytoBandIdeo" if ideo else "cytoBand"
        url = urljoin(self._db_url, f"database/{table}.txt.gz")
        # Forward kwargs like the sibling fetchers do; they used to be
        # accepted here but silently discarded.
        return read_table(url, schema="cytoband", **kwargs)

    def fetch_mrna(self, **kwargs) -> pd.DataFrame:
        """
        Read ``database/all_mrna.txt.gz`` using the ``SCHEMAS["all_mrna"]``
        schema.
        """
        url = urljoin(self._db_url, "database/all_mrna.txt.gz")
        return read_table(
            url,
            schema=SCHEMAS["all_mrna"],
            **kwargs,
        )