import numpy as np
import pandas as pd
from .. import ops
from . import construction
from .specs import _get_default_colnames, _verify_column_dtypes, _verify_columns
# Public API of this module: the validation predicates defined below.
__all__ = [
"is_bedframe",
"is_cataloged",
"is_overlapping",
"is_viewframe",
"is_contained",
"is_covering",
"is_tiling",
"is_sorted",
]
[docs]
def is_bedframe(
    df,
    raise_errors=False,
    cols=None,
):
    """
    Checks that required bedframe properties are satisfied for dataframe `df`.

    This includes:

    - chrom, start, end columns
    - columns have valid dtypes
    - for each interval, if any of chrom, start, end are null, then all are
      null
    - all starts <= ends (zero-length intervals are accepted).

    Parameters
    ----------
    df : pandas.DataFrame

    raise_errors : bool, optional [default: False]
        If True, raises errors instead of returning a boolean False for invalid
        properties.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_bedframe:bool

    Raises
    ------
    TypeError
        If `raise_errors` is True and the required columns are missing or
        have invalid dtypes.
    ValueError
        If `raise_errors` is True and an interval is only partially null, or
        its start exceeds its end.

    Notes
    -----
    Valid dtypes for chrom are object, string, or categorical.
    Valid dtypes for start and end are int/Int64Dtype.
    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

    if not _verify_columns(df, [ck1, sk1, ek1], return_as_bool=True):
        if raise_errors:
            raise TypeError("Invalid bedFrame: Invalid column names")
        return False

    if not _verify_column_dtypes(df, cols=[ck1, sk1, ek1], return_as_bool=True):
        if raise_errors:
            raise TypeError("Invalid bedFrame: Invalid column dtypes")
        return False

    # A row is invalid when some, but not all, of its coordinates are null.
    nan_intervals = pd.isnull(df[[ck1, sk1, ek1]])
    partially_null = nan_intervals.any(axis=1) & ~nan_intervals.all(axis=1)
    if partially_null.any():
        if raise_errors:
            raise ValueError(
                "Invalid bedFrame: Invalid null values "
                "(if any of chrom, start, end are null, then all must be null)"
            )
        return False

    # Compute the mask once. Series.sum() skips missing values, whereas the
    # builtin sum() would fold a nullable-integer NA into the count.
    starts_exceed_ends = (df[ek1] - df[sk1]) < 0
    if starts_exceed_ends.any():
        if raise_errors:
            raise ValueError(
                f"Invalid bedframe: starts exceed ends for "
                f"{starts_exceed_ends.sum()} intervals"
            )
        return False

    return True
[docs]
def is_cataloged(
    df, view_df, raise_errors=False, df_view_col="view_region", view_name_col="name"
):
    """
    Tests if all region names in `df[df_view_col]` are present in
    `view_df[view_name_col]`.

    Null entries in `df[df_view_col]` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    df_view_col: str
        Name of column from df that indicates region in view.

    view_name_col: str
        Name of column from view that specifies region name.

    Returns
    -------
    is_cataloged:bool

    Raises
    ------
    ValueError
        If `raise_errors` is True and either column is missing, or some
        region named in `df` is absent from `view_df`.

    Notes
    -----
    Does not check if names in `view_df[view_name_col]` are unique.
    """
    if not _verify_columns(df, [df_view_col], return_as_bool=True):
        if raise_errors:
            raise ValueError(f"Could not find `{df_view_col}` column in df")
        return False

    if not _verify_columns(view_df, [view_name_col], return_as_bool=True):
        if raise_errors:
            # Single literal: a backslash continuation inside the string
            # would embed the source indentation in the message.
            raise ValueError(f"Could not find `{view_name_col}` column in view_df")
        return False

    # Nulls are dropped before both the check and the report, so the error
    # message lists only region names that are genuinely uncataloged.
    assigned_regions = set(df[df_view_col].dropna().values)
    missing_regions = assigned_regions.difference(view_df[view_name_col].values)
    if missing_regions:
        if raise_errors:
            raise ValueError(
                "The following regions in df[df_view_col] not in "
                f"view_df[view_name_col]: \n{missing_regions}"
            )
        return False

    return True
[docs]
def is_overlapping(df, cols=None):
    """
    Tests if any genomic intervals in a bioframe `df` overlap.

    The summed length of the intervals in `df` is compared with the summed
    length of the intervals after merging; a larger total before merging is
    reported as an overlap.

    Also see :func:`bioframe.ops.merge()`.

    Parameters
    ----------
    df : pandas.DataFrame

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_overlapping:bool
    """
    from ..ops import merge

    if cols is None:
        _, start_col, end_col = _get_default_colnames()
    else:
        _, start_col, end_col = cols

    merged = merge(df, cols=cols)

    length_before = np.sum((df[end_col] - df[start_col]).values)
    length_after = np.sum((merged[end_col] - merged[start_col]).values)
    return bool(length_before > length_after)
[docs]
def is_viewframe(region_df, raise_errors=False, view_name_col="name", cols=None):
    """
    Checks that `region_df` is a valid viewFrame.

    This includes:

    - it satisfies requirements for a bedframe, including columns for
      ('chrom', 'start', 'end')
    - it has an additional column, view_name_col, with default 'name'
    - it does not contain null values
    - entries in the view_name_col are unique.
    - intervals are non-overlapping

    Parameters
    ----------
    region_df : pandas.DataFrame
        Dataframe of genomic intervals to be tested.

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    view_name_col : str
        Specifies column name of the view regions. Default 'name'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    is_viewframe:bool

    Raises
    ------
    TypeError
        If `raise_errors` is True and a required column is missing.
    ValueError
        If `raise_errors` is True and any other requirement is violated.
    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

    if not _verify_columns(
        region_df, [ck1, sk1, ek1, view_name_col], return_as_bool=True
    ):
        if raise_errors:
            raise TypeError("Invalid view: invalid column names")
        return False

    if not is_bedframe(region_df, cols=cols):
        if raise_errors:
            raise ValueError("Invalid view: not a bedframe")
        return False

    # Nulls are rejected in every column, not only the interval columns.
    if pd.isna(region_df).values.any():
        if raise_errors:
            raise ValueError("Invalid view: cannot contain NAs")
        return False

    if not region_df[view_name_col].is_unique:
        if raise_errors:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which would embed source indentation in the text.
            raise ValueError(
                "Invalid view: entries in "
                "region_df[view_name_col] must be unique"
            )
        return False

    if is_overlapping(region_df, cols=cols):
        if raise_errors:
            raise ValueError("Invalid view: entries must be non-overlapping")
        return False

    return True
[docs]
def is_contained(
    df,
    view_df,
    raise_errors=False,
    df_view_col=None,
    view_name_col="name",
    cols=None,
    cols_view=None,
):
    """
    Tests if all genomic intervals in a bioframe `df` are cataloged and do not
    extend beyond their associated region in the view `view_df`.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame
        Valid viewframe.

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    df_view_col: str or None
        Column from df used to associate intervals with view regions.
        If None (the default), intervals are matched to view regions by
        overlap instead of by name.

    view_name_col:
        Column from view_df with view region names. Default `name`.

    cols: (str, str, str)
        Column names for chrom, start, end in df.

    cols_view: (str, str, str)
        Column names for chrom, start, end in view_df.

    Returns
    -------
    is_contained:bool

    Raises
    ------
    AssertionError
        If `raise_errors` is True, `df_view_col` is None and some interval
        is not contained in an overlapping view region.
    ValueError
        If `raise_errors` is True, `df_view_col` is given and `df` is not
        cataloged in, or not contained in, `view_df`.
    """
    from ..ops import trim

    _, sk1, ek1 = _get_default_colnames() if cols is None else cols
    _, sk2, ek2 = _get_default_colnames() if cols_view is None else cols_view

    if df_view_col is None:
        df_view_assigned = ops.overlap(df, view_df, cols1=cols, cols2=cols_view)
        # Columns coming from view_df are looked up under their name plus a
        # trailing underscore (e.g. 'start_' / 'end_' for the defaults).
        view_starts = df_view_assigned[sk2 + "_"]
        view_ends = df_view_assigned[ek2 + "_"]

        # Explicit checks rather than `assert` statements: assertions are
        # stripped under `python -O`, which would make this branch always
        # report containment.
        contained = (
            not view_ends.isna().any()
            and not view_starts.isna().any()
            and (df_view_assigned[ek1] <= view_ends).all()
            and (df_view_assigned[sk1] >= view_starts).all()
        )
        if contained:
            return True
        if raise_errors:
            # AssertionError is kept so existing callers' handlers still work.
            raise AssertionError("df not contained in view_df")
        return False

    if not is_cataloged(
        df, view_df, df_view_col=df_view_col, view_name_col=view_name_col
    ):
        if raise_errors:
            raise ValueError("df not cataloged in view_df")
        return False

    df_trim = trim(
        df,
        view_df=view_df,
        df_view_col=df_view_col,
        view_name_col=view_name_col,
        cols=cols,
        cols_view=cols_view,
    )
    # If trimming to the view moved any coordinate, that interval extended
    # beyond its region.
    is_start_trimmed = np.any(df[sk1].values != df_trim[sk1].values)
    is_end_trimmed = np.any(df[ek1].values != df_trim[ek1].values)

    if is_start_trimmed or is_end_trimmed:
        if raise_errors:
            raise ValueError("df not contained in view_df")
        return False

    return True
[docs]
def is_covering(df, view_df, view_name_col="name", cols=None, cols_view=None):
    """
    Tests if a view `view_df` is covered by the set of genomic intervals in
    the bedframe `df`.

    The view is considered covered when ``complement(df, view_df)`` returns
    an empty dataframe. Region assignments already present in `df` play no
    part in this test, since regions are re-assigned in
    :func:`bioframe.ops.complement`.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame
        Valid viewFrame.

    view_name_col:
        Column from view_df with view region names. Default `name`.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in df. The default values are
        'chrom', 'start', 'end'.

    cols_view: (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in view_df. The default values are
        'chrom', 'start', 'end'.

    Returns
    -------
    is_covering:bool
    """
    from ..ops import complement

    uncovered = complement(
        df,
        view_df=view_df,
        view_name_col=view_name_col,
        cols=cols,
        cols_view=cols_view,
    )
    return uncovered.empty
[docs]
def is_tiling(
    df,
    view_df,
    raise_errors=False,
    df_view_col="view_region",
    view_name_col="name",
    cols=None,
    cols_view=None,
):
    """
    Tests if a view `view_df` is tiled by the set of genomic intervals in the
    bedframe `df`.

    The checks run in this order and stop at the first failure:

    - df is not overlapping
    - df is covering view_df
    - df is contained in view_df

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame
        valid viewFrame

    raise_errors : bool
        If True, raises errors instead of returning a boolean False for invalid
        properties. Default False.

    df_view_col: str
        Name of column from df that indicates region in view.

    view_name_col: str
        Name of column from view that specifies unique region name.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in df. The default values are
        'chrom', 'start', 'end'.

    cols_view: (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in view_df. The default values are
        'chrom', 'start', 'end'.

    Returns
    -------
    is_tiling:bool
    """
    view_df = construction.make_viewframe(
        view_df, view_name_col=view_name_col, cols=cols_view
    )

    shared_kwargs = {
        "view_name_col": view_name_col,
        "cols": cols,
        "cols_view": cols_view,
    }

    # Record the reason for the first failing check, if any.
    failure = None
    if is_overlapping(df, cols=cols):
        failure = "overlaps"
    elif not is_covering(df, view_df, **shared_kwargs):
        failure = "not covered"
    elif not is_contained(df, view_df, df_view_col=df_view_col, **shared_kwargs):
        failure = "not contained"

    if failure is None:
        return True
    if raise_errors:
        raise ValueError(failure)
    return False
[docs]
def is_sorted(
    df,
    view_df=None,
    reset_index=True,
    df_view_col=None,
    view_name_col="name",
    cols=None,
    cols_view=None,
):
    """
    Tests whether sorting leaves a bedframe unchanged.

    A copy of `df` is passed through ``sort_bedframe`` and the result is
    compared with `df` using ``DataFrame.equals``.

    Also see :func:`bioframe.ops.sort_bedframe`.

    Parameters
    ----------
    df : pandas.DataFrame

    view_df : pandas.DataFrame | dict-like
        Optional view to pass to ``sort_bedframe``.
        When it is dict-like :func:`bioframe.make_viewframe` will
        be used to convert to viewframe. If view_df is not provided
        df is assumed to be sorted by chrom and start.

    reset_index : bool
        Optional argument to pass to ``sort_bedframe``.

    df_view_col: None | str
        Name of column from df that indicates region in view.
        If None, :func:`bioframe.assign_view` will be used to assign view
        regions. Default None.

    view_name_col: str
        Name of column from view that specifies unique region name.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in df. The default values are
        'chrom', 'start', 'end'.

    cols_view: (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals in view_df. The default values are
        'chrom', 'start', 'end'.

    Returns
    -------
    is_sorted : bool
    """
    from ..ops import sort_bedframe

    sort_options = {
        "view_df": view_df,
        "reset_index": reset_index,
        "df_view_col": df_view_col,
        "view_name_col": view_name_col,
        "cols": cols,
        "cols_view": cols_view,
    }
    # Sort a copy and compare it with the caller's frame.
    df_sorted = sort_bedframe(df.copy(), **sort_options)
    return df.equals(df_sorted)