import numpy as np
import pandas as pd
from . import checks
from .specs import _get_default_colnames, _verify_columns, is_chrom_dtype
from .stringops import is_complete_ucsc_string, parse_region_string, to_ucsc_string
# Names exported by ``from <module> import *``. The helpers
# ``from_ucsc_string_list`` and ``add_ucsc_name_column`` defined in this
# file are not listed here.
__all__ = [
"from_any",
"from_dict",
"from_list",
"from_series",
"make_viewframe",
"sanitize_bedframe",
]
### conversions from various input formats into dataframes ###
def from_dict(regions, cols=None):
    """
    Makes a dataframe from a dictionary of {str,int} pairs, interpreted as
    chromosome names and interval ends.

    Each key becomes the chromosome of one row, the start is set to 0 and the
    value becomes the end.

    Note that {str,(int,int)} dictionaries of tuples are no longer supported!

    Parameters
    ----------
    regions : dict
        Mapping (or anything accepted by ``dict()``) from chromosome name to
        a scalar end coordinate.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are taken from
        ``_get_default_colnames()``.

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    ValueError
        If a value is not a scalar according to ``numpy.isscalar``.
    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols

    data = []
    for chrom, end in dict(regions).items():
        if not np.isscalar(end):
            # f-string so the offending type is actually shown to the caller.
            raise ValueError(f"Unsupported dict format: {type(end)}")
        data.append([chrom, 0, end])

    return pd.DataFrame(data, columns=[ck1, sk1, ek1])
def from_series(regions, cols=None):
    """
    Makes a dataframe from a pandas Series.

    The series index supplies the chromosome column, the series values supply
    the end column, and every start is set to 0.

    Parameters
    ----------
    regions : pandas.Series

    cols : (str, str, str) or None
        Names for the chromosome, start and end columns. If None, the names
        returned by ``_get_default_colnames()`` are used.

    Returns
    -------
    df : pandas.DataFrame
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_col, start_col, end_col = cols

    return pd.DataFrame(
        {
            chrom_col: regions.index.values,
            start_col: 0,
            end_col: regions.values,
        }
    )
def from_list(regions, name_col="name", cols=None):
    """
    Makes a dataframe from a list of 3- or 4-element records.

    Parameters
    ----------
    regions : list
        Records of the form (chrom, start, end) or (chrom, start, end, name);
        passed directly to ``pandas.DataFrame``.

    name_col : str
        Label for the fourth column when four columns are present.
        Default 'name'.

    cols : (str, str, str) or None
        Names for the chromosome, start and end columns. If None, the names
        returned by ``_get_default_colnames()`` are used.

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    ValueError
        If the resulting frame has neither three nor four columns.
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_col, start_col, end_col = cols

    frame = pd.DataFrame(regions)
    width = frame.shape[1]
    if width not in (3, 4):
        raise ValueError("wrong number of columns for list input format")

    labels = [chrom_col, start_col, end_col]
    if width == 4:
        labels.append(name_col)
    frame.columns = labels
    return frame
def from_ucsc_string_list(region_list, cols=None):
    """
    Makes a dataframe from an iterable of region strings.

    Every element is passed through ``parse_region_string`` and the parsed
    records are stacked into a three-column dataframe.

    Parameters
    ----------
    region_list : iterable of str
        Region strings; presumably UCSC-style 'chrom:start-end' (the exact
        accepted syntax is defined by ``parse_region_string``).

    cols : (str, str, str) or None
        Names for the chromosome, start and end columns. If None, the names
        returned by ``_get_default_colnames()`` are used.

    Returns
    -------
    df : pandas.DataFrame
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_col, start_col, end_col = cols

    records = []
    for region in region_list:
        records.append(parse_region_string(region))

    return pd.DataFrame(records, columns=[chrom_col, start_col, end_col])
def from_any(regions, fill_null=False, name_col="name", cols=None):
    """
    Attempts to make a genomic interval dataframe with columns
    [chr, start, end, name_col] from a variety of input types.

    Parameters
    ----------
    regions : supported input
        Currently supported inputs:

        - dataframe that already has the three interval columns (copied), or
          a dataframe whose ``name_col`` column holds UCSC strings
        - dictionary of {str:int} key value pairs
        - pandas series where the index is interpreted as chromosomes and
          values are interpreted as end
        - list of tuples or lists, either [(chrom,start,end)] or
          [(chrom,start,end,name)]
        - tuple of tuples or lists, either [(chrom,start,end)] or
          [(chrom,start,end,name)]
        - a single flat (chrom,start,end) list or tuple
        - flat list or tuple of UCSC strings

    fill_null : False or dictionary
        Accepts a dictionary of {str:int} pairs, interpreted as chromosome sizes.
        When given, null starts are set to 0 and null ends are looked up in
        this dictionary by chromosome.
        Kept for backwards compatibility. Default False.

    name_col : str
        Column name. Only used if 4 column list is provided, or to locate
        UCSC strings in a dataframe input. Default "name".

    cols : (str,str,str)
        Names for dataframe columns.
        Default None sets them with get_default_colnames().

    Returns
    -------
    out_df:dataframe

    Raises
    ------
    ValueError
        If the input type or dataframe layout is not recognised, or if null
        ends cannot be filled from ``fill_null``.
    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
    interval_cols = [ck1, sk1, ek1]

    if isinstance(regions, pd.DataFrame):
        if set(interval_cols).issubset(regions.columns):
            out_df = regions.copy()
        else:
            # Guard the lookup so a frame without name_col (or with no rows)
            # reports the format error below rather than KeyError/IndexError.
            names = regions[name_col].values if name_col in regions.columns else None
            if (
                names is not None
                and names.ndim == 1
                and len(names) > 0
                and is_complete_ucsc_string(names[0])
            ):
                out_df = from_ucsc_string_list(names, cols=interval_cols)
            else:
                raise ValueError("Unknown dataFrame format: check column names")

    elif isinstance(regions, dict):
        out_df = from_dict(regions, cols=interval_cols)

    elif isinstance(regions, pd.Series):
        out_df = from_series(regions, cols=interval_cols)

    elif isinstance(regions, (tuple, list)):
        shape = np.shape(regions)
        if shape == (3,):
            # A single flat (chrom, start, end) record.
            # NOTE(review): a flat sequence of exactly three UCSC strings also
            # has shape (3,) and is therefore read as one record here.
            out_df = from_list([regions], name_col=name_col, cols=interval_cols)
        elif len(shape) == 1 and len(regions) > 0 and isinstance(regions[0], str):
            out_df = from_ucsc_string_list(regions, cols=interval_cols)
        else:
            # Empty input ends up here and is rejected by from_list.
            out_df = from_list(list(regions), name_col=name_col, cols=interval_cols)

    else:
        raise ValueError(f"Unknown input format: {type(regions)}")

    if fill_null:
        out_df[sk1] = pd.to_numeric(out_df[sk1]).fillna(0)
        try:
            # Treat None, NaN and pd.NA alike, mirroring fillna(0) on starts.
            out_df[ek1] = [
                fill_null[chrom] if pd.isna(end) else end
                for chrom, end in zip(out_df[ck1].values, out_df[ek1].values)
            ]
        except Exception as e:
            raise ValueError("could not fill ends with provided chromsizes") from e

    return out_df
def add_ucsc_name_column(reg_df, name_col="name", cols=None):
    """
    Auto-creates a UCSC name 'chrom:start-end' for each region
    (chrom,start,end) in reg_df.

    Works on a copy of ``reg_df``; the input is not modified. The name of
    each row is produced by ``to_ucsc_string`` from its (chrom, start, end)
    triple. Replaces name_col if it exists.

    Parameters
    ----------
    reg_df : pandas.DataFrame

    name_col : str
        Column that receives the generated names. Default 'name'.

    cols : (str, str, str) or None
        Names of the chromosome, start and end columns. If None, the names
        returned by ``_get_default_colnames()`` are used.

    Returns
    -------
    df : pandas.DataFrame
    """
    if cols is None:
        cols = _get_default_colnames()
    chrom_col, start_col, end_col = cols

    named = reg_df.copy()
    _verify_columns(named, [chrom_col, start_col, end_col])

    triples = zip(named[chrom_col], named[start_col], named[end_col])
    named[name_col] = list(map(to_ucsc_string, triples))
    return named
def make_viewframe(
    regions,
    check_bounds=None,
    name_style=None,
    view_name_col="name",
    cols=None,
):
    """
    Makes and validates a dataframe `view_df` out of regions.

    The input is first converted with ``from_any``, optionally checked for
    containment within ``check_bounds``, given a name column if it lacks one,
    and finally validated with ``checks.is_viewframe``.

    Parameters
    ----------
    regions : supported input type
        Currently supported input types:

        - a dictionary where keys are strings and values are integers
          {str:int}, specifying regions (chrom, 0, end, chrom)
        - a pandas series of chromosomes lengths with index specifying region names
        - a list of tuples [(chrom,start,end), ...] or [(chrom,start,end,name), ...]
        - a pandas DataFrame, skips to validation step

    check_bounds : None, or chromosome sizes provided as any of valid formats above
        Optional, if provided checks if regions in the view are contained by
        regions supplied in check_bounds, typically provided as a series of
        chromosome sizes. Default None.

    name_style : None or "ucsc"
        If None and no column view_name_col, propagate values from cols[0]
        If "ucsc" and no column view_name_col, create UCSC style names
        (the comparison is case-insensitive).

    view_name_col : str
        Specifies column name of the view regions. Default 'name'.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    view_df:dataframe satisfying properties of a view

    Raises
    ------
    ValueError
        If the regions are not contained by ``check_bounds``, if
        ``name_style`` is not recognised, or if the result is not a valid
        viewframe.
    """
    chrom_col, _, _ = _get_default_colnames() if cols is None else cols

    view_df = from_any(regions, name_col=view_name_col, cols=cols)

    if check_bounds is not None:
        bounds_df = from_any(check_bounds, name_col="bounds", cols=cols)
        within_bounds = checks.is_contained(
            view_df,
            bounds_df,
            df_view_col=None,
            view_name_col="bounds",
            cols=cols,
        )
        if not within_bounds:
            raise ValueError(
                "Invalid input to make a viewFrame, regions not contained by bounds"
            )

    # Only synthesise names when the input did not bring its own.
    if view_name_col not in view_df.columns:
        if name_style is None:
            view_df[view_name_col] = view_df[chrom_col].values
        elif name_style.lower() == "ucsc":
            view_df = add_ucsc_name_column(view_df, name_col=view_name_col, cols=cols)
        else:
            raise ValueError("unknown value for name_style")

    is_valid = checks.is_viewframe(
        view_df, view_name_col=view_name_col, cols=cols, raise_errors=True
    )
    if not is_valid:
        raise ValueError("could not make valid viewFrame, retry with new input")
    return view_df
def sanitize_bedframe(
    df1,
    recast_dtypes=True,
    drop_null=False,
    start_exceed_end_action=None,
    cols=None,
):
    """
    Attempts to clean a genomic interval dataframe to be a valid bedframe.

    Works on a copy of ``df1``; the input is not modified.

    Parameters
    ----------
    df1 : pandas.DataFrame

    recast_dtypes : bool
        Whether to attempt to recast column dtypes to pandas nullable dtypes.
        The chromosome column is cast to ``str`` unless ``is_chrom_dtype``
        accepts its dtype; start and end are cast to ``Int64`` unless both
        already have that dtype.

    drop_null : bool
        Drops rows with pd.NA. Note that rows with a null in *any* column are
        dropped, not only in the interval columns. Default False.

    start_exceed_end_action : str or None
        Options: 'flip' or 'drop' or None (case-insensitive). Default None.

        - If 'flip', attempts to sanitize by flipping intervals with start>end.
        - If 'drop' attempts to sanitize dropping intervals with start>end.
        - If None, does not alter these intervals if present.

        Null intervals are never counted as start>end.

    cols : (str, str, str) or None
        The names of columns containing the chromosome, start and end of the
        genomic intervals. The default values are 'chrom', 'start', 'end'.

    Returns
    -------
    out_df : pandas.DataFrame
        Sanitized dataframe satisfying the properties of a bedframe.

    Raises
    ------
    ValueError
        If intervals with start>end are present and
        ``start_exceed_end_action`` is neither 'flip' nor 'drop', or if the
        result does not pass ``checks.is_bedframe``.

    Notes
    ------
    The option ``start_exceed_end_action='flip'`` may be useful for gff files
    with strand information but starts > ends.
    """
    ck1, sk1, ek1 = _get_default_colnames() if cols is None else cols
    interval_cols = [ck1, sk1, ek1]

    out_df = df1.copy()
    _verify_columns(out_df, interval_cols)

    if recast_dtypes:
        chrom_dtype, start_dtype, end_dtype = out_df.dtypes[interval_cols]
        if not is_chrom_dtype(chrom_dtype):
            out_df[ck1] = out_df[ck1].astype(str)
        # isinstance rather than `is`: every pd.Int64Dtype() call builds a new
        # object, so an identity test against it can never succeed.
        already_int64 = isinstance(start_dtype, pd.Int64Dtype) and isinstance(
            end_dtype, pd.Int64Dtype
        )
        if not already_int64:
            out_df[sk1] = out_df[sk1].astype(pd.Int64Dtype())
            out_df[ek1] = out_df[ek1].astype(pd.Int64Dtype())

    # An interval with any null coordinate is nulled out entirely.
    nan_intervals = pd.isnull(out_df[interval_cols]).any(axis=1)
    out_df.loc[nan_intervals, interval_cols] = pd.NA
    if drop_null:
        out_df.dropna(axis=0, inplace=True)
        out_df.reset_index(drop=True, inplace=True)

    if start_exceed_end_action is not None:
        action = start_exceed_end_action.lower()
        # With nullable dtypes the comparison yields NA for null intervals.
        # Map NA to False so that null rows are neither flipped nor dropped.
        reversed_mask = ((out_df[ek1] - out_df[sk1]) < 0).to_numpy(
            dtype=bool, na_value=False
        )
        if reversed_mask.any():
            if action == "drop":
                out_df = out_df.loc[~reversed_mask]
            elif action == "flip":
                out_df.loc[reversed_mask, [sk1, ek1]] = out_df.loc[
                    reversed_mask, [ek1, sk1]
                ].values
            else:
                raise ValueError("unknown action for intervals with start>end")
            out_df.reset_index(drop=True, inplace=True)

    if checks.is_bedframe(out_df, cols=cols):
        return out_df
    raise ValueError("could not sanitize")