Module ethik.utils
Expand source code
import decimal
import numpy as np
import pandas as pd
import plotly.graph_objs as go
# Names exported by a star-import of this module. `plot_template` and
# `set_fig_size` are defined below but are deliberately absent from this list.
__all__ = [
    "decimal_range",
    "extract_category",
    "join_with_overlap",
    "safe_scale",
    "to_pandas",
    "yield_masks",
]
# Plotly layout template: white plot background, and for both axes a 1px black
# axis line, no zero line, and a light grey (#eee) grid.
plot_template = go.layout.Template(
    layout=go.Layout(
        plot_bgcolor="white",
        xaxis=dict(
            showline=True,
            linewidth=1,
            linecolor="black",
            zeroline=False,
            showgrid=True,
            gridcolor="#eee",
        ),
        yaxis=dict(
            showline=True,
            linewidth=1,
            linecolor="black",
            zeroline=False,
            showgrid=True,
            gridcolor="#eee",
        ),
    )
)
def set_fig_size(fig, size, width=None, height=None):
    """Apply a width and a height to the layout of `fig`.

    Args:
        fig: Object exposing `update_layout(width=..., height=...)`.
        size: A `(width, height)` pair. When it is not `None` it takes precedence
            over the `width` and `height` arguments.
        width: Width used when `size` is `None`.
        height: Height used when `size` is `None`.
    """
    dimensions = (width, height) if size is None else size
    fig_width, fig_height = dimensions
    fig.update_layout(width=fig_width, height=fig_height)
def safe_scale(x):
    """Center `x` on its mean and divide by its standard deviation.

    If a standard deviation is zero, the values are constant and equal to the
    mean, so the centered values are already zero. The divisor is then replaced
    by 1 to avoid a division by zero.

    Args:
        x: A `pandas.DataFrame` (scaled column by column), or any object exposing
            `mean(axis=0)` and `std(axis=0)`, such as a `pandas.Series` or a
            1-D / 2-D `numpy.ndarray`.

    Returns:
        The scaled values, with the same shape as `x`.
    """
    if isinstance(x, pd.DataFrame):
        return (x - x.mean()) / x.std().replace(0, 1)

    std = x.std(axis=0)
    if np.ndim(std) == 0:
        # Scalar std (Series or 1-D array): zero is falsy, so fall back to 1.
        return (x - x.mean(axis=0)) / (std or 1)
    # One std per column (2-D array): `or` is ambiguous on an array, so the
    # zero entries are replaced element-wise instead.
    return (x - x.mean(axis=0)) / np.where(std == 0, 1, std)
def extract_category(X, cat):
    """Return the indicator column of the category `cat` in `X`.

    `X` is one-hot encoded with `pd.get_dummies`, the column matching `cat` is
    selected, and the result is renamed to `"<X.name>=<cat>"`.
    """
    dummies = pd.get_dummies(X)
    indicator = dummies[cat]
    label = f"{X.name}={cat}"
    return indicator.rename(label)
def decimal_range(start: float, stop: float, step: float):
    """Like the `range` function but works for decimal values.

    Unlike `range`, `stop` is inclusive: it is yielded when the sequence lands
    on it exactly. This is more accurate than using `np.arange` because the
    accumulation is done with `decimal.Decimal`, which doesn't introduce any
    round-off errors.

    Args:
        start: First value of the sequence.
        stop: Upper bound of the sequence (inclusive).
        step: Increment between two consecutive values.

    Yields:
        float: The successive values `start`, `start + step`, ... that are
            lower than or equal to `stop`.

    Raises:
        ValueError: If `step` is not positive while `start <= stop`, because the
            sequence could never go past `stop`. As this is a generator, the
            error is raised when iteration starts.
    """
    # Converting through `str` keeps the short decimal representation of the
    # floats instead of their exact binary expansion.
    current = decimal.Decimal(str(start))
    upper = decimal.Decimal(str(stop))
    increment = decimal.Decimal(str(step))

    if increment <= 0 and current <= upper:
        # Without this guard the loop below would never terminate.
        raise ValueError(f"step must be positive, got {step}")

    while current <= upper:
        yield float(current)
        current += increment
def to_pandas(x):
    """Converts an array-like to a Series or a DataFrame depending on the dimensionality.

    Series and DataFrames are returned untouched. Anything else is viewed as a
    numpy array: 2 dimensions give a DataFrame, fewer give a Series.

    Raises:
        ValueError: If the array has more than 2 dimensions.
    """
    if isinstance(x, (pd.Series, pd.DataFrame)):
        return x

    values = x if isinstance(x, np.ndarray) else np.asarray(x)
    if values.ndim > 2:
        raise ValueError("x must have 1 or 2 dimensions")
    return pd.DataFrame(values) if values.ndim == 2 else pd.Series(values)
def join_with_overlap(left, right, on):
    """Joins left with right while handling overlapping columns.

    A plain `left.join(right)` raises an error when both frames share column
    names. Here the shared columns are set aside from left before joining, so
    the joined frame carries the values of right for them. Wherever right
    provides no value (missing after the join), the value originally held by
    left is put back.

    Args:
        left: The DataFrame being joined.
        right: The DataFrame to join with.
        on: Forwarded to `DataFrame.join` as its `on` argument.

    Returns:
        A new DataFrame; the non-shared columns of left come first, followed
        by the columns of right.
    """
    shared = left.columns.intersection(right.columns)
    saved = left[shared]
    joined = left.drop(columns=saved.columns).join(right, on=on)
    for name in saved.columns:
        joined[name] = joined[name].fillna(saved[name])
    return joined
def yield_masks(n_masks, n, p):
    """Yields `n_masks` boolean masks that each keep a proportion `p` of `n` items.

    Args:
        n_masks (int): The number of masks to yield. It corresponds to the number
            of samples we use to compute the confidence interval.
        n (int): The number of items being filtered. It corresponds to the size
            of the dataset.
        p (float): The proportion of items to keep.

    Returns:
        generator: A generator of `n_masks` arrays of `n` booleans drawn from a
            binomial distribution. As it is a probabilistic approach, we may get
            more or fewer than `p*n` items kept, but it is not a problem with
            large datasets. When `p` is 1 no sampling happens and every mask
            keeps all the items.

    Raises:
        ValueError: If `p` is lower than 0 or greater than 1. As this is a
            generator, the error is raised when iteration starts.
    """
    if p < 0 or p > 1:
        raise ValueError(f"p must be between 0 and 1, got {p}")

    keep_everything = not p < 1
    for _ in range(n_masks):
        if keep_everything:
            yield np.full(shape=n, fill_value=True)
        else:
            draws = np.random.binomial(1, p, size=n)
            yield draws.astype(bool)
Functions
def decimal_range(start, stop, step)
-
Like the `range` function but works for decimal values. This is more accurate than using `np.arange` because it doesn't introduce any round-off errors.
Expand source code
def decimal_range(start: float, stop: float, step: float):
    """Like the `range` function but works for decimal values.

    Unlike `range`, `stop` is inclusive: it is yielded when the sequence lands
    on it exactly. This is more accurate than using `np.arange` because the
    accumulation is done with `decimal.Decimal`, which doesn't introduce any
    round-off errors.

    Args:
        start: First value of the sequence.
        stop: Upper bound of the sequence (inclusive).
        step: Increment between two consecutive values.

    Yields:
        float: The successive values `start`, `start + step`, ... that are
            lower than or equal to `stop`.

    Raises:
        ValueError: If `step` is not positive while `start <= stop`, because the
            sequence could never go past `stop`. As this is a generator, the
            error is raised when iteration starts.
    """
    # Converting through `str` keeps the short decimal representation of the
    # floats instead of their exact binary expansion.
    current = decimal.Decimal(str(start))
    upper = decimal.Decimal(str(stop))
    increment = decimal.Decimal(str(step))

    if increment <= 0 and current <= upper:
        # Without this guard the loop below would never terminate.
        raise ValueError(f"step must be positive, got {step}")

    while current <= upper:
        yield float(current)
        current += increment
def extract_category(X, cat)
-
Expand source code
def extract_category(X, cat):
    """Return the indicator column of the category `cat` in `X`.

    `X` is one-hot encoded with `pd.get_dummies`, the column matching `cat` is
    selected, and the result is renamed to `"<X.name>=<cat>"`.
    """
    dummies = pd.get_dummies(X)
    indicator = dummies[cat]
    label = f"{X.name}={cat}"
    return indicator.rename(label)
def join_with_overlap(left, right, on)
-
Joins left with right while handling overlapping columns.
Currently, pandas raises an error if left and right have overlapping columns. This function takes care of preserving the values from the columns of left that exist in right.
Expand source code
def join_with_overlap(left, right, on):
    """Joins left with right while handling overlapping columns.

    A plain `left.join(right)` raises an error when both frames share column
    names. Here the shared columns are set aside from left before joining, so
    the joined frame carries the values of right for them. Wherever right
    provides no value (missing after the join), the value originally held by
    left is put back.

    Args:
        left: The DataFrame being joined.
        right: The DataFrame to join with.
        on: Forwarded to `DataFrame.join` as its `on` argument.

    Returns:
        A new DataFrame; the non-shared columns of left come first, followed
        by the columns of right.
    """
    shared = left.columns.intersection(right.columns)
    saved = left[shared]
    joined = left.drop(columns=saved.columns).join(right, on=on)
    for name in saved.columns:
        joined[name] = joined[name].fillna(saved[name])
    return joined
def safe_scale(x)
-
Expand source code
def safe_scale(x):
    """Center `x` on its mean and divide by its standard deviation.

    If a standard deviation is zero, the values are constant and equal to the
    mean, so the centered values are already zero. The divisor is then replaced
    by 1 to avoid a division by zero.

    Args:
        x: A `pandas.DataFrame` (scaled column by column), or any object exposing
            `mean(axis=0)` and `std(axis=0)`, such as a `pandas.Series` or a
            1-D / 2-D `numpy.ndarray`.

    Returns:
        The scaled values, with the same shape as `x`.
    """
    if isinstance(x, pd.DataFrame):
        return (x - x.mean()) / x.std().replace(0, 1)

    std = x.std(axis=0)
    if np.ndim(std) == 0:
        # Scalar std (Series or 1-D array): zero is falsy, so fall back to 1.
        return (x - x.mean(axis=0)) / (std or 1)
    # One std per column (2-D array): `or` is ambiguous on an array, so the
    # zero entries are replaced element-wise instead.
    return (x - x.mean(axis=0)) / np.where(std == 0, 1, std)
def to_pandas(x)
-
Converts an array-like to a Series or a DataFrame depending on the dimensionality.
Expand source code
def to_pandas(x):
    """Converts an array-like to a Series or a DataFrame depending on the dimensionality.

    Series and DataFrames are returned untouched. Anything else is viewed as a
    numpy array: 2 dimensions give a DataFrame, fewer give a Series.

    Raises:
        ValueError: If the array has more than 2 dimensions.
    """
    if isinstance(x, (pd.Series, pd.DataFrame)):
        return x

    values = x if isinstance(x, np.ndarray) else np.asarray(x)
    if values.ndim > 2:
        raise ValueError("x must have 1 or 2 dimensions")
    return pd.DataFrame(values) if values.ndim == 2 else pd.Series(values)
def yield_masks(n_masks, n, p)
-
Generates a list of `n_masks` to keep a proportion `p` of `n` items.
Args
n_masks : int
- The number of masks to yield. It corresponds to the number of samples we use to compute the confidence interval.
n : int
- The number of items being filtered. It corresponds to the size of the dataset.
p : float
- The proportion of items to keep.
Returns
generator
- A generator of `n_masks` lists of `n` booleans being generated with a binomial distribution. As it is a probabilistic approach, we may get more or fewer than `p*n` items kept, but it is not a problem with large datasets.
Expand source code
def yield_masks(n_masks, n, p):
    """Yields `n_masks` boolean masks that each keep a proportion `p` of `n` items.

    Args:
        n_masks (int): The number of masks to yield. It corresponds to the number
            of samples we use to compute the confidence interval.
        n (int): The number of items being filtered. It corresponds to the size
            of the dataset.
        p (float): The proportion of items to keep.

    Returns:
        generator: A generator of `n_masks` arrays of `n` booleans drawn from a
            binomial distribution. As it is a probabilistic approach, we may get
            more or fewer than `p*n` items kept, but it is not a problem with
            large datasets. When `p` is 1 no sampling happens and every mask
            keeps all the items.

    Raises:
        ValueError: If `p` is lower than 0 or greater than 1. As this is a
            generator, the error is raised when iteration starts.
    """
    if p < 0 or p > 1:
        raise ValueError(f"p must be between 0 and 1, got {p}")

    keep_everything = not p < 1
    for _ in range(n_masks):
        if keep_everything:
            yield np.full(shape=n, fill_value=True)
        else:
            draws = np.random.binomial(1, p, size=n)
            yield draws.astype(bool)