# Source code for phenotypic.analysis.abc_._set_analyzer

from __future__ import annotations

import abc
from typing import TYPE_CHECKING, Callable, List

import pandas as pd
import numpy as np
from collections.abc import Iterable
from typing import Any, Mapping


class SetAnalyzer(abc.ABC):
    """Abstract base class for analyzers that aggregate measurement tables.

    Concrete subclasses must implement :meth:`analyze`, :meth:`show`,
    :meth:`results`, and the per-group worker :meth:`_apply2group_func`.
    The base class supplies small, side-effect-free helpers for filtering
    rows (:meth:`_filter_by`) and coercing arrays to float
    (:meth:`_ensure_float_array`).
    """

    def __init__(
        self,
        on: str,
        groupby: List[str],
        agg_func: Callable | str | list | dict | None = "mean",
        *,
        num_workers=1,
    ):
        """
        Parameters
        ----------
        on : str
            Name of the measurement column the analysis operates on.
        groupby : list of str
            Column names used to group rows before aggregation.
        agg_func : callable, str, list, dict, or None, default "mean"
            Aggregation specification (pandas ``agg``-style).
        num_workers : int, default 1
            Number of parallel workers; stored as ``n_jobs``.
        """
        self.groupby = groupby
        self.agg_func = agg_func
        self.on = on
        self.n_jobs = num_workers
        # Cache for the most recently analyzed measurements; starts empty.
        self._latest_measurements: pd.DataFrame = pd.DataFrame()

    @abc.abstractmethod
    def analyze(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run the analysis on ``data`` and return a result DataFrame."""
        pass

    @abc.abstractmethod
    def show(self):
        """Display the analysis results (implementation-defined)."""
        pass

    @abc.abstractmethod
    def results(self):
        """Return the analysis results (implementation-defined)."""
        pass

    @staticmethod
    @abc.abstractmethod
    def _apply2group_func(group: pd.DataFrame, **kwargs):
        """Per-group worker applied to each grouped sub-frame."""
        pass

    @staticmethod
    def _filter_by(
        df: pd.DataFrame,
        criteria: Mapping[str, Any],
        *,
        copy: bool = True,
        match_na: bool = False,
    ) -> pd.DataFrame:
        """Row-wise filter by column-value criteria (AND across columns).

        Matching rules per criterion (for each ``col -> val``):

        - If ``val`` is a scalar (not list-like): keep rows where
          ``df[col] == val``.
        - If ``val`` is list-like (list/tuple/set/ndarray): keep rows where
          ``df[col]`` is contained in that collection (``isin`` semantics).
        - If ``val`` is NA and ``match_na=True``: NA matches NA values in
          ``df[col]``. With ``match_na=False``, NA matches nothing.

        Strings are treated as scalars, not list-like. For list-like
        criteria, NA entries only matter when ``match_na=True``.

        Parameters
        ----------
        df : pandas.DataFrame
            Input DataFrame to filter.
        criteria : Mapping[str, Any]
            Mapping from column name to a scalar or an iterable of
            acceptable values.
        copy : bool, default True
            If True, return a copy of the filtered frame; if False, return
            a view when possible.
        match_na : bool, default False
            Whether NA values in ``criteria`` match NA values in ``df``.

        Returns
        -------
        pandas.DataFrame
            The filtered DataFrame (empty if no rows satisfy all criteria).

        Raises
        ------
        KeyError
            If a column specified in ``criteria`` is not present in ``df``.

        Examples
        --------
        >>> data = pd.DataFrame({'plate': ['P1', 'P2'], 'value': [1.0, 2.0]})
        >>> SetAnalyzer._filter_by(data, {'plate': 'P1'})['value'].tolist()
        [1.0]
        """

        def _is_list_like(x: Any) -> bool:
            # Strings/bytes are iterable but treated as scalar values.
            return isinstance(x, Iterable) and not isinstance(x, (str, bytes))

        mask = pd.Series(True, index=df.index)
        for col, val in criteria.items():
            if col not in df.columns:
                raise KeyError(f"Column not found: {col}")
            s = df[col]
            if _is_list_like(val):
                vals = list(val)
                part = s.isin(vals)
                # isin does not reliably match NA; add it back explicitly.
                if match_na and any(pd.isna(v) for v in vals):
                    part = part | s.isna()
            else:
                if pd.isna(val):
                    part = s.isna() if match_na else pd.Series(False, index=s.index)
                else:
                    part = s.eq(val)
            mask &= part
            # Short-circuit if the conjunction already excludes every row.
            if not mask.any():
                return df.iloc[0:0].copy() if copy else df.iloc[0:0]
        out = df[mask]
        return out.copy() if copy else out

    @staticmethod
    def _ensure_float_array(arr):
        """Coerce ``arr`` to a float array.

        Numeric arrays (bool, int, uint, float, complex) are cast directly;
        string/object arrays are converted element-wise, tolerating
        thousands separators, surrounding whitespace, and NA values.

        Raises
        ------
        TypeError
            If the array dtype is not numeric, string, or object.
        """
        k = arr.dtype.kind
        # Numeric (including bool) casts directly; previously bool was
        # rejected even though astype(float) handles it fine.
        if k in {"b", "i", "u", "f", "c"}:
            return arr.astype(float)
        # String or object arrays need element-wise parsing.
        if k in {"U", "S", "O"}:
            return SetAnalyzer.__smart_float_convert(arr)
        raise TypeError(f"Unsupported array dtype: {arr.dtype}")

    @staticmethod
    def __smart_float_convert(arr):
        """Element-wise conversion of a string/object array to float.

        NA scalars (None, NaN, pd.NA, NaT) become ``np.nan``; other values
        are stringified, stripped of commas and whitespace, and parsed.
        """
        out = []
        for x in arr:
            # Map any NA scalar to NaN (pd.isna may raise on odd objects;
            # treat those as non-NA and let float parsing decide).
            try:
                is_na = bool(pd.isna(x))
            except (TypeError, ValueError):
                is_na = False
            if is_na:
                out.append(np.nan)
                continue
            try:
                out.append(float(str(x).replace(",", "").strip()))
            except ValueError as exc:
                # Chain the original error so the parse failure is traceable.
                raise ValueError(
                    f"Value '{x}' cannot be converted to float"
                ) from exc
        return np.array(out, dtype=float)