Source code for phenotypic.refine._residual_outlier_remover
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from phenotypic import GridImage
import numpy as np
from typing import Optional
from phenotypic.abc_ import GridObjectRefiner
from phenotypic.measure import MeasureGridLinRegStats
from phenotypic.measure._measure_grid_linreg_stats import GRID_LINREG_STATS
from phenotypic.tools.constants_ import GRID
[docs]
class ResidualOutlierRemover(GridObjectRefiner):
"""Remove objects with large regression residuals in noisy grid rows/columns.
Intuition:
In grid assays, colony centroids should align along near-linear trends
within each row/column. Rows or columns with high variability suggest
mis-detections or artifacts. Within such noisy lines, this operation
removes objects whose positional residuals exceed a robust cutoff.
Why this is useful for agar plates:
Condensation, glare, and debris can produce off-grid detections that
inflate row/column variance and break gridding assumptions. Pruning
residual outliers restores alignment and improves subsequent measures.
Use cases:
- Cleaning rows with multiple off-line blobs before measuring growth.
- Stabilizing grid registration when a subset of positions is noisy.
Caveats:
- If true colonies deviate due to warping or growth spreading, strict
cutoffs may remove real data.
- Depends on reasonable initial grid fit; with severe misregistration
it may prune valid colonies.
Attributes:
axis (Optional[int]): Axis to analyze for outliers. ``None`` analyzes
both rows and columns; ``0`` analyzes rows; ``1`` analyzes columns.
Restricting the axis can speed up processing or focus on suspected
directions of error.
cutoff_multiplier (float): Multiplier applied to a robust dispersion
estimate (IQR-based in implementation) to set the outlier cutoff.
Higher values are more permissive (fewer removals) and preserve
edge cases; lower values prune more aggressively.
max_coeff_variance (int): Maximum coefficient of variance (std/mean)
allowed for a row/column before it is considered for outlier
pruning. Smaller values trigger cleaning sooner; larger values only
clean severely noisy lines.
Examples:
.. dropdown:: Remove objects with large regression residuals
>>> from phenotypic.refine import ResidualOutlierRemover
>>> op = ResidualOutlierRemover(axis=None, stddev_multiplier=1.5, max_coeff_variance=1)
>>> image = op.apply(image, inplace=True) # doctest: +SKIP
"""
[docs]
def __init__(
self,
axis: Optional[int] = None,
stddev_multiplier=1.5,
max_coeff_variance: int = 1,
):
"""Initialize the remover.
Args:
axis (Optional[int]): Axis selection for analysis. ``None`` runs
both directions; ``0`` rows; ``1`` columns. Limiting the axis
reduces runtime and targets known problem directions.
stddev_multiplier (float): Robust residual cutoff multiplier. Lower
values remove more outliers (stronger cleanup) but risk dropping
valid off-center colonies; higher values are conservative.
max_coeff_variance (int): Threshold for row/column variability
(std/mean) to trigger outlier analysis. Lower values clean more
lines; higher values only address extremely noisy lines.
Raises:
ValueError: If parameters are not consistent with the operation
(e.g., invalid types). Errors may arise during execution when
measuring grid statistics.
"""
self.axis = axis # Either none for both axis, 0 for row, or 1 for column
self.cutoff_multiplier = stddev_multiplier
self.max_coeff_variance = max_coeff_variance
def _operate(self, image: GridImage) -> GridImage:
"""Identify and remove residual outliers per noisy row/column.
Args:
image (GridImage): Grid image with object map and grid metadata.
Returns:
GridImage: Modified grid image with outlier objects removed.
Raises:
ValueError: If parameters are misconfigured in a way that prevents
computation (propagated from measurement utilities).
"""
# Generate cached version of grid_info
linreg_stat_extractor = MeasureGridLinRegStats()
grid_info = linreg_stat_extractor.measure(image)
# Create container to hold the id of objects to be removed
outlier_obj_ids = []
# Row-wise residual outlier discovery
if self.axis is None or self.axis == 0:
# Calculate the coefficient of variance (std/mean)
# Collect the standard deviation
row_variance = grid_info.groupby(str(GRID.ROW_NUM))[
str(GRID_LINREG_STATS.RESIDUAL_ERR)
].std()
# Divide standard deviation by mean
row_variance = (
row_variance
/ grid_info.groupby(str(GRID.ROW_NUM))[
str(GRID_LINREG_STATS.RESIDUAL_ERR)
].mean()
)
over_limit_row_variance = row_variance.loc[
row_variance > self.max_coeff_variance
]
# Collect outlier objects in the nrows with a variance over the maximum
for row_idx in over_limit_row_variance.index:
row_err = grid_info.loc[
grid_info.loc[:, str(GRID.ROW_NUM)] == row_idx,
str(GRID_LINREG_STATS.RESIDUAL_ERR),
]
row_err_mean = row_err.mean()
row_q3, row_q1 = row_err.quantile([0.75, 0.25])
row_iqr = row_q3 - row_q1
# row_stddev = row_err.std()
# upper_row_cutoff = row_err_mean + row_stddev * self.cutoff_multiplier
upper_row_cutoff = row_err_mean + row_iqr * self.cutoff_multiplier
outlier_obj_ids += row_err.loc[
row_err >= upper_row_cutoff
].index.tolist()
# Column-wise residual outlier discovery
if self.axis is None or self.axis == 1:
# Calculate the coefficient of variance (std/mean)
# Collect the standard deviation
col_variance = grid_info.groupby(str(GRID.COL_NUM))[
str(GRID_LINREG_STATS.RESIDUAL_ERR)
].std()
# Divide standard deviation by mean
col_variance = (
col_variance
/ grid_info.groupby(str(GRID.COL_NUM))[
str(GRID_LINREG_STATS.RESIDUAL_ERR)
].mean()
)
over_limit_col_variance = col_variance.loc[
col_variance > self.max_coeff_variance
]
# Collect outlier objects in the columns with a variance over the maximum
for col_idx in over_limit_col_variance.index:
col_err = grid_info.loc[
grid_info.loc[:, str(GRID.COL_NUM)] == col_idx,
str(GRID_LINREG_STATS.RESIDUAL_ERR),
]
col_err_mean = col_err.mean()
col_q3, col_q1 = col_err.quantile([0.75, 0.25])
col_iqr = col_q3 - col_q1
# col_stddev = col_err.std()
upper_col_cutoff = col_err_mean + col_iqr * self.cutoff_multiplier
outlier_obj_ids += col_err.loc[
col_err >= upper_col_cutoff
].index.tolist()
# Remove objects from obj map
image.objmap[np.isin(image.objmap[:], outlier_obj_ids)] = 0
return image