# Source code for calibrated_explanations.core.validation
"""Validation helpers shared across the core package.

These utilities centralize defensive argument checks while preserving
existing behavior, ensuring future refactors can rely on a consistent
error vocabulary.

ADR-002 compliance: Validation functions use the exception taxonomy
(ValidationError, DataShapeError, NotFittedError, ConfigurationError, etc.)
and accept optional details payloads.
"""
from __future__ import annotations
import sys
from typing import Any, Literal, Type, cast
import numpy as np
import numpy.typing as npt
from ..utils.exceptions import (
CalibratedError,
DataShapeError,
ModelNotSupportedError,
NotFittedError,
ValidationError,
)
[docs]
def validate_not_none(value: Any, name: str) -> None:
    """Reject ``None`` for the argument called ``name``.

    Parameters
    ----------
    value : Any
        Value to inspect. Only identity with ``None`` is tested, so falsy
        values such as ``0``, ``""`` or ``[]`` are accepted.
    name : str
        Argument name interpolated into the error message.

    Raises
    ------
    ValidationError
        If ``value`` is ``None``.
    """
    if value is not None:
        return
    raise ValidationError(f"Argument '{name}' must not be None.")
[docs]
def validate_type(value: Any, expected_type: type | tuple[type, ...], name: str) -> None:
    """Ensure that ``value`` is an instance of ``expected_type``.

    Parameters
    ----------
    value : Any
        Object whose type is checked with ``isinstance``.
    expected_type : type or tuple of types
        Accepted type, or a tuple of accepted types (anything ``isinstance``
        takes as its second argument).
    name : str
        Argument name interpolated into the error message.

    Raises
    ------
    DataShapeError
        If ``value`` is not an instance of ``expected_type``.
    """
    if isinstance(value, expected_type):
        return
    # ``isinstance`` accepts tuples, which have no ``__name__``; build the label
    # per candidate so a mismatch reports DataShapeError, not AttributeError.
    candidates = expected_type if isinstance(expected_type, tuple) else (expected_type,)
    expected_label = " or ".join(
        getattr(candidate, "__name__", repr(candidate)) for candidate in candidates
    )
    raise DataShapeError(
        f"Argument '{name}' must be of type {expected_label}, got {type(value).__name__}."
    )
[docs]
def validate_non_empty(value: Any, name: str) -> None:
    """Ensure that length-aware inputs are not empty.

    Objects without ``__len__`` pass unchanged. Objects that expose
    ``__len__`` but cannot report a length (for example 0-d NumPy arrays,
    for which ``len()`` raises ``TypeError``) are treated as not
    length-aware and also pass.

    Parameters
    ----------
    value : Any
        Object to inspect.
    name : str
        Argument name interpolated into the error message.

    Raises
    ------
    ValidationError
        If ``len(value)`` is ``0``.
    """
    if not hasattr(value, "__len__"):
        return
    try:
        length = len(value)
    except TypeError:
        # Unsized object (e.g. 0-d ndarray): there is no length to validate.
        return
    if length == 0:
        raise ValidationError(f"Argument '{name}' must not be empty.")
[docs]
def validate_inputs(
    x: Any,
    y: Any | None = None,
    task: Literal["auto", "classification", "regression"] = "auto",
    allow_nan: bool = False,
    require_y: bool = False,
    n_features: int | None = None,
    class_labels: Any | None = None,
    check_finite: bool = True,
) -> None:
    """Validate input features and target for downstream operations.

    This function provides the primary validation entry point per ADR-002,
    accepting feature matrix ``x`` and optional target ``y`` and checking
    shape, sample count and (for numeric data) finiteness.

    Parameters
    ----------
    x : array-like
        Feature matrix of shape (n_samples, n_features). Objects exposing
        both ``values`` and ``shape`` (e.g. a pandas DataFrame) are converted
        through ``values``; everything else goes through ``np.asarray``.
    y : array-like, optional
        Target vector. It is flattened to 1D and its length must match the
        number of rows in ``x``. Default is None.
    task : {"auto", "classification", "regression"}, default="auto"
        Task type. When "auto", ``infer_task`` is called, but its result is
        currently discarded; the value does not affect validation.
    allow_nan : bool, default=False
        If True, the finiteness check is skipped entirely, so both NaN and
        infinite values are accepted. If False (and ``check_finite`` is
        True), NaN or infinite values raise ValidationError.
    require_y : bool, default=False
        If True, raises ValidationError when ``y`` is None.
    n_features : int, optional
        Expected number of features in ``x``. If provided and a mismatch
        occurs, raises DataShapeError.
    class_labels : array-like, optional
        Accepted for API compatibility; currently not validated or stored.
    check_finite : bool, default=True
        If False, no NaN/inf check is performed regardless of ``allow_nan``.

    Raises
    ------
    ValidationError
        When ``x`` is None, when ``y`` is required but None, or when numeric
        ``x``/``y`` contain NaN or infinite values while the finiteness check
        is active.
    DataShapeError
        When ``x`` is not 2D, the feature count mismatches, or the length of
        ``y`` differs from the number of samples in ``x``.

    Examples
    --------
    >>> from calibrated_explanations.core.validation import validate_inputs
    >>> import numpy as np
    >>> x = np.array([[1.0, 2.0], [3.0, 4.0]])
    >>> y = np.array([0, 1])
    >>> # Passes silently if valid
    >>> validate_inputs(x, y, task="classification", require_y=True, n_features=2)
    """
    validate_not_none(x, "x")
    x_arr = _as_2d_array(x)
    if x_arr.ndim != 2:
        raise DataShapeError(
            "Argument 'x' must be 2D (n_samples, n_features).",
            details={"param": "x", "ndim": x_arr.ndim, "expected": 2},
        )
    n_samples = x_arr.shape[0]
    if n_features is not None and x_arr.shape[1] != n_features:
        raise DataShapeError(
            f"Argument 'x' must have {n_features} features, got {x_arr.shape[1]}.",
            details={
                "param": "x",
                "expected_features": n_features,
                "actual_features": x_arr.shape[1],
            },
        )
    if require_y and y is None:
        raise ValidationError(
            "Argument 'y' must be provided when require_y=True.",
            details={"param": "y", "requirement": "required", "task": task},
        )

    # The finiteness check only runs when both flags ask for it.
    enforce_finite = check_finite and not allow_nan

    if y is not None:
        y_arr = _as_1d_array(y)
        if y_arr.shape[0] != n_samples:
            raise DataShapeError(
                f"Length of 'y' ({y_arr.shape[0]}) does not match number of samples in x ({n_samples}).",
                details={
                    "param": "y",
                    "y_length": y_arr.shape[0],
                    "x_samples": n_samples,
                },
            )
        if (
            enforce_finite
            and np.issubdtype(y_arr.dtype, np.number)
            and not np.isfinite(y_arr).all()
        ):
            raise ValidationError(
                "Argument 'y' contains NaN or infinite values.",
                details={"param": "y", "check": "finitude", "allow_nan": allow_nan},
            )

    # np.isfinite raises TypeError on non-numeric dtypes (object/str), so apply
    # the same numeric-dtype guard that is used for ``y``.
    if (
        enforce_finite
        and np.issubdtype(x_arr.dtype, np.number)
        and not np.isfinite(x_arr).all()
    ):
        raise ValidationError(
            "Argument 'x' contains NaN or infinite values.",
            details={"param": "x", "check": "finitude", "allow_nan": allow_nan},
        )

    # ``class_labels`` is intentionally not inspected: the only check that was
    # ever applied to it (a None check guarded by "is not None") could not fail.

    # Reserve task inference for future behavior without changing runtime output.
    _ = infer_task(x, y, None) if task == "auto" else task
[docs]
def infer_task(
    x: Any = None, y: Any = None, model: Any = None
) -> Literal["classification", "regression"]:
    """Infer the task type from a model or, failing that, from the target dtype.

    Resolution order:

    1. If ``model`` is given, the presence of a ``predict_proba`` attribute
       means classification; otherwise regression.
    2. Else, if ``y`` is given, a floating dtype means regression and any
       other dtype means classification.
    3. Else, regression is returned as the fallback.

    ``x`` is accepted for signature symmetry but is not consulted.
    """
    if model is not None:
        return "classification" if hasattr(model, "predict_proba") else "regression"
    if y is None:
        return "regression"
    target_dtype = _as_1d_array(y).dtype
    is_continuous = np.issubdtype(target_dtype, np.floating)
    return "regression" if is_continuous else "classification"
def _as_2d_array(x: Any) -> npt.NDArray[np.generic]:
"""Return ``x`` coerced to a 2D ``ndarray``."""
if hasattr(x, "values") and hasattr(x, "shape"):
try:
return cast(npt.NDArray[np.generic], np.asarray(x.values))
except: # noqa: E722
if not isinstance(sys.exc_info()[1], Exception):
raise
# pragma: no cover - fallback
return cast(npt.NDArray[np.generic], np.asarray(x))
return cast(npt.NDArray[np.generic], np.asarray(x))
def _as_1d_array(y: Any) -> npt.NDArray[np.generic]:
"""Return ``y`` coerced to a flattened 1D ``ndarray``."""
if hasattr(y, "values") and not isinstance(y, np.ndarray):
y = y.values
arr = cast(npt.NDArray[np.generic], np.asarray(y))
return cast(npt.NDArray[np.generic], arr.reshape(-1))
[docs]
def validate_inputs_matrix(
    x: Any,
    y: Any | None = None,
    *,
    task: Literal["auto", "classification", "regression"] = "auto",
    allow_nan: bool = False,
    require_y: bool = False,
    n_features: int | None = None,
    check_finite: bool = True,
) -> None:
    """Validate a feature/target matrix pair for downstream operations.

    - Ensure ``x`` is 2D and matches the expected feature count when provided.
    - Confirm that ``y`` has the same number of samples when supplied.
    - Guard against NaN or infinite values in numeric data unless explicitly
      allowed.

    Parameters
    ----------
    x : array-like
        Feature matrix of shape (n_samples, n_features).
    y : array-like, optional
        Target values; flattened to 1D before the length comparison.
    task : {"auto", "classification", "regression"}, default="auto"
        When "auto", ``infer_task`` is called, but its result is currently
        discarded; the value does not affect validation.
    allow_nan : bool, default=False
        If True, the finiteness check is skipped entirely (NaN and infinite
        values are both accepted).
    require_y : bool, default=False
        If True, ``y`` must not be None.
    n_features : int, optional
        Expected number of columns in ``x``.
    check_finite : bool, default=True
        If False, no NaN/inf check is performed regardless of ``allow_nan``.

    Raises
    ------
    ValidationError
        When ``x`` is None, ``y`` is required but missing, or numeric
        ``x``/``y`` contain NaN or infinite values while the check is active.
    DataShapeError
        When ``x`` is not 2D, the feature count mismatches, or ``y`` has a
        different number of samples than ``x``.
    """
    validate_not_none(x, "x")
    x_arr = _as_2d_array(x)
    if x_arr.ndim != 2:
        raise DataShapeError("Argument 'x' must be 2D (n_samples, n_features).")
    n_samples = x_arr.shape[0]
    if n_features is not None and x_arr.shape[1] != n_features:
        raise DataShapeError(f"Argument 'x' must have {n_features} features, got {x_arr.shape[1]}.")
    if require_y and y is None:
        raise ValidationError("Argument 'y' must be provided when require_y=True.")

    # The finiteness check only runs when both flags ask for it.
    enforce_finite = check_finite and not allow_nan

    if y is not None:
        y_arr = _as_1d_array(y)
        if y_arr.shape[0] != n_samples:
            raise DataShapeError(
                f"Length of 'y' ({y_arr.shape[0]}) does not match number of samples in x ({n_samples})."
            )
        if (
            enforce_finite
            and np.issubdtype(y_arr.dtype, np.number)
            and not np.isfinite(y_arr).all()
        ):
            raise ValidationError("Argument 'y' contains NaN or infinite values.")

    # np.isfinite raises TypeError on non-numeric dtypes (object/str), so apply
    # the same numeric-dtype guard that is used for ``y``.
    if (
        enforce_finite
        and np.issubdtype(x_arr.dtype, np.number)
        and not np.isfinite(x_arr).all()
    ):
        raise ValidationError("Argument 'x' contains NaN or infinite values.")

    # Reserve task inference for future behavior without changing runtime output.
    _ = infer_task(x, y, None) if task == "auto" else task
[docs]
def validate_model(model: Any) -> None:
    """Check that ``model`` satisfies the minimal prediction protocol.

    Raises
    ------
    ValidationError
        If ``model`` is ``None`` (raised by ``validate_not_none``).
    ModelNotSupportedError
        If ``model`` has no ``predict`` attribute.
    """
    validate_not_none(model, "model")
    if hasattr(model, "predict"):
        return
    raise ModelNotSupportedError("Model must implement a 'predict' method.")
[docs]
def validate_fit_state(obj: Any, *, require: bool = True) -> None:
    """Guard stateful operations behind the object's ``fitted`` flag.

    The check is skipped when ``require`` is False or when ``obj`` has no
    ``fitted`` attribute at all.

    Raises
    ------
    NotFittedError
        If ``require`` is True and ``obj.fitted`` exists but is falsy.
    """
    if not require or not hasattr(obj, "fitted"):
        return
    if not obj.fitted:
        raise NotFittedError("Operation requires a fitted estimator/explainer.")
[docs]
def validate(
    condition: bool,
    exc_cls: Type[CalibratedError],
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> None:
    """Raise ``exc_cls`` unless ``condition`` holds.

    A compact guard-clause helper: a truthy ``condition`` is a no-op, a falsy
    one raises ``exc_cls(message, details=details)``.

    Parameters
    ----------
    condition : bool
        Outcome of the check being enforced.
    exc_cls : Type[CalibratedError]
        Exception class instantiated on failure; it must accept a ``details``
        keyword argument.
    message : str
        Error message passed as the first positional argument.
    details : dict, optional
        Structured error details forwarded to the exception.

    Raises
    ------
    exc_cls
        If ``condition`` is falsy.

    Examples
    --------
    >>> from calibrated_explanations.core.validation import validate
    >>> from calibrated_explanations.utils.exceptions import ValidationError
    >>> validate(len(x) > 0, ValidationError, "x must not be empty", details={"param": "x"})
    """
    if condition:
        return
    raise exc_cls(message, details=details)
# Names exported by ``from ... import *``. The underscore-prefixed helpers
# ``_as_2d_array`` and ``_as_1d_array`` are not listed here.
__all__ = [
"validate_inputs",
"validate_not_none",
"validate_type",
"validate_non_empty",
"validate_inputs_matrix",
"validate_model",
"validate_fit_state",
"infer_task",
"validate",
]