Source code for utils.rank_fragility.metrics

"""Metric helpers for rank-fragility leaderboard evaluation."""

from __future__ import annotations

import warnings

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    brier_score_loss,
    log_loss as sklearn_log_loss,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    roc_auc_score,
)


# Direction table consulted by ``higher_is_better``. Keys are lower-case metric
# names; the value is True when a larger score is better and False when a
# smaller score is better.
_HIGHER_IS_BETTER = {
    "auroc": True,
    "auprc": True,
    "accuracy": True,
    "balanced_accuracy": True,
    "log_loss": False,
    "brier": False,
    "rmse": False,
    "mae": False,
    "r2": True,
}



[docs]
def higher_is_better(metric: str) -> bool:
    """Tell whether a bigger score means better performance for ``metric``.

    The name is matched case-insensitively against the module's direction
    table.

    Raises:
        ValueError: If ``metric`` is not one of the known metric names.
    """
    try:
        return _HIGHER_IS_BETTER[metric.lower()]
    except KeyError:
        # Surface the caller's original spelling, not the lower-cased key.
        raise ValueError(f"unsupported metric: {metric}") from None



def _as_float_arrays(y_true, y_pred) -> tuple[np.ndarray, np.ndarray]:
    """Coerce both inputs to float arrays and drop NaN-contaminated positions.

    A position is kept only when neither the target nor the prediction is
    NaN, so the two returned arrays stay aligned with each other.
    """
    targets = np.asarray(y_true, dtype=float)
    preds = np.asarray(y_pred, dtype=float)
    keep = ~np.isnan(targets) & ~np.isnan(preds)
    return targets[keep], preds[keep]


def _clipped_prob(y_pred) -> np.ndarray:
    """Convert predictions to floats clipped into ``[1e-15, 1 - 1e-15]``.

    Keeping values strictly away from 0 and 1 means a later ``log`` of the
    probability (or of its complement) stays finite.
    """
    eps = 1e-15
    probs = np.asarray(y_pred, dtype=float)
    return np.clip(probs, eps, 1.0 - eps)



[docs]
def compute_metric(y_true, y_pred, task: str, metric: str) -> float:
    """Compute one supported classification or regression metric.

    Positions where either ``y_true`` or ``y_pred`` is NaN are dropped before
    scoring (via ``_as_float_arrays``).

    Args:
        y_true: Ground-truth values, coerced to a float array. The
            classification metrics use ``labels=[0, 1]`` and a 0.5 threshold,
            so binary 0/1 targets are presumably expected (not validated here).
        y_pred: Predictions, coerced to a float array. For classification they
            are passed as scores to AUROC/AUPRC, thresholded at ``>= 0.5`` for
            accuracy and balanced accuracy, and clipped to
            ``[1e-15, 1 - 1e-15]`` for log loss and Brier score.
        task: ``"classification"`` or ``"regression"`` (case-insensitive).
        metric: Metric name (case-insensitive). Classification: ``auroc``,
            ``auprc``, ``accuracy``, ``balanced_accuracy``, ``log_loss``,
            ``brier``. Regression: ``rmse``, ``mae``, ``r2``.

    Returns:
        The metric value as a Python float. NaN is returned, together with a
        ``RuntimeWarning``, when the metric is undefined: no non-NaN pairs
        remain, AUROC/AUPRC on a single-class subset, or R2 on fewer than two
        samples.

    Raises:
        ValueError: If ``task`` is unknown or ``metric`` is not supported for
            that task. This is checked before the data is inspected, so a
            misconfigured call fails even when the subset is empty.
    """
    task = task.lower()
    metric = metric.lower()

    # Validate names up front. Doing this after the empty-subset early return
    # would let a typo'd metric or a task/metric mismatch pass silently as NaN
    # whenever the subset happens to be empty or all-NaN.
    supported = {
        "classification": (
            "auroc",
            "auprc",
            "accuracy",
            "balanced_accuracy",
            "log_loss",
            "brier",
        ),
        "regression": ("rmse", "mae", "r2"),
    }
    if task not in supported:
        raise ValueError(f"task must be classification or regression, got {task!r}")
    if metric not in supported[task]:
        raise ValueError(f"unsupported {task} metric: {metric}")

    yt, yp = _as_float_arrays(y_true, y_pred)
    if len(yt) == 0:
        warnings.warn(f"{metric} is undefined on an empty subset", RuntimeWarning, stacklevel=2)
        return float("nan")

    if task == "classification":
        if metric == "auroc":
            if len(np.unique(yt)) < 2:
                warnings.warn("AUROC is undefined when a subset has one class", RuntimeWarning, stacklevel=2)
                return float("nan")
            return float(roc_auc_score(yt, yp))
        if metric == "auprc":
            if len(np.unique(yt)) < 2:
                warnings.warn("AUPRC is undefined when a subset has one class", RuntimeWarning, stacklevel=2)
                return float("nan")
            return float(average_precision_score(yt, yp))
        if metric == "accuracy":
            return float(accuracy_score(yt, yp >= 0.5))
        if metric == "balanced_accuracy":
            return float(balanced_accuracy_score(yt, yp >= 0.5))
        if metric == "log_loss":
            # labels=[0, 1] keeps log loss defined on single-class subsets.
            return float(sklearn_log_loss(yt, _clipped_prob(yp), labels=[0, 1]))
        if metric == "brier":
            return float(brier_score_loss(yt, _clipped_prob(yp)))

    if task == "regression":
        if metric == "rmse":
            return float(np.sqrt(mean_squared_error(yt, yp)))
        if metric == "mae":
            return float(mean_absolute_error(yt, yp))
        if metric == "r2":
            if len(yt) < 2:
                warnings.warn("R2 is undefined for fewer than two samples", RuntimeWarning, stacklevel=2)
                return float("nan")
            return float(r2_score(yt, yp))

    # Only reachable if the ``supported`` table and the branches above drift
    # apart; fail loudly rather than return None.
    raise ValueError(f"unsupported {task} metric: {metric}")




[docs]
def per_sample_loss(y_true, y_pred, task: str, loss: str) -> np.ndarray:
    """Return per-example loss values for attribution summaries.

    Nothing is filtered here: a NaN target or prediction yields a NaN loss at
    that position, so the output stays aligned with the inputs.

    Args:
        y_true: Targets, coerced to a float array. The classification
            formulas use it as the positive-class indicator (presumably 0/1;
            not validated here).
        y_pred: Predictions, coerced to a float array. For classification
            they are clipped to ``[1e-15, 1 - 1e-15]`` before use.
        task: ``"classification"`` or ``"regression"`` (case-insensitive).
        loss: Loss name (case-insensitive). Classification: ``log_loss``,
            ``brier``. Regression: ``absolute_error``, ``squared_error``.

    Returns:
        Array of element-wise losses.

    Raises:
        ValueError: If ``task`` is unknown, ``loss`` is not supported for the
            task, or the two inputs cannot be combined into an array shaped
            like one of them (for example ``(n,)`` targets against ``(n, 1)``
            predictions, which would broadcast to an ``(n, n)`` matrix).
    """
    task = task.lower()
    loss = loss.lower()
    yt = np.asarray(y_true, dtype=float)
    yp = np.asarray(y_pred, dtype=float)

    # Guard against the (n,) vs (n, 1) pitfall: NumPy would quietly produce an
    # (n, n) matrix of cross-pair losses. Scalars and other broadcasts whose
    # result keeps the shape of one input are still accepted. np.broadcast
    # itself raises ValueError for shapes that cannot be combined at all.
    out_shape = np.broadcast(yt, yp).shape
    if out_shape not in (yt.shape, yp.shape):
        raise ValueError(
            f"y_true shape {yt.shape} and y_pred shape {yp.shape} would broadcast "
            f"to {out_shape}; expected one loss per example"
        )

    if task == "classification":
        prob = _clipped_prob(yp)
        if loss == "log_loss":
            # Binary cross-entropy per example.
            return -(yt * np.log(prob) + (1.0 - yt) * np.log(1.0 - prob))
        if loss == "brier":
            return (prob - yt) ** 2
        raise ValueError(f"unsupported classification per-sample loss: {loss}")

    if task == "regression":
        if loss == "absolute_error":
            return np.abs(yt - yp)
        if loss == "squared_error":
            return (yt - yp) ** 2
        raise ValueError(f"unsupported regression per-sample loss: {loss}")

    raise ValueError(f"task must be classification or regression, got {task!r}")