Source code for utils.rank_fragility.counterfactual

"""Counterfactual panel evaluation and aggregate stability summaries."""

from __future__ import annotations

import numpy as np
import pandas as pd
from scipy.stats import kendalltau

from .leaderboard import evaluate_models, original_leaderboard, rank_models
from .metrics import higher_is_better


def _delta(sota_score: float, baseline_score: float, metric: str) -> float:
    """Return the SOTA-vs-baseline margin, oriented by the metric's direction.

    The result is ``sota_score - baseline_score`` when ``higher_is_better(metric)``
    is true and ``baseline_score - sota_score`` otherwise. If either score is
    missing (as judged by ``pd.isna``), NaN is returned.
    """
    if pd.isna(sota_score) or pd.isna(baseline_score):
        return float("nan")
    if higher_is_better(metric):
        margin = sota_score - baseline_score
    else:
        margin = baseline_score - sota_score
    return float(margin)


def _ci(values: pd.Series) -> tuple[float, float]:
    """Return the 2.5% and 97.5% quantiles of the numeric entries in ``values``.

    Entries that cannot be converted to numbers are coerced to NaN and dropped
    together with existing NaNs. If nothing remains, ``(nan, nan)`` is returned.
    """
    numeric = pd.to_numeric(values, errors="coerce").dropna()
    if numeric.empty:
        return float("nan"), float("nan")
    lower = float(numeric.quantile(0.025))
    upper = float(numeric.quantile(0.975))
    return lower, upper


def _empty_counterfactual_results() -> dict[str, pd.DataFrame]:
    """Build the result mapping with a separate empty frame under every key."""
    result_keys = (
        "counterfactual_scores",
        "counterfactual_ranks",
        "sota_margin_by_panel",
        "kendall_tau_by_panel",
        "rank_probabilities",
        "sota_margin_by_composition",
        "kendall_tau_by_composition",
    )
    # One DataFrame per key so that a caller mutating one result in place
    # cannot silently change the others.
    return {key: pd.DataFrame() for key in result_keys}


def _summarise_rank_probabilities(counterfactual_ranks: pd.DataFrame) -> pd.DataFrame:
    """Summarise the rank distribution per (mode, target_rate, model) group."""
    rows = []
    grouped = counterfactual_ranks.groupby(["mode", "target_rate", "model"], dropna=False)
    for (mode, target_rate, model), group in grouped:
        ranks = pd.to_numeric(group["rank"], errors="coerce")
        rows.append(
            {
                "mode": mode,
                "target_rate": target_rate,
                "model": model,
                # NaN ranks compare unequal to 1.0, so they stay in the
                # denominator and count as "not ranked first".
                "probability_rank_1": float((ranks == 1.0).mean()) if len(ranks) else np.nan,
                "median_rank": float(ranks.median()) if len(ranks) else np.nan,
                "rank_q025": float(ranks.quantile(0.025)) if len(ranks) else np.nan,
                "rank_q975": float(ranks.quantile(0.975)) if len(ranks) else np.nan,
            }
        )
    return pd.DataFrame(rows)


def _summarise_margins(sota_margin_by_panel: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the per-panel SOTA margins per (mode, target_rate) group."""
    rows = []
    grouped = sota_margin_by_panel.groupby(["mode", "target_rate"], dropna=False)
    for (mode, target_rate), group in grouped:
        lower, upper = _ci(group["delta"])
        delta = pd.to_numeric(group["delta"], errors="coerce")
        rows.append(
            {
                "mode": mode,
                "target_rate": target_rate,
                "mean_delta": float(delta.mean()) if len(delta) else np.nan,
                "median_delta": float(delta.median()) if len(delta) else np.nan,
                "ci_lower": lower,
                "ci_upper": upper,
                # NaN deltas compare False against 0, so panels with a missing
                # margin count as "not positive" in this fraction.
                "fraction_delta_positive": float((delta > 0).mean()) if len(delta) else np.nan,
            }
        )
    return pd.DataFrame(rows)


def _summarise_kendall_tau(kendall_tau_by_panel: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the per-panel Kendall tau values per (mode, target_rate) group."""
    rows = []
    grouped = kendall_tau_by_panel.groupby(["mode", "target_rate"], dropna=False)
    for (mode, target_rate), group in grouped:
        lower, upper = _ci(group["kendall_tau_to_original"])
        values = pd.to_numeric(group["kendall_tau_to_original"], errors="coerce")
        rows.append(
            {
                "mode": mode,
                "target_rate": target_rate,
                "mean_kendall_tau": float(values.mean()) if len(values) else np.nan,
                "median_kendall_tau": float(values.median()) if len(values) else np.nan,
                "ci_lower": lower,
                "ci_upper": upper,
            }
        )
    return pd.DataFrame(rows)


def run_counterfactual_evaluation(
    pred_audit_df: pd.DataFrame,
    panel_manifest: pd.DataFrame,
    task: str,
    metric: str,
    baseline_model: str,
    sota_model: str,
) -> dict[str, pd.DataFrame]:
    """Evaluate all models on each counterfactual panel and aggregate stability summaries.

    Args:
        pred_audit_df: Prediction table passed unchanged to
            ``original_leaderboard`` and ``evaluate_models``.
        panel_manifest: Panel membership table. Rows are grouped by
            ``panel_id``; each group's ``molecule_id`` values (cast to ``str``)
            define the panel, and ``mode`` / ``target_rate`` are read from the
            group's first row.
        task: Task identifier forwarded to the leaderboard helpers.
        metric: Metric name forwarded to the leaderboard helpers and used to
            orient the SOTA-vs-baseline margin.
        baseline_model: Model whose score is the reference for the margin.
        sota_model: Model whose score is compared against the baseline.

    Returns:
        A dict of DataFrames:

        * ``counterfactual_scores`` / ``counterfactual_ranks``: per-panel
          outputs of ``evaluate_models`` / ``rank_models`` with ``panel_id``,
          ``mode`` and ``target_rate`` prepended.
        * ``sota_margin_by_panel``: one row per panel with the ``delta``
          between ``sota_model`` and ``baseline_model`` (NaN if either model
          has no score for the panel).
        * ``kendall_tau_by_panel``: one row per panel with the Kendall tau
          between the original ranks and the panel ranks over the models
          present in both (NaN when fewer than two models are shared).
        * ``rank_probabilities``: per (mode, target_rate, model) rank summary.
        * ``sota_margin_by_composition`` / ``kendall_tau_by_composition``:
          per (mode, target_rate) mean, median and 2.5%-97.5% quantile range.

        Every value is an empty DataFrame when ``panel_manifest`` is empty or
        yields no panels.
    """
    if panel_manifest.empty:
        return _empty_counterfactual_results()

    original = original_leaderboard(pred_audit_df, task=task, metric=metric)
    original_rank = original.set_index("model")["rank"]

    score_frames: list[pd.DataFrame] = []
    rank_frames: list[pd.DataFrame] = []
    margin_rows: list[dict] = []
    tau_rows: list[dict] = []

    for panel_id, panel in panel_manifest.groupby("panel_id", sort=False):
        ids = panel["molecule_id"].astype(str).tolist()
        mode = panel["mode"].iloc[0]
        target_rate = panel["target_rate"].iloc[0]

        scores = evaluate_models(pred_audit_df, ids, task=task, metric=metric)
        # Inserting at position 0 in reverse order leaves the leading columns
        # as panel_id, mode, target_rate.
        scores.insert(0, "target_rate", target_rate)
        scores.insert(0, "mode", mode)
        scores.insert(0, "panel_id", panel_id)
        score_frames.append(scores)

        ranks = rank_models(scores[["model", "score"]], metric=metric)
        ranks.insert(0, "target_rate", target_rate)
        ranks.insert(0, "mode", mode)
        ranks.insert(0, "panel_id", panel_id)
        rank_frames.append(ranks)

        score_map = scores.set_index("model")["score"]
        margin_rows.append(
            {
                "panel_id": panel_id,
                "mode": mode,
                "target_rate": target_rate,
                "sota_model": sota_model,
                "baseline_model": baseline_model,
                "delta": _delta(
                    score_map.get(sota_model, np.nan),
                    score_map.get(baseline_model, np.nan),
                    metric,
                ),
                "positive_means_sota_better": True,
            }
        )

        panel_rank = ranks.set_index("model")["rank"]
        # Compare rankings only over models present in both leaderboards.
        common = sorted(set(original_rank.index) & set(panel_rank.index))
        if len(common) < 2:
            tau = np.nan
        else:
            tau = kendalltau(
                original_rank.loc[common].to_numpy(),
                panel_rank.loc[common].to_numpy(),
            ).statistic
        tau_rows.append(
            {
                "panel_id": panel_id,
                "mode": mode,
                "target_rate": target_rate,
                "kendall_tau_to_original": float(tau),
            }
        )

    if not score_frames:
        # groupby drops rows whose panel_id is NaN, so a non-empty manifest can
        # still produce no panels; the aggregations below would then fail on
        # frames that have no "mode" / "target_rate" columns.
        return _empty_counterfactual_results()

    counterfactual_scores = pd.concat(score_frames, ignore_index=True)
    counterfactual_ranks = pd.concat(rank_frames, ignore_index=True)
    sota_margin_by_panel = pd.DataFrame(margin_rows)
    kendall_tau_by_panel = pd.DataFrame(tau_rows)

    return {
        "counterfactual_scores": counterfactual_scores,
        "counterfactual_ranks": counterfactual_ranks,
        "sota_margin_by_panel": sota_margin_by_panel,
        "kendall_tau_by_panel": kendall_tau_by_panel,
        "rank_probabilities": _summarise_rank_probabilities(counterfactual_ranks),
        "sota_margin_by_composition": _summarise_margins(sota_margin_by_panel),
        "kendall_tau_by_composition": _summarise_kendall_tau(kendall_tau_by_panel),
    }