Source code for merton.backtest.metrics
r"""Backtest metrics for PD models.
All metrics here use the Wilcoxon-Mann-Whitney / rank-based identities so
there's no sklearn dependency. They match ``sklearn.metrics`` outputs to
machine precision on the binary-classification case.
"""
from __future__ import annotations
import numpy as np
from scipy.stats import rankdata
from .._typing import ArrayLike, FloatArray
from ..exceptions import MertonInputError
def _check_pair(predictions: ArrayLike, defaults: ArrayLike) -> tuple[FloatArray, FloatArray]:
p = np.asarray(predictions, dtype=np.float64)
y = np.asarray(defaults, dtype=np.float64)
if p.shape != y.shape:
raise MertonInputError(f"predictions shape {p.shape} != defaults shape {y.shape}")
if p.ndim != 1:
raise MertonInputError("predictions and defaults must be 1-D")
if not np.all(np.isin(y, [0.0, 1.0])):
raise MertonInputError("defaults must be 0/1 indicators")
return p, y
[docs]
def auc(predictions: ArrayLike, defaults: ArrayLike) -> float:
r"""Area under the ROC curve.
Uses the Wilcoxon-Mann-Whitney identity: AUC equals the probability
that a randomly chosen defaulter has a higher predicted PD than a
randomly chosen non-defaulter.
.. math::
\mathrm{AUC} = \frac{R_+ - n_+(n_+ + 1)/2}{n_+ \cdot n_-}
where ``R_+`` is the sum of ranks of the positive class, ``n_+`` is
the number of positives, and ``n_-`` the number of negatives.
"""
p, y = _check_pair(predictions, defaults)
n_pos = int(y.sum())
n_neg = int(y.size - n_pos)
if n_pos == 0 or n_neg == 0:
raise MertonInputError("AUC undefined when only one class is present")
ranks = rankdata(p)
r_plus = float(ranks[y == 1].sum())
return (r_plus - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg)
[docs]
def accuracy_ratio(predictions: ArrayLike, defaults: ArrayLike) -> float:
"""Gini coefficient ``AR = 2 · AUC − 1``."""
return 2.0 * auc(predictions, defaults) - 1.0
[docs]
def brier(predictions: ArrayLike, defaults: ArrayLike) -> float:
"""Brier score (mean squared error of probabilistic predictions)."""
p, y = _check_pair(predictions, defaults)
return float(np.mean((p - y) ** 2))
[docs]
def ks_statistic(predictions: ArrayLike, defaults: ArrayLike) -> float:
"""Kolmogorov-Smirnov statistic — max gap between the two empirical CDFs."""
p, y = _check_pair(predictions, defaults)
if not (y == 1).any() or not (y == 0).any():
raise MertonInputError("KS undefined when only one class is present")
grid = np.sort(np.unique(p))
pos = np.sort(p[y == 1])
neg = np.sort(p[y == 0])
# CDF at each grid point.
f_pos = np.searchsorted(pos, grid, side="right") / pos.size
f_neg = np.searchsorted(neg, grid, side="right") / neg.size
return float(np.max(np.abs(f_pos - f_neg)))
[docs]
def hosmer_lemeshow(
predictions: ArrayLike,
defaults: ArrayLike,
*,
bins: int = 10,
) -> tuple[float, float]:
"""Hosmer-Lemeshow goodness-of-fit χ² and (chi², dof).
Returns ``(chi_squared, degrees_of_freedom)``. P-values come from
``scipy.stats.chi2.sf(chi2, dof)``.
"""
p, y = _check_pair(predictions, defaults)
n = p.size
if n < bins * 5:
raise MertonInputError(f"need at least {bins * 5} obs for Hosmer-Lemeshow with bins={bins}")
# Equal-count bins by predicted-PD rank.
order = np.argsort(p)
p_sorted = p[order]
y_sorted = y[order]
bin_sizes = [n // bins] * bins
for i in range(n % bins):
bin_sizes[i] += 1
chi2 = 0.0
idx = 0
for sz in bin_sizes:
seg_p = p_sorted[idx : idx + sz]
seg_y = y_sorted[idx : idx + sz]
observed_pos = float(seg_y.sum())
expected_pos = float(seg_p.sum())
observed_neg = sz - observed_pos
expected_neg = sz - expected_pos
if expected_pos > 0:
chi2 += (observed_pos - expected_pos) ** 2 / expected_pos
if expected_neg > 0:
chi2 += (observed_neg - expected_neg) ** 2 / expected_neg
idx += sz
return float(chi2), float(bins - 2)
__all__ = [
"accuracy_ratio",
"auc",
"brier",
"hosmer_lemeshow",
"ks_statistic",
]