"""
Evaluation metrics for ordinal regression and classification.
This module provides a comprehensive set of metrics specifically designed for evaluating
ordinal regression and classification models. It includes both hard-label metrics (based on
predicted class labels) and probability-based metrics (based on predicted probabilities).
The metrics are designed to account for the ordinal nature of the data, where classes
have a natural ordering and misclassification costs increase with the distance between
predicted and true classes.
Available Metrics:
------------------
Hard Label Metrics:
- accuracy: Standard classification accuracy
- adjacent_accuracy: Proportion of predictions within one class of true label
- mze: Mean Zero-One Error (1 - accuracy)
- mae: Mean Absolute Error
- mse: Mean Squared Error
- weighted_kappa: Cohen's Kappa with linear or quadratic weights
- cem: Closeness Evaluation Measure
- spearman_correlation: Spearman's rank correlation
- kendall_tau: Kendall's Tau correlation
Probability-Based Metrics:
- ranked_probability_score: RPS for probabilistic predictions
- ordinal_weighted_ce: Ordinal weighted cross-entropy loss (Ordinal Log Loss)
"""
import numpy as np
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score
from scipy.stats import kendalltau
[docs]
def accuracy(y_true, y_pred):
    """
    Compute the exact-match accuracy of ordinal predictions.

    The computation is delegated to ``sklearn.metrics.accuracy_score``. Every
    wrong prediction counts the same, so the ordering of the classes plays no
    role in this metric.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels

    Returns
    -------
    float
        Fraction of samples whose predicted label equals the true label
        (1 means every prediction is correct)
    """
    exact_match_rate = accuracy_score(y_true, y_pred)
    return exact_match_rate
[docs]
def mze(y_true, y_pred):
    """
    Compute the Mean Zero-One Error (MZE) of ordinal predictions.

    MZE is defined here as ``1 - accuracy``: the share of samples that were
    misclassified, irrespective of how far the prediction is from the truth.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels

    Returns
    -------
    float
        Fraction of misclassified samples (0 means every prediction is correct)
    """
    hit_rate = accuracy_score(y_true, y_pred)
    return 1 - hit_rate
[docs]
def mae(y_true, y_pred):
    """
    Compute the Mean Absolute Error (MAE) of ordinal predictions.

    The computation is delegated to ``sklearn.metrics.mean_absolute_error``,
    so labels are treated as numbers and each error is weighted by its
    absolute distance from the true label.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels

    Returns
    -------
    float
        Average absolute label difference (0 means every prediction is correct)
    """
    mean_abs_gap = mean_absolute_error(y_true, y_pred)
    return mean_abs_gap
[docs]
def mse(y_true, y_pred):
    """
    Compute the Mean Squared Error (MSE) of ordinal predictions.

    The computation is delegated to ``sklearn.metrics.mean_squared_error``.
    Because the label difference is squared, distant misclassifications weigh
    more than they do under MAE.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels

    Returns
    -------
    float
        Average squared label difference (0 means every prediction is correct)
    """
    mean_sq_gap = mean_squared_error(y_true, y_pred)
    return mean_sq_gap
[docs]
def weighted_kappa(y_true, y_pred, weights='quadratic'):
    """
    Calculate weighted kappa for ordinal regression.

    Weighted kappa extends Cohen's kappa to account for the ordinal nature of
    the data by applying weights to the confusion matrix (Cohen (1968)). The
    weights can be linear or quadratic, with quadratic weights penalizing
    larger misclassifications more heavily. The computation is delegated to
    ``sklearn.metrics.cohen_kappa_score``.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels
    weights : {'linear', 'quadratic', 'none'} or None, default='quadratic'
        Weighting scheme for the confusion matrix:
        - 'linear': Linear weights based on distance
        - 'quadratic': Quadratic weights (squared distance)
        - 'none' or None: No weights (standard kappa)

    Returns
    -------
    float
        Weighted kappa score between -1 and 1, where:
        - 1 indicates perfect agreement
        - 0 indicates agreement equivalent to chance
        - negative values indicate agreement worse than chance
    """
    # scikit-learn spells "unweighted" as None and rejects the string 'none',
    # so translate the documented string option before delegating.
    if isinstance(weights, str) and weights == 'none':
        weights = None
    return cohen_kappa_score(y_true, y_pred, weights=weights)
[docs]
def _get_class_counts(y):
    """
    Tally how many items carry each class label.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Array of class labels

    Returns
    -------
    dict
        Maps every distinct label found in ``y`` (in the sorted order produced
        by ``np.unique``) to the number of times it occurs
    """
    # np.unique returns (labels, counts); zip(*...) pairs them element-wise.
    return dict(zip(*np.unique(y, return_counts=True)))
[docs]
def _calculate_proximity(c1, c2, class_counts, total_items):
    """
    Calculate proximity between two classes.

    Helper for the CEM metric. The proximity is the negative log of the
    share of items that fall "between" the two classes: half of the items of
    the lower class plus all items of every class above it up to and
    including the higher class. For identical classes it is the negative log
    of half the class share. The order of ``c1`` and ``c2`` does not matter.

    Parameters
    ----------
    c1 : int
        First class label
    c2 : int
        Second class label
    class_counts : dict
        Dictionary mapping class labels to their counts
    total_items : int
        Total number of items in the dataset

    Returns
    -------
    float
        Proximity value between the two classes

    Raises
    ------
    KeyError
        If ``c1 == c2`` and that class is missing from ``class_counts``.
    """
    if c1 == c2:
        return -np.log(class_counts[c1] / (2 * total_items))

    low, high = (c1, c2) if c1 < c2 else (c2, c1)
    # Sum over the observed classes in (low, high]. Iterating the dictionary
    # instead of range() also copes with non-integer label types.
    between = sum(
        count for label, count in class_counts.items() if low < label <= high
    )
    # A class that was never observed (e.g. a predicted label absent from
    # y_true) contributes zero items rather than raising KeyError.
    return -np.log((class_counts.get(low, 0) / 2 + between) / total_items)
[docs]
def cem(y_true, y_pred, class_counts=None):
    """
    Compute the Closeness Evaluation Measure (CEM) for ordinal classification.

    CEM (Amigo et al. (2020)) scores each prediction by its proximity to the
    true class, where proximity depends on the class distribution. The result
    is the summed proximity of (predicted, true) pairs divided by the summed
    proximity of (true, true) pairs.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels
    class_counts : dict, optional
        Dictionary mapping class labels to their counts. If None, the counts
        and the total number of items are taken from ``y_true``; otherwise the
        total is the sum of the supplied counts.

    Returns
    -------
    float
        Ratio of summed proximities; equals 1 when every prediction matches
        the true label. Returns 0.0 when the denominator is zero (for
        example, when there are no samples).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if class_counts is not None:
        total_items = sum(class_counts.values())
    else:
        class_counts = _get_class_counts(y_true)
        total_items = len(y_true)

    numerator = 0
    denominator = 0
    for gold, guess in zip(y_true, y_pred):
        numerator += _calculate_proximity(guess, gold, class_counts, total_items)
        denominator += _calculate_proximity(gold, gold, class_counts, total_items)

    if denominator == 0:
        return 0.0
    return numerator / denominator
[docs]
def spearman_correlation(y_true, y_pred):
    """
    Compute Spearman's rank correlation between true and predicted labels.

    The computation is delegated to ``scipy.stats.spearmanr``; only the
    correlation coefficient is returned and the p-value is discarded.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels

    Returns
    -------
    float
        Spearman rank correlation coefficient between -1 and 1, where:
        - 1 indicates perfect positive correlation
        - 0 indicates no correlation
        - -1 indicates perfect negative correlation
    """
    outcome = spearmanr(y_true, y_pred)
    # First element is the coefficient, second is the p-value.
    return outcome[0]
[docs]
def kendall_tau(y_true, y_pred):
    """
    Compute Kendall's Tau between true and predicted labels.

    The computation is delegated to ``scipy.stats.kendalltau`` with its
    default settings; only the correlation coefficient is returned and the
    p-value is discarded.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels

    Returns
    -------
    float
        Kendall's Tau correlation coefficient between -1 and 1, where:
        - 1 indicates perfect agreement in rankings
        - 0 indicates no association between rankings
        - -1 indicates perfect disagreement in rankings
    """
    outcome = kendalltau(y_true, y_pred)
    # First element is the coefficient, second is the p-value.
    return outcome[0]
[docs]
def _create_one_hot_encoding(y_true, n_classes=None, zero_indexed=False):
    """
    Create one-hot encoding for ordinal labels.

    Labels are shifted so that the smallest one maps to column 0 (or left
    unshifted when ``zero_indexed`` is True) and then truncated to integers
    to obtain the column index of the single 1 in each row.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    n_classes : int, optional
        Number of columns of the encoding. If None, it is inferred as the
        largest shifted label plus one, so label ranges with gaps (classes
        missing from this sample) still fit.
    zero_indexed : bool, default=False
        If True, labels are used as column indices directly (shift of 0).
        If False, the minimum of ``y_true`` is subtracted first.

    Returns
    -------
    tuple
        (one_hot_matrix, min_label, n_classes)
        - one_hot_matrix: 2D array of shape (n_samples, n_classes)
        - min_label: The shift that was subtracted from the labels
        - n_classes: Number of columns of the encoding

    Raises
    ------
    IndexError
        If an explicit ``n_classes`` is too small for the shifted labels.
    """
    y_true = np.asarray(y_true)
    n_samples = len(y_true)
    min_label = 0 if zero_indexed else np.min(y_true)

    # Truncate to integer column indices once, for all samples.
    column_index = (y_true - min_label).astype(int)

    if n_classes is None:
        # Counting distinct labels would be too small whenever a class is
        # absent from y_true; size the matrix by the largest index instead.
        n_classes = int(column_index.max()) + 1 if n_samples else 0

    y_true_one_hot = np.zeros((n_samples, n_classes))
    y_true_one_hot[np.arange(n_samples), column_index] = 1
    return y_true_one_hot, min_label, n_classes
[docs]
def ranked_probability_score(y_true, y_pred_proba, zero_indexed = False):
    """
    Compute the Ranked Probability Score (RPS) for ordinal predictions.

    Following Epstein (1969), the predicted probabilities and the one-hot
    encoded true labels are both accumulated along the class axis. The score
    is the squared difference of the two cumulative vectors, summed over
    classes and averaged over samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Predicted probabilities for each class
    zero_indexed : bool, default=False
        Forwarded to the one-hot encoder: if True, labels are used as column
        indices directly; if False, the minimum label is subtracted first.

    Returns
    -------
    float
        Ranked Probability Score, where:
        - 0 indicates perfect predictions
        - Higher values indicate worse predictions
    """
    labels = np.asarray(y_true)
    proba = np.asarray(y_pred_proba)

    # Only the encoded matrix is needed; the shift and class count are dropped.
    encoded = _create_one_hot_encoding(
        labels, n_classes=proba.shape[1], zero_indexed=zero_indexed
    )[0]

    cumulative_gap = proba.cumsum(axis=1) - encoded.cumsum(axis=1)
    per_sample_score = np.square(cumulative_gap).sum(axis=1)
    return per_sample_score.mean()
[docs]
def ordinal_weighted_ce(y_true, y_pred_proba, alpha=1, zero_indexed=False):
    """
    Calculate ordinal weighted cross-entropy loss.

    For every sample ``i`` and class ``k`` the term
    ``-log(1 - p[i, k]) * |k - y[i]| ** alpha`` is accumulated, and the total
    is divided by the number of samples. Probability placed on classes far
    from the true one is therefore penalized more heavily, see Polat et al.
    (2025). Also known as ordinal log loss (Castagnos et al. (2022)).

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Predicted probabilities for each class
    alpha : float, default=1
        Exponent for the absolute difference. Higher values increase the
        penalty for predictions far from the true class.
    zero_indexed : bool, default=False
        If True, labels are taken as class indices directly. If False, the
        minimum of ``y_true`` is subtracted so the smallest label maps to
        class index 0.

    Returns
    -------
    float
        Loss value, where:
        - Lower values indicate better predictions
        - The loss is always non-negative

    Raises
    ------
    ValueError
        If there are no samples or ``y_true`` and ``y_pred_proba`` disagree
        on the number of samples.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred_proba = np.asarray(y_pred_proba)
    n_samples, n_classes = y_pred_proba.shape
    if n_samples == 0:
        raise ValueError("y_pred_proba must contain at least one sample")
    if y_true.shape[0] != n_samples:
        raise ValueError(
            f"y_true has {y_true.shape[0]} samples but y_pred_proba has {n_samples}"
        )

    eps = 1e-15  # To avoid log(0)
    min_label = 0 if zero_indexed else np.min(y_true)
    y_true_shifted = y_true - min_label

    # (n_samples, n_classes) matrix of |k - y_i|, built by broadcasting
    # instead of a Python loop over every sample/class pair.
    class_index = np.arange(n_classes)[np.newaxis, :]
    distance = np.abs(class_index - y_true_shifted[:, np.newaxis])

    clipped = np.clip(y_pred_proba, eps, 1 - eps)
    weighted_log = np.log(1 - clipped) * np.power(distance, alpha)
    return -np.sum(weighted_log) / n_samples
[docs]
def adjacent_accuracy(y_true, y_pred):
    """
    Compute the adjacent accuracy of ordinal predictions.

    A prediction counts as a hit when its absolute difference from the true
    label is at most 1, i.e. it is exact or lands on a neighbouring class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels

    Returns
    -------
    float
        Adjacent accuracy score between 0 and 1, where:
        - 1 indicates all predictions are either correct or off by one class
        - 0 indicates all predictions are off by more than one class
    """
    labels = np.asarray(y_true)
    guesses = np.asarray(y_pred)
    within_one = np.abs(labels - guesses) <= 1
    return within_one.sum() / len(labels)
[docs]
def evaluate_ordinal_model(y_true, y_pred, y_pred_proba=None, metrics=None, class_counts=None, zero_indexed=False):
    """
    Evaluate an ordinal regression model using multiple metrics.

    Hard-label metrics are computed from ``y_pred``; probability-based
    metrics are computed only when ``y_pred_proba`` is given and could be
    preprocessed. A metric that raises is reported with a printed warning and
    left out of the result instead of aborting the whole evaluation.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True ordinal labels
    y_pred : array-like of shape (n_samples,)
        Predicted ordinal labels
    y_pred_proba : array-like of shape (n_samples, n_classes) or (n_samples,), optional
        Predicted class probabilities. A 1-D array is interpreted as the
        probability assigned to the predicted class of each sample and is
        expanded to a matrix. Rows that do not sum to 1 are renormalized.
    metrics : list of str, optional
        List of metric names to compute. If None, all available metrics are
        used. Names that match no available metric are ignored.
    class_counts : dict, optional
        Dictionary mapping class labels to their counts, forwarded to ``cem``.
        If None, calculated from y_true.
    zero_indexed : bool, optional
        Whether the labels are zero-indexed. If False, the labels are shifted
        to zero-indexed. Applies to the probability-based metrics and to the
        expansion of a 1-D ``y_pred_proba``.

    Returns
    -------
    dict
        Dictionary mapping each successfully computed metric name to its value
    """
    available_hard_metrics = {
        'accuracy': accuracy,
        'adjacent_accuracy': adjacent_accuracy,
        'mze': mze,
        'mae': mae,
        'mse': mse,
        'weighted_kappa_quadratic': lambda yt, yp: weighted_kappa(yt, yp, weights='quadratic'),
        'weighted_kappa_linear': lambda yt, yp: weighted_kappa(yt, yp, weights='linear'),
        'cem': lambda yt, yp: cem(yt, yp, class_counts=class_counts),
        'spearman_correlation': spearman_correlation,
        'kendall_tau': kendall_tau,
    }
    available_proba_metrics = {
        'ranked_probability_score': lambda yt, yp: ranked_probability_score(yt, yp, zero_indexed=zero_indexed),
        'ordinal_weighted_ce_linear': lambda yt, yp: ordinal_weighted_ce(yt, yp, alpha=1, zero_indexed=zero_indexed),
        'ordinal_weighted_ce_quadratic': lambda yt, yp: ordinal_weighted_ce(yt, yp, alpha=2, zero_indexed=zero_indexed),
    }
    if metrics is None:
        metrics = list(available_hard_metrics) + list(available_proba_metrics)

    results = {}
    for metric, func in available_hard_metrics.items():
        if metric not in metrics:
            continue
        try:
            results[metric] = func(y_true, y_pred)
        except Exception as e:
            # Best-effort: one failing metric must not discard the others.
            print(f"Warning: Could not calculate {metric}: {e}")

    if y_pred_proba is None:
        return results

    try:
        y_pred_proba = np.asarray(y_pred_proba)
        if y_pred_proba.ndim == 1:
            # Use the same label shift as the probability metrics below;
            # otherwise the columns would be misaligned when zero_indexed=True
            # and the smallest label in y_true is not 0.
            _, min_label, n_classes = _create_one_hot_encoding(
                y_true, zero_indexed=zero_indexed
            )
            expanded = np.zeros((len(y_pred), n_classes))
            for i, pred in enumerate(y_pred):
                expanded[i, int(pred - min_label)] = y_pred_proba[i]
            y_pred_proba = expanded
        row_sums = y_pred_proba.sum(axis=1)
        if not np.allclose(row_sums, 1.0):
            y_pred_proba = y_pred_proba / row_sums[:, np.newaxis]
    except Exception as e:
        print(f"Warning: Could not preprocess y_pred_proba: {e}")
        return results

    for metric, func in available_proba_metrics.items():
        if metric not in metrics:
            continue
        try:
            results[metric] = func(y_true, y_pred_proba)
        except Exception as e:
            print(f"Warning: Could not calculate {metric}: {e}")
    return results
[docs]
def print_evaluation_results(results):
    """
    Print evaluation results in a formatted way.

    Metrics are grouped into hard label metrics and probability-based
    metrics. Only metrics present in ``results`` are printed, and the
    probability section appears whenever at least one probability-based
    metric is present.

    Parameters
    ----------
    results : dict
        Dictionary containing evaluation metrics as returned by
        evaluate_ordinal_model

    Notes
    -----
    - Metrics are printed with 4 decimal places
    - Hard label metrics are printed first, followed by probability-based metrics
    - Metric names are formatted for better readability (underscores become
      spaces and each word is capitalized)
    """
    hard_metrics = (
        'accuracy', 'adjacent_accuracy', 'mze', 'mae', 'mse',
        'weighted_kappa_quadratic', 'weighted_kappa_linear', 'cem',
        'spearman_correlation', 'kendall_tau',
    )
    proba_metrics = (
        'ranked_probability_score',
        'ordinal_weighted_ce_linear',
        'ordinal_weighted_ce_quadratic',
    )

    def _print_present(names):
        # Print one formatted line per metric that was actually computed.
        for metric in names:
            if metric in results:
                print(f" {metric.replace('_', ' ').title()}: {results[metric]:.4f}")

    print("\nOrdinal Regression Evaluation Results:")
    print("-" * 50)
    print("Hard Label Metrics:")
    _print_present(hard_metrics)
    # Show the section for ANY probability metric, not only when RPS is
    # present; otherwise the cross-entropy results would be dropped.
    if any(metric in results for metric in proba_metrics):
        print("\nProbability-Based Metrics:")
        _print_present(proba_metrics)
    print("-" * 50)