Source code for ordinal_xai.models.clm

"""
Cumulative Link Model (CLM) for ordinal regression.

This module implements a Cumulative Link Model, also known as an Ordered Logit/Probit model,
for ordinal regression. The model uses a link function (logit or probit) to model the
cumulative probabilities of ordinal outcomes.

The model is implemented as a scikit-learn compatible estimator, allowing it to be used
with scikit-learn's pipeline and cross-validation tools.
"""

from typing import Optional, List, Dict, Union, Callable, Tuple
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from ..utils.data_utils import transform_features
from sklearn.utils.validation import check_X_y, check_is_fitted, validate_data
from statsmodels.miscmodels.ordinal_model import OrderedModel
from .base_model import BaseOrdinalModel

[docs] class CLM(BaseEstimator, BaseOrdinalModel): """ Cumulative Link Model for ordinal regression. This class implements a Cumulative Link Model (CLM) for ordinal regression, which models the cumulative probabilities of ordinal outcomes using either a logit or probit link function. The model is particularly suitable for ordinal data where the response variable has a natural ordering. Parameters ---------- link : {'logit', 'probit'}, default='logit' The link function to use: - 'logit': Logistic link function (default) - 'probit': Probit link function Attributes ---------- feature_names_ : list Names of features used during training n_features_in_ : int Number of features seen during training ranks_ : ndarray Unique ordinal class labels _encoder : OneHotEncoder Encoder for categorical features _scaler : StandardScaler Scaler for numerical features _model : OrderedModel The fitted statsmodels OrderedModel _result : OrderedModelResults Results from fitting the model params_ : ndarray Model parameters is_fitted_ : bool Whether the model has been fitted Notes ----- - The model handles both categorical and numerical features automatically - Categorical features are one-hot encoded - Numerical features are standardized - The model assumes ordinal classes are consecutive integers """
[docs] def __init__(self, link: str = "logit"): """ Initialize the Cumulative Link Model. Parameters ---------- link : {'logit', 'probit'}, default='logit' The link function to use for modeling cumulative probabilities """ super().__init__() self.link = link self._encoder = None self._scaler = None self.is_fitted_ = False
[docs] def get_params(self, deep: bool = True) -> Dict[str, any]: """ Get parameters for this estimator. Parameters ---------- deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns ------- dict Parameter names mapped to their values """ return {"link": self.link}
[docs] def set_params(self, **params: any) -> "CLM": """ Set the parameters of this estimator. Parameters ---------- **params : dict Estimator parameters Returns ------- self : CLM The estimator instance """ for key, value in params.items(): setattr(self, key, value) return self
[docs] def fit(self, X: pd.DataFrame, y: pd.Series) -> "CLM": """ Fit the Cumulative Link Model. This method fits the model to the training data, handling both categorical and numerical features appropriately. Parameters ---------- X : pd.DataFrame of shape (n_samples, n_features) Training data y : pd.Series of shape (n_samples,) Target values Returns ------- self : CLM The fitted model Raises ------ ValueError If the link function is invalid If the input data contains invalid values """ # Store feature names and metadata self.feature_names_ = X.columns.tolist() self.n_features_in_ = X.shape[1] self.ranks_ = np.unique(y) # Transform features X_transformed = self.transform(X, fit=True) # Validate input X, y = check_X_y(X_transformed, y, ensure_2d=True) # Validate link function link_functions = {"logit": "logit", "probit": "probit"} if self.link not in link_functions: raise ValueError( f"Invalid link function '{self.link}'. " f"Choose from {list(link_functions.keys())}." ) # Fit the model self._model = OrderedModel(y, X, distr=link_functions[self.link]) self._result = self._model.fit(method='bfgs', disp=False) self.params_ = self._result.params # Set fitted flag self.is_fitted_ = True return self
[docs] def predict(self, X: pd.DataFrame) -> np.ndarray: """ Predict ordinal class labels. Parameters ---------- X : pd.DataFrame of shape (n_samples, n_features) Samples to predict Returns ------- ndarray of shape (n_samples,) Predicted ordinal class labels Raises ------ NotFittedError If the model has not been fitted """ return self.predict_proba(X).argmax(axis=1)
[docs] def predict_proba(self, X: pd.DataFrame) -> np.ndarray: """ Predict class probabilities. Parameters ---------- X : pd.DataFrame of shape (n_samples, n_features) Samples to predict probabilities for Returns ------- ndarray of shape (n_samples, n_classes) Predicted class probabilities Raises ------ NotFittedError If the model has not been fitted """ # Check if model is fitted check_is_fitted(self) # Transform features X_transformed = self.transform(X, fit=False) # Compute probabilities return self._result.predict(X_transformed.values)
[docs] def transform(self, X: pd.DataFrame, fit: bool = False, no_scaling: bool = False) -> pd.DataFrame: """ Transform input data into the format expected by the model. This method handles both categorical and numerical features: - Categorical features are one-hot encoded - Numerical features are standardized (unless no_scaling=True) Parameters ---------- X : pd.DataFrame of shape (n_samples, n_features) Input data to transform fit : bool, default=False Whether to fit new encoder/scaler or use existing ones no_scaling : bool, default=False Whether to skip scaling of numerical features Returns ------- pd.DataFrame Transformed data Raises ------ ValueError If the input data has different features than training data """ X_transformed, encoder, scaler = transform_features( X, fit=fit, encoder=self._encoder, scaler=self._scaler, no_scaling=no_scaling ) if fit: self._encoder = encoder self._scaler = scaler return X_transformed