Source code for ordinal_xai.models.obd

"""
Ordinal Binary Decomposition (OBD) Model.

This module implements the Ordinal Binary Decomposition (OBD) model, which is a framework
for ordinal classification that decomposes the ordinal problem into a series of binary
classification tasks. The model supports two decomposition strategies:

1. "one-vs-following" (default):
   - Classifier 1: Class 0 vs. Classes 1,2,...,K-1
   - Classifier 2: Classes 0,1 vs. Classes 2,3,...,K-1
   - ...and so on

2. "one-vs-next":
   - Classifier 1: Class 0 vs. Class 1
   - Classifier 2: Class 1 vs. Class 2
   - ...and so on

The model can use various base classifiers (logistic regression, SVM, random forest, XGBoost)
to solve each binary classification task. The final prediction is obtained by combining
the probabilities from all binary classifiers in a way that respects the ordinal nature
of the problem.

Example:
    >>> from ordinal_xai.models.obd import OBD
    >>> import pandas as pd
    >>> import numpy as np
    >>> 
    >>> # Create sample data
    >>> X = pd.DataFrame(np.random.randn(100, 5))
    >>> y = pd.Series(np.random.randint(0, 3, 100))
    >>> 
    >>> # Initialize and train model
    >>> model = OBD(base_classifier='svm', decomposition_type='one-vs-next')
    >>> model.fit(X, y)
    >>> 
    >>> # Make predictions
    >>> predictions = model.predict(X)
    >>> probabilities = model.predict_proba(X)
"""

import warnings
from typing import Optional, List, Dict, Union, Callable, Tuple

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.utils.validation import check_X_y, check_is_fitted
from xgboost import XGBClassifier

from ..utils.data_utils import transform_features
from .base_model import BaseOrdinalModel

class OBD(BaseEstimator, BaseOrdinalModel):
    """
    Ordinal Binary Decomposition (OBD) Model.

    This model implements ordinal classification by decomposing the problem
    into ``K - 1`` binary classification tasks (``K`` = number of distinct
    labels seen in ``fit``) and recombining their positive-class
    probabilities into one probability per ordinal class.

    Parameters
    ----------
    base_classifier : str, default='logistic'
        The base classifier to use for each binary classification task.
        Options are:

        - 'logistic': LogisticRegression with default parameters
        - 'svm': SVC with probability=True and balanced class weights
        - 'rf': RandomForestClassifier with conservative defaults
        - 'xgb': XGBClassifier with conservative defaults
    decomposition_type : str, default='one-vs-following'
        The type of binary decomposition to use. Options are:

        - 'one-vs-following': task ``i`` is trained on all samples and
          separates labels ``<= ranks_[i]`` from labels ``> ranks_[i]``
        - 'one-vs-next': task ``i`` is trained only on samples labelled
          ``ranks_[i]`` or ``ranks_[i + 1]`` and separates the two
    **kwargs : dict
        Additional parameters to pass to the base classifier. These override
        the default parameters set for each classifier type.

    Attributes
    ----------
    feature_names_ : list
        Names of the features used during training
    n_features_in_ : int
        Number of features seen during training
    ranks_ : ndarray
        Unique ordinal class labels in ascending order
    _models : list
        List of trained binary classifiers
    _encoder : object
        Feature encoder returned by ``transform_features``
    _scaler : object
        Feature scaler returned by ``transform_features``
    is_fitted_ : bool
        Flag indicating whether the model has been fitted

    Notes
    -----
    - Features are preprocessed through the ``transform_features`` utility.
    - If a binary task contains a single class, a ``DummyClassifier`` is used
      instead of the specified base classifier and a warning is issued.
    - ``predict_proba`` returns non-negative probabilities that sum to 1 for
      each sample.
    - ``predict`` returns values taken from ``ranks_`` (the original labels).

    Examples
    --------
    >>> from ordinal_xai.models.obd import OBD
    >>> import pandas as pd
    >>> import numpy as np
    >>>
    >>> # Create sample data
    >>> X = pd.DataFrame(np.random.randn(100, 5))
    >>> y = pd.Series(np.random.randint(0, 3, 100))
    >>>
    >>> # Initialize model with SVM base classifier
    >>> model = OBD(base_classifier='svm', decomposition_type='one-vs-next')
    >>>
    >>> # Train the model
    >>> model.fit(X, y)
    >>>
    >>> # Make predictions
    >>> predictions = model.predict(X)
    >>> probabilities = model.predict_proba(X)
    """

    def __init__(self, base_classifier='logistic', decomposition_type='one-vs-following', **kwargs):
        """
        Initialize the OBD model.

        Parameters
        ----------
        base_classifier : str, default='logistic'
            The base classifier to use: 'logistic', 'svm', 'rf' or 'xgb'.
        decomposition_type : str, default='one-vs-following'
            The binary decomposition to use: 'one-vs-following' or
            'one-vs-next'. Validated in ``fit``, not here.
        **kwargs : dict
            Additional parameters to pass to the base classifier.
        """
        super().__init__()  # Initialize base class
        self.base_classifier = base_classifier
        self.decomposition_type = decomposition_type
        self.kwargs = kwargs
        self._models = None
        self._encoder = None
        self._scaler = None

    def _get_base_classifier(self):
        """
        Build a fresh, unfitted base classifier with sensible defaults.

        User-supplied ``kwargs`` take precedence over the defaults below.

        Returns
        -------
        estimator : object
            An instance of the configured base classifier.

        Raises
        ------
        ValueError
            If an unknown base classifier is specified.
        """
        if self.base_classifier == 'logistic':
            estimator_cls, defaults = LogisticRegression, {}
        elif self.base_classifier == 'svm':
            estimator_cls = SVC
            defaults = {
                'C': 1.0,                    # Regularization parameter
                'kernel': 'rbf',             # Radial basis function kernel
                'gamma': 'scale',            # Automatic gamma scaling
                'probability': True,         # Required for predict_proba
                'class_weight': 'balanced',  # Handle class imbalance
                'random_state': 42,
                'cache_size': 1000,          # Larger kernel cache
                'tol': 1e-3,                 # Tolerance for stopping criterion
            }
        elif self.base_classifier == 'rf':
            estimator_cls = RandomForestClassifier
            defaults = {
                'n_estimators': 100,
                'max_depth': 5,
                'min_samples_leaf': 5,
                'random_state': 42,
            }
        elif self.base_classifier == 'xgb':
            estimator_cls = XGBClassifier
            defaults = {
                'n_estimators': 100,
                'max_depth': 3,
                'learning_rate': 0.1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'reg_alpha': 1,
                'reg_lambda': 1,
                # NOTE(review): `use_label_encoder` is deprecated in recent
                # XGBoost releases - confirm against the pinned version.
                'use_label_encoder': False,
                # Every sub-problem is binary, so the binary log-loss is the
                # matching metric ('mlogloss' is the multiclass variant).
                'eval_metric': 'logloss',
                'random_state': 42,
            }
        else:
            raise ValueError(f"Unknown base classifier: {self.base_classifier}. Use 'logistic', 'svm', 'rf', or 'xgb'.")

        return estimator_cls(**{**defaults, **self.kwargs})

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        The extra base-classifier keyword arguments are flattened into the
        returned mapping so that ``OBD(**model.get_params())`` rebuilds an
        equivalent estimator.

        Parameters
        ----------
        deep : bool, default=True
            Accepted for scikit-learn API compatibility; not used.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        return {
            "base_classifier": self.base_classifier,
            "decomposition_type": self.decomposition_type,
            **self.kwargs,
        }

    def set_params(self, **params):
        """
        Set the parameters of this estimator.

        ``base_classifier`` and ``decomposition_type`` are stored on the
        estimator; every other key is treated as a base-classifier keyword
        argument and merged into ``kwargs``.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : object
            Estimator instance.
        """
        if 'base_classifier' in params:
            self.base_classifier = params.pop('base_classifier')
        if 'decomposition_type' in params:
            self.decomposition_type = params.pop('decomposition_type')
        # Rebind instead of mutating in place so a dict object that may be
        # referenced elsewhere is never modified behind the caller's back.
        self.kwargs = {**self.kwargs, **params}
        return self

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "OBD":
        """
        Fit the ordinal binary decomposition model.

        Parameters
        ----------
        X : pd.DataFrame
            Training data of shape (n_samples, n_features)
        y : pd.Series
            Target values of shape (n_samples,)

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        TypeError
            If X is not a DataFrame or y is not a Series
        ValueError
            If X and y have different number of samples
            If X or y contains missing values
            If decomposition_type is not 'one-vs-following' or 'one-vs-next'
            If base_classifier is unknown
        """
        # Validate input types
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        if not isinstance(y, pd.Series):
            raise TypeError("y must be a pandas Series")

        # Validate input shapes
        if len(X) != len(y):
            raise ValueError(f"X and y have different number of samples: X has {len(X)} samples, y has {len(y)} samples")

        # Validate missing values
        if X.isnull().any().any():
            missing_cols = X.columns[X.isnull().any()].tolist()
            raise ValueError(f"X contains missing values in columns: {missing_cols}")
        if y.isnull().any():
            raise ValueError("y contains missing values")

        # Validate decomposition type
        valid_types = ['one-vs-following', 'one-vs-next']
        if self.decomposition_type not in valid_types:
            raise ValueError(f"decomposition_type must be one of {valid_types}, got {self.decomposition_type}")
        one_vs_next = self.decomposition_type == 'one-vs-next'

        # Store feature names and ranks
        self.feature_names_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]
        self.ranks_ = np.unique(y)

        # Transform input data
        X_transformed = self.transform(X, fit=True)

        # One binary model per pair of adjacent ranks
        self._models = []
        adjacent_ranks = zip(self.ranks_[:-1], self.ranks_[1:])
        for i, (lower, upper) in enumerate(adjacent_ranks):
            if one_vs_next:
                # Keep only the two neighbouring classes. The mask is turned
                # into a plain array so row selection is positional and does
                # not depend on X and y sharing the same index.
                mask = ((y == lower) | (y == upper)).to_numpy()
                X_binary = X_transformed[mask]
                y_binary = (y[mask] > lower).astype(int)
            else:
                X_binary = X_transformed
                y_binary = (y > lower).astype(int)

            present = np.unique(y_binary)
            if len(present) < 2:
                # A real classifier cannot be trained on a single class.
                constant = present[0]
                model = DummyClassifier(strategy='constant', constant=constant)
                warnings.warn(
                    f"Using DummyClassifier for threshold {i} as only class {constant} is present",
                    RuntimeWarning,
                    stacklevel=2,
                )
            else:
                model = self._get_base_classifier()

            model.fit(X_binary, y_binary)
            self._models.append(model)

        # Set fitted flag
        self.is_fitted_ = True
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict ordinal class labels.

        Parameters
        ----------
        X : pd.DataFrame
            Samples of shape (n_samples, n_features)

        Returns
        -------
        y_pred : ndarray
            Predicted class labels of shape (n_samples,), taken from
            ``ranks_``.
        """
        best_column = self.predict_proba(X).argmax(axis=1)
        # Columns of predict_proba follow ranks_; map back to real labels so
        # the output is correct even when labels are not 0..K-1.
        return self.ranks_[best_column]

    @staticmethod
    def _positive_class_proba(model, X_transformed) -> np.ndarray:
        """Return P(label == 1) from a fitted binary model, as a 1-D array.

        Handles models fitted on a single class (e.g. a constant
        ``DummyClassifier``), whose ``predict_proba`` has only one column.
        """
        proba = model.predict_proba(X_transformed)
        classes = list(model.classes_)
        if 1 in classes:
            return proba[:, classes.index(1)]
        # The positive class was never seen during training.
        return np.zeros(proba.shape[0])

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict class probabilities.

        Parameters
        ----------
        X : pd.DataFrame
            Samples of shape (n_samples, n_features)

        Returns
        -------
        proba : ndarray
            Class probabilities of shape (n_samples, n_classes); columns
            follow the order of ``ranks_``.

        Raises
        ------
        TypeError
            If X is not a DataFrame
        ValueError
            If X contains missing values
            If X has different number of features than training data

        Notes
        -----
        With ``b_i`` the positive-class probability of binary model ``i``:

        For 'one-vs-following':
            - P(class 0) = 1 - b_0
            - P(class i) = b_{i-1} - b_i
            - P(class K-1) = b_{K-2}

        For 'one-vs-next':
            - P(class 0) = 1 - b_0
            - P(class i) = b_{i-1} * (1 - b_i)
            - P(class K-1) = b_{K-2}

        The binary models are trained independently, so ``b_{i-1} - b_i``
        can be negative. Negative entries are clipped to 0 before each row
        is normalised to sum to 1.
        """
        check_is_fitted(self)

        # Validate input type
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")

        # Validate missing values
        if X.isnull().any().any():
            missing_cols = X.columns[X.isnull().any()].tolist()
            raise ValueError(f"X contains missing values in columns: {missing_cols}")

        # Validate number of features
        if X.shape[1] != self.n_features_in_:
            raise ValueError(f"X has {X.shape[1]} features, but model was trained on {self.n_features_in_} features")

        n_samples = len(X)
        n_classes = len(self.ranks_)
        if n_classes == 1:
            # Only one label was seen in fit: no binary models exist.
            return np.ones((n_samples, 1))

        X_transformed = self.transform(X, fit=False)

        # Row i holds b_i for every sample: shape (n_classes - 1, n_samples)
        b = np.vstack([
            self._positive_class_proba(model, X_transformed)
            for model in self._models
        ])

        probs = np.empty((n_samples, n_classes))
        probs[:, 0] = 1 - b[0]
        probs[:, -1] = b[-1]
        if self.decomposition_type == 'one-vs-following':
            probs[:, 1:-1] = (b[:-1] - b[1:]).T
        else:  # one-vs-next
            probs[:, 1:-1] = (b[:-1] * (1 - b[1:])).T

        # Remove negative mass produced by non-monotone binary estimates
        np.clip(probs, 0.0, None, out=probs)

        # Normalize probabilities to sum to 1; an all-zero row falls back to
        # a uniform distribution instead of dividing by zero.
        totals = probs.sum(axis=1, keepdims=True)
        empty_rows = totals[:, 0] == 0
        if empty_rows.any():
            probs[empty_rows] = 1.0
            totals = probs.sum(axis=1, keepdims=True)
        return probs / totals

    def transform(self, X: pd.DataFrame, fit=False, no_scaling=False) -> pd.DataFrame:
        """
        Transform input data into the format expected by the model.

        Delegates to ``transform_features``; when ``fit`` is true the
        encoder and scaler it returns are stored on the estimator for reuse
        in later calls.

        Parameters
        ----------
        X : pd.DataFrame
            Input data of shape (n_samples, n_features)
        fit : bool, default=False
            Whether this is being called during fit or predict
        no_scaling : bool, default=False
            Whether to skip feature scaling

        Returns
        -------
        X_transformed : pd.DataFrame
            Transformed data ready for model input
        """
        X_transformed, encoder, scaler = transform_features(
            X,
            fit=fit,
            encoder=self._encoder,
            scaler=self._scaler,
            no_scaling=no_scaling,
        )

        if fit:
            self._encoder = encoder
            self._scaler = scaler

        return X_transformed