# Source code for ordinal_xai.models.ogboost

"""
Ordinal Gradient Boosting Model for ordinal regression.

This module implements a wrapper around the GradientBoostingOrdinal model from the
ogboost package for ordinal regression. The model uses gradient boosting to learn
ordinal relationships while maintaining compatibility with the ordinal_xai framework.

The model is implemented as a scikit-learn compatible estimator, allowing it to be used
with scikit-learn's pipeline and cross-validation tools.
"""

from typing import Optional, List, Dict, Union, Callable, Tuple
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from ..utils.data_utils import transform_features
from sklearn.utils.validation import check_X_y, check_is_fitted, validate_data
from .base_model import BaseOrdinalModel

# scikit-learn is already imported unconditionally above, so this import is kept
# outside the guard: a failure here must not be reported as a missing ogboost.
from sklearn.tree import DecisionTreeRegressor

# ogboost is the only optional dependency; keep the guarded body to that single import.
try:
    from ogboost import GradientBoostingOrdinal
except ImportError as exc:
    # Chain the original error so the underlying cause stays visible in the traceback.
    raise ImportError(
        "ogboost package is required for OGBoost model. "
        "Please install it with: pip install ogboost"
    ) from exc


class OGBoost(BaseEstimator, BaseOrdinalModel):
    """
    Ordinal Gradient Boosting Model for ordinal regression.

    This class implements a wrapper around the GradientBoostingOrdinal model from the
    ogboost package. The model uses gradient boosting to learn ordinal relationships
    and is particularly effective for complex non-linear patterns in ordinal data.

    Parameters
    ----------
    base_learner : estimator or None, default=None
        The base learner used to update the latent function. ``None`` is replaced by
        ``DecisionTreeRegressor(max_depth=3)`` when the instance is constructed.
    n_estimators : int, default=100
        Maximum number of boosting iterations
    learning_rate : float, default=0.1
        Learning rate for the latent function updates
    learning_rate_thresh : float, default=0.001
        Learning rate for the threshold updates
    validation_fraction : float, default=0.1
        Fraction of data to use as a holdout set for early stopping
    n_iter_no_change : int or None, default=None
        Number of iterations with no improvement to wait before stopping early
    tol : float, default=1e-4
        Tolerance for measuring improvement in early stopping
    link_function : {'probit', 'logit', 'loglog', 'cloglog', 'cauchit'}, default='probit'
        Link function used to transform latent scores to probabilities
    subsample : float, default=1.0
        Fraction of samples used to fit each base learner
    verbose : int, default=0
        Verbosity level
    random_state : int, RandomState instance or None, default=None
        Seed or random state for reproducibility
    cv_early_stopping_splits : int or None, default=None
        If an integer > 1, uses K-fold cross-validation for early stopping

    Attributes
    ----------
    feature_names_ : list
        Names of features used during training
    n_features_in_ : int
        Number of features seen during training
    ranks_ : ndarray
        Unique ordinal class labels
    _encoder : OneHotEncoder
        Encoder for categorical features
    _scaler : StandardScaler
        Scaler for numerical features
    _model : GradientBoostingOrdinal
        The fitted ogboost GradientBoostingOrdinal model
    is_fitted_ : bool
        Whether the model has been fitted

    Notes
    -----
    - The model handles both categorical and numerical features automatically
    - Categorical features are one-hot encoded
    - Numerical features are standardized
    - The model assumes ordinal classes are consecutive integers starting from 0
    """

    # Constructor hyper-parameters, in declaration order. The same names are the
    # keyword arguments handed to GradientBoostingOrdinal in ``fit``.
    _PARAM_NAMES = (
        "base_learner",
        "n_estimators",
        "learning_rate",
        "learning_rate_thresh",
        "validation_fraction",
        "n_iter_no_change",
        "tol",
        "link_function",
        "subsample",
        "verbose",
        "random_state",
        "cv_early_stopping_splits",
    )

    def __init__(
        self,
        base_learner=None,
        n_estimators: int = 100,
        learning_rate: float = 0.1,
        learning_rate_thresh: float = 0.001,
        validation_fraction: float = 0.1,
        n_iter_no_change: Optional[int] = None,
        tol: float = 1e-4,
        link_function: str = 'probit',
        subsample: float = 1.0,
        verbose: int = 0,
        random_state: Union[int, np.random.RandomState, None] = None,
        cv_early_stopping_splits: Optional[int] = None
    ):
        """
        Initialize the Ordinal Gradient Boosting Model.

        The arguments are stored as attributes of the same name; their meaning is
        described in the ``Parameters`` section of the class docstring. No fitting
        or validation happens here.
        """
        super().__init__()
        # Resolved eagerly so that ``base_learner`` is always an estimator instance.
        if base_learner is None:
            base_learner = DecisionTreeRegressor(max_depth=3)
        self.base_learner = base_learner
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.learning_rate_thresh = learning_rate_thresh
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.tol = tol
        self.link_function = link_function
        self.subsample = subsample
        self.verbose = verbose
        self.random_state = random_state
        self.cv_early_stopping_splits = cv_early_stopping_splits
        # Preprocessing state; populated by ``transform(..., fit=True)``.
        self._encoder = None
        self._scaler = None
        self.is_fitted_ = False

    def __sklearn_is_fitted__(self) -> bool:
        """
        Report the fitted state to ``sklearn.utils.validation.check_is_fitted``.

        ``is_fitted_`` exists (as ``False``) from construction onwards. Without this
        hook ``check_is_fitted`` would treat the mere presence of an attribute with a
        trailing underscore as "fitted" and never raise ``NotFittedError``.
        """
        return bool(getattr(self, "is_fitted_", False))

    def get_params(self, deep: bool = True) -> Dict[str, object]:
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            Accepted for scikit-learn API compatibility. The returned mapping always
            contains only this estimator's own constructor parameters; parameters of
            ``base_learner`` are not expanded into ``base_learner__*`` entries.

        Returns
        -------
        dict
            Parameter names mapped to their values
        """
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    def set_params(self, **params: object) -> "OGBoost":
        """
        Set the parameters of this estimator.

        Delegates to ``BaseEstimator.set_params`` so that parameter names are
        validated and nested ``base_learner__<name>`` keys are forwarded to the
        base learner instead of being stored as stray attributes.

        Parameters
        ----------
        **params : dict
            Estimator parameters

        Returns
        -------
        self : OGBoost
            The estimator instance

        Raises
        ------
        ValueError
            If a key does not name a constructor parameter of this estimator
        """
        return super().set_params(**params)

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "OGBoost":
        """
        Fit the Ordinal Gradient Boosting Model.

        This method fits the model to the training data, handling both
        categorical and numerical features appropriately.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            Training data
        y : pd.Series of shape (n_samples,)
            Target values

        Returns
        -------
        self : OGBoost
            The fitted model

        Raises
        ------
        ValueError
            If the input data contains invalid values
        """
        # A refit that fails part-way must not leave the estimator reporting as fitted.
        self.is_fitted_ = False

        # Store feature names and metadata
        self.feature_names_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]
        self.ranks_ = np.unique(y)

        # Fit the encoder/scaler and transform the features
        X_transformed = self.transform(X, fit=True)

        # Validate input; check_X_y also converts the data to arrays
        X_arr, y_arr = check_X_y(X_transformed, y, ensure_2d=True)

        # The constructor parameters map one-to-one onto the keyword arguments of
        # GradientBoostingOrdinal, so they are forwarded as-is.
        self._model = GradientBoostingOrdinal(**self.get_params(deep=False))
        self._model.fit(X_arr, y_arr)

        self.is_fitted_ = True
        return self

    def _prepare_inputs(self, X: pd.DataFrame) -> np.ndarray:
        """Check the fitted state and return the transformed features as an array."""
        check_is_fitted(self)
        return self.transform(X, fit=False).values

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict ordinal class labels.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            Samples to predict

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted ordinal class labels

        Raises
        ------
        NotFittedError
            If the model has not been fitted
        """
        return self._model.predict(self._prepare_inputs(X))

    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict class probabilities.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            Samples to predict probabilities for

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Predicted class probabilities

        Raises
        ------
        NotFittedError
            If the model has not been fitted
        """
        return self._model.predict_proba(self._prepare_inputs(X))

    def transform(
        self, X: pd.DataFrame, fit: bool = False, no_scaling: bool = False
    ) -> pd.DataFrame:
        """
        Transform input data into the format expected by the model.

        The work is delegated to ``transform_features``, which handles both
        categorical and numerical features:
        - Categorical features are one-hot encoded
        - Numerical features are standardized (unless no_scaling=True)

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            Input data to transform
        fit : bool, default=False
            Whether to fit new encoder/scaler or use existing ones. When True, the
            encoder and scaler returned by ``transform_features`` are stored on the
            instance for later calls.
        no_scaling : bool, default=False
            Whether to skip scaling of numerical features

        Returns
        -------
        pd.DataFrame
            Transformed data

        Raises
        ------
        ValueError
            If the input data has different features than training data
        """
        X_transformed, encoder, scaler = transform_features(
            X,
            fit=fit,
            encoder=self._encoder,
            scaler=self._scaler,
            no_scaling=no_scaling
        )
        if fit:
            self._encoder = encoder
            self._scaler = scaler
        return X_transformed

    def decision_function(self, X: pd.DataFrame) -> np.ndarray:
        """
        Compute the latent function values for input samples.

        This method returns the scalar value of the latent function for each
        observation, which can be used as a high-resolution alternative to
        class labels for comparing and ranking observations.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            Samples to compute decision function for

        Returns
        -------
        ndarray of shape (n_samples,)
            Latent function values

        Raises
        ------
        NotFittedError
            If the model has not been fitted
        """
        return self._model.decision_function(self._prepare_inputs(X))

    def feature_importances_(self) -> np.ndarray:
        """
        Get feature importances from the fitted model.

        Note: This is a regular method and must be called
        (``model.feature_importances_()``). The value is read from the underlying
        GradientBoostingOrdinal model and may not be available for all base learners.

        Returns
        -------
        ndarray of shape (n_features,)
            Feature importances if available

        Raises
        ------
        NotFittedError
            If the model has not been fitted
        AttributeError
            If the underlying model does not expose feature importances
        """
        check_is_fitted(self)
        if not hasattr(self._model, 'feature_importances_'):
            raise AttributeError(
                "Feature importances are not available for the current base learner. "
                "Use a base learner that supports feature importances "
                "(e.g., DecisionTreeRegressor)."
            )
        return self._model.feature_importances_

    def get_booster_params(self) -> Dict[str, object]:
        """
        Get parameters of the underlying boosting model.

        Returns
        -------
        dict
            Parameters of the underlying GradientBoostingOrdinal model

        Raises
        ------
        NotFittedError
            If the model has not been fitted
        """
        check_is_fitted(self)
        return self._model.get_params()