# Source code for ordinal_xai.utils.data_utils

import os
import pandas as pd
import numpy as np
from typing import Union, Tuple, Optional
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# [docs]
def load_data(
    data_path: str,
    target: Union[int, str] = -1,
    sep: str = ";",
    label_map: Optional[dict] = None,
    drop: Optional[list] = None,
    handle_nan: str = 'drop'
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Load a delimited text file and split it into features and an encoded target.

    Parameters
    ----------
    data_path : str
        Full path to the data file
    target : Union[int, str], default=-1
        Target variable specification. Can be:
        - int: Positional index of the target column (e.g., -1 for last column).
          NumPy integers are accepted as well.
        - str: Name of target column
    sep : str, default=';'
        Delimiter to use when reading the file
    label_map : Optional[dict], default=None
        Optional mapping to convert target labels to numeric values.
        If None, the sorted distinct non-missing labels are mapped to
        0-based consecutive indices. Labels that are not keys of the
        mapping become NaN in the returned target.
    drop : Optional[list], default=None
        List of feature indices or names to drop from the features DataFrame.
        Integer entries are positions in the feature frame, i.e. they are
        resolved *after* the target column has been removed.
    handle_nan : str, default='drop'
        How to handle NaN values. Options are:
        - 'drop': Drop rows containing any NaN values
        - 'error': Raise an error if NaN values are found
        - 'warn': Print a warning if NaN values are found but continue

    Returns
    -------
    Tuple[pd.DataFrame, pd.Series]
        X: Features DataFrame
        y: Target Series with mapped labels

    Raises
    ------
    FileNotFoundError
        If the data file doesn't exist
    ValueError
        If target specification is invalid
        If handle_nan is not one of ['drop', 'error', 'warn']
        If handle_nan='error' and NaN values are found
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data file not found at: {data_path}")

    if handle_nan not in ['drop', 'error', 'warn']:
        raise ValueError("handle_nan must be one of ['drop', 'error', 'warn']")

    # Read the data
    df = pd.read_csv(data_path, sep=sep)

    # Handle NaN values
    nan_columns = df.columns[df.isnull().any()].tolist()
    if nan_columns:
        if handle_nan == 'error':
            raise ValueError("Data contains NaN values")
        elif handle_nan == 'warn':
            print(f"Warning: Data contains NaN values in columns: {nan_columns}")
        elif handle_nan == 'drop':
            original_len = len(df)
            df = df.dropna()
            if len(df) < original_len:
                print(f"Dropped {original_len - len(df)} rows containing NaN values")

    # Handle target specification
    if isinstance(target, (int, np.integer)):
        n_columns = len(df.columns)
        if target < -n_columns or target >= n_columns:
            raise ValueError(f"Target index {target} out of range")
        y = df.iloc[:, target]
        X = df.drop(df.columns[target], axis=1)
    elif isinstance(target, str):
        if target not in df.columns:
            raise ValueError(f"Target column '{target}' not found in data")
        y = df[target]
        X = df.drop(target, axis=1)
    else:
        raise ValueError("Target must be either an integer index or column name")

    # Drop specified features if requested
    if drop is not None:
        if all(isinstance(d, (int, np.integer)) for d in drop):
            # Positions refer to X, which no longer contains the target column.
            drop_cols = X.columns[drop]
        else:
            drop_cols = drop
        X = X.drop(columns=drop_cols)

    # Map labels if needed
    if label_map is None:
        # Build the map from non-missing labels only: NaN does not compare
        # with anything, so sorting a list that contains it can leave the
        # real labels out of order, and NaN would otherwise be assigned a
        # class index of its own. Missing targets stay NaN after mapping.
        unique_labels = sorted(y.dropna().unique())
        label_map = {old: new for new, old in enumerate(unique_labels)}

    y = y.map(label_map)

    return X, y



# [docs]
def transform_features(
    X: pd.DataFrame,
    fit: bool = False,
    no_scaling: bool = False,
    encoder: Optional[OneHotEncoder] = None,
    scaler: Optional[StandardScaler] = None,
    categorical_columns: Optional[list] = None,
    handle_nan: str = 'drop'
) -> Tuple[pd.DataFrame, OneHotEncoder, Optional[StandardScaler]]:
    """
    One-hot encode the categorical columns of ``X`` and scale the numerical ones.

    The result places the numerical columns first, followed by the one-hot
    encoded columns, and keeps the row index of ``X`` (after any NaN rows
    have been dropped).

    Parameters
    ----------
    X : pd.DataFrame
        Input data to transform
    fit : bool, default=False
        Whether to fit new encoder/scaler or use existing ones. When True,
        any ``encoder`` passed in is replaced by a new one; it is only fitted
        if there is at least one categorical column.
    no_scaling : bool, default=False
        Whether to skip scaling of numerical features
    encoder : Optional[OneHotEncoder], default=None
        Existing encoder to use if fit=False. Required in that case whenever
        the data has categorical columns.
    scaler : Optional[StandardScaler], default=None
        Existing scaler to use if fit=False. Required in that case whenever
        the data has numerical columns and no_scaling=False.
    categorical_columns : Optional[list], default=None
        List of categorical column names. If None, the columns with
        ``object`` dtype are used. Every other column is treated as numerical.
    handle_nan : str, default='drop'
        How to handle NaN values. Options are:
        - 'drop': Drop rows containing any NaN values
        - 'error': Raise an error if NaN values are found
        - 'warn': Print a warning if NaN values are found but continue

    Returns
    -------
    Tuple[pd.DataFrame, OneHotEncoder, Optional[StandardScaler]]
        X_transformed: Transformed DataFrame
        encoder: The encoder that was used (newly created when fit=True)
        scaler: The scaler that was used (None if numerical columns exist
            and no_scaling=True)

    Raises
    ------
    ValueError
        If handle_nan is not one of ['drop', 'error', 'warn']
        If handle_nan='error' and NaN values are found
        If fit=False and a required encoder or scaler was not provided
    """
    if handle_nan not in ['drop', 'error', 'warn']:
        raise ValueError("handle_nan must be one of ['drop', 'error', 'warn']")

    # Handle NaN values
    nan_columns = X.columns[X.isnull().any()].tolist()
    if nan_columns:
        if handle_nan == 'error':
            raise ValueError("Data contains NaN values")
        elif handle_nan == 'warn':
            print(f"Warning: Data contains NaN values in columns: {nan_columns}")
        elif handle_nan == 'drop':
            original_len = len(X)
            X = X.dropna()
            if len(X) < original_len:
                print(f"Dropped {original_len - len(X)} rows containing NaN values")

    if categorical_columns is None:
        categorical_columns = X.select_dtypes(include=["object"]).columns.tolist()

    # Handle categorical features
    if fit:
        encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)

    # len() rather than truthiness: callers may pass a pandas Index here.
    if len(categorical_columns) > 0:
        if fit:
            encoded = encoder.fit_transform(X[categorical_columns])
        else:
            if encoder is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise ValueError(
                    "A fitted encoder must be provided when fit=False and the data "
                    "contains categorical columns"
                )
            encoded = encoder.transform(X[categorical_columns])
        categorical_features = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out(categorical_columns),
            index=X.index
        )
    else:
        # Empty frame that still carries the row index so the concat below aligns.
        categorical_features = pd.DataFrame(index=X.index)

    # Handle numerical features
    numerical_columns = X.drop(columns=categorical_columns).columns.tolist()
    if len(numerical_columns) == 0:
        # Nothing to scale; the scaler argument is handed back untouched.
        return categorical_features, encoder, scaler

    if no_scaling:
        numerical_df = X[numerical_columns].copy()
        scaler = None
    else:
        if fit:
            scaler = StandardScaler()
            numerical_features = scaler.fit_transform(X[numerical_columns])
        else:
            if scaler is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise ValueError(
                    "A fitted scaler must be provided when fit=False, no_scaling=False "
                    "and the data contains numerical columns"
                )
            numerical_features = scaler.transform(X[numerical_columns])
        numerical_df = pd.DataFrame(numerical_features, columns=numerical_columns, index=X.index)

    X_transformed = pd.concat([numerical_df, categorical_features], axis=1)

    return X_transformed, encoder, scaler