Source code for ordinal_xai.utils.data_utils
import os
import pandas as pd
import numpy as np
from typing import Union, Tuple, Optional
from sklearn.preprocessing import OneHotEncoder, StandardScaler
[docs]
def load_data(
    data_path: str,
    target: Union[int, str] = -1,
    sep: str = ";",
    label_map: Optional[dict] = None,
    drop: Optional[list] = None,
    handle_nan: str = 'drop'
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Load a delimited data file and split it into features and a mapped target.

    The file is read with ``pandas.read_csv``. NaN handling is applied to the
    whole table (features *and* target, including columns that are later
    removed via ``drop``) before the target column is split off.

    Parameters
    ----------
    data_path : str
        Full path to the data file
    target : Union[int, str], default=-1
        Target variable specification. Can be:
        - int (Python or NumPy integer): Positional index of the target
          column (e.g., -1 for last column)
        - str: Name of target column
    sep : str, default=';'
        Delimiter to use when reading the file
    label_map : Optional[dict], default=None
        Optional mapping to convert target labels to numeric values.
        If None, the sorted unique labels are mapped to 0-based consecutive
        indices. With a user-supplied mapping, labels missing from the
        mapping become NaN in the returned target (``Series.map`` semantics).
    drop : Optional[list], default=None
        Features to drop from the features DataFrame. If every entry is an
        integer (Python or NumPy), entries are positions in the feature
        columns *after* the target has been removed; otherwise the list is
        treated as column names.
    handle_nan : str, default='drop'
        How to handle NaN values. Options are:
        - 'drop': Drop rows containing any NaN values
        - 'error': Raise an error if NaN values are found
        - 'warn': Print a warning if NaN values are found but continue

    Returns
    -------
    Tuple[pd.DataFrame, pd.Series]
        X: Features DataFrame
        y: Target Series with mapped labels

    Raises
    ------
    FileNotFoundError
        If the data file doesn't exist
    ValueError
        If target specification is invalid
        If handle_nan is not one of ['drop', 'error', 'warn']
        If handle_nan='error' and NaN values are found
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data file not found at: {data_path}")
    if handle_nan not in ['drop', 'error', 'warn']:
        raise ValueError("handle_nan must be one of ['drop', 'error', 'warn']")

    # Read the data
    df = pd.read_csv(data_path, sep=sep)

    # Handle NaN values (applies to every column, target included)
    if df.isnull().any().any():
        if handle_nan == 'error':
            raise ValueError("Data contains NaN values")
        elif handle_nan == 'warn':
            print(f"Warning: Data contains NaN values in columns: {df.columns[df.isnull().any()].tolist()}")
        elif handle_nan == 'drop':
            original_len = len(df)
            df = df.dropna()
            if len(df) < original_len:
                print(f"Dropped {original_len - len(df)} rows containing NaN values")

    # Handle target specification.
    # NumPy integer scalars (e.g. np.int64) are not instances of ``int``, so
    # they are accepted explicitly as positional indices.
    if isinstance(target, (int, np.integer)):
        n_columns = len(df.columns)
        if target < -n_columns or target >= n_columns:
            raise ValueError(f"Target index {target} out of range")
        y = df.iloc[:, target]
        X = df.drop(df.columns[target], axis=1)
    elif isinstance(target, str):
        if target not in df.columns:
            raise ValueError(f"Target column '{target}' not found in data")
        y = df[target]
        X = df.drop(target, axis=1)
    else:
        raise ValueError("Target must be either an integer index or column name")

    # Drop specified features if requested
    if drop is not None:
        if all(isinstance(d, (int, np.integer)) for d in drop):
            # All-integer list: positions within the remaining feature columns
            drop_cols = X.columns[drop]
        else:
            # Anything else is interpreted as column names
            drop_cols = drop
        X = X.drop(columns=drop_cols)

    # Map labels if needed: default is rank of the label in sorted order
    if label_map is None:
        unique_labels = sorted(y.unique())
        label_map = {old: new for new, old in enumerate(unique_labels)}
    y = y.map(label_map)
    return X, y
[docs]
def transform_features(
    X: pd.DataFrame,
    fit: bool = False,
    no_scaling: bool = False,
    encoder: Optional[OneHotEncoder] = None,
    scaler: Optional[StandardScaler] = None,
    categorical_columns: Optional[list] = None,
    handle_nan: str = 'drop'
) -> Tuple[pd.DataFrame, OneHotEncoder, Optional[StandardScaler]]:
    """
    Transform input data using one-hot encoding for categoricals and scaling for numericals.

    The result places the numerical columns first, followed by the one-hot
    encoded columns, and keeps the (possibly NaN-filtered) index of ``X``.

    Parameters
    ----------
    X : pd.DataFrame
        Input data to transform
    fit : bool, default=False
        Whether to fit new encoder/scaler or use existing ones
    no_scaling : bool, default=False
        Whether to skip scaling of numerical features
    encoder : Optional[OneHotEncoder], default=None
        Existing encoder to use if fit=False. Required when fit=False and
        there is at least one categorical column.
    scaler : Optional[StandardScaler], default=None
        Existing scaler to use if fit=False. Required when fit=False,
        no_scaling=False and there is at least one numerical column.
    categorical_columns : Optional[list], default=None
        List of categorical column names. If None, the columns with
        ``object`` dtype are used. Every other column is treated as numerical.
    handle_nan : str, default='drop'
        How to handle NaN values. Options are:
        - 'drop': Drop rows containing any NaN values
        - 'error': Raise an error if NaN values are found
        - 'warn': Print a warning if NaN values are found but continue

    Returns
    -------
    Tuple[pd.DataFrame, OneHotEncoder, Optional[StandardScaler]]
        X_transformed: Transformed DataFrame
        encoder: The encoder that was used. With fit=True this is a new
            OneHotEncoder (left unfitted if there are no categorical
            columns); with fit=False it is the ``encoder`` argument.
        scaler: The scaler that was used. None if no_scaling=True and
            numerical columns exist; the ``scaler`` argument is passed
            through unchanged if there are no numerical columns.

    Raises
    ------
    ValueError
        If handle_nan is not one of ['drop', 'error', 'warn']
        If handle_nan='error' and NaN values are found
        If fit=False and a required encoder or scaler was not supplied
    """
    if handle_nan not in ['drop', 'error', 'warn']:
        raise ValueError("handle_nan must be one of ['drop', 'error', 'warn']")

    # Handle NaN values
    if X.isnull().any().any():
        if handle_nan == 'error':
            raise ValueError("Data contains NaN values")
        elif handle_nan == 'warn':
            print(f"Warning: Data contains NaN values in columns: {X.columns[X.isnull().any()].tolist()}")
        elif handle_nan == 'drop':
            # Only X is filtered here; use the returned index to realign
            # anything (e.g. a target) that was row-aligned with the input.
            original_len = len(X)
            X = X.dropna()
            if len(X) < original_len:
                print(f"Dropped {original_len - len(X)} rows containing NaN values")

    if categorical_columns is None:
        categorical_columns = X.select_dtypes(include=["object"]).columns.tolist()
    has_categorical = len(categorical_columns) > 0

    # Handle categorical features
    if fit:
        encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
    if has_categorical:
        if encoder is None:
            # Fail with a clear message instead of an AttributeError on None
            raise ValueError(
                "An encoder must be provided when fit=False and categorical columns are present"
            )
        if fit:
            encoded = encoder.fit_transform(X[categorical_columns])
        else:
            encoded = encoder.transform(X[categorical_columns])
        categorical_features = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out(categorical_columns),
            index=X.index
        )
    else:
        # No categorical input: an empty frame that still carries the index
        categorical_features = pd.DataFrame(index=X.index)

    # Handle numerical features (everything that is not categorical)
    numerical_columns = X.drop(columns=categorical_columns, axis=1).columns.tolist()
    if len(numerical_columns) == 0:
        return categorical_features, encoder, scaler

    if no_scaling:
        numerical_df = X[numerical_columns].copy()
        scaler = None
    else:
        if fit:
            scaler = StandardScaler()
            numerical_features = scaler.fit_transform(X[numerical_columns])
        else:
            if scaler is None:
                # Fail with a clear message instead of an AttributeError on None
                raise ValueError(
                    "A scaler must be provided when fit=False and no_scaling=False"
                )
            numerical_features = scaler.transform(X[numerical_columns])
        numerical_df = pd.DataFrame(numerical_features, columns=numerical_columns, index=X.index)

    X_transformed = pd.concat([numerical_df, categorical_features], axis=1)
    return X_transformed, encoder, scaler