Source code for gpytorchwrapper.src.data.data_transform
import logging
from typing import Optional
import pandas as pd
from gpytorchwrapper.src.config.config_classes import TransformConf
from gpytorchwrapper.src.config.model_factory import get_transformer
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
[docs]
def transform_data(
x: pd.DataFrame, transformer: object, columns: list[int] | None = None
) -> pd.DataFrame | tuple[pd.DataFrame, object]:
"""
Transform the input data using the selected transformer
Parameters
-----------
x : pd.DataFrame
The input data
transformer : object
The selected transformer
columns : list, optional
The columns on which the transformer has to operate
Returns
--------
x : pd.DataFrame
The transformed input data
transformer : object
The fitted transformer
"""
# scikit-learn transformers have this attribute n_features_in_ when they are fitted
# check to see if the transformer already has been fitted
def is_fit_called(obj):
return hasattr(obj, "n_features_in_")
if not isinstance(x, pd.DataFrame) and not isinstance(x, pd.Series):
raise NotImplementedError(f"{type(x)} is not a pandas DataFrame or Series.")
if columns is not None:
if all(isinstance(column, str) for column in columns):
columns_to_transform = columns
elif all(isinstance(column, int) for column in columns):
columns_to_transform = x.columns[columns]
else:
raise NotImplementedError(
"All items in the columns list should be the same type (int or string)."
)
X_transformed = x[columns_to_transform].copy()
if is_fit_called(transformer):
X_transformed = transformer.transform(X_transformed.values)
x.loc[:, columns_to_transform] = X_transformed
return x
else:
X_transformed = transformer.fit_transform(X_transformed.values)
x.loc[:, columns_to_transform] = X_transformed
return x, transformer
else:
if is_fit_called(transformer):
x = pd.DataFrame(transformer.transform(x.values))
return x
else:
x = pd.DataFrame(transformer.fit_transform(x.values))
return x, transformer
[docs]
def transform(
    train_x: pd.DataFrame,
    train_y: pd.DataFrame,
    test_x: Optional[pd.DataFrame],
    test_y: Optional[pd.DataFrame],
    transform_conf: TransformConf,
) -> tuple[
    pd.DataFrame,
    Optional[pd.DataFrame],
    pd.DataFrame,
    Optional[pd.DataFrame],
    Optional[object],
    Optional[object],
]:
    """
    Applies transformations to training and test datasets based on configuration.

    For inputs and outputs independently: when the corresponding
    ``transform_data`` flag in the configuration is set, a transformer is
    obtained from ``get_transformer``, passed with the training data to
    ``transform_data`` (which fits it if it is not fitted yet), and then
    passed again with the test data when test data is provided. Otherwise the
    data is passed through unchanged and the transformer slot in the result
    is None.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Input features for the training dataset.
    train_y : pandas.DataFrame
        Output targets for the training dataset.
    test_x : pandas.DataFrame or None
        Input features for the test dataset, or None if not provided.
    test_y : pandas.DataFrame or None
        Output targets for the test dataset, or None if not provided.
    transform_conf : TransformConf
        Configuration object containing settings for input and output
        transformations. This function reads ``transform_input`` and
        ``transform_output`` from it, and on each of those the
        ``transform_data`` flag; for the input it also reads ``columns``.

    Returns
    -------
    tuple
        A tuple containing, in this order:

        - Transformed training input features (pandas.DataFrame)
        - Transformed test input features (pandas.DataFrame or None)
        - Transformed training targets (pandas.DataFrame)
        - Transformed test targets (pandas.DataFrame or None)
        - Input transformer object used or None
        - Output transformer object used or None
    """
    # Resolves to the same logger object as the module-level ``logger``,
    # so records carry this module's name instead of "root".
    log = logging.getLogger(__name__)
    log.info("Transforming data.")

    transform_input = transform_conf.transform_input
    transform_output = transform_conf.transform_output

    # Transform the input
    if transform_input.transform_data:
        input_transformer = get_transformer(transform_input)
        train_x, input_transformer = transform_data(
            train_x, input_transformer, transform_input.columns
        )
        if test_x is not None:
            # NOTE: transform_data returns only the DataFrame here because it
            # treats the transformer as fitted once it has ``n_features_in_``.
            test_x = transform_data(test_x, input_transformer, transform_input.columns)
        log.info("Transforming input values using %s.", input_transformer)
    else:
        input_transformer = None
        log.info("Using raw input values.")

    # Transform the output; targets are always transformed as a whole.
    if transform_output.transform_data:
        output_transformer = get_transformer(transform_output)
        train_y, output_transformer = transform_data(
            train_y, output_transformer, columns=None
        )
        if test_y is not None:
            test_y = transform_data(test_y, output_transformer, columns=None)
        log.info("Transforming output values using %s.\n", output_transformer)
    else:
        output_transformer = None
        log.info("Using raw output values.\n")

    return train_x, test_x, train_y, test_y, input_transformer, output_transformer