
"""Fitting of data scalers.

Functions
---------
fit_data_scaler_from_dataset
    Get fitted data scaler for given data set features type.
mean_std_batch_fit
    Perform batch fitting of standardization data scaler.
min_max_batch_fit
    Perform batch fitting of min-max data scaler.
"""
#
#                                                                       Modules
# =============================================================================
# Third-party
import torch
import tqdm
import sklearn.preprocessing
# Local
from time_series_data.time_dataset import get_time_series_data_loader
from utilities.data_scalers import TorchStandardScaler, TorchMinMaxScaler
#
#                                                          Authorship & Credits
# =============================================================================
__author__ = 'Bernardo Ferreira (bernardo_ferreira@brown.edu)'
__credits__ = ['Bernardo Ferreira', ]
__status__ = 'Stable'
# =============================================================================
#
# =============================================================================
def fit_data_scaler_from_dataset(dataset, features_type, n_features,
                                 scaling_type='mean-std',
                                 scaling_parameters=None):
    """Fit features type data scaler from given data set.

    Data scaler normalization tensors are fitted from the given data set,
    overriding any provided data scaling parameters.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    features_type : str
        Features for which data scaler is fitted (e.g., 'features_in',
        'features_out'). Must be directly available from data set samples.
    n_features : int
        Number of features (dimensionality).
    scaling_type : {'min-max', 'mean-std'}, default='mean-std'
        Type of data scaling: min-max scaling ('min-max') or standardization
        ('mean-std').
    scaling_parameters : dict, default=None
        Data scaling parameters (item, dict) for each features type
        (key, str). For 'min-max' data scaling, the parameters are the
        'minimum' and 'maximum' features normalization tensors, as well as
        the 'norm_minimum' and 'norm_maximum' normalization bounds. For
        'mean-std' data scaling, the parameters are the 'mean' and 'std'
        features normalization tensors.

    Returns
    -------
    data_scaler : {TorchStandardScaler, TorchMinMaxScaler}
        Data scaler.
    """
    # Avoid a mutable default argument
    if scaling_parameters is None:
        scaling_parameters = {}
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set available data scaling types
    available_scaling_types = ('mean-std', 'min-max')
    # Check data scaling type
    if scaling_type not in available_scaling_types:
        raise RuntimeError(f'Unknown data scaling type \'{scaling_type}\'.'
                           f'\n\nAvailable data scaling types: '
                           f'{available_scaling_types}')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Get scaling parameters for the given features type
    scaling_parameters_type = scaling_parameters.get(features_type, {})
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Fit data scaler
    if scaling_type == 'mean-std':
        # Initialize data scaler
        data_scaler = TorchStandardScaler(n_features,
                                          **scaling_parameters_type)
        # Perform batch fitting of data scaler parameters
        mean, std = mean_std_batch_fit(dataset, features_type, n_features)
        # Set fitted data scaler
        data_scaler.set_mean_and_std(mean, std)
    elif scaling_type == 'min-max':
        # Initialize data scaler
        data_scaler = TorchMinMaxScaler(n_features, **scaling_parameters_type)
        # Perform batch fitting of data scaler parameters
        minimum, maximum = min_max_batch_fit(dataset, features_type,
                                             n_features)
        # Set fitted data scaler
        data_scaler.set_minimum_and_maximum(minimum, maximum)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return data_scaler
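# =============================================================================
# Illustrative usage (a hedged sketch, not part of the module API): fitting a
# min-max data scaler with custom normalization bounds. 'my_dataset' is a
# hypothetical torch.utils.data.Dataset whose samples follow the dictionary
# format documented above; the nested scaling_parameters keys follow the
# docstring of fit_data_scaler_from_dataset.
#
#     data_scaler = fit_data_scaler_from_dataset(
#         dataset=my_dataset, features_type='features_in', n_features=4,
#         scaling_type='min-max',
#         scaling_parameters={'features_in': {'norm_minimum': -1.0,
#                                             'norm_maximum': 1.0}})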
# =============================================================================
def mean_std_batch_fit(dataset, features_type, n_features, is_verbose=False):
    """Perform batch fitting of standardization data scaler.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    features_type : str
        Features for which data scaler is fitted (e.g., 'features_in',
        'features_out'). Must be directly available from data set samples.
    n_features : int
        Number of features (dimensionality).
    is_verbose : bool, default=False
        If True, enable verbose output.

    Returns
    -------
    mean : torch.Tensor
        Features standardization mean tensor stored as a torch.Tensor with
        shape (n_features,).
    std : torch.Tensor
        Features standardization standard deviation tensor stored as a
        torch.Tensor with shape (n_features,).

    Notes
    -----
    A biased estimator is used to compute the standard deviation, in
    accordance with the scikit-learn 1.3.2 documentation
    (sklearn.preprocessing.StandardScaler).
    """
    # Instantiate data scaler
    data_scaler = sklearn.preprocessing.StandardScaler()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data loader
    data_loader = get_time_series_data_loader(dataset=dataset)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Loop over samples
    for sample in tqdm.tqdm(data_loader, desc='> Processing data samples: ',
                            disable=not is_verbose):
        # Check sample
        if not isinstance(sample, dict):
            raise RuntimeError('Time series sample must be a dictionary '
                               'where each feature (key, str) data is a '
                               'torch.Tensor(2d) of shape '
                               '(sequence_length, n_features).')
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Check sample features tensor
        if features_type not in sample:
            raise RuntimeError(f'Features type \'{features_type}\' is not '
                               f'available from sample.')
        else:
            features_tensor = sample[features_type]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Process sample to fit data scaler
        if isinstance(features_tensor, torch.Tensor):
            # Check number of features
            if features_tensor.shape[-1] != n_features:
                raise RuntimeError(f'Mismatch between features tensor '
                                   f'({features_tensor.shape[-1]}) and '
                                   f'number of expected features '
                                   f'({n_features}) for features type: '
                                   f'\'{features_type}\'')
            # Process sample
            data_scaler.partial_fit(features_tensor[:, 0, :].clone())
        else:
            raise RuntimeError('Sample features tensor is not torch.Tensor.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Get fitted mean and standard deviation tensors
    mean = torch.tensor(data_scaler.mean_)
    std = torch.sqrt(torch.tensor(data_scaler.var_))
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Check features standardization mean tensor
    if not isinstance(mean, torch.Tensor):
        raise RuntimeError('Features standardization mean tensor is not a '
                           'torch.Tensor.')
    elif len(mean) != n_features:
        raise RuntimeError('Features standardization mean tensor does not '
                           'have shape (n_features,).')
    # Check features standardization standard deviation tensor
    if not isinstance(std, torch.Tensor):
        raise RuntimeError('Features standardization standard deviation '
                           'tensor is not a torch.Tensor.')
    elif len(std) != n_features:
        raise RuntimeError('Features standardization standard deviation '
                           'tensor does not have shape (n_features,).')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return mean, std
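# =============================================================================
# Illustrative usage (a hedged sketch): the tensors returned by
# mean_std_batch_fit plug directly into TorchStandardScaler, mirroring what
# fit_data_scaler_from_dataset does internally. 'my_dataset' is a hypothetical
# data set matching the documented sample format.
#
#     mean, std = mean_std_batch_fit(my_dataset,
#                                    features_type='features_out',
#                                    n_features=3, is_verbose=True)
#     data_scaler = TorchStandardScaler(3)
#     data_scaler.set_mean_and_std(mean, std)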
# =============================================================================
def min_max_batch_fit(dataset, features_type, n_features, is_verbose=False):
    """Perform batch fitting of min-max data scaler.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    features_type : str
        Features for which data scaler is fitted (e.g., 'features_in',
        'features_out'). Must be directly available from data set samples.
    n_features : int
        Number of features (dimensionality).
    is_verbose : bool, default=False
        If True, enable verbose output.

    Returns
    -------
    minimum : torch.Tensor
        Features normalization minimum tensor stored as a torch.Tensor with
        shape (n_features,).
    maximum : torch.Tensor
        Features normalization maximum tensor stored as a torch.Tensor with
        shape (n_features,).
    """
    # Instantiate data scaler
    data_scaler = sklearn.preprocessing.MinMaxScaler()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data loader
    data_loader = get_time_series_data_loader(dataset=dataset)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Loop over samples
    for sample in tqdm.tqdm(data_loader, desc='> Processing data samples: ',
                            disable=not is_verbose):
        # Check sample
        if not isinstance(sample, dict):
            raise RuntimeError('Time series sample must be a dictionary '
                               'where each feature (key, str) data is a '
                               'torch.Tensor(2d) of shape '
                               '(sequence_length, n_features).')
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Check sample features tensor
        if features_type not in sample:
            raise RuntimeError(f'Features type \'{features_type}\' is not '
                               f'available from sample.')
        else:
            features_tensor = sample[features_type]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Process sample to fit data scaler
        if isinstance(features_tensor, torch.Tensor):
            # Check number of features
            if features_tensor.shape[-1] != n_features:
                raise RuntimeError(f'Mismatch between features tensor '
                                   f'({features_tensor.shape[-1]}) and '
                                   f'number of expected features '
                                   f'({n_features}) for features type: '
                                   f'\'{features_type}\'')
            # Process sample
            data_scaler.partial_fit(features_tensor[:, 0, :].clone())
        else:
            raise RuntimeError('Sample features tensor is not torch.Tensor.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Get fitted minimum and maximum tensors
    minimum = torch.tensor(data_scaler.data_min_)
    maximum = torch.tensor(data_scaler.data_max_)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Check features minimum tensor
    if not isinstance(minimum, torch.Tensor):
        raise RuntimeError('Features minimum tensor is not a torch.Tensor.')
    elif len(minimum) != n_features:
        raise RuntimeError('Features minimum tensor does not have shape '
                           '(n_features,).')
    # Check features maximum tensor
    if not isinstance(maximum, torch.Tensor):
        raise RuntimeError('Features maximum tensor is not a torch.Tensor.')
    elif len(maximum) != n_features:
        raise RuntimeError('Features maximum tensor does not have shape '
                           '(n_features,).')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return minimum, maximum
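# =============================================================================
# Illustrative usage (a hedged sketch): min_max_batch_fit returns the observed
# per-feature data minimum and maximum, while the target normalization bounds
# are set on TorchMinMaxScaler itself. 'my_dataset' is hypothetical, and the
# 'norm_minimum'/'norm_maximum' keyword names follow the scaling parameters
# documented in fit_data_scaler_from_dataset.
#
#     minimum, maximum = min_max_batch_fit(my_dataset,
#                                          features_type='features_in',
#                                          n_features=3)
#     data_scaler = TorchMinMaxScaler(3, norm_minimum=-1.0, norm_maximum=1.0)
#     data_scaler.set_minimum_and_maximum(minimum, maximum)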