
"""Fitting of data scalers.

Functions
---------
fit_data_scaler_from_dataset
    Get fitted data scaler for given data set features type.
mean_std_batch_fit
    Perform batch fitting of standardization data scaler.
min_max_batch_fit
    Perform batch fitting of min-max data scaler.
"""
#
#                                                                       Modules
# =============================================================================
# Third-party
import torch
import tqdm
import sklearn.preprocessing
# Local
from time_series_data.time_dataset import get_time_series_data_loader
from utilities.data_scalers import TorchStandardScaler, TorchMinMaxScaler
#
#                                                          Authorship & Credits
# =============================================================================
__author__ = 'Bernardo Ferreira (bernardo_ferreira@brown.edu)'
__credits__ = ['Bernardo Ferreira', ]
__status__ = 'Stable'
# =============================================================================
#
# =============================================================================
def fit_data_scaler_from_dataset(dataset, features_type, n_features,
                                 scaling_type='mean-std',
                                 scaling_parameters=None):
    """Fit features type data scaler from given data set.

    Data scaler normalization tensors are fitted from the given data set,
    overriding any provided data scaling parameters.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    features_type : str
        Features for which data scaler is fitted (e.g., 'features_in',
        'features_out'). Must be directly available from data set samples.
    n_features : int
        Number of features (dimensionality).
    scaling_type : {'min-max', 'mean-std'}, default='mean-std'
        Type of data scaling: min-max scaling ('min-max') or standardization
        ('mean-std').
    scaling_parameters : dict, default=None
        Data scaling parameters (item, dict) for each features type
        (key, str). For 'min-max' data scaling, the parameters are the
        'minimum' and 'maximum' features normalization tensors, as well as
        the 'norm_minimum' and 'norm_maximum' normalization bounds. For
        'mean-std' data scaling, the parameters are the 'mean' and 'std'
        features normalization tensors.

    Returns
    -------
    data_scaler : {TorchStandardScaler, TorchMinMaxScaler}
        Data scaler.
    """
    # Avoid a mutable default argument
    if scaling_parameters is None:
        scaling_parameters = {}
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set available data scaling types
    available_scaling_types = ('mean-std', 'min-max')
    # Check data scaling type
    if scaling_type not in available_scaling_types:
        raise RuntimeError(f'Unknown data scaling type \'{scaling_type}\'.'
                           f'\n\nAvailable data scaling types: '
                           f'{available_scaling_types}')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Get scaling parameters for the given features type
    scaling_parameters_type = scaling_parameters.get(features_type, {})
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Fit data scaler
    if scaling_type == 'mean-std':
        # Initialize data scaler
        data_scaler = TorchStandardScaler(n_features,
                                          **scaling_parameters_type)
        # Perform batch fitting of data scaler parameters
        mean, std = mean_std_batch_fit(dataset, features_type, n_features)
        # Set fitted data scaler
        data_scaler.set_mean_and_std(mean, std)
    elif scaling_type == 'min-max':
        # Initialize data scaler
        data_scaler = TorchMinMaxScaler(n_features, **scaling_parameters_type)
        # Perform batch fitting of data scaler parameters
        minimum, maximum = min_max_batch_fit(dataset, features_type,
                                             n_features)
        # Set fitted data scaler
        data_scaler.set_minimum_and_maximum(minimum, maximum)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return data_scaler
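# =============================================================================
# Illustrative usage (a hedged sketch, not part of the module API): fitting a
# min-max data scaler with custom normalization bounds. 'my_dataset' is a
# hypothetical torch.utils.data.Dataset whose samples follow the dictionary
# format documented above; the nested scaling_parameters keys follow the
# docstring of fit_data_scaler_from_dataset.
#
#     data_scaler = fit_data_scaler_from_dataset(
#         dataset=my_dataset, features_type='features_in', n_features=4,
#         scaling_type='min-max',
#         scaling_parameters={'features_in': {'norm_minimum': -1.0,
#                                             'norm_maximum': 1.0}})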
# =============================================================================
def mean_std_batch_fit(dataset, features_type, n_features, is_verbose=False):
    """Perform batch fitting of standardization data scaler.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    features_type : str
        Features for which data scaler is fitted (e.g., 'features_in',
        'features_out'). Must be directly available from data set samples.
    n_features : int
        Number of features (dimensionality).
    is_verbose : bool, default=False
        If True, enable verbose output.

    Returns
    -------
    mean : torch.Tensor
        Features standardization mean tensor stored as a torch.Tensor with
        shape (n_features,).
    std : torch.Tensor
        Features standardization standard deviation tensor stored as a
        torch.Tensor with shape (n_features,).

    Notes
    -----
    A biased estimator is used to compute the standard deviation, in
    accordance with the scikit-learn 1.3.2 documentation
    (sklearn.preprocessing.StandardScaler).
    """
    # Instantiate data scaler
    data_scaler = sklearn.preprocessing.StandardScaler()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data loader
    data_loader = get_time_series_data_loader(dataset=dataset)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Loop over samples
    for sample in tqdm.tqdm(data_loader, desc='> Processing data samples: ',
                            disable=not is_verbose):
        # Check sample
        if not isinstance(sample, dict):
            raise RuntimeError('Time series sample must be a dictionary '
                               'where each feature (key, str) data is a '
                               'torch.Tensor(2d) of shape '
                               '(sequence_length, n_features).')
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Check sample features tensor
        if features_type not in sample:
            raise RuntimeError(f'Features type \'{features_type}\' is not '
                               f'available from sample.')
        else:
            features_tensor = sample[features_type]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Process sample to fit data scaler
        if isinstance(features_tensor, torch.Tensor):
            # Check number of features
            if features_tensor.shape[-1] != n_features:
                raise RuntimeError(f'Mismatch between features tensor '
                                   f'({features_tensor.shape[-1]}) and '
                                   f'number of expected features '
                                   f'({n_features}) for features type: '
                                   f'\'{features_type}\'')
            # Process sample
            data_scaler.partial_fit(features_tensor[:, 0, :].clone())
        else:
            raise RuntimeError('Sample features tensor is not torch.Tensor.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Get fitted mean and standard deviation tensors
    mean = torch.tensor(data_scaler.mean_)
    std = torch.sqrt(torch.tensor(data_scaler.var_))
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Check features standardization mean tensor
    if not isinstance(mean, torch.Tensor):
        raise RuntimeError('Features standardization mean tensor is not a '
                           'torch.Tensor.')
    elif len(mean) != n_features:
        raise RuntimeError('Features standardization mean tensor does not '
                           'have shape (n_features,).')
    # Check features standardization standard deviation tensor
    if not isinstance(std, torch.Tensor):
        raise RuntimeError('Features standardization standard deviation '
                           'tensor is not a torch.Tensor.')
    elif len(std) != n_features:
        raise RuntimeError('Features standardization standard deviation '
                           'tensor does not have shape (n_features,).')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return mean, std
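# =============================================================================
# Illustrative usage (a hedged sketch): the tensors returned by
# mean_std_batch_fit plug directly into TorchStandardScaler, mirroring what
# fit_data_scaler_from_dataset does internally. 'my_dataset' is a hypothetical
# data set matching the documented sample format.
#
#     mean, std = mean_std_batch_fit(my_dataset,
#                                    features_type='features_out',
#                                    n_features=3, is_verbose=True)
#     data_scaler = TorchStandardScaler(3)
#     data_scaler.set_mean_and_std(mean, std)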
# =============================================================================
def min_max_batch_fit(dataset, features_type, n_features, is_verbose=False):
    """Perform batch fitting of min-max data scaler.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    features_type : str
        Features for which data scaler is fitted (e.g., 'features_in',
        'features_out'). Must be directly available from data set samples.
    n_features : int
        Number of features (dimensionality).
    is_verbose : bool, default=False
        If True, enable verbose output.

    Returns
    -------
    minimum : torch.Tensor
        Features normalization minimum tensor stored as a torch.Tensor with
        shape (n_features,).
    maximum : torch.Tensor
        Features normalization maximum tensor stored as a torch.Tensor with
        shape (n_features,).
    """
    # Instantiate data scaler
    data_scaler = sklearn.preprocessing.MinMaxScaler()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data loader
    data_loader = get_time_series_data_loader(dataset=dataset)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Loop over samples
    for sample in tqdm.tqdm(data_loader, desc='> Processing data samples: ',
                            disable=not is_verbose):
        # Check sample
        if not isinstance(sample, dict):
            raise RuntimeError('Time series sample must be a dictionary '
                               'where each feature (key, str) data is a '
                               'torch.Tensor(2d) of shape '
                               '(sequence_length, n_features).')
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Check sample features tensor
        if features_type not in sample:
            raise RuntimeError(f'Features type \'{features_type}\' is not '
                               f'available from sample.')
        else:
            features_tensor = sample[features_type]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Process sample to fit data scaler
        if isinstance(features_tensor, torch.Tensor):
            # Check number of features
            if features_tensor.shape[-1] != n_features:
                raise RuntimeError(f'Mismatch between features tensor '
                                   f'({features_tensor.shape[-1]}) and '
                                   f'number of expected features '
                                   f'({n_features}) for features type: '
                                   f'\'{features_type}\'')
            # Process sample
            data_scaler.partial_fit(features_tensor[:, 0, :].clone())
        else:
            raise RuntimeError('Sample features tensor is not torch.Tensor.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Get fitted minimum and maximum tensors
    minimum = torch.tensor(data_scaler.data_min_)
    maximum = torch.tensor(data_scaler.data_max_)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Check features minimum tensor
    if not isinstance(minimum, torch.Tensor):
        raise RuntimeError('Features minimum tensor is not a torch.Tensor.')
    elif len(minimum) != n_features:
        raise RuntimeError('Features minimum tensor does not have shape '
                           '(n_features,).')
    # Check features maximum tensor
    if not isinstance(maximum, torch.Tensor):
        raise RuntimeError('Features maximum tensor is not a torch.Tensor.')
    elif len(maximum) != n_features:
        raise RuntimeError('Features maximum tensor does not have shape '
                           '(n_features,).')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return minimum, maximum
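# =============================================================================
# Illustrative usage (a hedged sketch): min_max_batch_fit returns the observed
# per-feature data minimum and maximum, while the target normalization bounds
# are set on TorchMinMaxScaler itself. 'my_dataset' is hypothetical, and the
# 'norm_minimum'/'norm_maximum' keyword names follow the scaling parameters
# documented in fit_data_scaler_from_dataset.
#
#     minimum, maximum = min_max_batch_fit(my_dataset,
#                                          features_type='features_in',
#                                          n_features=3)
#     data_scaler = TorchMinMaxScaler(3, norm_minimum=-1.0, norm_maximum=1.0)
#     data_scaler.set_minimum_and_maximum(minimum, maximum)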