# Source code for time_series_data.time_dataset

"""Time series data set.

Classes
-------
TimeSeriesDatasetInMemory(torch.utils.data.Dataset)
    Time series data set (in-memory storage only).
TimeSeriesDataset(torch.utils.data.Dataset)
    Time series data set.

Functions
---------
time_collator
    Collate time series batch data.
get_time_series_data_loader
    Get time series data set data loader.
save_dataset
    Save PyTorch time series data set to file.
load_dataset
    Load PyTorch time series data set.
change_dataset_features_labels
    Change time series data set features labels.
add_dataset_feature_init
    Add feature initialization to all samples of time series data set.
concatenate_dataset_features
    Concatenate existing features of time series data set into new feature.
sum_dataset_features
    Sum existing features of time series data set into new feature.
write_time_series_dataset_summary_file
    Write summary data file for time series data set generation.
split_dataset
    Randomly split data set into non-overlapping subsets.
get_parent_dataset_indices
    Get parent data set indices from subset indices.
"""
#
#                                                                       Modules
# =============================================================================
# Standard
import os
import datetime
import copy
import pickle
# Third-party
import torch
import numpy as np
# Local
from ioput.iostandard import write_summary_file
#
#                                                          Authorship & Credits
# =============================================================================
__author__ = 'Bernardo Ferreira (bernardo_ferreira@brown.edu)'
__credits__ = ['Bernardo Ferreira', ]
__status__ = 'Stable'
# =============================================================================
#
# =============================================================================
def time_collator(batch_data):
    """Collate time series batch data.
    
    Ignores all features that are not stored as a torch.Tensor(2d) of shape
    (sequence_length, n_features).
    
    Parameters
    ----------
    batch_data : list[dict]
        Each batch sample data is stored as a dictionary where each feature
        (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
        
    Returns
    -------
    batch : dict
        Collated batch sample data for each feature (key, str), stored as
        torch.Tensor(3d) of shape (sequence_length, batch_size, n_features).
    """
    # Probe features from first batch sample
    features = tuple(batch_data[0].keys())
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Initialize collated batch sample data
    batch = {}
    # Loop over features
    for feature in features:
        # Probe feature type and shape from first batch sample
        is_batchable = (isinstance(batch_data[0][feature], torch.Tensor)
                        and len(batch_data[0][feature].shape) == 2)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Collate batch sample feature data
        if is_batchable:
            batch[feature] = torch.stack([x[feature] for x in batch_data], 1)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return batch
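# =============================================================================
# Minimal usage sketch for time_collator (illustration only; the feature
# label 'force' and the tensor shapes are arbitrary assumptions, not part of
# the original API):
def _demo_time_collator():
    """Collate a small synthetic batch (illustrative sketch)."""
    # Build two samples, each with one feature of shape
    # (sequence_length=5, n_features=3)
    batch_data = [{'force': torch.rand(5, 3)} for _ in range(2)]
    # Collate batch samples
    batch = time_collator(batch_data)
    # Collated feature has shape (sequence_length, batch_size, n_features)
    assert batch['force'].shape == (5, 2, 3)
    return batch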
# =============================================================================
def get_time_series_data_loader(dataset, batch_size=1, is_shuffle=False,
                                **kwargs):
    """Get time series data set data loader.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    batch_size : int, default=1
        Number of samples loaded per batch.
    is_shuffle : bool, default=False
        Reshuffle data set at every epoch.
    **kwargs
        Arguments of torch.utils.data.DataLoader.

    Returns
    -------
    data_loader : torch.utils.data.DataLoader
        Time series data set data loader.
    """
    # Time series data set data loader
    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=is_shuffle,
                                              collate_fn=time_collator,
                                              **kwargs)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return data_loader
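# =============================================================================
# Minimal usage sketch for get_time_series_data_loader (illustration only;
# the feature label 'strain' and the tensor shapes are arbitrary
# assumptions): build a data loader over an in-memory data set and fetch one
# collated batch.
def _demo_get_time_series_data_loader():
    """Build a data loader and fetch one batch (illustrative sketch)."""
    # Build a small in-memory data set with 8 samples
    samples = [{'strain': torch.rand(10, 6)} for _ in range(8)]
    dataset = TimeSeriesDatasetInMemory(samples)
    # Get data loader with batch size 4
    data_loader = get_time_series_data_loader(dataset, batch_size=4)
    # Fetch first collated batch of shape
    # (sequence_length, batch_size, n_features) = (10, 4, 6)
    batch = next(iter(data_loader))
    assert batch['strain'].shape == (10, 4, 6)
    return batch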
# =============================================================================
def save_dataset(dataset, dataset_basename, dataset_directory,
                 is_append_n_sample=True):
    """Save PyTorch time series data set to file.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    dataset_basename : str
        Data set file basename.
    dataset_directory : str
        Directory where the time series data set is stored.
    is_append_n_sample : bool, default=True
        If True, then data set size (number of samples) is appended to data
        set filename.

    Returns
    -------
    dataset_file_path : str
        PyTorch data set file path.
    """
    # Check data set directory
    if not os.path.isdir(dataset_directory):
        raise RuntimeError('The data set directory has not been found:\n\n'
                           + dataset_directory)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data set file
    dataset_file = dataset_basename
    # Append data set size
    if is_append_n_sample:
        dataset_file += f'_n{len(dataset)}'
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data set file path
    dataset_file_path = os.path.join(dataset_directory, dataset_file + '.pkl')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Save data set (file handle named to avoid shadowing the file name set
    # above)
    with open(dataset_file_path, 'wb') as file:
        pickle.dump(dataset, file)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return dataset_file_path
# =============================================================================
def load_dataset(dataset_file_path):
    """Load PyTorch time series data set.

    Parameters
    ----------
    dataset_file_path : str
        PyTorch data set file path.

    Returns
    -------
    dataset : torch.utils.data.Dataset
        PyTorch data set.
    """
    # Check PyTorch data set file
    if not os.path.isfile(dataset_file_path):
        raise RuntimeError('PyTorch data set file has not been found:\n\n'
                           + dataset_file_path)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Load PyTorch data set
    with open(dataset_file_path, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Check PyTorch data set
    if not isinstance(dataset, torch.utils.data.Dataset):
        raise RuntimeError('Loaded data set is not a '
                           'torch.utils.data.Dataset.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return dataset
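# =============================================================================
# Minimal usage sketch for save_dataset/load_dataset (illustration only; the
# feature label, tensor shapes, and the temporary directory are arbitrary
# assumptions): round-trip a small in-memory data set through disk.
def _demo_save_load_dataset():
    """Round-trip a data set through save/load (illustrative sketch)."""
    import tempfile
    # Build a small in-memory data set
    samples = [{'strain': torch.rand(10, 6)} for _ in range(4)]
    dataset = TimeSeriesDatasetInMemory(samples)
    # Save data set and load it back from a temporary directory
    with tempfile.TemporaryDirectory() as tmp_dir:
        dataset_file_path = save_dataset(dataset, 'demo_dataset', tmp_dir)
        loaded_dataset = load_dataset(dataset_file_path)
    # Loaded data set preserves the number of samples
    assert len(loaded_dataset) == len(dataset)
    return loaded_dataset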
# =============================================================================
def change_dataset_features_labels(dataset, features_label_map):
    """Change time series data set features labels.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    features_label_map : dict
        Features labels mapping. For each original feature label (key, str),
        specifies the new label (item, str). Nonexistent features are
        silently ignored.

    Returns
    -------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    """
    # Loop over features
    for old_label, new_label in features_label_map.items():
        # Probe feature existence from first sample
        if old_label not in dataset[0].keys():
            continue
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Loop over samples
        for i in range(len(dataset)):
            # Get sample
            sample = dataset[i]
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Change feature label
            sample[new_label] = sample.pop(old_label)
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Update data set sample (in-memory data sets return samples by
            # reference, so the renaming above is already stored)
            if isinstance(dataset, TimeSeriesDataset):
                dataset.update_dataset_sample(i, sample)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return dataset
# =============================================================================
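# Minimal usage sketch for change_dataset_features_labels (illustration only;
# the feature labels 'eps' and 'strain' are arbitrary assumptions): rename a
# feature in all samples.
def _demo_change_dataset_features_labels():
    """Rename a data set feature (illustrative sketch)."""
    # Build a small in-memory data set with feature 'eps'
    dataset = TimeSeriesDatasetInMemory(
        [{'eps': torch.rand(10, 6)} for _ in range(4)])
    # Rename feature 'eps' to 'strain'
    dataset = change_dataset_features_labels(dataset, {'eps': 'strain'})
    # Feature is now available under the new label only
    assert 'strain' in dataset[0].keys() and 'eps' not in dataset[0].keys()
    return dataset
# =============================================================================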
def add_dataset_feature_init(dataset, feature_label, feature_init):
    """Add feature initialization to all samples of time series data set.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    feature_label : str
        Feature label.
    feature_init : torch.Tensor(2d)
        Feature initialization data stored as torch.Tensor(2d) of shape
        (n, n_features), where n is a general size.

    Returns
    -------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    """
    # Check feature initialization
    if not isinstance(feature_init, torch.Tensor) \
            or len(feature_init.shape) != 2:
        raise RuntimeError('Feature initialization was not provided as '
                           'torch.Tensor(2d).')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Loop over samples
    for i in range(len(dataset)):
        # Get sample
        sample = dataset[i]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Add new feature
        sample[str(feature_label)] = feature_init
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Update data set sample
        if isinstance(dataset, TimeSeriesDataset):
            dataset.update_dataset_sample(i, sample)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return dataset
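# =============================================================================
# Minimal usage sketch for add_dataset_feature_init (illustration only; the
# label 'state_init' and the tensor shapes are arbitrary assumptions): add
# the same initialization tensor to all samples.
def _demo_add_dataset_feature_init():
    """Add a feature initialization to all samples (illustrative sketch)."""
    # Build a small in-memory data set
    dataset = TimeSeriesDatasetInMemory(
        [{'strain': torch.rand(10, 6)} for _ in range(4)])
    # Add zero-valued initialization feature of shape (1, 6)
    dataset = add_dataset_feature_init(dataset, 'state_init',
                                       torch.zeros(1, 6))
    assert dataset[0]['state_init'].shape == (1, 6)
    return dataset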
# =============================================================================
def concatenate_dataset_features(dataset, new_feature_label,
                                 cat_features_labels,
                                 is_remove_features=False):
    """Concatenate existing features of time series data set into new feature.

    The new feature is stored as a torch.Tensor(2d) of shape
    (sequence_length, n_concat_features) resulting from the concatenation of
    the existing features tensors along the second dimension. Features are
    concatenated in the order set by cat_features_labels.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    new_feature_label : str
        New feature label.
    cat_features_labels : tuple[str]
        Labels of existing features to be concatenated into new feature.
    is_remove_features : bool, default=False
        If True, then remove concatenated features from data set after
        concatenating the new feature (effective for in-memory data sets
        only).

    Returns
    -------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    """
    # Check concatenation features
    for label in cat_features_labels:
        # Probe feature existence from first sample
        if label not in dataset[0].keys():
            raise RuntimeError(f'The feature "{label}" cannot be '
                               'concatenated because it does not exist in '
                               'the data set.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data set storage type
    is_in_memory_dataset = not isinstance(dataset, TimeSeriesDataset)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Loop over samples
    for i in range(len(dataset)):
        # Get sample
        sample = dataset[i]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Concatenate features
        if len(cat_features_labels) > 1:
            sample[str(new_feature_label)] = torch.cat(
                [sample[label] for label in cat_features_labels], dim=1)
        else:
            sample[str(new_feature_label)] = sample[cat_features_labels[0]]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Remove concatenated features (in-memory data set only)
        if is_remove_features and is_in_memory_dataset:
            # Loop over concatenated features
            for label in cat_features_labels:
                sample.pop(label)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Update data set sample
        if isinstance(dataset, TimeSeriesDataset):
            dataset.update_dataset_sample(i, sample)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return dataset
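# =============================================================================
# Minimal usage sketch for concatenate_dataset_features (illustration only;
# the feature labels and tensor shapes are arbitrary assumptions):
# concatenate two features along the feature dimension.
def _demo_concatenate_dataset_features():
    """Concatenate two features into a new one (illustrative sketch)."""
    # Build a small in-memory data set with two features
    dataset = TimeSeriesDatasetInMemory(
        [{'strain': torch.rand(10, 6), 'time': torch.rand(10, 1)}
         for _ in range(4)])
    # Concatenate features into new feature of shape (10, 6 + 1)
    dataset = concatenate_dataset_features(dataset, 'strain_time',
                                           ('strain', 'time'))
    assert dataset[0]['strain_time'].shape == (10, 7)
    return dataset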
# =============================================================================
def sum_dataset_features(dataset, new_feature_label, sum_features_labels,
                         features_weights=None, is_remove_features=False):
    """Sum existing features of time series data set into new feature.

    The new feature is stored as a torch.Tensor(2d) of shape
    (sequence_length, n_features) resulting from the weighted elementwise
    sum of the existing features tensors, which must all share the same
    shape.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    new_feature_label : str
        New feature label.
    sum_features_labels : tuple[str]
        Labels of existing features to be summed into new feature.
    features_weights : dict, default=None
        Scalar weights (item, float) multiplied by each existing feature
        (key, str) in the summing process. If None, then defaults to 1.0 for
        all features.
    is_remove_features : bool, default=False
        If True, then remove summed features from data set after computing
        the new feature (effective for in-memory data sets only).

    Returns
    -------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    """
    # Check summing features
    for label in sum_features_labels:
        # Probe feature existence from first sample
        if label not in dataset[0].keys():
            raise RuntimeError(f'The feature "{label}" cannot be summed '
                               'because it does not exist in the data set.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Default to unit weights when no weights are provided
    if features_weights is None:
        features_weights = {}
    # Initialize sum weights
    sum_weights = {}
    # Set sum weights
    for feature_label in sum_features_labels:
        if feature_label in features_weights.keys():
            sum_weights[feature_label] = \
                float(features_weights[feature_label])
        else:
            sum_weights[feature_label] = 1.0
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set data set storage type
    is_in_memory_dataset = not isinstance(dataset, TimeSeriesDataset)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Loop over samples
    for i in range(len(dataset)):
        # Get sample
        sample = dataset[i]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Add features
        if len(sum_features_labels) > 1:
            # Compute new feature
            sample[str(new_feature_label)] = \
                torch.sum(torch.stack([sum_weights[label]*sample[label]
                                       for label in sum_features_labels],
                                      dim=0), dim=0)
        else:
            # Get feature label
            label = sum_features_labels[0]
            # Set new feature
            sample[str(new_feature_label)] = sum_weights[label]*sample[label]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Remove summed features (in-memory data set only)
        if is_remove_features and is_in_memory_dataset:
            # Loop over summed features
            for label in sum_features_labels:
                sample.pop(label)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Update data set sample
        if isinstance(dataset, TimeSeriesDataset):
            dataset.update_dataset_sample(i, sample)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return dataset
# =============================================================================
def write_time_series_dataset_summary_file(
        dataset_directory, n_sample, total_time_sec, avg_time_sample,
        features=None, targets=None, filename='summary'):
    """Write summary data file for time series data set generation.

    Parameters
    ----------
    dataset_directory : str
        Directory where the time series data set is stored.
    n_sample : int
        Data set size (number of samples).
    total_time_sec : int
        Total generation time in seconds.
    avg_time_sample : float
        Average generation time per sample in seconds.
    features : tuple[str], default=None
        Time series data set features.
    targets : tuple[str], default=None
        Time series data set targets.
    filename : str, default='summary'
        Summary file name.
    """
    # Set summary data
    summary_data = {}
    summary_data['n_sample'] = n_sample
    summary_data['features'] = features
    summary_data['targets'] = targets
    summary_data['Total generation time'] = \
        str(datetime.timedelta(seconds=int(total_time_sec)))
    summary_data['Avg. generation time per sample'] = \
        str(datetime.timedelta(seconds=int(avg_time_sample)))
    # Set summary title
    summary_title = 'Summary: Time series data set generation'
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Write summary file
    write_summary_file(
        summary_directory=dataset_directory, filename=filename,
        summary_title=summary_title, **summary_data)
# =============================================================================
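# Minimal usage sketch for sum_dataset_features above (illustration only; the
# feature labels, weights, and tensor shapes are arbitrary assumptions):
# compute a weighted sum of two same-shape features.
def _demo_sum_dataset_features():
    """Weighted sum of two features into a new one (illustrative sketch)."""
    # Build a small in-memory data set with two same-shape features
    dataset = TimeSeriesDatasetInMemory(
        [{'stress_a': torch.ones(10, 6), 'stress_b': torch.ones(10, 6)}
         for _ in range(4)])
    # Sum features with weights: 0.25*stress_a + 0.75*stress_b
    dataset = sum_dataset_features(
        dataset, 'stress', ('stress_a', 'stress_b'),
        features_weights={'stress_a': 0.25, 'stress_b': 0.75})
    assert torch.allclose(dataset[0]['stress'], torch.ones(10, 6))
    return dataset
# =============================================================================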
class TimeSeriesDatasetInMemory(torch.utils.data.Dataset):
    """Time series data set (in-memory storage only).

    Attributes
    ----------
    _dataset_samples : list
        Time series data set samples data. Each sample is stored as a
        dictionary where each feature (key, str) data is a torch.Tensor(2d)
        of shape (sequence_length, n_features).

    Methods
    -------
    __len__(self)
        Return size of data set (number of samples).
    __getitem__(self, index)
        Return data set sample from corresponding index.
    get_dataset_samples(self)
        Get data set samples data.
    add_dataset_samples(self, samples)
        Add samples to data set.
    remove_dataset_samples(self, indices)
        Remove samples from data set.
    from_dataset(cls, dataset)
        Convert data set to TimeSeriesDatasetInMemory data set.
    """
    def __init__(self, dataset_samples):
        """Constructor.

        Parameters
        ----------
        dataset_samples : list[dict]
            Time series data set samples data. Each sample is stored as a
            dictionary where each feature (key, str) data is a
            torch.Tensor(2d) of shape (sequence_length, n_features).
        """
        # Initialize data set from base class
        super(TimeSeriesDatasetInMemory, self).__init__()
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Set data set samples
        self._dataset_samples = list(dataset_samples)
    # -------------------------------------------------------------------------
    def __len__(self):
        """Return size of data set (number of samples).

        Returns
        -------
        n_sample : int
            Data set size (number of samples).
        """
        return len(self._dataset_samples)
    # -------------------------------------------------------------------------
    def __getitem__(self, index):
        """Return data set sample from corresponding index.

        Parameters
        ----------
        index : int
            Index of returned data set sample (index must be in
            [0, n_sample - 1]).

        Returns
        -------
        sample_data : dict
            Data set sample stored as a dictionary where each feature
            (key, str) data is a torch.Tensor(2d) of shape
            (sequence_length, n_features).
        """
        # Get data set sample
        sample_data = self._dataset_samples[index]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        return sample_data
# -------------------------------------------------------------------------
    def get_dataset_samples(self):
        """Get data set samples data.

        Returns
        -------
        dataset_samples : list[dict]
            Time series data set samples data. Each sample is stored as a
            dictionary where each feature (key, str) data is a
            torch.Tensor(2d) of shape (sequence_length, n_features).
        """
        return copy.deepcopy(self._dataset_samples)
# -------------------------------------------------------------------------
    def add_dataset_samples(self, samples):
        """Add samples to data set.

        Parameters
        ----------
        samples : list[dict]
            Time series samples data. Each sample is stored as a dictionary
            where each feature (key, str) data is a torch.Tensor(2d) of
            shape (sequence_length, n_features).
        """
        # Append samples data
        self._dataset_samples += samples
# -------------------------------------------------------------------------
    def remove_dataset_samples(self, indices):
        """Remove samples from data set.

        Parameters
        ----------
        indices : list[int]
            Indices of data set samples to be removed (each index must be in
            [0, n_sample - 1]).
        """
        # Remove samples data
        self._dataset_samples = \
            [sample for i, sample in enumerate(self._dataset_samples)
             if i not in set(indices)]
# -------------------------------------------------------------------------
    @classmethod
    def from_dataset(cls, dataset):
        """Convert data set to TimeSeriesDatasetInMemory data set.

        Parameters
        ----------
        dataset : torch.utils.data.Dataset
            Time series data set. Each sample is stored as a dictionary
            where each feature (key, str) data is a torch.Tensor(2d) of
            shape (sequence_length, n_features).

        Returns
        -------
        dataset : TimeSeriesDatasetInMemory
            Time series data set. Each sample is stored as a dictionary
            where each feature (key, str) data is a torch.Tensor(2d) of
            shape (sequence_length, n_features).
        """
        # Collect data set samples
        dataset_samples = [x for x in dataset]
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Create data set
        dataset = cls(dataset_samples)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        return dataset
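# =============================================================================
# Minimal usage sketch for TimeSeriesDatasetInMemory (illustration only; the
# feature label and tensor shapes are arbitrary assumptions): build an
# in-memory data set, append samples, and remove one by index.
def _demo_in_memory_dataset():
    """Manipulate an in-memory data set (illustrative sketch)."""
    # Build data set with 3 samples
    dataset = TimeSeriesDatasetInMemory(
        [{'strain': torch.rand(10, 6)} for _ in range(3)])
    # Append 2 additional samples
    dataset.add_dataset_samples([{'strain': torch.rand(10, 6)}
                                 for _ in range(2)])
    # Remove first sample
    dataset.remove_dataset_samples([0, ])
    assert len(dataset) == 4
    return dataset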
# =============================================================================
class TimeSeriesDataset(torch.utils.data.Dataset):
    """Time series data set.

    Attributes
    ----------
    _dataset_directory : str
        Directory where the time series data set is stored (all data set
        samples files).
    _dataset_sample_files : list[str]
        Time series data set samples files paths. Each sample file contains
        a dictionary where each feature (key, str) data is a
        torch.Tensor(2d) of shape (sequence_length, n_features).
    _dataset_basename : str
        Data set file base name.

    Methods
    -------
    __len__(self)
        Return size of data set (number of samples).
    __getitem__(self, index)
        Return data set sample from corresponding index.
    update_dataset_sample(self, index, time_series)
        Update data set sample time series data.
    get_dataset_directory(self)
        Get directory where time series data set is stored.
    get_dataset_sample_files(self)
        Get time series data set samples files paths.
    set_dataset_basename(self, dataset_basename)
        Set data set file base name.
    get_dataset_basename(self)
        Get data set file base name.
    update_dataset_file_internal_directory(dataset_file_path, new_directory)
        Update internal directory of stored data set in provided file.
    _update_dataset_directory(self, dataset_directory)
        Update directory where time series data set is stored.
    """
    def __init__(self, dataset_directory, dataset_sample_files,
                 dataset_basename='time_series_dataset'):
        """Constructor.

        Parameters
        ----------
        dataset_directory : str
            Directory where the time series data set is stored (all data set
            samples files).
        dataset_sample_files : list[str]
            Time series data set samples files paths. Each sample file
            contains a dictionary where each feature (key, str) data is a
            torch.Tensor(2d) of shape (sequence_length, n_features).
        dataset_basename : str, default='time_series_dataset'
            Data set file base name.
        """
        # Initialize data set from base class
        super(TimeSeriesDataset, self).__init__()
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Set data set directory
        if not os.path.isdir(dataset_directory):
            raise RuntimeError('The time series data set directory has not '
                               'been found:\n\n' + dataset_directory)
        else:
            self._dataset_directory = os.path.normpath(dataset_directory)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Check data set sample files
        for file_path in dataset_sample_files:
            # Check if sample file exists
            if not os.path.isfile(file_path):
                raise RuntimeError('Time series data set sample file has '
                                   'not been found:\n\n' + file_path)
            elif os.path.dirname(file_path) \
                    != os.path.normpath(dataset_directory):
                raise RuntimeError('Time series data set sample file is not '
                                   'in dataset directory:\n\n'
                                   + dataset_directory)
        # Store data set samples file paths
        self._dataset_sample_files = dataset_sample_files
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Set data set file base name
        self._dataset_basename = str(dataset_basename)
# -------------------------------------------------------------------------
    def __len__(self):
        """Return size of data set (number of samples).

        Returns
        -------
        n_sample : int
            Data set size (number of samples).
        """
        return len(self._dataset_sample_files)
# -------------------------------------------------------------------------
    def __getitem__(self, index):
        """Return data set sample from corresponding index.

        Parameters
        ----------
        index : int
            Index of returned data set sample (index must be in
            [0, n_sample - 1]).

        Returns
        -------
        time_series : dict
            Data set sample defined as a dictionary where each feature
            (key, str) data is a torch.Tensor(2d) of shape
            (sequence_length, n_features).
        """
        # Get data set sample (loaded from the corresponding sample file)
        time_series = torch.load(self._dataset_sample_files[index])
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        return time_series
# -------------------------------------------------------------------------
    def update_dataset_sample(self, index, time_series):
        """Update data set sample time series data.

        Parameters
        ----------
        index : int
            Index of data set sample to be updated (index must be in
            [0, n_sample - 1]).
        time_series : dict
            Data set sample defined as a dictionary where each feature
            (key, str) data is a torch.Tensor(2d) of shape
            (sequence_length, n_features).
        """
        # Update data set sample time series data (overwrite sample file)
        torch.save(time_series, self._dataset_sample_files[index])
# -------------------------------------------------------------------------
    def get_dataset_directory(self):
        """Get directory where time series data set is stored.

        Returns
        -------
        dataset_directory : str
            Directory where the time series data set is stored (all data set
            samples files).
        """
        return self._dataset_directory
# -------------------------------------------------------------------------
    def get_dataset_sample_files(self):
        """Get time series data set samples files paths.

        Returns
        -------
        dataset_sample_files : list[str]
            Time series data set samples files paths. Each sample file
            contains a dictionary where each feature (key, str) data is a
            torch.Tensor(2d) of shape (sequence_length, n_features).
        """
        return self._dataset_sample_files
# -------------------------------------------------------------------------
    def set_dataset_basename(self, dataset_basename):
        """Set data set file base name.

        Parameters
        ----------
        dataset_basename : str
            Data set file base name.
        """
        self._dataset_basename = str(dataset_basename)
# -------------------------------------------------------------------------
    def get_dataset_basename(self):
        """Get data set file base name.

        Returns
        -------
        dataset_basename : str
            Data set file base name.
        """
        return self._dataset_basename
# -------------------------------------------------------------------------
    @staticmethod
    def update_dataset_file_internal_directory(dataset_file_path,
                                               new_directory):
        """Update internal directory of stored data set in provided file.

        Update is only performed if the new directory does not match the
        internal directory of the stored data set.

        Parameters
        ----------
        dataset_file_path : str
            Data set file path.
        new_directory : str
            Data set new directory.
        """
        # Check new data set directory
        if not os.path.isdir(new_directory):
            raise RuntimeError('The new data set directory has not been '
                               'found:\n\n' + new_directory)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Load PyTorch data set
        loaded_dataset = load_dataset(dataset_file_path)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Check data set type
        if isinstance(loaded_dataset, torch.utils.data.Subset):
            # Get parent data set
            dataset = loaded_dataset.dataset
        elif isinstance(loaded_dataset, TimeSeriesDataset):
            # Get data set
            dataset = loaded_dataset
        else:
            raise RuntimeError('The data set must be either a '
                               'torch.utils.data.Subset (extracted from a '
                               'TimeSeriesDataset data set) or a '
                               'TimeSeriesDataset data set.')
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Get loaded PyTorch data set internal directory
        stored_directory = dataset.get_dataset_directory()
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Update PyTorch data set file if the new directory does not match
        # the internal directory of the loaded data set
        if stored_directory != os.path.normpath(new_directory):
            # Update directory of PyTorch data set
            dataset._update_dataset_directory(new_directory)
            # Save updated PyTorch data set (overwrite existing data set
            # file)
            with open(dataset_file_path, 'wb') as dataset_file:
                pickle.dump(loaded_dataset, dataset_file)
# -------------------------------------------------------------------------
    def _update_dataset_directory(self, dataset_directory):
        """Update directory where time series data set is stored.

        The directory of the stored data set samples files paths is updated
        according to the new directory.

        Parameters
        ----------
        dataset_directory : str
            Directory where the time series data set is stored (all data set
            samples files).
        """
        # Set new data set directory
        if not os.path.isdir(dataset_directory):
            raise RuntimeError('The new time series data set directory has '
                               'not been found:\n\n' + dataset_directory)
        else:
            self._dataset_directory = os.path.normpath(dataset_directory)
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Loop over samples files
        for i, file_path in enumerate(self._dataset_sample_files):
            # Set new sample file path (update directory)
            new_file_path = os.path.join(os.path.normpath(dataset_directory),
                                         os.path.basename(file_path))
            # Update sample file path
            if not os.path.isfile(new_file_path):
                raise RuntimeError('Time series data set sample file is not '
                                   'in new dataset directory:\n\n'
                                   + dataset_directory)
            else:
                self._dataset_sample_files[i] = new_file_path
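# =============================================================================
# Minimal usage sketch for TimeSeriesDataset (illustration only; file names,
# the feature label, and tensor shapes are arbitrary assumptions): store
# samples as individual files with torch.save and access them through a
# file-backed data set.
def _demo_time_series_dataset():
    """Build a file-backed data set (illustrative sketch)."""
    import tempfile
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Save 3 samples to individual files in data set directory
        sample_files = []
        for i in range(3):
            file_path = os.path.join(tmp_dir, f'sample_{i}.pt')
            torch.save({'strain': torch.rand(10, 6)}, file_path)
            sample_files.append(file_path)
        # Build file-backed data set
        dataset = TimeSeriesDataset(tmp_dir, sample_files)
        # Samples are loaded from file on access
        assert len(dataset) == 3
        assert dataset[0]['strain'].shape == (10, 6)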
# =============================================================================
def split_dataset(dataset, split_sizes, is_copy_dataset=False,
                  is_save_subsets=False, subsets_directory=None,
                  subsets_basename=None, seed=None):
    """Randomly split data set into non-overlapping subsets.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        Time series data set. Each sample is stored as a dictionary where
        each feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    split_sizes : dict
        Size (item, float) of each data subset name (key, str), where size
        is a fraction contained between 0 and 1. The sum of all sizes must
        equal 1.
    is_copy_dataset : bool, default=False
        If True, then subsets are disconnected from original parent data
        set.
    is_save_subsets : bool, default=False
        If True, then save data subsets to files.
    subsets_directory : str, default=None
        Directory where the data subsets files are stored.
    subsets_basename : str, default=None
        Subset file base name.
    seed : int, default=None
        Seed for random data set split generator.

    Returns
    -------
    dataset_split : dict
        Data subsets (key, str; item, torch.utils.data.Subset).
    """
    # Initialize data subsets names and sizes
    subsets_names = []
    subsets_sizes = []
    # Assemble data subsets names and sizes
    for key, val in split_sizes.items():
        # Check if subset size is valid
        if val < 0.0 or val > 1.0:
            raise RuntimeError(f'Subset size must be contained between 0 '
                               f'and 1. Check subset (size): {key} ({val})')
        # Assemble subset name and size
        subsets_names.append(str(key))
        subsets_sizes.append(val)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Check if split sizes are valid
    if not np.isclose(np.sum(subsets_sizes), 1.0):
        raise RuntimeError('Sum of subset split sizes must equal 1. '
                           f'Current sum: {np.sum(subsets_sizes):.2f}')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Set random split generator
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
    else:
        generator = None
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Copy original data set
    if is_copy_dataset:
        dataset = copy.deepcopy(dataset)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Randomly split data set into non-overlapping subsets
    subsets_list = torch.utils.data.random_split(dataset, subsets_sizes,
                                                 generator=generator)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Build data subsets
    dataset_split = {}
    for i, subset in enumerate(subsets_names):
        # Check subset
        if len(subsets_list[i]) < 1:
            raise RuntimeError(f'Subset "{subset}" resulting from data set '
                               'split is empty.')
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Assemble data subset
        dataset_split[str(subset)] = subsets_list[i]
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Save data subsets files
    if is_save_subsets:
        # Check subsets directory
        if subsets_directory is None \
                or not os.path.isdir(subsets_directory):
            raise RuntimeError('The data subsets directory has not been '
                               'specified or found:\n\n'
                               + str(subsets_directory))
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Loop over data subsets
        for key, val in dataset_split.items():
            # Set data subset file name
            subset_file = ''
            if subsets_basename is not None:
                subset_file += f'{subsets_basename}_'
            subset_file += f'{str(key)}_n{len(val)}'
            # Set data subset file path
            subset_path = os.path.join(subsets_directory,
                                       subset_file + '.pkl')
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Save data subset
            with open(subset_path, 'wb') as file:
                pickle.dump(val, file)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return dataset_split
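# =============================================================================
# Minimal usage sketch for split_dataset (illustration only; the subset
# names, sizes, and seed are arbitrary assumptions; fractional split sizes
# require torch >= 1.13): split a data set into training and testing
# subsets.
def _demo_split_dataset():
    """Randomly split a data set (illustrative sketch)."""
    # Build a small in-memory data set
    dataset = TimeSeriesDatasetInMemory(
        [{'strain': torch.rand(10, 6)} for _ in range(10)])
    # Split data set into 80% training and 20% testing subsets
    dataset_split = split_dataset(dataset, {'train': 0.8, 'test': 0.2},
                                  seed=0)
    assert len(dataset_split['train']) == 8
    assert len(dataset_split['test']) == 2
    return dataset_split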
# =============================================================================
def get_parent_dataset_indices(subset, subset_indices):
    """Get parent data set indices from subset indices.

    Parameters
    ----------
    subset : torch.utils.data.Subset
        Time series subset. Each sample is stored as a dictionary where each
        feature (key, str) data is a torch.Tensor(2d) of shape
        (sequence_length, n_features).
    subset_indices : list[int]
        Subset samples indices.

    Returns
    -------
    parent_indices : list[int]
        Parent data set samples indices.
    """
    # Check subset
    if not isinstance(subset, torch.utils.data.Subset):
        raise RuntimeError('Expecting time series subset of class '
                           'torch.utils.data.Subset, but found '
                           f'{type(subset).__name__} instead.')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Get parent data set samples indices
    parent_indices = [subset.indices[i] for i in subset_indices]
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    return parent_indices
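# =============================================================================
# Minimal usage sketch for get_parent_dataset_indices (illustration only;
# the data set, split, and seed are arbitrary assumptions): map the first
# two samples of a random subset back to their parent data set indices.
def _demo_get_parent_dataset_indices():
    """Map subset indices to parent indices (illustrative sketch)."""
    # Build a small in-memory data set and split it
    dataset = TimeSeriesDatasetInMemory(
        [{'strain': torch.rand(10, 6)} for _ in range(10)])
    dataset_split = split_dataset(dataset, {'train': 0.8, 'test': 0.2},
                                  seed=0)
    # Map first two training subset indices to parent data set indices
    parent_indices = get_parent_dataset_indices(dataset_split['train'],
                                                [0, 1])
    assert all(0 <= i < len(dataset) for i in parent_indices)
    return parent_indices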