Source code for

"""Transformers for dataset (multi-example) operations, e.g., shuffling and batching.

These are mostly wrappers for standard `tf.data.Dataset` ops.
"""

import numpy as np
import tensorflow as tf
from import expand_to_rank
import attr
from typing import List, Text, Optional, Any, Callable, Dict

@attr.s(auto_attribs=True)
class Shuffler:
    """Shuffling transformer for use in pipelines.

    The input to this transformer should not be repeated or batched (though the
    latter would technically work). Repeating prevents the shuffling from going
    through "epoch" or "iteration" loops in the underlying dataset.

    Though batching before shuffling works and respects epoch boundaries, it is not
    recommended as it implies that the same examples will always be optimized for
    together within a mini-batch. This is not as effective for promoting
    generalization as element-wise shuffling which produces new combinations of
    elements within mini-batches.

    The ideal pipeline follows the order:
        shuffle -> batch -> repeat

    Attributes:
        shuffle: If False, returns the input dataset unmodified.
        buffer_size: Number of examples to keep in a buffer to sample uniformly from.
            If set too high, it may take a long time to fill the initial buffer,
            especially if it resets every epoch.
        reshuffle_each_iteration: If True, resets the sampling buffer every iteration
            through the underlying dataset.
    """

    shuffle: bool = True
    buffer_size: int = 64
    reshuffle_each_iteration: bool = True

    @property
    def input_keys(self) -> List[Text]:
        """Return the keys that incoming elements are expected to have."""
        return []

    @property
    def output_keys(self) -> List[Text]:
        """Return the keys that outgoing elements will have."""
        return []

    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset with shuffled element order.

        Args:
            ds_input: Any dataset.

        Returns:
            A `tf.data.Dataset` with elements containing the same keys, but in a
            shuffled order, if enabled.

            If the input dataset is repeated, this doesn't really respect epoch
            boundaries since it never reaches the end of the iterator.
        """
        # Shuffling disabled: hand the input back untouched.
        if not self.shuffle:
            return ds_input

        return ds_input.shuffle(
            buffer_size=self.buffer_size,
            reshuffle_each_iteration=self.reshuffle_each_iteration,
        )
@attr.s(auto_attribs=True)
class Batcher:
    """Batching transformer for use in pipelines.

    This class enables variable-length example keys to be batched by converting them
    to ragged tensors prior to concatenation, then converting them back to dense
    tensors.

    See the notes in the `Shuffler` and `Repeater` transformers if training. If using
    in inference, this transformer will be used on its own without dropping
    remainders.

    The ideal (training) pipeline follows the order:
        shuffle -> batch -> repeat

    Attributes:
        batch_size: Number of elements within a batch. Every key will be stacked
            within their first axis (with expansion) such that it has `batch_size`
            length.
        drop_remainder: If True, final elements with fewer than `batch_size` examples
            will be dropped once the end of the input dataset iteration is reached.
            This should be True for training and False for inference.
    """

    batch_size: int = 8
    drop_remainder: bool = False

    @property
    def input_keys(self) -> List[Text]:
        """Return the keys that incoming elements are expected to have."""
        return []

    @property
    def output_keys(self) -> List[Text]:
        """Return the keys that outgoing elements will have."""
        return []

    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset with batched elements.

        Args:
            ds_input: Any dataset that produces dictionaries keyed by strings and
                values with any rank tensors.

        Returns:
            A `tf.data.Dataset` with elements containing the same keys, but with each
            tensor promoted to 1 rank higher (except for scalars with rank 0 will be
            promoted to rank 2).

            The keys of each element will contain `batch_size` individual elements
            stacked along the axis 0, such that length (`.shape[0]`) is equal to
            `batch_size`.

            Any keys that had variable length elements within the batch will be
            padded with NaNs to the size of the largest element's length for that
            key.
        """

        def expand(example):
            """Expand all keys to a minimum rank of 1."""
            for key in example:
                example[key] = expand_to_rank(
                    example[key], target_rank=1, prepend=True
                )
            return example

        def unrag(example):
            """Convert all keys back to dense tensors NaN padded."""
            for key in example:
                if isinstance(example[key], tf.RaggedTensor):
                    # The padding value is NaN cast to the tensor's own dtype.
                    example[key] = example[key].to_tensor(
                        default_value=tf.cast(np.nan, example[key].dtype)
                    )
            return example

        # Ensure that all keys have a rank of at least 1 (i.e., scalars).
        ds_output = ds_input.map(
            expand, num_parallel_calls=tf.data.experimental.AUTOTUNE
        )

        # Batch elements as ragged tensors.
        ds_output = ds_output.apply(
            tf.data.experimental.dense_to_ragged_batch(
                batch_size=self.batch_size, drop_remainder=self.drop_remainder
            )
        )

        # Convert elements back into dense tensors with padding.
        ds_output = ds_output.map(
            unrag, num_parallel_calls=tf.data.experimental.AUTOTUNE
        )

        return ds_output
@attr.s(auto_attribs=True)
class Unbatcher:
    """Unbatching transformer for use in pipelines."""

    @property
    def input_keys(self) -> List[Text]:
        """Return the keys that incoming elements are expected to have."""
        return []

    @property
    def output_keys(self) -> List[Text]:
        """Return the keys that outgoing elements will have."""
        return []

    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset with unbatched elements.

        Args:
            ds_input: Any dataset.

        Returns:
            The result of calling `unbatch()` on the input dataset.
        """
        return ds_input.unbatch()
@attr.s(auto_attribs=True)
class Repeater:
    """Repeating transformer for use in pipelines.

    Repeats the underlying elements indefinitely or for a number of "iterations" or
    "epochs".

    If placed before batching, this can create mini-batches with examples from across
    epoch boundaries.

    If placed after batching, this may never reach examples that are dropped as
    remainders if not shuffling.

    The ideal pipeline follows the order:
        shuffle -> batch -> repeat

    Attributes:
        repeat: If False, returns the input dataset unmodified.
        epochs: If -1, repeats the input dataset elements infinitely. Otherwise,
            loops through the elements of the input dataset this number of times.
    """

    repeat: bool = True
    epochs: int = -1

    @property
    def input_keys(self) -> List[Text]:
        """Return the keys that incoming elements are expected to have."""
        return []

    @property
    def output_keys(self) -> List[Text]:
        """Return the keys that outgoing elements will have."""
        return []

    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset with repeated loops over the input elements.

        Args:
            ds_input: Any dataset.

        Returns:
            A `tf.data.Dataset` with elements containing the same keys, but repeated
            for `epochs` iterations.
        """
        # Repeating disabled: hand the input back untouched.
        if not self.repeat:
            return ds_input

        return ds_input.repeat(count=self.epochs)
@attr.s(auto_attribs=True)
class Prefetcher:
    """Prefetching transformer for use in pipelines.

    Prefetches elements from the input dataset to minimize the processing bottleneck
    as elements are requested since prefetching can occur in parallel.

    Attributes:
        prefetch: If False, returns the input dataset unmodified.
        buffer_size: Keep `buffer_size` elements loaded in the buffer. If set to -1
            (`tf.data.experimental.AUTOTUNE`), this value will be optimized
            automatically to decrease latency.
    """

    prefetch: bool = True
    buffer_size: int = tf.data.experimental.AUTOTUNE

    @property
    def input_keys(self) -> List[Text]:
        """Return the keys that incoming elements are expected to have."""
        return []

    @property
    def output_keys(self) -> List[Text]:
        """Return the keys that outgoing elements will have."""
        return []

    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset with prefetching to maintain a buffer during iteration.

        Args:
            ds_input: Any dataset.

        Returns:
            A `tf.data.Dataset` with identical elements. Processing that occurs with
            the elements that are produced can be done in parallel (e.g., training on
            the GPU) while new elements are generated from the pipeline.
        """
        # Prefetching disabled: hand the input back untouched.
        if not self.prefetch:
            return ds_input

        return ds_input.prefetch(buffer_size=self.buffer_size)
@attr.s(auto_attribs=True)
class Preloader:
    """Preload elements of the underlying dataset to generate in-memory examples.

    This transformer can lead to considerable performance improvements at the cost of
    memory consumption.

    This is functionally equivalent to `tf.data.Dataset.cache`, except the cached
    examples are accessible directly via the `examples` attribute.

    Attributes:
        examples: Stored list of preloaded elements.
    """

    examples: List[Any] = attr.ib(init=False, factory=list)

    @property
    def input_keys(self) -> List[Text]:
        """Return the keys that incoming elements are expected to have."""
        return []

    @property
    def output_keys(self) -> List[Text]:
        """Return the keys that outgoing elements will have."""
        return []

    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset that generates preloaded elements.

        Args:
            ds_input: Any `tf.data.Dataset` that generates examples as a dictionary of
                tensors. Should not be repeating infinitely.

        Returns:
            A dataset that generates the same examples.

            This is similar to prefetching, except that examples are yielded through
            a generator and loaded when this method is called rather than during
            pipeline iteration.

        Raises:
            IndexError: If `ds_input` yields no elements, since the keys and dtypes
                are read from the first preloaded example.
        """
        # Preload examples from the input dataset.
        self.examples = list(iter(ds_input))

        # Store example metadata, taken from the first example only.
        first_example = self.examples[0]
        keys = list(first_example.keys())
        dtypes = [first_example[key].dtype for key in keys]

        def gen():
            # Yield values as a tuple ordered like `keys`, matching `output_types`.
            for example in self.examples:
                yield tuple(example[key] for key in keys)

        ds_output = tf.data.Dataset.from_generator(gen, output_types=tuple(dtypes))

        # Rebuild the dictionary structure from the flat tuple of values.
        ds_output = ds_output.map(lambda *example: dict(zip(keys, example)))

        return ds_output
@attr.s(auto_attribs=True)
class LambdaFilter:
    """Transformer for filtering examples out of a dataset.

    This class is useful for eliminating examples that fail to meet some criteria,
    e.g., when no peaks are found.

    Attributes:
        filter_fn: Callable that takes an example dictionary as input and returns
            True if the element should be kept.
    """

    filter_fn: Callable[[Dict[Text, tf.Tensor]], bool]

    @property
    def input_keys(self) -> List[Text]:
        """Return the keys that incoming elements are expected to have."""
        return []

    @property
    def output_keys(self) -> List[Text]:
        """Return the keys that outgoing elements will have."""
        return []

    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset with filtering applied.

        Args:
            ds_input: Any dataset that produces dictionaries keyed by strings and
                values with any rank tensors.

        Returns:
            A `tf.data.Dataset` with elements containing the same keys, but with
            potentially fewer elements.
        """
        # Filter and return.
        return ds_input.filter(self.filter_fn)