# Source code for datawork.api.data

"""Module implementing abstract Data class."""

from abc import abstractmethod, abstractstaticmethod

from .cache import Cached, Hashable
from .graph import Node


class Data(Cached, Hashable, Node):
    """
    Data placeholder class.

    This class represents data that has either not yet been computed, or is
    furthermore not fully specified. Classes inheriting :class:`~.data.Data`
    implement placeholders for specific data types, e.g. Pandas dataframes or
    numpy arrays.

    Subclasses of :class:`~.data.Data` are typically instantiated by
    invocations of :class:`~.tool.Tool`. Thus :class:`~.data.Data` and
    :class:`~.invocation.Invocation` are connected and form the backbone of
    the computational graph, with :class:`~.tool.Tool` objects connected to
    :class:`~.invocation.Invocation` as objects that can be configured.

    Note that the `provider` attribute, itself an
    :class:`~.invocation.Invocation`, can be "partial", in which case the data
    object itself is callable. When called, arguments are passed to the
    provider which will create new invocations; potentially now non-partial
    ones.
    """

    def __init__(self, desc=None, name=None):
        """
        Construct a placeholder data object.

        The object starts unpopulated, with no provider and no provider slot.

        :param desc: a plain-text description of this data object
        :param name: a short-hand name for this data object
        """
        self.desc = desc
        self._data = None
        self._populated = False
        self._missing_args = {}
        self.name = name
        self.provider = None
        self.provider_slot = None

    def __repr__(self):
        """Represent data including provider and name."""
        # repr()/format() already render None as "None", so no special-casing
        # of a missing provider or name is required.
        return (
            f"{self.__class__.__name__}(name={self.name}, "
            f"_populated={self._populated!r}, "
            f"_data={self._data!r}, "
            f"provider={self.provider!r})"
        )

    def __call__(self, *args):
        """
        Enable calling for placeholder Data objects.

        Without a provider, exactly one argument is accepted: ``None`` keeps
        this object as the placeholder, while an instance of this object's
        type replaces it. With a provider, the arguments are forwarded to
        ``provider.invoke`` and the output found at ``provider_slot`` is
        returned.

        :param args: positional arguments to fill the placeholder with
        :return: this object, the supplied argument, or an output of the new
            invocation
        :raises ValueError: if this object is already populated, or if a
            provider-less placeholder is not given exactly one argument
        :raises TypeError: if a provider-less placeholder is given an argument
            that is not an instance of its own type
        """
        if self._populated:
            raise ValueError("Cannot invoke populated Data object")

        if self.provider is None:
            if len(args) != 1:
                raise ValueError(
                    "Can only invoke placeholder Data with single argument"
                )
            arg = args[0]
            if arg is None:
                # This arg should remain a placeholder
                return self
            if not isinstance(arg, type(self)):
                raise TypeError(
                    "Cannot invoke placeholder type t with any type other than t"
                )
            return arg

        # If there's a provider, then invoke with these args.
        # NOTE(review): `.o` presumably holds the invocation's outputs --
        # confirm against Invocation.
        outps = self.provider.invoke(*args).o
        if isinstance(outps, tuple):
            # Internal invariant: the slot must index into the output tuple.
            assert self.provider_slot < len(outps)
            return outps[self.provider_slot]
        # Internal invariant: a single (non-tuple) output lives in slot 0.
        assert self.provider_slot == 0
        return outps

    def missing_args(self):
        """
        Count number of missing arguments.

        :return: 0 if populated, 1 for a provider-less placeholder, otherwise
            whatever the provider reports
        """
        if self._populated:
            return 0
        if self.provider is None:
            return 1  # This is a placeholder
        return self.provider.missing_args()

    def get_hash(self):
        """
        Return hash of provider if exists, or of data itself for constants.

        The hash is the MD5 hex digest of the class name followed by either
        the provider's hash, the serialized data, or the marker ``"NODATA"``
        when there is no provider and no data.

        :return: hexadecimal digest string
        """
        import hashlib

        hashbase = self.__class__.__name__  # + self.version()
        if self.provider is not None:
            hashbase += self.provider.get_hash()
        elif not self._populated or self._data is None:
            # Look at the backing fields directly: going through the `data`
            # property would raise for an unpopulated placeholder and make
            # this branch unreachable for exactly the case it is meant for.
            hashbase += "NODATA"
        else:
            hashbase += self.serialize(self._data)
        return hashlib.md5(hashbase.encode("utf-8")).hexdigest()

    @classmethod
    def constant(cls, val, name="constant"):
        """
        Create a constant from appropriately typed variable.

        :param val: value to store; it is assigned through the ``data``
            property and is therefore validated by ``check_type``
        :param name: a short-hand name for the new data object
        :return: a populated instance of ``cls`` without a provider
        """
        ob = cls(name=name)
        ob.data = val
        return ob

    @staticmethod
    @abstractmethod
    def serialize(data):
        """
        Convert data to string.

        :param data: the value to serialize
        :return: subclasses return a string; this base implementation returns
            ``NotImplemented``
        """
        return NotImplemented

    def get_data(self):
        """
        Getter for data attribute.

        If the object is not yet populated, the provider is asked to populate
        it first.

        :return: the stored data
        :raises RuntimeError: if the object is unpopulated and has no provider
        """
        if not self._populated:
            if self.provider is None:
                # RuntimeError subclasses Exception, so callers that caught
                # the former bare Exception keep working.
                raise RuntimeError("Cannot populate data without provider")
            # NOTE(review): presumably populate() ends up calling set_data()
            # on this object -- confirm against Invocation.
            self.provider.populate()
        return self._data

    @staticmethod
    @abstractmethod
    def check_type(value):
        """
        Guard value to ensure it is of proper type.

        :param value: candidate value for the data attribute
        :return: subclasses return a truthy result for acceptable values; this
            base implementation returns ``None``
        """

    def set_data(self, value, cache=True):
        """
        Setter for data attribute.

        :param value: the value to store; must pass ``check_type``
        :param cache: if true and a provider is set, ``cache_write()`` is
            called after storing the value
        :raises ValueError: if ``check_type`` rejects ``value``
        """
        if not self.check_type(value):
            raise ValueError(
                f"Cannot set value of {self.__class__.__name__} "
                f"with object of type {type(value)}"
            )
        self._data = value
        self._populated = True
        if cache and self.provider is not None:
            self.cache_write()

    data = property(get_data, set_data)

    @abstractmethod
    def read(self, filename):
        """
        Read data from disk.

        Implementations are expected to load ``filename`` and store the
        result, e.g. via ``set_data(value, cache=False)``.

        :param filename: path of the file to read
        :raises NotImplementedError: always, in this base implementation
        """
        # Never store the NotImplemented sentinel as if it were real data.
        raise NotImplementedError()

    @abstractmethod
    def write(self, filename):
        """
        Write data to disk.

        :param filename: path of the file to write
        :raises NotImplementedError: always, in this base implementation
        """
        raise NotImplementedError()

    def parents(self):
        """
        Return provider as only parent if it is set.

        :return: a one-element list holding the provider, or an empty list
        """
        if self.provider is None:
            return []
        return [self.provider]