Source code for datawork.instances.data

"""Instances of Data for common data payloads."""
import json
import os

from ..api.data import Data


class PandasData(Data):
    """Data subclass whose payload is a Pandas DataFrame or Series."""

    def __init__(self, *args, **kwargs):
        """
        Build a PandasData instance.

        Every positional and keyword argument is forwarded unchanged to
        ``Data.__init__``.
        """
        Data.__init__(self, *args, **kwargs)
        # NOTE(review): no ``format`` name is defined in the visible part of
        # this module, so this appears to store the builtin ``format``
        # function on the instance -- confirm whether that is intended.
        self.format = format

    @staticmethod
    def serialize(data):
        """
        Return a string form of the base64-encoded msgpack dump of ``data``.

        ``data`` must provide a ``to_msgpack()`` method.  The returned value
        is ``str()`` applied to the bytes produced by ``base64.b64encode``;
        on Python 3 that is the ``b'...'`` repr rather than decoded text.
        The exact output is kept as-is here.
        """
        from base64 import b64encode

        packed = data.to_msgpack()
        encoded = b64encode(packed)
        return str(encoded)

    def read(self, filename):
        """
        Load a msgpack file and install its contents as this object's data.

        The value returned by ``pandas.read_msgpack(filename)`` is handed to
        ``set_data`` with ``cache=False``.
        """
        import pandas

        frame = pandas.read_msgpack(filename)
        self.set_data(frame, cache=False)

    @staticmethod
    def check_type(value):
        """Return True when ``value`` is a pandas DataFrame or Series."""
        import pandas

        accepted = (pandas.DataFrame, pandas.Series)
        return isinstance(value, accepted)

    def write(self, filename):
        """Dump ``self.data`` to ``filename`` using its ``to_msgpack`` method."""
        payload = self.data
        payload.to_msgpack(filename)
class JSONData(Data):
    """
    A Data class for primitive JSON serializable types.

    The so-called "primitive types" in JSON are:

    - string
    - numeric types
    - object (in python this is a :class:`dict`)
    - array
    - boolean
    - null

    In this class, hierarchies of the following types are supported:

    - :class:`bool`
    - :class:`dict`
    - :class:`float`
    - :class:`int`
    - :class:`list`
    - :obj:`None`
    - :class:`str`

    Note that although other types than these may be serializable in Python
    (by subclassing :class:`json.JSONEncoder`), the primitive types can be
    serialized/deserialized unambiguously. For example, we do not support
    tuples, although the :mod:`json` module supports serializing them by
    casting them to lists.
    """

    @staticmethod
    def serialize(data):
        """
        Convert ``data`` to JSON text.

        Keys are sorted (``sort_keys=True``) so that equal dictionaries
        produce identical text regardless of insertion order.

        :param data: the object to encode.
        :returns: the JSON text as a :class:`str`.
        """
        return json.dumps(data, sort_keys=True)

    def read(self, filename):
        """
        Read JSON text from ``filename`` and install it as this object's data.

        The file is opened in a ``with`` block so the handle is closed as
        soon as parsing finishes, even if :func:`json.load` raises.  The
        parsed value is passed to ``set_data`` with ``cache=False``.

        :param filename: path of the JSON file to load.
        """
        with open(filename, "r") as handle:
            payload = json.load(handle)
        self.set_data(payload, cache=False)

    def write(self, filename):
        """
        Write ``self.data`` to ``filename`` as JSON text with sorted keys.

        The file is opened in a ``with`` block so the output is flushed and
        the handle closed before this method returns, rather than whenever
        the file object happens to be garbage collected.

        :param filename: path of the file to create or overwrite.
        """
        with open(filename, "w") as handle:
            json.dump(self.data, handle, sort_keys=True)

    @staticmethod
    def check_type(value):
        """
        Check that ``value`` is one of the supported primitive JSON types.

        ``None`` is accepted, matching the list of supported types in the
        class documentation (JSON ``null``).  Only the top-level object is
        inspected; the contents of lists and dicts are not validated here.

        :param value: the candidate payload.
        :returns: ``True`` if ``value`` is ``None`` or an instance of
            :class:`bool`, :class:`int`, :class:`float`, :class:`list`,
            :class:`dict` or :class:`str`; ``False`` otherwise.
        """
        if value is None:
            return True
        # bool is a subclass of int; it is listed only to mirror the docs.
        return isinstance(value, (bool, int, float, list, dict, str))
class KerasModelData(Data):
    """Data subclass whose payload is a Keras model."""

    @staticmethod
    def check_type(value):
        """Return True when ``value`` is an instance of ``keras.models.Model``."""
        from keras.models import Model as KerasModel

        return isinstance(value, KerasModel)

    def read(self, filename):
        """
        Load a model from ``filename`` and assign it to ``self.data``.

        Loading is delegated to ``keras.models.load_model``.  The on-disk
        format is presumably HDF5, as the original notes state -- confirm
        for the Keras version in use.
        """
        from keras.models import load_model as restore_model

        restored = restore_model(filename)
        self.data = restored

    def write(self, filename):
        """
        Save the held model to ``filename`` via its own ``save`` method.

        The on-disk format is presumably HDF5, as the original notes state
        -- confirm for the Keras version in use.
        """
        model = self.data
        model.save(filename)
class TorchModelData(Data):
    """Data subclass whose payload is a PyTorch module."""

    @staticmethod
    def check_type(value):
        """Return True when ``value`` is an instance of ``torch.nn.Module``."""
        from torch.nn import Module as TorchModule

        return isinstance(value, TorchModule)

    def read(self, filename):
        """
        Load the object stored at ``filename`` and assign it to ``self.data``.

        Loading is delegated to ``torch.load`` with its default arguments.
        """
        from torch import load as torch_load

        restored = torch_load(filename)
        self.data = restored

    def write(self, filename):
        """
        Store ``self.data`` at ``filename`` using ``torch.save``.

        The whole held object is passed to ``torch.save``, not only a state
        dict.
        """
        from torch import save as torch_save

        module = self.data
        torch_save(module, filename)
class FileData(Data):
    """
    Base class for any disk-native data.

    The payload is the name of a file on disk.  For example, SQLiteData
    will use this as a base class.
    """

    @staticmethod
    def check_type(value):
        """Return True when ``value`` is a string naming an existing path."""
        if not isinstance(value, str):
            return False
        return os.path.exists(value)

    @staticmethod
    def serialize(data):
        """Return ``data`` (the filename) unchanged."""
        serialized = data
        return serialized

    def read(self, filename):
        """Adopt ``filename`` as this object's data by passing it to ``set_data``."""
        path = filename
        self.set_data(path)

    def write(self, filename):
        """
        Copy the file at ``self.cache_new_location()`` to ``filename``.

        Nothing is copied when ``filename`` already equals the value
        returned by ``self.cache_new_location()``.
        """
        from shutil import copyfile

        # NOTE(review): the copy source is cache_new_location(), not
        # self.data -- confirm this is the intended source path.
        if filename == self.cache_new_location():
            return
        copyfile(self.cache_new_location(), filename)