# -*- coding: utf-8 -*-
#
# Save Syncopy data objects to disk
#

# Builtin/3rd party package imports
import os
import json
import h5py
import shutil
import numpy as np
from collections import OrderedDict

import syncopy as spy

# Local imports
from syncopy.shared.filetypes import FILE_EXT
from syncopy.shared.parsers import filename_parser, data_parser
from syncopy.shared.errors import SPYIOError, SPYTypeError, SPYError, SPYWarning
from syncopy.io.utils import hash_file, startInfoDict
from syncopy import __storage__
__all__ = ["save"]
[docs]def save(out, container=None, tag=None, filename=None, overwrite=False):
r"""Save Syncopy data object to disk
The underlying array data object is stored in a HDF5 file, the metadata in
a JSON file. Both can be placed inside a Syncopy container, which is a
regular directory with the extension '.spy'.
Parameters
----------
out : Syncopy data object
Object to be stored on disk.
container : str
Path to Syncopy container folder (\*.spy) to be used for saving. If
omitted, the extension '.spy' will be added to the folder name.
tag : str
Tag to be appended to container basename
filename : str
Explicit path to data file. This is only necessary if the data should
not be part of a container folder. An extension (\*.<dataclass>) is
added if omitted. The `tag` argument is ignored.
overwrite : bool
If `True` an existing HDF5 file and its accompanying JSON file is
overwritten (without prompt).
Returns
-------
Nothing : None
Notes
------
Syncopy objects may also be saved using the class method ``.save`` that
acts as a wrapper for :func:`syncopy.save`, e.g.,
>>> save(obj, container="new_spy_container")
is equivalent to
>>> obj.save(container="new_spy_container")
However, once a Syncopy object has been saved, the class method ``.save``
can be used as a shortcut to quick-save recent changes, e.g.,
>>> obj.save()
writes the current state of `obj` to the data/meta-data files on-disk
associated with `obj` (overwriting both in the process). Similarly,
>>> obj.save(tag='newtag')
saves `obj` in the current container 'new_spy_container' under a different
tag.
Examples
--------
Save the Syncopy data object `obj` on disk in the current working directory
without creating a spy-container
>>> spy.save(obj, filename="session1")
>>> # --> os.getcwd()/session1.<dataclass>
>>> # --> os.getcwd()/session1.<dataclass>.info
Save `obj` without creating a spy-container using an absolute path
>>> spy.save(obj, filename="/tmp/session1")
>>> # --> /tmp/session1.<dataclass>
>>> # --> /tmp/session1.<dataclass>.info
Save `obj` in a new spy-container created in the current working directory
>>> spy.save(obj, container="container.spy")
>>> # --> os.getcwd()/container.spy/container.<dataclass>
>>> # --> os.getcwd()/container.spy/container.<dataclass>.info
Save `obj` in a new spy-container created by providing an absolute path
>>> spy.save(obj, container="/tmp/container.spy")
>>> # --> /tmp/container.spy/container.<dataclass>
>>> # --> /tmp/container.spy/container.<dataclass>.info
Save `obj` in a new (or existing) spy-container under a different tag
>>> spy.save(obj, container="session1.spy", tag="someTag")
>>> # --> os.getcwd()/session1.spy/session1_someTag.<dataclass>
>>> # --> os.getcwd()/session1.spy/session1_someTag.<dataclass>.info
See also
--------
syncopy.load : load data created with :func:`syncopy.save`
"""
# Make sure `out` is a valid Syncopy data object
data_parser(out, varname="out", writable=None, empty=False)
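    # Exactly one of `filename` and `container` must be provided to determine the target location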
if filename is None and container is None:
raise SPYError("filename and container cannot both be `None`")
if container is not None and filename is None:
# construct filename from container name
if not isinstance(container, str):
raise SPYTypeError(container, varname="container", expected="str")
if not os.path.splitext(container)[1] == ".spy":
container += ".spy"
fileInfo = filename_parser(container)
filename = os.path.join(fileInfo["folder"], fileInfo["container"], fileInfo["basename"])
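        # resulting target path: <folder>/<container>/<basename>[_<tag>].<dataclass>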
# handle tag
if tag is not None:
if not isinstance(tag, str):
raise SPYTypeError(tag, varname="tag", expected="str")
filename += "_" + tag
elif container is not None and filename is not None:
raise SPYError("container and filename cannot be used at the same time")
if not isinstance(filename, str):
raise SPYTypeError(filename, varname="filename", expected="str")
# add extension if not part of the filename
if "." not in os.path.splitext(filename)[1]:
filename += out._classname_to_extension()
if not isinstance(overwrite, bool):
raise SPYTypeError(overwrite, varname="overwrite", expected="bool")
# Parse filename for validity and construct full path to HDF5 file
fileInfo = filename_parser(filename)
if fileInfo["extension"] != out._classname_to_extension():
        raise SPYError(
            "Extension in filename ('{ext}') does not match data class "
            "({dclass}), expected '{exp}'.".format(
                ext=fileInfo["extension"],
                dclass=out.__class__.__name__,
                exp=out._classname_to_extension(),
            )
        )
dataFile = os.path.join(fileInfo["folder"], fileInfo["filename"])
# If `out` is to replace its own on-disk representation, be more careful
if overwrite and dataFile == out.filename:
replace = True
else:
replace = False
# Prevent `out` from trying to re-create its own data file
if replace:
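        # `out` overwrites its own backing file: re-use the already-open HDF5 handles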
out.data.flush()
h5f = out.data.file
dat = out.data
trl = h5f["trialdefinition"]
else:
if not os.path.exists(fileInfo["folder"]):
try:
os.makedirs(fileInfo["folder"])
except IOError:
raise SPYIOError(fileInfo["folder"])
except Exception as exc:
raise exc
else:
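            # target folder exists - only replace an existing data file if `overwrite` is set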
if os.path.exists(dataFile):
if not os.path.isfile(dataFile):
raise SPYIOError(dataFile)
if overwrite:
try:
h5f = h5py.File(dataFile, mode="w")
h5f.close()
except Exception as exc:
msg = "Cannot overwrite {} - file may still be open. "
msg += "Original error message below\n{}"
raise SPYError(msg.format(dataFile, str(exc)))
else:
raise SPYIOError(dataFile, exists=True)
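        # create (or re-create) the target HDF5 file for writing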
h5f = h5py.File(dataFile, mode="w")
# Save each member of `_hdfFileDatasetProperties` in target HDF file
for datasetName in out._hdfFileDatasetProperties:
dataset = getattr(out, "_" + datasetName)
if dataset is not None:
spy.log(
f"Writing dataset '{datasetName}' ({len(out._hdfFileDatasetProperties)} datasets total) to HDF5 file '{dataFile}'.",
level="DEBUG",
)
dat = h5f.create_dataset(datasetName, data=dataset)
        else:
            spy.log(
                f"Not writing 'None' dataset '{datasetName}' ({len(out._hdfFileDatasetProperties)} datasets total) to HDF5 file '{dataFile}'.",
                level="DEBUG",
            )
# Now write trial-related information
trl_arr = np.array(out.trialdefinition)
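    # update the existing trialdefinition dataset in place, or create a new one that can grow along its rows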
if replace:
trl[()] = trl_arr
trl.flush()
else:
trl = h5f.create_dataset("trialdefinition", data=trl_arr, maxshape=(None, trl_arr.shape[1]))
# Write to log already here so that the entry can be exported to json
infoFile = dataFile + FILE_EXT["info"]
out.log = "Wrote files " + dataFile + "\n\t\t\t" + 2 * " " + infoFile
# Assemble dict for JSON output: order things by their "readability"
outDict = OrderedDict(startInfoDict)
outDict["filename"] = fileInfo["filename"]
outDict["dataclass"] = out.__class__.__name__
outDict["data_dtype"] = dat.dtype.name
outDict["data_shape"] = dat.shape
outDict["data_offset"] = dat.id.get_offset()
outDict["trl_dtype"] = trl.dtype.name
outDict["trl_shape"] = trl.shape
outDict["trl_offset"] = trl.id.get_offset()
if isinstance(out.data, np.ndarray):
if np.isfortran(out.data):
outDict["order"] = "F"
else:
outDict["order"] = "C"
for key in out._infoFileProperties:
value = getattr(out, key)
if isinstance(value, np.ndarray):
value = value.tolist()
# potentially nested dicts
elif isinstance(value, dict):
value = dict(value)
_dict_converter(value)
outDict[key] = value
# Save relevant stuff as HDF5 attributes
for key in out._hdfFileAttributeProperties:
if outDict[key] is None:
h5f.attrs[key] = "None"
else:
try:
h5f.attrs[key] = outDict[key]
except RuntimeError:
msg = (
"Too many entries in `{}` - truncating HDF5 attribute. "
+ "Please refer to {} for complete listing."
)
info_fle = os.path.split(os.path.split(filename.format(ext=FILE_EXT["info"]))[0])[1]
info_fle = os.path.join(info_fle, os.path.basename(filename.format(ext=FILE_EXT["info"])))
SPYWarning(msg.format(key, info_fle))
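                # store a truncated placeholder as HDF5 attribute; the full listing is kept in the JSON file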
h5f.attrs[key] = [outDict[key][0], "...", outDict[key][-1]]
# Save the dataset names that should be loaded later into the JSON.
outDict["_hdfFileDatasetProperties"] = list(out._hdfFileDatasetProperties)
# Re-assign filename after saving (and remove source in case it came from `__storage__`)
if not replace:
h5f.close()
# points to source file path
if __storage__ in out.filename:
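            # `out` was backed by a temporary file in Syncopy's storage folder: delete it
            # (and, if the dataset was virtual, the associated directory of virtual sources)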
is_virtual = out.data.is_virtual
out.data.file.close()
try:
os.unlink(out.filename)
if is_virtual:
virtual_dir_path = os.path.splitext(out.filename)[0]
shutil.rmtree(virtual_dir_path)
except PermissionError as ex:
spy.log(
f"Could not delete file '{out.filename}': {str(ex)}.",
level="IMPORTANT",
)
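        # re-attach `out` to the freshly written HDF5 file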
out.data = dataFile
# Compute checksum and finally write JSON (automatically overwrites existing)
outDict["file_checksum"] = hash_file(dataFile)
with open(infoFile, "w") as out_json:
json.dump(outDict, out_json, indent=4)
spy.log(f"Wrote container to {os.path.dirname(out.filename)}", level="INFO")
def _dict_converter(dct, firstrun=True):
"""
Convert all dict values having NumPy dtypes to corresponding builtin types
Also works w/ nested dict of dicts and is cycle-save, i.e., it can
handle self-referencing dictionaires. For instance, consider a nested dict
w/ back-edge (the dict is no longer an n-ary tree):
dct = {}
dct["a"] = {}
dct["a"]["a.1"] = 3
dct["b"] = {}
dct["b"]["b.1"] = 4000
dct["b"]["b.2"] = dct["a"]
dct["b"]["b.3"] = dct
Here, b.2 points to value of `a` and b.3 is a self-reference.
https://stackoverflow.com/questions/10756427/loop-through-all-nested-dictionary-values
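
    A minimal usage sketch (illustrative values only):

    >>> import numpy as np
    >>> d = {"a": np.float32(1.5), "b": np.arange(3)}
    >>> _dict_converter(d)
    >>> d
    {'a': 1.5, 'b': [0, 1, 2]}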
"""
global visited
if firstrun:
visited = set()
for key, value in dct.items():
if isinstance(value, dict):
if key not in visited:
visited.add(key)
_dict_converter(dct[key], firstrun=False)
elif isinstance(value, list):
if key not in visited:
visited.add(key)
for el in value:
if isinstance(el, dict):
_dict_converter(el, firstrun=False)
elif isinstance(value, np.ndarray):
dct[key] = value.tolist()
else:
if hasattr(value, "item"):
value = value.item()
try:
json.dumps(value)
except (TypeError, OverflowError):
value = str(value)
dct[key] = value
return