# -*- coding: utf-8 -*-
#
# Load data from Syncopy containers
#
# Builtin/3rd party package imports
import os
import json
import h5py
import sys
import numpy as np
from glob import glob
# Local imports
from syncopy.shared.filetypes import FILE_EXT
from syncopy.shared.parsers import io_parser, data_parser, filename_parser, array_parser
from syncopy.shared.errors import (
SPYTypeError,
SPYValueError,
SPYIOError,
SPYError,
SPYWarning,
)
from syncopy.io.utils import hash_file, startInfoDict
import syncopy.datatype as spd
import syncopy as spy
# To allow loading older spy containers: info-file fields listed here are not
# enforced by `_load`'s required-field check, so files lacking them still load
legacy_not_required = ["info"]
# Public names exported by this module
__all__ = ["load"]
def load(filename, tag=None, dataclass=None, checksum=False, mode="r+", out=None):
    """
    Load Syncopy data object(s) from disk

    Either loads single files within or outside of '.spy'-containers or loads
    multiple objects from a single '.spy'-container. Loading from containers can
    be further controlled by imposing restrictions on object class(es) (via
    `dataclass`) and file-name tag(s) (via `tag`).

    Parameters
    ----------
    filename : str
        Either path to Syncopy container folder (`*.spy`, if omitted, the extension
        '.spy' will be appended) or name of data or metadata file. If `filename`
        points to a container and no further specifications are provided, the
        entire contents of the container is loaded. Otherwise, specific objects
        may be selected using the `dataclass` or `tag` keywords (see below).
    tag : None or str or list
        If `filename` points to a container, `tag` may be used to filter objects
        by filename-`tag`. Multiple tags can be provided using a list, e.g.,
        ``tag = ['experiment1', 'experiment2']``. Can be combined with `dataclass`
        (see below). Invalid if `filename` points to a single file. The provided
        sequence is not modified.
    dataclass : None or str or list
        If provided, only objects of provided dataclass are loaded from disk.
        Available options are '.analog', '.spectral', '.spike' and '.event'
        (as listed in ``spy.FILE_EXT["data"]``). Multiple class specifications
        can be provided using a list, e.g., ``dataclass = ['.analog', '.spike']``.
        A missing leading dot is added automatically. Can be combined with `tag`
        (see above) and is also valid if `filename` points to a single file
        (e.g., to ensure loaded object is of a specific type).
    checksum : bool
        If `True`, checksum-matching is performed on loaded object(s) to ensure
        data-integrity (impairs performance particularly when loading large files).
    mode : str
        Data access mode of loaded objects (can be 'r' for read-only, 'r+' or 'w'
        for read/write access).
    out : Syncopy data object
        Empty object to be filled with data loaded from disk. Has to match the
        type of the on-disk file (e.g., ``filename = 'mydata.analog'`` requires
        `out` to be a :class:`syncopy.AnalogData` object). Can only be used
        when loading single objects from disk (`out` is ignored when multiple
        files are loaded from a container).

    Returns
    -------
    Nothing : None
        If a single file is loaded and `out` was provided, `out` is filled with
        data loaded from disk, i.e., :func:`syncopy.load` does **not** create a
        new object
    obj : Syncopy data object
        If a single file is loaded and `out` was `None`, :func:`syncopy.load`
        returns a new object.
    objdict : dict
        If multiple files are loaded, :func:`syncopy.load` creates a new object
        for each file and places them in a dictionary whose keys are the base-names
        (sans path) of the corresponding files.

    Raises
    ------
    SPYTypeError
        If `filename` is not a string or `checksum` is not a bool
    SPYValueError
        If `dataclass` contains no known data-file extension, or if a single
        file is requested whose extension is not listed in `dataclass`
    SPYError
        If `tag` is combined with a `filename` pointing to a single file
    SPYIOError
        If no file inside the container matches the requested tag(s)/class(es)

    Notes
    -----
    All of Syncopy's classes offer (limited) support for data loading upon object
    creation. Just as the class method ``.save`` can be used as a shortcut for
    :func:`syncopy.save`, Syncopy objects can be created from Syncopy data-files
    upon creation, e.g.,

    >>> adata = spy.AnalogData('/path/to/session1.analog')

    creates a new :class:`syncopy.AnalogData` object and immediately fills it
    with data loaded from the file "/path/to/session1.analog".

    Since only one object can be created at a time, this loading shortcut only
    supports single file specifications (i.e., ``spy.AnalogData("container.spy")``
    is invalid).

    Examples
    --------
    Load all objects found in the spy-container "sessionName" (the extension ".spy"
    may or may not be provided)

    >>> objectDict = spy.load("sessionName")
    >>> # --> returns a dict with base-filenames as keys

    Load all :class:`syncopy.AnalogData` and :class:`syncopy.SpectralData` objects
    from the spy-container "sessionName"

    >>> objectDict = spy.load("sessionName.spy", dataclass=['analog', 'spectral'])

    Load a specific :class:`syncopy.AnalogData` object from the above spy-container

    >>> obj = spy.load("sessionName.spy/sessionName_someTag.analog")

    This is equivalent to

    >>> obj = spy.AnalogData("sessionName.spy/sessionName_someTag.analog")

    If the "sessionName" spy-container only contains one object with the tag
    "someTag", the above call is equivalent to

    >>> obj = spy.load("sessionName.spy", tag="someTag")

    If there are multiple objects of different types using the same tag "someTag",
    the above call can be further narrowed down to only load the requested
    :class:`syncopy.AnalogData` object

    >>> obj = spy.load("sessionName.spy", tag="someTag", dataclass="analog")

    See also
    --------
    syncopy.save : save syncopy object on disk
    """

    # Ensure `filename` is either a valid .spy container or data file: if `filename`
    # is a directory w/o '.spy' extension, append it
    if not isinstance(filename, str):
        raise SPYTypeError(filename, varname="filename", expected="str")
    if len(os.path.splitext(os.path.abspath(os.path.expanduser(filename)))[1]) == 0:
        filename += FILE_EXT["dir"]
    fileInfo = filename_parser(filename)

    # Turn the requested tag(s) into glob patterns. The patterns live in a fresh
    # list so that the sequence handed in by the caller is never modified.
    if tag is not None:
        tagList = [tag] if isinstance(tag, str) else tag
        array_parser(tagList, varname="tag", ntype=str)
        if fileInfo["filename"] is not None:
            raise SPYError("Only containers can be loaded with `tag` keyword!")
        patterns = ["*" + tagName + "*" for tagName in tagList]
    else:
        patterns = ["*"]

    # If `dataclass` was provided, format it for our needs (e.g. 'spike' -> ['.spike'])
    if dataclass is not None:
        if isinstance(dataclass, str):
            dataclass = [dataclass]
        array_parser(dataclass, varname="dataclass", ntype=str)
        dataclass = [dclass if dclass.startswith(".") else "." + dclass for dclass in dataclass]
        # Keep the order of `FILE_EXT["data"]` so that the search order (and thus
        # the order of the returned objects) does not depend on set iteration
        extensions = [ext for ext in FILE_EXT["data"] if ext in dataclass]
        if len(extensions) == 0:
            lgl = "extension(s) '" + "or '".join(ext + "' " for ext in FILE_EXT["data"])
            raise SPYValueError(legal=lgl, varname="dataclass", actual=str(dataclass))

    # Avoid any misunderstandings here...
    if not isinstance(checksum, bool):
        raise SPYTypeError(checksum, varname="checksum", expected="bool")

    # Abuse `AnalogData.mode`-setter to check `mode`
    spd.AnalogData().mode = mode

    # If `filename` points to a single file, just load it (after optionally
    # verifying that its extension is among the requested data classes)
    if fileInfo["filename"] is not None:
        if dataclass is not None:
            if os.path.splitext(fileInfo["filename"])[1] not in dataclass:
                lgl = "extension '" + "or '".join(dclass + "' " for dclass in dataclass)
                raise SPYValueError(legal=lgl, varname="filename", actual=fileInfo["filename"])
        return _load(filename, checksum, mode, out)

    # `filename` points to a spy container: `glob` what's inside
    if dataclass is None:
        extensions = FILE_EXT["data"]
    container = os.path.join(fileInfo["folder"], fileInfo["container"])
    fileList = []
    for ext in extensions:
        for pattern in patterns:
            fileList.extend(glob(os.path.join(container, pattern + ext)))

    # A file whose name matches more than one tag pattern is found repeatedly;
    # drop the duplicates (keeping first-seen order) so it is loaded only once
    fileList = list(dict.fromkeys(fileList))

    if len(fileList) == 0:
        fsloc = os.path.join(
            container,
            ""
            + "or ".join(pattern + " " for pattern in patterns)
            + "with extensions "
            + "or ".join(ext + " " for ext in extensions),
        )
        raise SPYIOError(fsloc, exists=False)

    # Exactly one hit: behave as if that file had been requested directly
    if len(fileList) == 1:
        return _load(fileList[0], checksum, mode, out)

    # Several hits: every file gets its own new object, `out` cannot be used
    if out is not None:
        msg = "When loading multiple objects, the `out` keyword is ignored"
        SPYWarning(msg)
    objectDict = {}
    for fname in fileList:
        obj = _load(fname, checksum, mode, None)
        objectDict[os.path.basename(obj.filename)] = obj
    return objectDict
def _load(filename, checksum, mode, out):
    """
    Local helper: load one Syncopy data file and its JSON info file

    Parameters
    ----------
    filename : str
        Name of the data file; the accompanying info file is expected at the
        same location with ``FILE_EXT["info"]`` appended
    checksum : bool
        If `True`, the hash of the data file is compared against the
        "file_checksum" entry of the info file
    mode : str
        Access mode assigned to the loaded object
    out : None or Syncopy data object
        If not `None`, this object is filled instead of creating a new one

    Returns
    -------
    obj : Syncopy data object or None
        The newly created object if `out` was `None`, otherwise `None` (the
        provided `out` has been filled in place)

    Raises
    ------
    SPYError
        If the info file has no "dataclass" entry, names an unknown data class,
        lacks a required field, or if the data file has no dataset named 'data'
    SPYValueError
        If `checksum` is `True` and the file hash does not match the info file
    """

    fileInfo = filename_parser(filename)
    hdfFile = os.path.join(fileInfo["folder"], fileInfo["filename"])
    jsonFile = hdfFile + FILE_EXT["info"]

    # Both files have to exist; the parsers raise otherwise
    io_parser(hdfFile, varname="hdfFile", isfile=True, exists=True)
    io_parser(jsonFile, varname="jsonFile", isfile=True, exists=True)

    with open(jsonFile, "r") as file:
        jsonDict = json.load(file)

    # Resolve the class to instantiate from the info file
    if "dataclass" not in jsonDict:
        raise SPYError("Info file {} does not contain a dataclass field".format(jsonFile))
    if hasattr(spd, jsonDict["dataclass"]):
        dataclass = getattr(spd, jsonDict["dataclass"])
    else:
        raise SPYError("Unknown data class {cls}".format(cls=jsonDict["dataclass"]))

    # Make sure all mandatory fields are present; entries of
    # `legacy_not_required` may be missing in files written by older versions
    requiredFields = tuple(startInfoDict.keys()) + dataclass._infoFileProperties
    for key in requiredFields:
        if key not in jsonDict and key not in legacy_not_required:
            raise SPYError(
                "Required field {field} for {cls} not in {file}".format(
                    field=key, cls=dataclass.__name__, file=jsonFile
                )
            )

    # FIXME: add version comparison (syncopy.__version__ vs jsonDict["_version"])

    # If wanted, perform checksum matching
    if checksum:
        hsh_msg = "hash = {hsh:s}"
        hsh = hash_file(hdfFile)
        if hsh != jsonDict["file_checksum"]:
            raise SPYValueError(
                legal=hsh_msg.format(hsh=jsonDict["file_checksum"]),
                varname=os.path.basename(hdfFile),
                actual=hsh_msg.format(hsh=hsh),
            )

    # Parsing is done, create new or check provided object
    dimord = jsonDict.pop("dimord")
    if out is not None:
        data_parser(out, varname="out", writable=True, dataclass=jsonDict["dataclass"])
        new_out = False
        out.dimord = dimord
    else:
        out = dataclass(dimord=dimord)
        new_out = True

    # Access data on disk (error checking is done by setters)
    out.mode = mode

    # If the JSON contains `_hdfFileDatasetProperties`, load all datasets listed in there. Otherwise, load the ones
    # already defined by `out._hdfFileDatasetProperties` and defined in the respective data class.
    # This is needed to load both new files with, and legacy files without the `_hdfFileDatasetProperties` in the JSON.
    json_hdfFileDatasetProperties = jsonDict.pop(
        "_hdfFileDatasetProperties", None
    )  # They may not be in there for legacy files, so allow None.
    if json_hdfFileDatasetProperties is not None:
        out._hdfFileDatasetProperties = tuple(
            json_hdfFileDatasetProperties
        )  # It's a list in the JSON, so convert to tuple.

    for datasetProperty in out._hdfFileDatasetProperties:
        targetProperty = datasetProperty if datasetProperty == "data" else "_" + datasetProperty
        try:
            # NOTE(review): the handle is deliberately left open here, the dataset
            # handed to `out` refers to it; each dataset gets its own handle as in
            # the original implementation - confirm before sharing a single one
            setattr(out, targetProperty, h5py.File(hdfFile, mode="r")[datasetProperty])
        except KeyError:
            if datasetProperty == "data":
                raise SPYError(
                    "Data file {file} does not contain a dataset named 'data'.".format(file=hdfFile)
                )
            else:
                spy.log(
                    f"Dataset '{datasetProperty}' not present in HDF5 file, cannot load it. Setting to None.",
                    level="DEBUG",
                )
                # It is fine if an extra dataset is not present in the file, e.g., the SpikeData waveform dataset is not present when set to None.
                setattr(out, targetProperty, None)

    # Abuse ``definetrial`` to set trial-related props. `[()]` reads the dataset
    # into memory, so the file handle can be closed right away.
    with h5py.File(hdfFile, mode="r") as h5File:
        trialdef = h5File["trialdefinition"][()]
    out.definetrial(trialdef)

    # Assign metadata
    for key in [
        prop for prop in dataclass._infoFileProperties if prop != "dimord" and prop in jsonDict
    ]:
        setattr(out, key, jsonDict[key])

    # Name of this function without underscores, used to align the log entry
    thisMethod = sys._getframe().f_code.co_name.replace("_", "")

    # Write log-entry
    msg = "Read files v. {ver:s} ".format(ver=jsonDict["_version"])
    msg += "{hdf:s}\n\t" + (len(msg) + len(thisMethod) + 2) * " " + "{json:s}"
    out.log = msg.format(hdf=hdfFile, json=jsonFile)

    # Happy breakdown
    return out if new_out else None