diff --git a/+io/+backend/+zarr2/+mw/ensureAvailable.m b/+io/+backend/+zarr2/+mw/ensureAvailable.m
new file mode 100644
index 000000000..1d64df675
--- /dev/null
+++ b/+io/+backend/+zarr2/+mw/ensureAvailable.m
@@ -0,0 +1,20 @@
+function ensureAvailable()
+% ensureAvailable - Validate that the MathWorks Zarr wrapper is on path.
+
+    persistent isValidated
+
+    if isequal(isValidated, true)
+        return
+    end
+
+    requiredFunctions = ["zarrinfo", "zarrread", "readZattrs"];
+    isMissing = arrayfun(@(name) exist(name, "file") == 0, requiredFunctions);
+
+    if any(isMissing)
+        error("NWB:Zarr2:DependencyMissing", ...
+            "The MathWorks Zarr wrapper is required on the MATLAB path. Missing function(s): %s", ...
+            strjoin(requiredFunctions(isMissing), ", "))
+    end
+
+    isValidated = true;
+end
diff --git a/+io/+backend/+zarr2/+mw/readArray.m b/+io/+backend/+zarr2/+mw/readArray.m
new file mode 100644
index 000000000..0ab424474
--- /dev/null
+++ b/+io/+backend/+zarr2/+mw/readArray.m
@@ -0,0 +1,52 @@
+function data = readArray(filepath, start, count, stride)
+% readArray - Read a Zarr array through the MathWorks wrapper.
+%
+%   data = readArray(filepath) reads the whole array stored at filepath.
+%
+%   data = readArray(filepath, start, count, stride) forwards the given
+%   selection to zarrread as its Start/Count/Stride options. Trailing
+%   arguments may be omitted, and any argument may be passed as [] to omit
+%   the corresponding option from the zarrread call.
+%
+%   Raises NWB:Zarr2:TensorStoreMissing when the wrapper fails with a Python
+%   exception whose message mentions `tensorstore`; every other error is
+%   rethrown unchanged.
+
+    % Default each optional argument independently, so that calling with
+    % only `start` (or `start` and `count`) does not leave the remaining
+    % selection variables undefined.
+    if nargin < 2
+        start = [];
+    end
+    if nargin < 3
+        count = [];
+    end
+    if nargin < 4
+        stride = [];
+    end
+
+    % Forward only the options that were actually supplied.
+    readOptions = {};
+    if ~isempty(start)
+        readOptions = [readOptions, {'Start', start}];
+    end
+    if ~isempty(count)
+        readOptions = [readOptions, {'Count', count}];
+    end
+    if ~isempty(stride)
+        readOptions = [readOptions, {'Stride', stride}];
+    end
+
+    io.backend.zarr2.mw.ensureAvailable()
+    try
+        data = zarrread(char(filepath), readOptions{:});
+    catch ME
+        if strcmp(ME.identifier, 'MATLAB:Python:PyException') ...
+                && contains(ME.message, 'tensorstore')
+            error("NWB:Zarr2:TensorStoreMissing", ...
+                "The MathWorks Zarr wrapper requires the python package `tensorstore` to be installed in the active MATLAB python environment.")
+        else
+            rethrow(ME)
+        end
+    end
+end
diff --git a/+io/+backend/+zarr2/+mw/readAttributes.m b/+io/+backend/+zarr2/+mw/readAttributes.m
new file mode 100644
index 000000000..5a1f6efd5
--- /dev/null
+++ b/+io/+backend/+zarr2/+mw/readAttributes.m
@@ -0,0 +1,6 @@
+function attributes = readAttributes(filepath)
+% readAttributes - Read .zattrs through the MathWorks wrapper.
+
+    io.backend.zarr2.mw.ensureAvailable()
+    attributes = readZattrs(char(filepath));
+end
diff --git a/+io/+backend/+zarr2/+mw/readInfo.m b/+io/+backend/+zarr2/+mw/readInfo.m
new file mode 100644
index 000000000..3fab600ad
--- /dev/null
+++ b/+io/+backend/+zarr2/+mw/readInfo.m
@@ -0,0 +1,6 @@
+function info = readInfo(filepath)
+% readInfo - Read .zgroup/.zarray metadata through the MathWorks wrapper.
+
+    io.backend.zarr2.mw.ensureAvailable()
+    info = zarrinfo(char(filepath));
+end
diff --git a/+io/+backend/+zarr2/@Zarr2LazyArray/Zarr2LazyArray.m b/+io/+backend/+zarr2/@Zarr2LazyArray/Zarr2LazyArray.m
new file mode 100644
index 000000000..a6736ce7b
--- /dev/null
+++ b/+io/+backend/+zarr2/@Zarr2LazyArray/Zarr2LazyArray.m
@@ -0,0 +1,277 @@
+classdef Zarr2LazyArray < io.backend.base.LazyArray
+% Zarr2LazyArray - Minimal Zarr v2-backed lazy dataset access implementation.
+ + properties (Access = private) + datasetInfo_ struct = struct.empty + end + + methods + function obj = Zarr2LazyArray(filename, datasetPath, dims, dataType, datasetInfo) + arguments + filename (1,1) string + datasetPath (1,1) string + dims double = [] + dataType = [] + datasetInfo struct = struct.empty + end + obj@io.backend.base.LazyArray(filename, datasetPath, dims, dataType); + obj.datasetInfo_ = datasetInfo; + end + + function refreshSizeInfo(obj) + datasetInfo = obj.readDatasetInfo(); + dims = obj.normalizeDims(datasetInfo.Dataspace.Size); + obj.setSizeInfo(dims, dims); + end + + function dataType = resolveDataType(obj) + datasetInfo = obj.readDatasetInfo(); + datasetDirectory = obj.resolveDatasetDirectory(); + dataType = io.internal.zarr2.getMatlabDataType(datasetDirectory, datasetInfo); + end + + function data = load_h5_style(obj, varargin) + if isempty(varargin) + data = obj.readAllData(); + return + end + + assert(length(varargin) ~= 1, 'NWB:DataStub:InvalidNumArguments',... 
+ 'calling load_h5_style with a single space id is no longer supported.'); + + start = varargin{1}; + count = varargin{2}; + if length(varargin) >= 3 + stride = varargin{3}; + else + stride = ones(size(start)); + end + data = obj.readPartialData(start, count, stride); + end + + function data = load_mat_style(obj, varargin) + if isempty(varargin) + data = obj.readAllData(); + return + end + + [isSupported, fullSelection] = obj.tryBuildRegularSelection(varargin); + if isSupported + [start, count, stride] = obj.selectionToReadParameters(fullSelection); + data = obj.readPartialData(start, count, stride); + if obj.isCompoundArray(data) + data = obj.convertCompoundDataToTable(data); + else + data = obj.applySelectionShape(data, varargin); + end + else + data = obj.readAllData(); + data = data(varargin{:}); + if obj.isCompoundArray(data) + data = obj.convertCompoundDataToTable(data); + end + end + end + end + + methods (Access = private) + function datasetInfo = readDatasetInfo(obj) + if ~isempty(obj.datasetInfo_) + datasetInfo = obj.datasetInfo_; + return + end + + reader = io.backend.zarr2.Zarr2Reader(obj.Filename); + datasetInfo = reader.readNodeInfo(obj.DatasetPath); + obj.datasetInfo_ = datasetInfo; + end + + function datasetDirectory = resolveDatasetDirectory(obj) + relativePath = regexprep(char(obj.DatasetPath), '^/', ''); + datasetDirectory = string(fullfile(obj.Filename, relativePath)); + end + + function data = readAllData(obj) + datasetInfo = obj.readDatasetInfo(); + datasetDirectory = obj.resolveDatasetDirectory(); + data = io.internal.zarr2.readDataset(datasetDirectory, datasetInfo); + end + + function data = readPartialData(obj, start, count, stride) + datasetDirectory = obj.resolveDatasetDirectory(); + if ~obj.supportsPartialRead() + data = obj.readAllData(); + selection = cell(1, length(start)); + for iDimension = 1:length(start) + if isinf(count(iDimension)) + stopIndex = obj.dims(iDimension); + else + stopIndex = start(iDimension) + (count(iDimension)-1) 
* stride(iDimension);
+                    end
+                    selection{iDimension} = start(iDimension):stride(iDimension):stopIndex;
+                end
+                data = data(selection{:});
+                return
+            end
+
+            % Resolve open-ended (Inf) counts per dimension only; finite
+            % counts requested by the caller are left untouched.
+            isOpenEnded = isinf(count);
+            if any(isOpenEnded)
+                remainingCount = floor((obj.dims - start) ./ stride) + 1;
+                count(isOpenEnded) = remainingCount(isOpenEnded);
+            end
+
+            [rawStart, rawCount, rawStride] = obj.toRawReadParameters(start, count, stride);
+            data = io.backend.zarr2.mw.readArray( ...
+                datasetDirectory, rawStart, rawCount, rawStride);
+            data = io.internal.zarr2.readDataset( ...
+                datasetDirectory, obj.readDatasetInfo(), data);
+        end
+
+        function tf = supportsPartialRead(obj)
+        % supportsPartialRead - True when a start/count/stride read is allowed.
+        %
+        %   Partial reads are disabled when the semantic datatype is the
+        %   text "object" (case-insensitive) or when the raw dtype reported
+        %   by the wrapper is "|O".
+            datasetInfo = obj.readDatasetInfo();
+            datasetDirectory = obj.resolveDatasetDirectory();
+            rawDatasetInfo = io.backend.zarr2.mw.readInfo(datasetDirectory);
+
+            % Group the text-type test explicitly: `&&` binds tighter than
+            % `||`, so without the parentheses every char datatype (for
+            % example 'float32') was classified as an object dataset.
+            semanticType = datasetInfo.Datatype;
+            isObjectSemanticType = (ischar(semanticType) || isstring(semanticType)) ...
+                && lower(string(semanticType)) == "object";
+            isObjectRawType = isfield(rawDatasetInfo, "dtype") ...
+                && obj.isObjectRawDtype(rawDatasetInfo.dtype);
+
+            tf = ~isObjectSemanticType && ~isObjectRawType;
+        end
+
+        function tf = isCompoundArray(obj, data)
+        % isCompoundArray - True when both the stored data type and data are structs.
+            tf = isstruct(obj.dataType) && isstruct(data);
+        end
+
+        function data = convertCompoundDataToTable(~, data)
+        % convertCompoundDataToTable - Convert a struct array to a table, one row per element.
+            data = struct2table(data(:));
+        end
+
+        function tf = isObjectRawDtype(~, rawDtype)
+        % isObjectRawDtype - True when rawDtype is the text "|O".
+            tf = (ischar(rawDtype) || isstring(rawDtype)) && strcmp(string(rawDtype), "|O");
+        end
+
+        function [isSupported, fullSelection] = tryBuildRegularSelection(obj, userSelection)
+        % tryBuildRegularSelection - Expand a MATLAB-style selection to one entry per dimension.
+        %
+        %   isSupported is false when the selection cannot be expressed as a
+        %   regular start/count/stride read; the caller then falls back to
+        %   reading the full dataset and indexing in memory.
+            dataDimensions = obj.dims;
+            isSupported = true;
+            fullSelection = cell(1, length(dataDimensions));
+
+            % A single empty selection is not handled here.
+            if isscalar(userSelection) && isempty(userSelection{1})
+                isSupported = false;
+                return
+            end
+
+            % A single non-char selection is not handled here either.
+            if isscalar(userSelection) && ~ischar(userSelection{1})
+                isSupported = false;
+                return
+            end
+
+            isDanglingGroup = ischar(userSelection{end});
+            for iDimension = 1:length(dataDimensions)
+                if iDimension > length(userSelection) && ~isDanglingGroup
+                    fullSelection{iDimension} = 1;
+                elseif (iDimension > length(userSelection) && isDanglingGroup) ...
+ || ischar(userSelection{iDimension}) + fullSelection{iDimension} = 1:dataDimensions(iDimension); + else + selection = userSelection{iDimension}; + if ~obj.isRegularAscendingSelection(selection) + isSupported = false; + return + end + fullSelection{iDimension} = selection; + end + end + end + + function tf = isRegularAscendingSelection(~, selection) + tf = isnumeric(selection) ... + && isreal(selection) ... + && all(isfinite(selection)) ... + && all(selection > 0) ... + && all(selection == floor(selection)); + if ~tf + return + end + if isscalar(selection) + return + end + + stepSizes = diff(selection); + tf = all(stepSizes > 0) && numel(unique(stepSizes)) == 1; + end + + function [start, count, stride] = selectionToReadParameters(~, selection) + start = zeros(1, numel(selection)); + count = zeros(1, numel(selection)); + stride = ones(1, numel(selection)); + + for iDimension = 1:numel(selection) + currentSelection = selection{iDimension}; + start(iDimension) = currentSelection(1); + count(iDimension) = numel(currentSelection); + if numel(currentSelection) > 1 + stride(iDimension) = currentSelection(2) - currentSelection(1); + end + end + end + + function [rawStart, rawCount, rawStride] = toRawReadParameters(obj, start, count, stride) + if isscalar(obj.dims) + rawStart = start; + rawCount = count; + rawStride = stride; + else + rawStart = fliplr(start); + rawCount = fliplr(count); + rawStride = fliplr(stride); + end + end + + function data = applySelectionShape(obj, data, userSelection) + expectedSize = obj.getExpectedSize(userSelection); + if isequal(size(data), expectedSize) + return + end + data = reshape(data, expectedSize); + end + + function expectedSize = getExpectedSize(obj, userSelection) + dataDimensions = obj.dims; + expectedSize = dataDimensions; + for iSelection = 1:length(userSelection) + if ~ischar(userSelection{iSelection}) + expectedSize(iSelection) = length(userSelection{iSelection}); + end + end + + if ischar(userSelection{end}) + 
selectedDimensionIndex = length(userSelection); + expectedSize = [expectedSize(1:(selectedDimensionIndex-1)), ... + prod(dataDimensions(selectedDimensionIndex:end))]; + else + expectedSize = expectedSize(1:length(userSelection)); + end + + if isscalar(userSelection) && isscalar(expectedSize) + if 1 < sum(1 < dataDimensions) + if ~ischar(userSelection{1}) && isrow(userSelection{1}) + expectedSize = [1 expectedSize]; + else + expectedSize = [expectedSize 1]; + end + else + if dataDimensions(1) == 1 + expectedSize = [1 expectedSize]; + else + expectedSize = [expectedSize 1]; + end + end + end + end + + function dims = normalizeDims(~, dims) + dims = double(dims); + if isempty(dims) || isscalar(dims) + return + end + dims = fliplr(dims); + end + end +end diff --git a/+io/+backend/+zarr2/Zarr2Reader.m b/+io/+backend/+zarr2/Zarr2Reader.m new file mode 100644 index 000000000..b22697d80 --- /dev/null +++ b/+io/+backend/+zarr2/Zarr2Reader.m @@ -0,0 +1,130 @@ +classdef Zarr2Reader < io.backend.base.Reader + % Zarr2Reader - Reader implementation for local consolidated Zarr v2 stores. + + properties (Access = private) + rootInfoCache = [] + nodeInfoMap = containers.Map('KeyType', 'char', 'ValueType', 'any') + end + + methods + function obj = Zarr2Reader(filename) + obj@io.backend.base.Reader(filename); + end + + function version = getSchemaVersion(obj) + attributes = io.backend.zarr2.mw.readAttributes(obj.Filename); + if isfield(attributes, "nwb_version") + version = string(attributes.nwb_version); + else + error("NWB:Zarr2Reader:MissingSchemaVersion", ... 
+ "The Zarr store `%s` does not define `nwb_version` in root .zattrs.", obj.Filename) + end + end + + function specLocation = getEmbeddedSpecLocation(obj) + attributes = io.backend.zarr2.mw.readAttributes(obj.Filename); + if isfield(attributes, "x_specloc") + specLocation = string(attributes.x_specloc); + elseif isfolder(fullfile(obj.Filename, "specifications")) + specLocation = "/specifications"; + else + specLocation = ""; + end + + if specLocation ~= "" && ~startsWith(specLocation, "/") + specLocation = "/" + specLocation; + end + end + + function node = readRootInfo(obj) + obj.ensureMetadataCache(); + node = obj.rootInfoCache; + end + + function node = readNodeInfo(obj, nodePath) + arguments + obj + nodePath (1,1) string + end + + obj.ensureMetadataCache(); + normalizedPath = obj.normalizeNodePath(nodePath); + if ~isKey(obj.nodeInfoMap, normalizedPath) + error("NWB:Zarr2Reader:NodeNotFound", ... + "Node `%s` was not found in `%s`.", normalizedPath, obj.Filename) + end + node = obj.nodeInfoMap(normalizedPath); + end + + function attributeValue = readAttributeValue(~, attributeInfo, ~) + if ischar(attributeInfo.Datatype) ... 
+ && strcmp(attributeInfo.Datatype, "object reference") + attributeValue = types.untyped.ObjectView(attributeInfo.Value.value.path); + else + attributeValue = attributeInfo.Value; + end + end + + function datasetValue = readDatasetValue(obj, datasetInfo, datasetPath) + dataDimensions = obj.getDatasetDims(datasetInfo); + semanticType = obj.getSemanticType(datasetInfo); + if isempty(dataDimensions) || prod(dataDimensions) == 1 || semanticType == "object" + datasetDirectory = obj.resolveDatasetDirectory(datasetPath); + datasetValue = io.internal.zarr2.readDataset(datasetDirectory, datasetInfo); + elseif any(dataDimensions == 0) + datasetValue = []; + else + datasetDirectory = obj.resolveDatasetDirectory(datasetPath); + matlabDataType = io.internal.zarr2.getMatlabDataType(datasetDirectory, datasetInfo); + lazyArray = io.backend.zarr2.Zarr2LazyArray(... + obj.Filename, datasetPath, dataDimensions, matlabDataType, datasetInfo); + datasetValue = types.untyped.DataStub(... + obj.Filename, datasetPath, [], [], lazyArray); + end + end + end + + methods (Access = private) + function ensureMetadataCache(obj) + if isempty(obj.rootInfoCache) + [obj.rootInfoCache, obj.nodeInfoMap] = io.internal.zarr2.readConsolidatedInfo(obj.Filename); + end + end + + function datasetDirectory = resolveDatasetDirectory(obj, datasetPath) + relativePath = regexprep(char(datasetPath), '^/', ''); + datasetDirectory = string(fullfile(obj.Filename, relativePath)); + end + + function normalizedPath = normalizeNodePath(~, nodePath) + normalizedPath = char(nodePath); + if isempty(normalizedPath) + normalizedPath = '/'; + elseif normalizedPath(1) ~= '/' + normalizedPath = ['/' normalizedPath]; + end + end + + function semanticType = getSemanticType(~, datasetInfo) + semanticType = ""; + if isfield(datasetInfo, "Datatype") ... 
+ && (ischar(datasetInfo.Datatype) || isstring(datasetInfo.Datatype)) + semanticType = lower(string(datasetInfo.Datatype)); + end + end + + function dataDimensions = getDatasetDims(~, datasetInfo) + if isfield(datasetInfo, "Dataspace") && isfield(datasetInfo.Dataspace, "Size") + dataDimensions = double(datasetInfo.Dataspace.Size); + else + dataDimensions = []; + end + + if isempty(dataDimensions) || isscalar(dataDimensions) + return + end + + dataDimensions = fliplr(dataDimensions); + end + end +end diff --git a/+io/+backend/BackendFactory.m b/+io/+backend/BackendFactory.m index 09c2c87fa..91200402a 100644 --- a/+io/+backend/BackendFactory.m +++ b/+io/+backend/BackendFactory.m @@ -35,6 +35,8 @@ case "auto" if io.backend.BackendFactory.isHDF5File(filename) reader = io.backend.hdf5.HDF5Reader(filename); + elseif io.backend.BackendFactory.isZarrDirectory(filename) + reader = io.backend.zarr2.Zarr2Reader(filename); else error("NWB:BackendFactory:UnsupportedFormat", ... "No supported reader found for `%s`.", filename) @@ -45,6 +47,12 @@ "`%s` is not a valid HDF5 file.", filename) end reader = io.backend.hdf5.HDF5Reader(filename); + case "zarr" + if ~io.backend.BackendFactory.isZarrDirectory(filename) + error("NWB:BackendFactory:InvalidZarr", ... + "`%s` is not a supported local Zarr directory store.", filename) + end + reader = io.backend.zarr2.Zarr2Reader(filename); otherwise error("NWB:BackendFactory:UnsupportedBackend", ... "Unsupported backend `%s`.", storageBackend) @@ -66,6 +74,8 @@ case "auto" if io.backend.BackendFactory.isHDF5File(filename) lazyArray = io.backend.hdf5.HDF5LazyArray(filename, datasetPath, dims, dataType); + elseif io.backend.BackendFactory.isZarrDirectory(filename) + lazyArray = io.backend.zarr2.Zarr2LazyArray(filename, datasetPath, dims, dataType); else error("NWB:BackendFactory:UnsupportedFormat", ... 
"No supported lazy array backend found for `%s`.", filename) @@ -76,6 +86,12 @@ "`%s` is not a valid HDF5 file.", filename) end lazyArray = io.backend.hdf5.HDF5LazyArray(filename, datasetPath, dims, dataType); + case "zarr" + if ~io.backend.BackendFactory.isZarrDirectory(filename) + error("NWB:BackendFactory:InvalidZarr", ... + "`%s` is not a supported local Zarr directory store.", filename) + end + lazyArray = io.backend.zarr2.Zarr2LazyArray(filename, datasetPath, dims, dataType); otherwise error("NWB:BackendFactory:UnsupportedBackend", ... "Unsupported backend `%s`.", storageBackend) @@ -86,6 +102,8 @@ storageBackend = lower(string(storageBackend)); if storageBackend == "h5" storageBackend = "hdf5"; + elseif storageBackend == "zarr2" + storageBackend = "zarr"; end end @@ -94,7 +112,7 @@ filename (1,1) string end - tf = false; + tf = false; if isfile(filename) try fid = H5F.open(filename, "H5F_ACC_RDONLY", "H5P_DEFAULT"); @@ -105,5 +123,23 @@ end end end + + function tf = isZarrDirectory(filename) + arguments + filename (1,1) string + end + + tf = false; + if startsWith(filename, "s3://", "IgnoreCase", true) || ~isfolder(filename) + return + end + + if ~endsWith(filename, ".zarr", "IgnoreCase", true) + return + end + + tf = isfile(fullfile(filename, ".zgroup")) ... + || isfile(fullfile(filename, ".zmetadata")); + end end end diff --git a/+io/+internal/+zarr2/getMatlabDataType.m b/+io/+internal/+zarr2/getMatlabDataType.m new file mode 100644 index 000000000..9bc38843f --- /dev/null +++ b/+io/+internal/+zarr2/getMatlabDataType.m @@ -0,0 +1,188 @@ +function matlabDataType = getMatlabDataType(datasetDirectory, datasetInfo) +% getMatlabDataType - Resolve a MATLAB-facing datatype for a Zarr dataset. 
+
+    arguments
+        datasetDirectory (1,1) string
+        datasetInfo (1,1) struct
+    end
+
+    datatype = [];
+    if isfield(datasetInfo, "Datatype")
+        datatype = datasetInfo.Datatype;
+    end
+
+    if isCompoundDatatype(datatype)
+        matlabDataType = resolveCompoundTypeDescriptor(datatype);
+    else
+        semanticType = lower(string(datatype));
+        if isempty(semanticType)
+            % string([]) is a 0x0 string, which `switch` rejects; treat a
+            % missing datatype as the empty semantic type instead.
+            semanticType = "";
+        end
+        switch semanticType
+            case {"float16", "float32", "float64"}
+                matlabDataType = mapNumericType(semanticType);
+            case {"int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"}
+                matlabDataType = char(semanticType);
+            case {"bool", "logical"}
+                matlabDataType = "logical";
+            case {"object"}
+                matlabDataType = "types.untyped.ObjectView";
+            otherwise
+                matlabDataType = resolveFromRawDtype(datasetDirectory, semanticType);
+        end
+    end
+
+    % Fall back to the class of the decoded data when no type name could be
+    % resolved. The helpers signal "unresolved" with the scalar string "",
+    % for which isempty() is false, so that case is tested explicitly.
+    isUnresolved = isempty(matlabDataType) ...
+        || (isstring(matlabDataType) && all(strlength(matlabDataType) == 0));
+    if isUnresolved
+        matlabDataType = class(io.internal.zarr2.readDataset(datasetDirectory, datasetInfo));
+    end
+
+    if isstring(matlabDataType)
+        matlabDataType = char(matlabDataType);
+    end
+end
+
+function tf = isCompoundDatatype(datatype)
+% isCompoundDatatype - True when the datatype metadata is a struct or a cell.
+    tf = isstruct(datatype) || iscell(datatype);
+end
+
+function typeDescriptor = resolveCompoundTypeDescriptor(datatype)
+% resolveCompoundTypeDescriptor - Build a struct mapping field name to MATLAB type name.
+    fieldSpecs = normalizeCompoundFieldSpecs(datatype);
+    typeDescriptor = struct();
+
+    for iField = 1:numel(fieldSpecs)
+        fieldSpec = fieldSpecs(iField);
+        fieldName = string(fieldSpec.name);
+        assert(strlength(fieldName) > 0, ...
+            "NWB:Zarr2:InvalidCompoundField", ...
+            "Compound datatype fields must define a non-empty name.")
+
+        storageType = getCompoundFieldStorageType(fieldSpec);
+        matlabFieldType = mapCompoundFieldType(storageType);
+        typeDescriptor.(char(fieldName)) = char(matlabFieldType);
+    end
+end
+
+function fieldSpecs = normalizeCompoundFieldSpecs(datatype)
+% normalizeCompoundFieldSpecs - Return compound field specs as a struct array.
+%
+%   Struct input is returned unchanged. Cell input must hold 2- or
+%   3-element cells whose first two entries are the field name and dtype.
+    if isstruct(datatype)
+        fieldSpecs = datatype;
+        return
+    end
+
+    assert(iscell(datatype), ...
+        "NWB:Zarr2:InvalidCompoundType", ...
+        "Unsupported compound datatype metadata format.")
+
+    fieldSpecs = repmat(struct("name", "", "dtype", ""), 1, numel(datatype));
+    for iField = 1:numel(datatype)
+        fieldSpec = datatype{iField};
+        assert(iscell(fieldSpec) && ismember(numel(fieldSpec), [2, 3]), ...
+            "NWB:Zarr2:InvalidCompoundField", ...
+            "Compound datatype metadata must use 2- or 3-element field definitions.")
+        fieldSpecs(iField).name = fieldSpec{1};
+        fieldSpecs(iField).dtype = fieldSpec{2};
+    end
+end
+
+function storageType = getCompoundFieldStorageType(fieldSpec)
+% getCompoundFieldStorageType - Read the storage type from `dtype`, `type` or `storageType`.
+    if isfield(fieldSpec, "dtype")
+        storageType = fieldSpec.dtype;
+    elseif isfield(fieldSpec, "type")
+        storageType = fieldSpec.type;
+    elseif isfield(fieldSpec, "storageType")
+        storageType = fieldSpec.storageType;
+    else
+        error("NWB:Zarr2:InvalidCompoundField", ...
+            "Compound datatype field `%s` does not define a supported storage type.", ...
+            string(fieldSpec.name))
+    end
+end
+
+function matlabFieldType = mapCompoundFieldType(storageType)
+% mapCompoundFieldType - Map one compound field's storage type to a MATLAB type name.
+%
+%   Accepts semantic names ("float32", "int8", "bool", "object", ...),
+%   text typestrs (U/S prefixes) and fixed-width typestrs such as "<i4".
+%   Raises NWB:Zarr2:UnsupportedCompoundFieldType for anything else.
+    assert(~isstruct(storageType) && ~iscell(storageType), ...
+        "NWB:Zarr2:UnsupportedCompoundFieldType", ...
+        "Nested or non-scalar compound field types are not supported.")
+
+    fieldType = string(storageType);
+    normalizedFieldType = lower(fieldType);
+
+    switch normalizedFieldType
+        case {"float16", "float32", "float64"}
+            matlabFieldType = mapNumericType(normalizedFieldType);
+            return
+        case {"int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"}
+            matlabFieldType = normalizedFieldType;
+            return
+        case {"bool", "logical"}
+            matlabFieldType = "logical";
+            return
+        case {"object", "|o"}
+            matlabFieldType = "types.untyped.ObjectView";
+            return
+    end
+
+    % Text typestrs; unicode typestrs normally carry a byte-order prefix.
+    if any(startsWith(fieldType, ["U", "|U", "<U", ">U", "S", "|S"]))
+        matlabFieldType = "char";
+        return
+    end
+
+    matlabFieldType = mapRawTypestr(fieldType);
+    assert(strlength(matlabFieldType) > 0, ...
+        "NWB:Zarr2:UnsupportedCompoundFieldType", ...
+        "Unsupported compound field type `%s`.", fieldType)
+end
+
+function matlabDataType = resolveFromRawDtype(datasetDirectory, semanticType)
+% resolveFromRawDtype - Derive a MATLAB type name from the raw `dtype` metadata.
+%
+%   Returns "" when the wrapper reports no `dtype` field or the dtype is not
+%   a recognised fixed-width typestr. For "|O" the semantic type is returned
+%   when it is non-empty, otherwise "cell".
+    matlabDataType = "";
+
+    rawDatasetInfo = io.backend.zarr2.mw.readInfo(datasetDirectory);
+    if ~isfield(rawDatasetInfo, "dtype")
+        return
+    end
+
+    rawType = string(rawDatasetInfo.dtype);
+    if rawType == "|O"
+        if semanticType == ""
+            matlabDataType = "cell";
+        else
+            matlabDataType = semanticType;
+        end
+        return
+    end
+
+    matlabDataType = mapRawTypestr(rawType);
+end
+
+function matlabDataType = mapRawTypestr(rawType)
+% mapRawTypestr - Map a fixed-width typestr such as "<f4" to a MATLAB type name.
+%
+%   Returns "" when rawType does not end in a type code (f, i, u, b)
+%   followed by digits, or when the resulting width has no MATLAB type.
+    matlabDataType = "";
+
+    token = regexp(char(rawType), '(?<code>[fiub])(?<width>\d+)$', 'names', 'once');
+    if isempty(token)
+        return
+    end
+
+    % The numeric suffix of a typestr is the item size in bytes ("<i4" is a
+    % 4-byte, i.e. 32-bit, integer), so convert it to bits before naming.
+    bitWidth = 8 * str2double(token.width);
+
+    switch token.code
+        case 'f'
+            matlabDataType = mapNumericType("float" + bitWidth);
+        case {'i', 'u'}
+            if ismember(bitWidth, [8, 16, 32, 64])
+                if token.code == 'i'
+                    matlabDataType = "int" + bitWidth;
+                else
+                    matlabDataType = "uint" + bitWidth;
+                end
+            end
+        case 'b'
+            matlabDataType = "logical";
+    end
+end
+
+function matlabDataType = mapNumericType(semanticType)
+% mapNumericType - Map "float16"/"float32"/"float64" to "half"/"single"/"double".
+%   Returns "" for any other input.
+    switch semanticType
+        case "float16"
+            matlabDataType = "half";
+        case "float32"
+            matlabDataType = "single";
+        case "float64"
+            matlabDataType = "double";
+        otherwise
+            matlabDataType = "";
+    end
+end
diff --git a/+io/+internal/+zarr2/readConsolidatedInfo.m b/+io/+internal/+zarr2/readConsolidatedInfo.m
new file mode 100644
index 000000000..8c1e94caf
--- /dev/null
+++ b/+io/+internal/+zarr2/readConsolidatedInfo.m
@@ -0,0 +1,444 @@
+function [rootInfo, nodeInfoMap] = readConsolidatedInfo(zarrFile)
+% readConsolidatedInfo - Build h5info-like node descriptors from .zmetadata.
+ + [metadata, keyMap] = readMetadata(zarrFile); + originalKeyMap = containers.Map(values(keyMap), keys(keyMap)); + + rootInfo = initGroupStruct(); + rootInfo.Name = '/'; + rootInfo.Filename = char(zarrFile); + + rootAttributes = getAttributes(metadata, originalKeyMap, '/'); + rootInfo.Attributes = createAttributesStructure(rootAttributes); + rootInfo.Links = createLinksStructure(rootAttributes); + + [groupPaths, datasetPaths] = processKeys(keyMap); + groupStructMap = containers.Map('KeyType', 'char', 'ValueType', 'any'); + + for iGroup = 1:numel(groupPaths) + groupPath = groupPaths{iGroup}; + groupStructMap(groupPath) = createGroupStructure(groupPath, metadata, originalKeyMap); + end + + for iDataset = 1:numel(datasetPaths) + datasetPath = datasetPaths{iDataset}; + datasetStruct = createDatasetStructure(datasetPath, metadata, originalKeyMap); + parentPath = getParentPath(datasetPath); + + if strcmp(parentPath, '/') + rootInfo.Datasets(end+1) = datasetStruct; %#ok + else + parentGroup = groupStructMap(parentPath); + parentGroup.Datasets(end+1) = datasetStruct; + groupStructMap(parentPath) = parentGroup; + end + end + + if ~isempty(groupPaths) + groupDepths = cellfun(@(path) numel(strfind(path, '/')), groupPaths); %#ok + [~, sortOrder] = sort(groupDepths, 'descend'); + groupPaths = groupPaths(sortOrder); + end + + for iGroup = 1:numel(groupPaths) + groupPath = groupPaths{iGroup}; + parentPath = getParentPath(groupPath); + groupStruct = groupStructMap(groupPath); + + if strcmp(parentPath, '/') + rootInfo.Groups(end+1) = groupStruct; %#ok + else + parentGroup = groupStructMap(parentPath); + parentGroup.Groups(end+1) = groupStruct; + groupStructMap(parentPath) = parentGroup; + end + end + + specificationPath = fullfile(zarrFile, 'specifications'); + if isfolder(specificationPath) ... 
+ && ~any(strcmp({rootInfo.Groups.Name}, '/specifications')) + rootInfo.Groups(end+1) = buildFilesystemGroup(zarrFile, '/specifications'); %#ok + end + + nodeInfoMap = containers.Map('KeyType', 'char', 'ValueType', 'any'); + addGroupNode(rootInfo) + + function addGroupNode(groupInfo) + nodeInfoMap(groupInfo.Name) = groupInfo; + + for iDataset = 1:numel(groupInfo.Datasets) + datasetInfo = groupInfo.Datasets(iDataset); + datasetPath = joinNodePath(groupInfo.Name, datasetInfo.Name); + nodeInfoMap(datasetPath) = datasetInfo; + end + + for iChildGroup = 1:numel(groupInfo.Groups) + addGroupNode(groupInfo.Groups(iChildGroup)) + end + end +end + +function groupInfo = buildFilesystemGroup(zarrFile, groupPath) + groupInfo = initGroupStruct(); + groupInfo.Name = groupPath; + + groupDirectory = fullfile(zarrFile, stripLeadingSlash(groupPath)); + attributes = []; + if isfile(fullfile(groupDirectory, '.zattrs')) + attributes = io.backend.zarr2.mw.readAttributes(groupDirectory); + end + groupInfo.Attributes = createAttributesStructure(attributes); + groupInfo.Links = createLinksStructure(attributes); + + directoryEntries = dir(groupDirectory); + directoryEntries = directoryEntries([directoryEntries.isdir]); + directoryEntries = directoryEntries(~startsWith({directoryEntries.name}, '.')); + + for iEntry = 1:numel(directoryEntries) + entryName = directoryEntries(iEntry).name; + entryPath = joinNodePath(groupPath, entryName); + entryDirectory = fullfile(groupDirectory, entryName); + + if isfile(fullfile(entryDirectory, '.zgroup')) + groupInfo.Groups(end+1) = buildFilesystemGroup(zarrFile, entryPath); %#ok + elseif isfile(fullfile(entryDirectory, '.zarray')) + groupInfo.Datasets(end+1) = buildFilesystemDataset(zarrFile, entryPath); %#ok + end + end +end + +function datasetInfo = buildFilesystemDataset(zarrFile, datasetPath) + datasetInfo = initDatasetStruct(); + datasetInfo.Name = getLeafName(datasetPath); + + datasetDirectory = fullfile(zarrFile, stripLeadingSlash(datasetPath)); + 
arrayMeta = io.backend.zarr2.mw.readInfo(datasetDirectory); + datasetInfo.Datatype = getArrayDatatype(arrayMeta); + datasetInfo.Dataspace = convertZarrDataspaceToH5(arrayMeta); + datasetInfo.ChunkSize = getOptionalField(arrayMeta, 'chunks', []); + datasetInfo.FillValue = getOptionalField(arrayMeta, 'fill_value', []); + datasetInfo.Filters = convertZarrFiltersToH5(arrayMeta); + + attributes = []; + if isfile(fullfile(datasetDirectory, '.zattrs')) + attributes = io.backend.zarr2.mw.readAttributes(datasetDirectory); + if isfield(attributes, 'zarr_dtype') + datasetInfo.Datatype = attributes.zarr_dtype; + end + end + datasetInfo.Attributes = createAttributesStructure(attributes); +end + +function [metadata, keyMap] = readMetadata(zarrFile) + metadataFile = fullfile(zarrFile, '.zmetadata'); + if ~isfile(metadataFile) + error("NWB:Zarr2:MissingConsolidatedMetadata", ... + "No .zmetadata file found in `%s`.", zarrFile) + end + + zmetadata = jsondecode(fileread(metadataFile)); + if ~isfield(zmetadata, 'metadata') + error("NWB:Zarr2:InvalidConsolidatedMetadata", ... + "The .zmetadata file in `%s` does not contain a `metadata` field.", zarrFile) + end + + metadata = zmetadata.metadata; + originalKeys = extractMetadataKeys(metadataFile); + fieldNames = fieldnames(metadata); + assert(numel(originalKeys) == numel(fieldNames), ... + 'NWB:Zarr2:MetadataKeyMismatch', ... 
+ 'Unable to align decoded .zmetadata fields with original keys.') + + keyMap = containers.Map(fieldNames, originalKeys); +end + +function groupStruct = createGroupStructure(groupPath, metadata, originalKeyMap) + groupStruct = initGroupStruct(); + groupStruct.Name = groupPath; + + attributes = getAttributes(metadata, originalKeyMap, groupPath); + groupStruct.Attributes = createAttributesStructure(attributes); + groupStruct.Links = createLinksStructure(attributes); +end + +function datasetStruct = createDatasetStructure(datasetPath, metadata, originalKeyMap) + datasetStruct = initDatasetStruct(); + datasetStruct.Name = getLeafName(datasetPath); + + arrayKey = sprintf('%s/.zarray', stripLeadingSlash(datasetPath)); + if ~isKey(originalKeyMap, arrayKey) + error("NWB:Zarr2:DatasetMetadataMissing", ... + "No .zarray metadata found for dataset `%s`.", datasetPath) + end + + arrayMeta = metadata.(originalKeyMap(arrayKey)); + datasetStruct.Datatype = getArrayDatatype(arrayMeta); + datasetStruct.Dataspace = convertZarrDataspaceToH5(arrayMeta); + datasetStruct.ChunkSize = getOptionalField(arrayMeta, 'chunks', []); + datasetStruct.FillValue = getOptionalField(arrayMeta, 'fill_value', []); + datasetStruct.Filters = convertZarrFiltersToH5(arrayMeta); + + attributes = getAttributes(metadata, originalKeyMap, datasetPath); + if ~isempty(attributes) && isfield(attributes, 'zarr_dtype') + datasetStruct.Datatype = attributes.zarr_dtype; + end + datasetStruct.Attributes = createAttributesStructure(attributes); +end + +function attributes = getAttributes(metadata, originalKeyMap, elementPath) + if strcmp(elementPath, '/') + attributeKey = '.zattrs'; + else + attributeKey = sprintf('%s/.zattrs', stripLeadingSlash(elementPath)); + end + + if ~isKey(originalKeyMap, attributeKey) + attributes = []; + return + end + + attributes = metadata.(originalKeyMap(attributeKey)); +end + +function [groupPaths, datasetPaths] = processKeys(keyMap) + originalKeys = values(keyMap); + groupPaths = {}; + 
function attributes = createAttributesStructure(zarrAttributes)
% createAttributesStructure - Convert decoded Zarr attributes to attribute info structs.
%
%   Input:
%     zarrAttributes - struct whose fields are the decoded attributes (may be empty).
%   Output:
%     attributes - struct array with fields Name, Datatype, Dataspace and Value.
%
%   Field names found in getSpecialKeysMap are mapped back to their original
%   key, and the reserved keys "zarr_link", "zarr_dtype" and
%   "_ARRAY_DIMENSIONS" are skipped.

    attributes = emptyAttributeStruct();
    if isempty(zarrAttributes)
        return
    end

    rawAttributeNames = fieldnames(zarrAttributes);
    specialKeyMap = getSpecialKeysMap();
    reservedNames = ["zarr_link", "zarr_dtype", "_ARRAY_DIMENSIONS"];

    for iAttribute = 1:numel(rawAttributeNames)
        rawName = rawAttributeNames{iAttribute};
        if isKey(specialKeyMap, rawName)
            attributeName = specialKeyMap(rawName);
        else
            attributeName = rawName;
        end

        if any(strcmp(attributeName, reservedNames))
            continue
        end

        attribute = initAttributeStruct();
        attribute.Name = attributeName;
        attribute.Value = zarrAttributes.(rawName);

        % The scalar check matters: on a struct array, `Value.zarr_dtype`
        % expands to a comma-separated list and strcmp would error.
        if isstruct(attribute.Value) && isscalar(attribute.Value) ...
                && isfield(attribute.Value, 'zarr_dtype') ...
                && strcmp(attribute.Value.zarr_dtype, 'object')
            attribute.Datatype = 'object reference';
        else
            attribute.Datatype = [];
        end
        attribute.Dataspace = [];
        attributes(end+1) = attribute; %#ok<AGROW>
    end
end

function links = createLinksStructure(zarrAttributes)
% createLinksStructure - Build link info structs from the `zarr_link` attribute.
%
%   Returns an empty link struct array when zarrAttributes is empty or has
%   no `zarr_link` field. A link whose source is '.' becomes a 'soft link'
%   with Value {path}; any other source becomes an 'external link' with
%   Value {source, path}.

    links = emptyLinkStruct();
    if isempty(zarrAttributes) || ~isfield(zarrAttributes, 'zarr_link')
        return
    end

    zarrLinks = zarrAttributes.zarr_link;
    if ~iscell(zarrLinks)
        % A struct array is split so both representations iterate alike.
        zarrLinks = num2cell(zarrLinks);
    end

    for iLink = 1:numel(zarrLinks)
        zarrLink = zarrLinks{iLink};
        link = initLinkStruct();
        link.Name = zarrLink.name;
        if strcmp(zarrLink.source, '.')
            link.Type = 'soft link';
            link.Value = {zarrLink.path};
        else
            link.Type = 'external link';
            link.Value = {zarrLink.source, zarrLink.path};
        end
        links(end+1) = link; %#ok<AGROW>
    end
end

function dataspace = convertZarrDataspaceToH5(arrayMeta)
% convertZarrDataspaceToH5 - Describe an array's shape as a dataspace struct.
%
%   Without a `shape` field the result has empty Size/MaxSize and Type
%   'unknown'. Otherwise Size and MaxSize are the shape as a double row
%   vector and Type is 'simple'.

    dataspace = struct('Size', [], 'MaxSize', [], 'Type', 'unknown');
    if ~isfield(arrayMeta, 'shape')
        return
    end

    shape = double(arrayMeta.shape(:)');
    dataspace.Size = shape;
    dataspace.MaxSize = shape;
    dataspace.Type = 'simple';
end

function filters = convertZarrFiltersToH5(arrayMeta)
% convertZarrFiltersToH5 - List compressor and filters as Name/Parameters structs.
%
%   The compressor (if present and non-empty) comes first, followed by each
%   entry of `filters`. Name is the entry's `id` field, or 'unknown' when
%   that field is absent; Parameters is the entry itself.

    filters = struct('Name', {}, 'Parameters', {});

    if isfield(arrayMeta, 'compressor') && ~isempty(arrayMeta.compressor)
        filter = struct();
        filter.Name = getOptionalField(arrayMeta.compressor, 'id', 'unknown');
        filter.Parameters = arrayMeta.compressor;
        filters(end+1) = filter;
    end

    if isfield(arrayMeta, 'filters') && ~isempty(arrayMeta.filters)
        zarrFilters = arrayMeta.filters;
        if ~iscell(zarrFilters)
            zarrFilters = num2cell(zarrFilters);
        end

        for iFilter = 1:numel(zarrFilters)
            filter = struct();
            filter.Name = getOptionalField(zarrFilters{iFilter}, 'id', 'unknown');
            filter.Parameters = zarrFilters{iFilter};
            filters(end+1) = filter; %#ok<AGROW>
        end
    end
end

function keys = extractMetadataKeys(metadataFile)
% extractMetadataKeys - List the direct keys of the `metadata` object in a file.
%
%   Input:
%     metadataFile - path of the consolidated metadata file to scan.
%   Output:
%     keys - 1xN cell array of char with the raw (unescaped as written) key
%            text of every direct member of the `metadata` object, in file
%            order.
%   Raises NWB:Zarr2:InvalidConsolidatedMetadata when no `metadata` object
%   is found.
%
%   The text is scanned once, character by character. Braces are only
%   counted outside of JSON string literals, so attribute values containing
%   `{` or `}` cannot shift the nesting depth.

    metadataText = fileread(metadataFile);
    metadataStart = regexp(metadataText, '"metadata"\s*:\s*\{', 'end', 'once');
    if isempty(metadataStart)
        error("NWB:Zarr2:InvalidConsolidatedMetadata", ...
            "The .zmetadata file `%s` does not contain a `metadata` object.", metadataFile)
    end

    keys = {};
    textLength = length(metadataText);
    depth = 1;            % just inside the opening brace of `metadata`
    inString = false;
    isEscaped = false;
    stringStart = 0;
    index = metadataStart + 1;

    while depth > 0 && index <= textLength
        currentCharacter = metadataText(index);
        if inString
            if isEscaped
                % Character after a backslash never terminates the string.
                isEscaped = false;
            elseif currentCharacter == '\'
                isEscaped = true;
            elseif currentCharacter == '"'
                inString = false;
                if depth == 1 && index > stringStart + 1
                    % A non-empty string at depth 1 is a key only when the
                    % next non-whitespace character is a colon.
                    lookahead = index + 1;
                    while lookahead <= textLength && isspace(metadataText(lookahead))
                        lookahead = lookahead + 1;
                    end
                    if lookahead <= textLength && metadataText(lookahead) == ':'
                        keys{end+1} = metadataText(stringStart+1:index-1); %#ok<AGROW>
                    end
                end
            end
        else
            switch currentCharacter
                case '"'
                    inString = true;
                    stringStart = index;
                case '{'
                    depth = depth + 1;
                case '}'
                    depth = depth - 1;
            end
        end
        index = index + 1;
    end
end

function specialKeyMap = getSpecialKeysMap()
% getSpecialKeysMap - Map sanitized field names back to their original keys.

    specialKeyMap = containers.Map();
    specialKeyMap('x_specloc') = '.specloc';
    specialKeyMap('x_ARRAY_DIMENSIONS') = '_ARRAY_DIMENSIONS';
end

function path = getParentPath(nodePath)
% getParentPath - Return the parent of a '/'-separated node path as char.
%
%   Paths with at most one slash have the root '/' as parent. Accepts char
%   or string input, like the sibling path helpers.

    nodePath = char(nodePath);
    slashIndices = strfind(nodePath, '/');
    if numel(slashIndices) <= 1
        path = '/';
    else
        path = nodePath(1:slashIndices(end)-1);
    end
end

function leafName = getLeafName(nodePath)
% getLeafName - Return the text after the last '/' of a node path as char.

    pathParts = split(string(nodePath), "/");
    leafName = char(pathParts(end));
end

function joinedPath = joinNodePath(parentPath, childName)
% joinNodePath - Join a parent path and a child name with a single '/'.
%
%   Inputs are normalized to char so that string arguments are concatenated
%   into one path rather than into a string array.

    parentPath = char(parentPath);
    childName = char(childName);
    if strcmp(parentPath, '/')
        joinedPath = ['/' childName];
    else
        joinedPath = [parentPath '/' childName];
    end
end

function value = getOptionalField(structure, fieldName, defaultValue)
% getOptionalField - Return structure.(fieldName), or defaultValue if absent.

    if isfield(structure, fieldName)
        value = structure.(fieldName);
    else
        value = defaultValue;
    end
end

function strippedPath = stripLeadingSlash(nodePath)
% stripLeadingSlash - Remove one leading '/' from a node path; returns char.

    strippedPath = regexprep(char(nodePath), '^/', '');
end

function datatype = getArrayDatatype(arrayMeta)
% getArrayDatatype - Return arrayMeta.dtype, or [] when there is no dtype field.

    datatype = getOptionalField(arrayMeta, 'dtype', []);
end

function groupStruct = initGroupStruct()
% initGroupStruct - Scalar group info struct with empty children.

    groupStruct = struct('Name', '', 'Filename', '', ...
        'Groups', emptyGroupStruct(), ...
        'Datasets', emptyDatasetStruct(), ...
        'Links', emptyLinkStruct(), ...
        'Attributes', emptyAttributeStruct());
end

function datasetStruct = initDatasetStruct()
% initDatasetStruct - Scalar dataset info struct with default field values.

    datasetStruct = struct('Name', '', 'Datatype', [], ...
        'Dataspace', struct('Size', [], 'MaxSize', [], 'Type', 'unknown'), ...
        'ChunkSize', [], 'FillValue', [], ...
        'Filters', struct('Name', {}, 'Parameters', {}), ...
        'Attributes', emptyAttributeStruct());
end

function attributeStruct = initAttributeStruct()
% initAttributeStruct - Scalar attribute info struct with default field values.

    attributeStruct = struct('Name', '', 'Datatype', [], 'Dataspace', [], 'Value', []);
end

function linkStruct = initLinkStruct()
% initLinkStruct - Scalar link info struct with default field values.

    linkStruct = struct('Name', '', 'Type', '', 'Value', []);
end

function groupStruct = emptyGroupStruct()
% emptyGroupStruct - 0x0 struct array with the group info fields.

    groupStruct = struct('Name', {}, 'Filename', {}, 'Groups', {}, ...
        'Datasets', {}, 'Links', {}, 'Attributes', {});
end

function datasetStruct = emptyDatasetStruct()
% emptyDatasetStruct - 0x0 struct array with the dataset info fields.

    datasetStruct = struct('Name', {}, 'Datatype', {}, 'Dataspace', {}, ...
        'ChunkSize', {}, 'FillValue', {}, 'Filters', {}, 'Attributes', {});
end

function attributeStruct = emptyAttributeStruct()
% emptyAttributeStruct - 0x0 struct array with the attribute info fields.

    attributeStruct = struct('Name', {}, 'Datatype', {}, 'Dataspace', {}, 'Value', {});
end

function linkStruct = emptyLinkStruct()
% emptyLinkStruct - 0x0 struct array with the link info fields.

    linkStruct = struct('Name', {}, 'Type', {}, 'Value', {});
end
function datasetValue = readDataset(datasetDirectory, datasetInfo, rawDatasetValue)
% readDataset - Read and normalize a Zarr dataset for matnwb.
%
%   Inputs:
%     datasetDirectory - folder of the Zarr array.
%     datasetInfo      - info struct of the dataset (Datatype / Attributes).
%     rawDatasetValue  - optional already-read value. When empty, the value
%                        is read from datasetDirectory instead.
%   Output:
%     datasetValue - the value after normalizeDatasetDimensions.
%
%   Arrays whose raw dtype is "|O" are read with
%   io.internal.zarr2.readObjectArray; everything else is read with
%   io.backend.zarr2.mw.readArray.

    arguments
        datasetDirectory (1,1) string
        datasetInfo (1,1) struct
        rawDatasetValue = []
    end

    if isempty(rawDatasetValue)
        rawDatasetInfo = io.backend.zarr2.mw.readInfo(datasetDirectory);
        semanticType = getDatasetSemanticType(datasetInfo);

        if isfield(rawDatasetInfo, "dtype") && isObjectRawDtype(rawDatasetInfo.dtype)
            datasetValue = io.internal.zarr2.readObjectArray(datasetDirectory);
            datasetValue = postprocessObjectDatasetValue(datasetValue, semanticType);
        else
            datasetValue = io.backend.zarr2.mw.readArray(datasetDirectory);
        end
    else
        datasetValue = rawDatasetValue;
    end

    datasetValue = normalizeDatasetDimensions(datasetValue);
end

function semanticType = getDatasetSemanticType(datasetInfo)
% getDatasetSemanticType - Determine the semantic type string of a dataset.
%
%   Uses datasetInfo.Datatype when it is text; otherwise the value of the
%   first attribute named "zarr_dtype". Returns "" when neither is present.

    semanticType = "";
    if isfield(datasetInfo, "Datatype") ...
            && (ischar(datasetInfo.Datatype) || isstring(datasetInfo.Datatype))
        semanticType = string(datasetInfo.Datatype);
        return
    end

    % An info struct without an Attributes field has nothing to look up.
    if ~isfield(datasetInfo, "Attributes") || isempty(datasetInfo.Attributes)
        return
    end

    attributes = datasetInfo.Attributes;
    attributeNames = {attributes.Name};
    zarrTypeMask = strcmp(attributeNames, "zarr_dtype");
    if any(zarrTypeMask)
        semanticType = string(attributes(find(zarrTypeMask, 1, "first")).Value);
    end
end

function tf = isObjectRawDtype(rawDtype)
% isObjectRawDtype - True when rawDtype is the text "|O".
%
%   Non-text input and non-scalar string arrays yield false.

    isText = ischar(rawDtype) || (isstring(rawDtype) && isscalar(rawDtype));
    tf = isText && strcmp(string(rawDtype), "|O");
end

function datasetValue = postprocessObjectDatasetValue(datasetValue, semanticType)
% postprocessObjectDatasetValue - Unwrap scalar cells and build ObjectViews.
%
%   A 1x1 cell is replaced by its content. When semanticType is "object",
%   a struct with a `path` field becomes a types.untyped.ObjectView, and a
%   cell array is mapped element-wise to ObjectViews of each item's `path`.

    if iscell(datasetValue) && isscalar(datasetValue)
        datasetValue = datasetValue{1};
    end

    if semanticType == "object"
        if isstruct(datasetValue) && isfield(datasetValue, "path")
            datasetValue = types.untyped.ObjectView(datasetValue.path);
        elseif iscell(datasetValue)
            datasetValue = cellfun(@(item) types.untyped.ObjectView(item.path), ...
                datasetValue, 'UniformOutput', false);
        end
    end
end

function datasetValue = normalizeDatasetDimensions(datasetValue)
% normalizeDatasetDimensions - Reverse the dimension order of array values.
%
%   Char arrays and scalar strings are returned unchanged, and a 1x1 cell is
%   replaced by its content. Every other value has its dimensions reversed:
%   a transpose for 2-D values (vectors included) and a full permute for
%   N-D values.

    if ischar(datasetValue) || (isstring(datasetValue) && isscalar(datasetValue))
        return
    end

    if iscell(datasetValue) && isscalar(datasetValue)
        datasetValue = datasetValue{1};
        return
    end

    % ndims is never below 2 in MATLAB, so only these two cases exist.
    if ismatrix(datasetValue)
        datasetValue = datasetValue.';
    else
        datasetValue = permute(datasetValue, ndims(datasetValue):-1:1);
    end
end

function result = readObjectArray(zarrPath)
% readObjectArray - Read NWB object-dtype Zarr arrays via python-zarr.
%
%   Opens the array read-only with the python `zarr` module, reads it in
%   full, and converts the resulting python list to MATLAB values.
%   Raises NWB:Zarr2:PythonZarrUnavailable when `zarr` cannot be imported;
%   the message includes the underlying import error.

    try
        zarrModule = py.importlib.import_module('zarr');
    catch importException
        error("NWB:Zarr2:PythonZarrUnavailable", ...
            "Python package `zarr` is required to read NWB object-dtype Zarr arrays. Cause: %s", ...
            importException.message)
    end

    zarrArray = zarrModule.open_array(char(zarrPath), pyargs('mode', 'r'));
    getItem = py.getattr(zarrArray, '__getitem__');
    rawData = getItem(py.slice(py.None));
    result = convertPythonValue(rawData.tolist());
end

function value = convertPythonValue(pyValue)
% convertPythonValue - Recursively convert a python value to a MATLAB value.
%
%   list/tuple -> cell array (converted element-wise); bytes/str -> char;
%   ZarrReference/dict -> jsondecode result; None -> []; bool -> logical;
%   anything else -> double, then char as a last resort.
%   Raises NWB:Zarr2:UnsupportedObjectValue when no conversion applies.

    if isa(pyValue, 'py.list') || isa(pyValue, 'py.tuple')
        pythonItems = cell(pyValue);
        value = cell(size(pythonItems));
        for iItem = 1:numel(pythonItems)
            value{iItem} = convertPythonValue(pythonItems{iItem});
        end
    elseif isa(pyValue, 'py.bytes')
        value = char(pyValue.decode('utf-8'));
    elseif isa(pyValue, 'py.str')
        value = char(pyValue);
    elseif isa(pyValue, 'py.hdmf_zarr.utils.ZarrReference')
        % Prefer a real JSON serialization: rewriting the python repr by
        % swapping quotes breaks on apostrophes and on None/True/False.
        % NOTE(review): relies on ZarrReference being JSON-serializable
        % (it is presumably a dict subclass); the legacy conversion is kept
        % as fallback in case it is not.
        try
            jsonModule = py.importlib.import_module('json');
            value = jsondecode(char(jsonModule.dumps(pyValue)));
        catch
            value = jsondecode(strrep(char(pyValue), '''', '"'));
        end
    elseif isa(pyValue, 'py.dict')
        jsonModule = py.importlib.import_module('json');
        value = jsondecode(char(jsonModule.dumps(pyValue)));
    elseif isa(pyValue, 'py.NoneType')
        value = [];
    elseif isa(pyValue, 'py.bool')
        value = logical(pyValue);
    else
        try
            value = double(pyValue);
        catch
            try
                value = char(pyValue);
            catch ME
                error("NWB:Zarr2:UnsupportedObjectValue", ...
                    "Unable to convert python value `%s`: %s", class(pyValue), ME.message)
            end
        end
    end
end
+ reader, location.Datasets(iFileLocation), fileLocation{iFileLocation}); end - H5D.close(did); end specs{iGroup}.namespaceName = namespaceName; @@ -58,3 +56,13 @@ specs{iGroup}.schemaMap = schemaMap; end end + +function datasetValue = readEmbeddedSpecDatasetValue(reader, datasetInfo, datasetPath) + datasetValue = reader.readDatasetValue(datasetInfo, datasetPath); + if isa(datasetValue, "types.untyped.DataStub") + datasetValue = datasetValue.load(); + end + if iscell(datasetValue) && isscalar(datasetValue) + datasetValue = datasetValue{1}; + end +end diff --git a/+io/parseDataset.m b/+io/parseDataset.m index e5ac3e1f0..ded80c85d 100644 --- a/+io/parseDataset.m +++ b/+io/parseDataset.m @@ -51,7 +51,16 @@ datasetTypeName = typeInfo.typename; isTypedDataset = ~isempty(datasetTypeName); - datasetValue = reader.readDatasetValue(datasetInfo, datasetPath); + try + datasetValue = reader.readDatasetValue(datasetInfo, datasetPath); + catch exception + newException = MException('NWB:parseDataset:ReadFailed', ... 
+ 'Failed to read dataset at location "%s" in file.', datasetPath); + newException = newException.addCause(exception); + %throw(newException) + warning(newException.identifier, "%s", newException.message) + datasetValue = missing; + end % Prepare output datasetName = datasetInfo.Name; diff --git a/+io/parseGroup.m b/+io/parseGroup.m index c4b855ca5..93b66df2f 100644 --- a/+io/parseGroup.m +++ b/+io/parseGroup.m @@ -30,7 +30,7 @@ groupProperties = containers.Map; for i=1:length(info.Groups) group = info.Groups(i); - if any(strcmp(group.Name, blacklist.groups)) + if any(strcmp(string(group.Name), string(blacklist.groups))) continue; end [~, gname] = io.pathParts(group.Name); @@ -86,7 +86,13 @@ return; end - parsed = io.createParsedType(info.Name, Type.typename, kwargs{:}); + try + parsed = io.createParsedType(info.Name, Type.typename, kwargs{:}); + catch + %parsed = feval(Type.typename); + %keyboard + parsed = []; + end end end diff --git a/+matnwb/+common/+compatibility/mustBeFile.m b/+matnwb/+common/+compatibility/mustBeFile.m index 3cdbdca8c..6f5069ae9 100644 --- a/+matnwb/+common/+compatibility/mustBeFile.m +++ b/+matnwb/+common/+compatibility/mustBeFile.m @@ -14,6 +14,11 @@ function mustBeFile(filePath) if startsWith(filePath, "s3://") return end + if isfolder(filePath) + if endsWith(filePath, 'nwb.zarr') + return + end + end if verLessThan('matlab', '9.9') %#ok % Custom implementation (MATLAB < R2020b) @@ -22,7 +27,7 @@ function mustBeFile(filePath) catch ME throwAsCaller(ME) end - isValid = isfile(filePath); + isValid = isfile(filePath) || isfolder(filePath); if ~isValid ME = MException(... 
@@ -32,10 +37,13 @@ function mustBeFile(filePath) end else % Use available builtin try - mustBeFile(filePath) + if endsWith(filePath, ".zarr", "IgnoreCase", true) + mustBeFolder(filePath) + else + mustBeFile(filePath) + end catch ME throwAsCaller(ME) end end end - diff --git a/+matnwb/+common/mustBeNwbFile.m b/+matnwb/+common/mustBeNwbFile.m index 1a01e8b0e..5db74ca80 100644 --- a/+matnwb/+common/mustBeNwbFile.m +++ b/+matnwb/+common/mustBeNwbFile.m @@ -4,6 +4,7 @@ function mustBeNwbFile(filePath) filePath (1,1) string {matnwb.common.compatibility.mustBeFile} end if ~startsWith(filePath, "s3://", "IgnoreCase", true) - assert(endsWith(filePath, ".nwb", "IgnoreCase", true)) + assert(endsWith(filePath, ".nwb", "IgnoreCase", true) || ... + endsWith(filePath, ".zarr", "IgnoreCase", true)) end end diff --git a/+tests/+unit/+io/+backend/BackendFactoryTest.m b/+tests/+unit/+io/+backend/BackendFactoryTest.m index 0799ae7e1..fbbc35bcf 100644 --- a/+tests/+unit/+io/+backend/BackendFactoryTest.m +++ b/+tests/+unit/+io/+backend/BackendFactoryTest.m @@ -12,7 +12,7 @@ function createHDF5ReaderForNwbFile(testCase) filename = "factory-test.nwb"; nwbExport(nwb, filename); - % Verify both "auto" and "h5" creates a valid reader + % Verify both "auto" and "h5" create a valid reader. reader = io.backend.BackendFactory.createReader(filename, ... StorageBackend="auto"); testCase.verifyClass(reader, "io.backend.hdf5.HDF5Reader"); @@ -32,26 +32,29 @@ function createHDF5LazyArrayForH5File(testCase) testCase.verifyClass(lazyArray, "io.backend.hdf5.HDF5LazyArray"); end - function unsupportedBackendThrowsError(testCase) + function invalidZarrBackendThrowsError(testCase) nwb = tests.factory.NWBFile(); filename = "factory-test.nwb"; nwbExport(nwb, filename); testCase.verifyError( ... @() io.backend.BackendFactory.createReader(filename, StorageBackend="zarr"), ... 
- "NWB:BackendFactory:UnsupportedBackend"); + "NWB:BackendFactory:InvalidZarr"); + end - zarrFilepath = 'test.zarr.nwb'; + function unsupportedFormatThrowsError(testCase) + zarrFilepath = "test.zarr.nwb"; mkdir(zarrFilepath) - + testCase.verifyError( ... @() io.backend.BackendFactory.createReader(zarrFilepath, StorageBackend="auto"), ... "NWB:BackendFactory:UnsupportedFormat"); end - function verifyInvalidHDF5FileThrowsError(testCase) - zarrFilepath = 'test.zarr.nwb'; + function invalidHDF5FileThrowsError(testCase) + zarrFilepath = "test.zarr.nwb"; mkdir(zarrFilepath) + testCase.verifyError( ... @() io.backend.BackendFactory.createReader(zarrFilepath, StorageBackend="hdf5"), ... "NWB:BackendFactory:InvalidHDF5"); diff --git a/+tests/+unit/+io/+backend/Zarr2LazyArrayTest.m b/+tests/+unit/+io/+backend/Zarr2LazyArrayTest.m new file mode 100644 index 000000000..45c28ff22 --- /dev/null +++ b/+tests/+unit/+io/+backend/Zarr2LazyArrayTest.m @@ -0,0 +1,63 @@ +classdef Zarr2LazyArrayTest < matlab.unittest.TestCase + + properties (Constant, Access = private) + fixturePath = "/Users/eivind/Code/MATLAB/Sandbox/CN/zarr_matlab/test_data/test_zarr_sub_anm00239123_ses_20170627T093549_ecephys_and_ogen.nwb.zarr" + wrapperPath = "/Users/eivind/Code/MATLAB/General/Repositories/mathworks/MATLAB-support-for-Zarr-files" + datasetPath = "/units/waveform_mean" + end + + methods (TestClassSetup) + function addZarrWrapperToPath(testCase) + testCase.assumeTrue(isfolder(testCase.wrapperPath), ... + "MathWorks Zarr wrapper checkout not found.") + testCase.assumeTrue(isfolder(testCase.fixturePath), ... 
+ "Primary Zarr fixture not found.") + + addpath(testCase.wrapperPath) + testCase.addTeardown(@() rmpath(testCase.wrapperPath)) + end + end + + methods (Test) + function loadDataAndMetadata(testCase) + lazyArray = io.backend.zarr2.Zarr2LazyArray(testCase.fixturePath, testCase.datasetPath); + datasetInfo = io.backend.zarr2.Zarr2Reader(testCase.fixturePath).readNodeInfo(testCase.datasetPath); + expectedData = io.internal.zarr2.readDataset( ... + fullfile(testCase.fixturePath, "units", "waveform_mean"), datasetInfo); + + testCase.verifyEqual(lazyArray.dims, [29 4]); + testCase.verifyEqual(lazyArray.maxDims, [29 4]); + testCase.verifyEqual(lazyArray.dataType, 'single'); + testCase.verifyEqual(lazyArray.load_h5_style(), expectedData); + end + + function loadPartialDataWithH5StyleSelection(testCase) + lazyArray = io.backend.zarr2.Zarr2LazyArray(testCase.fixturePath, testCase.datasetPath); + fullData = lazyArray.load_h5_style(); + partialData = lazyArray.load_h5_style([2 1], [3 2], [2 1]); + + testCase.verifyEqual(partialData, fullData(2:2:6, 1:2)); + end + + function dataStubSupportsSimpleIndexing(testCase) + lazyArray = io.backend.zarr2.Zarr2LazyArray(testCase.fixturePath, testCase.datasetPath); + datasetInfo = io.backend.zarr2.Zarr2Reader(testCase.fixturePath).readNodeInfo(testCase.datasetPath); + expectedData = io.internal.zarr2.readDataset( ... + fullfile(testCase.fixturePath, "units", "waveform_mean"), datasetInfo); + dataStub = types.untyped.DataStub( ... + testCase.fixturePath, testCase.datasetPath, [], [], lazyArray); + + testCase.verifyEqual(dataStub.load(), expectedData); + testCase.verifyEqual(dataStub(1:5, 2), expectedData(1:5, 2)); + end + + function loadMatStyleUsesPartialReadForRegularSelection(testCase) + lazyArray = io.backend.zarr2.Zarr2LazyArray(testCase.fixturePath, testCase.datasetPath); + fullData = lazyArray.load_h5_style(); + + testCase.verifyEqual( ... + lazyArray.load_mat_style(2:2:6, 1:2), ... 
classdef Zarr2ReaderTest < matlab.unittest.TestCase
% Zarr2ReaderTest - Tests for io.backend.zarr2.Zarr2Reader against a local fixture.
%
%   The tests need a Zarr fixture folder and a checkout of the MathWorks
%   Zarr wrapper. Both locations can be supplied through the environment
%   variables MATNWB_ZARR_FIXTURE_PATH and MATNWB_ZARR_WRAPPER_PATH; when a
%   variable is unset the built-in default path is used. If either folder
%   does not exist the whole class is filtered by an assumption failure.

    properties (Access = private)
        % Defaults; overridden from the environment in addZarrWrapperToPath.
        fixturePath (1,1) string = "/Users/eivind/Code/MATLAB/Sandbox/CN/zarr_matlab/test_data/test_zarr_sub_anm00239123_ses_20170627T093549_ecephys_and_ogen.nwb.zarr"
        wrapperPath (1,1) string = "/Users/eivind/Code/MATLAB/General/Repositories/mathworks/MATLAB-support-for-Zarr-files"
    end

    methods (TestClassSetup)
        function addZarrWrapperToPath(testCase)
            testCase.fixturePath = resolveFolderSetting( ...
                "MATNWB_ZARR_FIXTURE_PATH", testCase.fixturePath);
            testCase.wrapperPath = resolveFolderSetting( ...
                "MATNWB_ZARR_WRAPPER_PATH", testCase.wrapperPath);

            testCase.assumeTrue(isfolder(testCase.wrapperPath), ...
                "MathWorks Zarr wrapper checkout not found.")
            testCase.assumeTrue(isfolder(testCase.fixturePath), ...
                "Primary Zarr fixture not found.")

            % PathFixture restores the previous path on teardown, so a
            % wrapper that was already on the path is not removed.
            testCase.applyFixture( ...
                matlab.unittest.fixtures.PathFixture(testCase.wrapperPath));
        end
    end

    methods (Test)
        function readRootInfoAndSchemaVersion(testCase)
            reader = io.backend.zarr2.Zarr2Reader(testCase.fixturePath);
            rootInfo = reader.readRootInfo();

            testCase.verifyEqual(rootInfo.Name, '/');
            testCase.verifyEqual(reader.getSchemaVersion(), "2.7.0");
            testCase.verifyEqual(reader.getEmbeddedSpecLocation(), "/specifications");
            testCase.verifyTrue(any(strcmp({rootInfo.Groups.Name}, '/general')));
        end

        function readNodeInfoIncludesLinks(testCase)
            reader = io.backend.zarr2.Zarr2Reader(testCase.fixturePath);
            nodeInfo = reader.readNodeInfo("/general/extracellular_ephys/ADunit_32");

            testCase.verifyEqual(nodeInfo.Name, '/general/extracellular_ephys/ADunit_32');
            testCase.verifyEqual(numel(nodeInfo.Links), 1);
            testCase.verifyEqual(nodeInfo.Links(1).Name, 'device');
            testCase.verifyEqual(nodeInfo.Links(1).Type, 'soft link');
            testCase.verifyEqual(string(nodeInfo.Links(1).Value{1}), "/general/devices/ADunit");
        end

        function readAttributeValueConvertsObjectReference(testCase)
            reader = io.backend.zarr2.Zarr2Reader(testCase.fixturePath);
            nodeInfo = reader.readNodeInfo("/units/electrodes_index");
            attributeInfo = nodeInfo.Attributes(strcmp({nodeInfo.Attributes.Name}, 'target'));
            attributeValue = reader.readAttributeValue(attributeInfo, "/units/electrodes_index");

            testCase.verifyClass(attributeValue, "types.untyped.ObjectView");
            testCase.verifyEqual(string(attributeValue.path), "/units/electrodes");
        end

        function readDatasetValueReturnsScalarString(testCase)
            reader = io.backend.zarr2.Zarr2Reader(testCase.fixturePath);
            rootInfo = reader.readRootInfo();
            datasetInfo = rootInfo.Datasets(strcmp({rootInfo.Datasets.Name}, 'identifier'));
            datasetValue = reader.readDatasetValue(datasetInfo, "/identifier");

            testCase.verifyClass(datasetValue, "char");
            testCase.verifyFalse(isempty(datasetValue));
        end

        function readObjectDatasetValueReturnsObjectViews(testCase)
            reader = io.backend.zarr2.Zarr2Reader(testCase.fixturePath);
            datasetInfo = reader.readNodeInfo("/general/extracellular_ephys/electrodes/group");
            datasetValue = reader.readDatasetValue( ...
                datasetInfo, "/general/extracellular_ephys/electrodes/group");

            testCase.verifyClass(datasetValue, "cell");
            testCase.verifyClass(datasetValue{1}, "types.untyped.ObjectView");
        end

        function readNonScalarDatasetValueReturnsDataStub(testCase)
            reader = io.backend.zarr2.Zarr2Reader(testCase.fixturePath);
            datasetInfo = reader.readNodeInfo("/units/waveform_mean");
            datasetValue = reader.readDatasetValue(datasetInfo, "/units/waveform_mean");

            testCase.verifyClass(datasetValue, "types.untyped.DataStub");
            testCase.verifyEqual(datasetValue.dims, [29 4]);
        end

        function readEmbeddedSpecificationsFromZarr(testCase)
            reader = io.backend.zarr2.Zarr2Reader(testCase.fixturePath);
            specs = io.spec.readEmbeddedSpecifications( ...
                testCase.fixturePath, "/specifications", reader);

            namespaceNames = cellfun(@(s) s.namespaceName, specs, 'UniformOutput', false);
            testCase.verifyGreaterThan(numel(specs), 0);
            testCase.verifyTrue(any(strcmp(namespaceNames, 'core')));
        end
    end
end

function resolved = resolveFolderSetting(variableName, defaultValue)
% Return the value of the environment variable if it is set, else the default.
    override = getenv(variableName);
    if isempty(override)
        resolved = defaultValue;
    else
        resolved = string(override);
    end
end