From fffa1fb3ea64c95aca5ee501a5a6609c755665f9 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Fri, 13 Dec 2019 15:19:10 +0100 Subject: [PATCH 1/7] ENH: enable `H5NetCDFStore` to work with already open h5netcdf.File and h5netcdf.Group objects, add test --- xarray/backends/api.py | 4 +-- xarray/backends/h5netcdf_.py | 54 ++++++++++++++++++++++++++--------- xarray/tests/test_backends.py | 22 +++++++++++++- 3 files changed, 64 insertions(+), 16 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index eea1fb9ddce..1e593e98338 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -503,7 +503,7 @@ def maybe_decode_store(store, lock=False): elif engine == "pydap": store = backends.PydapDataStore.open(filename_or_obj, **backend_kwargs) elif engine == "h5netcdf": - store = backends.H5NetCDFStore( + store = backends.H5NetCDFStore.open( filename_or_obj, group=group, lock=lock, **backend_kwargs ) elif engine == "pynio": @@ -981,7 +981,7 @@ def open_mfdataset( WRITEABLE_STORES: Dict[str, Callable] = { "netcdf4": backends.NetCDF4DataStore.open, "scipy": backends.ScipyDataStore, - "h5netcdf": backends.H5NetCDFStore, + "h5netcdf": backends.H5NetCDFStore.open, } diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 51ed512f98b..c928d7b6746 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -4,9 +4,9 @@ from .. import Variable from ..core import indexing -from ..core.utils import FrozenDict +from ..core.utils import FrozenDict, is_remote_uri from .common import WritableCFDataStore -from .file_manager import CachingFileManager +from .file_manager import CachingFileManager, DummyFileManager from .locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock from .netCDF4_ import ( BaseNetCDF4Array, @@ -69,8 +69,42 @@ class H5NetCDFStore(WritableCFDataStore): """Store for reading and writing data via h5netcdf """ + __slots__ = ( + "autoclose", + "format", + "is_remote", + "lock", + "_filename", + "_group", + "_manager", + "_mode", + ) + def __init__( - self, + self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=False + ): + + import h5netcdf + + if isinstance(manager, (h5netcdf.File, h5netcdf.Group)): + if group is None: + root, group = manager._root, manager.name + else: + root = manager + manager = DummyFileManager(root) + + self._manager = manager + self._group = group + self._mode = mode + self.format = None + self._filename = self.ds._root.filename + self.is_remote = is_remote_uri(self._filename) + self.lock = ensure_lock(lock) + self.autoclose = autoclose + + @classmethod + def open( + cls, filename, mode="r", format=None, @@ -86,22 +120,16 @@ def __init__( kwargs = {"invalid_netcdf": invalid_netcdf} - self._manager = CachingFileManager( - h5netcdf.File, filename, mode=mode, kwargs=kwargs - ) - if lock is None: if mode == "r": lock = HDF5_LOCK else: lock = combine_locks([HDF5_LOCK, get_write_lock(filename)]) - self._group = group - self.format = format - self._filename = filename - self._mode = mode - self.lock = ensure_lock(lock) - self.autoclose = autoclose + manager = CachingFileManager( + h5netcdf.File, filename, mode=mode, kwargs=kwargs + ) + return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): with self._manager.acquire_context(needs_lock) as root: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4fccdf2dd6c..d9ef90adaae 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2182,7 +2182,7 @@ class TestH5NetCDFData(NetCDF4Base): @contextlib.contextmanager def create_store(self): with create_tmp_file() as tmp_file: - yield backends.H5NetCDFStore(tmp_file, "w") + yield backends.H5NetCDFStore.open(tmp_file, "w") @pytest.mark.filterwarnings("ignore:complex dtypes are supported by h5py") @pytest.mark.parametrize( @@ -2345,6 +2345,26 @@ def test_dump_encodings_h5py(self): assert actual.x.encoding["compression"] == "lzf" assert actual.x.encoding["compression_opts"] is None + def test_already_open_dataset_group(self): + import h5netcdf + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, mode="w") as nc: + group = nc.createGroup("g") + v = group.createVariable("x", "int") + v[...] = 42 + + h5 = h5netcdf.File(tmp_file, mode="r") + store = backends.H5NetCDFStore(h5["g"]) + with open_dataset(store) as ds: + expected = Dataset({"x": ((), 42)}) + assert_identical(expected, ds) + + h5 = h5netcdf.File(tmp_file, mode="r") + store = backends.H5NetCDFStore(h5, group="g") + with open_dataset(store) as ds: + expected = Dataset({"x": ((), 42)}) + assert_identical(expected, ds) + @requires_h5netcdf class TestH5NetCDFFileObject(TestH5NetCDFData): From 4a0c3625b93d1a50a9872197192702bae1ea03b5 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Fri, 13 Dec 2019 15:44:40 +0100 Subject: [PATCH 2/7] FIX: add `.open` method for file-like objects --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1e593e98338..0990fc46219 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -527,7 +527,7 @@ def maybe_decode_store(store, lock=False): if engine == "scipy": store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs) elif engine == "h5netcdf": - store = backends.H5NetCDFStore( + store = backends.H5NetCDFStore.open( filename_or_obj, group=group, lock=lock, **backend_kwargs ) From cfd6ce8311ee7e3e9a1ff9f5f0d0b48ff3bd4c1a Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Sat, 14 Dec 2019 11:24:10 +0100 Subject: [PATCH 3/7] FIX: reformat using black --- xarray/backends/h5netcdf_.py | 8 ++------ xarray/tests/test_backends.py | 1 + 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index c928d7b6746..f85c0761336 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -80,9 +80,7 @@ class H5NetCDFStore(WritableCFDataStore): "_mode", ) - def __init__( - self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=False - ): + def __init__(self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=False): import h5netcdf @@ -126,9 +124,7 @@ def open( else: lock = combine_locks([HDF5_LOCK, get_write_lock(filename)]) - manager = CachingFileManager( - h5netcdf.File, filename, mode=mode, kwargs=kwargs - ) + manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs) return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d9ef90adaae..0436ae9d244 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2347,6 +2347,7 @@ def test_dump_encodings_h5py(self): def test_already_open_dataset_group(self): import h5netcdf + with create_tmp_file() as tmp_file: with nc4.Dataset(tmp_file, mode="w") as nc: group = nc.createGroup("g") From cbe1b08cb44acbe992ab8b387788eae2e6fd7cdb Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Mon, 13 Jan 2020 11:04:17 +0100 Subject: [PATCH 4/7] WIP: add item to whats-new.rst --- doc/whats-new.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6eeb55c23cb..9759a152a4c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,11 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Support using an existing, opened h5netcdf ``File`` with + :py:class:`~xarray.backends.H5NetCDFStore`. This permits creating an + :py:class:`~xarray.Dataset` from a h5netcdf ``File`` that has been opened + using other means (:issue:`3618`). + By `Kai Mühlbauer `_. - Implement :py:func:`median` and :py:func:`nanmedian` for dask arrays. This works by rechunking to a single chunk along all reduction axes. (:issue:`2999`). By `Deepak Cherian `_. From 8372ac981ffc8b3849a374a346bf72ebec72043f Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Mon, 13 Jan 2020 14:20:14 +0100 Subject: [PATCH 5/7] FIX: temporary fix to tackle issue #3680 --- xarray/backends/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 9786e0a0203..e15397f6eef 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -34,7 +34,7 @@ def find_root_and_group(ds): """Find the root and group name of a netCDF4/h5netcdf dataset.""" hierarchy = () while ds.parent is not None: - hierarchy = (ds.name,) + hierarchy + hierarchy = (ds.name.split('/')[-1],) + hierarchy ds = ds.parent group = "/" + "/".join(hierarchy) return ds, group From 89b4c06390724a88f5100e7153fa1d6d6012e1e3 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Mon, 13 Jan 2020 14:33:51 +0100 Subject: [PATCH 6/7] FIX: do not use private API, use find_root_and_group instead --- xarray/backends/h5netcdf_.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index f85c0761336..203213afe82 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -5,7 +5,7 @@ from .. import Variable from ..core import indexing from ..core.utils import FrozenDict, is_remote_uri -from .common import WritableCFDataStore +from .common import WritableCFDataStore, find_root_and_group from .file_manager import CachingFileManager, DummyFileManager from .locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock from .netCDF4_ import ( @@ -86,8 +86,13 @@ def __init__(self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=Fal if isinstance(manager, (h5netcdf.File, h5netcdf.Group)): if group is None: - root, group = manager._root, manager.name + root, group = find_root_and_group(manager) else: + if not type(manager) is h5netcdf.File: + raise ValueError( + "must supply a h5netcdf.File if the group " + "argument is provided" + ) root = manager manager = DummyFileManager(root) @@ -95,7 +100,9 @@ def __init__(self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=Fal self._group = group self._mode = mode self.format = None - self._filename = self.ds._root.filename + # todo: utilizing find_root_and_group seems a bit clunky + # making filename available on h5netcdf.Group seems better + self._filename = find_root_and_group(self.ds)[0].filename self.is_remote = is_remote_uri(self._filename) self.lock = ensure_lock(lock) self.autoclose = autoclose From 0ec6d581dc4eb708908427749494c6b12a1eb2d3 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Mon, 13 Jan 2020 14:41:15 +0100 Subject: [PATCH 7/7] FIX: reformat using black --- xarray/backends/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index e15397f6eef..fa3ee19f542 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -34,7 +34,7 @@ def find_root_and_group(ds): """Find the root and group name of a netCDF4/h5netcdf dataset.""" hierarchy = () while ds.parent is not None: - hierarchy = (ds.name.split('/')[-1],) + hierarchy + hierarchy = (ds.name.split("/")[-1],) + hierarchy ds = ds.parent group = "/" + "/".join(hierarchy) return ds, group