"""
Defines a class storing data on the file system using the hierarchical data format (hdf)
.. codeauthor:: David Zwicker <david.zwicker@ds.mpg.de>
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Literal
import numpy as np
from numpy.typing import DTypeLike
from ..fields.base import FieldBase
from ..tools import mpi
from ..tools.misc import ensure_directory_exists, hdf_write_attributes
from .base import InfoDict, StorageBase, WriteModeType
class FileStorage(StorageBase):
"""store discretized fields in a hdf5 file"""
def __init__(
self,
filename: str | Path,
*,
info: InfoDict | None = None,
write_mode: WriteModeType = "truncate_once",
max_length: int | None = None,
compression: bool = True,
keep_opened: bool = True,
check_mpi: bool = True,
):
"""
Args:
            filename (str):
                The path to the HDF5 file where the data is stored
info (dict):
Supplies extra information that is stored in the storage
write_mode (str):
Determines how new data is added to already existing data. Possible
values are: 'append' (data is always appended), 'truncate' (data is
cleared every time this storage is used for writing), or 'truncate_once'
(data is cleared for the first writing, but appended subsequently).
Alternatively, specifying 'readonly' will disable writing completely.
            max_length (int, optional):
                Maximal number of entries that will be stored in the file. This can be
                used to preallocate data, which can lead to smaller files, but is also
                less flexible. Giving `max_length = None` allows for arbitrarily large
                data, which might lead to larger files.
            compression (bool):
                Whether to store the data in compressed form. Compression
                automatically enables chunked storage.
            keep_opened (bool):
                Flag indicating whether the file should be kept open after each
                write. If `False`, the file will be closed after writing a dataset.
                This keeps the file in a consistent state, but also requires more
                work before data can be written again.
            check_mpi (bool):
                If True, files will only be opened on the main node of a parallel
                simulation using MPI. This flag has no effect in serial code.
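
        Example:
            A minimal usage sketch; the file name is hypothetical, `ScalarField`,
            `UnitGrid`, and `DiffusionPDE` from the surrounding `pde` package are
            used only for illustration, and `tracker` is provided by the
            `StorageBase` parent class::

                from pde import DiffusionPDE, ScalarField, UnitGrid

                state = ScalarField.random_uniform(UnitGrid([32, 32]))
                storage = FileStorage("diffusion.h5", write_mode="truncate_once")
                eq = DiffusionPDE()
                eq.solve(state, t_range=10, tracker=storage.tracker(1))
                print(storage.times[:])  # times at which data was stored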
"""
super().__init__(info=info, write_mode=write_mode)
self.filename = Path(filename)
self.compression = compression
self.keep_opened = keep_opened
self.check_mpi = check_mpi
self._file: Any = None
self._is_writing = False
self._data_length: int = None # type: ignore
self._max_length: int | None = max_length
if not self.check_mpi or mpi.is_main:
# we are on the main process and can thus open the file directly
if self.filename.is_file() and self.filename.stat().st_size > 0:
try:
self._open("reading")
except (OSError, KeyError):
self.close()
self._logger.warning(
f"File `{filename}` could not be opened for reading"
)
def __del__(self):
self.close() # ensure open files are closed when the FileStorage is deleted
@property
def _file_state(self) -> str:
"""str: the state that the file is currently in"""
if self._file is None:
return "closed"
elif self._file.mode == "r":
return "reading"
elif self._file.mode == "r+":
return "writing"
else:
            raise NotImplementedError(f"Do not understand mode `{self._file.mode}`")
def close(self) -> None:
"""close the currently opened file"""
if self._file is not None:
self._logger.info(f"Close file `{self.filename}`")
self._file.close()
self._file = None
self._data_length = None # type: ignore
def __enter__(self) -> FileStorage:
return self
def __exit__(self, exc_type, exc_value, exc_traceback):
self.close()
def _create_hdf_dataset(
self,
name: str,
shape: tuple[int, ...] = tuple(),
dtype: DTypeLike = np.double,
):
"""create a hdf5 dataset with the given name and data_shape
Args:
name (str): Identifier of the hdf5 dataset
shape (tuple): Data shape of the dataset
dtype: The data type of the dataset
"""
if self.compression:
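            # note: gzip compression in HDF5 requires chunked storage; chunks are
            # chosen such that each time point is stored in its own chunk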
kwargs = {"chunks": (1,) + shape, "compression": "gzip"}
else:
kwargs = {}
if self._max_length:
shape = (self._max_length,) + shape
return self._file.create_dataset(name, shape=shape, dtype=dtype, **kwargs)
else:
return self._file.create_dataset(
name,
shape=(0,) + shape,
dtype=dtype,
maxshape=(None,) + shape,
**kwargs,
)
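
    # Illustrative dataset layouts produced by `_create_hdf_dataset` (the field
    # shape (32, 32) is hypothetical):
    #   * max_length=100:  fixed shape (100, 32, 32), preallocated
    #   * max_length=None: shape (0, 32, 32) with maxshape=(None, 32, 32),
    #     grown along the first (time) axis as data is appended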
def _open(
self,
mode: Literal["reading", "appending", "writing", "closed"] = "reading",
info: InfoDict | None = None,
) -> None:
"""open the hdf file in a particular mode
Args:
mode (str):
Determines how the file is opened. Possible values are `reading`,
`appending`, `writing`, and `closed`.
info (dict):
Supplies extra information that is stored in the storage
"""
        import h5py  # lazy loading so h5py is not a hard dependency
if self.check_mpi and not mpi.is_main:
self._logger.warning("Do not open file on MPI child nodes")
return
state = self._file_state
if mode == "reading":
# open file for reading
if state in ["reading", "appending", "writing"]:
return # we can read data when file is open for writing
            # close the file so it can be re-opened for reading
if self._file:
self._file.close()
self._logger.info(f"Open file `{self.filename}` for reading")
self._file = h5py.File(self.filename, mode="r")
self._times = self._file["times"]
self._data = self._file["data"]
for k, v in self._file.attrs.items():
self.info[k] = json.loads(v)
if info:
self.info.update(info)
self._data_shape = self.data.shape[1:]
self._dtype = self.data.dtype
self._data_length = self.info.get("data_length") # type: ignore
elif mode == "appending":
# open file for writing without deleting data
if state in ["appending", "writing"]:
return # we are already in a mode where we can append data
if self.keep_opened and self._is_writing:
raise RuntimeError(
"Currently writing data, so mode cannot be switched."
)
self.close()
            # open the file for appending (creating it if necessary)
self._logger.info(f"Open file `{self.filename}` for appending")
self._file = h5py.File(self.filename, mode="a")
if "times" in self._file and "data" in self._file:
# extract data from datasets in the existing file
self._times = self._file["times"]
self._data = self._file["data"]
# extract information
for k, v in self._file.attrs.items():
self.info[k] = json.loads(v)
self._data_shape = self.data.shape[1:]
self._dtype = self.data.dtype
self._data_length = self.info.get("data_length", self.data.shape[0])
else:
# create new datasets
self._times = self._create_hdf_dataset("times")
self._data = self._create_hdf_dataset(
"data", self.data_shape, dtype=self._dtype
)
self._data_length = 0
if info:
self.info.update(info)
elif mode == "writing":
# open file for writing data; delete potentially present data
if self._is_writing:
raise RuntimeError("Currently writing data, so mode cannot be switched")
if self._file:
self.close()
else:
ensure_directory_exists(self.filename.parent)
self._logger.info(f"Open file `{self.filename}` for writing")
self._file = h5py.File(self.filename, "w")
self._times = self._create_hdf_dataset("times")
self._data = self._create_hdf_dataset(
"data", self.data_shape, dtype=self._dtype
)
if info:
self.info.update(info)
self._data_length = 0 # start writing from the beginning
elif mode == "closed":
self.close()
else:
raise RuntimeError(f"Mode `{mode}` not implemented")
    def __len__(self) -> int:
"""return the number of stored items, i.e., time steps"""
# determine size of data in HDF5 file
try:
length = len(self.times)
except OSError:
length = 0
if self._data_length is None:
return length
else:
            # the stored data might be shorter than the dataset since space may
            # have been preallocated
return min(length, self._data_length)
@property
def times(self):
""":class:`~numpy.ndarray`: The times at which data is available"""
self._open("reading")
return self._times
@property
def data(self):
""":class:`~numpy.ndarray`: The actual data for all time"""
self._open("reading")
return self._data
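
    # Reading a stored simulation afterwards only requires the `times` and
    # `data` properties, e.g. (a sketch with a hypothetical file name):
    #
    #     with FileStorage("diffusion.h5", write_mode="readonly") as storage:
    #         final_time, final_data = storage.times[-1], storage.data[-1]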
def clear(self, clear_data_shape: bool = False):
"""truncate the storage by removing all stored data.
Args:
clear_data_shape (bool):
Flag determining whether the data shape is also deleted.
"""
if self.check_mpi and not mpi.is_main:
self._logger.warning("Do not clear file on MPI child nodes")
return
if self._is_writing:
# remove data from opened file
self._logger.info("Truncate data in hdf5 file")
if "times" in self._file:
del self._file["times"]
self._times = self._create_hdf_dataset("times")
if "data" in self._file:
del self._file["data"]
self._data = self._create_hdf_dataset(
"data", self.data_shape, dtype=self.dtype
)
            self._data_length = 0  # start writing from the beginning
elif self.filename.is_file():
            # we are not currently writing to the file => clear it by removing the file completely
self._logger.info("Truncate data by removing hdf5 file")
self._open("closed") # close file if it was opened
self.filename.unlink() # remove file
else:
self._logger.debug("Truncate is no-op since file does not exist")
super().clear(clear_data_shape=clear_data_shape)
def start_writing(self, field: FieldBase, info: InfoDict | None = None) -> None:
"""initialize the storage for writing data
Args:
field (:class:`~pde.fields.FieldBase`):
An example of the data that will be written to extract the grid and the
data_shape
info (dict):
Supplies extra information that is stored in the storage
"""
if self._is_writing:
raise RuntimeError(f"{self.__class__.__name__} is already in writing mode")
# delete data if truncation is requested. This is for instance necessary
# to remove older data with incompatible data_shape
        if self.write_mode in {"truncate", "truncate_once"}:
self.clear(clear_data_shape=True)
# initialize the writing, setting current data shape
super().start_writing(field, info=info)
# initialize the file for writing with the correct mode
self._logger.debug(f"Start writing with mode `{self.write_mode}`")
if self.write_mode == "truncate_once":
self._open("writing", info)
self.write_mode = "append" # do not truncate for next writing
elif self.write_mode == "truncate":
self._open("writing", info)
elif self.write_mode == "append":
self._open("appending", info)
elif self.write_mode == "readonly":
raise RuntimeError("Cannot write in read-only mode")
else:
            raise ValueError(
                f"Unknown write mode `{self.write_mode}`. Possible values are "
                "`truncate_once`, `truncate`, `append`, and `readonly`"
            )
if not self.keep_opened:
# store extra information as attributes
hdf_write_attributes(self._file, self.info)
self._is_writing = True
    def _append_data(self, data: np.ndarray, time: float | None) -> None:
        """append a new data set

        Args:
            data (:class:`~numpy.ndarray`): The actual data
            time (float, optional): The time point associated with the data; if
                `None`, a time is derived from the previously stored time point
"""
if self.keep_opened:
if not self._is_writing or self._data_length is None:
raise RuntimeError(
"Writing not initialized. Call "
f"`{self.__class__.__name__}.start_writing`"
)
else:
# need to reopen the file
self._open("appending")
# write the new data
if self._data_length >= len(self._data):
self._data.resize((self._data_length + 1,) + self.data_shape)
self._data[self._data_length] = data
# write the new time
if time is None:
if len(self._times) == 0:
time = 0
else:
time = self._times[self._data_length - 1] + 1
if self._data_length >= len(self._times):
self._times.resize((self._data_length + 1,))
self._times[self._data_length] = time
self._data_length += 1
self.info["data_length"] = self._data_length
if not self.keep_opened:
self.close()
def end_writing(self) -> None:
"""finalize the storage after writing.
This makes sure the data is actually written to a file when
self.keep_opened == False
"""
if not self._is_writing:
return # writing mode was already ended
self._logger.debug("End writing")
        self._open("appending")  # make sure the file is open, e.g., if `keep_opened == False`
        # store extra information as attributes
        hdf_write_attributes(self._file, self.info)
self._file.flush()
self.close()
self._is_writing = False
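

# Sketch of a manual write cycle (normally a tracker drives these calls;
# `append` is inherited from `StorageBase`, and `ScalarField`/`UnitGrid` from
# the surrounding `pde` package are used only for illustration):
#
#     from pde import ScalarField, UnitGrid
#
#     field = ScalarField(UnitGrid([32]))
#     with FileStorage("manual.h5", write_mode="truncate") as storage:
#         storage.start_writing(field)
#         for t in range(5):
#             storage.append(field, time=float(t))
#         storage.end_writing()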