././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6317217 EXtra-data-1.7.0/0000775000175000017500000000000000000000000014312 5ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1591102641.0 EXtra-data-1.7.0/.coveragerc0000644000175000017500000000006700000000000016434 0ustar00takluyvertakluyver[run] omit = */tests/* concurrency = "multiprocessing" ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6067216 EXtra-data-1.7.0/EXtra_data.egg-info/0000775000175000017500000000000000000000000020020 5ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627995375.0 EXtra-data-1.7.0/EXtra_data.egg-info/PKG-INFO0000664000175000017500000000410000000000000021110 0ustar00takluyvertakluyverMetadata-Version: 2.1 Name: EXtra-data Version: 1.7.0 Summary: Tools to read and analyse data from European XFEL Home-page: https://github.com/European-XFEL/EXtra-data Author: European XFEL GmbH Author-email: da-support@xfel.eu Maintainer: Thomas Michelat License: BSD-3-Clause Description: [![Build Status](https://github.com/European-XFEL/EXtra-data/workflows/Tests/badge.svg)](https://github.com/European-XFEL/EXtra-data/actions?query=workflow%3ATests) [![codecov](https://codecov.io/gh/European-XFEL/EXtra-data/branch/master/graph/badge.svg)](https://codecov.io/gh/European-XFEL/EXtra-data) Python 3 tools for reading European XFEL's HDF5 files. [Documentation](https://extra-data.readthedocs.io/en/latest/) Installing ========== *EXtra-data* is available on our Anaconda installation on the Maxwell cluster: module load exfel exfel_anaconda3 You can also install it [from PyPI](https://pypi.org/project/extra-data/) to use in other environments with Python 3.6 or later: pip install extra_data If you get a permissions error, add the `--user` flag to that command. Contributing =========== Tests ----- Tests can be run as follows: python3 -m pytest -v --pyargs extra_data In the source directory, you can also omit `--pyargs extra_data`. 
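For orientation, a minimal reading sketch (the run path and the source/key names below are hypothetical placeholders, not taken from this package's documentation):

    from extra_data import RunDirectory

    # Hypothetical run directory on the Maxwell cluster
    run = RunDirectory('/gpfs/exfel/exp/XMPL/201750/p700000/raw/r0001')
    print(run.all_sources)                 # list the available data sources
    sel = run.select('*/XGM/*', '*')       # narrow down to matching sources/keys
    arr = run.get_array('SA1_XTD2_XGM/XGM/DOOCS:output', 'data.intensityTD')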
Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: BSD License Classifier: Operating System :: POSIX :: Linux Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Scientific/Engineering :: Physics Requires-Python: >=3.6 Description-Content-Type: text/markdown Provides-Extra: docs Provides-Extra: test ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627995375.0 EXtra-data-1.7.0/EXtra_data.egg-info/SOURCES.txt0000664000175000017500000000402600000000000021706 0ustar00takluyvertakluyver.coveragerc LICENSE MANIFEST.in README.md pytest.ini setup.py EXtra_data.egg-info/PKG-INFO EXtra_data.egg-info/SOURCES.txt EXtra_data.egg-info/dependency_links.txt EXtra_data.egg-info/entry_points.txt EXtra_data.egg-info/requires.txt EXtra_data.egg-info/top_level.txt extra_data/__init__.py extra_data/components.py extra_data/exceptions.py extra_data/export.py extra_data/file_access.py extra_data/h5index.py extra_data/keydata.py extra_data/locality.py extra_data/lsxfel.py extra_data/read_machinery.py extra_data/reader.py extra_data/run_files_map.py extra_data/stacking.py extra_data/utils.py extra_data/validation.py extra_data/write_cxi.py extra_data/writer.py extra_data/cli/__init__.py extra_data/cli/make_virtual_cxi.py extra_data/tests/__init__.py extra_data/tests/conftest.py extra_data/tests/make_examples.py extra_data/tests/test_bad_trains.py extra_data/tests/test_components.py extra_data/tests/test_keydata.py extra_data/tests/test_lsxfel.py extra_data/tests/test_open_file_limiter.py extra_data/tests/test_read_machinery.py extra_data/tests/test_reader_mockdata.py extra_data/tests/test_run_files_map.py extra_data/tests/test_slice_objs.py extra_data/tests/test_stacking.py extra_data/tests/test_streamer.py extra_data/tests/test_utils.py extra_data/tests/test_validation.py extra_data/tests/test_writer.py extra_data/tests/cli/__init__.py extra_data/tests/cli/test_make_virtual_cxi.py extra_data/tests/mockdata/__init__.py extra_data/tests/mockdata/adc.py extra_data/tests/mockdata/base.py extra_data/tests/mockdata/basler_camera.py extra_data/tests/mockdata/control_common.py extra_data/tests/mockdata/dctrl.py extra_data/tests/mockdata/detectors.py extra_data/tests/mockdata/gauge.py extra_data/tests/mockdata/gec_camera.py extra_data/tests/mockdata/imgfel.py extra_data/tests/mockdata/jungfrau.py extra_data/tests/mockdata/mkfile.py extra_data/tests/mockdata/motor.py extra_data/tests/mockdata/mpod.py extra_data/tests/mockdata/sidemic_camera.py extra_data/tests/mockdata/tsens.py extra_data/tests/mockdata/uvlamp.py extra_data/tests/mockdata/xgm.py././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627995375.0 EXtra-data-1.7.0/EXtra_data.egg-info/dependency_links.txt0000664000175000017500000000000100000000000024066 0ustar00takluyvertakluyver ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627995375.0 EXtra-data-1.7.0/EXtra_data.egg-info/entry_points.txt0000664000175000017500000000041100000000000023312 0ustar00takluyvertakluyver[console_scripts] extra-data-locality = extra_data.locality:main extra-data-make-virtual-cxi = extra_data.cli.make_virtual_cxi:main extra-data-validate = extra_data.validation:main karabo-bridge-serve-files = extra_data.export:main lsxfel 
= extra_data.lsxfel:main ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627995375.0 EXtra-data-1.7.0/EXtra_data.egg-info/requires.txt0000664000175000017500000000030100000000000022412 0ustar00takluyvertakluyverfabio h5py>=2.10 karabo-bridge>=0.6 matplotlib numpy pandas psutil xarray [docs] sphinx nbsphinx ipython sphinxcontrib_github_alt [test] coverage dask[array] nbval pytest pytest-cov testpath ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627995375.0 EXtra-data-1.7.0/EXtra_data.egg-info/top_level.txt0000664000175000017500000000001300000000000022544 0ustar00takluyvertakluyverextra_data ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1584883216.0 EXtra-data-1.7.0/LICENSE0000644000175000017500000000301400000000000015313 0ustar00takluyvertakluyverBSD 3-Clause License Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1584883216.0 EXtra-data-1.7.0/MANIFEST.in0000644000175000017500000000011100000000000016037 0ustar00takluyvertakluyverinclude LICENSE include README.md include .coveragerc include pytest.ini ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6307218 EXtra-data-1.7.0/PKG-INFO0000664000175000017500000000410000000000000015402 0ustar00takluyvertakluyverMetadata-Version: 2.1 Name: EXtra-data Version: 1.7.0 Summary: Tools to read and analyse data from European XFEL Home-page: https://github.com/European-XFEL/EXtra-data Author: European XFEL GmbH Author-email: da-support@xfel.eu Maintainer: Thomas Michelat License: BSD-3-Clause Description: [![Build Status](https://github.com/European-XFEL/EXtra-data/workflows/Tests/badge.svg)](https://github.com/European-XFEL/EXtra-data/actions?query=workflow%3ATests) [![codecov](https://codecov.io/gh/European-XFEL/EXtra-data/branch/master/graph/badge.svg)](https://codecov.io/gh/European-XFEL/EXtra-data) Python 3 tools for reading European XFEL's HDF5 files. 
[Documentation](https://extra-data.readthedocs.io/en/latest/) Installing ========== *EXtra-data* is available on our Anaconda installation on the Maxwell cluster: module load exfel exfel_anaconda3 You can also install it [from PyPI](https://pypi.org/project/extra-data/) to use in other environments with Python 3.6 or later: pip install extra_data If you get a permissions error, add the `--user` flag to that command. Contributing =========== Tests ----- Tests can be run as follows: python3 -m pytest -v --pyargs extra_data In the source directory, you can also omit `--pyargs extra_data`. Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: BSD License Classifier: Operating System :: POSIX :: Linux Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Scientific/Engineering :: Physics Requires-Python: >=3.6 Description-Content-Type: text/markdown Provides-Extra: docs Provides-Extra: test ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/README.md0000664000175000017500000000172200000000000015573 0ustar00takluyvertakluyver[![Build Status](https://github.com/European-XFEL/EXtra-data/workflows/Tests/badge.svg)](https://github.com/European-XFEL/EXtra-data/actions?query=workflow%3ATests) [![codecov](https://codecov.io/gh/European-XFEL/EXtra-data/branch/master/graph/badge.svg)](https://codecov.io/gh/European-XFEL/EXtra-data) Python 3 tools for reading European XFEL's HDF5 files. [Documentation](https://extra-data.readthedocs.io/en/latest/) Installing ========== *EXtra-data* is available on our Anaconda installation on the Maxwell cluster: module load exfel exfel_anaconda3 You can also install it [from PyPI](https://pypi.org/project/extra-data/) to use in other environments with Python 3.6 or later: pip install extra_data If you get a permissions error, add the `--user` flag to that command. Contributing =========== Tests ----- Tests can be run as follows: python3 -m pytest -v --pyargs extra_data In the source directory, you can also omit `--pyargs extra_data`. ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6147218 EXtra-data-1.7.0/extra_data/0000775000175000017500000000000000000000000016426 5ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627995347.0 EXtra-data-1.7.0/extra_data/__init__.py0000664000175000017500000000371600000000000020546 0ustar00takluyvertakluyver# coding: utf-8 """The extra_data package. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ __version__ = "1.7.0" from .exceptions import ( SourceNameError, PropertyNameError, TrainIDError, MultiRunError ) from .keydata import KeyData from .reader import * from .stacking import * from .utils import * __all__ = reader.__all__ + utils.__all__ + stacking.__all__ ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6157217 EXtra-data-1.7.0/extra_data/cli/0000775000175000017500000000000000000000000017175 5ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/cli/__init__.py0000644000175000017500000000005100000000000021300 0ustar00takluyvertakluyver"""extra_data command-line interfaces""" ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/cli/make_virtual_cxi.py0000664000175000017500000001032000000000000023071 0ustar00takluyvertakluyverimport argparse import logging import os import os.path as osp import re import sys from textwrap import dedent from extra_data import RunDirectory from extra_data.components import identify_multimod_detectors log = logging.getLogger(__name__) def parse_number(number:str): try: return float(number) except ValueError: return int(number, 0) def main(argv=None): example = dedent(""" Example: extra-data-make-virtual-cxi -o ./out_file.h5 --min-modules 15 \\ --fill-value data 0 --fill-value gain 1 /path/to/source/run """) ap = argparse.ArgumentParser( 'extra-data-make-virtual-cxi', epilog=example, formatter_class=argparse.RawDescriptionHelpFormatter, description='Write a virtual CXI file to access the detector data.' ) ap.add_argument('run_dir', help="Path to an EuXFEL run directory") # Specifying a proposal directory & a run number is the older interface. # If the run_number argument is passed, run_dir is used as proposal. ap.add_argument('run_number', nargs="?", help=argparse.SUPPRESS) ap.add_argument( '-o', '--output', help="Filename or path for the CXI output file. " "By default, it is written in the proposal's scratch directory." ) ap.add_argument( '--min-modules', type=int, default=None, metavar='N', help='Include trains where at least N modules have data (default:' ' half+1 of all detector modules).' ) ap.add_argument( '--n-modules', type=int, default=None, metavar='N', help='Number of detector modules in the experiment setup.' ' Should be used only for JUNGFRAU data.' ) ap.add_argument( '--fill-value', action='append', nargs=2, metavar=('DS', 'V'), help='define fill value (V) for individual dataset (DS). Datasets are' ' "data", "gain" and "mask". 
(defaults: data: nan (proc, float32)' ' or 0 (raw, uint16); gain: 0; mask: 0xffffffff)' ) ap.add_argument( '--exc-suspect-trains', action='store_true', help='Exclude suspect trains. This tries to avoid some issues with' ' incorrect train IDs in the data, but may mean less data is' ' available.' ) args = ap.parse_args(argv) out_file = args.output fill_values = None if args.fill_value: fill_values = {ds: parse_number(value) for ds, value in args.fill_value} logging.basicConfig(level=logging.INFO) if args.run_number: # proposal directory, run number run = 'r%04d' % int(args.run_number) proposal = args.run_dir run_dir = osp.join(args.run_dir, 'proc', run) if out_file is None: out_file = osp.join(proposal, 'scratch', '{}_detectors_virt.cxi'.format(run)) else: # run directory run_dir = os.path.abspath(args.run_dir) if out_file is None: m = re.search(r'/(raw|proc)/(r\d{4})/?$', run_dir) if not m: sys.exit("ERROR: '-o outfile' option needed when " "input directory doesn't look like .../proc/r0123") proposal = run_dir[:m.start()] fname = '{}_{}_detectors_virt.cxi'.format(*m.group(2, 1)) out_file = osp.join(proposal, 'scratch', fname) out_dir = osp.dirname(osp.abspath(out_file)) if not os.access(run_dir, os.R_OK): sys.exit("ERROR: Don't have read access to {}".format(run_dir)) if not os.access(out_dir, os.W_OK): sys.exit("ERROR: Don't have write access to {}".format(out_dir)) log.info("Reading run directory %s", run_dir) inc_suspect = not args.exc_suspect_trains run = RunDirectory(run_dir, inc_suspect_trains=inc_suspect) _, det_class = identify_multimod_detectors(run, single=True) n_modules = det_class.n_modules kwargs = {} if n_modules == 0: n_modules = args.n_modules kwargs['n_modules'] = n_modules min_modules = args.min_modules if min_modules is None: min_modules = 1 if (n_modules is None) else (n_modules // 2) + 1 det = det_class(run, min_modules=min_modules, **kwargs) det.write_virtual_cxi(out_file, fill_values) if __name__ == '__main__': main() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/components.py0000664000175000017500000014043600000000000021175 0ustar00takluyvertakluyver"""Interfaces to data from specific instruments """ import logging import numpy as np import pandas as pd import re import xarray from .exceptions import SourceNameError from .reader import DataCollection, by_id, by_index from .writer import FileWriter from .write_cxi import XtdfCXIWriter, JUNGFRAUCXIWriter __all__ = [ 'AGIPD1M', 'AGIPD500K', 'DSSC1M', 'LPD1M', 'JUNGFRAU', 'identify_multimod_detectors', ] log = logging.getLogger(__name__) MAX_PULSES = 2700 def multimod_detectors(detector_cls): """ Decorator for multimod detector classes (e.g. AGIPD/LPD/JUNGFRAU) to store them in a list 'multimod_detectors.list' and their names in 'multimod_detectors.names'. Parameters ---------- detector_cls: class Decorated detector class to append to the list. Returns ------- detector_cls: class Unmodified decorated detector class. 
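The `multimod_detectors` decorator above is a plain class registry. A standalone sketch of the same pattern, making no assumptions about EXtra-data internals:

    registry, names = [], []

    def register(cls):
        # Record the class and its name so other code can discover it later
        registry.append(cls)
        names.append(cls.__name__)
        return cls

    @register
    class FakeDetector:
        pass

    assert FakeDetector in registry and 'FakeDetector' in names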
""" multimod_detectors.list = getattr(multimod_detectors, 'list', list()) multimod_detectors.list.append(detector_cls) multimod_detectors.names = getattr(multimod_detectors, 'names', list()) multimod_detectors.names.append(detector_cls.__name__) return detector_cls def _check_pulse_selection(pulses): """Check and normalise a pulse selection""" if not isinstance(pulses, (by_id, by_index)): pulses = by_index[pulses] val = pulses.value if isinstance(pulses.value, slice): # Ensure start/stop/step are all real numbers start = val.start if (val.start is not None) else 0 stop = val.stop if (val.stop is not None) else MAX_PULSES step = val.step if (val.step is not None) else 1 if not all(isinstance(s, int) for s in (start, stop, step)): raise TypeError("Pulse selection slice must use integers or None") if step < 1: raise ValueError("Pulse selection slice must have positive step") if (start < 0) or (stop < 0): raise NotImplementedError("Negative pulse indices not supported") return type(pulses)(slice(start, stop, step)) # Convert everything except slices to numpy arrays elif isinstance(pulses.value, int): val = np.array([val], dtype=np.uint64) else: val = np.asarray(val, dtype=np.uint64) if (val < 0).any(): if isinstance(pulses, by_id): raise ValueError("Pulse IDs cannot be negative") else: raise NotImplementedError("Negative pulse indices not supported") return type(pulses)(val) class MultimodDetectorBase: """Base class for detectors made of several modules as separate data sources """ _source_re = re.compile(r'(?P.+)/DET/(\d+)CH') # Override in subclass _main_data_key = '' # Key to use for checking data counts match module_shape = (0, 0) n_modules = 0 def __init__(self, data: DataCollection, detector_name=None, modules=None, *, min_modules=1): if detector_name is None: detector_name = self._find_detector_name(data) if min_modules <= 0: raise ValueError("min_modules must be a positive integer, not " f"{min_modules!r}") source_to_modno = self._identify_sources(data, detector_name, modules) data = data.select([(src, '*') for src in source_to_modno]) self.detector_name = detector_name self.source_to_modno = source_to_modno # pandas' missing-data handling converts the data to floats if there # are any gaps - so fill them with 0s and convert back to uint64. 
mod_data_counts = pd.DataFrame({ src: data.get_data_counts(src, self._main_data_key) for src in source_to_modno }).fillna(0).astype(np.uint64) # Within any train, all modules should have same count or zero frame_counts = pd.Series(0, index=mod_data_counts.index, dtype=np.uint64) for tid, data_counts in mod_data_counts.iterrows(): count_vals = set(data_counts) - {0} if len(count_vals) > 1: raise ValueError( f"Inconsistent frame counts for train {tid}: {count_vals}" ) elif count_vals: frame_counts[tid] = count_vals.pop() self.data = self._select_trains(data, mod_data_counts, min_modules) # This should be a reversible 1-to-1 mapping self.modno_to_source = {m: s for (s, m) in source_to_modno.items()} assert len(self.modno_to_source) == len(self.source_to_modno) train_id_arr = np.asarray(self.data.train_ids) split_indices = np.where(np.diff(train_id_arr) != 1)[0] + 1 self.train_id_chunks = np.split(train_id_arr, split_indices) self.frame_counts = frame_counts[train_id_arr] @classmethod def _find_detector_name(cls, data): detector_names = set() for source in data.instrument_sources: m = cls._source_re.match(source) if m: detector_names.add(m.group('detname')) if not detector_names: raise SourceNameError(cls._source_re.pattern) elif len(detector_names) > 1: raise ValueError( "Multiple detectors found in the data: {}. " "Pass a name to data.detector() to pick one.".format( ', '.join(repr(n) for n in detector_names) ) ) return detector_names.pop() def _source_matches(self, data, detector_name): for source in data.instrument_sources: m = self._source_re.match(source) if m and m.group('detname') == detector_name: yield source, int(m.group('modno')) def _identify_sources(self, data, detector_name, modules=None): source_to_modno = dict(self._source_matches(data, detector_name)) if modules is not None: source_to_modno = {s: n for (s, n) in source_to_modno.items() if n in modules} if not source_to_modno: raise SourceNameError(f'{detector_name}/DET/...') return source_to_modno @classmethod def _select_trains(cls, data, mod_data_counts, min_modules): modules_present = (mod_data_counts > 0).sum(axis=1) mod_data_counts = mod_data_counts[modules_present >= min_modules] ntrains = len(mod_data_counts) if not ntrains: raise ValueError("No data found with >= {} modules present" .format(min_modules)) log.info("Found %d trains with data for at least %d modules", ntrains, min_modules) train_ids = mod_data_counts.index.values return data.select_trains(by_id[train_ids]) @property def train_ids(self): return self.data.train_ids @property def frames_per_train(self): counts = set(self.frame_counts.unique()) - {0} if len(counts) > 1: raise ValueError(f"Varying number of frames per train: {counts}") return counts.pop() def __repr__(self): return "<{}: Data interface for detector {!r} with {} modules>".format( type(self).__name__, self.detector_name, len(self.source_to_modno), ) @staticmethod def _concat(arrays, index, fill_value, astype): dtype = arrays[0].dtype if astype is None else np.dtype(astype) if fill_value is None: fill_value = np.nan if dtype.kind == 'f' else 0 fill_value = dtype.type(fill_value) return xarray.concat( [a.astype(dtype, copy=False) for a in arrays], pd.Index(index, name='module'), fill_value=fill_value ) def get_array(self, key, *, fill_value=None, roi=(), astype=None): """Get a labelled array of detector data Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. fill_value: int or float, optional Value to use for missing values. 
If None (default) the fill value is 0 for integers and np.nan for floats. roi: tuple Specify e.g. ``np.s_[10:60, 100:200]`` to select pixels within each module when reading data. The selection is applied to each individual module, so it may only be useful when working with a single module. astype: Type Data type of the output array. If None (default) the dtype matches the input array dtype """ arrays = [] modnos = [] for modno, source in sorted(self.modno_to_source.items()): arrays.append(self.data.get_array(source, key, roi=roi)) modnos.append(modno) return self._concat(arrays, modnos, fill_value, astype) def get_dask_array(self, key, fill_value=None, astype=None): """Get a labelled Dask array of detector data Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. astype: Type Data type of the output array. If None (default) the dtype matches the input array dtype """ arrays = [] modnos = [] for modno, source in sorted(self.modno_to_source.items()): modnos.append(modno) mod_arr = self.data.get_dask_array(source, key, labelled=True) arrays.append(mod_arr) return self._concat(arrays, modnos, fill_value, astype) def trains(self, require_all=True): """Iterate over trains for detector data. Parameters ---------- require_all: bool If True (default), skip trains where any of the selected detector modules are missing data. Yields ------ train_data: dict A dictionary mapping key names (e.g. ``image.data``) to labelled arrays. """ return MPxDetectorTrainIterator(self, require_all=require_all) class XtdfDetectorBase(MultimodDetectorBase): """Common machinery for a group of detectors with similar data format AGIPD, DSSC & LPD all store pulse-resolved data in an "image" group, with both trains and pulses along the first dimension. This allows a different number of frames to be stored for each train, which makes access more complicated. """ n_modules = 16 _main_data_key = 'image.data' def __init__(self, data: DataCollection, detector_name=None, modules=None, *, min_modules=1): super().__init__(data, detector_name, modules, min_modules=min_modules) @staticmethod def _select_pulse_ids(pulses, data_pulse_ids): """Select pulses by ID across a chunk of trains Returns an array or slice of the indexes to include. """ if isinstance(pulses.value, slice): if pulses.value == slice(0, MAX_PULSES, 1): # All pulses included return slice(0, len(data_pulse_ids)) else: s = pulses.value desired = np.arange(s.start, s.stop, step=s.step, dtype=np.uint64) else: desired = pulses.value return np.nonzero(np.isin(data_pulse_ids, desired))[0] @staticmethod def _select_pulse_indices(pulses, firsts, counts): """Select pulses by index across a chunk of trains Returns an array or slice of the indexes to include. """ if isinstance(pulses.value, slice): if pulses.value == slice(0, MAX_PULSES, 1): # All pulses included return slice(0, counts.sum()) else: s = pulses.value desired = np.arange(s.start, s.stop, step=s.step, dtype=np.uint64) else: desired = pulses.value positions = [] for first, count in zip(firsts, counts): train_desired = desired[desired < count] positions.append(first + train_desired) return np.concatenate(positions) def _make_image_index(self, tids, inner_ids, inner_name='pulse'): """ Prepare indices for data per inner coordinate. Parameters ---------- tids: np.array Train id repeated for each inner coordinate. 
inner_ids: np.array Array of inner coordinate values. inner_name: string Name of the inner coordinate. Returns ------- pd.MultiIndex MultiIndex of 'train_ids' x 'inner_ids'. """ # Overridden in LPD1M for parallel gain mode return pd.MultiIndex.from_arrays( [tids, inner_ids], names=['train', inner_name] ) @staticmethod def _guess_axes(data, train_pulse_ids, unstack_pulses): # Raw files have a spurious extra dimension if data.ndim >= 2 and data.shape[1] == 1: data = data[:, 0] # TODO: this assumes we can tell what the axes are just from the # number of dimensions. Works for the data we've seen, but we # should look for a more reliable way. if data.ndim == 4: # image.data in raw data dims = ['train_pulse', 'data_gain', 'slow_scan', 'fast_scan'] elif data.ndim == 3: # image.data, image.gain, image.mask in calibrated data dims = ['train_pulse', 'slow_scan', 'fast_scan'] else: # Everything else seems to be 1D dims = ['train_pulse'] arr = xarray.DataArray(data, {'train_pulse': train_pulse_ids}, dims=dims) if unstack_pulses: # Separate train & pulse dimensions, and arrange dimensions # so that the data is contiguous in memory. dim_order = train_pulse_ids.names + dims[1:] return arr.unstack('train_pulse').transpose(*dim_order) else: return arr def _get_module_pulse_data(self, source, key, pulses, unstack_pulses, inner_index='pulseId', roi=()): def get_inner_ids(f, data_slice, ix_name='pulseId'): ids = f.file[f'/INSTRUMENT/{source}/{group}/{ix_name}'][ data_slice ] # Raw files have a spurious extra dimension if ids.ndim >= 2 and ids.shape[1] == 1: ids = ids[:, 0] return ids seq_arrays = [] data_path = "/INSTRUMENT/{}/{}".format(source, key.replace('.', '/')) for f in self.data._source_index[source]: group = key.partition('.')[0] firsts, counts = f.get_index(source, group) for chunk_tids in self.train_id_chunks: if chunk_tids[-1] < f.train_ids[0] or chunk_tids[0] > f.train_ids[-1]: # No overlap continue first_tid = max(chunk_tids[0], f.train_ids[0]) first_train_idx = np.nonzero(f.train_ids == first_tid)[0][0] last_tid = min(chunk_tids[-1], f.train_ids[-1]) last_train_idx = np.nonzero(f.train_ids == last_tid)[0][0] chunk_firsts = firsts[first_train_idx : last_train_idx + 1] chunk_counts = counts[first_train_idx : last_train_idx + 1] data_slice = slice( chunk_firsts[0], int(chunk_firsts[-1] + chunk_counts[-1]) ) inner_ids = get_inner_ids(f, data_slice, inner_index) trainids = np.repeat( np.arange(first_tid, last_tid + 1, dtype=np.uint64), chunk_counts.astype(np.intp), ) index = self._make_image_index( trainids, inner_ids, inner_index[:-2] ) if isinstance(pulses, by_id): # Get the pulse ID values out of the MultiIndex rather than # using inner_ids, because LPD1M in parallel_gain mode # makes the MultiIndex differently, repeating pulse IDs. 
if inner_index == 'pulseId': pulse_id = index.get_level_values('pulse') else: pulse_id = self._make_image_index( trainids, get_inner_ids(f, data_slice, 'pulseId'), ).get_level_values('pulse') positions = self._select_pulse_ids(pulses, pulse_id) else: # by_index positions = self._select_pulse_indices( pulses, chunk_firsts - data_slice.start, chunk_counts ) index = index[positions] if isinstance(positions, slice): data_positions = slice( int(data_slice.start + positions.start), int(data_slice.start + positions.stop), positions.step ) else: # ndarray data_positions = data_slice.start + positions dset = f.file[data_path] if dset.ndim >= 2 and dset.shape[1] == 1: # Ensure ROI applies to pixel dimensions, not the extra # dim in raw data (except AGIPD, where it is data/gain) sel_args = (data_positions, np.s_[:]) + roi else: sel_args = (data_positions,) + roi data = f.file[data_path][sel_args] arr = self._guess_axes(data, index, unstack_pulses) seq_arrays.append(arr) non_empty = [a for a in seq_arrays if (a.size > 0)] if not non_empty: if seq_arrays: # All per-file arrays are empty, so just return the first one. return seq_arrays[0] raise Exception( "Unable to get data for source {!r}, key {!r}. " "Please report an issue so we can investigate" .format(source, key) ) return xarray.concat( sorted(non_empty, key=lambda a: a.coords['train'][0]), dim=('train' if unstack_pulses else 'train_pulse'), ) def get_array(self, key, pulses=np.s_[:], unstack_pulses=True, *, fill_value=None, subtrain_index='pulseId', roi=(), astype=None): """Get a labelled array of detector data Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. pulses: slice, array, by_id or by_index Select the pulses to include from each train. by_id selects by pulse ID, by_index by index within the data being read. The default includes all pulses. Only used for per-pulse data. unstack_pulses: bool Whether to separate train and pulse dimensions. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. subtrain_index: str Specify 'pulseId' (default) or 'cellId' to label the frames recorded within each train. Pulse ID should allow this data to be matched with other devices, but depends on how the detector was manually configured when the data was taken. Cell ID refers to the memory cell used for that frame in the detector hardware. roi: tuple Specify e.g. ``np.s_[10:60, 100:200]`` to select pixels within each module when reading data. The selection is applied to each individual module, so it may only be useful when working with a single module. For AGIPD raw data, each module records a frame as a 3D array with 2 entries on the first dimension, for data & gain information, so ``roi=np.s_[0]`` will select only the data part of each frame. astype: Type data type of the output array. 
If None (default) the dtype matches the input array dtype """ if subtrain_index not in {'pulseId', 'cellId'}: raise ValueError("subtrain_index must be 'pulseId' or 'cellId'") if not isinstance(roi, tuple): roi = (roi,) if key.startswith('image.'): pulses = _check_pulse_selection(pulses) arrays, modnos = [], [] for modno, source in sorted(self.modno_to_source.items()): arrays.append(self._get_module_pulse_data( source, key, pulses, unstack_pulses, subtrain_index, roi=roi )) modnos.append(modno) return self._concat(arrays, modnos, fill_value, astype) else: return super().get_array( key, fill_value=fill_value, roi=roi, astype=astype ) def get_dask_array(self, key, subtrain_index='pulseId', fill_value=None, astype=None): """Get a labelled Dask array of detector data Dask does lazy, parallelised computing, and can work with large data volumes. This method doesn't immediately load the data: that only happens once you trigger a computation. Parameters ---------- key: str The data to get, e.g. 'image.data' for pixel values. subtrain_index: str, optional Specify 'pulseId' (default) or 'cellId' to label the frames recorded within each train. Pulse ID should allow this data to be matched with other devices, but depends on how the detector was manually configured when the data was taken. Cell ID refers to the memory cell used for that frame in the detector hardware. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. astype: Type, optional data type of the output array. If None (default) the dtype matches the input array dtype """ if subtrain_index not in {'pulseId', 'cellId'}: raise ValueError("subtrain_index must be 'pulseId' or 'cellId'") arrays = [] modnos = [] for modno, source in sorted(self.modno_to_source.items()): modnos.append(modno) mod_arr = self.data.get_dask_array(source, key, labelled=True) # At present, all the per-pulse data is stored in the 'image' key. # If that changes, this check will need to change as well. if key.startswith('image.'): # Add pulse IDs to create multi-level index inner_ix = self.data.get_array(source, 'image.' + subtrain_index) # Raw files have a spurious extra dimension if inner_ix.ndim >= 2 and inner_ix.shape[1] == 1: inner_ix = inner_ix[:, 0] mod_arr = mod_arr.rename({'trainId': 'train_pulse'}) mod_arr.coords['train_pulse'] = self._make_image_index( mod_arr.coords['train_pulse'].values, inner_ix.values, inner_name=subtrain_index, ).set_names('trainId', level=0) # This uses 'trainId' where a concrete array from the same class # uses 'train'. I didn't notice that inconsistency when I # introduced it, and now code may be relying on each name. arrays.append(mod_arr) return self._concat(arrays, modnos, fill_value, astype) def trains(self, pulses=np.s_[:], require_all=True): """Iterate over trains for detector data. Parameters ---------- pulses: slice, array, by_index or by_id Select which pulses to include for each train. The default is to include all pulses. require_all: bool If True (default), skip trains where any of the selected detector modules are missing data. Yields ------ train_data: dict A dictionary mapping key names (e.g. ``image.data``) to labelled arrays. """ return MPxDetectorTrainIterator(self, pulses, require_all=require_all) def write_virtual_cxi(self, filename, fillvalues=None): """Write a virtual CXI file to access the detector data. 
The virtual datasets in the file provide a view of the detector data as if it was a single huge array, but without copying the data. Creating and using virtual datasets requires HDF5 1.10. Parameters ---------- filename: str The file to be written. Will be overwritten if it already exists. fillvalues: dict, optional keys are datasets names (one of: data, gain, mask) and associated fill value for missing data (default is np.nan for float arrays and zero for integer arrays) """ XtdfCXIWriter(self).write(filename, fillvalues=fillvalues) def write_frames(self, filename, trains, pulses): """Write selected detector frames to a new EuXFEL HDF5 file trains and pulses should be 1D arrays of the same length, containing train IDs and pulse IDs (corresponding to the pulse IDs recorded by the detector). i.e. (trains[i], pulses[i]) identifies one frame. """ if (trains.ndim != 1) or (pulses.ndim != 1): raise ValueError("trains & pulses must be 1D arrays") inc_tp_ids = zip_trains_pulses(trains, pulses) writer = FramesFileWriter(filename, self.data, inc_tp_ids) try: writer.write() finally: writer.file.close() def zip_trains_pulses(trains, pulses): """Combine two similar arrays of train & pulse IDs as one struct array """ if trains.shape != pulses.shape: raise ValueError( f"Train & pulse arrays don't match ({trains.shape} != {pulses.shape})" ) res = np.zeros(trains.shape, dtype=np.dtype([ ('trainId', np.uint64), ('pulseId', np.uint64) ])) res['trainId'] = trains res['pulseId'] = pulses return res class FramesFileWriter(FileWriter): """Write selected detector frames in European XFEL HDF5 format""" def __init__(self, path, data, inc_tp_ids): super().__init__(path, data) self.inc_tp_ids = inc_tp_ids def _guess_number_of_storing_entries(self, source, key): if source in self.data.instrument_sources and key.startswith("image."): # Start with an empty dataset, grow it as we add each file return 0 else: return super()._guess_number_of_storing_entries(source, key) def copy_image_data(self, source, keys): """Copy selected frames of the detector image data""" frame_tids_piecewise = [] src_files = sorted( self.data._source_index[source], key=lambda fa: fa.train_ids[0] ) for fa in src_files: _, counts = fa.get_index(source, 'image') file_tids = np.repeat(fa.train_ids, counts.astype(np.intp)) file_pids = fa.file[f'/INSTRUMENT/{source}/image/pulseId'][:] if file_pids.ndim == 2 and file_pids.shape[1] == 1: # Raw data has a spurious extra dimension file_pids = file_pids[:, 0] # Data can have trailing 0s, seemingly file_pids = file_pids[:len(file_tids)] file_tp_ids = zip_trains_pulses(file_tids, file_pids) # indexes of selected frames in datasets under .../image in this file ixs = np.isin(file_tp_ids, self.inc_tp_ids).nonzero()[0] nframes = ixs.shape[0] for key in keys: path = f"INSTRUMENT/{source}/{key.replace('.', '/')}" dst_ds = self.file[path] dst_cursor = dst_ds.shape[0] dst_ds.resize(dst_cursor + nframes, axis=0) dst_ds[dst_cursor: dst_cursor+nframes] = fa.file[path][ixs] frame_tids_piecewise.append(file_tids[ixs]) frame_tids = np.concatenate(frame_tids_piecewise) self._make_index(source, 'image', frame_tids) def copy_source(self, source): """Copy all the relevant data for one detector source""" if source not in self.data.instrument_sources: return super().copy_source(source) all_keys = self.data.keys_for_source(source) img_keys = {k for k in all_keys if k.startswith('image.')} for key in sorted(all_keys - img_keys): self.copy_dataset(source, key) self.copy_image_data(source, sorted(img_keys)) class 
MPxDetectorTrainIterator: """Iterate over trains in detector data, assembling arrays. Created by :meth:`DetectorData.trains`. """ def __init__(self, data, pulses=by_index[:], require_all=True): self.data = data self.pulses = _check_pulse_selection(pulses) self.require_all = require_all # {(source, key): (f, dataset)} self._datasets_cache = {} def _find_data(self, source, key, tid): """ Find FileAccess instance and dataset corresponding to source, key, and train id tid. Parameters ---------- source: string Path to keys in HD5 file, e.g.: 'SPB_DET_AGIPD1M-1/DET/5CH0:xtdf'. key: string Key for data at source separated by dot, e.g.: 'image.data'. tid: np.int Train id. Returns ------- Tuple[FileAccess, int, h5py.Dataset] FileAccess Instance for the HD5 file with requested data. int Starting index for the requested data. h5py.Dataset h5py dataset with found data. """ file, ds = self._datasets_cache.get((source, key), (None, None)) if ds: ixs = (file.train_ids == tid).nonzero()[0] if ixs.size > 0: return file, ixs[0], ds data = self.data.data path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/')) f, pos = data._find_data(source, tid) if f is not None: ds = f.file[path] self._datasets_cache[(source, key)] = (f, ds) return f, pos, ds return None, None, None def _get_slow_data(self, source, key, tid): """ Get an array of slow (per train) data corresponding to source, key, and train id tid. Also used for JUNGFRAU data with memory cell dimension. Parameters ---------- source: string Path to keys in HD5 file, e.g.: 'SPB_DET_AGIPD1M-1/DET/5CH0:xtdf'. key: string Key for data at source separated by dot, e.g.: 'header.pulseCount'. tid: np.int Train id. Returns ------- xarray.DataArray Array of selected slow data. In case there are more than one frame for the train id tid - train id dimension is kept indexing frames within tid. """ file, pos, ds = self._find_data(source, key, tid) if file is None: return None group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] if count == 1: return xarray.DataArray(ds[first]) else: return xarray.DataArray(ds[first : first + count]) def _get_pulse_data(self, source, key, tid): """ Get an array of per pulse data corresponding to source, key, and train id tid. Used only for AGIPD-like detectors, for JUNGFRAU-like per-cell data '_get_slow_data' is used. Parameters ---------- source: string Path to keys in HD5 file, e.g.: 'SPB_DET_AGIPD1M-1/DET/5CH0:xtdf'. key: string Key for data at source separated by dot, e.g.: 'image.data'. tid: np.int Train id. Returns ------- xarray.DataArray Array of selected per pulse data. 
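To make the train-iteration machinery concrete, a usage sketch (the run path is hypothetical; LPD1M is one of the detector classes defined further down in this module):

    import numpy as np
    from extra_data import RunDirectory
    from extra_data.components import LPD1M

    run = RunDirectory('/gpfs/exfel/exp/FXE/201701/p002017/raw/r0034')  # hypothetical
    det = LPD1M(run, min_modules=8)
    for tid, data in det.trains(pulses=np.s_[:16]):
        image = data['image.data']   # labelled array with a 'module' dimension
        break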
""" file, pos, ds = self._find_data(source, key, tid) if file is None: return None group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] pulse_ids = file.file['/INSTRUMENT/{}/{}/pulseId'.format(source, group)][ first : first + count ] # Raw files have a spurious extra dimension if pulse_ids.ndim >= 2 and pulse_ids.shape[1] == 1: pulse_ids = pulse_ids[:, 0] if isinstance(self.pulses, by_id): positions = self._select_pulse_ids(pulse_ids) else: # by_index positions = self._select_pulse_indices(count) pulse_ids = pulse_ids[positions] train_ids = np.array([tid] * len(pulse_ids), dtype=np.uint64) train_pulse_ids = self.data._make_image_index(train_ids, pulse_ids) if isinstance(positions, slice): data_positions = slice( int(first + positions.start), int(first + positions.stop), positions.step ) else: # ndarray data_positions = first + positions return self.data._guess_axes( ds[data_positions], train_pulse_ids, unstack_pulses=True ) def _select_pulse_ids(self, pulse_ids): """Select pulses by ID Returns an array or slice of the indexes to include. """ val = self.pulses.value N = len(pulse_ids) if isinstance(val, slice): if val.step == 1: after_start = np.nonzero(pulse_ids >= val.start)[0] after_stop = np.nonzero(pulse_ids >= val.stop)[0] start_ix = after_start[0] if (after_start.size > 0) else N stop_ix = after_stop[0] if (after_stop.size > 0) else N return slice(start_ix, stop_ix) # step != 1 desired = np.arange(val.start, val.stop, step=val.step, dtype=np.uint64) else: desired = val return np.nonzero(np.isin(pulse_ids, desired))[0] def _select_pulse_indices(self, count): """Select pulses by index Returns an array or slice of the indexes to include. """ val = self.pulses.value if isinstance(val, slice): return slice(val.start, min(val.stop, count), val.step) # ndarray return val[val < count] def _assemble_data(self, tid): """ Assemble data for all keys into a dictionary for specified train id. Parameters ---------- tid: int Train id. Returns ------- Dict[str, xarray]: str Key name. xarray Assembled data array. """ key_module_arrays = {} for modno, source in sorted(self.data.modno_to_source.items()): for key in self.data.data._keys_for_source(source): # At present, all the per-pulse data is stored in the 'image' key. # If that changes, this check will need to change as well. if key.startswith('image.'): mod_data = self._get_pulse_data(source, key, tid) else: mod_data = self._get_slow_data(source, key, tid) if mod_data is None: continue if key not in key_module_arrays: key_module_arrays[key] = [], [] modnos, data = key_module_arrays[key] modnos.append(modno) data.append(mod_data) # Assemble the data for each key into one xarray return { k: xarray.concat(data, pd.Index(modnos, name='module')) for (k, (modnos, data)) in key_module_arrays.items() } def __iter__(self): """ Iterate over train ids and yield assembled data dictionaries. Yields ------ Tuple[int, Dict[str, xarray]]: int train id. Dict[str, xarray] assembled {key: data array} dictionary. """ for tid in self.data.train_ids: tid = int(tid) # Convert numpy int to regular Python int if self.require_all and self.data.data._check_data_missing(tid): continue yield tid, self._assemble_data(tid) @multimod_detectors class AGIPD1M(XtdfDetectorBase): """An interface to AGIPD-1M data. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. 
detector_name: str, optional Name of a detector, e.g. 'SPB_DET_AGIPD1M-1'. This is only needed if the dataset includes more than one AGIPD detector. min_modules: int Include trains where at least n modules have data. Default is 1. """ _source_re = re.compile(r'(?P.+_AGIPD1M.*)/DET/(?P\d+)CH') module_shape = (512, 128) @multimod_detectors class AGIPD500K(XtdfDetectorBase): """An interface to AGIPD-500K data Detector names are like 'HED_DET_AGIPD500K2G', otherwise this is identical to :class:`AGIPD1M`. """ _source_re = re.compile(r'(?P.+_AGIPD500K.*)/DET/(?P\d+)CH') module_shape = (512, 128) n_modules = 8 @multimod_detectors class DSSC1M(XtdfDetectorBase): """An interface to DSSC-1M data. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. detector_name: str, optional Name of a detector, e.g. 'SCS_DET_DSSC1M-1'. This is only needed if the dataset includes more than one DSSC detector. min_modules: int Include trains where at least n modules have data. Default is 1. """ _source_re = re.compile(r'(?P.+_DSSC1M.*)/DET/(?P\d+)CH') module_shape = (128, 512) @multimod_detectors class LPD1M(XtdfDetectorBase): """An interface to LPD-1M data. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. detector_name: str, optional Name of a detector, e.g. 'FXE_DET_LPD1M-1'. This is only needed if the dataset includes more than one LPD detector. min_modules: int Include trains where at least n modules have data. Default is 1. parallel_gain: bool Set to True to read this data as parallel gain data, where high, medium and low gain data are stored sequentially within each train. This will repeat the pulse & cell IDs from the first 1/3 of each train, and add gain stage labels from 0 (high-gain) to 2 (low-gain). """ _source_re = re.compile(r'(?P.+_LPD1M.*)/DET/(?P\d+)CH') module_shape = (256, 256) def __init__(self, data: DataCollection, detector_name=None, modules=None, *, min_modules=1, parallel_gain=False): super().__init__(data, detector_name, modules, min_modules=min_modules) self.parallel_gain = parallel_gain if parallel_gain: if ((self.frame_counts % 3) != 0).any(): raise ValueError( "parallel_gain=True needs the frames in each train to be divisible by 3" ) def _select_pulse_indices(self, pulses, firsts, counts): if not self.parallel_gain: return super()._select_pulse_indices(pulses, firsts, counts) if isinstance(pulses.value, slice): if pulses.value == slice(0, MAX_PULSES, 1): # All pulses included return slice(0, counts.sum()) else: s = pulses.value desired = np.arange(s.start, s.stop, step=s.step, dtype=np.uint64) else: desired = pulses.value positions = [] for ix, frames in zip(firsts, counts): n_per_gain_stage = int(frames // 3) train_desired = desired[desired < n_per_gain_stage] for stage in range(3): start = ix + np.uint64(stage * n_per_gain_stage) positions.append(start + train_desired) return np.concatenate(positions) def _make_image_index(self, tids, inner_ids, inner_name='pulse'): if not self.parallel_gain: return super()._make_image_index(tids, inner_ids, inner_name) # In 'parallel gain' mode, the first 1/3 of pulse/cell IDs in each train # are valid, but the remaining 2/3 are junk. So we'll repeat the valid # ones 3 times (in inner_ids_fixed). 
At the same time, we make a gain # stage index (0-2), so each frame has a unique entry in the MultiIndex # (train ID, gain, pulse/cell ID) gain = np.zeros_like(inner_ids, dtype=np.uint8) inner_ids_fixed = np.zeros_like(inner_ids) _, firsts, counts = np.unique(tids, return_index=True, return_counts=True) for ix, frames in zip(firsts, counts): # Iterate through trains n_per_gain_stage = int(frames // 3) train_inner_ids = inner_ids[ix: ix + n_per_gain_stage] for stage in range(3): start = ix + (stage * n_per_gain_stage) end = start + n_per_gain_stage gain[start:end] = stage inner_ids_fixed[start:end] = train_inner_ids return pd.MultiIndex.from_arrays( [tids, gain, inner_ids_fixed], names=['train', 'gain', inner_name] ) @multimod_detectors class JUNGFRAU(MultimodDetectorBase): """An interface to JUNGFRAU data. JNGFR, JF1M, JF4M all store data in a "data" group, with trains along the first and memory cells along the second dimension. This allows only a set number of frames to be stored for each train. Parameters ---------- data: DataCollection A data collection, e.g. from :func:`.RunDirectory`. detector_name: str, optional Name of a detector, e.g. 'SPB_IRDA_JNGFR'. This is only needed if the dataset includes more than one JUNGFRAU detector. modules: set of ints, optional Detector module numbers to use. By default, all available modules are used. min_modules: int Include trains where at least n modules have data. Default is 1. n_modules: int Number of detector modules in the experiment setup. Default is None, in which case it will be estimated from the available data. """ # We appear to have a few different formats for source names: # SPB_IRDA_JNGFR/DET/MODULE_1:daqOutput (e.g. in p 2566, r 61) # SPB_IRDA_JF4M/DET/JNGFR03:daqOutput (e.g. in p 2732, r 12) # FXE_XAD_JF1M/DET/RECEIVER-1 _source_re = re.compile( r'(?P.+_(JNGFR|JF[14]M))/DET/' r'(MODULE_|RECEIVER-|JNGFR)(?P\d+)' ) _main_data_key = 'data.adc' module_shape = (512, 1024) def __init__(self, data: DataCollection, detector_name=None, modules=None, *, min_modules=1, n_modules=None): super().__init__(data, detector_name, modules, min_modules=min_modules) if n_modules is not None: self.n_modules = int(n_modules) else: # For JUNGFRAU modules are indexed from 1 self.n_modules = max(modno for (_, modno) in self._source_matches( data, self.detector_name )) @staticmethod def _label_dims(arr): # Label dimensions to match the AGIPD/DSSC/LPD data access ndim_pertrain = arr.ndim if 'trainId' in arr.dims: arr = arr.rename({'trainId': 'train'}) ndim_pertrain = arr.ndim - 1 if ndim_pertrain == 4: arr = arr.rename({ 'dim_0': 'cell', 'dim_1': 'slow_scan', 'dim_2': 'fast_scan' }) elif ndim_pertrain == 2: arr = arr.rename({'dim_0': 'cell'}) return arr def get_array(self, key, *, fill_value=None, roi=(), astype=None): """Get a labelled array of detector data Parameters ---------- key: str The data to get, e.g. 'data.adc' for pixel values. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. roi: tuple Specify e.g. ``np.s_[:, 10:60, 100:200]`` to select data within each module & each train when reading data. The first dimension is pulses, then there are two pixel dimensions. The same selection is applied to data from each module, so selecting pixels may only make sense if you're using a single module. astype: Type data type of the output array. 
If None (default) the dtype matches the input array dtype """ arr = super().get_array(key, fill_value=fill_value, roi=roi, astype=astype) return self._label_dims(arr) def get_dask_array(self, key, fill_value=None, astype=None): """Get a labelled Dask array of detector data Dask does lazy, parallelised computing, and can work with large data volumes. This method doesn't immediately load the data: that only happens once you trigger a computation. Parameters ---------- key: str The data to get, e.g. 'data.adc' for pixel values. fill_value: int or float, optional Value to use for missing values. If None (default) the fill value is 0 for integers and np.nan for floats. astype: Type data type of the output array. If None (default) the dtype matches the input array dtype """ arr = super().get_dask_array(key, fill_value=fill_value, astype=astype) return self._label_dims(arr) def trains(self, require_all=True): """Iterate over trains for detector data. Parameters ---------- require_all: bool If True (default), skip trains where any of the selected detector modules are missing data. Yields ------ train_data: dict A dictionary mapping key names (e.g. 'data.adc') to labelled arrays. """ for tid, d in super().trains(require_all=require_all): yield tid, {k: self._label_dims(a) for (k, a) in d.items()} def write_virtual_cxi(self, filename, fillvalues=None): """Write a virtual CXI file to access the detector data. The virtual datasets in the file provide a view of the detector data as if it was a single huge array, but without copying the data. Creating and using virtual datasets requires HDF5 1.10. Parameters ---------- filename: str The file to be written. Will be overwritten if it already exists. fillvalues: dict, optional keys are datasets names (one of: data, gain, mask) and associated fill value for missing data (default is np.nan for float arrays and zero for integer arrays) """ JUNGFRAUCXIWriter(self).write(filename, fillvalues=fillvalues) def identify_multimod_detectors( data, detector_name=None, *, single=False, clses=None ): """Identify multi-module detectors in the data Various detectors record data for individual X-ray pulses within trains, and we often want to process whichever detector was used in a run. This tries to identify the detector, so a user doesn't have to specify it manually. If ``single=True``, this returns a tuple of (detector_name, access_class), throwing ``ValueError`` if there isn't exactly 1 detector found. If ``single=False``, it returns a set of these tuples. *clses* may be a list of acceptable detector classes to check. 
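A usage sketch mirroring how the make_virtual_cxi command-line tool above uses this function (the run path is a placeholder):

    from extra_data import RunDirectory
    from extra_data.components import identify_multimod_detectors

    run = RunDirectory('/path/to/proc/r0012')             # hypothetical path
    name, det_class = identify_multimod_detectors(run, single=True)
    det = det_class(run, min_modules=1)
    print(name, det)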
""" if clses is None: clses = multimod_detectors.list res = set() for cls in clses: for source in data.instrument_sources: m = cls._source_re.match(source) if m: name = m.group('detname') if (detector_name is None) or (name == detector_name): res.add((name, cls)) if single: if len(res) < 1: raise ValueError("No detector sources identified in the data") elif len(res) > 1: raise ValueError("Multiple detectors identified: {}".format( ", ".join(name for (name, _) in res) )) return res.pop() return res ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/exceptions.py0000664000175000017500000000214600000000000021164 0ustar00takluyvertakluyver"""Exception classes specific to extra_data.""" class SourceNameError(KeyError): def __init__(self, source): self.source = source def __str__(self): return ( "This data has no source named {!r}.\n" "See data.all_sources for available sources.".format(self.source) ) class PropertyNameError(KeyError): def __init__(self, prop, source): self.prop = prop self.source = source def __str__(self): return "No property {!r} for source {!r}".format(self.prop, self.source) class TrainIDError(KeyError): def __init__(self, train_id): self.train_id = train_id def __str__(self): return "Train ID {!r} not found in this data".format(self.train_id) class MultiRunError(ValueError): def __str__(self): return ( "The requested data is only available for a single run. This " "EXtra-data DataCollection may have data from multiple runs, e.g. " "because you have used .union() to combine data. Please retrieve " "this information before combining." ) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/export.py0000664000175000017500000001572300000000000020331 0ustar00takluyvertakluyver# coding: utf-8 """Expose data to different interface ZMQStream explose to a ZeroMQ socket in a REQ/REP pattern. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ from argparse import ArgumentParser import os.path as osp from warnings import warn from karabo_bridge import ServerInThread from karabo_bridge.server import Sender from .components import XtdfDetectorBase from .exceptions import SourceNameError from .reader import RunDirectory, H5File from .stacking import stack_detector_data from .utils import find_infiniband_ip __all__ = ['ZMQStreamer', 'serve_files'] class ZMQStreamer(ServerInThread): def __init__(self, port, sock='REP', maxlen=10, protocol_version='2.2', dummy_timestamps=False): warn("Please use :ref:karabo_bridge.ServerInThread instead", DeprecationWarning, stacklevel=2) endpoint = f'tcp://*:{port}' super().__init__(endpoint, sock=sock, maxlen=maxlen, protocol_version=protocol_version, dummy_timestamps=dummy_timestamps) def _iter_trains(data, merge_detector=False): """Iterate over trains in data and merge detector tiles in a single source :data: DataCollection :merge_detector: bool if True and data contains detector data (e.g. AGIPD) individual sources for each detector tiles are merged in a single source. The new source name keep the original prefix, but replace the last 2 part with '/DET/APPEND'. 
Individual sources are removed from the train data :yield: dict train data """ det, source_name = None, '' if merge_detector: for detector in XtdfDetectorBase.__subclasses__(): try: det = detector(data) source_name = f'{det.detector_name}/DET/APPEND' except SourceNameError: continue else: break for tid, train_data in data.trains(): if not train_data: continue if det is not None: det_data = { k: v for k, v in train_data.items() if k in det.data.detector_sources } # get one of the module to reference other datasets train_data[source_name] = mod_data = next(iter(det_data.values())) stacked = stack_detector_data(det_data, 'image.data') mod_data['image.data'] = stacked mod_data['metadata']['source'] = source_name if 'image.gain' in mod_data: stacked = stack_detector_data(det_data, 'image.gain') mod_data['image.gain'] = stacked if 'image.mask' in mod_data: stacked = stack_detector_data(det_data, 'image.mask') mod_data['image.mask'] = stacked # remove individual module sources for src in det.data.detector_sources: del train_data[src] yield tid, train_data def serve_files(path, port, source_glob='*', key_glob='*', append_detector_modules=False, dummy_timestamps=False, use_infiniband=False, sock='REP'): """Stream data from files through a TCP socket. Parameters ---------- path: str Path to the HDF5 file or file folder. port: str or int A ZMQ endpoint (e.g. 'tcp://*:44444') or a TCP port to bind the socket to. Integers or strings of all digits are treated as port numbers. source_glob: str Only stream sources matching this glob pattern. Streaming data selectively is more efficient than streaming everything. key_glob: str Only stream keys matching this glob pattern in the selected sources. append_detector_modules: bool Combine multi-module detector data in a single data source (sources for individual modules are removed). The last section of the source name is replaces with 'APPEND', example: 'SPB_DET_AGIPD1M-1/DET/#CH0:xtdf' -> 'SPB_DET_AGIPD1M-1/DET/APPEND' Supported detectors: AGIPD, DSSC, LPD dummy_timestamps: bool Whether to add mock timestamps if the metadata lacks them. use_infiniband: bool Use infiniband interface if available (if port specifies a TCP port) sock: str socket type - supported: REP, PUB, PUSH (default REP). """ if osp.isdir(path): data = RunDirectory(path) else: data = H5File(path) data = data.select(source_glob, key_glob) if isinstance(port, int) or port.isdigit(): endpt = f'tcp://{find_infiniband_ip() if use_infiniband else "*"}:{port}' else: endpt = port sender = Sender(endpt, sock=sock, dummy_timestamps=dummy_timestamps) print(f'Streamer started on: {sender.endpoint}') for tid, data in _iter_trains(data, merge_detector=append_detector_modules): sender.send(data) # The karabo-bridge code sets linger to 0 so that it doesn't get stuck if # the client goes away. But this would also mean that we close the socket # when the last messages have been queued but not sent. So if we've # successfully queued all the messages, set linger -1 (i.e. infinite) to # wait until ZMQ has finished transferring them before the socket is closed. 
sender.server_socket.close(linger=-1) def main(argv=None): ap = ArgumentParser(prog="karabo-bridge-serve-files") ap.add_argument("path", help="Path of a file or run directory to serve") ap.add_argument("port", help="TCP port or ZMQ endpoint to send data on") ap.add_argument( "--source", help="Stream only matching sources ('*' is a wildcard)", default='*', ) ap.add_argument( "--key", help="Stream only matching keys ('*' is a wildcard)", default='*', ) ap.add_argument( "--append-detector-modules", help="combine multiple module sources" " into one (will only work for AGIPD data currently).", action='store_true' ) ap.add_argument( "--dummy-timestamps", help="create dummy timestamps if the meta-data" " lacks proper timestamps", action='store_true' ) ap.add_argument( "--use-infiniband", help="Use infiniband interface if available " "(if a TCP port is specified)", action='store_true' ) ap.add_argument( "-z", "--socket-type", help="ZeroMQ socket type", choices=['PUB', 'PUSH', 'REP'], default='REP' ) args = ap.parse_args(argv) try: serve_files( args.path, args.port, source_glob=args.source, key_glob=args.key, append_detector_modules=args.append_detector_modules, dummy_timestamps=args.dummy_timestamps, use_infiniband=args.use_infiniband, sock=args.socket_type ) except KeyboardInterrupt: pass print('\nStopped.') ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627645754.0 EXtra-data-1.7.0/extra_data/file_access.py0000664000175000017500000003654600000000000021256 0ustar00takluyvertakluyver"""Internal module for accessing EuXFEL HDF5 files This includes convenience features for getting the metadata & indexes from a file, as well as machinery to close less recently accessed files, so we don't run into the limit on the number of open files. """ from collections import defaultdict, OrderedDict import h5py, h5py.h5o import numpy as np import os import os.path as osp import resource from warnings import warn from weakref import WeakValueDictionary from .exceptions import SourceNameError # Track all FileAccess objects - {path: FileAccess} file_access_registry = WeakValueDictionary() class OpenFilesLimiter(object): """ Working with FileAccess, keep the number of opened HDF5 files under the given limit by closing files accessed longest time ago. """ def __init__(self, maxfiles=128): self._maxfiles = maxfiles # We don't use the values, but OrderedDict is a handy as a queue # with efficient removal of entries by key. self._cache = OrderedDict() @property def maxfiles(self): return self._maxfiles @maxfiles.setter def maxfiles(self, maxfiles): """Set the new file limit and closes files over the limit""" self._maxfiles = maxfiles self.close_old_files() def _check_files(self): # Discard entries from self._cache if their FileAccess no longer exists self._cache = OrderedDict.fromkeys( path for path in self._cache if path in file_access_registry ) def n_open_files(self): self._check_files() return len(self._cache) def close_old_files(self): if len(self._cache) <= self.maxfiles: return # Now check how many paths still have an existing FileAccess object n = self.n_open_files() while n > self.maxfiles: path, _ = self._cache.popitem(last=False) file_access = file_access_registry.get(path, None) if file_access is not None: file_access.close() n -= 1 def touch(self, filename): """ Add/move the touched file to the end of the `cache`. If adding a new file takes it over the limit of open files, another file will be closed. 
For use of the file cache, FileAccess should use `touch(filename)` every time it provides the underlying instance of `h5py.File` for reading. """ if filename in self._cache: self._cache.move_to_end(filename) else: self._cache[filename] = None self.close_old_files() def closed(self, filename): """Discard a closed file from the cache""" self._cache.pop(filename, None) def init_open_files_limiter(): # Raise the limit for open files (1024 -> 4096 on Maxwell) nofile = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (nofile[1], nofile[1])) maxfiles = nofile[1] // 2 return OpenFilesLimiter(maxfiles) open_files_limiter = init_open_files_limiter() class FileAccess: """Access an EuXFEL HDF5 file. This does not necessarily keep the real file open, but opens it on demand. It assumes that the file is not changing on disk while this object exists. Parameters ---------- filename: str A path to an HDF5 file """ _file = None _format_version = None metadata_fstat = None def __new__(cls, filename, _cache_info=None): # Create only one FileAccess for each path, and store it in a registry filename = osp.abspath(filename) inst = file_access_registry.get(filename, None) if inst is None: inst = file_access_registry[filename] = super().__new__(cls) return inst def __init__(self, filename, _cache_info=None): self.filename = osp.abspath(filename) if _cache_info: self.train_ids = _cache_info['train_ids'] self.control_sources = _cache_info['control_sources'] self.instrument_sources = _cache_info['instrument_sources'] self.validity_flag = _cache_info.get('flag', None) else: tid_data = self.file['INDEX/trainId'][:] self.train_ids = tid_data[tid_data != 0] self.control_sources, self.instrument_sources = self._read_data_sources() self.validity_flag = None if self.validity_flag is None: if self.format_version == '0.5': self.validity_flag = self._guess_valid_trains() else: self.validity_flag = self.file['INDEX/flag'][:len(self.train_ids)].astype(bool) if self.format_version == '1.1': # File format version 1.1 changed the semantics of # INDEX/flag from a boolean flag to an index, with # the time server device being hardcoded to occur # at index 0. Inverting the flag after the boolean # cast above restores compatibility with format # version 1.0, with any "invalid" train having an # index >= 1, thus being casted to True and inverted # to False. Format 1.2 restored the 1.0 semantics. np.logical_not(self.validity_flag, out=self.validity_flag) warn( 'Train validation is not fully supported for data ' 'format version 1.1. If you have issues accessing ' 'these files, please contact da-support@xfel.eu.', stacklevel=2 ) if self._file is not None: # Store the stat of the file as it was when we read the metadata. # This is used by the run files map. self.metadata_fstat = os.stat(self.file.id.get_vfd_handle()) # {(file, source, group): (firsts, counts)} self._index_cache = {} # {source: set(keys)} self._keys_cache = {} self._run_keys_cache = {} # {source: set(keys)} - including incomplete sets self._known_keys = defaultdict(set) @property def file(self): open_files_limiter.touch(self.filename) if self._file is None: self._file = h5py.File(self.filename, 'r') return self._file @property def valid_train_ids(self): return self.train_ids[self.validity_flag] def has_train_ids(self, tids: np.ndarray, inc_suspect=False): f_tids = self.train_ids if inc_suspect else self.valid_train_ids return np.intersect1d(tids, f_tids).size > 0 def close(self): """Close* the HDF5 file this refers to. 
The file may not actually be closed if there are still references to objects from it, e.g. while iterating over trains. This is what HDF5 calls 'weak' closing. """ if self._file: self._file = None open_files_limiter.closed(self.filename) @property def format_version(self): if self._format_version is None: version_ds = self.file.get('METADATA/dataFormatVersion') if version_ds is not None: self._format_version = version_ds[0].decode('ascii') else: # The first version of the file format had no version number. # Numbering started at 1.0, so we call the first version 0.5. self._format_version = '0.5' return self._format_version def _read_data_sources(self): control_sources, instrument_sources = set(), set() # The list of data sources moved in file format 1.0 if self.format_version == '0.5': data_sources_path = 'METADATA/dataSourceId' else: data_sources_path = 'METADATA/dataSources/dataSourceId' for source in self.file[data_sources_path][:]: if not source: continue source = source.decode() category, _, h5_source = source.partition('/') if category == 'INSTRUMENT': device, _, chan_grp = h5_source.partition(':') chan, _, group = chan_grp.partition('/') source = device + ':' + chan instrument_sources.add(source) # TODO: Do something with groups? elif category == 'CONTROL': control_sources.add(h5_source) elif category == 'Karabo_TimerServer': # Ignore virtual data source used only in file format # version 1.1 / pclayer-1.10.3-2.10.5. pass else: raise ValueError("Unknown data category %r" % category) return frozenset(control_sources), frozenset(instrument_sources) def _guess_valid_trains(self): # File format version 1.0 includes a flag which is 0 if a train ID # didn't come from the time server. We use this to skip bad trains, # especially for AGIPD. # Older files don't have this flag, so this tries to estimate validity. # The goal is to have a monotonic sequence within the file with the # fewest trains skipped. train_ids = self.train_ids flag = np.ones_like(train_ids, dtype=bool) for ix in np.nonzero(train_ids[1:] <= train_ids[:-1])[0]: # train_ids[ix] >= train_ids[ix + 1] invalid_before = train_ids[:ix+1] >= train_ids[ix+1] invalid_after = train_ids[ix+1:] <= train_ids[ix] # Which side of the downward jump in train IDs would need fewer # train IDs invalidated? if np.count_nonzero(invalid_before) < np.count_nonzero(invalid_after): flag[:ix+1] &= ~invalid_before else: flag[ix+1:] &= ~invalid_after return flag def __hash__(self): return hash(self.filename) def __eq__(self, other): return isinstance(other, FileAccess) and (other.filename == self.filename) def __repr__(self): return "{}({})".format(type(self).__name__, repr(self.filename)) def __getstate__(self): """ Allows pickling `FileAccess` instance. """ state = self.__dict__.copy() state['_file'] = None return state def __getnewargs__(self): """Ensure that __new__ gets the filename when unpickling""" return (self.filename,) @property def all_sources(self): return self.control_sources | self.instrument_sources def get_index(self, source, group): """Get first index & count for a source and for a specific train ID. Indices are cached; this appears to provide some performance benefit. """ try: return self._index_cache[(source, group)] except KeyError: ix = self._read_index(source, group) self._index_cache[(source, group)] = ix return ix def _read_index(self, source, group): """Get first index & count for a source. This is 'real' reading when the requested index is not in the cache. 
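        As a rough sketch of how the returned (firsts, counts) arrays are used
        elsewhere in the package (``fa`` is a FileAccess instance; the group
        and key names are hypothetical), train ``i`` of this file owns a
        contiguous block of rows in the matching data dataset::

            firsts, counts = fa.get_index(source, 'data')
            first, count = firsts[i], counts[i]
            rows = fa.file['/INSTRUMENT/' + source + '/data/adc'][first:first + count]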
""" ntrains = len(self.train_ids) ix_group = self.file['/INDEX/{}/{}'.format(source, group)] firsts = ix_group['first'][:ntrains] if 'count' in ix_group: counts = ix_group['count'][:ntrains] else: status = ix_group['status'][:ntrains] counts = np.uint64((ix_group['last'][:ntrains] - firsts + 1) * status) return firsts, counts def metadata(self) -> dict: """Get the contents of the METADATA group as a dict Not including the lists of data sources """ if self.format_version == '0.5': # Pretend this is actually there, like format version 1.0 return {'dataFormatVersion': '0.5'} r = {} for k, ds in self.file['METADATA'].items(): if not isinstance(ds, h5py.Dataset): continue v = ds[0] if isinstance(v, bytes): v = v.decode('utf-8', 'surrogateescape') r[k] = v return r def get_keys(self, source): """Get keys for a given source name Keys are found by walking the HDF5 file, and cached for reuse. """ try: return self._keys_cache[source] except KeyError: pass if source in self.control_sources: group = '/CONTROL/' + source elif source in self.instrument_sources: group = '/INSTRUMENT/' + source else: raise SourceNameError(source) res = set() def add_key(key, value): if isinstance(value, h5py.Dataset): res.add(key.replace('/', '.')) self.file[group].visititems(add_key) self._keys_cache[source] = res return res def get_run_keys(self, source): """Get the keys in the RUN section for a given control source name Keys are found by walking the HDF5 file, and cached for reuse. """ try: return self._run_keys_cache[source] except KeyError: pass if source not in self.control_sources: raise SourceNameError(source) res = set() def add_key(key, value): if isinstance(value, h5py.Dataset): res.add(key.replace('/', '.')) self.file['/RUN/' + source].visititems(add_key) self._keys_cache[source] = res return res def has_source_key(self, source, key): """Check if the given source and key exist in this file This doesn't scan for all the keys in the source, as .get_keys() does. """ try: return key in self._keys_cache[source] except KeyError: pass if key in self._known_keys[source]: return True if source in self.control_sources: path = '/CONTROL/{}/{}'.format(source, key.replace('.', '/')) elif source in self.instrument_sources: path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/')) else: raise SourceNameError(source) # self.file.get(path, getclass=True) works, but is weirdly slow. # Checking like this is much faster. if (path in self.file) and isinstance( h5py.h5o.open(self.file.id, path.encode()), h5py.h5d.DatasetID ): self._known_keys[source].add(key) return True return False def dset_proxy(self, ds_path: str): return DatasetProxy(self, ds_path) class DatasetProxy: """A picklable reference to an HDF5 dataset, suitable for dask.array Dask tries to do this automatically for h5py Dataset objects, but with some limitations: - It only works with Dask distributed, not Dask's local schedulers. - Dask storing references to h5py Datasets keeps the files open, breaking our attempts to manage the number of open files. """ def __init__(self, file_acc: FileAccess, ds_path: str): # We could just store the file name and use h5py on demand, but storing # our FileAccess object lets it use our cache of open files. 
self.file_acc = file_acc self.ds_path = ds_path ds = file_acc.file[ds_path] # dask.array expects these three array-like attributes: self.shape = ds.shape self.ndim = ds.ndim self.dtype = ds.dtype def __getitem__(self, item): return self.file_acc.file[self.ds_path][item] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627645754.0 EXtra-data-1.7.0/extra_data/h5index.py0000664000175000017500000000141600000000000020346 0ustar00takluyvertakluyverimport csv import h5py import sys import warnings warnings.warn( "extra_data.h5index is deprecated and likely to be removed. " "If you are using it, please contact da-support@xfel.eu.", stacklevel=2, ) def hdf5_datasets(grp): """Print CSV data of all datasets in an HDF5 file. path, shape, dtype """ all_datasets = [] def visitor(path, item): if isinstance(item, h5py.Dataset): all_datasets.append([path, item.shape, item.dtype.str]) grp.visititems(visitor) writer = csv.writer(sys.stdout) writer.writerow(['path', 'shape', 'dtype']) for row in sorted(all_datasets): writer.writerow(row) def main(): file = h5py.File(sys.argv[1]) hdf5_datasets(file) if __name__ == '__main__': main() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627647111.0 EXtra-data-1.7.0/extra_data/keydata.py0000664000175000017500000003303700000000000020430 0ustar00takluyvertakluyverfrom typing import List, Optional, Tuple import numpy as np from .exceptions import TrainIDError from .file_access import FileAccess from .read_machinery import ( contiguous_regions, DataChunk, select_train_ids, split_trains, ) class KeyData: """Data for one key in one source Don't create this directly; get it from ``run[source, key]``. """ def __init__( self, source, key, *, train_ids, files, section, dtype, eshape, inc_suspect_trains=True, ): self.source = source self.key = key self.train_ids = train_ids self.files: List[FileAccess] = files self.section = section self.dtype = dtype self.entry_shape = eshape self.ndim = len(eshape) + 1 self.inc_suspect_trains = inc_suspect_trains def _find_chunks(self): """Find contiguous chunks of data for this key, in any order.""" for file in self.files: firsts, counts = file.get_index(self.source, self._key_group) # Of trains in this file, which are in selection include = np.isin(file.train_ids, self.train_ids) if not self.inc_suspect_trains: include &= file.validity_flag # Assemble contiguous chunks of data from this file for _from, _to in contiguous_regions(include): yield DataChunk( file, self.hdf5_data_path, first=firsts[_from], train_ids=file.train_ids[_from:_to], counts=counts[_from:_to], ) _cached_chunks = None @property def _data_chunks(self) -> List[DataChunk]: """An ordered list of chunks containing data""" if self._cached_chunks is None: self._cached_chunks = sorted( [c for c in self._find_chunks() if c.total_count], key=lambda c: c.train_ids[0] ) return self._cached_chunks def __repr__(self): return f" 0] return self._only_tids(list(tids)) def split_trains(self, parts=None, trains_per_part=None): """Split this data into chunks with a fraction of the trains each. Either *parts* or *trains_per_part* must be specified. This returns an iterator yielding new :class:`KeyData` objects. The parts will have similar sizes, e.g. splitting 11 trains with ``trains_per_part=8`` will produce 5 & 6 trains, not 8 & 3. Selected trains count even if they are missing data, so different keys from the same run can be split into matching chunks. Parameters ---------- parts: int How many parts to split the data into. 
If trains_per_part is also specified, this is a minimum, and it may make more parts. It may also make fewer if there are fewer trains in the data. trains_per_part: int A maximum number of trains in each part. Parts will often have fewer trains than this. """ for s in split_trains(len(self.train_ids), parts, trains_per_part): yield self.select_trains(s) def data_counts(self, labelled=True): """Get a count of data entries in each train. If *labelled* is True, returns a pandas series with an index of train IDs. Otherwise, returns a NumPy array of counts to match ``.train_ids``. """ train_ids = np.concatenate([c.train_ids for c in self._data_chunks]) counts = np.concatenate([c.counts for c in self._data_chunks]) if labelled: import pandas as pd return pd.Series(counts, index=train_ids) else: # self.train_ids is always sorted. The train IDs from chunks # should be in order, but sometimes trains are written out of order. # Reorder the counts to match self.train_ids. assert len(train_ids) == len(self.train_ids) assert np.isin(train_ids, self.train_ids).all() idxs = np.argsort(train_ids) return counts[idxs] # Getting data as different kinds of array: ------------------------------- def ndarray(self, roi=()): """Load this data as a numpy array *roi* may be a ``numpy.s_[]`` expression to load e.g. only part of each image from a camera. """ if not isinstance(roi, tuple): roi = (roi,) # Find the shape of the array with the ROI applied roi_dummy = np.zeros((0,) + self.entry_shape) # extra 0 dim: use less memory roi_shape = roi_dummy[np.index_exp[:] + roi].shape[1:] out = np.empty(self.shape[:1] + roi_shape, dtype=self.dtype) # Read the data from each chunk into the result array dest_cursor = 0 for chunk in self._data_chunks: dest_chunk_end = dest_cursor + chunk.total_count slices = (chunk.slice,) + roi chunk.dataset.read_direct( out[dest_cursor:dest_chunk_end], source_sel=slices ) dest_cursor = dest_chunk_end return out def _trainid_index(self): """A 1D array of train IDs, corresponding to self.shape[0]""" chunks_trainids = [ np.repeat(chunk.train_ids, chunk.counts.astype(np.intp)) for chunk in self._data_chunks ] return np.concatenate(chunks_trainids) def xarray(self, extra_dims=None, roi=(), name=None): """Load this data as a labelled xarray.DataArray. The first dimension is labelled with train IDs. Other dimensions may be named by passing a list of names to *extra_dims*. Parameters ---------- extra_dims: list of str Name extra dimensions in the array. The first dimension is automatically called 'train'. The default for extra dimensions is dim_0, dim_1, ... roi: numpy.s_[], slice, tuple of slices, or by_index The region of interest. This expression selects data in all dimensions apart from the first (trains) dimension. If the data holds a 1D array for each entry, roi=np.s_[:8] would get the first 8 values from every train. If the data is 2D or more at each entry, selection looks like roi=np.s_[:8, 5:10] . name: str Name the array itself. The default is the source and key joined by a dot. """ import xarray ndarr = self.ndarray(roi=roi) # Dimension labels if extra_dims is None: extra_dims = ['dim_%d' % i for i in range(ndarr.ndim - 1)] dims = ['trainId'] + extra_dims # Train ID index coords = {} if self.shape[0]: coords = {'trainId': self._trainid_index()} if name is None: name = f'{self.source}.{self.key}' if name.endswith('.value') and self.section == 'CONTROL': name = name[:-6] return xarray.DataArray(ndarr, dims=dims, coords=coords, name=name) def series(self): """Load this data as a pandas Series. 
Only for 1D data. """ import pandas as pd if self.ndim > 1: raise TypeError("pandas Series are only available for 1D data") name = self.source + '/' + self.key if name.endswith('.value') and self.section == 'CONTROL': name = name[:-6] index = pd.Index(self._trainid_index(), name='trainId') data = self.ndarray() return pd.Series(data, name=name, index=index) def dask_array(self, labelled=False): """Make a Dask array for this data. Dask is a system for lazy parallel computation. This method doesn't actually load the data, but gives you an array-like object which you can operate on. Dask loads the data and calculates results when you ask it to, e.g. by calling a ``.compute()`` method. See the Dask documentation for more details. If your computation depends on reading lots of data, consider creating a dask.distributed.Client before calling this. If you don't do this, Dask uses threads by default, which is not efficient for reading HDF5 files. Parameters ---------- labelled: bool If True, label the train IDs for the data, returning an xarray.DataArray object wrapping a Dask array. """ import dask.array as da chunks_darrs = [] for chunk in self._data_chunks: chunk_dim0 = chunk.total_count chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:] itemsize = chunk.dataset.dtype.itemsize # Find chunk size of maximum 2 GB. This is largely arbitrary: # we want chunks small enough that each worker can have at least # a couple in memory (Maxwell nodes have 256-768 GB in late 2019). # But bigger chunks means less overhead. # Empirically, making chunks 4 times bigger/smaller didn't seem to # affect speed dramatically - but this could depend on many factors. # TODO: optional user control of chunking limit = 2 * 1024 ** 3 while np.product(chunk_shape) * itemsize > limit and chunk_dim0 > 1: chunk_dim0 //= 2 chunk_shape = (chunk_dim0,) + chunk.dataset.shape[1:] chunks_darrs.append( da.from_array( chunk.file.dset_proxy(chunk.dataset_path), chunks=chunk_shape )[chunk.slice] ) dask_arr = da.concatenate(chunks_darrs, axis=0) if labelled: # Dimension labels dims = ['trainId'] + ['dim_%d' % i for i in range(dask_arr.ndim - 1)] # Train ID index coords = {'trainId': self._trainid_index()} import xarray return xarray.DataArray(dask_arr, dims=dims, coords=coords) else: return dask_arr # Getting data by train: -------------------------------------------------- def _find_tid(self, tid) -> Tuple[Optional[FileAccess], int]: for fa in self.files: matches = (fa.train_ids == tid).nonzero()[0] if self.inc_suspect_trains and matches.size > 0: return fa, matches[0] for ix in matches: if fa.validity_flag[ix]: return fa, ix return None, 0 def train_from_id(self, tid): """Get data for the given train ID as a numpy array. Returns (train ID, array) """ if tid not in self.train_ids: raise TrainIDError(tid) fa, ix = self._find_tid(tid) if fa is None: return np.empty((0,) + self.entry_shape, dtype=self.dtype) firsts, counts = fa.get_index(self.source, self._key_group) first, count = firsts[ix], counts[ix] if count == 1: return tid, fa.file[self.hdf5_data_path][first] else: return tid, fa.file[self.hdf5_data_path][first: first+count] def train_from_index(self, i): """Get data for a train by index (starting at 0) as a numpy array. Returns (train ID, array) """ return self.train_from_id(self.train_ids[i]) def trains(self): """Iterate through trains containing data for this key Yields pairs of (train ID, array). Skips trains where data is missing. 
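        For example (using a hypothetical detector module source)::

            kd = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data']
            for tid, frames in kd.trains():
                print(tid, frames.shape)   # one array per train with data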
""" for chunk in self._data_chunks: start = chunk.first ds = chunk.dataset for tid, count in zip(chunk.train_ids, chunk.counts): if count > 1: yield tid, ds[start: start+count] elif count == 1: yield tid, ds[start] start += count ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/locality.py0000664000175000017500000000766400000000000020635 0ustar00takluyvertakluyver""" Tools to check a file locality at EuXFEL May be used to avoiding hangs on reading files from dCache if they are not available or stored only on tape """ import os import sys from collections import defaultdict import multiprocessing as mp UNAVAIL = 1 ONTAPE = 2 ONDISK = 4 ANY = 7 DC_LOC_RESP = { 'UNAVAILABLE': UNAVAIL, 'NEARLINE': ONTAPE, 'ONLINE': ONDISK, 'ONLINE_AND_NEARLINE': ONTAPE | ONDISK, 'NOT_ON_DCACHE': ONDISK, } LOCMSG = { 0: 'Unknown locality', 1: 'Unavailable', 2: 'Only on tape', 4: 'On disk', 6: 'On disk', } def get_locality(path): """ Returns locality of the file (path) """ basedir, filename = os.path.split(path) dotcmd = os.path.join(basedir, '.(get)({})(locality)'.format(filename)) try: with open(dotcmd, 'r') as f: return path, f.read().strip() except FileNotFoundError: return path, 'NOT_ON_DCACHE' def list_locality(files): """ Returns locality of the list of files """ with mp.Pool() as p: yield from p.imap_unordered(get_locality, files) def print_counts(fpart): """ Prints the counters of different localities """ n_ondisk = len(fpart['NOT_ON_DCACHE']) + len(fpart['ONLINE_AND_NEARLINE']) + len(fpart['ONLINE']) n_ontape = len(fpart['NEARLINE']) n_unavail = len(fpart['UNAVAILABLE']) print(f"{n_ondisk} on disk, {n_ontape} only on tape, {n_unavail} unavailable ", end='\r') def silent(fpart): """ Prints nothing """ pass def partition(files, cb_disp=silent): """ Partition files by locality """ fpart = defaultdict(list) for path, loc in list_locality(files): fpart[loc].append(path) cb_disp(fpart) return fpart def lc_match(files, accept=ONDISK): """ Returns files which has accepted locality """ filtered = [] for path, loc in list_locality(files): code = DC_LOC_RESP.get(loc, 0) if code & accept: filtered.append(path) else: print(f"Skipping file {path}", file=sys.stderr) print(f" ({LOCMSG[loc]})", file=sys.stderr) return filtered def lc_any(files): """ Returns all files, does nothing """ return files def lc_ondisk(files): """Returns files on disk, excluding any which would be read from tape""" return lc_match(files, ONDISK) def lc_avail(files): """Returns files which are available on disk or tape Excludes files which dCache reports are unavailable. 
""" return lc_match(files, ONTAPE | ONDISK) def check_dir(basedir): """ Check basedir and prints results """ if os.path.isdir(basedir): ls = ( os.path.join(basedir, f) for f in os.listdir(basedir) ) files = [ f for f in ls if os.path.isfile(f) ] elif os.path.isfile(basedir): files = [ basedir ] else: files = [] print(f"Checking {len(files)} files in {basedir}") fp = partition(files, print_counts) print("") retcode = 0 if fp['NEARLINE']: retcode |= 1 print("Only on tape:") for file in sorted(fp['NEARLINE']): print(f" {file}") if fp['UNAVAILABLE']: retcode |= 2 print("Unavailable:") for file in sorted(fp['UNAVAILABLE']): print(f" {file}") unknown_locality = set(fp) - set(DC_LOC_RESP) if unknown_locality: retcode |= 4 print("Unknown locality:", unknown_locality) return retcode from argparse import ArgumentParser def main(argv=None): if argv is None: argv = sys.argv[1:] ap = ArgumentParser(prog='extra-data-locality', description="Checks locality of files in the directory") ap.add_argument('path', help="run directory of HDF5 files.") args = ap.parse_args(argv) if not os.path.exists(args.path): print(f"Path '{args.path}' is not found") return 255 return check_dir(args.path) if __name__ == "__main__": sys.exit(main()) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1622824936.0 EXtra-data-1.7.0/extra_data/lsxfel.py0000664000175000017500000001403400000000000020277 0ustar00takluyvertakluyver"""Summarise XFEL data in files or folders """ import argparse from collections import defaultdict import os import os.path as osp import re import sys from .read_machinery import FilenameInfo from .reader import H5File, RunDirectory def describe_file(path, details_for_sources=()): """Describe a single HDF5 data file""" basename = os.path.basename(path) info = FilenameInfo(basename) print(basename, ":", info.description) h5file = H5File(path) h5file.info(details_for_sources) def summarise_file(path): basename = os.path.basename(path) info = FilenameInfo(basename) print(basename, ":", info.description) h5file = H5File(path) ntrains = len(h5file.train_ids) if info.is_detector: detector_source = next(iter(h5file.detector_sources)) dinfo = h5file.detector_info(detector_source) print(" {} trains, {} frames/train, {} total frames".format( len(h5file.train_ids), dinfo['frames_per_train'], dinfo['total_frames'] )) else: print(" {} trains, {} sources".format(ntrains, len(h5file.sources))) def describe_run(path, details_for_sources=()): basename = os.path.basename(path) print(basename, ": Run directory") print() run = RunDirectory(path) run.info(details_for_sources) def summarise_run(path, indent=''): basename = os.path.basename(path) # Accessing all the files in a run can be slow. To get the number of trains, # pick one set of segments (time slices of data from the same source). # This relies on each set of segments recording the same number of trains. 
segment_sequences = defaultdict(list) n_detector = n_other = 0 for f in sorted(os.listdir(path)): m = re.match(r'(.+)-S\d+\.h5', osp.basename(f)) if m: segment_sequences[m.group(1)].append(f) if FilenameInfo(f).is_detector: n_detector += 1 else: n_other += 1 if len(segment_sequences) < 1: raise ValueError("No data files recognised in %s" % path) # Take the shortest group of segments to make reading quicker first_group = sorted(segment_sequences.values(), key=len)[0] train_ids = set() for f in first_group: train_ids.update(H5File(osp.join(path, f)).train_ids) print("{}{} : Run of {:>4} trains, with {:>3} detector files and {:>3} others".format( indent, basename, len(train_ids), n_detector, n_other )) def main(argv=None): ap = argparse.ArgumentParser( prog='lsxfel', description="Summarise XFEL data in files or folders" ) ap.add_argument('paths', nargs='*', help="Files/folders to look at") ap.add_argument('--detail', action='append', default=[], help="Show details on keys & data for specified sources. " "This can slow down lsxfel considerably. " "Wildcard patterns like '*/XGM/*' are allowed, though you may " "need single quotes to prevent the shell processing them. " "Can be used more than once to include several patterns. " "Only used when inspecting a single run or file." ) args = ap.parse_args(argv) paths = args.paths or [os.path.abspath(os.getcwd())] if len(paths) == 1: path = paths[0] basename = os.path.basename(os.path.abspath(path.rstrip('/'))) if os.path.isdir(path): contents = sorted(os.listdir(path)) if any(f.endswith('.h5') for f in contents): # Run directory describe_run(path, args.detail) elif any(re.match(r'r\d+', f) for f in contents): # Proposal directory, containing runs print(basename, ": Proposal data directory") print() for f in contents: child_path = os.path.join(path, f) if re.match(r'r\d+', f) and os.path.isdir(child_path): summarise_run(child_path, indent=' ') elif osp.isdir(osp.join(path, 'raw')): print(basename, ": Proposal directory") print() print('{}/raw/'.format(basename)) for f in sorted(os.listdir(osp.join(path, 'raw'))): child_path = os.path.join(path, 'raw', f) if re.match(r'r\d+', f) and os.path.isdir(child_path): summarise_run(child_path, indent=' ') else: print(basename, ": Unrecognised directory") elif os.path.isfile(path): if path.endswith('.h5'): describe_file(path, args.detail) else: print(basename, ": Unrecognised file") return 2 else: print(path, ': File/folder not found') return 2 else: exit_code = 0 for path in paths: basename = os.path.basename(path) if os.path.isdir(path): contents = os.listdir(path) if any(f.endswith('.h5') for f in contents): # Run directory summarise_run(path) elif any(re.match(r'r\d+', f) for f in contents): # Proposal directory, containing runs print(basename, ": Proposal directory") print() for f in contents: child_path = os.path.join(path, f) if re.match(r'r\d+', f) and os.path.isdir(child_path): summarise_run(child_path, indent=' ') else: print(basename, ": Unrecognised directory") exit_code = 2 elif os.path.isfile(path): if path.endswith('.h5'): summarise_file(path) else: print(basename, ": Unrecognised file") exit_code = 2 else: print(path, ': File/folder not found') exit_code = 2 return exit_code if __name__ == '__main__': sys.exit(main()) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/read_machinery.py0000664000175000017500000001746100000000000021763 0ustar00takluyvertakluyver"""Machinery for reading Karabo HDF5 files The public API is in 
extra_data.reader; this is internal code. """ from collections import defaultdict from glob import iglob import logging import math import numpy as np import os.path as osp import re import time from .exceptions import SourceNameError log = logging.getLogger(__name__) DETECTOR_NAMES = {'AGIPD', 'DSSC', 'LPD'} DETECTOR_SOURCE_RE = re.compile(r'(.+)/DET/(\d+)CH') DATA_ROOT_DIR = '/gpfs/exfel/exp' class _SliceConstructor(type): """Allows instantiation like subclass[1:5] """ def __getitem__(self, item): return self(item) class _SliceConstructable(metaclass=_SliceConstructor): def __init__(self, value): self.value = value def __repr__(self): indices = self.value if not isinstance(indices, tuple): indices = (indices,) return "{}[{}]".format( type(self).__name__, ', '.join(self._indexing_repr(v) for v in indices) ) @staticmethod def _indexing_repr(value): """Represent values as used in canonical slicing syntax""" if value is Ellipsis: return '...' elif isinstance(value, slice): start = value.start if (value.start is not None) else '' stop = value.stop if (value.stop is not None) else '' step = ':{}'.format(value.step) if (value.step is not None) else '' return '{}:{}{}'.format(start, stop, step) return repr(value) class by_id(_SliceConstructable): pass class by_index(_SliceConstructable): pass def _tid_to_slice_ix(tid, train_ids, stop=False): """Convert a train ID to an integer index for slicing the dataset Throws ValueError if the slice won't overlap the trains in the data. The *stop* parameter tells it which end of the slice it is making. """ if tid is None: return None try: return train_ids.index(tid) except ValueError: pass if tid < train_ids[0]: if stop: raise ValueError("Train ID {} is before this run (starts at {})" .format(tid, train_ids[0])) else: return None elif tid > train_ids[-1]: if stop: return None else: raise ValueError("Train ID {} is after this run (ends at {})" .format(tid, train_ids[-1])) else: # This train ID is within the run, but doesn't have an entry. # Find the first ID in the run greater than the one given. 
return (train_ids > tid).nonzero()[0][0] def select_train_ids(train_ids, sel): if isinstance(sel, by_index): sel = sel.value elif isinstance(sel, int): sel = slice(sel, sel+1, None) if isinstance(sel, by_id) and isinstance(sel.value, slice): # Slice by train IDs start_ix = _tid_to_slice_ix(sel.value.start, train_ids, stop=False) stop_ix = _tid_to_slice_ix(sel.value.stop, train_ids, stop=True) return train_ids[start_ix: stop_ix: sel.value.step] elif isinstance(sel, by_id) and isinstance(sel.value, (list, np.ndarray)): # Select a list of trains by train ID new_train_ids = sorted(set(train_ids).intersection(sel.value)) if not new_train_ids: raise ValueError( "Given train IDs not found among {} trains in " "collection".format(len(train_ids)) ) return new_train_ids elif isinstance(sel, slice): # Slice by indexes in this collection return train_ids[sel] elif isinstance(sel, (list, np.ndarray)): # Select a list of trains by index in this collection return sorted(np.asarray(train_ids)[sel]) else: raise TypeError(type(sel)) def split_trains(n_trains, parts=None, trains_per_part=None) -> [slice]: if trains_per_part is not None: assert trains_per_part >= 1 n_parts = math.ceil(n_trains / trains_per_part) if parts is not None: n_parts = max(n_parts, min(parts, n_trains)) elif parts is not None: assert parts >= 1 n_parts = min(parts, n_trains) else: raise ValueError("Either parts or trains_per_part must be specified") return [ slice(i * n_trains // n_parts, (i + 1) * n_trains // n_parts) for i in range(n_parts) ] class DataChunk: """Reference to a contiguous chunk of data for one or more trains.""" def __init__(self, file, dataset_path, first, train_ids, counts): self.file = file self.dataset_path = dataset_path self.first = first self.train_ids = train_ids self.counts = counts @property def slice(self): return slice(self.first, self.first + np.sum(self.counts)) @property def total_count(self): return int(np.sum(self.counts, dtype=np.uint64)) @property def dataset(self): return self.file.file[self.dataset_path] # contiguous_regions() by Joe Kington on Stackoverflow # https://stackoverflow.com/a/4495197/434217 # Used here under Stackoverflow's default CC-BY-SA 3.0 license. def contiguous_regions(condition): """Finds contiguous True regions of the boolean array "condition". Returns a 2D array where the first column is the start index of the region and the second column is the end index.""" # Find the indices of changes in "condition" d = np.diff(condition) idx, = d.nonzero() # We need to start things after the change in "condition". Therefore, # we'll shift the index by 1 to the right. idx += 1 if condition[0]: # If the start of condition is True prepend a 0 idx = np.r_[0, idx] if condition[-1]: # If the end of condition is True, append the length of the array idx = np.r_[idx, condition.size] # Edit # Reshape the result into two columns idx.shape = (-1,2) return idx def union_selections(selections): """Merge together different selections A selection is a dict of {source: set(keys)}, or {source: None} to include all keys for a given source. 
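    A minimal sketch of the merge semantics (source names are made up)::

        union_selections([
            {'SRC/A': {'x'}, 'SRC/B': None},
            {'SRC/A': {'y'}},
        ])
        # -> {'SRC/A': {'x', 'y'}, 'SRC/B': None}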
""" selection_multi = defaultdict(list) for seln in selections: for source, keys in seln.items(): selection_multi[source].append(keys) # Merge selected keys; None -> all keys selected return { source: None if (None in keygroups) else set().union(*keygroups) for (source, keygroups) in selection_multi.items() } class FilenameInfo: is_detector = False detector_name = None detector_moduleno = -1 _rawcorr_descr = {'RAW': 'Raw', 'CORR': 'Corrected'} def __init__(self, path): self.basename = osp.basename(path) nameparts = self.basename[:-3].split('-') assert len(nameparts) == 4, self.basename rawcorr, runno, datasrc, segment = nameparts m = re.match(r'([A-Z]+)(\d+)', datasrc) if m and m.group(1) == 'DA': self.description = "Aggregated data" elif m and m.group(1) in DETECTOR_NAMES: self.is_detector = True name, moduleno = m.groups() self.detector_name = name self.detector_moduleno = moduleno self.description = "{} detector data from {} module {}".format( self._rawcorr_descr.get(rawcorr, '?'), name, moduleno ) else: self.description = "Unknown data source ({})", datasrc def find_proposal(propno): """Find the proposal directory for a given proposal on Maxwell""" if '/' in propno: # Already passed a proposal directory return propno t0 = time.monotonic() for d in iglob(osp.join(DATA_ROOT_DIR, '*/*/{}'.format(propno))): dt = time.monotonic() - t0 log.info("Found proposal dir %r in %.2g s", d, dt) return d raise Exception("Couldn't find proposal dir for {!r}".format(propno)) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627898640.0 EXtra-data-1.7.0/extra_data/reader.py0000664000175000017500000015457100000000000020257 0ustar00takluyvertakluyver# coding: utf-8 """ Collection of classes and functions to help reading HDF5 file generated at The European XFEL. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ from collections import defaultdict from collections.abc import Iterable import datetime import fnmatch import h5py from itertools import groupby import logging from multiprocessing import Pool import numpy as np from operator import index import os import os.path as osp import psutil import re import signal import sys import tempfile import time from typing import Tuple from warnings import warn from .exceptions import ( SourceNameError, PropertyNameError, TrainIDError, MultiRunError, ) from .keydata import KeyData from .read_machinery import ( DETECTOR_SOURCE_RE, FilenameInfo, by_id, by_index, select_train_ids, split_trains, union_selections, find_proposal, ) from .run_files_map import RunFilesMap from . import locality from .file_access import FileAccess __all__ = [ 'H5File', 'RunDirectory', 'open_run', 'FileAccess', 'DataCollection', 'by_id', 'by_index', 'SourceNameError', 'PropertyNameError', ] log = logging.getLogger(__name__) RUN_DATA = 'RUN' INDEX_DATA = 'INDEX' METADATA = 'METADATA' class DataCollection: """An assemblage of data generated at European XFEL Data consists of *sources* which each have *keys*. It is further organised by *trains*, which are identified by train IDs. You normally get an instance of this class by calling :func:`H5File` for a single file or :func:`RunDirectory` for a directory. 
""" def __init__( self, files, selection=None, train_ids=None, ctx_closes=False, *, inc_suspect_trains=True, is_single_run=False, ): self.files = list(files) self.ctx_closes = ctx_closes self.inc_suspect_trains = inc_suspect_trains self.is_single_run = is_single_run # selection: {source: set(keys)} # None as value -> all keys for this source if selection is None: selection = {} for f in self.files: selection.update(dict.fromkeys(f.control_sources)) selection.update(dict.fromkeys(f.instrument_sources)) self.selection = selection self.control_sources = set() self.instrument_sources = set() self._source_index = defaultdict(list) for f in self.files: self.control_sources.update(f.control_sources.intersection(selection)) self.instrument_sources.update(f.instrument_sources.intersection(selection)) for source in (f.control_sources | f.instrument_sources): self._source_index[source].append(f) # Throw an error if we have conflicting data for the same source self._check_source_conflicts() self.control_sources = frozenset(self.control_sources) self.instrument_sources = frozenset(self.instrument_sources) if train_ids is None: if inc_suspect_trains: tid_sets = [f.train_ids for f in files] else: tid_sets = [f.valid_train_ids for f in files] train_ids = sorted(set().union(*tid_sets)) self.train_ids = train_ids @staticmethod def _open_file(path, cache_info=None): try: fa = FileAccess(path, _cache_info=cache_info) except Exception as e: return osp.basename(path), str(e) else: return osp.basename(path), fa @classmethod def from_paths( cls, paths, _files_map=None, *, inc_suspect_trains=True, is_single_run=False ): files = [] uncached = [] for path in paths: cache_info = _files_map and _files_map.get(path) if cache_info: filename, fa = cls._open_file(path, cache_info=cache_info) if isinstance(fa, FileAccess): files.append(fa) else: print(f"Skipping file {filename}", file=sys.stderr) print(f" (error was: {fa})", file=sys.stderr) else: uncached.append(path) if uncached: def initializer(): # prevent child processes from receiving KeyboardInterrupt signal.signal(signal.SIGINT, signal.SIG_IGN) # cpu_affinity give a list of cpu cores we can use, can be all or a # subset of the cores the machine has. nproc = min(len(psutil.Process().cpu_affinity()), len(uncached)) with Pool(processes=nproc, initializer=initializer) as pool: for fname, fa in pool.imap_unordered(cls._open_file, uncached): if isinstance(fa, FileAccess): files.append(fa) else: print(f"Skipping file {fname}", file=sys.stderr) print(f" (error was: {fa})", file=sys.stderr) if not files: raise Exception("All HDF5 files specified are unusable") return cls( files, ctx_closes=True, inc_suspect_trains=inc_suspect_trains, is_single_run=is_single_run, ) @classmethod def from_path(cls, path, *, inc_suspect_trains=True): files = [FileAccess(path)] return cls( files, ctx_closes=True, inc_suspect_trains=inc_suspect_trains, is_single_run=True ) def __enter__(self): if not self.ctx_closes: raise Exception( "Only DataCollection objects created by opening " "files directly can be used in a 'with' statement, " "not those created by selecting from or merging " "others." ) return self def __exit__(self, exc_type, exc_val, exc_tb): # Close the files if this collection was created by opening them. 
if self.ctx_closes: for file in self.files: file.close() @property def all_sources(self): return self.control_sources | self.instrument_sources @property def detector_sources(self): return set(filter(DETECTOR_SOURCE_RE.match, self.instrument_sources)) def _check_field(self, source, key): if source not in self.all_sources: raise SourceNameError(source) if not self._has_source_key(source, key): raise PropertyNameError(key, source) def _has_source_key(self, source, key): selected_keys = self.selection[source] if selected_keys is not None: return key in selected_keys for f in self._source_index[source]: return f.has_source_key(source, key) def keys_for_source(self, source): """Get a set of key names for the given source If you have used :meth:`select` to filter keys, only selected keys are returned. Only one file is used to find the keys. Within a run, all files should have the same keys for a given source, but if you use :meth:`union` to combine two runs where the source was configured differently, the result can be unpredictable. """ selected_keys = self.selection[source] if selected_keys is not None: return selected_keys # The same source may be in multiple files, but this assumes it has # the same keys in all files that it appears in. for f in self._source_index[source]: return f.get_keys(source) # Leave old name in case anything external was using it: _keys_for_source = keys_for_source def _get_key_data(self, source, key): self._check_field(source, key) section = 'INSTRUMENT' if source in self.instrument_sources else 'CONTROL' files = self._source_index[source] ds0 = files[0].file[f"{section}/{source}/{key.replace('.', '/')}"] return KeyData( source, key, train_ids=self.train_ids, files=files, section=section, dtype=ds0.dtype, eshape=ds0.shape[1:], inc_suspect_trains=self.inc_suspect_trains, ) def __getitem__(self, item): if isinstance(item, tuple) and len(item) == 2: return self._get_key_data(*item) raise TypeError("Expected data[source, key]") def get_entry_shape(self, source, key): """Get the shape of a single data entry for the given source & key""" return self._get_key_data(source, key).entry_shape def get_dtype(self, source, key): """Get the numpy data type for the given source & key""" return self._get_key_data(source, key).dtype def _check_data_missing(self, tid) -> bool: """Return True if a train does not have data for all sources""" for source in self.control_sources: file, _ = self._find_data(source, tid) if file is None: return True for source in self.instrument_sources: file, pos = self._find_data(source, tid) if file is None: return True groups = {k.partition('.')[0] for k in self.keys_for_source(source)} for group in groups: _, counts = file.get_index(source, group) if counts[pos] == 0: return True return False def trains(self, devices=None, train_range=None, *, require_all=False, flat_keys=False): """Iterate over all trains in the data and gather all sources. :: run = Run('/path/to/my/run/r0123') for train_id, data in run.select("*/DET/*", "image.data").trains(): mod0 = data["FXE_DET_LPD1M-1/DET/0CH0:xtdf"]["image.data"] Parameters ---------- devices: dict or list, optional Filter data by sources and keys. Refer to :meth:`select` for how to use this. train_range: by_id or by_index object, optional Iterate over only selected trains, by train ID or by index. Refer to :meth:`select_trains` for how to use this. require_all: bool False (default) returns any data available for the requested trains. 
True skips trains which don't have all the selected data; this only makes sense if you make a selection with *devices* or :meth:`select`. flat_keys: bool False (default) returns nested dictionaries in each iteration indexed by source and then key. True returns a flat dictionary indexed by (source, key) tuples. Yields ------ tid : int The train ID of the returned train data : dict The data for this train, keyed by device name """ dc = self if devices is not None: dc = dc.select(devices) if train_range is not None: dc = dc.select_trains(train_range) return iter(TrainIterator(dc, require_all=require_all, flat_keys=flat_keys)) def train_from_id(self, train_id, devices=None, *, flat_keys=False): """Get train data for specified train ID. Parameters ---------- train_id: int The train ID devices: dict or list, optional Filter data by sources and keys. Refer to :meth:`select` for how to use this. flat_keys: bool False (default) returns a nested dict indexed by source and then key. True returns a flat dictionary indexed by (source, key) tuples. Returns ------- tid : int The train ID of the returned train data : dict The data for this train, keyed by device name Raises ------ KeyError if `train_id` is not found in the run. """ if train_id not in self.train_ids: raise TrainIDError(train_id) if devices is not None: return self.select(devices).train_from_id(train_id) res = {} for source in self.control_sources: source_data = res[source] = { 'metadata': {'source': source, 'timestamp.tid': train_id} } file, pos = self._find_data(source, train_id) if file is None: continue for key in self.keys_for_source(source): path = '/CONTROL/{}/{}'.format(source, key.replace('.', '/')) source_data[key] = file.file[path][pos] for source in self.instrument_sources: source_data = res[source] = { 'metadata': {'source': source, 'timestamp.tid': train_id} } file, pos = self._find_data(source, train_id) if file is None: continue for key in self.keys_for_source(source): group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] if not count: continue path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/')) if count == 1: source_data[key] = file.file[path][first] else: source_data[key] = file.file[path][first : first + count] if flat_keys: # {src: {key: data}} -> {(src, key): data} res = {(src, key): v for src, source_data in res.items() for (key, v) in source_data.items()} return train_id, res def train_from_index(self, train_index, devices=None, *, flat_keys=False): """Get train data of the nth train in this data. Parameters ---------- train_index: int Index of the train in the file. devices: dict or list, optional Filter data by sources and keys. Refer to :meth:`select` for how to use this. flat_keys: bool False (default) returns a nested dict indexed by source and then key. True returns a flat dictionary indexed by (source, key) tuples. Returns ------- tid : int The train ID of the returned train data : dict The data for this train, keyed by device name """ train_id = self.train_ids[train_index] return self.train_from_id(int(train_id), devices=devices, flat_keys=flat_keys) def get_data_counts(self, source, key): """Get a count of data points in each train for the given data field. Returns a pandas series with an index of train IDs. Parameters ---------- source: str Source name, e.g. "SPB_DET_AGIPD1M-1/DET/7CH0:xtdf" key: str Key of parameter within that device, e.g. "image.data". 
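        For example, reusing the (example) names above::

            counts = run.get_data_counts('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf',
                                         'image.data')
            counts.sum()   # total number of entries recorded for this field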
""" return self._get_key_data(source, key).data_counts() def get_series(self, source, key): """Return a pandas Series for a 1D data field defined by source & key. See :meth:`.KeyData.series` for details. """ return self._get_key_data(source, key).series() def get_dataframe(self, fields=None, *, timestamps=False): """Return a pandas dataframe for given data fields. :: df = run.get_dataframe(fields=[ ("*_XGM/*", "*.i[xy]Pos"), ("*_XGM/*", "*.photonFlux") ]) This links together multiple 1-dimensional datasets as columns in a table. Parameters ---------- fields : dict or list, optional Select data sources and keys to include in the dataframe. Selections are defined by lists or dicts as in :meth:`select`. timestamps : bool If false (the default), exclude the timestamps associated with each control data field. """ import pandas as pd if fields is not None: return self.select(fields).get_dataframe(timestamps=timestamps) series = [] for source in self.all_sources: for key in self.keys_for_source(source): if (not timestamps) and key.endswith('.timestamp'): continue series.append(self.get_series(source, key)) return pd.concat(series, axis=1) def get_array(self, source, key, extra_dims=None, roi=(), name=None): """Return a labelled array for a data field defined by source and key. see :meth:`.KeyData.xarray` for details. """ if isinstance(roi, by_index): roi = roi.value return self._get_key_data(source, key).xarray( extra_dims=extra_dims, roi=roi, name=name) def get_dask_array(self, source, key, labelled=False): """Get a Dask array for a data field defined by source and key. see :meth:`.KeyData.dask_array` for details. """ return self._get_key_data(source, key).dask_array(labelled=labelled) def get_run_value(self, source, key): """Get a single value from the RUN section of data files. RUN records each property of control devices as a snapshot at the beginning of the run. This includes properties which are not saved continuously in CONTROL data. This method is intended for use with data from a single run. If you combine data from multiple runs, it will raise MultiRunError. Parameters ---------- source: str Control device name, e.g. "HED_OPT_PAM/CAM/SAMPLE_CAM_4". key: str Key of parameter within that device, e.g. "triggerMode". """ if not self.is_single_run: raise MultiRunError if source not in self.control_sources: raise SourceNameError(source) # Arbitrary file - should be the same across a run fa = self._source_index[source][0] ds = fa.file['RUN'][source].get(key.replace('.', '/')) if isinstance(ds, h5py.Group): # Allow for the .value suffix being omitted ds = ds.get('value') if not isinstance(ds, h5py.Dataset): raise PropertyNameError(key, source) val = ds[0] if isinstance(val, bytes): # bytes -> str return val.decode('utf-8', 'surrogateescape') return val def get_run_values(self, source) -> dict: """Get a dict of all RUN values for the given source This includes keys which are also in CONTROL. Parameters ---------- source: str Control device name, e.g. "HED_OPT_PAM/CAM/SAMPLE_CAM_4". 
""" if not self.is_single_run: raise MultiRunError if source not in self.control_sources: raise SourceNameError(source) # Arbitrary file - should be the same across a run fa = self._source_index[source][0] res = {} def visitor(path, obj): if isinstance(obj, h5py.Dataset): val = obj[0] if isinstance(val, bytes): val = val.decode('utf-8', 'surrogateescape') res[path.replace('/', '.')] = val fa.file['RUN'][source].visititems(visitor) return res def union(self, *others): """Join the data in this collection with one or more others. This can be used to join multiple sources for the same trains, or to extend the same sources with data for further trains. The order of the datasets doesn't matter. Returns a new :class:`DataCollection` object. """ files = set(self.files) train_ids = set(self.train_ids) for other in others: files.update(other.files) train_ids.update(other.train_ids) train_ids = sorted(train_ids) selection = union_selections([self.selection] + [o.selection for o in others]) return DataCollection( files, selection=selection, train_ids=train_ids, inc_suspect_trains=self.inc_suspect_trains, ) def _expand_selection(self, selection): res = defaultdict(set) if isinstance(selection, dict): # {source: {key1, key2}} # {source: {}} or {source: None} -> all keys for this source for source, in_keys in selection.items(): if source not in self.all_sources: raise SourceNameError(source) # Keys of the current DataCollection. cur_keys = self.selection[source] # Keys input as the new selection. if in_keys: # If a specific set of keys is selected, make sure # they are all valid. for key in in_keys: if not self._has_source_key(source, key): raise PropertyNameError(key, source) else: # Catches both an empty set and None. # While the public API describes an empty set to # refer to all keys, the internal API actually uses # None for this case. This method is supposed to # accept both cases in order to natively support # passing a DataCollection as the selector. To keep # the conditions below clearer, any non-True value # is converted to None. in_keys = None if cur_keys is None and in_keys is None: # Both the new and current keys select all. res[source] = None elif cur_keys is not None and in_keys is not None: # Both the new and current keys are specific, take # the intersection of both. This should never be # able to result in an empty set, but to prevent the # code further below from breaking, assert it. res[source] = cur_keys & in_keys assert res[source] elif cur_keys is None and in_keys is not None: # Current keys are unspecific but new ones are, just # use the new keys. res[source] = in_keys elif cur_keys is not None and in_keys is None: # The current keys are specific but new ones are # not, use the current keys. res[source] = cur_keys elif isinstance(selection, Iterable): # selection = [('src_glob', 'key_glob'), ...] 
res = union_selections( self._select_glob(src_glob, key_glob) for (src_glob, key_glob) in selection ) elif isinstance(selection, DataCollection): return self._expand_selection(selection.selection) elif isinstance(selection, KeyData): res[selection.source] = {selection.key} else: raise TypeError("Unknown selection type: {}".format(type(selection))) return dict(res) def _select_glob(self, source_glob, key_glob): source_re = re.compile(fnmatch.translate(source_glob)) key_re = re.compile(fnmatch.translate(key_glob)) if key_glob.endswith(('.value', '*')): ctrl_key_re = key_re else: # The translated pattern ends with "\Z" - insert before this p = key_re.pattern end_ix = p.rindex(r'\Z') ctrl_key_re = re.compile(p[:end_ix] + r'(\.value)?' + p[end_ix:]) matched = {} for source in self.all_sources: if not source_re.match(source): continue if key_glob == '*': # When the selection refers to all keys, make sure this # is restricted to the current selection of keys for # this source. if self.selection[source] is None: matched[source] = None else: matched[source] = self.selection[source] else: r = ctrl_key_re if source in self.control_sources else key_re keys = set(filter(r.match, self.keys_for_source(source))) if keys: matched[source] = keys if not matched: raise ValueError("No matches for pattern {}" .format((source_glob, key_glob))) return matched def select(self, seln_or_source_glob, key_glob='*', require_all=False): """Select a subset of sources and keys from this data. There are four possible ways to select data: 1. With two glob patterns (see below) for source and key names:: # Select data in the image group for any detector sources sel = run.select('*/DET/*', 'image.*') 2. With an iterable of (source, key) glob patterns:: # Select image.data and image.mask for any detector sources sel = run.select([('*/DET/*', 'image.data'), ('*/DET/*', 'image.mask')]) Data is included if it matches any of the pattern pairs. 3. With a dict of source names mapped to sets of key names (or empty sets to get all keys):: # Select image.data from one detector source, and all data from one XGM sel = run.select({'SPB_DET_AGIPD1M-1/DET/0CH0:xtdf': {'image.data'}, 'SA1_XTD2_XGM/XGM/DOOCS': set()}) Unlike the others, this option *doesn't* allow glob patterns. It's a more precise but less convenient option for code that knows exactly what sources and keys it needs. 4. With an existing DataCollection or KeyData object:: # Select the same data contained in another DataCollection prev_run.select(sel) The optional `require_all` argument restricts the trains to those for which all selected sources and keys have at least one data entry. By default, all trains remain selected. Returns a new :class:`DataCollection` object for the selected data. .. note:: 'Glob' patterns may be familiar from selecting files in a Unix shell. ``*`` matches anything, so ``*/DET/*`` selects sources with "/DET/" anywhere in the name. There are several kinds of wildcard: - ``*``: anything - ``?``: any single character - ``[xyz]``: one character, "x", "y" or "z" - ``[0-9]``: one digit character - ``[!xyz]``: one character, *not* x, y or z Anything else in the pattern must match exactly. It's case-sensitive, so "x" does not match "X". 
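        For example, a sketch combining glob patterns with ``require_all``
        (the source names are the illustrative ones used above)::

            # Keep only trains where the detector image data and the XGM
            # source both have at least one entry
            sel = run.select(
                [('*/DET/*', 'image.data'), ('SA1_XTD2_XGM/XGM/DOOCS', '*')],
                require_all=True,
            )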
""" if isinstance(seln_or_source_glob, str): seln_or_source_glob = [(seln_or_source_glob, key_glob)] selection = self._expand_selection(seln_or_source_glob) files = [f for f in self.files if f.all_sources.intersection(selection.keys())] if require_all: # Select only those trains for which all selected sources # and keys have data, i.e. have a count > 0 in their # respective INDEX section. train_ids = self.train_ids for source, keys in selection.items(): if source in self.instrument_sources: # For INSTRUMENT sources, the INDEX is saved by # key group, which is the first hash component. In # many cases this is 'data', but not always. if keys is None: # All keys are selected. keys = self.keys_for_source(source) groups = {key.partition('.')[0] for key in keys} else: # CONTROL data has no key group. groups = [''] for group in groups: # Empty list would be converted to np.float64 array. source_tids = np.empty(0, dtype=np.uint64) for f in self._source_index[source]: valid = True if self.inc_suspect_trains else f.validity_flag # Add the trains with data in each file. _, counts = f.get_index(source, group) source_tids = np.union1d( f.train_ids[valid & (counts > 0)], source_tids ) # Remove any trains previously selected, for which this # selected source and key group has no data. train_ids = np.intersect1d(train_ids, source_tids) # Filtering may have eliminated previously selected files. files = [f for f in files if f.has_train_ids(train_ids, self.inc_suspect_trains)] train_ids = list(train_ids) # Convert back to a list. else: train_ids = self.train_ids return DataCollection( files, selection=selection, train_ids=train_ids, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run ) def deselect(self, seln_or_source_glob, key_glob='*'): """Select everything except the specified sources and keys. This takes the same arguments as :meth:`select`, but the sources and keys you specify are dropped from the selection. Returns a new :class:`DataCollection` object for the remaining data. """ if isinstance(seln_or_source_glob, str): seln_or_source_glob = [(seln_or_source_glob, key_glob)] deselection = self._expand_selection(seln_or_source_glob) # Subtract deselection from self.selection selection = {} for source, keys in self.selection.items(): if source not in deselection: selection[source] = keys continue desel_keys = deselection[source] if desel_keys is None: continue # Drop the entire source if keys is None: keys = self.keys_for_source(source) selection[source] = keys - desel_keys if not selection[source]: # Drop the source if all keys were deselected del selection[source] files = [f for f in self.files if f.all_sources.intersection(selection.keys())] return DataCollection( files, selection=selection, train_ids=self.train_ids, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run, ) def select_trains(self, train_range): """Select a subset of trains from this data. Choose a slice of trains by train ID:: from extra_data import by_id sel = run.select_trains(by_id[142844490:142844495]) Or select a list of trains:: sel = run.select_trains(by_id[[142844490, 142844493, 142844494]]) Or select trains by index within this collection:: sel = run.select_trains(np.s_[:5]) Returns a new :class:`DataCollection` object for the selected trains. Raises ------ ValueError If given train IDs do not overlap with the trains in this data. 
""" new_train_ids = select_train_ids(self.train_ids, train_range) files = [f for f in self.files if f.has_train_ids(new_train_ids, self.inc_suspect_trains)] return DataCollection( files, selection=self.selection, train_ids=new_train_ids, inc_suspect_trains=self.inc_suspect_trains, is_single_run=self.is_single_run, ) def split_trains(self, parts=None, trains_per_part=None): """Split this data into chunks with a fraction of the trains each. Either *parts* or *trains_per_part* must be specified. This returns an iterator yielding new :class:`DataCollection` objects. The parts will have similar sizes, e.g. splitting 11 trains with ``trains_per_part=8`` will produce 5 & 6 trains, not 8 & 3. Parameters ---------- parts: int How many parts to split the data into. If trains_per_part is also specified, this is a minimum, and it may make more parts. It may also make fewer if there are fewer trains in the data. trains_per_part: int A maximum number of trains in each part. Parts will often have fewer trains than this. """ for s in split_trains(len(self.train_ids), parts, trains_per_part): yield self.select_trains(s) def _check_source_conflicts(self): """Check for data with the same source and train ID in different files. """ sources_with_conflicts = set() files_conflict_cache = {} def files_have_conflict(files): fset = frozenset({f.filename for f in files}) if fset not in files_conflict_cache: if self.inc_suspect_trains: tids = np.concatenate([f.train_ids for f in files]) else: tids = np.concatenate([f.valid_train_ids for f in files]) files_conflict_cache[fset] = len(np.unique(tids)) != len(tids) return files_conflict_cache[fset] for source, files in self._source_index.items(): if files_have_conflict(files): sources_with_conflicts.add(source) if sources_with_conflicts: raise ValueError("{} sources have conflicting data " "(same train ID in different files): {}".format( len(sources_with_conflicts), ", ".join(sources_with_conflicts) )) def _expand_trainids(self, counts, trainIds): n = min(len(counts), len(trainIds)) return np.repeat(trainIds[:n], counts.astype(np.intp)[:n]) def _find_data_chunks(self, source, key): """Find contiguous chunks of data for the given source & key Yields DataChunk objects. """ return self._get_key_data(source, key)._data_chunks def _find_data(self, source, train_id) -> Tuple[FileAccess, int]: for f in self._source_index[source]: ixs = (f.train_ids == train_id).nonzero()[0] if self.inc_suspect_trains and ixs.size > 0: return f, ixs[0] for ix in ixs: if f.validity_flag[ix]: return f, ix return None, None def info(self, details_for_sources=()): """Show information about the selected data. 
""" details_sources_re = [re.compile(fnmatch.translate(p)) for p in details_for_sources] # time info train_count = len(self.train_ids) if train_count == 0: first_train = last_train = '-' span_txt = '0.0' else: first_train = self.train_ids[0] last_train = self.train_ids[-1] seconds, deciseconds = divmod((last_train - first_train + 1), 10) span_txt = '{}.{}'.format(datetime.timedelta(seconds=seconds), int(deciseconds)) detector_modules = {} for source in self.detector_sources: name, modno = DETECTOR_SOURCE_RE.match(source).groups((1, 2)) detector_modules[(name, modno)] = source # A run should only have one detector, but if that changes, don't hide it detector_name = ','.join(sorted(set(k[0] for k in detector_modules))) # disp print('# of trains: ', train_count) print('Duration: ', span_txt) print('First train ID:', first_train) print('Last train ID: ', last_train) print() print("{} detector modules ({})".format( len(self.detector_sources), detector_name )) if len(detector_modules) > 0: # Show detail on the first module (the others should be similar) mod_key = sorted(detector_modules)[0] mod_source = detector_modules[mod_key] dinfo = self.detector_info(mod_source) module = ' '.join(mod_key) dims = ' x '.join(str(d) for d in dinfo['dims']) print(" e.g. module {} : {} pixels".format(module, dims)) print(" {}".format(mod_source)) print(" {} frames per train, up to {} frames total".format( dinfo['frames_per_train'], dinfo['total_frames'] )) print() def src_data_detail(s, keys, prefix=''): """Detail for how much data is present for an instrument group""" if not keys: return counts = self.get_data_counts(s, list(keys)[0]) ntrains_data = (counts > 0).sum() print( f'{prefix}data for {ntrains_data} trains ' f'({ntrains_data / train_count:.2%}), ' f'up to {counts.max()} entries per train' ) def keys_detail(s, keys, prefix=''): """Detail for a group of keys""" for k in keys: entry_shape = self.get_entry_shape(s, k) if entry_shape: entry_info = f", entry shape {entry_shape}" else: entry_info = "" dt = self.get_dtype(s, k) print(f"{prefix}{k}\t[{dt}{entry_info}]") non_detector_inst_srcs = self.instrument_sources - self.detector_sources print(len(non_detector_inst_srcs), 'instrument sources (excluding detectors):') for s in sorted(non_detector_inst_srcs): print(' -', s) if not any(p.match(s) for p in details_sources_re): continue # Detail for instrument sources: for group, keys in groupby(sorted(self.keys_for_source(s)), key=lambda k: k.split('.')[0]): print(f' - {group}:') keys = list(keys) src_data_detail(s, keys, prefix=' ') keys_detail(s, keys, prefix=' - ') print() print(len(self.control_sources), 'control sources:') for s in sorted(self.control_sources): print(' -', s) if any(p.match(s) for p in details_sources_re): # Detail for control sources: list keys ctrl_keys = self.keys_for_source(s) print(' - Control keys (1 entry per train):') keys_detail(s, sorted(ctrl_keys), prefix=' - ') run_keys = self._source_index[s][0].get_run_keys(s) run_only_keys = run_keys - ctrl_keys if run_only_keys: print(' - Additional run keys (1 entry per run):') for k in sorted(run_only_keys): ds = self._source_index[s][0].file[f"/RUN/{s}/{k.replace('.', '/')}"] entry_shape = ds.shape[1:] if entry_shape: entry_info = f", entry shape {entry_shape}" else: entry_info = "" dt = ds.dtype if h5py.check_string_dtype(dt): dt = 'string' print(f" - {k}\t[{dt}{entry_info}]") print() def detector_info(self, source): """Get statistics about the detector data. 
Returns a dictionary with keys: - 'dims' (pixel dimensions) - 'frames_per_train' (estimated from one file) - 'total_frames' (estimated assuming all trains have data) """ source_files = self._source_index[source] file0 = sorted(source_files, key=lambda fa: fa.filename)[0] _, counts = file0.get_index(source, 'image') counts = set(np.unique(counts)) counts.discard(0) if len(counts) > 1: warn("Varying number of frames per train: %s" % counts) if counts: fpt = int(counts.pop()) else: fpt = 0 dims = file0.file['/INSTRUMENT/{}/image/data'.format(source)].shape[-2:] return { 'dims': dims, # Some trains have 0 frames; max is the interesting value 'frames_per_train': fpt, 'total_frames': fpt * len(self.train_ids), } def train_info(self, train_id): """Show information about a specific train in the run. Parameters ---------- train_id: int The specific train ID you get details information. Raises ------ ValueError if `train_id` is not found in the run. """ if train_id not in self.train_ids: raise ValueError("train {} not found in run.".format(train_id)) files = [f for f in self.files if f.has_train_ids([train_id], self.inc_suspect_trains)] ctrl = set().union(*[f.control_sources for f in files]) inst = set().union(*[f.instrument_sources for f in files]) # disp print('Train [{}] information'.format(train_id)) print('Devices') print('\tInstruments') [print('\t-', d) for d in sorted(inst)] or print('\t-') print('\tControls') [print('\t-', d) for d in sorted(ctrl)] or print('\t-') def train_timestamps(self, labelled=False): """Get approximate timestamps for each train Timestamps are stored and returned in UTC (not local time). Older files (before format version 1.0) do not have timestamp data, and the returned data in those cases will have the special value NaT (Not a Time). If *labelled* is True, they are returned in a pandas series, labelled with train IDs. If False (default), they are returned in a NumPy array of the same length as data.train_ids. """ arr = np.zeros(len(self.train_ids), dtype=np.uint64) id_to_ix = {tid: i for (i, tid) in enumerate(self.train_ids)} missing_tids = np.array(self.train_ids) for fa in self.files: tids, file_ixs, _ = np.intersect1d( fa.train_ids, missing_tids, return_indices=True ) if not self.inc_suspect_trains: valid = fa.validity_flag[file_ixs] tids, file_ixs = tids[valid], file_ixs[valid] if tids.size == 0 or 'INDEX/timestamp' not in fa.file: continue file_tss = fa.file['INDEX/timestamp'][:] for tid, ts in zip(tids, file_tss[file_ixs]): arr[id_to_ix[tid]] = ts missing_tids = np.setdiff1d(missing_tids, tids) if missing_tids.size == 0: # We've got a timestamp for every train break arr = arr.astype('datetime64[ns]') epoch = np.uint64(0).astype('datetime64[ns]') arr[arr == epoch] = 'NaT' # Not a Time if labelled: import pandas as pd return pd.Series(arr, index=self.train_ids) return arr def run_metadata(self) -> dict: """Get a dictionary of metadata about the run From file format version 1.0, the files capture: creationDate, daqLibrary, dataFormatVersion, karaboFramework, proposalNumber, runNumber, sequenceNumber, updateDate. """ if not self.is_single_run: raise MultiRunError() return self.files[0].metadata() def write(self, filename): """Write the selected data to a new HDF5 file You can choose a subset of the data using methods like :meth:`select` and :meth:`select_trains`, then use this write it to a new, smaller file. The target filename will be overwritten if it already exists. 
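        For example, a sketch of saving a reduced copy of a run (the source
        pattern and output filename are illustrative)::

            sel = run.select('*/XGM/*').select_trains(np.s_[:100])
            sel.write('xgm_first_100_trains.h5')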
""" from .writer import FileWriter FileWriter(filename, self).write() def write_virtual(self, filename): """Write an HDF5 file with virtual datasets for the selected data. This doesn't copy the data, but each virtual dataset provides a view of data spanning multiple sequence files, which can be accessed as if it had been copied into one big file. This is *not* the same as `building virtual datasets to combine multi-module detector data `__. See :doc:`agipd_lpd_data` for that. Creating and reading virtual datasets requires HDF5 version 1.10. The target filename will be overwritten if it already exists. """ from .writer import VirtualFileWriter VirtualFileWriter(filename, self).write() def get_virtual_dataset(self, source, key, filename=None): """Create an HDF5 virtual dataset for a given source & key A dataset looks like a multidimensional array, but the data is loaded on-demand when you access it. So it's suitable as an interface to data which is too big to load entirely into memory. This returns an h5py.Dataset object. This exists in a real file as a 'virtual dataset', a collection of links pointing to the data in real datasets. If *filename* is passed, the file is written at that path, overwriting if it already exists. Otherwise, it uses a new temp file. To access the dataset from other worker processes, give them the name of the created file along with the path to the dataset inside it (accessible as ``ds.name``). They will need at least HDF5 1.10 to access the virtual dataset, and they must be on a system with access to the original data files, as the virtual dataset points to those. """ self._check_field(source, key) from .writer import VirtualFileWriter if filename is None: # Make a temp file to hold the virtual dataset. fd, filename = tempfile.mkstemp(suffix='-karabo-data-vds.h5') os.close(fd) vfw = VirtualFileWriter(filename, self) vfw.write_train_ids() ds_path = vfw.add_dataset(source, key) vfw.write_indexes() vfw.write_metadata() vfw.set_writer() vfw.file.close() # Close the file for writing and reopen read-only f = h5py.File(filename, 'r') return f[ds_path] class TrainIterator: """Iterate over trains in a collection of data Created by :meth:`DataCollection.trains`. 
""" def __init__(self, data, require_all=True, flat_keys=False): self.data = data self.require_all = require_all # {(source, key): (f, dataset)} self._datasets_cache = {} self._set_result = self._set_result_flat if flat_keys \ else self._set_result_nested @staticmethod def _set_result_nested(res, source, key, value): try: res[source][key] = value except KeyError: res[source] = {key: value} @staticmethod def _set_result_flat(res, source, key, value): res[(source, key)] = value def _find_data(self, source, key, tid): file, ds = self._datasets_cache.get((source, key), (None, None)) if ds: ixs = (file.train_ids == tid).nonzero()[0] if self.data.inc_suspect_trains and ixs.size > 0: return file, ixs[0], ds for ix in ixs: if file.validity_flag[ix]: return file, ix, ds data = self.data section = 'CONTROL' if source in data.control_sources else 'INSTRUMENT' path = '/{}/{}/{}'.format(section, source, key.replace('.', '/')) f, pos = data._find_data(source, tid) if f is not None: ds = f.file[path] self._datasets_cache[(source, key)] = (f, ds) return f, pos, ds return None, None, None def _assemble_data(self, tid): res = {} for source in self.data.control_sources: self._set_result(res, source, 'metadata', {'source': source, 'timestamp.tid': tid}) for key in self.data.keys_for_source(source): _, pos, ds = self._find_data(source, key, tid) if ds is None: continue self._set_result(res, source, key, ds[pos]) for source in self.data.instrument_sources: self._set_result(res, source, 'metadata', {'source': source, 'timestamp.tid': tid}) for key in self.data.keys_for_source(source): file, pos, ds = self._find_data(source, key, tid) if ds is None: continue group = key.partition('.')[0] firsts, counts = file.get_index(source, group) first, count = firsts[pos], counts[pos] if count == 1: self._set_result(res, source, key, ds[first]) elif count > 0: self._set_result(res, source, key, ds[first : first + count]) return res def __iter__(self): for tid in self.data.train_ids: tid = int(tid) # Convert numpy int to regular Python int if self.require_all and self.data._check_data_missing(tid): continue yield tid, self._assemble_data(tid) def H5File(path, *, inc_suspect_trains=True): """Open a single HDF5 file generated at European XFEL. :: file = H5File("RAW-R0017-DA01-S00000.h5") Returns a :class:`DataCollection` object. Parameters ---------- path: str Path to the HDF5 file inc_suspect_trains: bool If False, suspect train IDs within a file are skipped. In newer files, trains where INDEX/flag are 0 are suspect. For older files which don't have this flag, out-of-sequence train IDs are suspect. If True (default), it tries to include these trains. """ return DataCollection.from_path(path, inc_suspect_trains=inc_suspect_trains) def RunDirectory( path, include='*', file_filter=locality.lc_any, *, inc_suspect_trains=True ): """Open data files from a 'run' at European XFEL. :: run = RunDirectory("/gpfs/exfel/exp/XMPL/201750/p700000/raw/r0001") A 'run' is a directory containing a number of HDF5 files with data from the same time period. Returns a :class:`DataCollection` object. Parameters ---------- path: str Path to the run directory containing HDF5 files. include: str Wildcard string to filter data files. file_filter: callable Function to subset the list of filenames to open. Meant to be used with functions in the extra_data.locality module. inc_suspect_trains: bool If False, suspect train IDs within a file are skipped. In newer files, trains where INDEX/flag are 0 are suspect. 
For older files which don't have this flag, out-of-sequence train IDs are suspect. If True (default), it tries to include these trains. """ files = [f for f in os.listdir(path) if f.endswith('.h5')] files = [osp.join(path, f) for f in fnmatch.filter(files, include)] files = file_filter(files) if not files: raise Exception("No HDF5 files found in {} with glob pattern {}".format(path, include)) files_map = RunFilesMap(path) t0 = time.monotonic() d = DataCollection.from_paths( files, files_map, inc_suspect_trains=inc_suspect_trains, is_single_run=True, ) log.debug("Opened run with %d files in %.2g s", len(d.files), time.monotonic() - t0) files_map.save(d.files) return d # RunDirectory was previously RunHandler; we'll leave it accessible in case # any code was already using it. RunHandler = RunDirectory def open_run( proposal, run, data='raw', include='*', file_filter=locality.lc_any, *, inc_suspect_trains=True ): """Access EuXFEL data on the Maxwell cluster by proposal and run number. :: run = open_run(proposal=700000, run=1) Returns a :class:`DataCollection` object. Parameters ---------- proposal: str, int A proposal number, such as 2012, '2012', 'p002012', or a path such as '/gpfs/exfel/exp/SPB/201701/p002012'. run: str, int A run number such as 243, '243' or 'r0243'. data: str 'raw', 'proc' (processed) or 'all' (both 'raw' and 'proc') to access data from either or both of those folders. If 'all' is used, sources present in 'proc' overwrite those in 'raw'. The default is 'raw'. include: str Wildcard string to filter data files. file_filter: callable Function to subset the list of filenames to open. Meant to be used with functions in the extra_data.locality module. inc_suspect_trains: bool If False, suspect train IDs within a file are skipped. In newer files, trains where INDEX/flag are 0 are suspect. For older files which don't have this flag, out-of-sequence train IDs are suspect. If True (default), it tries to include these trains. """ if data == 'all': common_args = dict( proposal=proposal, run=run, include=include, file_filter=file_filter, inc_suspect_trains=inc_suspect_trains) # Create separate data collections for raw and proc. raw_dc = open_run(**common_args, data='raw') proc_dc = open_run(**common_args, data='proc') # Deselect to those raw sources not present in proc. raw_extra = raw_dc.deselect( [(src, '*') for src in raw_dc.all_sources & proc_dc.all_sources]) # Merge extra raw sources into proc sources and re-enable is_single_run. 
dc = proc_dc.union(raw_extra) dc.is_single_run = True return dc if isinstance(proposal, str): if ('/' not in proposal) and not proposal.startswith('p'): proposal = 'p' + proposal.rjust(6, '0') else: # Allow integers, including numpy integers proposal = 'p{:06d}'.format(index(proposal)) prop_dir = find_proposal(proposal) if isinstance(run, str): if run.startswith('r'): run = run[1:] else: run = index(run) # Allow integers, including numpy integers run = 'r' + str(run).zfill(4) return RunDirectory( osp.join(prop_dir, data, run), include=include, file_filter=file_filter, inc_suspect_trains=inc_suspect_trains, ) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/run_files_map.py0000664000175000017500000001672500000000000021636 0ustar00takluyvertakluyverimport json import logging import os import os.path as osp import numpy as np import re from tempfile import mkstemp import time SCRATCH_ROOT_DIR = "/gpfs/exfel/exp/" log = logging.getLogger(__name__) def atomic_dump(obj, path, **kwargs): """Write JSON to a file atomically This aims to avoid garbled files from multiple processes writing the same cache. It doesn't try to protect against e.g. sudden power failures, as forcing the OS to flush changes to disk may hurt performance. """ dirname, basename = osp.split(path) fd, tmp_filename = mkstemp(dir=dirname, prefix=basename) try: with open(fd, 'w') as f: json.dump(obj, f, **kwargs) except: os.unlink(tmp_filename) raise os.replace(tmp_filename, path) class RunFilesMap: """Cached data about HDF5 files in a run directory Stores the train IDs and source names in each file, along with some metadata to check that the cache is still valid. The cached information can be stored in: - (run dir)/karabo_data_map.json - (proposal dir)/scratch/.karabo_data_maps/raw_r0032.json """ cache_file = None def __init__(self, directory): self.directory = osp.abspath(directory) self.dir_stat = os.stat(self.directory) self.files_data = {} self.candidate_paths = self.map_paths_for_run(directory) self.load() def map_paths_for_run(self, directory): paths = [osp.join(directory, 'karabo_data_map.json')] # After resolving symlinks, data on Maxwell is stored in either # GPFS, e.g. /gpfs/exfel/d/proc/SCS/201901/p002212 or # dCache, e.g. /pnfs/xfel.eu/exfel/archive/XFEL/raw/SCS/201901/p002212 # On the online cluster the resolved path stay: # /gpfs/exfel/exp/inst/cycle/prop/(raw|proc)/run maxwell_match = re.match( # raw/proc instr cycle prop run r'.+/(raw|proc)/(\w+)/(\w+)/(p\d+)/(r\d+)/?$', os.path.realpath(directory) ) online_match = re.match( # instr cycle prop raw/proc run r'^.+/(\w+)/(\w+)/(p\d+)/(raw|proc)/(r\d+)/?$', os.path.realpath(directory) ) if maxwell_match: raw_proc, instr, cycle, prop, run_nr = maxwell_match.groups() elif online_match: instr, cycle, prop, raw_proc, run_nr = online_match.groups() else: run_nr = None if run_nr is not None: fname = '%s_%s.json' % (raw_proc, run_nr) prop_scratch = osp.join( SCRATCH_ROOT_DIR, instr, cycle, prop, 'scratch' ) if osp.isdir(prop_scratch): paths.append( osp.join(prop_scratch, '.karabo_data_maps', fname) ) return paths def load(self): """Load the cached data This skips over invalid cache entries(based on the file's size & mtime). 
""" loaded_data = [] t0 = time.monotonic() for path in self.candidate_paths: try: with open(path) as f: loaded_data = json.load(f) self.cache_file = path log.debug("Loaded cached files map from %s", path) break except (FileNotFoundError, json.JSONDecodeError): pass for info in loaded_data: filename = info['filename'] try: st = os.stat(osp.join(self.directory, filename)) except OSError: continue if (st.st_mtime == info['mtime']) and (st.st_size == info['size']): self.files_data[filename] = info if loaded_data: dt = time.monotonic() - t0 log.debug("Loaded cached files map in %.2g s", dt) def is_my_directory(self, dir_path): return osp.samestat(os.stat(dir_path), self.dir_stat) def get(self, path): """Get cache entry for a file path Returns a dict or None """ dirname, fname = osp.split(osp.abspath(path)) if self.is_my_directory(dirname) and (fname in self.files_data): d = self.files_data[fname] res = { 'train_ids': np.array(d['train_ids'], dtype=np.uint64), 'control_sources': frozenset(d['control_sources']), 'instrument_sources': frozenset(d['instrument_sources']) } # Older cache files don't contain info on 'suspect' trains. if 'suspect_train_indices' in d: res['flag'] = flag = np.ones_like(d['train_ids'], dtype=np.bool_) flag[d['suspect_train_indices']] = 0 return res return None def _cache_valid(self, fname): # The cache is invalid (needs to be written out) if the file is not in # files_data (which it won't be if the size or mtime don't match - see # load()), or if suspect_train_indices is missing. This was added after # we started making cache files, so we want to add it to existing caches. return 'suspect_train_indices' in self.files_data.get(fname, {}) def save(self, files): """Save the cache if needed This skips writing the cache out if all the data files already have valid cache entries. It also silences permission errors from writing the cache file. """ need_save = False for file_access in files: dirname, fname = osp.split(osp.abspath(file_access.filename)) if self.is_my_directory(dirname) and not self._cache_valid(fname): log.debug("Will save cached data for %s", fname) need_save = True # It's possible that the file we opened has been replaced by a # new one before this runs. If possible, use the stat FileAccess got # from the file descriptor, which will always be accurate. # Stat-ing the filename will almost always work as a fallback. 
if isinstance(file_access.metadata_fstat, os.stat_result): st = file_access.metadata_fstat else: log.warning("No fstat for %r, will stat name instead", fname) st = os.stat(file_access.filename) self.files_data[fname] = { 'filename': fname, 'mtime': st.st_mtime, 'size': st.st_size, 'train_ids': [int(t) for t in file_access.train_ids], 'control_sources': sorted(file_access.control_sources), 'instrument_sources': sorted(file_access.instrument_sources), 'suspect_train_indices': [ int(i) for i in (~file_access.validity_flag).nonzero()[0] ], } if need_save: t0 = time.monotonic() save_data = [info for (_, info) in sorted(self.files_data.items())] for path in self.candidate_paths: try: os.makedirs(osp.dirname(path), exist_ok=True) atomic_dump(save_data, path, indent=2) except PermissionError: continue else: dt = time.monotonic() - t0 log.debug("Saved run files map to %s in %.2g s", path, dt) return log.debug("Unable to save run files map") ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/stacking.py0000664000175000017500000002114300000000000020604 0ustar00takluyvertakluyverimport numpy as np import re __all__ = [ 'stack_data', 'stack_detector_data', ] def stack_data(train, data, axis=-3, xcept=()): """Stack data from devices in a train. For detector data, use stack_detector_data instead: it can handle missing modules, which this function cannot. The returned array will have an extra dimension. The data will be ordered according to any groups of digits in the source name, interpreted as integers. Other characters do not affect sorting. So: "B_7_0" < "A_12_0" < "A_12_1" Parameters ---------- train: dict Train data. data: str The path to the device parameter of the data you want to stack. axis: int, optional Array axis on which you wish to stack. xcept: list List of devices to ignore (useful if you have reccored slow data with detector data in the same run). Returns ------- combined: numpy.array Stacked data for requested data path. """ devices = [dev for dev in train.keys() if dev not in xcept] if not devices: raise ValueError("No data after filtering by 'xcept' argument.") dtypes = set() ordered_arrays = [] for device in sorted(devices, key=lambda d: list(map(int, re.findall(r'\d+', d)))): array = train[device][data] dtypes.add(array.dtype) ordered_arrays.append(array) if len(dtypes) > 1: raise ValueError("Arrays have mismatched dtypes: {}".format(dtypes)) return np.stack(ordered_arrays, axis=axis) def stack_detector_data( train, data, axis=-3, modules=16, fillvalue=None, real_array=True, *, pattern=r'/DET/(\d+)CH', starts_at=0, ): """Stack data from detector modules in a train. Parameters ---------- train: dict Train data. data: str The path to the device parameter of the data you want to stack, e.g. 'image.data'. axis: int Array axis on which you wish to stack (default is -3). modules: int Number of modules composing a detector (default is 16). fillvalue: number Value to use in place of data for missing modules. The default is nan (not a number) for floating-point data, and 0 for integers. real_array: bool If True (default), copy the data together into a real numpy array. If False, avoid copying the data and return a limited array-like wrapper around the existing arrays. This is sufficient for assembling images using detector geometry, and allows better performance. pattern: str Regex to find the module number in source names. Should contain a group which can be converted to an integer. E.g. 
``r'/DET/JNGFR(\\d+)'`` for one JUNGFRAU naming convention. starts_at: int By default, uses module numbers starting at 0 (e.g. 0-15 inclusive). If the numbering is e.g. 1-16 instead, pass starts_at=1. This is not automatic because the first or last module may be missing from the data. Returns ------- combined: numpy.array Stacked data for requested data path. """ if not train: raise ValueError("No data") dtypes, shapes, empty_mods = set(), set(), set() modno_arrays = {} for src in train: det_mod_match = re.search(pattern, src) if not det_mod_match: raise ValueError(f"Source {src!r} doesn't match pattern {pattern!r}") modno = int(det_mod_match.group(1)) - starts_at try: array = train[src][data] except KeyError: continue dtypes.add(array.dtype) shapes.add(array.shape) modno_arrays[modno] = array if len(dtypes) > 1: raise ValueError("Arrays have mismatched dtypes: {}".format(dtypes)) if len(shapes) > 1: s1, s2, *_ = sorted(shapes) if len(shapes) > 2 or (s1[0] != 0) or (s1[1:] != s2[1:]): raise ValueError("Arrays have mismatched shapes: {}".format(shapes)) empty_mods = {n for n, a in modno_arrays.items() if a.shape == s1} for modno in empty_mods: del modno_arrays[modno] shapes.remove(s1) if max(modno_arrays) >= modules: raise IndexError("Module {} is out of range for a detector with {} modules" .format(max(modno_arrays), modules)) dtype = dtypes.pop() shape = shapes.pop() if fillvalue is None: fillvalue = np.nan if dtype.kind == 'f' else 0 fillvalue = dtype.type(fillvalue) # check value compatibility with dtype stack = StackView( modno_arrays, modules, shape, dtype, fillvalue, stack_axis=axis ) if real_array: return stack.asarray() return stack class StackView: """Limited array-like object holding detector data from several modules. Access is limited to either a single module at a time or all modules together, but this is enough to assemble detector images. 
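    A sketch of working with the view returned by
    ``stack_detector_data(..., real_array=False)``; this assumes corrected
    detector data, i.e. each module array shaped like ``(pulses, ss, fs)``,
    the default ``stack_axis=-3``, and ``train_data`` standing for the
    per-train data dict (e.g. from ``train_from_id``)::

        view = stack_detector_data(train_data, 'image.data', real_array=False)
        one_module = view[:, 3]  # module 3 (or fill values if it's missing)
        arr = view.asarray()     # copy into a real numpy array when required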
""" def __init__(self, data, nmodules, mod_shape, dtype, fillvalue, stack_axis=-3): self._nmodules = nmodules self._data = data # {modno: array} self.dtype = dtype self._fillvalue = fillvalue self._mod_shape = mod_shape self.ndim = len(mod_shape) + 1 self._stack_axis = stack_axis if self._stack_axis < 0: self._stack_axis += self.ndim sax = self._stack_axis self.shape = mod_shape[:sax] + (nmodules,) + mod_shape[sax:] def __repr__(self): return "".format( self.shape, len(self._data), self._nmodules, self.dtype, ) # Multidimensional slicing def __getitem__(self, slices): if not isinstance(slices, tuple): slices = (slices,) missing_dims = self.ndim - len(slices) if Ellipsis in slices: ix = slices.index(Ellipsis) missing_dims += 1 slices = slices[:ix] + (slice(None, None),) * missing_dims + slices[ix + 1:] else: slices = slices + (slice(None, None),) * missing_dims modno = slices[self._stack_axis] mod_slices = slices[:self._stack_axis] + slices[self._stack_axis + 1:] if isinstance(modno, int): if modno < 0: modno += self._nmodules return self._get_single_mod(modno, mod_slices) elif modno == slice(None, None): return self._get_all_mods(mod_slices) else: raise Exception( "VirtualStack can only slice a single module or all modules" ) def _get_single_mod(self, modno, mod_slices): try: mod_data = self._data[modno] except KeyError: if modno >= self._nmodules: raise IndexError(modno) mod_data = np.full(self._mod_shape, self._fillvalue, self.dtype) self._data[modno] = mod_data # Now slice the module data as requested return mod_data[mod_slices] def _get_all_mods(self, mod_slices): new_data = {modno: self._get_single_mod(modno, mod_slices) for modno in self._data} new_mod_shape = list(new_data.values())[0].shape return StackView(new_data, self._nmodules, new_mod_shape, self.dtype, self._fillvalue) def asarray(self): """Copy this data into a real numpy array Don't do this until necessary - the point of using VirtualStack is to avoid copying the data unnecessarily. 
""" start_shape = (self._nmodules,) + self._mod_shape arr = np.full(start_shape, self._fillvalue, dtype=self.dtype) for modno, data in self._data.items(): arr[modno] = data return np.moveaxis(arr, 0, self._stack_axis) def squeeze(self, axis=None): """Drop axes of length 1 - see numpy.squeeze()""" if axis is None: slices = [0 if d == 1 else slice(None, None) for d in self.shape] elif isinstance(axis, (int, tuple)): if isinstance(axis, int): axis = (axis,) slices = [slice(None, None)] * self.ndim for ax in axis: try: slices[ax] = 0 except IndexError: raise np.AxisError( "axis {} is out of bounds for array of dimension {}" .format(ax, self.ndim) ) if self.shape[ax] != 1: raise ValueError("cannot squeeze out an axis with size != 1") else: raise TypeError("axis={!r} not supported".format(axis)) return self[tuple(slices)] ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6227217 EXtra-data-1.7.0/extra_data/tests/0000775000175000017500000000000000000000000017570 5ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/__init__.py0000644000175000017500000000000000000000000021665 0ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6227217 EXtra-data-1.7.0/extra_data/tests/cli/0000775000175000017500000000000000000000000020337 5ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/cli/__init__.py0000644000175000017500000000000000000000000022434 0ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/tests/cli/test_make_virtual_cxi.py0000664000175000017500000000145600000000000025304 0ustar00takluyvertakluyverimport os import os.path as osp from testpath import assert_isfile from extra_data.cli.make_virtual_cxi import main def test_make_virtual_cxi(mock_spb_proc_run, tmpdir): output = osp.join(str(tmpdir), 'test.cxi') main([mock_spb_proc_run, '-o', output]) assert_isfile(output) def test_make_virtual_cxi_runno(mock_spb_proc_run, tmpdir): proc = osp.join(str(tmpdir), 'proc') os.mkdir(proc) os.symlink(mock_spb_proc_run, osp.join(proc, 'r0238')) output = osp.join(str(tmpdir), 'test.cxi') # Pass proposal directory and run number main([str(tmpdir), '238', '-o', output]) assert_isfile(output) def test_make_virtual_cxi_jungfrau(mock_jungfrau_run, tmpdir): output = osp.join(str(tmpdir), 'test.cxi') main([mock_jungfrau_run, '-o', output]) assert_isfile(output) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/tests/conftest.py0000664000175000017500000000701500000000000021772 0ustar00takluyvertakluyverimport os.path as osp import h5py import numpy as np import pytest from tempfile import TemporaryDirectory from . 
import make_examples @pytest.fixture(scope='session', params=['0.5', '1.0']) def format_version(request): return request.param @pytest.fixture(scope='module') def mock_agipd_data(format_version): # This one uses the older index format # (first/last/status instead of first/count) with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_example_file(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_lpd_data(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R9999-LPD00-S00000.h5') make_examples.make_lpd_file(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_fxe_control_data(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00001.h5') make_examples.make_fxe_da_file(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_sa3_control_data(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00001.h5') make_examples.make_sa3_da_file(path, format_version=format_version) yield path @pytest.fixture(scope='module') def mock_spb_control_data_badname(format_version): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0309-DA01-S00000.h5') make_examples.make_data_file_bad_device_name(path, format_version=format_version) yield path @pytest.fixture(scope='session') def mock_fxe_raw_run(format_version): with TemporaryDirectory() as td: make_examples.make_fxe_run(td, format_version=format_version) yield td @pytest.fixture(scope='session') def mock_lpd_parallelgain_run(): with TemporaryDirectory() as td: make_examples.make_lpd_parallelgain_run(td, format_version='1.0') yield td @pytest.fixture(scope='session') def mock_spb_proc_run(format_version): with TemporaryDirectory() as td: make_examples.make_spb_run(td, raw=False, format_version=format_version) yield td @pytest.fixture(scope='session') def mock_reduced_spb_proc_run(format_version): """Varying number of frames stored from AGIPD""" rng = np.random.RandomState(123) # Fix seed with TemporaryDirectory() as td: make_examples.make_reduced_spb_run(td, raw=False, rng=rng, format_version=format_version) yield td @pytest.fixture(scope='session') def mock_spb_raw_run(format_version): with TemporaryDirectory() as td: make_examples.make_spb_run(td, format_version=format_version) yield td @pytest.fixture(scope='session') def mock_jungfrau_run(): with TemporaryDirectory() as td: make_examples.make_jungfrau_run(td) yield td @pytest.fixture(scope='session') def mock_scs_run(): with TemporaryDirectory() as td: make_examples.make_scs_run(td) yield td @pytest.fixture(scope='session') def empty_h5_file(): with TemporaryDirectory() as td: path = osp.join(td, 'empty.h5') with h5py.File(path, 'w'): pass yield path @pytest.fixture(scope='session') def mock_empty_file(): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00002.h5') make_examples.make_sa3_da_file(path, ntrains=0) yield path ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/tests/make_examples.py0000664000175000017500000003640200000000000022762 0ustar00takluyvertakluyverimport h5py import os import os.path as osp import numpy as np from .mockdata import write_file from .mockdata.adc import ADC from .mockdata.base import write_base_index from .mockdata.basler_camera import BaslerCamera as BaslerCam from .mockdata.dctrl import DCtrl from .mockdata.detectors 
import AGIPDModule, DSSCModule, LPDModule from .mockdata.gauge import Gauge from .mockdata.gec_camera import GECCamera from .mockdata.imgfel import IMGFELCamera, IMGFELMotor from .mockdata.jungfrau import ( JUNGFRAUModule, JUNGFRAUControl, JUNGFRAUMonitor, JUNGFRAUPower, ) from .mockdata.motor import Motor from .mockdata.mpod import MPOD from .mockdata.tsens import TemperatureSensor from .mockdata.uvlamp import UVLamp from .mockdata.xgm import XGM vlen_bytes = h5py.special_dtype(vlen=bytes) def make_metadata(h5file, data_sources, chunksize=16): N = len(data_sources) if N % chunksize: N += chunksize - (N % chunksize) root = [ds.split('/', 1)[0] for ds in data_sources] devices = [ds.split('/', 1)[1] for ds in data_sources] sources_ds = h5file.create_dataset('METADATA/dataSourceId', (N,), dtype=vlen_bytes, maxshape=(None,)) sources_ds[:len(data_sources)] = data_sources root_ds = h5file.create_dataset('METADATA/root', (N,), dtype=vlen_bytes, maxshape=(None,)) root_ds[:len(data_sources)] = root devices_ds = h5file.create_dataset('METADATA/deviceId', (N,), dtype=vlen_bytes, maxshape=(None,)) devices_ds[:len(data_sources)] = devices def make_agipd_example_file(path, format_version='0.5'): """Make the structure of a data file from the AGIPD detector Based on /gpfs/exfel/d/proc/XMPL/201750/p700000/r0803/CORR-R0803-AGIPD07-S00000.h5 This has the old index format (first/last/status), whereas the other examples have the newer (first/count) format. """ f = h5py.File(path, 'w') slow_channels = ['header', 'detector', 'trailer'] channels = slow_channels + ['image'] train_ids = np.arange(10000, 10250) # Real train IDs are ~10^9 # RUN - empty in the example I'm working from f.create_group('RUN') # METADATA - lists the data sources in this file make_metadata(f, ['INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/'+ch for ch in channels]) def make_train_ids(path): d = f.create_dataset(path, (256,), 'u8', maxshape=(None,)) d[:250] = train_ids # INDEX - matching up data to train IDs write_base_index(f, 250, format_version=format_version) for ch in channels: grp_name = 'INDEX/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/%s/' % ch first = f.create_dataset(grp_name + 'first', (256,), 'u8', maxshape=(None,)) last = f.create_dataset(grp_name + 'last', (256,), 'u8', maxshape=(None,)) status = f.create_dataset(grp_name + 'status', (256,), 'u4', maxshape=(None,)) if ch in slow_channels: first[:250] = np.arange(250) last[:250] = np.arange(250) else: first[:250] = np.arange(0, 16000, 64) last[:250] = np.arange(63, 16000, 64) status[:250] = 1 # INSTRUMENT - the data itself # first, train IDs for each channel for ch in slow_channels: make_train_ids('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/%s/trainId' % ch) fast_tids = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/trainId', (16000, 1), 'u8') fast_tids[:,0] = np.repeat(train_ids, 64) # TODO: Not sure what this is, but it has quite a regular structure. 
# 5408 = 13 x 13 x 32 f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/detector/data', (256, 5408), 'u1', maxshape=(None, 5408)) f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/dataId', (256,), 'u8', maxshape=(None,)) # Empty in example linkId = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/linkId', (256,), 'u8', maxshape=(None,)) linkId[:250] = 18446744069414584335 # Copied from example f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/magicNumberBegin', (256, 8), 'i1', maxshape=(None, 8)) # TODO: fill in data vmaj = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/majorTrainFormatVersion', (256,), 'u4', maxshape=(None,)) vmaj[:250] = 1 vmin = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/minorTrainFormatVersion', (256,), 'u4', maxshape=(None,)) vmin[:250] = 0 pc = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/pulseCount', (256,), 'u8', maxshape=(None,)) pc[:250] = 64 f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/header/reserved', (256, 16), 'u1', maxshape=(None, 16)) # Empty in example cellId = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/cellId', (16000, 1), 'u2') cellId[:, 0] = np.tile(np.arange(64), 250) # The data itself f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/data', (16000, 512, 128), 'f4') f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/gain', (16000, 512, 128), 'u1') length = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/length', (16000, 1), 'u4', maxshape=(None, 1)) length[:] = 262144 # = 512*128*4(bytes) ? f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/mask', (16000, 512, 128, 3), 'u1') # TODO: values 128 or 0 pulseId = f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/pulseId', (16000, 1), 'u8') # In the real data, these are unevenly spaced, but this is close enough pulseId[:, 0] = np.tile(np.linspace(0, 125, 64, dtype='u8'), 250) f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/status', (16000, 1), 'u2') # Empty in example f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/trailer/checksum', (256, 16), 'i1', maxshape=(None, 16)) # Empty in example f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/trailer/magicNumberEnd', (256, 8), 'i1', maxshape=(None, 8)) # TODO: fill in data f.create_dataset('INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/trailer/status', (256,), 'u8', maxshape=(None,)) # Empty in example def make_fxe_da_file(path, format_version='0.5'): """Make the structure of a file with non-detector data from the FXE experiment Based on .../FXE/201830/p900023/r0450/RAW-R0450-DA01-S00001.h5 """ write_file(path, [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), GECCamera('FXE_XAD_GEC/CAM/CAMERA'), GECCamera('FXE_XAD_GEC/CAM/CAMERA_NODATA', nsamples=0) ], ntrains=400, chunksize=200, format_version=format_version) def make_sa3_da_file(path, ntrains=500, format_version='0.5'): """Make the structure of a file with non-detector data from SASE3 tunnel Based on .../SA3/201830/p900026/r0317/RAW-R0317-DA01-S00000.h5 """ write_file(path, [ ADC('SA3_XTD10_MCP/ADC/1', nsamples=0, channels=( 'channel_3.output/data', 'channel_5.output/data', 'channel_9.output/data', )), UVLamp('SA3_XTD10_MCP/DCTRL/UVLAMP'), Motor('SA3_XTD10_MCP/MOTOR/X2'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30100K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30160K'), 
TemperatureSensor('SA3_XTD10_VAC/TSENS/S30180K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30190K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30200K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30250K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30260K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30280K'), TemperatureSensor('SA3_XTD10_VAC/TSENS/S30300K'), Gauge('SA3_XTD10_VAC/GAUGE/G30470D_IN'), Gauge('SA3_XTD10_VAC/GAUGE/G30480D_IN'), Gauge('SA3_XTD10_VAC/GAUGE/G30490D_IN'), Gauge('SA3_XTD10_VAC/GAUGE/G30500P'), Gauge('SA3_XTD10_VAC/GAUGE/G30510C'), DCtrl('SA3_XTD10_VAC/DCTRL/D6_APERT_IN_OK'), DCtrl('SA3_XTD10_VAC/DCTRL/D12_APERT_IN_OK'), XGM('SA3_XTD10_XGM/XGM/DOOCS'), IMGFELCamera('SA3_XTD10_IMGFEL/CAM/BEAMVIEW', nsamples=0), IMGFELCamera('SA3_XTD10_IMGFEL/CAM/BEAMVIEW2', nsamples=250), IMGFELMotor('SA3_XTD10_IMGFEL/MOTOR/FILTER'), IMGFELMotor('SA3_XTD10_IMGFEL/MOTOR/SCREEN'), MPOD('SA3_XTD10_MCP/MCPS/MPOD'), ], ntrains=ntrains, chunksize=50, format_version=format_version) def make_data_file_bad_device_name(path, format_version='0.5'): """Not all devices have the Karabo standard A/B/C naming convention""" write_file(path, [ BaslerCam('SPB_IRU_SIDEMIC_CAM', sensor_size=(1000, 1000)) ], ntrains=500, chunksize=50, format_version=format_version) def make_agipd_file(path, format_version='0.5'): write_file(path, [ AGIPDModule('SPB_DET_AGIPD1M-1/DET/0CH0', frames_per_train=64) ], ntrains=486, chunksize=32, format_version=format_version) def make_lpd_file(path, format_version='0.5'): write_file(path, [ LPDModule('FXE_DET_LPD1M-1/DET/0CH0', frames_per_train=128) ], ntrains=480, chunksize=32, format_version=format_version) def make_fxe_run(dir_path, raw=True, format_version='0.5'): prefix = 'RAW' if raw else 'CORR' for modno in range(16): path = osp.join(dir_path, '{}-R0450-LPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ LPDModule('FXE_DET_LPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=128) ], ntrains=480, chunksize=32, format_version=format_version) if not raw: return write_file(osp.join(dir_path, 'RAW-R0450-DA01-S00000.h5'), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), GECCamera('FXE_XAD_GEC/CAM/CAMERA'), GECCamera('FXE_XAD_GEC/CAM/CAMERA_NODATA', nsamples=0), ], ntrains=400, chunksize=200, format_version=format_version) write_file(osp.join(dir_path, '{}-R0450-DA01-S00001.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), GECCamera('FXE_XAD_GEC/CAM/CAMERA'), GECCamera('FXE_XAD_GEC/CAM/CAMERA_NODATA', nsamples=0), ], ntrains=80, firsttrain=10400, chunksize=200, format_version=format_version) def make_lpd_parallelgain_run(dir_path, raw=True, format_version='0.5'): prefix = 'RAW' if raw else 'CORR' for modno in range(16): path = osp.join(dir_path, '{}-R0450-LPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ LPDModule('FXE_DET_LPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=300) ], ntrains=100, chunksize=32, format_version=format_version) def make_spb_run(dir_path, raw=True, sensor_size=(1024, 768), format_version='0.5'): prefix = 'RAW' if raw else 'CORR' for modno in range(16): path = osp.join(dir_path, '{}-R0238-AGIPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ AGIPDModule('SPB_DET_AGIPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=64) ], ntrains=64, chunksize=32, format_version=format_version) if not raw: return write_file(osp.join(dir_path, '{}-R0238-DA01-S00000.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), 
BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=sensor_size) ], ntrains=32, chunksize=32, format_version=format_version) write_file(osp.join(dir_path, '{}-R0238-DA01-S00001.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=sensor_size) ], ntrains=32, firsttrain=10032, chunksize=32, format_version=format_version) def make_reduced_spb_run(dir_path, raw=True, rng=None, format_version='0.5'): # Simulate reduced AGIPD data, with varying number of frames per train. # Counts across modules should be consistent prefix = 'RAW' if raw else 'CORR' if rng is None: rng = np.random.RandomState() frame_counts = rng.randint(0, 20, size=64) for modno in range(16): path = osp.join(dir_path, '{}-R0238-AGIPD{:0>2}-S00000.h5'.format(prefix, modno)) write_file(path, [ AGIPDModule('SPB_DET_AGIPD1M-1/DET/{}CH0'.format(modno), raw=raw, frames_per_train=frame_counts) ], ntrains=64, chunksize=32, format_version=format_version) write_file(osp.join(dir_path, '{}-R0238-DA01-S00000.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=(1024, 768)) ], ntrains=32, chunksize=32, format_version=format_version) write_file(osp.join(dir_path, '{}-R0238-DA01-S00001.h5'.format(prefix)), [ XGM('SA1_XTD2_XGM/DOOCS/MAIN'), XGM('SPB_XTD9_XGM/DOOCS/MAIN'), BaslerCam('SPB_IRU_CAM/CAM/SIDEMIC', sensor_size=(1024, 768)) ], ntrains=32, firsttrain=10032, chunksize=32, format_version=format_version) def make_jungfrau_run(dir_path): # Naming based on /gpfs/exfel/exp/SPB/202022/p002732/raw/r0012 for modno in range(1, 9): path = osp.join(dir_path, f'RAW-R0012-JNGFR{modno:02}-S00000.h5') write_file(path, [ JUNGFRAUModule(f'SPB_IRDA_JF4M/DET/JNGFR{modno:02}') ], ntrains=100, chunksize=1, format_version='1.0') write_file(osp.join(dir_path, f'RAW-R0012-JNGFRCTRL00-S00000.h5'), [ JUNGFRAUControl('SPB_IRDA_JF4M/DET/CONTROL'), JUNGFRAUMonitor('SPB_IRDA_JF4M/MDL/MONITOR'), JUNGFRAUPower('SPB_IRDA_JF4M/MDL/POWER'), ], ntrains=100, chunksize=1, format_version='1.0') def make_scs_run(dir_path): # Multiple sequence files for detector modules for modno in range(16): mod = DSSCModule(f'SCS_DET_DSSC1M-1/DET/{modno}CH0', frames_per_train=64) for seq in range(2): path = osp.join(dir_path, f'RAW-R0163-DSSC{modno:0>2}-S{seq:0>5}.h5') write_file(path, [mod], ntrains=64, firsttrain=(10000 + seq * 64), chunksize=32, format_version='1.0') if __name__ == '__main__': make_agipd_example_file('agipd_example.h5') make_fxe_da_file('fxe_control_example.h5') make_sa3_da_file('sa3_control_example.h5') make_agipd_file('agipd_example2.h5') make_lpd_file('lpd_example.h5') os.makedirs('fxe_example_run', exist_ok=True) make_fxe_run('fxe_example_run') os.makedirs('spb_example_run', exist_ok=True) make_spb_run('spb_example_run') print("Written examples.") ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6307218 EXtra-data-1.7.0/extra_data/tests/mockdata/0000775000175000017500000000000000000000000021353 5ustar00takluyvertakluyver././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/__init__.py0000644000175000017500000000003700000000000023462 0ustar00takluyvertakluyverfrom .mkfile import write_file ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/adc.py0000644000175000017500000000372500000000000022461 
0ustar00takluyvertakluyverfrom .base import DeviceBase class ADC(DeviceBase): def __init__(self, device_id, nsamples=None, channels=()): super().__init__(device_id, nsamples) self.output_channels = channels control_keys = [ ('config/softTrigTime', 'u4', ()), ('dacNode/dacCyclesSamples', 'u4', ()), ('dacNode/dacData', 'i4', (1024,)), ('dacNode/dacSkipSamples', 'u4', ()), ('dacNode/dacTrigger', 'u1', ()), ('dacNode/dacTriggerPeriod', 'u4', ()), ('dacNode/dacVoltageData', 'f8', (1000,)), ('dacNode/enableDAC', 'u1', ()), ('dacNode/voltageIntercept', 'f8', ()), ('dacNode/voltageSlope', 'f8', ()), ('delay', 'u4', ()), ('numberRawSamples', 'u4', ()), ('skipSamples', 'u4', ()), ('trainId', 'u8', ()), ('triggerTime', 'i4', ()), ('triggerTimeStat', 'u2', (1000,)), ] + sum(([ ('channel_%d/baseStart' % n, 'u4', ()), ('channel_%d/baseStop' % n, 'u4', ()), ('channel_%d/baseline' % n, 'f4', ()), ('channel_%d/calibrationFactor' % n, 'f8', ()), ('channel_%d/enablePeakComputation' % n, 'u1', ()), ('channel_%d/enableRawDataStreaming' % n, 'u1', ()), ('channel_%d/fixedBaseline' % n, 'f8', ()), ('channel_%d/fixedBaselineEna' % n, 'u1', ()), ('channel_%d/initialDelay' % n, 'u4', ()), ('channel_%d/numPulses' % n, 'u4', ()), ('channel_%d/peakMean' % n, 'f4', ()), ('channel_%d/peakSamples' % n, 'u4', ()), ('channel_%d/peakStd' % n, 'f4', ()), ('channel_%d/pulsePeriod' % n, 'u4', ()), ] for n in range(10)), []) instrument_keys = [ ('baseline', 'f8', ()), ('peakMean', 'f8', ()), ('peakStd', 'f8', ()), ('peaks', 'f4', (1000,)), ('rawBaseline', 'u4', ()), ('rawData', 'u2', (4096,)), ('rawPeaks', 'u4', (1000,)), ('samplesForBaseline', 'u4', ()), ('samplesPerPeak', 'u4', ()), ] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/tests/mockdata/base.py0000664000175000017500000001755400000000000022653 0ustar00takluyvertakluyverfrom datetime import datetime, timedelta, timezone import os.path as osp import re import h5py import numpy as np class DeviceBase: # Override these in subclasses control_keys = [] output_channels = () instrument_keys = [] # These are set by write_file ntrains = 400 firsttrain = 10000 chunksize = 200 def __init__(self, device_id, nsamples=None): """Create a dummy device :param str device_id: e.g. "SA1_XTD2_XGM/DOOCS/MAIN" :param int ntrains: e.g. 256 :param int nsamples: For INSTRUMENT data only. Default is ntrains. If more, should be a multiple of ntrains. If fewer, samples will be spread evenly across the trains. :param int chunksize: The sample dimension will be padded to a multiple of this. """ self.device_id = device_id self.nsamples = nsamples def write_control(self, f): """Write the CONTROL and RUN data, and the relevant parts of INDEX""" N = self.ntrains # INDEX i_first = f.create_dataset('INDEX/%s/first' % self.device_id, (N,), 'u8', maxshape=(None,)) i_count = f.create_dataset('INDEX/%s/count' % self.device_id, (N,), 'u8', maxshape=(None,)) i_first[:] = np.arange(N) i_count[:] = 1 # CONTROL & RUN # Creating empty datasets for now. 
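# --- Hedged sketch (standalone, not part of DeviceBase) ----------------------
# The loop that follows creates, for every entry in control_keys, a per-train
# timestamp/value pair under CONTROL/ plus a single-entry snapshot under RUN/.
# Writing one mock device and listing those groups with h5py shows the layout;
# the file name and the Gauge device ID here are assumptions.
import os
from tempfile import TemporaryDirectory

import h5py
from extra_data.tests.mockdata import write_file
from extra_data.tests.mockdata.gauge import Gauge

with TemporaryDirectory() as td:
    path = os.path.join(td, 'RAW-R0001-DA01-S00000.h5')
    write_file(path, [Gauge('SA3_XTD10_VAC/GAUGE/G30510C')],
               ntrains=10, chunksize=5)
    with h5py.File(path, 'r') as f:
        grp = f['CONTROL/SA3_XTD10_VAC/GAUGE/G30510C/value']
        print(grp['value'].shape, grp['timestamp'].shape)              # (10,) (10,)
        print(f['RUN/SA3_XTD10_VAC/GAUGE/G30510C/value/value'].shape)  # (1,)
# -----------------------------------------------------------------------------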
for (topic, datatype, dims) in self.control_keys: f.create_dataset('CONTROL/%s/%s/timestamp' % (self.device_id, topic), (N,), 'u8', maxshape=(None,)) f.create_dataset('CONTROL/%s/%s/value' % (self.device_id, topic), (N,)+dims, datatype, maxshape=((None,)+dims)) # RUN is the value at the start of the run f.create_dataset('RUN/%s/%s/timestamp' % (self.device_id, topic), (1,), 'u8', maxshape=(None,)) f.create_dataset('RUN/%s/%s/value' % (self.device_id, topic), (1,)+dims, datatype, maxshape=((None,)+dims)) def write_instrument(self, f): """Write the INSTRUMENT data, and the relevant parts of INDEX""" train0 = self.firsttrain if self.nsamples is None: self.nsamples = self.ntrains if self.ntrains == 0: first, count, trainids = [], [], [] elif self.nsamples == 0: first = count = 0 trainids = [] elif self.nsamples < self.ntrains: first = np.linspace(0, self.nsamples, endpoint=False, num=self.ntrains, dtype='u8') count = np.zeros((self.ntrains,), dtype='u8') count[:-1] = first[1:] - first[:-1] if count.sum() < self.nsamples: count[-1] = 1 assert count.sum() == self.nsamples trainids = np.linspace(train0, train0 + self.ntrains, endpoint=False, num=self.nsamples, dtype='u8') elif self.nsamples == self.ntrains: first = np.arange(self.ntrains) count = 1 trainids = np.arange(train0, train0 + self.ntrains) else: # nsamples > ntrains count = self.nsamples // self.ntrains first = np.arange(0, self.nsamples, step=count) trainids = np.repeat(np.arange(train0, train0 + self.ntrains), count) Npad = self.nsamples if Npad % self.chunksize: Npad += + self.chunksize - (Npad % self.chunksize) for channel in self.output_channels: dev_chan = '%s:%s' % (self.device_id, channel) # INDEX i_first = f.create_dataset('INDEX/%s/first' % dev_chan, (self.ntrains,), 'u8', maxshape=(None,)) i_count = f.create_dataset('INDEX/%s/count' % dev_chan, (self.ntrains,), 'u8', maxshape=(None,)) i_first[:] = first i_count[:] = count # INSTRUMENT tid = f.create_dataset('INSTRUMENT/%s/trainId' % dev_chan, (Npad,), 'u8', maxshape=(None,)) if len(trainids) > 0: tid[:self.nsamples] = trainids for (topic, datatype, dims) in self.instrument_keys: f.create_dataset('INSTRUMENT/%s/%s' % (dev_chan, topic), (Npad,) + dims, datatype, maxshape=((None,) + dims)) def datasource_ids(self): if self.control_keys: yield 'CONTROL/' + self.device_id if self.instrument_keys: for channel in self.output_channels: yield 'INSTRUMENT/%s:%s' % (self.device_id, channel) vlen_bytes = h5py.special_dtype(vlen=bytes) def write_metadata(h5file, data_sources, chunksize=16, format_version='0.5'): N = len(data_sources) if N % chunksize: N += chunksize - (N % chunksize) root = [ds.split('/', 1)[0] for ds in data_sources] devices = [ds.split('/', 1)[1] for ds in data_sources] if format_version == '0.5': data_sources_grp = h5file.create_group('METADATA') else: data_sources_grp = h5file.create_group('METADATA/dataSources') sources_ds = data_sources_grp.create_dataset('dataSourceId', (N,), dtype=vlen_bytes, maxshape=(None,)) sources_ds[:len(data_sources)] = data_sources root_ds = data_sources_grp.create_dataset('root', (N,), dtype=vlen_bytes, maxshape=(None,)) root_ds[:len(data_sources)] = root devices_ds = data_sources_grp.create_dataset('deviceId', (N,), dtype=vlen_bytes, maxshape=(None,)) devices_ds[:len(data_sources)] = devices if format_version != '0.5': h5file['METADATA/dataFormatVersion'] = [format_version.encode('ascii')] now = datetime.utcnow().replace(microsecond=0) updated_time = now + timedelta(minutes=5) h5file['METADATA/creationDate'] = [ 
now.strftime('%Y%m%dT%H%M%SZ').encode('ascii') ] h5file['METADATA/daqLibrary'] = [b'1.9.0'] h5file['METADATA/karaboFramework'] = [b'2.7.0'] h5file.create_dataset('METADATA/proposalNumber', dtype=np.uint32, data=[700000]) h5file.create_dataset('METADATA/runNumber', dtype=np.uint32, data=[1]) h5file['METADATA/runType'] = [b'Test DAQ'] h5file['METADATA/sample'] = [b'No Sample'] # get sequence number fname_pattern = r'^(RAW|CORR)\-R\d+\-.*\-S(\d+).h5$' match = re.match(fname_pattern, osp.basename(h5file.filename)) sequence = int(match[2]) if match is not None else 0 h5file.create_dataset('METADATA/sequenceNumber', dtype=np.uint32, data=[sequence]) h5file['METADATA/updateDate'] = [ updated_time.strftime('%Y%m%dT%H%M%SZ').encode('ascii') ] def write_base_index(f, N, first=10000, chunksize=16, format_version='0.5'): """Make base datasets in the files index 3 dataset are created: flag, timestamp, trainId Real train IDs are much larger (~10^9), so hopefully these won't be mistaken for real ones. """ if N % chunksize: Npad = N + chunksize - (N % chunksize) else: Npad = N if format_version != '0.5': # flag ds = f.create_dataset('INDEX/flag', (Npad,), 'i4', maxshape=(None,)) ds[:N] = np.ones(N) # timestamps ds = f.create_dataset('INDEX/timestamp', (Npad,), 'u8', maxshape=(None,)) # timestamps are stored as a single uint64 with nanoseconds resolution ts = datetime.now(tz=timezone.utc).timestamp() * 10**9 ds[:N] = [ts + i * 10**8 for i in range(N)] # trainIds ds = f.create_dataset('INDEX/trainId', (Npad,), 'u8', maxshape=(None,)) ds[:N] = np.arange(first, first + N) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/basler_camera.py0000644000175000017500000000305000000000000024501 0ustar00takluyvertakluyver"""Script that creates a mock-run for the basler camera""" from .base import DeviceBase class BaslerCamera(DeviceBase): """ Basler Camera device Based on example /gpfs/exfel/exp/SPB/201930/p900061/raw/r0055/RAW-R0055-DA01-S00000.h5 """ def __init__(self, device_id, nsamples=None, sensor_size=None): """Create a dummy basler device that inherits from Device Base""" self.sensor_size = sensor_size or (2058, 2456) super(BaslerCamera, self).__init__(device_id, nsamples=nsamples) self.output_channels = ('daqOutput/data',) # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. 
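# --- Hedged sketch (standalone, not tied to BaslerCamera) --------------------
# write_metadata() and write_base_index() above both pad dataset lengths up to
# a multiple of `chunksize`, and write_metadata() derives METADATA/sequenceNumber
# from the file name. The helpers below simply restate that arithmetic and the
# regex so they can be checked in isolation; the names here are illustrative.
import re

def padded_length(n, chunksize):
    """Length after padding n up to a multiple of chunksize (as the writers do)."""
    if n % chunksize:
        n += chunksize - (n % chunksize)
    return n

def sequence_number(filename):
    """Sequence number parsed from an EuXFEL-style file name, else 0."""
    m = re.match(r'^(RAW|CORR)\-R\d+\-.*\-S(\d+)\.h5$', filename)
    return int(m[2]) if m is not None else 0

assert padded_length(400, 200) == 400
assert padded_length(486, 32) == 512
assert sequence_number('RAW-R0450-LPD00-S00000.h5') == 0
assert sequence_number('CORR-R0238-DA01-S00001.h5') == 1
# -----------------------------------------------------------------------------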
self.instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', self.sensor_size), ('image/roiOffsets', 'u8', (2,)), ('image/binning', 'u8', (2,)), ('image/flipX', 'u1', ()), ('image/flipY', 'u1', ()) ] def write_instrument(self, f): super().write_instrument(f) # Add fixed metadata for channel in self.output_channels: image_grp = 'INSTRUMENT/{}:{}/image/'.format(self.device_id, channel) f[image_grp]['bitsPerPixel'][:self.nsamples] = 16 f[image_grp]['dims'][:self.nsamples] = self.sensor_size ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/control_common.py0000644000175000017500000000145600000000000024761 0ustar00takluyvertakluyverinterlock_keys = [ ('interlock/AActCommand', 'u4', ()), ('interlock/AActionState', 'u4', ()), ('interlock/ACndAriOp', 'i2', ()), ('interlock/ACndComOp', 'i2', ()), ('interlock/ACndEnable', 'u1', ()), ('interlock/ACndFiltertime', 'i2', ()), ('interlock/ACndHysteresis', 'u1', ()), ('interlock/ACndSrc1Detail', 'i2', ()), ('interlock/ACndSrc2Detail', 'i2', ()), ('interlock/ACndThreshold', 'u1', ()), ('interlock/ACndValue1', 'u1', ()), ('interlock/ACndValue2', 'u1', ()), ('interlock/AConditionState', 'u4', ()), ('interlockOk', 'u1', ()), ('interlockOn', 'u1', ()), ] triggers_keys = [ ('trigger', 'u4', (1000,)), ] + sum(([ ('triggers/trig%d/enable' % n, 'u1', ()), ('triggers/trig%d/interval' % n, 'f8', ()), ] for n in range(1, 11)), []) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/dctrl.py0000644000175000017500000000353100000000000023035 0ustar00takluyvertakluyverfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class DCtrl(DeviceBase): control_keys = [ ('ASafeValue', 'u1', ()), ('busy', 'u1', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('pwmCycleLimit', 'i2', ()), ('pwmDutyCycle', 'f4', ()), ('pwmFrequency', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ] + [ # TODO: is there a way to factor these out? 
('interlock/AActionState', 'u4', ()), ('interlock/AConditionState', 'u4', ()), ('interlock/a1/AActCommand', 'u4', (1000,)), ('interlock/c1/ACndAriOp', 'i2', ()), ('interlock/c1/ACndComOp', 'i2', ()), ('interlock/c1/ACndEnable', 'u1', ()), ('interlock/c1/ACndFiltertime', 'i2', ()), ('interlock/c1/ACndHysteresis', 'u4', ()), ('interlock/c1/ACndSrc1Detail', 'i2', ()), ('interlock/c1/ACndSrc2Detail', 'i2', ()), ('interlock/c1/ACndThreshold', 'u4', ()), ('interlock/c1/ACndValue1', 'u4', ()), ('interlock/c1/ACndValue2', 'u1', ()), ('interlock/c2/ACndAriOp', 'i2', ()), ('interlock/c2/ACndComOp', 'i2', ()), ('interlock/c2/ACndEnable', 'u1', ()), ('interlock/c2/ACndFiltertime', 'i2', ()), ('interlock/c2/ACndHysteresis', 'u4', ()), ('interlock/c2/ACndSrc1Detail', 'i2', ()), ('interlock/c2/ACndSrc2Detail', 'i2', ()), ('interlock/c2/ACndThreshold', 'u4', ()), ('interlock/c2/ACndValue1', 'u4', ()), ('interlock/c2/ACndValue2', 'u1', ()), ('interlockOk', 'u1', ()), ('interlockOn', 'u1', ()), ] + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/tests/mockdata/detectors.py0000664000175000017500000001375000000000000023727 0ustar00takluyvertakluyverimport numpy as np class DetectorModule: # Overridden in subclasses: image_dims = () detector_data_size = 0 # Set by write_file: ntrains = 100 firsttrain = 10000 chunksize = 32 output_parts = [ 'detector', 'header', 'image', 'trailer', ] def __init__(self, device_id, frames_per_train=64, raw=True): self.device_id = device_id self._frames_per_train = frames_per_train if not raw: # Raw data has an extra dimension, used in AGIPD to separate data # and gain. This dimension is removed by the calibration process. self.image_dims = self.image_dims[1:] self.raw = raw def write_control(self, f): """Write the CONTROL and RUN data, and the relevant parts of INDEX""" pass @property def image_keys(self): if self.raw: return [ ('data', 'u2', self.image_dims), ('length', 'u4', (1,)), ('status', 'u2', (1,)), ] else: return [ ('data', 'f4', self.image_dims), ('mask', 'u4', self.image_dims), ('gain', 'u1', self.image_dims), ('length', 'u4', (1,)), ('status', 'u2', (1,)), ] @property def other_keys(self): return [ ('detector/data', 'u1', (self.detector_data_size,)), ('header/dataId', 'u8', ()), ('header/linkId', 'u8', ()), ('header/magicNumberBegin', 'i1', (8,)), ('header/majorTrainFormatVersion', 'u4', ()), ('header/minorTrainFormatVersion', 'u4', ()), ('header/pulseCount', 'u8', ()), ('header/reserved', 'u1', (16,)), ('trailer/checksum', 'i1', (16,)), ('trailer/magicNumberEnd', 'i1', (8,)), ('trailer/status', 'u8', ()), ] @property def frames_per_train(self): if np.ndim(self._frames_per_train) == 0: return np.full(self.ntrains, self._frames_per_train, np.uint64) return self._frames_per_train def write_instrument(self, f): """Write the INSTRUMENT data, and the relevant parts of INDEX""" trainids = np.arange(self.firsttrain, self.firsttrain + self.ntrains) ntrains_pad = self.ntrains if ntrains_pad % self.chunksize: ntrains_pad += + self.chunksize - (ntrains_pad % self.chunksize) # INDEX for part in self.output_parts: dev_chan = '%s:xtdf/%s' % (self.device_id, part) i_first = f.create_dataset('INDEX/%s/first' % dev_chan, (self.ntrains,), 'u8', maxshape=(None,)) i_count = f.create_dataset('INDEX/%s/count' % dev_chan, (self.ntrains,), 'u8', maxshape=(None,)) if part == 'image': # First first is always 0 i_first[1:] = np.cumsum(self.frames_per_train)[:-1] i_count[:] = self.frames_per_train else: 
i_first[:] = np.arange(self.ntrains) i_count[:] = 1 # INSTRUMENT (image) nframes = self.frames_per_train.sum() tid_index = np.repeat(trainids, self.frames_per_train.astype(np.intp)) pid_index = np.concatenate([ np.arange(0, n, dtype='u8') for n in self.frames_per_train ]) if self.raw: # Raw data have an extra dimension (length 1) and an unlimited max # for the first dimension. ds = f.create_dataset('INSTRUMENT/%s:xtdf/image/trainId' % self.device_id, (nframes, 1), 'u8', maxshape=(None, 1)) ds[:, 0] = tid_index pid = f.create_dataset('INSTRUMENT/%s:xtdf/image/pulseId' % self.device_id, (nframes, 1), 'u8', maxshape=(None, 1)) pid[:, 0] = pid_index cid = f.create_dataset('INSTRUMENT/%s:xtdf/image/cellId' % self.device_id, (nframes, 1), 'u2', maxshape=(None, 1)) cid[:, 0] = pid_index # Cell IDs mirror pulse IDs for now else: # Corrected data drops the extra dimension, and maxshape==shape. f.create_dataset( 'INSTRUMENT/%s:xtdf/image/trainId' % self.device_id, (nframes,), 'u8', chunks=True, data=tid_index ) f.create_dataset( 'INSTRUMENT/%s:xtdf/image/pulseId' % self.device_id, (nframes,), 'u8', chunks=True, data=pid_index ) f.create_dataset( # Cell IDs mirror pulse IDs for now 'INSTRUMENT/%s:xtdf/image/cellId' % self.device_id, (nframes,), 'u2', chunks=True, data=pid_index ) max_len = None if self.raw else nframes for (key, datatype, dims) in self.image_keys: f.create_dataset('INSTRUMENT/%s:xtdf/image/%s' % (self.device_id, key), (nframes,) + dims, datatype, maxshape=((max_len,) + dims)) # INSTRUMENT (other parts) for part in ['detector', 'header', 'trailer']: ds = f.create_dataset('INSTRUMENT/%s:xtdf/%s/trainId' % (self.device_id, part), (ntrains_pad,), 'u8', maxshape=(None,)) ds[:self.ntrains] = trainids for (key, datatype, dims) in self.other_keys: f.create_dataset('INSTRUMENT/%s:xtdf/%s' % (self.device_id, key), (ntrains_pad,) + dims, datatype, maxshape=((None,) + dims)) def datasource_ids(self): for part in self.output_parts: yield 'INSTRUMENT/%s:xtdf/%s' % (self.device_id, part) class AGIPDModule(DetectorModule): image_dims = (2, 512, 128) detector_data_size = 5408 class LPDModule(DetectorModule): image_dims = (1, 256, 256) detector_data_size = 416 class DSSCModule(DetectorModule): image_dims = (1, 128, 512) detector_data_size = 416 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/gauge.py0000644000175000017500000000203100000000000023007 0ustar00takluyvertakluyverfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class Gauge(DeviceBase): control_keys = [ ('AAlarmH', 'f4', ()), ('AAlarmL', 'f4', ()), ('AAverage', 'u1', ()), ('busy', 'u1', ()), ('calibration/expbase', 'f4', ()), ('calibration/formulaType', 'u1', ()), ('calibration/offset', 'f4', ()), ('calibration/rawValue', 'u4', ()), ('calibration/scale', 'f4', ()), ('calibration/terminalFactor', 'f4', ()), ('calibration/terminalOffset', 'f4', ()), ('epsSemiRaw', 'f4', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('relativeEpsilon', 'u1', ()), ('semiRawValue', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ('value', 'f4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/gec_camera.py0000644000175000017500000000405400000000000023774 
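# --- Hedged sketch (standalone) ----------------------------------------------
# DetectorModule.write_instrument() above derives the per-train INDEX datasets
# for the 'image' section from frames_per_train: `first` is the running offset
# into the frame axis and `count` is the number of frames in each train. This
# mirrors that calculation for a small, made-up frame pattern.
import numpy as np

frames_per_train = np.array([3, 0, 2, 4], dtype=np.uint64)

first = np.zeros_like(frames_per_train)
first[1:] = np.cumsum(frames_per_train)[:-1]   # first entry is always 0
count = frames_per_train

print(first)   # [0 3 3 5]
print(count)   # [3 0 2 4]

# The matching per-frame train ID index repeats each train ID count-many times:
train_ids = np.arange(10000, 10004)
tid_index = np.repeat(train_ids, count.astype(np.intp))
print(tid_index)   # [10000 10000 10000 10002 10002 10003 10003 10003 10003]
# -----------------------------------------------------------------------------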
0ustar00takluyvertakluyverfrom .base import DeviceBase class GECCamera(DeviceBase): control_keys = [ ('acquisitionTime', 'f4', ()), ('binningX', 'i4', ()), ('binningY', 'i4', ()), ('coolingLevel', 'i4', ()), ('cropLines', 'i4', ()), ('enableBiasCorrection', 'u1', ()), ('enableBurstMode', 'u1', ()), ('enableCooling', 'u1', ()), ('enableCropMode', 'u1', ()), ('enableExtTrigger', 'u1', ()), ('enableShutter', 'u1', ()), ('enableSync', 'u1', ()), ('exposureTime', 'i4', ()), ('firmwareVersion', 'i4', ()), ('modelId', 'i4', ()), ('numPixelInX', 'i4', ()), ('numPixelInY', 'i4', ()), ('numberOfCoolingLevels', 'i4', ()), ('numberOfMeasurements', 'i4', ()), ('pixelSize', 'f4', ()), ('readOutSpeed', 'i4', ()), ('shutterCloseTime', 'i4', ()), ('shutterOpenTime', 'i4', ()), ('shutterState', 'i4', ()), ('syncHigh', 'u1', ()), ('targetTemperature', 'i4', ()), ('temperatureBack', 'f4', ()), ('temperatureSensor', 'f4', ()), ('triggerTimeOut', 'i4', ()), ('updateInterval', 'i4', ()), ] # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. output_channels = ('daqOutput/data',) instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', (255, 1024)), ('image/roiOffsets', 'u8', (2,)), ] def write_instrument(self, f): super().write_instrument(f) # Fill in some fixed metadata about the image for channel in self.output_channels: image_grp = 'INSTRUMENT/%s:%s/image/' % (self.device_id, channel) f[image_grp + 'bitsPerPixel'][:self.nsamples] = 16 f[image_grp + 'dims'][:self.nsamples] = [1024, 255] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/imgfel.py0000644000175000017500000001003000000000000023160 0ustar00takluyvertakluyverfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class IMGFELCamera(DeviceBase): control_keys = [ ('Logger/file/maxBackupIndex', 'u4', ()), ('Logger/file/maxFileSize', 'u4', ()), ('Logger/file/mode', 'u4', ()), ('acqFrameCount', 'u4', ()), ('autoGain', 'u1', ()), ('bin/X', 'i4', ()), ('bin/Y', 'i4', ()), ('exposureTime', 'f8', ()), ('flip/X', 'u1', ()), ('flip/Y', 'u1', ()), ('frameRate', 'f8', ()), ('frameTransmissionDelay', 'u4', ()), ('gain', 'f8', ()), ('imageDepth', 'i4', ()), ('interPacketDelay', 'u4', ()), ('latencyTime', 'f8', ()), ('nbFrames', 'i4', ()), ('packetSize', 'u4', ()), ('pollingInterval', 'i4', ()), ('roi/Height', 'i4', ()), ('roi/Width', 'i4', ()), ('roi/X', 'i4', ()), ('roi/Y', 'i4', ()), ('rotation', 'i4', ()), ('sensorSize/height', 'i4', ()), ('sensorSize/width', 'i4', ()), ('simulateCamera', 'u1', ()), ('socketBufferSize', 'u4', ()), ('temperature', 'f8', ()), ('writeFile', 'u1', ()), ] # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. 
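# --- Hedged sketch (standalone, not part of this class) ----------------------
# The mock GEC camera above is exposed by EXtra-data as an instrument source
# 'FXE_XAD_GEC/CAM/CAMERA:daqOutput' with keys under 'data.*'. Reading it back
# from a run produced by make_fxe_run() would look roughly like this; the run
# path is an assumption.
from extra_data import RunDirectory

run = RunDirectory('fxe_example_run')   # created by make_examples.make_fxe_run()
pixels = run['FXE_XAD_GEC/CAM/CAMERA:daqOutput', 'data.image.pixels'].xarray()
print(pixels.shape)                     # (n_samples, 255, 1024) for this mock device
# -----------------------------------------------------------------------------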
output_channels = ('daqOutput/data',) instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', (1944, 2592)), ('image/roiOffsets', 'u8', (2,)), ] class IMGFELMotor(DeviceBase): control_keys = [ ('ABackEMF', 'u2', ()), ('ACoilResistance', 'u2', ()), ('ADynOffsFactor', 'f4', ()), ('ADynOffsetType', 'u1', ()), ('AFunctionInput1', 'u1', ()), ('AFunctionInput2', 'u1', ()), ('AIntCounter7041', 'u1', ()), ('AMotorFullStep', 'u2', ()), ('AOffsetDynamic', 'f4', ()), ('AOverrun', 'f4', ()), ('aMax', 'i2', ()), ('acceleration', 'f4', ()), ('actualPosition', 'f4', ()), ('backlash', 'f4', ()), ('busy', 'u1', ()), ('calibrateTarget', 'f4', ()), ('checkLimitConsistency', 'u1', ()), ('controllerVoltage', 'f4', ()), ('deadband', 'f4', ()), ('encodeStep', 'f4', ()), ('epsilon', 'f4', ()), ('extEncoderEnabled', 'u1', ()), ('force', 'u1', ()), ('gear', 'f4', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('homeNoLimit', 'u1', ()), ('homeUp', 'u1', ()), ('invLogicLim1', 'u1', ()), ('invLogicLim2', 'u1', ()), ('isCCWLimit', 'u1', ()), ('isCWLimit', 'u1', ()), ('isIdleOpenLoop', 'u1', ()), ('isInternalCounter', 'u1', ()), ('isInvertLimits', 'u1', ()), ('isLimitless', 'u1', ()), ('isOnTarget', 'u1', ()), ('isSWLimitHigh', 'u1', ()), ('isSWLimitLow', 'u1', ()), ('isSlave', 'u1', ()), ('limitPosH', 'f4', ()), ('limitPosL', 'f4', ()), ('masterSlaveCorrelation', 'f4', ()), ('maxCurrent', 'u2', ()), ('maxUpdateFrequency', 'f4', ()), ('modus', 'u1', ()), ('motorDriverVoltage', 'f4', ()), ('offset', 'f4', ()), ('pConst', 'f4', ()), ('plcCycleAveraging', 'u1', ()), ('pollInterval', 'f4', ()), ('reducedCurrent', 'u2', ()), ('saveLimitPosition', 'u1', ()), ('softDeviceId', 'u4', ()), ('stepCounterPosition', 'f4', ()), ('stepLength', 'f4', ()), ('syncEncoder', 'u1', ()), ('targetPosition', 'f4', ()), ('targetVelocity', 'i2', ()), ('terminal', 'u4', ()), ('terminalTemperature', 'u1', ()), ('vMax', 'i2', ()), ('vMin', 'i2', ()), ('velocity', 'f4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/tests/mockdata/jungfrau.py0000664000175000017500000000477200000000000023560 0ustar00takluyvertakluyverfrom .base import DeviceBase class JUNGFRAUModule(DeviceBase): output_channels = ('daqOutput/data',) instrument_keys = [ ('adc', 'u2', (16, 512, 1024)), ('frameNumber', 'u8', (16,)), ('gain', 'u1', (16, 512, 1024)), ('mask', 'u2', (16, 512, 1024)), ('memoryCell', 'u1', (16,)), ('timestamp', 'f8', (16,)), ] class JUNGFRAUControl(DeviceBase): control_keys = [ ('acquisitionTime', 'f4', ()), ('angDir', 'i2', (1000,)), ('binSize', 'f4', (1000,)), ('bitDepth', 'i4', ()), ('dataStorage.enable', 'u1', ()), ('dataStorage.fileIndex', 'i4', ()), ('delayAfterTrigger', 'f4', (1000,)), ('detectorHostPort', 'u2', (1000,)), ('detectorHostStopPort', 'u2', (1000,)), ('exposurePeriod', 'f4', ()), ('exposureTime', 'f4', ()), ('exposureTimeout', 'u4', ()), ('exposureTimer', 'u2', ()), ('globalOff', 'f4', (1000,)), ('heartbeatInterval', 'i4', ()), ('lock', 'i2', (1000,)), ('master', 'i2', ()), ('maximumDetectorSize', 'i4', (1000,)), ('moveFlag', 'i2', (1000,)), ('numberOfCycles', 'i8', ()), ('numberOfFrames', 'i8', ()), ('numberOfGates', 'i8', ()), ('online', 'i2', (1000,)), ('performanceStatistics.enable', 'u1', ()), ('performanceStatistics.maxEventLoopLatency', 'u4', ()), 
('performanceStatistics.maxProcessingLatency', 'u4', ()), ('performanceStatistics.messagingProblems', 'u1', ()), ('performanceStatistics.numMessages', 'u4', ()), ('performanceStatistics.processingLatency', 'f4', ()), ('pollingInterval', 'u4', ()), ('progress', 'i4', ()), ('rOnline', 'i2', ()), ('rxTcpPort', 'u2', (1000,)), ('rxUdpPort', 'u2', (1000,)), ('rxUdpSocketSize', 'u4', ()), ('storageCellStart', 'i2', ()), ('storageCells', 'i2', ()), ('threaded', 'i2', ()), ('triggerPeriod', 'f4', ()), ('vHighVoltage', 'u4', (1000,)), ('vHighVoltageMax', 'u4', ()), ] class JUNGFRAUMonitor(DeviceBase): control_keys = sum(([ (f'module{n}.adcTemperature', 'f8', ()), (f'module{n}.fpgaTemperature', 'f8', ()), ] for n in range(1, 9)), []) class JUNGFRAUPower(DeviceBase): control_keys = [ ('current', 'f8', ()), ('pollingInterval', 'f8', ()), ('port', 'u2', ()), ('temperature', 'f8', ()), ('voltage', 'f8', ()), ] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/mkfile.py0000644000175000017500000000141200000000000023170 0ustar00takluyvertakluyverimport h5py from .base import write_base_index, write_metadata def write_file(filename, devices, ntrains, firsttrain=10000, chunksize=200, format_version='0.5'): f = h5py.File(filename, 'w') f.create_group('RUN') # Add this, even if it's left empty write_base_index(f, ntrains, first=firsttrain, chunksize=chunksize, format_version=format_version) data_sources = [] for dev in devices: dev.ntrains = ntrains dev.firsttrain = firsttrain dev.chunksize = chunksize dev.write_control(f) dev.write_instrument(f) data_sources.extend(dev.datasource_ids()) write_metadata(f, data_sources, chunksize=chunksize, format_version=format_version) f.close() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/motor.py0000644000175000017500000001153000000000000023063 0ustar00takluyvertakluyverfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class Motor(DeviceBase): control_keys = [ ('AActualVelocity', 'f8', ()), ('AEncoderResolution', 'u4', ()), ('AHomingVelocityOffPlcCam', 'f8', ()), ('AHomingVelocityToPlcCam', 'f8', ()), ('ANCParam', 'u4', ()), ('ANomCurrent', 'u4', ()), ('AOpenloopCurrent', 'u4', ()), ('APeakCurrent', 'u4', ()), ('AProfileAcceleration', 'u4', ()), ('AQuickStopDecceleration', 'u4', ()), ('AStandByCurrent', 'u4', ()), ('AStepperResolution', 'u4', ()), ('Acontrolword', 'u2', ()), ('actualPosition', 'f8', ()), ('axisBacklash', 'f8', ()), ('busy', 'u1', ()), ('calibrateTarget', 'f8', ()), ('enableSWLimitHigh', 'u1', ()), ('enableSWLimitLow', 'u1', ()), ('encoderPosition', 'f8', ()), ('epsilon', 'f4', ()), ('epsilonActualPosition', 'f8', ()), ('epsilonActualVelocity', 'f8', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('isCCWLimit', 'u1', ()), ('isCWLimit', 'u1', ()), ('isInterlockLimitHigh', 'u1', ()), ('isInterlockLimitLow', 'u1', ()), ('isOnTarget', 'u1', ()), ('isSWLimitHigh', 'u1', ()), ('isSWLimitLow', 'u1', ()), ('isSlave', 'u1', ()), ('maxUpdateFrequency', 'f4', ()), ('mc2/aaxisacc', 'f8', ()), ('mc2/aaxiscalibrationvelocitybackward', 'f8', ()), ('mc2/aaxiscalibrationvelocityforward', 'f8', ()), ('mc2/aaxiscycletime', 'f8', ()), ('mc2/aaxisdec', 'f8', ()), ('mc2/aaxisdelaytimeveloposition', 'f8', ()), ('mc2/aaxisenableposcorrection', 'u1', ()), ('mc2/aaxisenbacklashcompensation', 'u1', ()), 
('mc2/aaxisencoderdirectioninverse', 'u1', ()), ('mc2/aaxisencodermask', 'u4', ()), ('mc2/aaxisencodermodulovalue', 'f8', ()), ('mc2/aaxisencoderoffset', 'f8', ()), ('mc2/aaxisencoderscalingfactor', 'f8', ()), ('mc2/aaxisendatapersistence', 'u1', ()), ('mc2/aaxisenintargettimeout', 'u1', ()), ('mc2/aaxisenloopingdistance', 'u1', ()), ('mc2/aaxisenpositionlagmonitoring', 'u1', ()), ('mc2/aaxisenpositionrangemonitoring', 'u1', ()), ('mc2/aaxisentargetpositionmonitoring', 'u1', ()), ('mc2/aaxisfastacc', 'f8', ()), ('mc2/aaxisfastjerk', 'f8', ()), ('mc2/aaxisfaststopsignaltype', 'u4', ()), ('mc2/aaxisid', 'f8', ()), ('mc2/aaxisintargettimeout', 'f8', ()), ('mc2/aaxisjerk', 'f8', ()), ('mc2/aaxisjogincrementbackward', 'f8', ()), ('mc2/aaxisjogincrementforward', 'f8', ()), ('mc2/aaxisloopingdistance', 'f8', ()), ('mc2/aaxismanualvelocityfast', 'f8', ()), ('mc2/aaxismanualvelocityslow', 'f8', ()), ('mc2/aaxismaxposlagfiltertime', 'f8', ()), ('mc2/aaxismaxposlagvalue', 'f8', ()), ('mc2/aaxismaxvelocity', 'f8', ()), ('mc2/aaxismodulotolerancewindow', 'f8', ()), ('mc2/aaxismotionmonitoringtime', 'f8', ()), ('mc2/aaxismotionmonitoringwindow', 'f8', ()), ('mc2/aaxismotordirectioninverse', 'u1', ()), ('mc2/aaxisoverridetype', 'f8', ()), ('mc2/aaxisposcorrectionfiltertime', 'f8', ()), ('mc2/aaxispositionrangewindow', 'f8', ()), ('mc2/aaxisrapidtraversevelocity', 'f8', ()), ('mc2/aaxisrefveloonrefoutput', 'f8', ()), ('mc2/aaxistargetpositionmonitoringtime', 'f8', ()), ('mc2/aaxistargetpositionwindow', 'f8', ()), ('mc2/aaxisunitinterpretation', 'f8', ()), ('mc2/acommandedvelocity', 'f8', ()), ('mc2/aencoderaxisoffset', 'f8', ()), ('mc2/aencoderaxisscalingfactor', 'f8', ()), ('mc2/aencoderreferencemode', 'u1', ()), ('mc2/ahomingvelocitoffplccam', 'f8', ()), ('mc2/ahomingvelocittowardsplccam', 'f8', ()), ('mc2/ainvertdircalibrationcamsearch', 'u1', ()), ('mc2/ainvertdirsyncpulssearch', 'u1', ()), ('mc2/amodulotargetposition', 'f8', ()), ('mc2/amovedirection', 'i4', ()), ('mc2/ancsvbcycletime', 'f8', ()), ('mc2/axisenmotionmonitoring', 'u1', ()), ('mc2/axisfastdec', 'f8', ()), ('mc2/extendedStateWord', 'u4', ()), ('mc2/ncsafcycletime', 'f8', ()), ('mc2ContinuousMotion', 'u1', ()), ('mc2DiscreteMotion', 'u1', ()), ('mc2ErrorStop', 'u1', ()), ('pollInterval', 'f4', ()), ('softDeviceId', 'u4', ()), ('specificError', 'u4', ()), ('stepSize', 'f8', ()), ('swLimitHigh', 'f8', ()), ('swLimitLow', 'f8', ()), ('targetPosition', 'f8', ()), ('targetVelocity', 'f8', ()), ('terminal', 'u4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/mpod.py0000644000175000017500000001106600000000000022666 0ustar00takluyvertakluyverfrom .base import DeviceBase class MPOD(DeviceBase): control_keys = [ ('Logger/file/maxBackupIndex', 'u4', ()), ('Logger/file/maxFileSize', 'u4', ()), ('Logger/file/mode', 'u4', ()), ('autoRearm', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsEventActive', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsFineAdjustment', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsHardwareLimitVoltageGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsInputError', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsKillEnable', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsLiveInsertion', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsNoRamp', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsNoSumError', 'u1', 
()), ('boards/board_0/boardStatus/bitModuleIsSafetyLoopGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsSupplyGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleIsTemperatureGood', 'u1', ()), ('boards/board_0/boardStatus/bitModuleNeedService', 'u1', ()), ('boards/board_0/boardStatus/bitModuleReserved1', 'u1', ()), ('boards/board_0/boardStatus/bitModuleReserved2', 'u1', ()), ('boards/board_0/boardStatus/bitModuleReserved3', 'u1', ()), ('boards/board_0/status', 'i4', ()), ('crateNode/crate', 'u4', ()), ('crateNode/crateStatus/bitBusReset', 'u1', ()), ('crateNode/crateStatus/bitFanTrayFailure', 'u1', ()), ('crateNode/crateStatus/bitInputFailure', 'u1', ()), ('crateNode/crateStatus/bitLocalControlOnly', 'u1', ()), ('crateNode/crateStatus/bitMainInError', 'u1', ()), ('crateNode/crateStatus/bitMainInhibit', 'u1', ()), ('crateNode/crateStatus/bitMainOn', 'u1', ()), ('crateNode/crateStatus/bitOutputFailure', 'u1', ()), ('crateNode/crateStatus/bitPlugAndPlayIncompatible', 'u1', ()), ('crateNode/crateStatus/bitSensorFailure', 'u1', ()), ('crateNode/crateStatus/bitSupplyDerating', 'u1', ()), ('crateNode/crateStatus/bitSupplyDerating2', 'u1', ()), ('crateNode/crateStatus/bitSupplyFailure', 'u1', ()), ('crateNode/crateStatus/bitSupplyFailure2', 'u1', ()), ('crateNode/crateStatus/bitVmeSysfail', 'u1', ()), ('crateNode/expandChannelStatus', 'u1', ()), ('crateNode/fanNominalSpeed', 'i4', ()), ('crateNode/groupsSwitchIseg', 'i4', ()), ('crateNode/indexes', 'u4', ()), ('crateNode/output', 'u4', ()), ('crateNode/outputNumber', 'i4', ()), ('crateNode/pollPeriod', 'i4', ()), ('crateNode/psOperatingTime', 'i4', ()), ('crateNode/secureOperation', 'u1', ()), ('crateNode/settlePeriod', 'i4', ()), ('crateNode/snmpPort', 'i4', ()), ('crateNode/snmpThreshold', 'i4', ()), ('crateNode/sysHardwareReset', 'i4', ()), ('crateNode/sysMainSwitch', 'i4', ()), ('crateNode/sysStatus', 'u4', ()), ('crateNode/system', 'u4', ()), ] + sum(([ ('channels/U%d/configMaxCurrent' % n, 'f4', ()), ('channels/U%d/configMaxSenseVoltage' % n, 'f4', ()), ('channels/U%d/configMaxTemperature' % n, 'i4', ()), ('channels/U%d/configMaxTerminalVoltage' % n, 'f4', ()), ('channels/U%d/current' % n, 'f4', ()), ('channels/U%d/currentFallRate' % n, 'f4', ()), ('channels/U%d/currentRiseRate' % n, 'f4', ()), ('channels/U%d/groupid' % n, 'i4', ()), ('channels/U%d/index' % n, 'i4', ()), ('channels/U%d/measurementCurrent' % n, 'f4', ()), ('channels/U%d/measurementSenseVoltage' % n, 'f4', ()), ('channels/U%d/measurementTemperature' % n, 'i4', ()), ('channels/U%d/measurementTerminalVoltage' % n, 'f4', ()), ('channels/U%d/status' % n, 'i4', ()), ('channels/U%d/supervisionBehavior' % n, 'i4', ()), ('channels/U%d/supervisionMaxCurrent' % n, 'f4', ()), ('channels/U%d/supervisionMaxPower' % n, 'f4', ()), ('channels/U%d/supervisionMaxSenseVoltage' % n, 'f4', ()), ('channels/U%d/supervisionMaxTemperature' % n, 'i4', ()), ('channels/U%d/supervisionMaxTerminalVoltage' % n, 'f4', ()), ('channels/U%d/supervisionMinSenseVoltage' % n, 'f4', ()), ('channels/U%d/switch' % n, 'i4', ()), ('channels/U%d/tripTimeMaxCurrent' % n, 'i4', ()), ('channels/U%d/voltage' % n, 'f4', ()), ('channels/U%d/voltageRampRate' % n, 'f4', ()), ] for n in range(8)), []) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/sidemic_camera.py0000644000175000017500000000202600000000000024650 0ustar00takluyvertakluyverfrom .base import DeviceBase class SidemicCamera(DeviceBase): # Based on example in 
/gpfs/exfel/d/raw/SPB/201701/p002012/r0309/RAW-R0309-DA01-S00000.h5 # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. output_channels = ('daqOutput/data',) instrument_keys = [ ('image/bitsPerPixel', 'i4', ()), ('image/dimTypes', 'i4', (2,)), ('image/dims', 'u8', (2,)), ('image/encoding', 'i4', ()), ('image/pixels', 'u2', (2058, 2456)), ('image/roiOffsets', 'u8', (2,)), ] def write_instrument(self, f): super().write_instrument(f) # Fill in some fixed metadata about the image for channel in self.output_channels: image_grp = 'INSTRUMENT/%s:%s/image/' % (self.device_id, channel) f[image_grp + 'bitsPerPixel'][:self.nsamples] = 16 f[image_grp + 'dims'][:self.nsamples] = [1000, 1000] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/tsens.py0000644000175000017500000000204600000000000023061 0ustar00takluyvertakluyverfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class TemperatureSensor(DeviceBase): control_keys = [ ('AAlarmH', 'f4', ()), ('AAlarmL', 'f4', ()), ('AAverage', 'u1', ()), ('busy', 'u1', ()), ('calibration/expbase', 'f4', ()), ('calibration/formulaType', 'u1', ()), ('calibration/offset', 'f4', ()), ('calibration/rawValue', 'u4', ()), ('calibration/scale', 'f4', ()), ('calibration/terminalFactor', 'f4', ()), ('calibration/terminalOffset', 'f4', ()), ('epsSemiRaw', 'f4', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('relativeEpsilon', 'u1', ()), ('semiRawValue', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ('value', 'f4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/uvlamp.py0000644000175000017500000000120500000000000023225 0ustar00takluyvertakluyverfrom .base import DeviceBase from .control_common import interlock_keys, triggers_keys class UVLamp(DeviceBase): control_keys = [ ('ASafeValue', 'u1', ()), ('busy', 'u1', ()), ('epsilon', 'f4', ()), ('force', 'u1', ()), ('hardwareErrorDescriptor', 'u4', ()), ('hardwareStatusBitField', 'u4', ()), ('maxUpdateFrequency', 'f4', ()), ('pollInterval', 'f4', ()), ('pwmCycleLimit', 'i2', ()), ('pwmDutyCycle', 'f4', ()), ('pwmFrequency', 'f4', ()), ('softDeviceId', 'u4', ()), ('terminal', 'u4', ()), ] + interlock_keys + triggers_keys ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/mockdata/xgm.py0000644000175000017500000000373100000000000022522 0ustar00takluyvertakluyverfrom .base import DeviceBase class XGM(DeviceBase): control_keys = [ ('beamPosition/ixPos', 'f4', ()), ('beamPosition/iyPos', 'f4', ()), ('current/bottom/output', 'f4', ()), ('current/bottom/rangeCode', 'i4', ()), ('current/left/output', 'f4', ()), ('current/left/rangeCode', 'i4', ()), ('current/right/output', 'f4', ()), ('current/right/rangeCode', 'i4', ()), ('current/top/output', 'f4', ()), ('current/top/rangeCode', 'i4', ()), ('gasDosing/measuredPressure', 'f4', ()), ('gasDosing/pressureSetPoint', 'f4', ()), ('gasSupply/gasTypeId', 'i4', ()), ('gasSupply/gsdCompatId', 'i4', ()), ('pollingInterval', 'i4', ()), ('pressure/dcr', 'f4', ()), ('pressure/gasType', 'i4', 
()), ('pressure/pressure1', 'f4', ()), ('pressure/pressureFiltered', 'f4', ()), ('pressure/rd', 'f4', ()), ('pressure/rsp', 'f4', ()), ('pulseEnergy/conversion', 'f8', ()), ('pulseEnergy/crossUsed', 'f4', ()), ('pulseEnergy/gammaUsed', 'f4', ()), ('pulseEnergy/gmdError', 'i4', ()), ('pulseEnergy/nummberOfBrunches', 'f4', ()), ('pulseEnergy/photonFlux', 'f4', ()), ('pulseEnergy/pressure', 'f4', ()), ('pulseEnergy/temperature', 'f4', ()), ('pulseEnergy/usedGasType', 'i4', ()), ('pulseEnergy/wavelengthUsed', 'f4', ()), ('signalAdaption/dig', 'i4', ()), ] # Technically, only the part before the / is the output channel. # But there is a structure associated with the part one level after that, # and we don't know what else to call it. output_channels = ('output/data',) instrument_keys = [ ('intensityTD', 'f4', (1000,)), ('intensityAUXTD', 'f4', (1000,)), ('intensitySigma/x_data', 'f4', (1000,)), ('intensitySigma/y_data', 'f4', (1000,)), ('xTD', 'f4', (1000,)), ('yTD', 'f4', (1000,)), ] ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/tests/test_bad_trains.py0000664000175000017500000002510600000000000023313 0ustar00takluyvertakluyverimport os.path as osp from tempfile import TemporaryDirectory import h5py import numpy as np import pytest from testpath import assert_isfile from extra_data import H5File from extra_data.components import AGIPD1M from extra_data.exceptions import TrainIDError from extra_data.file_access import FileAccess from . import make_examples @pytest.fixture(scope='module') def agipd_file_tid_very_high(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_example_file(path, format_version='0.5') with h5py.File(path, 'r+') as f: # Initial train IDs are np.arange(10000, 10250) f['INDEX/trainId'][10] = 10400 yield path @pytest.fixture(scope='module') def agipd_file_tid_high(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_file(path, format_version='0.5') with h5py.File(path, 'r+') as f: # Initial train IDs are np.arange(10000, 10486), this will appear 2x f['INDEX/trainId'][10] = 10100 yield path @pytest.fixture(scope='module') def agipd_file_tid_low(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_example_file(path, format_version='0.5') with h5py.File(path, 'r+') as f: # Initial train IDs are np.arange(10000, 10250) f['INDEX/trainId'][20] = 9000 yield path @pytest.fixture() def agipd_file_flag0(): with TemporaryDirectory() as td: path = osp.join(td, 'CORR-R9999-AGIPD07-S00000.h5') make_examples.make_agipd_file(path, format_version='1.0') with h5py.File(path, 'r+') as f: f['INDEX/flag'][30] = 0 yield path def test_guess_validity(agipd_file_tid_very_high, agipd_file_tid_high, agipd_file_tid_low): fa = FileAccess(agipd_file_tid_very_high) assert fa.validity_flag.sum() == 249 assert not fa.validity_flag[10] fa = FileAccess(agipd_file_tid_high) assert fa.validity_flag.sum() == 485 assert not fa.validity_flag[10] fa = FileAccess(agipd_file_tid_low) assert fa.validity_flag.sum() == 249 assert not fa.validity_flag[20] def test_validity_flag(agipd_file_flag0): fa = FileAccess(agipd_file_flag0) assert fa.validity_flag.sum() == 485 assert not fa.validity_flag[30] def test_exc_trainid(agipd_file_tid_very_high, agipd_file_tid_high, agipd_file_tid_low, agipd_file_flag0): f = H5File(agipd_file_tid_very_high, inc_suspect_trains=False) 
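# --- Hedged sketch (standalone restatement of the fixture pattern above) -----
# The fixtures above fabricate a file, then corrupt one entry of INDEX/trainId
# (or INDEX/flag for format version 1.0) so that EXtra-data flags that train as
# suspect. With inc_suspect_trains=False the train disappears from train_ids;
# with True it is kept. The path below is illustrative.
import os
from tempfile import TemporaryDirectory

import h5py
from extra_data import H5File
from extra_data.tests import make_examples

with TemporaryDirectory() as td:
    path = os.path.join(td, 'CORR-R9999-AGIPD07-S00000.h5')
    make_examples.make_agipd_example_file(path, format_version='0.5')  # trains 10000..10249
    with h5py.File(path, 'r+') as f:
        f['INDEX/trainId'][10] = 10400               # out-of-sequence train ID

    assert 10400 not in H5File(path, inc_suspect_trains=False).train_ids
    assert 10400 in H5File(path, inc_suspect_trains=True).train_ids
# -----------------------------------------------------------------------------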
assert len(f.train_ids) == 249 assert 10400 not in f.train_ids f = H5File(agipd_file_tid_very_high, inc_suspect_trains=True) assert len(f.train_ids) == 250 assert 10400 in f.train_ids f = H5File(agipd_file_tid_high, inc_suspect_trains=False) assert len(f.train_ids) == 485 assert 10100 in f.train_ids with pytest.raises(ValueError): H5File(agipd_file_tid_high, inc_suspect_trains=True) f = H5File(agipd_file_tid_low, inc_suspect_trains=False) assert len(f.train_ids) == 249 assert 9000 not in f.train_ids f = H5File(agipd_file_tid_low, inc_suspect_trains=True) assert len(f.train_ids) == 250 assert 9000 in f.train_ids f = H5File(agipd_file_flag0, inc_suspect_trains=False) assert len(f.train_ids) == 485 assert 10030 not in f.train_ids f = H5File(agipd_file_flag0, inc_suspect_trains=True) assert len(f.train_ids) == 486 assert 10030 in f.train_ids # If the tests above pass, the invalid trains in the different sample files # are being recognised correctly. So for the tests below, we'll mainly test # each behaviour on just one of the sample files. def test_keydata_interface(agipd_file_tid_very_high): f = H5File(agipd_file_tid_very_high, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.data'] assert len(kd.train_ids) == 249 assert kd.shape == (249 * 64, 512, 128) f = H5File(agipd_file_tid_very_high, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.data'] assert len(kd.train_ids) == 250 assert kd.shape == (250 * 64, 512, 128) # Check selecting trains preserves inc_suspect_trains flag assert kd[:].shape == (250 * 64, 512, 128) def test_data_counts(agipd_file_flag0): f = H5File(agipd_file_flag0, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] assert 10030 not in kd.data_counts().index f = H5File(agipd_file_flag0, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] assert 10030 in kd.data_counts().index def test_array(agipd_file_tid_low): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) arr = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'].xarray() assert arr.shape == (249 * 64, 1) f = H5File(agipd_file_tid_low, inc_suspect_trains=True) arr = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'].xarray() assert arr.shape == (250 * 64, 1) def test_array_dup(agipd_file_tid_high): f = H5File(agipd_file_tid_high, inc_suspect_trains=False) arr = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'].xarray() assert arr.shape == (485 * 64, 1) assert list(arr.coords['trainId'].values[(9*64):(11*64):64]) == [10009, 10011] # Can't open files with duplicate train IDs using inc_suspect_trains=True def test_dask_array(agipd_file_flag0): f = H5File(agipd_file_flag0, inc_suspect_trains=False) arr = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'].dask_array() assert arr.shape == (485 * 64, 1) f = H5File(agipd_file_flag0, inc_suspect_trains=True) arr = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'].dask_array() assert arr.shape == (486 * 64, 1) def test_iterate_keydata(agipd_file_tid_very_high): f = H5File(agipd_file_tid_very_high, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] tids = [t for (t, _) in kd.trains()] assert len(tids) == 249 assert 10400 not in tids f = H5File(agipd_file_tid_very_high, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] tids = [t for (t, _) in kd.trains()] assert len(tids) == 250 assert 10400 in tids def test_iterate_keydata_dup(agipd_file_tid_high): f = H5File(agipd_file_tid_high, 
inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'] tids = [t for (t, _) in kd.trains()] assert len(tids) == 485 assert 10100 in tids assert tids[9:11] == [10009, 10011] def test_iterate_datacollection(agipd_file_tid_low): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) tids = [t for (t, _) in f.trains()] assert len(tids) == 249 assert 9000 not in tids def test_get_train_keydata(agipd_file_tid_low): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] with pytest.raises(TrainIDError): kd.train_from_id(9000) f = H5File(agipd_file_tid_low, inc_suspect_trains=True) kd = f['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId'] assert kd.train_from_id(9000)[0] == 9000 def test_components_array(agipd_file_flag0): f = H5File(agipd_file_flag0, inc_suspect_trains=False) agipd = AGIPD1M(f, modules=[0]) arr = agipd.get_array('image.data', pulses=np.s_[:1]) assert arr.shape == (1, 485, 1, 2, 512, 128) assert arr.dims == ('module', 'train', 'pulse', 'data_gain', 'slow_scan', 'fast_scan') def test_components_array_dup(agipd_file_tid_high): f = H5File(agipd_file_tid_high, inc_suspect_trains=False) agipd = AGIPD1M(f, modules=[0]) arr = agipd.get_array('image.data', pulses=np.s_[:1]) assert arr.shape == (1, 485, 1, 2, 512, 128) assert arr.dims == ('module', 'train', 'pulse', 'data_gain', 'slow_scan', 'fast_scan') assert list(arr.coords['train'].values[9:11]) == [10009, 10011] def test_write_virtual_cxi_dup(agipd_file_tid_high, tmp_path, caplog): f = H5File(agipd_file_tid_high, inc_suspect_trains=False) agipd = AGIPD1M(f, modules=[0]) cxi_path = tmp_path / 'exc_suspect.cxi' agipd.write_virtual_cxi(str(cxi_path)) assert_isfile(cxi_path) with h5py.File(cxi_path, 'r') as f: assert f['entry_1/data_1/data'].shape == (485 * 64, 16, 2, 512, 128) def test_write_virtual(agipd_file_tid_low, agipd_file_tid_high, tmp_path): f = H5File(agipd_file_tid_low, inc_suspect_trains=False) f.write_virtual(tmp_path / 'low.h5') with h5py.File(tmp_path / 'low.h5', 'r') as vf: assert 9000 not in vf['INDEX/trainId'][:] ds = vf['INSTRUMENT/SPB_DET_AGIPD1M-1/DET/7CH0:xtdf/image/pulseId'] assert ds.shape == (249 * 64, 1) f = H5File(agipd_file_tid_high, inc_suspect_trains=False) f.write_virtual(tmp_path / 'high.h5') with h5py.File(tmp_path / 'high.h5', 'r') as vf: ds = vf['INSTRUMENT/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/trainId'] assert ds.shape == (485 * 64, 1) assert list(ds[(9*64):(11*64):64]) == [10009, 10011] def test_still_valid_elsewhere(agipd_file_tid_very_high, mock_sa3_control_data): dc = H5File( agipd_file_tid_very_high, inc_suspect_trains=False ).union(H5File(mock_sa3_control_data)) assert dc.train_ids == list(range(10000, 10500)) agipd_src = 'SPB_DET_AGIPD1M-1/DET/7CH0:xtdf' tsens_src = 'SA3_XTD10_VAC/TSENS/S30250K' sel = dc.select({ agipd_src: {'image.pulseId'}, tsens_src: {'value.value'} }) assert sel.all_sources == {agipd_src, tsens_src} _, t1 = sel.train_from_id(10200, flat_keys=True) assert set(t1) >= {(agipd_src, 'image.pulseId'), (tsens_src, 'value.value')} _, t2 = sel.train_from_id(10400, flat_keys=True) assert (agipd_src, 'image.pulseId') not in t2 assert (tsens_src, 'value.value') in t2 tids_from_iter, data_from_iter = [], [] for tid, d in sel.trains(flat_keys=True): if tid in (10200, 10400): tids_from_iter.append(tid) data_from_iter.append(d) assert tids_from_iter == [10200, 10400] assert [set(d) for d in data_from_iter] == [set(t1), set(t2)] # Check that select with require_all respects the valid train 
filtering: sel2 = dc.select(agipd_src, require_all=True) assert len(sel2.train_ids) == 249 dc_inc = H5File(agipd_file_tid_very_high, inc_suspect_trains=True)\ .union(H5File(mock_sa3_control_data)) sel_inc = dc_inc.select(sel) _, t2_inc = sel_inc.train_from_id(10400, flat_keys=True) assert set(t2_inc) == set(t1) sel2_inc = dc_inc.select(agipd_src, require_all=True) assert len(sel2_inc.train_ids) == 250 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/tests/test_components.py0000664000175000017500000004435000000000000023374 0ustar00takluyvertakluyverimport dask.array as da import h5py import numpy as np import os.path as osp import pytest from testpath import assert_isfile from extra_data.reader import RunDirectory, H5File, by_id, by_index from extra_data.components import ( AGIPD1M, DSSC1M, LPD1M, JUNGFRAU, identify_multimod_detectors, ) def test_get_array(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data') assert arr.dtype == np.uint16 assert arr.shape == (16, 3, 128, 256, 256) assert arr.dims == ('module', 'train', 'pulse', 'slow_scan', 'fast_scan') arr = det.get_array('image.data', pulses=by_index[:10], unstack_pulses=False) assert arr.shape == (16, 30, 256, 256) assert arr.dtype == np.uint16 assert arr.dims == ('module', 'train_pulse', 'slow_scan', 'fast_scan') # fill value with pytest.raises(ValueError): det.get_array('image.data', fill_value=np.nan) arr = det.get_array('image.data', astype=np.float32) assert arr.dtype == np.float32 def test_get_array_pulse_id(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_id[0]) assert arr.shape == (16, 3, 1, 256, 256) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_id[:5]) assert arr.shape == (16, 3, 5, 256, 256) # Empty selection arr = det.get_array('image.data', pulses=by_id[:0]) assert arr.shape == (16, 0, 0, 256, 256) arr = det.get_array('image.data', pulses=by_id[122:]) assert arr.shape == (16, 3, 6, 256, 256) arr = det.get_array('image.data', pulses=by_id[[1, 7, 22, 23]]) assert arr.shape == (16, 3, 4, 256, 256) assert list(arr.coords['pulse']) == [1, 7, 22, 23] def test_get_array_with_cell_ids(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', subtrain_index='cellId') assert arr.shape == (16, 3, 128, 256, 256) assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan') arr = det.get_array('image.data', pulses=by_id[0], subtrain_index='cellId') assert arr.shape == (16, 3, 1, 256, 256) assert (arr.coords['cell'] == 0).all() def test_get_array_pulse_indexes(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_index[0]) assert arr.shape == (16, 3, 1, 256, 256) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_index[:5]) assert arr.shape == (16, 3, 5, 256, 256) # Empty selection arr = det.get_array('image.data', pulses=by_index[:0]) assert arr.shape == (16, 0, 0, 256, 256) arr = det.get_array('image.data', pulses=by_index[122:]) assert arr.shape == (16, 3, 6, 256, 256) arr = det.get_array('image.data', pulses=by_index[[1, 7, 22, 23]]) assert arr.shape == (16, 3, 4, 256, 256) def 
test_get_array_pulse_id_reduced_data(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_id[0]) assert arr.shape == (16, 3, 1, 512, 128) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_id[:5]) assert (arr.coords['pulse'] < 5).all() # Empty selection arr = det.get_array('image.data', pulses=by_id[:0]) assert arr.shape == (16, 0, 0, 512, 128) arr = det.get_array('image.data', pulses=by_id[5:]) assert (arr.coords['pulse'] >= 5).all() arr = det.get_array('image.data', pulses=by_id[[1, 7, 15, 23]]) assert np.isin(arr.coords['pulse'], [1, 7, 15, 23]).all() def test_get_array_pulse_indexes_reduced_data(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run.select_trains(by_index[:3])) arr = det.get_array('image.data', pulses=by_index[0]) assert arr.shape == (16, 3, 1, 512, 128) assert (arr.coords['pulse'] == 0).all() arr = det.get_array('image.data', pulses=by_index[:5]) assert (arr.coords['pulse'] < 5).all() # Empty selection arr = det.get_array('image.data', pulses=by_index[:0]) assert arr.shape == (16, 0, 0, 512, 128) arr = det.get_array('image.data', pulses=np.s_[5:]) assert (arr.coords['pulse'] >= 5).all() arr = det.get_array('image.data', pulses=by_index[[1, 7, 15, 23]]) assert np.isin(arr.coords['pulse'], [1, 7, 15, 23]).all() arr = det.get_array('image.data', pulses=[1, 7, 15, 23]) assert np.isin(arr.coords['pulse'], [1, 7, 15, 23]).all() def test_get_array_roi(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data', roi=np.s_[10:60, 100:200]) assert arr.shape == (16, 3, 128, 50, 100) assert arr.dims == ('module', 'train', 'pulse', 'slow_scan', 'fast_scan') def test_get_array_roi_dssc(mock_scs_run): run = RunDirectory(mock_scs_run) det = DSSC1M(run, modules=[3]) arr = det.get_array('image.data', roi=np.s_[20:25, 40:52]) assert arr.shape == (1, 128, 64, 5, 12) def test_get_array_lpd_parallelgain(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:2]), parallel_gain=True) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data') assert arr.shape == (16, 2, 3, 100, 256, 256) assert arr.dims == ('module', 'train', 'gain', 'pulse', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['gain'], np.arange(3)) np.testing.assert_array_equal(arr.coords['pulse'], np.arange(100)) def test_get_array_lpd_parallelgain_select_pulses(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:2]), parallel_gain=True) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_array('image.data', pulses=np.s_[:5]) assert arr.shape == (16, 2, 3, 5, 256, 256) assert arr.dims == ('module', 'train', 'gain', 'pulse', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['gain'], np.arange(3)) np.testing.assert_array_equal(arr.coords['pulse'], np.arange(5)) arr = det.get_array('image.data', pulses=by_id[:5]) assert arr.shape == (16, 2, 3, 5, 256, 256) np.testing.assert_array_equal(arr.coords['pulse'], np.arange(5)) def test_get_array_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run.select_trains(by_index[:2])) assert jf.detector_name == 'SPB_IRDA_JF4M' arr = jf.get_array('data.adc') assert arr.shape == (8, 2, 
16, 512, 1024) assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['train'], [10000, 10001]) arr = jf.get_array('data.adc', astype=np.float32) assert arr.dtype == np.float32 def test_get_dask_array(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run) arr = det.get_dask_array('image.data', fill_value=42) assert isinstance(arr.data, da.Array) assert arr.shape == (16, 480 * 128, 1, 256, 256) assert arr.dtype == np.uint16 assert arr.dims == ('module', 'train_pulse', 'dim_0', 'dim_1', 'dim_2') np.testing.assert_array_equal(arr.coords['module'], np.arange(16)) np.testing.assert_array_equal( arr.coords['trainId'], np.repeat(np.arange(10000, 10480), 128) ) np.testing.assert_array_equal( arr.coords['pulseId'], np.tile(np.arange(0, 128), 480) ) arr_cellid = det.get_dask_array('image.data', subtrain_index='cellId') assert arr_cellid.coords['cellId'].shape == (480 * 128,) def test_get_dask_array_reduced_data(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run) arr = det.get_dask_array('image.data') assert arr.shape[2:] == (512, 128) assert arr.dims == ('module', 'train_pulse', 'dim_0', 'dim_1') np.testing.assert_array_equal(arr.coords['module'], np.arange(16)) assert np.isin(arr.coords['trainId'], np.arange(10000, 10480)).all() assert np.isin(arr.coords['pulseId'], np.arange(0, 20)).all() def test_get_dask_array_lpd_parallelgain(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:2]), parallel_gain=True) assert det.detector_name == 'FXE_DET_LPD1M-1' arr = det.get_dask_array('image.data') assert arr.shape == (16, 2 * 3 * 100, 1, 256, 256) assert arr.dims[:2] == ('module', 'train_pulse') np.testing.assert_array_equal(arr.coords['pulseId'], np.tile(np.arange(100), 6)) def test_get_dask_array_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run) assert jf.detector_name == 'SPB_IRDA_JF4M' arr = jf.get_dask_array('data.adc') assert arr.shape == (8, 100, 16, 512, 1024) assert arr.dims == ('module', 'train', 'cell', 'slow_scan', 'fast_scan') np.testing.assert_array_equal(arr.coords['train'], np.arange(10000, 10100)) def test_iterate(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:2])) it = iter(det.trains()) tid, d = next(it) assert d['image.data'].shape == (16, 1, 128, 256, 256) assert d['image.data'].dims == ('module', 'train', 'pulse', 'slow_scan', 'fast_scan') tid, d = next(it) assert d['image.data'].shape == (16, 1, 128, 256, 256) with pytest.raises(StopIteration): next(it) def test_iterate_pulse_id(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) tid, d = next(iter(det.trains(pulses=by_id[0]))) assert d['image.data'].shape == (16, 1, 1, 256, 256) tid, d = next(iter(det.trains(pulses=by_id[:5]))) assert d['image.data'].shape == (16, 1, 5, 256, 256) tid, d = next(iter(det.trains(pulses=by_id[122:]))) assert d['image.data'].shape == (16, 1, 6, 256, 256) tid, d = next(iter(det.trains(pulses=by_id[[1, 7, 22, 23]]))) assert d['image.data'].shape == (16, 1, 4, 256, 256) assert list(d['image.data'].coords['pulse']) == [1, 7, 22, 23] def test_iterate_pulse_index(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run.select_trains(by_index[:3])) tid, d = next(iter(det.trains(pulses=by_index[0]))) assert d['image.data'].shape == (16, 1, 1, 256, 256) tid, d = 
next(iter(det.trains(pulses=by_index[:5]))) assert d['image.data'].shape == (16, 1, 5, 256, 256) tid, d = next(iter(det.trains(pulses=by_index[122:]))) assert d['image.data'].shape == (16, 1, 6, 256, 256) tid, d = next(iter(det.trains(pulses=by_index[[1, 7, 22, 23]]))) assert d['image.data'].shape == (16, 1, 4, 256, 256) assert list(d['image.data'].coords['pulse']) == [1, 7, 22, 23] def test_iterate_lpd_parallel_gain(mock_lpd_parallelgain_run): run = RunDirectory(mock_lpd_parallelgain_run) det = LPD1M(run.select_trains(by_index[:3]), parallel_gain=True) tid, d = next(iter(det.trains())) assert d['image.data'].shape == (16, 1, 3, 100, 256, 256) assert d['image.data'].dims == \ ('module', 'train', 'gain', 'pulse', 'slow_scan', 'fast_scan') def test_iterate_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) jf = JUNGFRAU(run) tid, d = next(iter(jf.trains())) assert tid == 10000 assert d['data.adc'].shape == (8, 16, 512, 1024) assert d['data.adc'].dims == ('module', 'cell', 'slow_scan', 'fast_scan') def test_write_virtual_cxi(mock_spb_proc_run, tmpdir): run = RunDirectory(mock_spb_proc_run) det = AGIPD1M(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape[1:] == (16, 512, 128) assert 'axes' in ds.attrs assert len(ds.virtual_sources()) == 16 # Check position of each source file in the modules dimension for src in ds.virtual_sources(): start, _, block, count = src.vspace.get_regular_hyperslab() assert block[1] == 1 assert count[1] == 1 expected_file = 'CORR-R0238-AGIPD{:0>2}-S00000.h5'.format(start[1]) assert osp.basename(src.file_name) == expected_file # Check presence of other datasets assert 'gain' in det_grp assert 'mask' in det_grp assert 'experiment_identifier' in det_grp def test_write_virtual_cxi_some_modules(mock_spb_proc_run, tmpdir): run = RunDirectory(mock_spb_proc_run) det = AGIPD1M(run, modules=[3, 4, 8, 15]) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (16, 512, 128) def test_write_virtual_cxi_jungfrau(mock_jungfrau_run, tmpdir): run = RunDirectory(mock_jungfrau_run) det = JUNGFRAU(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape[1:] == (8, 512, 1024) assert 'axes' in ds.attrs assert len(ds.virtual_sources()) == 8 # Check position of each source file in the modules dimension for src in ds.virtual_sources(): start, _, block, count = src.vspace.get_regular_hyperslab() assert block[1] == 1 assert count[1] == 1 expected_file = 'RAW-R0012-JNGFR{:0>2}-S00000.h5'.format( start[1] + 1) assert osp.basename(src.file_name) == expected_file # Check presence of other datasets assert 'gain' in det_grp assert 'mask' in det_grp assert 'experiment_identifier' in det_grp def test_write_virtual_cxi_jungfrau_some_modules(mock_jungfrau_run, tmpdir): run = RunDirectory(mock_jungfrau_run) det = JUNGFRAU(run, modules=[2, 3, 4, 6]) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with 
h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (8, 512, 1024) np.testing.assert_array_equal(det_grp['module_identifier'][:], np.arange(1,9)) def test_write_virtual_cxi_raw_data(mock_fxe_raw_run, tmpdir, caplog): import logging caplog.set_level(logging.INFO) run = RunDirectory(mock_fxe_raw_run) det = LPD1M(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (16, 1, 256, 256) def test_write_virtual_cxi_reduced_data(mock_reduced_spb_proc_run, tmpdir): run = RunDirectory(mock_reduced_spb_proc_run) det = AGIPD1M(run) test_file = osp.join(str(tmpdir), 'test.cxi') det.write_virtual_cxi(test_file) assert_isfile(test_file) with h5py.File(test_file, 'r') as f: det_grp = f['entry_1/instrument_1/detector_1'] ds = det_grp['data'] assert ds.shape[1:] == (16, 512, 128) def test_write_selected_frames(mock_spb_raw_run, tmp_path): run = RunDirectory(mock_spb_raw_run) det = AGIPD1M(run) trains = np.repeat(np.arange(10000, 10006), 2) pulses = np.tile([0, 5], 6) test_file = tmp_path / 'sel_frames.h5' det.write_frames(test_file, trains, pulses) assert_isfile(test_file) with H5File(test_file) as f: np.testing.assert_array_equal( f.get_array('SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId')[:, 0], pulses ) assert f.instrument_sources == { f'SPB_DET_AGIPD1M-1/DET/{i}CH0:xtdf' for i in range(16) } # pytest leaves temp files for inspection, but these files are big enough # to be inconvenient, so delete them if the assertions have passed. test_file.unlink() def test_write_selected_frames_proc(mock_spb_proc_run, tmp_path): run = RunDirectory(mock_spb_proc_run) det = AGIPD1M(run) trains = np.repeat(np.arange(10000, 10006), 2) pulses = np.tile([0, 7], 6) test_file = tmp_path / 'sel_frames.h5' det.write_frames(test_file, trains, pulses) assert_isfile(test_file) with H5File(test_file) as f: np.testing.assert_array_equal( f.get_array('SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.pulseId'), pulses ) assert f.instrument_sources == { f'SPB_DET_AGIPD1M-1/DET/{i}CH0:xtdf' for i in range(16) } # pytest leaves temp files for inspection, but these files are big enough # to be inconvenient, so delete them if the assertions have passed. 
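# (The file written by write_frames() is a regular EuXFEL-format HDF5 file,
# which is why it can be re-opened with H5File() for the checks above before
# being removed.)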
test_file.unlink() def test_identify_multimod_detectors(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) name, cls = identify_multimod_detectors(run, single=True) assert name == 'FXE_DET_LPD1M-1' assert cls is LPD1M dets = identify_multimod_detectors(run, single=False) assert dets == {(name, cls)} def test_identify_multimod_detectors_multi(mock_fxe_raw_run, mock_spb_raw_run): fxe_run = RunDirectory(mock_fxe_raw_run) spb_run = RunDirectory(mock_spb_raw_run) combined = fxe_run.select('*LPD1M*').union(spb_run) dets = identify_multimod_detectors(combined, single=False) assert dets == {('FXE_DET_LPD1M-1', LPD1M), ('SPB_DET_AGIPD1M-1', AGIPD1M)} with pytest.raises(ValueError): identify_multimod_detectors(combined, single=True) name, cls = identify_multimod_detectors(combined, single=True, clses=[AGIPD1M]) assert name == 'SPB_DET_AGIPD1M-1' assert cls is AGIPD1M ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627647111.0 EXtra-data-1.7.0/extra_data/tests/test_keydata.py0000664000175000017500000001145500000000000022631 0ustar00takluyvertakluyverimport numpy as np import pytest from extra_data import RunDirectory, H5File from extra_data.exceptions import TrainIDError def test_get_keydata(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) print(run.instrument_sources) am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] assert len(am0.files) == 1 assert am0.section == 'INSTRUMENT' assert am0.entry_shape == (2, 512, 128) assert am0.ndim == 4 assert am0.dtype == np.dtype('u2') xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert len(xgm_beam_x.files) == 2 assert xgm_beam_x.section == 'CONTROL' assert xgm_beam_x.entry_shape == () assert xgm_beam_x.ndim == 1 assert xgm_beam_x.dtype == np.dtype('f4') def test_select_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert xgm_beam_x.shape == (64,) sel1 = xgm_beam_x[:20] # Equivalent to .select_trains(np.s_[:20]) assert sel1.shape == (20,) assert len(sel1.files) == 1 # Empty selection sel2 = xgm_beam_x[80:] assert sel2.shape == (0,) assert len(sel2.files) == 0 assert sel2.xarray().shape == (0,) # Single train sel3 = xgm_beam_x[32] assert sel3.shape == (1,) def test_split_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert xgm_beam_x.shape == (64,) chunks = list(xgm_beam_x.split_trains(3)) assert len(chunks) == 3 assert {c.shape for c in chunks} == {(21,), (22,)} assert chunks[0].ndarray().shape == chunks[0].shape chunks = list(xgm_beam_x.split_trains(3, trains_per_part=20)) assert len(chunks) == 4 assert {c.shape for c in chunks} == {(16,)} def test_nodata(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) cam_pix = run['FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput', 'data.image.pixels'] assert cam_pix.train_ids == list(range(10000, 10480)) assert len(cam_pix.files) == 2 assert cam_pix.shape == (0, 255, 1024) arr = cam_pix.xarray() assert arr.shape == (0, 255, 1024) assert arr.dtype == np.dtype('u2') assert list(cam_pix.trains()) == [] tid, data = cam_pix.train_from_id(10010) assert tid == 10010 assert data.shape == (0, 255, 1024) def test_iter_trains(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] assert [t for (t, _) in xgm_beam_x.trains()] == list(range(10000, 10064)) for _, v in xgm_beam_x.trains(): assert isinstance(v, 
np.float32) break def test_get_train(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] tid, val = xgm_beam_x.train_from_id(10005) assert tid == 10005 assert isinstance(val, np.float32) with pytest.raises(TrainIDError): xgm_beam_x.train_from_id(11000) tid, _ = xgm_beam_x.train_from_index(-10) assert tid == 10054 with pytest.raises(IndexError): xgm_beam_x.train_from_index(9999) def test_data_counts(mock_reduced_spb_proc_run): run = RunDirectory(mock_reduced_spb_proc_run) # control data xgm_beam_x = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value'] count = xgm_beam_x.data_counts() assert count.index.tolist() == xgm_beam_x.train_ids assert (count.values == 1).all() # instrument data camera = run['SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', 'data.image.pixels'] count = camera.data_counts() assert count.index.tolist() == camera.train_ids mod = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] count = mod.data_counts() assert count.index.tolist() == mod.train_ids assert count.values.sum() == mod.shape[0] def test_select_by(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) am0 = run['SPB_DET_AGIPD1M-1/DET/0CH0:xtdf', 'image.data'] subrun = run.select(am0) assert subrun.all_sources == {am0.source} assert subrun.keys_for_source(am0.source) == {am0.key} def test_drop_empty_trains(mock_sa3_control_data): f = H5File(mock_sa3_control_data) beamview = f['SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data.image.dims'] assert len(beamview.train_ids) == 500 a1 = beamview.ndarray() assert a1.shape == (250, 2) frame_counts = beamview.data_counts(labelled=False) assert frame_counts.shape == (500,) assert frame_counts.min() == 0 beamview_w_data = beamview.drop_empty_trains() assert len(beamview_w_data.train_ids) == 250 np.testing.assert_array_equal(beamview_w_data.ndarray(), a1) frame_counts = beamview_w_data.data_counts(labelled=False) assert frame_counts.shape == (250,) assert frame_counts.min() == 1 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1622824936.0 EXtra-data-1.7.0/extra_data/tests/test_lsxfel.py0000664000175000017500000000061700000000000022502 0ustar00takluyvertakluyverfrom extra_data import lsxfel def test_lsxfel_file(mock_lpd_data, capsys): lsxfel.summarise_file(mock_lpd_data) out, err = capsys.readouterr() assert "480 trains, 128 frames/train" in out def test_lsxfel_run(mock_fxe_raw_run, capsys): lsxfel.summarise_run(mock_fxe_raw_run) out, err = capsys.readouterr() assert "480 trains" in out assert "16 detector files" in out ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/test_open_file_limiter.py0000644000175000017500000000360400000000000024667 0ustar00takluyvertakluyverimport gc import os import pytest from extra_data import file_access from extra_data.reader import DataCollection @pytest.fixture def files_limit_512(): orig_limiter = file_access.open_files_limiter file_access.open_files_limiter = l = file_access.OpenFilesLimiter(512) yield l file_access.open_files_limiter = orig_limiter @pytest.fixture def files_limit_3(): orig_limiter = file_access.open_files_limiter file_access.open_files_limiter = l = file_access.OpenFilesLimiter(3) yield l file_access.open_files_limiter = orig_limiter def test_filecache_large(mock_spb_raw_run, files_limit_512): fc = files_limit_512 files = [os.path.join(mock_spb_raw_run, f) \ for f in os.listdir(mock_spb_raw_run) if f.endswith('.h5')] run = 
DataCollection.from_paths(files) trains_iter = run.trains() tid, data = next(trains_iter) assert tid == 10000 device = 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput' assert device in data assert data[device]['data.image.pixels'].shape == (1024, 768) # 16 AGIPD files + 1st DA file, but the other sequence file may be opened assert fc.n_open_files() >= 17 del run, trains_iter gc.collect() assert fc.n_open_files() == 0 def test_filecache_small(mock_spb_raw_run, files_limit_3): fc = files_limit_3 files = [os.path.join(mock_spb_raw_run, f) \ for f in os.listdir(mock_spb_raw_run) if f.endswith('.h5')] run = DataCollection.from_paths(files) trains_iter = run.trains() for i in range(3): tid, data = next(trains_iter) assert tid == 10000 + i for j in range(16): device = f'SPB_DET_AGIPD1M-1/DET/{j}CH0:xtdf' assert device in data assert data[device]['image.data'].shape == (64, 2, 512, 128) assert len(fc._cache) == 3 del run, trains_iter gc.collect() assert fc.n_open_files() == 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/test_read_machinery.py0000644000175000017500000000065600000000000024160 0ustar00takluyvertakluyverimport os import os.path as osp from unittest import mock from extra_data import read_machinery def test_find_proposal(tmpdir): prop_dir = osp.join(str(tmpdir), 'SPB', '201701', 'p002012') os.makedirs(prop_dir) with mock.patch.object(read_machinery, 'DATA_ROOT_DIR', str(tmpdir)): assert read_machinery.find_proposal('p002012') == prop_dir assert read_machinery.find_proposal(prop_dir) == prop_dir ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627645754.0 EXtra-data-1.7.0/extra_data/tests/test_reader_mockdata.py0000664000175000017500000007215100000000000024314 0ustar00takluyvertakluyverfrom datetime import datetime, timedelta, timezone from itertools import islice import h5py import numpy as np import os import pandas as pd import pytest import stat from tempfile import mkdtemp from testpath import assert_isfile from unittest import mock from xarray import DataArray from extra_data import ( H5File, RunDirectory, by_index, by_id, SourceNameError, PropertyNameError, DataCollection, open_run, MultiRunError ) def test_iterate_trains(mock_agipd_data): with H5File(mock_agipd_data) as f: for train_id, data in islice(f.trains(), 10): assert train_id in range(10000, 10250) assert 'SPB_DET_AGIPD1M-1/DET/7CH0:xtdf' in data assert len(data) == 1 assert 'image.data' in data['SPB_DET_AGIPD1M-1/DET/7CH0:xtdf'] def test_iterate_trains_flat_keys(mock_agipd_data): with H5File(mock_agipd_data) as f: for train_id, data in islice(f.trains(flat_keys=True), 10): assert train_id in range(10000, 10250) assert ('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.data') in data def test_get_train_bad_device_name(mock_spb_control_data_badname): # Check that we can handle devices which don't have the standard Karabo # name structure A/B/C. 
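# Here the source is recorded as 'SPB_IRU_SIDEMIC_CAM:daqOutput', a bare
# device name plus an output channel rather than the usual DOMAIN/TYPE/NAME
# pattern, and it should still be readable train by train.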
with H5File(mock_spb_control_data_badname) as f: train_id, data = f.train_from_id(10004) assert train_id == 10004 device = 'SPB_IRU_SIDEMIC_CAM:daqOutput' assert device in data assert 'data.image.dims' in data[device] dims = data[device]['data.image.dims'] assert list(dims) == [1000, 1000] def test_detector_info_oldfmt(mock_agipd_data): with H5File(mock_agipd_data) as f: di = f.detector_info('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf') assert di['dims'] == (512, 128) assert di['frames_per_train'] == 64 assert di['total_frames'] == 16000 def test_detector_info(mock_lpd_data): with H5File(mock_lpd_data) as f: di = f.detector_info('FXE_DET_LPD1M-1/DET/0CH0:xtdf') assert di['dims'] == (256, 256) assert di['frames_per_train'] == 128 assert di['total_frames'] == 128 * 480 def test_train_info(mock_lpd_data, capsys): with H5File(mock_lpd_data) as f: f.train_info(10004) out, err = capsys.readouterr() assert "Devices" in out assert "FXE_DET_LPD1M-1/DET/0CH0:xtdf" in out def test_iterate_trains_fxe(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: for train_id, data in islice(f.trains(), 10): assert train_id in range(10000, 10400) assert 'SA1_XTD2_XGM/DOOCS/MAIN' in data.keys() assert 'beamPosition.ixPos.value' in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'data.image.pixels' in data['FXE_XAD_GEC/CAM/CAMERA:daqOutput'] assert 'data.image.pixels' not in data['FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput'] def test_iterate_file_select_trains(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: tids = [tid for (tid, _) in f.trains(train_range=by_id[:10003])] assert tids == [10000, 10001, 10002] tids = [tid for (tid, _) in f.trains(train_range=by_index[-2:])] assert tids == [10398, 10399] def test_iterate_trains_select_keys(mock_fxe_control_data): sel = { 'SA1_XTD2_XGM/DOOCS/MAIN': { 'beamPosition.ixPos.value', 'beamPosition.ixPos.timestamp', } } with H5File(mock_fxe_control_data) as f: for train_id, data in islice(f.trains(devices=sel), 10): assert train_id in range(10000, 10400) assert 'SA1_XTD2_XGM/DOOCS/MAIN' in data.keys() assert 'beamPosition.ixPos.value' in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'beamPosition.ixPos.timestamp' in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'beamPosition.iyPos.value' not in data['SA1_XTD2_XGM/DOOCS/MAIN'] assert 'SA3_XTD10_VAC/TSENS/S30160K' not in data def test_iterate_trains_require_all(mock_sa3_control_data): with H5File(mock_sa3_control_data) as f: trains_iter = f.trains( devices=[('*/CAM/BEAMVIEW:daqOutput', 'data.image.dims')], require_all=True ) tids = [t for (t, _) in trains_iter] assert tids == [] trains_iter = f.trains( devices=[('*/CAM/BEAMVIEW:daqOutput', 'data.image.dims')], require_all=False ) tids = [t for (t, _) in trains_iter] assert tids != [] def test_read_fxe_raw_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert len(run.files) == 18 # 16 detector modules + 2 control data files assert run.train_ids == list(range(10000, 10480)) run.info() # Smoke test def test_read_fxe_raw_run_selective(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run, include='*DA*') assert run.train_ids == list(range(10000, 10480)) assert 'SA1_XTD2_XGM/DOOCS/MAIN' in run.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' not in run.detector_sources run = RunDirectory(mock_fxe_raw_run, include='*LPD*') assert run.train_ids == list(range(10000, 10480)) assert 'SA1_XTD2_XGM/DOOCS/MAIN' not in run.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in run.detector_sources def test_read_spb_proc_run(mock_spb_proc_run): run = 
RunDirectory(mock_spb_proc_run) #Test for calib data assert len(run.files) == 16 # only 16 detector modules for calib data assert run.train_ids == list(range(10000, 10064)) #64 trains tid, data = next(run.trains()) device = 'SPB_DET_AGIPD1M-1/DET/15CH0:xtdf' assert tid == 10000 for prop in ('image.gain', 'image.mask', 'image.data'): assert prop in data[device] assert 'u1' == data[device]['image.gain'].dtype assert 'u4' == data[device]['image.mask'].dtype assert 'f4' == data[device]['image.data'].dtype run.info() # Smoke test def test_iterate_spb_raw_run(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) trains_iter = run.trains() tid, data = next(trains_iter) assert tid == 10000 device = 'SPB_IRU_CAM/CAM/SIDEMIC:daqOutput' assert device in data assert data[device]['data.image.pixels'].shape == (1024, 768) def test_properties_fxe_raw_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert run.train_ids == list(range(10000, 10480)) assert 'SPB_XTD9_XGM/DOOCS/MAIN' in run.control_sources assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in run.instrument_sources def test_iterate_fxe_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) trains_iter = run.trains() tid, data = next(trains_iter) assert tid == 10000 assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' in data assert 'firmwareVersion.value' in data['FXE_XAD_GEC/CAM/CAMERA'] def test_iterate_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) tids = [tid for (tid, _) in run.trains(train_range=by_id[10004:10006])] assert tids == [10004, 10005] tids = [tid for (tid, _) in run.trains(train_range=by_id[:10003])] assert tids == [10000, 10001, 10002] # Overlap with start of run tids = [tid for (tid, _) in run.trains(train_range=by_id[9000:10003])] assert tids == [10000, 10001, 10002] # Overlap with end of run tids = [tid for (tid, _) in run.trains(train_range=by_id[10478:10500])] assert tids == [10478, 10479] # Not overlapping with pytest.raises(ValueError) as excinfo: list(run.trains(train_range=by_id[9000:9050])) assert 'before' in str(excinfo.value) with pytest.raises(ValueError) as excinfo: list(run.trains(train_range=by_id[10500:10550])) assert 'after' in str(excinfo.value) tids = [tid for (tid, _) in run.trains(train_range=by_index[4:6])] assert tids == [10004, 10005] def test_iterate_run_glob_devices(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) trains_iter = run.trains([("*/DET/*", "image.data")]) tid, data = next(trains_iter) assert tid == 10000 assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'detector.data' not in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' not in data def test_train_by_id_fxe_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) _, data = run.train_from_id(10024) assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' in data assert 'firmwareVersion.value' in data['FXE_XAD_GEC/CAM/CAMERA'] def test_train_by_id_fxe_run_selection(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) _, data = run.train_from_id(10024, [('*/DET/*', 'image.data')]) assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' not in data def test_train_from_index_fxe_run(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) _, data = 
run.train_from_index(479) assert 'FXE_DET_LPD1M-1/DET/15CH0:xtdf' in data assert 'image.data' in data['FXE_DET_LPD1M-1/DET/15CH0:xtdf'] assert 'FXE_XAD_GEC/CAM/CAMERA' in data assert 'firmwareVersion.value' in data['FXE_XAD_GEC/CAM/CAMERA'] def test_file_get_series_control(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: s = f.get_series('SA1_XTD2_XGM/DOOCS/MAIN', "beamPosition.iyPos.value") assert isinstance(s, pd.Series) assert len(s) == 400 assert s.index[0] == 10000 def test_file_get_series_instrument(mock_spb_proc_run): agipd_file = os.path.join(mock_spb_proc_run, 'CORR-R0238-AGIPD07-S00000.h5') with H5File(agipd_file) as f: s = f.get_series('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'header.linkId') assert isinstance(s, pd.Series) assert len(s) == 64 assert s.index[0] == 10000 # Multiple readings per train s2 = f.get_series('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId') assert isinstance(s2, pd.Series) assert not s2.index.is_unique assert len(s2) == 64 * 64 assert len(s2.loc[10000:10004]) == 5 * 64 sel = f.select_trains(by_index[5:10]) s3 = sel.get_series('SPB_DET_AGIPD1M-1/DET/7CH0:xtdf', 'image.pulseId') assert isinstance(s3, pd.Series) assert not s3.index.is_unique assert len(s3) == 5 * 64 np.testing.assert_array_equal( s3.index.values, np.arange(10005, 10010).repeat(64) ) def test_run_get_series_control(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) s = run.get_series('SA1_XTD2_XGM/DOOCS/MAIN', "beamPosition.iyPos.value") assert isinstance(s, pd.Series) assert len(s) == 480 assert list(s.index) == list(range(10000, 10480)) def test_run_get_series_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) sel = run.select_trains(by_id[10100:10150]) s = sel.get_series('SA1_XTD2_XGM/DOOCS/MAIN', "beamPosition.iyPos.value") assert isinstance(s, pd.Series) assert len(s) == 50 assert list(s.index) == list(range(10100, 10150)) def test_run_get_dataframe(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) df = run.get_dataframe(fields=[("*_XGM/*", "*.i[xy]Pos*")]) assert len(df.columns) == 4 assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos" in df.columns df2 = run.get_dataframe(fields=[("*_XGM/*", "*.i[xy]Pos*")], timestamps=True) assert len(df2.columns) == 8 assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos" in df2.columns assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos.timestamp" in df2.columns def test_file_get_array(mock_fxe_control_data): with H5File(mock_fxe_control_data) as f: arr = f.get_array('FXE_XAD_GEC/CAM/CAMERA:daqOutput', 'data.image.pixels') assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'dim_0', 'dim_1') assert arr.shape == (400, 255, 1024) assert arr.coords['trainId'][0] == 10000 def test_file_get_array_missing_trains(mock_sa3_control_data): with H5File(mock_sa3_control_data) as f: sel = f.select_trains(by_index[:6]) arr = sel.get_array( 'SA3_XTD10_IMGFEL/CAM/BEAMVIEW2:daqOutput', 'data.image.dims' ) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'dim_0') assert arr.shape == (3, 2) np.testing.assert_array_less(arr.coords['trainId'], 10006) np.testing.assert_array_less(10000, arr.coords['trainId']) def test_file_get_array_control_roi(mock_sa3_control_data): with H5File(mock_sa3_control_data) as f: sel = f.select_trains(by_index[:6]) arr = sel.get_array( 'SA3_XTD10_VAC/DCTRL/D6_APERT_IN_OK', 'interlock.a1.AActCommand.value', roi=by_index[:25], ) assert isinstance(arr, DataArray) assert arr.shape == (6, 25) assert arr.coords['trainId'][0] == 10000 @pytest.mark.parametrize('name_in, name_out', [ 
(None, 'SA1_XTD2_XGM/DOOCS/MAIN:output.data.intensityTD'), ('SA1_XGM', 'SA1_XGM') ], ids=['defaultName', 'explicitName']) def test_run_get_array(mock_fxe_raw_run, name_in, name_out): run = RunDirectory(mock_fxe_raw_run) arr = run.get_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', extra_dims=['pulse'], name=name_in ) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'pulse') assert arr.shape == (480, 1000) assert arr.coords['trainId'][0] == 10000 assert arr.name == name_out def test_run_get_array_empty(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) arr = run.get_array('FXE_XAD_GEC/CAM/CAMERA_NODATA:daqOutput', 'data.image.pixels') assert isinstance(arr, DataArray) assert arr.dims[0] == 'trainId' assert arr.shape == (0, 255, 1024) def test_run_get_array_error(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) with pytest.raises(SourceNameError): run.get_array('bad_name', 'data.intensityTD') with pytest.raises(PropertyNameError): run.get_array('SA1_XTD2_XGM/DOOCS/MAIN:output', 'bad_name') def test_run_get_array_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) sel = run.select_trains(by_id[10100:10150]) arr = sel.get_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', extra_dims=['pulse'] ) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'pulse') assert arr.shape == (50, 1000) assert arr.coords['trainId'][0] == 10100 def test_run_get_array_roi(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) arr = run.get_array('SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', extra_dims=['pulse'], roi=by_index[:16]) assert isinstance(arr, DataArray) assert arr.dims == ('trainId', 'pulse') assert arr.shape == (480, 16) assert arr.coords['trainId'][0] == 10000 def test_run_get_array_multiple_per_train(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) sel = run.select_trains(np.s_[:2]) arr = sel.get_array( 'FXE_DET_LPD1M-1/DET/6CH0:xtdf', 'image.data', roi=np.s_[:, 10:20, 20:40] ) assert isinstance(arr, DataArray) assert arr.shape == (256, 1, 10, 20) np.testing.assert_array_equal(arr.coords['trainId'], np.repeat([10000, 10001], 128)) def test_run_get_virtual_dataset(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) ds = run.get_virtual_dataset('FXE_DET_LPD1M-1/DET/6CH0:xtdf', 'image.data') assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape == (61440, 1, 256, 256) # Across two sequence files ds = run.get_virtual_dataset( 'FXE_XAD_GEC/CAM/CAMERA:daqOutput', 'data.image.pixels' ) assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape == (480, 255, 1024) def test_run_get_virtual_dataset_filename(mock_fxe_raw_run, tmpdir): run = RunDirectory(mock_fxe_raw_run) path = str(tmpdir / 'test-vds.h5') ds = run.get_virtual_dataset( 'FXE_DET_LPD1M-1/DET/6CH0:xtdf', 'image.data', filename=path ) assert_isfile(path) assert ds.file.filename == path assert isinstance(ds, h5py.Dataset) assert ds.is_virtual assert ds.shape == (61440, 1, 256, 256) def test_run_get_dask_array(mock_fxe_raw_run): import dask.array as da run = RunDirectory(mock_fxe_raw_run) arr = run.get_dask_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', ) assert isinstance(arr, da.Array) assert arr.shape == (480, 1000) assert arr.dtype == np.float32 def test_run_get_dask_array_labelled(mock_fxe_raw_run): import dask.array as da run = RunDirectory(mock_fxe_raw_run) arr = run.get_dask_array( 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD', labelled=True ) assert isinstance(arr, DataArray) assert 
isinstance(arr.data, da.Array) assert arr.dims == ('trainId', 'dim_0') assert arr.shape == (480, 1000) assert arr.coords['trainId'][0] == 10000 def test_select(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert 'SPB_XTD9_XGM/DOOCS/MAIN' in run.control_sources # Basic selection machinery, glob API sel = run.select('*/DET/*', 'image.pulseId') assert 'SPB_XTD9_XGM/DOOCS/MAIN' not in sel.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in sel.instrument_sources _, data = sel.train_from_id(10000) for source, source_data in data.items(): assert set(source_data.keys()) == {'image.pulseId', 'metadata'} # Basic selection machinery, dict-based API sel_by_dict = run.select({ 'SA1_XTD2_XGM/DOOCS/MAIN': None, 'FXE_DET_LPD1M-1/DET/0CH0:xtdf': {'image.pulseId'} }) assert sel_by_dict.control_sources == {'SA1_XTD2_XGM/DOOCS/MAIN'} assert sel_by_dict.instrument_sources == {'FXE_DET_LPD1M-1/DET/0CH0:xtdf'} assert sel_by_dict.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') == \ sel.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') # Re-select using * selection, should yield the same keys. assert sel.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') == \ sel.select('FXE_DET_LPD1M-1/DET/0CH0:xtdf', '*') \ .keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') assert sel.keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') == \ sel.select({'FXE_DET_LPD1M-1/DET/0CH0:xtdf': {}}) \ .keys_for_source('FXE_DET_LPD1M-1/DET/0CH0:xtdf') # Re-select a different but originally valid key, should fail. with pytest.raises(ValueError): # ValueError due to globbing. sel.select('FXE_DET_LPD1M-1/DET/0CH0:xtdf', 'image.trainId') with pytest.raises(PropertyNameError): # PropertyNameError via explicit key. sel.select({'FXE_DET_LPD1M-1/DET/0CH0:xtdf': {'image.trainId'}}) # Select by another DataCollection. sel_by_dc = run.select(sel) assert sel_by_dc.control_sources == sel.control_sources assert sel_by_dc.instrument_sources == sel.instrument_sources assert sel_by_dc.train_ids == sel.train_ids @pytest.mark.parametrize('select_str', ['*/BEAMVIEW2:daqOutput', '*/BEAMVIEW2*', '*']) def test_select_require_all(mock_sa3_control_data, select_str): # De-select two sources in this example set, which have no trains # at all, to allow matching trains across all sources with the same # result. run = H5File(mock_sa3_control_data) \ .deselect([('SA3_XTD10_MCP/ADC/1:*', '*'), ('SA3_XTD10_IMGFEL/CAM/BEAMVIEW:*', '*')]) subrun = run.select(select_str, require_all=True) np.testing.assert_array_equal(subrun.train_ids, run.train_ids[1::2]) # The train IDs are held by ndarrays during this operation, make # sure it's a list of np.uint64 again. 
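# (Selections should hand train_ids back to the caller as a plain Python
# list, not the intermediate ndarray used while matching trains.)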
assert isinstance(subrun.train_ids, list) assert all([isinstance(x, np.uint64) for x in subrun.train_ids]) def test_deselect(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) xtd9_xgm = 'SPB_XTD9_XGM/DOOCS/MAIN' assert xtd9_xgm in run.control_sources sel = run.deselect('*_XGM/DOOCS*') assert xtd9_xgm not in sel.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in sel.instrument_sources sel = run.deselect('*_XGM/DOOCS*', '*.ixPos') assert xtd9_xgm in sel.control_sources assert 'beamPosition.ixPos.value' not in sel.selection[xtd9_xgm] assert 'beamPosition.iyPos.value' in sel.selection[xtd9_xgm] sel = run.deselect(run.select('*_XGM/DOOCS*')) assert xtd9_xgm not in sel.control_sources assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in sel.instrument_sources def test_select_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert len(run.train_ids) == 480 sel = run.select_trains(by_id[10200:10220]) assert sel.train_ids == list(range(10200, 10220)) sel = run.select_trains(by_index[:10]) assert sel.train_ids == list(range(10000, 10010)) with pytest.raises(ValueError): run.select_trains(by_id[9000:9100]) # Before data with pytest.raises(ValueError): run.select_trains(by_id[12000:12500]) # After data # Select a list of train IDs sel = run.select_trains(by_id[[9950, 10000, 10101, 10500]]) assert sel.train_ids == [10000, 10101] with pytest.raises(ValueError): run.select_trains(by_id[[9900, 10600]]) # Select a list of indexes sel = run.select_trains(by_index[[5, 25]]) assert sel.train_ids == [10005, 10025] with pytest.raises(IndexError): run.select_trains(by_index[[480]]) def test_split_trains(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) assert len(run.train_ids) == 480 chunks = list(run.split_trains(3)) assert len(chunks) == 3 assert {len(c.train_ids) for c in chunks} == {160} chunks = list(run.split_trains(4, trains_per_part=100)) assert len(chunks) == 5 assert {len(c.train_ids) for c in chunks} == {96} def test_train_timestamps(mock_scs_run): run = RunDirectory(mock_scs_run) tss = run.train_timestamps(labelled=False) assert isinstance(tss, np.ndarray) assert tss.shape == (len(run.train_ids),) assert tss.dtype == np.dtype('datetime64[ns]') assert np.all(np.diff(tss).astype(np.uint64) > 0) # Convert numpy datetime64[ns] to Python datetime (dropping some precision) dt0 = tss[0].astype('datetime64[ms]').item().replace(tzinfo=timezone.utc) now = datetime.now(timezone.utc) assert dt0 > (now - timedelta(days=1)) # assuming tests take < 1 day to run assert dt0 < now tss_ser = run.train_timestamps(labelled=True) assert isinstance(tss_ser, pd.Series) np.testing.assert_array_equal(tss_ser.values, tss) np.testing.assert_array_equal(tss_ser.index, run.train_ids) def test_train_timestamps_nat(mock_fxe_control_data): f = H5File(mock_fxe_control_data) tss = f.train_timestamps() assert tss.shape == (len(f.train_ids),) if f.files[0].format_version == '0.5': assert np.all(np.isnat(tss)) else: assert not np.any(np.isnat(tss)) def test_union(mock_fxe_raw_run): run = RunDirectory(mock_fxe_raw_run) sel1 = run.select('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos') sel2 = run.select('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.iyPos') joined = sel1.union(sel2) assert joined.control_sources == {'SPB_XTD9_XGM/DOOCS/MAIN'} assert joined.selection == { 'SPB_XTD9_XGM/DOOCS/MAIN': { 'beamPosition.ixPos.value', 'beamPosition.iyPos.value', } } sel1 = run.select_trains(by_id[10200:10220]) sel2 = run.select_trains(by_index[:10]) joined = sel1.union(sel2) assert joined.train_ids == list(range(10000, 10010)) + 
list(range(10200, 10220)) def test_union_raw_proc(mock_spb_raw_run, mock_spb_proc_run): raw_run = RunDirectory(mock_spb_raw_run) proc_run = RunDirectory(mock_spb_proc_run) run = raw_run.deselect('*AGIPD1M*').union(proc_run) assert run.all_sources == (raw_run.all_sources | proc_run.all_sources) def test_read_skip_invalid(mock_lpd_data, empty_h5_file, capsys): d = DataCollection.from_paths([mock_lpd_data, empty_h5_file]) assert d.instrument_sources == {'FXE_DET_LPD1M-1/DET/0CH0:xtdf'} out, err = capsys.readouterr() assert "Skipping file" in err def test_run_immutable_sources(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) before = len(test_run.all_sources) with pytest.raises(AttributeError): test_run.all_sources.pop() assert len(test_run.all_sources) == before def test_open_run(mock_spb_raw_run, mock_spb_proc_run, tmpdir): prop_dir = os.path.join(str(tmpdir), 'SPB', '201830', 'p002012') # Set up raw os.makedirs(os.path.join(prop_dir, 'raw')) os.symlink(mock_spb_raw_run, os.path.join(prop_dir, 'raw', 'r0238')) # Set up proc os.makedirs(os.path.join(prop_dir, 'proc')) os.symlink(mock_spb_proc_run, os.path.join(prop_dir, 'proc', 'r0238')) with mock.patch('extra_data.read_machinery.DATA_ROOT_DIR', str(tmpdir)): # With integers run = open_run(proposal=2012, run=238) paths = {f.filename for f in run.files} assert paths for path in paths: assert '/raw/' in path # With strings run = open_run(proposal='2012', run='238') assert {f.filename for f in run.files} == paths # With numpy integers run = open_run(proposal=np.int64(2012), run=np.uint16(238)) assert {f.filename for f in run.files} == paths # Proc folder proc_run = open_run(proposal=2012, run=238, data='proc') proc_paths = {f.filename for f in proc_run.files} assert proc_paths for path in proc_paths: assert '/raw/' not in path # All folders all_run = open_run(proposal=2012, run=238, data='all') # Raw contains all sources. assert run.all_sources == all_run.all_sources # Proc is a true subset. assert proc_run.all_sources < all_run.all_sources for source, files in all_run._source_index.items(): for file in files: if '/DET/' in source: # AGIPD data is in proc. assert '/raw/' not in file.filename else: # Non-AGIPD data is in raw. 
# (CAM, XGM) assert '/proc/' not in file.filename # Run that doesn't exist with pytest.raises(Exception): open_run(proposal=2012, run=999) def test_open_file(mock_sa3_control_data): f = H5File(mock_sa3_control_data) file_access = f.files[0] assert file_access.format_version in ('0.5', '1.0') assert 'SA3_XTD10_VAC/TSENS/S30180K' in f.control_sources if file_access.format_version == '0.5': assert 'METADATA/dataSourceId' in file_access.file else: assert 'METADATA/dataSources/dataSourceId' in file_access.file @pytest.mark.skipif(hasattr(os, 'geteuid') and os.geteuid() == 0, reason="cannot run permission tests as root") def test_permission(): d = mkdtemp() os.chmod(d, not stat.S_IRUSR) with pytest.raises(PermissionError) as excinfo: run = RunDirectory(d) assert "Permission denied" in str(excinfo.value) assert d in str(excinfo.value) def test_empty_file_info(mock_empty_file, capsys): f = H5File(mock_empty_file) f.info() # smoke test def test_get_data_counts(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) count = run.get_data_counts('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value') assert count.index.tolist() == run.train_ids assert (count.values == 1).all() def test_get_run_value(mock_fxe_control_data): f = H5File(mock_fxe_control_data) src = 'FXE_XAD_GEC/CAM/CAMERA' val = f.get_run_value(src, 'firmwareVersion') assert isinstance(val, np.int32) assert f.get_run_value(src, 'firmwareVersion.value') == val with pytest.raises(SourceNameError): f.get_run_value(src + '_NONEXIST', 'firmwareVersion') with pytest.raises(PropertyNameError): f.get_run_value(src, 'non.existant') def test_get_run_value_union(mock_fxe_control_data, mock_sa3_control_data): f = H5File(mock_fxe_control_data) f2 = H5File(mock_sa3_control_data) data = f.union(f2) with pytest.raises(MultiRunError): data.get_run_value('FXE_XAD_GEC/CAM/CAMERA', 'firmwareVersion') with pytest.raises(MultiRunError): data.get_run_values('FXE_XAD_GEC/CAM/CAMERA') def test_get_run_values(mock_fxe_control_data): f = H5File(mock_fxe_control_data) src = 'FXE_XAD_GEC/CAM/CAMERA' d = f.get_run_values(src, ) assert isinstance(d['firmwareVersion.value'], np.int32) assert isinstance(d['enableShutter.value'], np.uint8) def test_run_metadata(mock_spb_raw_run): run = RunDirectory(mock_spb_raw_run) md = run.run_metadata() if run.files[0].format_version == '0.5': assert md == {'dataFormatVersion': '0.5'} else: assert md['dataFormatVersion'] == '1.0' assert set(md) == { 'dataFormatVersion', 'creationDate', 'updateDate', 'daqLibrary', 'karaboFramework', 'proposalNumber', 'runNumber', 'runType', 'sample', 'sequenceNumber', } assert isinstance(md['creationDate'], str) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/tests/test_run_files_map.py0000664000175000017500000000516500000000000024033 0ustar00takluyvertakluyverimport h5py import numpy as np import os import pytest import unittest.mock as mock from .mockdata import write_file from .mockdata.xgm import XGM from extra_data import run_files_map, RunDirectory def test_candidate_paths(tmp_path): # 'real' paths (like /gpfs/exfel/d) prop_raw_path = tmp_path / 'raw' / 'FXE' / '201901' / 'p001234' run_dir = prop_raw_path / 'r0450' run_dir.mkdir(parents=True) # stable paths (like /gpfs/exfel/exp) exp = tmp_path / 'exp' prop_dir = exp / 'FXE' / '201901' / 'p001234' prop_scratch = exp / 'FXE' / '201901' / 'p001234' / 'scratch' prop_scratch.mkdir(parents=True) (prop_dir / 'raw').symlink_to(prop_raw_path) run_in_exp = prop_dir / 'raw' / 'r0450' 
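# Both paths point at the same run: run_dir is the 'real' location under
# raw/, while run_in_exp reaches it through the proposal's 'raw' symlink.
# Either way, the candidate cache locations should be the run directory
# itself plus the proposal's scratch/.karabo_data_maps folder.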
with mock.patch.object(run_files_map, 'SCRATCH_ROOT_DIR', str(exp)): rfm = run_files_map.RunFilesMap(str(run_dir)) rfm_exp = run_files_map.RunFilesMap(str(run_in_exp)) assert rfm.candidate_paths == [ str(run_dir / 'karabo_data_map.json'), str(prop_scratch / '.karabo_data_maps' / 'raw_r0450.json'), ] assert rfm_exp.candidate_paths == [ str(run_in_exp / 'karabo_data_map.json'), str(prop_scratch / '.karabo_data_maps' / 'raw_r0450.json'), ] @pytest.fixture() def run_with_extra_file(mock_fxe_raw_run): extra_file = os.path.join(mock_fxe_raw_run, 'RAW-R0450-DA02-S00000.h5') write_file(extra_file, [ XGM('FXE_TEST_XGM/DOOCS/MAIN'), ], ntrains=480) try: yield mock_fxe_raw_run, extra_file finally: os.unlink(extra_file) def test_save_load_map(run_with_extra_file, tmp_path): run_dir, extra_file = run_with_extra_file run_map_path = str(tmp_path / 'kd_test_run_map.json') class TestRunFilesMap(run_files_map.RunFilesMap): def map_paths_for_run(self, directory): return [run_map_path] rfm = TestRunFilesMap(run_dir) assert rfm.files_data == {} with RunDirectory(run_dir) as run: rfm.save(run.files) rfm2 = TestRunFilesMap(run_dir) assert rfm2.cache_file == run_map_path file_info = rfm2.get(extra_file) assert isinstance(file_info['train_ids'], np.ndarray) assert isinstance(file_info['control_sources'], frozenset) assert isinstance(file_info['instrument_sources'], frozenset) assert isinstance(file_info['flag'], np.ndarray) np.testing.assert_array_equal(file_info['flag'], True) # Modify a file; this should make the cache invalid with h5py.File(extra_file, 'r+') as f: f.attrs['test_save_load_map'] = 1 rfm3 = TestRunFilesMap(run_dir) assert rfm3.cache_file == run_map_path assert rfm3.get(extra_file) is None ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/test_slice_objs.py0000644000175000017500000000077700000000000023326 0ustar00takluyvertakluyverfrom extra_data import by_id, by_index def test_slicing_reprs(): ns = {'by_id': by_id, 'by_index': by_index} samples = [ 'by_id[:]', 'by_id[:2]', 'by_id[0:10:2]', 'by_id[4::2, 7]', 'by_index[:5, 3:12]', 'by_index[-4:, ...]', 'by_index[...]', 'by_index[..., ::-1]', ] # These examples are canonically formatted, so their repr() should match for expr in samples: obj = eval(expr, ns) assert repr(obj) == expr ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/tests/test_stacking.py0000664000175000017500000001400400000000000023003 0ustar00takluyvertakluyverimport numpy as np import pytest from extra_data import RunDirectory, stack_data, stack_detector_data from extra_data.stacking import StackView def test_stack_data(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) comb = stack_data(data, 'image.data') assert comb.shape == (128, 1, 16, 256, 256) def test_stack_detector_data(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) comb = stack_detector_data(data, 'image.data') assert comb.shape == (128, 1, 16, 256, 256) def test_stack_detector_data_missing(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) # Three variants of missing data: # 1. Source missing del data['FXE_DET_LPD1M-1/DET/3CH0:xtdf'] # 2. 
Key missing del data['FXE_DET_LPD1M-1/DET/7CH0:xtdf']['image.data'] # 3. Empty array missing = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (1, 5, 9, 15)] for module in missing: data[module]['image.data'] = np.zeros((0, 1, 256, 256), dtype=np.uint16) comb = stack_detector_data(data, 'image.data', fillvalue=22) assert comb.shape == (128, 1, 16, 256, 256) assert not (comb[:, :, 0] == 22).any() # Control assert (comb[:, :, 3] == 22).all() # Source missing assert (comb[:, :, 7] == 22).all() # Key missing assert (comb[:, :, 5] == 22).all() # Empty array # default fillvalue for int is 0 comb = stack_detector_data(data, 'image.data') assert (comb[:, :, 3] == 0).all() with pytest.raises(ValueError): comb = stack_detector_data(data, 'image.data', fillvalue=np.nan) def test_stack_detector_data_stackview(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) # Three variants of missing data: # 1. Source missing del data['FXE_DET_LPD1M-1/DET/3CH0:xtdf'] # 2. Key missing del data['FXE_DET_LPD1M-1/DET/7CH0:xtdf']['image.data'] # 3. Empty array missing = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (1, 5, 9, 15)] for module in missing: data[module]['image.data'] = np.zeros((0, 1, 256, 256), dtype=np.uint16) comb = stack_detector_data(data, 'image.data', fillvalue=22, real_array=False) assert comb.shape == (128, 1, 16, 256, 256) assert not (comb[:, :, 0] == 22).any() # Control assert (comb[:, :, 3] == 22).all() # Source missing assert (comb[:, :, 7] == 22).all() # Key missing assert (comb[:, :, 5] == 22).all() # Empty array # Slice across all modules pulse = comb[0, 0] assert pulse.shape == (16, 256, 256) assert not (pulse[0] == 22).any() assert (pulse[3] == 22).all() assert (pulse[7] == 22).all() assert (pulse[5] == 22).all() pulse_arr = pulse.asarray() assert pulse_arr.shape == (16, 256, 256) assert pulse_arr.max() == 22 assert pulse_arr.min() == 0 def test_stack_detector_data_wrong_pulses(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) misshaped = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (12, 13)] for module in misshaped: data[module]['image.data'] = np.zeros((64, 1, 256, 256), dtype=np.uint16) with pytest.raises(ValueError) as excinfo: comb = stack_detector_data(data, 'image.data') assert '(64, 1, 256, 256)' in str(excinfo.value) def test_stack_detector_data_wrong_shape(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) misshaped = ['FXE_DET_LPD1M-1/DET/{}CH0:xtdf'.format(m) for m in (0, 15)] for module in misshaped: data[module]['image.data'] = np.zeros((128, 1, 512, 128), dtype=np.uint16) with pytest.raises(ValueError) as excinfo: comb = stack_detector_data(data, 'image.data') assert '(128, 1, 512, 128)' in str(excinfo.value) def test_stack_detector_data_type_error(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, devices=[('*/DET/*', 'image.data')]) module = 'FXE_DET_LPD1M-1/DET/3CH0:xtdf' data[module]['image.data'] = data[module]['image.data'].astype(np.float32) with pytest.raises(ValueError) as excinfo: comb = stack_detector_data(data, 'image.data') assert "dtype('float32')" in str(excinfo.value) def test_stack_detector_data_extra_mods(mock_fxe_raw_run): test_run = RunDirectory(mock_fxe_raw_run) tid, data = test_run.train_from_id(10000, 
devices=[('*/DET/*', 'image.data')]) data.setdefault( 'FXE_DET_LPD1M-1/DET/16CH0:xtdf', {'image.data': np.zeros((128, 1, 256, 256), dtype=np.uint16)}, ) with pytest.raises(IndexError) as excinfo: comb = stack_detector_data(data, 'image.data') assert "16" in str(excinfo.value) def test_stack_detector_data_jungfrau(mock_jungfrau_run): run = RunDirectory(mock_jungfrau_run) _, data = run.select('*JF4M/DET/*', 'data.adc').train_from_index(0) comb = stack_detector_data( data, 'data.adc', modules=8, pattern=r'/DET/JNGFR(\d+)', starts_at=1 ) assert comb.shape == (16, 8, 512, 1024) def test_stackview_squeeze(): # Squeeze not dropping stacking dim data = {0: np.zeros((1, 4)), 1: np.zeros((1, 4))} sv = StackView(data, 2, (1, 4), data[0], 0, stack_axis=0) assert sv.shape == (2, 1, 4) assert sv.squeeze().shape == (2, 4) # Squeeze dropping stacking dim data = {0: np.zeros((1, 4))} sv = StackView(data, 1, (1, 4), data[0].dtype, 0, stack_axis=0) assert sv.shape == (1, 1, 4) assert sv.squeeze().shape == (4,) assert sv.squeeze(axis=0).shape == (1, 4) assert sv.squeeze(axis=-2).shape == (1, 4) with pytest.raises(np.AxisError): sv.squeeze(axis=4) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/tests/test_streamer.py0000664000175000017500000000574400000000000023035 0ustar00takluyvertakluyver"""Test streaming data with ZMQ interface.""" import os import pytest from subprocess import PIPE, Popen from extra_data import by_id, H5File, RunDirectory from extra_data.export import _iter_trains, ZMQStreamer from karabo_bridge import Client def test_merge_detector(mock_fxe_raw_run, mock_fxe_control_data, mock_spb_proc_run): with RunDirectory(mock_fxe_raw_run) as run: for tid, data in _iter_trains(run, merge_detector=True): assert 'FXE_DET_LPD1M-1/DET/APPEND' in data assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' not in data shape = data['FXE_DET_LPD1M-1/DET/APPEND']['image.data'].shape assert shape == (128, 1, 16, 256, 256) break for tid, data in _iter_trains(run): assert 'FXE_DET_LPD1M-1/DET/0CH0:xtdf' in data shape = data['FXE_DET_LPD1M-1/DET/0CH0:xtdf']['image.data'].shape assert shape == (128, 1, 256, 256) break with H5File(mock_fxe_control_data) as run: for tid, data in _iter_trains(run, merge_detector=True): assert frozenset(data) == run.select_trains(by_id[[tid]]).all_sources break with RunDirectory(mock_spb_proc_run) as run: for tid, data in _iter_trains(run, merge_detector=True): shape = data['SPB_DET_AGIPD1M-1/DET/APPEND']['image.data'].shape assert shape == (64, 16, 512, 128) shape = data['SPB_DET_AGIPD1M-1/DET/APPEND']['image.gain'].shape assert shape == (64, 16, 512, 128) shape = data['SPB_DET_AGIPD1M-1/DET/APPEND']['image.mask'].shape assert shape == (64, 16, 512, 128) break @pytest.mark.skipif(os.name != 'posix', reason="Test uses Unix socket") def test_serve_files(mock_fxe_raw_run, tmp_path): src = 'FXE_XAD_GEC/CAM/CAMERA:daqOutput' args = ['karabo-bridge-serve-files', '-z', 'PUSH', str(mock_fxe_raw_run), f'ipc://{tmp_path}/socket', '--source', src] interface = None p = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE, env=dict(os.environ, PYTHONUNBUFFERED='1')) try: for line in p.stdout: line = line.decode('utf-8') if line.startswith('Streamer started on:'): interface = line.partition(':')[2].strip() break print('interface:', interface) assert interface is not None, p.stderr.read().decode() with Client(interface, sock='PULL', timeout=30) as c: data, meta = c.next() tid = next(m['timestamp.tid'] for m in meta.values()) assert tid == 10000 
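# Only the explicitly requested source should appear in the stream, since
# the files were served with '--source'.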
assert set(data) == {src} finally: if p.poll() is None: p.kill() rc = p.wait(timeout=2) assert rc == -9 # process terminated by kill signal def test_deprecated_server(): with pytest.deprecated_call(): with ZMQStreamer(2222): pass if __name__ == '__main__': pytest.main(["-v"]) print("Run 'py.test -v -s' to see more output") ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1602757031.0 EXtra-data-1.7.0/extra_data/tests/test_utils.py0000644000175000017500000000167200000000000022345 0ustar00takluyvertakluyverimport numpy as np import os import pytest import re import tempfile from testpath import assert_isfile from extra_data import utils from extra_data.utils import QuickView def test_cbf_conversion(mock_agipd_data, capsys): with tempfile.TemporaryDirectory() as td: out_file = os.path.join(td, 'out.cbf') utils.hdf5_to_cbf(mock_agipd_data, out_file, index=0) assert_isfile(out_file) captured = capsys.readouterr() assert re.match("Convert .* to .*/out.cbf", captured.out) def test_init_quick_view(): qv = QuickView() assert qv.data is None qv.data = np.empty((1, 1, 1), dtype=np.int8) assert len(qv) == 1 assert qv.pos == 0 with pytest.raises(TypeError) as info: qv.data = 4 with pytest.raises(TypeError) as info: qv.data = np.empty((1, 1, 1, 1), dtype=np.int8) if __name__ == "__main__": pytest.main(["-v"]) print("Run 'py.test -v -s' to see more output") ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/tests/test_validation.py0000664000175000017500000001365400000000000023344 0ustar00takluyvertakluyverimport os.path as osp from pathlib import Path from h5py import File from pytest import fixture, raises from tempfile import TemporaryDirectory from extra_data.validation import FileAccess, FileValidator, RunValidator, ValidationError from . import make_examples @fixture(scope='function') def agipd_file(): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0239-AGIPD00-S00000.h5') make_examples.make_agipd_file(path) yield path @fixture(scope='function') def data_aggregator_file(): with TemporaryDirectory() as td: path = osp.join(td, 'RAW-R0450-DA01-S00001.h5') make_examples.make_fxe_da_file(path) yield path def test_validate_run(mock_fxe_raw_run): rv = RunValidator(mock_fxe_raw_run) rv.validate() def test_file_error(mock_fxe_raw_run): not_readable = Path(mock_fxe_raw_run) / 'notReadable.h5' not_readable.touch(mode=0o066) problems = RunValidator(mock_fxe_raw_run).run_checks() assert len(problems) == 1 assert problems[0]['msg'] == 'Could not open file' assert problems[0]['file'] == str(not_readable) def test_zeros_in_train_ids(agipd_file): with File(agipd_file, 'r+') as f: # introduce zeros in trainId f['/INDEX/trainId'][12] = 0 with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Zeroes in trainId index before last train ID' assert problem['dataset'] == 'INDEX/trainId' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_non_strictly_increasing_train_ids(agipd_file): with File(agipd_file, 'r+') as f: # introduce non strictly increasing trainId f['/INDEX/trainId'][10] = 11010 f['/INDEX/trainId'][20] = 5 with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Train IDs are not strictly increasing, e.g. 
at 10 (11010 >= 10011)' assert problem['dataset'] == 'INDEX/trainId' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_index_pointing_outside_dataset(data_aggregator_file): with File(data_aggregator_file, 'r+') as f: # index pointing outside dataset f['/INDEX/FXE_XAD_GEC/CAM/CAMERA:daqOutput/data/first'][30] = 999 with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() assert 'Index referring to data (1000) outside dataset (400)' in str(excinfo.value) def test_invalid_first_dataset(data_aggregator_file): with File(data_aggregator_file, 'a') as f: # invalid first shape length = len(f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first']) f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first'].resize((length+1,)) with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Index first & count have different number of entries' assert problem['dataset'] == 'INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data' assert problem['first_shape'] == (401,) assert problem['count_shape'] == (400,) assert 'RAW-R0450-DA01-S00001.h5' in problem['file'] def test_invalid_first_and_count_dataset(data_aggregator_file): with File(data_aggregator_file, 'a') as f: # invalid first/index shape length = len(f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first']) f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first'].resize((length-1,)) length = len(f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/count']) f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/count'].resize((length-1,)) with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Index has wrong number of entries' assert problem['dataset'] == 'INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data' assert problem['index_shape'] == (399,) assert problem['trainids_shape'] == (400,) assert 'RAW-R0450-DA01-S00001.h5' in problem['file'] def test_first_dataset_not_starting_from_zero(data_aggregator_file): with File(data_aggregator_file, 'a') as f: # first index not starting at zero f['INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data/first'][0] = 1 with raises(ValidationError) as excinfo: FileValidator(FileAccess(data_aggregator_file)).validate() assert "Index doesn't start at 0" in str(excinfo.value) assert "INDEX/SA1_XTD2_XGM/DOOCS/MAIN:output/data" in str(excinfo.value) def test_overlap(agipd_file): with File(agipd_file, 'r+') as f: # overlap first index f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/first'][1] = 0 f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/count'][1] = 128 # no gaps with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Overlaps (1) in index, e.g. at 0 (0 + 64 > 0)' assert problem['dataset'] == 'INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_gaps(agipd_file): with File(agipd_file, 'r+') as f: # gap in index f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/first'][1] = 0 f['INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image/count'][0] = 0 with raises(ValidationError) as excinfo: FileValidator(FileAccess(agipd_file)).validate() problem = excinfo.value.problems.pop() assert problem['msg'] == 'Gaps (1) in index, e.g. 
at 1 (0 + 64 < 128)' assert problem['dataset'] == 'INDEX/SPB_DET_AGIPD1M-1/DET/0CH0:xtdf/image' assert 'RAW-R0239-AGIPD00-S00000.h5' in problem['file'] def test_file_without_data(mock_empty_file): FileValidator(FileAccess(mock_empty_file)).validate() ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1622824936.0 EXtra-data-1.7.0/extra_data/tests/test_writer.py0000664000175000017500000000370300000000000022520 0ustar00takluyvertakluyverimport h5py import os.path as osp import numpy as np from tempfile import TemporaryDirectory from testpath import assert_isfile from extra_data import RunDirectory, H5File def test_write_selected(mock_fxe_raw_run): with TemporaryDirectory() as td: new_file = osp.join(td, 'test.h5') with RunDirectory(mock_fxe_raw_run) as run: run.select('SPB_XTD9_XGM/*').write(new_file) assert_isfile(new_file) with H5File(new_file) as f: assert f.control_sources == {'SPB_XTD9_XGM/DOOCS/MAIN'} assert f.instrument_sources == {'SPB_XTD9_XGM/DOOCS/MAIN:output'} s = f.get_series('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value') # This should have concatenated the two sequence files (400 + 80) assert len(s) == 480 a = f.get_array('SPB_XTD9_XGM/DOOCS/MAIN:output', 'data.intensityTD') assert a.shape == (480, 1000) def test_write_virtual(mock_fxe_raw_run): with TemporaryDirectory() as td: new_file = osp.join(td, 'test.h5') with RunDirectory(mock_fxe_raw_run) as run: run.write_virtual(new_file) assert_isfile(new_file) with h5py.File(new_file, 'r') as f: ds = f['CONTROL/SPB_XTD9_XGM/DOOCS/MAIN/beamPosition/ixPos/value'] assert ds.is_virtual with H5File(new_file) as f: np.testing.assert_array_equal(f.train_ids, np.arange(10000, 10480, dtype=np.uint64)) assert 'SPB_XTD9_XGM/DOOCS/MAIN' in f.control_sources assert 'SPB_XTD9_XGM/DOOCS/MAIN:output' in f.instrument_sources s = f.get_series('SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos.value') # This should have concatenated the two sequence files (400 + 80) assert len(s) == 480 a = f.get_array('SPB_XTD9_XGM/DOOCS/MAIN:output', 'data.intensityTD') assert a.shape == (480, 1000) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627645754.0 EXtra-data-1.7.0/extra_data/utils.py0000664000175000017500000001121100000000000020134 0ustar00takluyvertakluyver""" Helpers functions for the euxfel_h5tools package. Copyright (c) 2017, European X-Ray Free-Electron Laser Facility GmbH All rights reserved. You should have received a copy of the 3-Clause BSD License along with this program. If not, see """ from psutil import net_if_addrs from socket import AF_INET from warnings import warn import h5py import numpy as np __all__ = ['hdf5_paths', 'hdf5_to_cbf', 'numpy_to_cbf', 'QuickView'] class QuickView: """Pun intended This object displays a 3D array as provided by calibrated hdf5 files. Given a 3D numpy array, it will provide you with a way to easily iterate over the pulses to display their respective images. 
First, instantiate and give it the data (a 3D numpy array): quick_v = QuickView(data) # or quick_v = QuickView() quick_v.data = data You can now iterate over it in three different ways: next(quick_v) quick_v.next() quick_v.previous() quick_v.pos = len(quick_v-1) You can also display a specific image without changing the position: quick_v.display(int) """ _image = None _data = None _current_index = 0 def __init__(self, data=None): if data: self.data = data @property def pos(self): return self._current_index @pos.setter def pos(self, pos): if self._data is not None: if 0 <= pos < len(self): self._current_index = pos self.show() else: err = "value should be 0 < value < " "{}".format(self._data.shape[0]) raise ValueError(err) @property def data(self): return self._data @data.setter def data(self, data): if not isinstance(data, np.ndarray) or len(data.shape) != 3: raise TypeError("Expected a 3D numpy array") self._data = data self._current_index = 0 self.display() def next(self): if self._current_index < len(self): self._current_index += 1 self.display() def prev(self): if self._current_index > 0: self._current_index -= 1 self.display() def display(self, index=None): import matplotlib.pyplot as plot if index is None: index = self._current_index image_frame = self.data[index, :, :] if self._image is None: self._image = plot.imshow(image_frame) else: self._image.set_data(image_frame) self._image.axes.set_title("pulseId: {}".format(index)) plot.draw() def __next__(self): self.next() def __len__(self): return self._data.shape[0] def hdf5_paths(ds, indent=0, maxlen=100): """Deprecated: Visit and print name of all element in HDF5 file (from S Hauf)""" warn( "hdf5_paths is deprecated and likely to be removed. Try our h5glance " "package for a similar view of HDF5 files. If this is a problem, " "please contact da-support@xfel.eu .", stacklevel=2, ) for k in list(ds.keys())[:maxlen]: print(" " * indent + k) if isinstance(ds[k], h5py.Group): hdf5_paths(ds[k], indent + 4, maxlen) else: print(" " * indent + k) def numpy_to_cbf(np_array, index=0, header=None): """Deprecated: Given a 3D numpy array, convert it to a CBF data object""" warn( "The numpy_to_cbf and hdf5_to_cbf functions are deprecated and likely " "to be removed. If you are using either of them, please contact " "da-support@xfel.eu .", stacklevel=2, ) import fabio.cbfimage img_reduced = np_array[index, ...] 
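# hand fabio a single 2D frame (the pulse selected by `index`) as the CBF image data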
return fabio.cbfimage.cbfimage(header=header or {}, data=img_reduced) def hdf5_to_cbf(in_h5file, cbf_filename, index, header=None): """Deprecated: Conversion from HDF5 file to cbf binary image file""" tmpf = h5py.File(in_h5file, 'r') paths = list(tmpf["METADATA/dataSourceId"]) image_path = [p for p in paths if p.endswith(b"image")][0] images = tmpf[image_path + b"/data"] cbf_out = numpy_to_cbf(images, index=index) cbf_out.write(cbf_filename) print("Convert {} index {} to {}".format(in_h5file, index, cbf_filename)) def find_infiniband_ip(): """Find the first infiniband IP address :returns: str IP of the first infiniband interface if it exists else '*' """ addrs = net_if_addrs() for addr in addrs.get('ib0', ()): if addr.family is AF_INET: return addr.address return '*' ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/extra_data/validation.py0000664000175000017500000002517500000000000021144 0ustar00takluyvertakluyverfrom argparse import ArgumentParser from multiprocessing import Pool from functools import partial import numpy as np import os import os.path as osp from shutil import get_terminal_size from signal import signal, SIGINT, SIG_IGN import sys from .reader import H5File, FileAccess from .run_files_map import RunFilesMap class ValidationError(Exception): def __init__(self, problems): self.problems = problems def __str__(self): lines = [] for prob in self.problems: lines.extend(['', prob['msg']]) for k, v in sorted(prob.items()): if k != 'msg': lines.append(" {}: {}".format(k, v)) return '\n'.join(lines) class FileValidator: def __init__(self, file: FileAccess): self.file = file self.filename = file.filename self.problems = [] def validate(self): problems = self.run_checks() if problems: raise ValidationError(problems) def run_checks(self): self.problems = [] self.check_indices() self.check_trainids() return self.problems def record(self, msg, **kwargs): self.problems.append(dict(msg=msg, file=self.filename, **kwargs)) def check_trainids(self): ds_path = 'INDEX/trainId' train_ids = self.file.file[ds_path][:] if (train_ids == 0).any(): first0 = train_ids.tolist().index(0) if not (train_ids[first0:] == 0).all(): self.record( 'Zeroes in trainId index before last train ID', dataset=ds_path ) nonzero_tids = train_ids[train_ids != 0] else: nonzero_tids = train_ids if len(nonzero_tids) > 1: non_incr = (nonzero_tids[1:] <= nonzero_tids[:-1]).nonzero()[0] if non_incr.size > 0: pos = non_incr[0] self.record( 'Train IDs are not strictly increasing, e.g. at {} ({} >= {})'.format( pos, nonzero_tids[pos], nonzero_tids[pos + 1] ), dataset=ds_path, ) def _get_index(self, path): """returns first and count dataset for specified source. This is slightly different to the same method in FileAccess as it does cut the dataset up to the trainId's dataset length. 
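Where an index group has no 'count' dataset, counts are reconstructed from 'first', 'last' and 'status' as (last - first + 1) * status. For example (made-up values): first=[0, 2, 5], last=[1, 4, 4], status=[1, 1, 0] gives counts=[2, 3, 0].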
""" ix_group = self.file.file[path] firsts = ix_group['first'][:] if 'count' in ix_group: counts = ix_group['count'][:] else: status = ix_group['status'][:] counts = np.uint64((ix_group['last'][:] - firsts + 1) * status) return firsts, counts def check_indices(self): for src in self.file.control_sources: first, count = self.file.get_index(src, '') for key in self.file.get_keys(src): ds_path = f"CONTROL/{src}/{key.replace('.', '/')}" data_dim0 = self.file.file[ds_path].shape[0] if np.any((first + count) > data_dim0): max_end = (first + count).max() self.record( 'Index referring to data ({}) outside dataset ({})'.format( max_end, data_dim0 ), dataset=ds_path, ) break # Recording every key separately can make a *lot* of errors self._check_index(f'INDEX/{src}') for src in self.file.instrument_sources: src_groups = set() for key in self.file.get_keys(src): ds_path = 'INSTRUMENT/{}/{}'.format(src, key.replace('.', '/')) group = key.split('.', 1)[0] src_groups.add((src, group)) first, count = self.file.get_index(src, group) data_dim0 = self.file.file[ds_path].shape[0] if np.any((first + count) > data_dim0): max_end = (first + count).max() self.record( 'Index referring to data ({}) outside dataset ({})'.format( max_end, data_dim0 ), dataset=ds_path, ) for src, group in src_groups: self._check_index(f'INDEX/{src}/{group}') def _check_index(self, path): record = partial(self.record, dataset=path) first, count = self._get_index(path) if (first.ndim != 1) or (count.ndim != 1): record( "Index first / count are not 1D", first_shape=first.shape, count_shape=count.shape, ) return if first.shape != count.shape: record( "Index first & count have different number of entries", first_shape=first.shape, count_shape=count.shape, ) return if first.shape != self.file.train_ids.shape: record( "Index has wrong number of entries", index_shape=first.shape, trainids_shape=self.file.train_ids.shape, ) check_index_contiguous(first, count, record) def check_index_contiguous(firsts, counts, record): if firsts.size == 0: return # no data in this dataset if firsts[0] != 0: record("Index doesn't start at 0") gaps = firsts[1:].astype(np.int64) - (firsts + counts)[:-1] gap_ixs = (gaps > 0).nonzero()[0] if gap_ixs.size > 0: pos = gap_ixs[0] record("Gaps ({}) in index, e.g. at {} ({} + {} < {})".format( gap_ixs.size, pos, firsts[pos], counts[pos], firsts[pos+1] )) overlap_ixs = (gaps < 0).nonzero()[0] if overlap_ixs.size > 0: pos = overlap_ixs[0] record("Overlaps ({}) in index, e.g. 
at {} ({} + {} > {})".format( overlap_ixs.size, pos, firsts[pos], counts[pos], firsts[pos + 1] )) def progress_bar(done, total, suffix=' '): line = f'Progress: {done}/{total}{suffix}[{{}}]' length = min(get_terminal_size().columns - len(line), 50) filled = int(length * done // total) bar = '#' * filled + ' ' * (length - filled) return line.format(bar) def _check_file(args): runpath, filename = args filepath = osp.join(runpath, filename) problems = [] try: fa = FileAccess(filepath) except Exception as e: problems.append( dict(msg="Could not open file", file=filepath, error=e) ) return filename, None, problems else: fv = FileValidator(fa) problems.extend(fv.run_checks()) fa.close() return filename, fa, problems class RunValidator: def __init__(self, run_dir: str, term_progress=False): self.run_dir = run_dir self.term_progress = term_progress self.filenames = [f for f in os.listdir(run_dir) if f.endswith('.h5')] self.file_accesses = [] self.problems = [] def validate(self): problems = self.run_checks() if problems: raise ValidationError(problems) def run_checks(self): self.problems = [] self.check_files() self.check_files_map() return self.problems def progress(self, done, total, nproblems, badfiles): """Show progress information""" if not self.term_progress: return lines = progress_bar(done, total) lines += f'\n{nproblems} problems' if badfiles: lines += f' in {len(badfiles)} files (last: {badfiles[-1]})' if sys.stderr.isatty(): # "\x1b[2K": delete whole line, "\x1b[1A": move up cursor print('\x1b[2K\x1b[1A\x1b[2K', end='\r',file=sys.stderr) print(lines, end='', file=sys.stderr) else: print(lines, file=sys.stderr) def check_files(self): self.file_accesses = [] def initializer(): # prevent child processes from receiving KeyboardInterrupt signal(SIGINT, SIG_IGN) filepaths = [(self.run_dir, fn) for fn in sorted(self.filenames)] nfiles = len(self.filenames) badfiles = [] self.progress(0, nfiles, 0, badfiles) with Pool(initializer=initializer) as pool: iterator = pool.imap_unordered(_check_file, filepaths) for done, (fname, fa, problems) in enumerate(iterator, start=1): if problems: self.problems.extend(problems) badfiles.append(fname) if fa is not None: self.file_accesses.append(fa) self.progress(done, nfiles, len(self.problems), badfiles) if not self.file_accesses: self.problems.append( dict(msg="No usable files found", directory=self.run_dir) ) def check_files_map(self): # Outdated cache entries we can detect with the file's stat() are not a # problem. Loading the cache file will discard those automatically. 
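# Here we look for entries that survived that check but whose cached source lists or train IDs disagree with what the file actually contains.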
cache = RunFilesMap(self.run_dir) for f_access in self.file_accesses: f_cache = cache.get(f_access.filename) if f_cache is None: continue if ( f_cache['control_sources'] != f_access.control_sources or f_cache['instrument_sources'] != f_access.instrument_sources or not np.array_equal(f_cache['train_ids'], f_access.train_ids) ): self.problems.append(dict( msg="Incorrect data map cache entry", cache_file=cache.cache_file, data_file=f_access.filename, )) f_access.close() def main(argv=None): if argv is None: argv = sys.argv[1:] ap = ArgumentParser(prog='extra-data-validate') ap.add_argument('path', help="HDF5 file or run directory of HDF5 files.") args = ap.parse_args(argv) path = args.path if os.path.isdir(path): print("Checking run directory:", path) print() validator = RunValidator(path, term_progress=True) else: print("Checking file:", path) validator = FileValidator(H5File(path).files[0]) try: validator.run_checks() except KeyboardInterrupt: print('\n^C (validation cancelled)') else: print() # Start a new line if validator.problems: print(f"Validation failed! {len(validator.problems)} problems:") print(str(ValidationError(validator.problems))) return 1 else: print("No problems found") if __name__ == '__main__': sys.exit(main()) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1627645754.0 EXtra-data-1.7.0/extra_data/write_cxi.py0000664000175000017500000004313300000000000021001 0ustar00takluyvertakluyver"""Writing CXI files from AGIPD/LPD data""" import h5py import logging import numpy as np log = logging.getLogger(__name__) class VirtualCXIWriterBase: """ Base class for machinery to write a CXI file containing virtual datasets. You don't normally need to use this class directly. Instead, use the write_virtual_cxi() method on a multi-module detector data interface object. CXI specifies a particular layout of data in the HDF5 file format. It is documented here: http://www.cxidb.org/cxi.html This code writes version 1.5 CXI files. Parameters ---------- detdata: extra_data.components.MultimodDetectorBase The detector data interface for the data to gather in this file. """ # 1 entry is an index along the first (time) dimension in the source files. # XTDF detectors (AGIPD etc.) arrange pulses along this dimension, so each # entry is one frame & one memory cell. JUNGFRAU in burst mode makes one # entry with a separate dimension for several pulses, so overrides this. cells_per_entry = 1 def __init__(self, detdata): self.detdata = detdata self.group_label, self.image_label = detdata._main_data_key.split('.') frame_counts = detdata.frame_counts * self.cells_per_entry self.nframes = frame_counts.sum() log.info("Up to %d frames per train, %d frames in total", frame_counts.max(), self.nframes) self.train_ids_perframe = np.repeat( frame_counts.index.values, frame_counts.values.astype(np.intp) ) # Cumulative sum gives the end of each train, subtract to get start self.train_id_to_ix = frame_counts.cumsum() - frame_counts # For AGIPD, DSSC & LPD detectors modules are numbered from 0. # Overridden for JUNGFRAU to number from 1. 
self.modulenos = list(range(self.nmodules)) @property def nmodules(self): """Number of detector modules.""" return self.detdata.n_modules @property def data(self): """DataCollection with detector data from a run.""" return self.detdata.data def _get_module_index(self, module): """Returns an index for the specified module.""" return self.modulenos.index(module) def collect_pulse_ids(self): """ Gather pulse/cell ID labels for all modules and check consistency. Raises ------ Exception: Some data has no pulse ID values for any module. Exception: Inconsistent pulse IDs between detector modules. Returns ------- pulse_ids_min: np.array Array of pulse IDs per frame common for all detector modules. """ # Gather pulse IDs NO_PULSE_ID = 9999 pulse_ids = np.full((self.nframes, self.nmodules), NO_PULSE_ID, dtype=np.uint64) pulse_key = self.group_label + '.' + self.pulse_id_label for source, modno in self.detdata.source_to_modno.items(): module_ix = self._get_module_index(modno) for chunk in self.data._find_data_chunks(source, pulse_key): chunk_data = chunk.dataset self._map_chunk(chunk, chunk_data, pulse_ids, module_ix) # Sanity checks on pulse IDs pulse_ids_min = pulse_ids.min(axis=1) if (pulse_ids_min == NO_PULSE_ID).any(): raise Exception("Failed to find pulse IDs for some data") pulse_ids[pulse_ids == NO_PULSE_ID] = 0 if (pulse_ids_min != pulse_ids.max(axis=1)).any(): raise Exception("Inconsistent pulse IDs for different modules") # Pulse IDs make sense. Drop the modules dimension, giving one # pulse ID for each frame. return pulse_ids_min def _map_chunk(self, chunk, chunk_data, target, tgt_ax1, have_data=None): """ Map data from chunk into target. Chunk points to contiguous source data, but if this misses a train, it might not correspond to a contiguous region in the output. So this may perform multiple mappings. Parameters ---------- chunk: read_machinery::DataChunk Reference to a contiguous chunk of data to be mapped. chunk_data: h5py.Dataset / h5py.VirtualSource Dataset / VirtualSource to map data from. target: np.array / h5py.VirtualLayout Target to map data to. tgt_ax1: int Value for the target axis 1 - index corresponding to the detector module. have_data: np.array(dtype=bool), optional An array to monitor which part of the target have been mapped with data. Defaults to None. """ # Expand the list of train IDs to one per frame chunk_tids = np.repeat(chunk.train_ids, chunk.counts.astype(np.intp)) chunk_match_start = int(chunk.first) while chunk_tids.size > 0: # Look up where the start of this chunk fits in the target tgt_start = int(self.train_id_to_ix[chunk_tids[0]]) # Chunk train IDs organized in the same way as in the target chunk_tids_target = np.repeat(chunk_tids, self.cells_per_entry) target_tids = self.train_ids_perframe[ tgt_start : tgt_start + len(chunk_tids)*self.cells_per_entry ] assert target_tids.shape == chunk_tids_target.shape, \ f"{target_tids.shape} != {chunk_tids_target.shape}" assert target_tids[0] == chunk_tids[0], \ f"{target_tids[0]} != {chunk_tids[0]}" # How much of this chunk can be mapped in one go? mismatches = (chunk_tids_target != target_tids).nonzero()[0] if mismatches.size > 0: n_match = mismatches[0] // self.cells_per_entry else: n_match = len(chunk_tids) # Select the matching data and add it to the target chunk_match_end = chunk_match_start + n_match tgt_end = tgt_start + (n_match*self.cells_per_entry) if self.cells_per_entry == 1: # In some cases, there's an extra dimension of length 1. # E.g. JUNGFRAU data with 1 memory cell per train or # DSSC/LPD raw data. 
if (len(chunk_data.shape) > 1 and chunk_data.shape[1] == 1): matched = chunk_data[chunk_match_start:chunk_match_end, 0] else: matched = chunk_data[chunk_match_start:chunk_match_end] target[tgt_start:tgt_end, tgt_ax1] = matched else: matched = chunk_data[chunk_match_start:chunk_match_end] if isinstance(chunk_data, h5py.VirtualSource): # Use broadcasting of h5py.VirtualSource target[tgt_start:tgt_end, tgt_ax1] = matched else: target[tgt_start:tgt_end, tgt_ax1] = matched.reshape( (-1,) + matched.shape[2:]) # Fill in the map of what data we have if have_data is not None: have_data[tgt_start:tgt_end, tgt_ax1] = True # Prepare remaining data in the chunk for the next match chunk_match_start = chunk_match_end chunk_tids = chunk_tids[n_match:] def _map_layouts(self, layouts): """ Map virtual sources into virtual layouts. Parameters ---------- layouts: dict A dictionary of unmapped virtual layouts. Returns ------- layouts: dict A dictionary of virtual layouts mapped to the virtual sources. """ for name, layout in layouts.items(): key = '{}.{}'.format(self.group_label, name) have_data = np.zeros((self.nframes, self.nmodules), dtype=bool) for source, modno in self.detdata.source_to_modno.items(): print(f" ### Source: {source}, ModNo: {modno}, Key: {key}") module_ix = self._get_module_index(modno) for chunk in self.data._find_data_chunks(source, key): vsrc = h5py.VirtualSource(chunk.dataset) self._map_chunk(chunk, vsrc, layout, module_ix, have_data) filled_pct = 100 * have_data.sum() / have_data.size if hasattr(layout, 'sources'): n_mappings = len(layout.sources) # h5py < 3.3 else: n_mappings = layout.dcpl.get_virtual_count() # h5py >= 3.3 log.info(f"Assembled {n_mappings:d} chunks for {key:s}, " f"filling {filled_pct:.2f}% of the hyperslab") return layouts def write(self, filename, fillvalues=None): """ Write the file on disc to filename. Parameters ---------- filename: str Path of the file to be written. fillvalues: dict, optional Keys are datasets names (one of: data, gain, mask) and associated fill value for missing data. defaults are: - data: nan (proc, float32) or 0 (raw, uint16) - gain: 0 (uint8) - mask: 0xffffffff (uint32) """ pulse_ids = self.collect_pulse_ids() experiment_ids = np.core.defchararray.add(np.core.defchararray.add( self.train_ids_perframe.astype(str), ':'), pulse_ids.astype(str)) layouts = self.collect_data() data_label = self.image_label _fillvalues = { # Data can be uint16 (raw) or float32 (proc) data_label: np.nan if layouts[data_label].dtype.kind == 'f' else 0, 'gain': 0, 'mask': 0xffffffff } if fillvalues: _fillvalues.update(fillvalues) # Enforce that fill values are compatible with array dtype _fillvalues[data_label] = layouts[data_label].dtype.type( _fillvalues[data_label]) if 'gain' in layouts: _fillvalues['gain'] = layouts['gain'].dtype.type( _fillvalues['gain']) if 'mask' in layouts: _fillvalues['mask'] = layouts['mask'].dtype.type( _fillvalues['mask']) log.info("Writing to %s", filename) # Virtual datasets require HDF5 >= 1.10. # Specifying this up front should mean it fails before touching # the file if run on an older version. We also specify this as # the maximum version, to ensure we're creating files that can # be read by HDF5 1.10. with h5py.File(filename, 'w', libver=('v110', 'v110')) as f: f.create_dataset('cxi_version', data=[150]) d = f.create_dataset('entry_1/experiment_identifier', shape=experiment_ids.shape, dtype=h5py.special_dtype(vlen=str)) d[:] = experiment_ids # pulseId, trainId, cellId are not part of the CXI standard, # but it allows extra data. 
f.create_dataset(f'entry_1/{self.pulse_id_label}', data=pulse_ids) f.create_dataset('entry_1/trainId', data=self.train_ids_perframe) cellids = f.create_virtual_dataset('entry_1/cellId', layouts[self.cell_id_label]) cellids.attrs['axes'] = 'experiment_identifier:module_identifier' dgrp = f.create_group('entry_1/instrument_1/detector_1') if len(layouts[data_label].shape) == 4: axes_s = 'experiment_identifier:module_identifier:y:x' else: # 5D dataset, with extra axis for axes_s = 'experiment_identifier:module_identifier:data_gain:y:x' ndg = layouts[data_label].shape[2] d = f.create_dataset('entry_1/data_gain', shape=(ndg,), dtype=h5py.special_dtype(vlen=str)) d[:] = ([data_label, 'gain'] if ndg == 2 else [data_label]) dgrp['data_gain'] = h5py.SoftLink('/entry_1/data_gain') data = dgrp.create_virtual_dataset( 'data', layouts[data_label], fillvalue=_fillvalues[data_label] ) data.attrs['axes'] = axes_s if 'gain' in layouts: gain = dgrp.create_virtual_dataset( 'gain', layouts['gain'], fillvalue=_fillvalues['gain'] ) gain.attrs['axes'] = axes_s if 'mask' in layouts: mask = dgrp.create_virtual_dataset( 'mask', layouts['mask'], fillvalue=_fillvalues['mask'] ) mask.attrs['axes'] = axes_s dgrp['experiment_identifier'] = h5py.SoftLink( '/entry_1/experiment_identifier') f['entry_1/data_1'] = h5py.SoftLink( '/entry_1/instrument_1/detector_1') dgrp.create_dataset('module_identifier', data=self.modulenos) log.info("Finished writing virtual CXI file") class XtdfCXIWriter(VirtualCXIWriterBase): """ Machinery to write VDS files for a group of detectors with similar data format - AGIPD, DSSC & LPD. You don't normally need to use this class directly. Instead, use the write_virtual_cxi() method on a multi-module detector data interface object. CXI specifies a particular layout of data in the HDF5 file format. It is documented here: http://www.cxidb.org/cxi.html This code writes version 1.5 CXI files. Parameters ---------- detdata: extra_data.components.XtdfDetectorBase The detector data interface for the data to gather in this file. """ def __init__(self, detdata) -> None: self.cells_per_entry = 1 self.pulse_id_label = 'pulseId' self.cell_id_label = 'cellId' super().__init__(detdata) def collect_data(self): """ Prepare virtual layouts and map them to the virtual sources in the data chunks. Returns ------- layouts: dict A dictionary mapping virtual datasets names (e.g. ``data``) to h5py virtual layouts. 
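For calibrated data (the image group contains 'gain') this holds the image data, 'gain' and, where present, 'mask'; for raw data just the image data. A layout for the cell IDs is added in either case.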
""" src = next(iter(self.detdata.source_to_modno)) h5file = self.data._source_index[src][0].file image_grp = h5file['INSTRUMENT'][src][self.group_label] VLayout = h5py.VirtualLayout det_name = type(self.detdata).__name__ if 'gain' in image_grp: log.info(f"Identified {det_name} calibrated data") shape = (self.nframes, self.nmodules) + self.detdata.module_shape log.info("Virtual data shape: %r", shape) layouts = { self.image_label: VLayout( shape, dtype=image_grp[self.image_label].dtype), 'gain': VLayout(shape, dtype=image_grp['gain'].dtype), } if 'mask' in image_grp: layouts['mask'] = VLayout(shape, dtype=image_grp['mask'].dtype) else: log.info(f"Identified {det_name} raw data") shape = (self.nframes, self.nmodules) + image_grp['data'].shape[1:] log.info("Virtual data shape: %r", shape) layouts = { self.image_label: VLayout( shape, dtype=image_grp[self.image_label].dtype), } layouts[self.cell_id_label] = VLayout( (self.nframes, self.nmodules), dtype=image_grp[self.cell_id_label].dtype ) return self._map_layouts(layouts) class JUNGFRAUCXIWriter(VirtualCXIWriterBase): """ Machinery to write VDS files for JUNGFRAU data in the same format as AGIPD/LPD virtual datasets. You don't normally need to use this class directly. Instead, use the write_virtual_cxi() method on a multi-module detector data interface object. CXI specifies a particular layout of data in the HDF5 file format. It is documented here: http://www.cxidb.org/cxi.html This code writes version 1.5 CXI files. Parameters ---------- detdata: extra_data.components.JUNGFRAU The detector data interface for the data to gather in this file. """ def __init__(self, detdata) -> None: # Check number of cells src = next(iter(detdata.source_to_modno)) keydata = detdata.data[src, 'data.adc'] self.cells_per_entry = keydata.entry_shape[0] self.pulse_id_label = 'memoryCell' self.cell_id_label = 'memoryCell' super().__init__(detdata) # For JUNGFRAU detectors modules are numbered from 1 self.modulenos = list(range(1, self.nmodules + 1)) def collect_data(self): """ Prepare virtual layouts and map them to the virtual sources in the data chunks. Returns ------- layouts: dict A dictionary mapping virtual datasets names (e.g. ``data``) to h5py virtual layouts. """ src = next(iter(self.detdata.source_to_modno)) h5file = self.data._source_index[src][0].file image_grp = h5file['INSTRUMENT'][src][self.group_label] VLayout = h5py.VirtualLayout det_name = type(self.detdata).__name__ log.info(f"Identified {det_name} data") shape = (self.nframes, self.nmodules) + self.detdata.module_shape log.info("Virtual data shape: %r", shape) layouts = { self.image_label: VLayout( shape, dtype=image_grp[self.image_label].dtype), 'gain': VLayout(shape, dtype=image_grp['gain'].dtype), 'mask': VLayout(shape, dtype=image_grp['mask'].dtype), self.cell_id_label: VLayout( (self.nframes, self.nmodules), dtype=image_grp[self.cell_id_label].dtype ), } return self._map_layouts(layouts) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1623678132.0 EXtra-data-1.7.0/extra_data/writer.py0000664000175000017500000001765700000000000020334 0ustar00takluyvertakluyverimport h5py import numpy as np class FileWriter: """Write data in European XFEL HDF5 format This is intended to allow copying a subset of data into a smaller, more portable file. 
""" def __init__(self, path, data): self.file = h5py.File(path, 'w') self.data = data self.indexes = {} # {path: (first, count)} self.data_sources = set() def prepare_source(self, source): """Prepare all the datasets for one source. We do this as a separate step so the contents of the file are defined together before the main data. """ for key in sorted(self.data.keys_for_source(source)): path = f"{self._section(source)}/{source}/{key.replace('.', '/')}" nentries = self._guess_number_of_storing_entries(source, key) src_ds1 = self.data._source_index[source][0].file[path] self.file.create_dataset_like( path, src_ds1, shape=(nentries,) + src_ds1.shape[1:], # Corrected detector data has maxshape==shape, but if any max # dim is smaller than the chunk size, h5py complains. Making # the first dimension unlimited avoids this. maxshape=(None,) + src_ds1.shape[1:], ) if source in self.data.instrument_sources: self.data_sources.add(f"INSTRUMENT/{source}/{key.partition('.')[0]}") if source not in self.data.instrument_sources: self.data_sources.add(f"CONTROL/{source}") def _guess_number_of_storing_entries(self, source, key): """Provide the length for the initial dataset to create. May be overridden in subclasses. """ return self.data.get_data_counts(source, key).sum() def _section(self, source): if source in self.data.instrument_sources: return 'INSTRUMENT' else: return 'CONTROL' def copy_dataset(self, source, key): """Copy data into a dataset""" a = self.data.get_array(source, key) path = f"{self._section(source)}/{source}/{key.replace('.', '/')}" self.file[path][:] = a.values self._make_index(source, key, a.coords['trainId'].values) def _make_index(self, source, key, data_tids): # Original files contain exactly 1 entry per train for control data, # but if one file starts before another, there can be some values # missing when we collect several files together. We don't try to # extrapolate to fill missing data, so some counts may be 0. if source in self.data.instrument_sources: index_path = source + '/' + key.partition('.')[0] else: index_path = source if index_path not in self.indexes: if source not in self.data.instrument_sources: assert len(np.unique(data_tids)) == len(data_tids),\ "Duplicate train IDs in control data!" 
self.indexes[index_path] = self._generate_index(data_tids) def _generate_index(self, data_tids): """Convert an array of train IDs to first/count for each train""" assert (np.diff(data_tids) >= 0).all(), "Out-of-order train IDs" counts = np.array([np.count_nonzero(t == data_tids) for t in self.data.train_ids], dtype=np.uint64) firsts = np.zeros_like(counts) firsts[1:] = np.cumsum(counts)[:-1] # firsts[0] is always 0 return firsts, counts def copy_source(self, source): """Copy data for all keys of one source""" for key in self.data.keys_for_source(source): self.copy_dataset(source, key) def write_train_ids(self): self.file.create_dataset( 'INDEX/trainId', data=self.data.train_ids, dtype='u8' ) def write_indexes(self): """Write the INDEX information for all data we've copied""" for groupname, (first, count) in self.indexes.items(): group = self.file.create_group(f'INDEX/{groupname}') group.create_dataset('first', data=first, dtype=np.uint64) group.create_dataset('count', data=count, dtype=np.uint64) def write_metadata(self): """Write the METADATA section, including lists of sources""" vlen_bytes = h5py.special_dtype(vlen=bytes) data_sources = sorted(self.data_sources) N = len(data_sources) sources_ds = self.file.create_dataset( 'METADATA/dataSourceId', (N,), dtype=vlen_bytes, maxshape=(None,) ) sources_ds[:] = data_sources root_ds = self.file.create_dataset( 'METADATA/root', (N,), dtype=vlen_bytes, maxshape=(None,) ) root_ds[:] = [ds.split('/', 1)[0] for ds in data_sources] devices_ds = self.file.create_dataset( 'METADATA/deviceId', (N,), dtype=vlen_bytes, maxshape=(None,) ) devices_ds[:] = [ds.split('/', 1)[1] for ds in data_sources] def set_writer(self): """Record the package & version writing the file in an attribute""" from . import __version__ self.file.attrs['writer'] = 'extra_data {}'.format(__version__) def write(self): d = self.data self.set_writer() self.write_train_ids() for source in d.all_sources: self.prepare_source(source) self.write_metadata() for source in d.all_sources: self.copy_source(source) self.write_indexes() class VirtualFileWriter(FileWriter): """Write virtual datasets in European XFEL format The new files refer to the original data files, so they aren't portable, but they provide more convenient access by reassembling data spread over several sequence files. 
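A rough sketch of using it directly, with a hypothetical run path (DataCollection.write_virtual() is the more convenient route):

    from extra_data import RunDirectory
    from extra_data.writer import VirtualFileWriter

    run = RunDirectory('/path/to/run')
    vfw = VirtualFileWriter('virtual.h5', run)
    vfw.write()
    vfw.file.close()  # close the h5py.File opened in __init__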
""" def __init__(self, path, data): if not hasattr(h5py, 'VirtualLayout'): raise Exception("Creating virtual datasets requires HDF5 1.10 " "and h5py 2.9") super().__init__(path, data) def _assemble_data(self, source, key): """Assemble chunks of data into a virtual layout""" # First, get a list of all non-empty data chunks chunks = [c for c in self.data._find_data_chunks(source, key) if (c.counts > 0).any()] chunks.sort(key = lambda c: c.train_ids[0]) if not chunks: return None, None # Create the layout, which will describe what data is where n_total = np.sum([c.counts.sum() for c in chunks]) ds0 = chunks[0].dataset layout = h5py.VirtualLayout(shape=(n_total,) + ds0.shape[1:], dtype=ds0.dtype) # Map each chunk into the relevant part of the layout output_cursor = np.uint64(0) for chunk in chunks: n = chunk.counts.sum() src = h5py.VirtualSource(chunk.dataset) src = src[chunk.slice] layout[output_cursor : output_cursor + n] = src output_cursor += n assert output_cursor == n_total # Make an array of which train ID each data entry is for: train_ids = np.concatenate([ np.repeat(c.train_ids, c.counts.astype(np.intp)) for c in chunks ]) return layout, train_ids def prepare_source(self, source): for key in self.data.keys_for_source(source): self.add_dataset(source, key) def add_dataset(self, source, key): layout, train_ids = self._assemble_data(source, key) if not layout: return # No data path = f"{self._section(source)}/{source}/{key.replace('.', '/')}" self.file.create_virtual_dataset(path, layout) self._make_index(source, key, train_ids) if source in self.data.instrument_sources: self.data_sources.add(f"INSTRUMENT/{source}/{key.partition('.')[0]}") else: self.data_sources.add(f"CONTROL/{source}") return path def copy_source(self, source): pass # Override base class copying data ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1584883216.0 EXtra-data-1.7.0/pytest.ini0000644000175000017500000000034000000000000016336 0ustar00takluyvertakluyver[pytest] addopts = --ignore docs/xpd_examples.ipynb --ignore docs/xpd_examples2.ipynb --ignore docs/parallel_example.ipynb --ignore docs/dask_averaging.ipynb --ignore docs/inspection.ipynb --ignore docs/iterate_trains.ipynb ././@PaxHeader0000000000000000000000000000003400000000000010212 xustar0028 mtime=1627995375.6317217 EXtra-data-1.7.0/setup.cfg0000664000175000017500000000004600000000000016133 0ustar00takluyvertakluyver[egg_info] tag_build = tag_date = 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1626691885.0 EXtra-data-1.7.0/setup.py0000775000175000017500000000512100000000000016026 0ustar00takluyvertakluyver#!/usr/bin/env python import os.path as osp import re from setuptools import setup, find_packages import sys def get_script_path(): return osp.dirname(osp.realpath(sys.argv[0])) def read(*parts): return open(osp.join(get_script_path(), *parts)).read() def find_version(*parts): vers_file = read(*parts) match = re.search(r'^__version__ = "(\d+\.\d+\.\d+)"', vers_file, re.M) if match is not None: return match.group(1) raise RuntimeError("Unable to find version string.") setup(name="EXtra-data", version=find_version("extra_data", "__init__.py"), author="European XFEL GmbH", author_email="da-support@xfel.eu", maintainer="Thomas Michelat", url="https://github.com/European-XFEL/EXtra-data", description="Tools to read and analyse data from European XFEL ", long_description=read("README.md"), long_description_content_type='text/markdown', license="BSD-3-Clause", packages=find_packages(), 
package_data={ 'extra_data.tests': ['dssc_geo_june19.h5', 'lpd_mar_18.h5'], }, entry_points={ "console_scripts": [ "lsxfel = extra_data.lsxfel:main", "karabo-bridge-serve-files = extra_data.export:main", "extra-data-validate = extra_data.validation:main", "extra-data-make-virtual-cxi = extra_data.cli.make_virtual_cxi:main", "extra-data-locality = extra_data.locality:main", ], }, install_requires=[ 'fabio', 'h5py>=2.10', 'karabo-bridge >=0.6', 'matplotlib', 'numpy', 'pandas', 'psutil', 'xarray', ], extras_require={ 'docs': [ 'sphinx', 'nbsphinx', 'ipython', # For nbsphinx syntax highlighting 'sphinxcontrib_github_alt', ], 'test': [ 'coverage', 'dask[array]', 'nbval', 'pytest', 'pytest-cov', 'testpath', ] }, python_requires='>=3.6', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: BSD License', 'Operating System :: POSIX :: Linux', 'Programming Language :: Python :: 3', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Scientific/Engineering :: Physics', ] )