pax_global_header00006660000000000000000000000064147156427250014527gustar00rootroot0000000000000052 comment=e2ef582a5d5e47bfa34d464e45efdb202775cb9d xarray-safe-rcm-2024.11.0/000077500000000000000000000000001471564272500150375ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/.flake8000066400000000000000000000005351471564272500162150ustar00rootroot00000000000000[flake8] ignore = # E203: whitespace before ':' - doesn't work well with black # E402: module level import not at top of file # E501: line too long - let black worry about that # E731: do not assign a lambda expression, use a def # W503: line break before binary operator E203,E402,E501,E731,W503 exclude= .eggs docs xarray-safe-rcm-2024.11.0/.github/000077500000000000000000000000001471564272500163775ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/.github/dependabot.yml000066400000000000000000000001661471564272500212320ustar00rootroot00000000000000version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" xarray-safe-rcm-2024.11.0/.github/release.yml000066400000000000000000000001141471564272500205360ustar00rootroot00000000000000changelog: exclude: authors: - dependabot - pre-commit-ci xarray-safe-rcm-2024.11.0/.github/workflows/000077500000000000000000000000001471564272500204345ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/.github/workflows/ci.yaml000066400000000000000000000043501471564272500217150ustar00rootroot00000000000000name: CI on: push: branches: [main] pull_request: branches: [main] workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: detect-skip-ci-trigger: name: "Detect CI Trigger: [skip-ci]" if: | github.repository == 'umr-lops/xarray-safe-rcm' && github.event_name == 'push' || github.event_name == 'pull_request' runs-on: ubuntu-latest outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: - uses: actions/checkout@v4 with: fetch-depth: 2 - uses: xarray-contrib/ci-trigger@v1 id: detect-trigger with: keyword: "[skip-ci]" ci: name: ${{ matrix.os }} py${{ matrix.python-version }} runs-on: ${{ matrix.os }} needs: detect-skip-ci-trigger if: needs.detect-skip-ci-trigger.outputs.triggered == 'false' defaults: run: shell: bash -l {0} strategy: fail-fast: false matrix: python-version: ["3.10", "3.11", "3.12"] os: ["ubuntu-latest", "macos-latest", "windows-latest"] steps: - name: Checkout the repository uses: actions/checkout@v4 with: # need to fetch all tags to get a correct version fetch-depth: 0 # fetch all branches and tags - name: Setup environment variables run: | echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV echo "CONDA_ENV_FILE=ci/requirements/environment.yaml" >> $GITHUB_ENV - name: Setup micromamba uses: mamba-org/setup-micromamba@v2 with: environment-file: ${{ env.CONDA_ENV_FILE }} environment-name: xarray-safe-rcm-tests cache-environment: true cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{matrix.python-version}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" create-args: >- python=${{matrix.python-version}} conda - name: Install xarray-safe-rcm run: | python -m pip install --no-deps -e . 
- name: Import xarray-safe-rcm run: | python -c "import safe_rcm" - name: Run tests run: | python -m pytest --cov=safe_rcm xarray-safe-rcm-2024.11.0/.github/workflows/pypi.yaml000066400000000000000000000024261471564272500223050ustar00rootroot00000000000000name: Upload Package to PyPI on: release: types: [created] jobs: build: name: Build packages runs-on: ubuntu-latest if: github.repository == 'umr-lops/xarray-safe-rcm' steps: - name: Checkout uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.x" - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install build twine - name: Build run: | python -m build --sdist --wheel --outdir dist/ . - name: Check the built archives run: | twine check dist/* - name: Upload build artifacts uses: actions/upload-artifact@v4 with: name: packages path: dist/* pypi-publish: name: Upload to PyPI runs-on: ubuntu-latest needs: build environment: name: pypi url: https://pypi.org/p/xarray-safe-rcm permissions: id-token: write steps: - name: Download build artifacts uses: actions/download-artifact@v4 with: name: packages path: dist/ - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc xarray-safe-rcm-2024.11.0/.github/workflows/upstream-dev.yaml000066400000000000000000000050331471564272500237350ustar00rootroot00000000000000name: upstream-dev CI on: push: branches: [main] pull_request: branches: [main] schedule: - cron: "0 18 * * 0" # Weekly "On Sundays at 18:00" UTC workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: detect-test-upstream-trigger: name: "Detect CI Trigger: [test-upstream]" if: github.event_name == 'push' || github.event_name == 'pull_request' runs-on: ubuntu-latest outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: - uses: actions/checkout@v4 with: fetch-depth: 2 - uses: xarray-contrib/ci-trigger@v1.2 id: detect-trigger with: keyword: "[test-upstream]" upstream-dev: name: upstream-dev runs-on: ubuntu-latest needs: detect-test-upstream-trigger if: | always() && github.repository == 'umr-lops/xarray-safe-rcm' && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || needs.detect-test-upstream-trigger.outputs.triggered == 'true' || contains(github.event.pull_request.labels.*.name, 'run-upstream') ) defaults: run: shell: bash -l {0} strategy: fail-fast: false matrix: python-version: ["3.12"] steps: - name: checkout the repository uses: actions/checkout@v4 with: # need to fetch all tags to get a correct version fetch-depth: 0 # fetch all branches and tags - name: set up conda environment uses: mamba-org/setup-micromamba@v2 with: environment-file: ci/requirements/environment.yaml environment-name: tests create-args: >- python=${{ matrix.python-version }} pytest-reportlog conda - name: install upstream-dev dependencies run: bash ci/install-upstream-dev.sh - name: install the package run: python -m pip install --no-deps -e . 
- name: show versions run: python -m pip list - name: import run: | python -c 'import safe_rcm' - name: run tests if: success() id: status run: | python -m pytest -rf --report-log=pytest-log.jsonl - name: report failures if: | failure() && steps.tests.outcome == 'failure' && github.event_name == 'schedule' uses: xarray-contrib/issue-from-pytest-log@v1 with: log-path: pytest-log.jsonl xarray-safe-rcm-2024.11.0/.gitignore000066400000000000000000000003421471564272500170260ustar00rootroot00000000000000# editor files *~ \#*\# # python bytecode *.py[co] __pycache__/ # install artifacts /build /dist /*.egg-info # tools .ipynb_checkpoints/ .hypothesis/ .pytest_cache .coverage .coverage.* .cache /docs/_build/ .prettier_cache xarray-safe-rcm-2024.11.0/.pre-commit-config.yaml000066400000000000000000000024421471564272500213220ustar00rootroot00000000000000ci: autoupdate_schedule: weekly # https://pre-commit.com/ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-docstring-first - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.7.3 hooks: - id: ruff args: [--fix] - repo: https://github.com/psf/black-pre-commit-mirror rev: 24.10.0 hooks: - id: black-jupyter - repo: https://github.com/keewis/blackdoc rev: v0.3.9 hooks: - id: blackdoc additional_dependencies: ["black==24.10.0"] - id: blackdoc-autoupdate-black - repo: https://github.com/kynan/nbstripout rev: 0.8.0 hooks: - id: nbstripout args: [--extra-keys=metadata.kernelspec metadata.language_info.version] - repo: https://github.com/rbubley/mirrors-prettier rev: v3.3.3 hooks: - id: prettier args: [--cache-location=.prettier_cache] - repo: https://github.com/ComPWA/taplo-pre-commit rev: v0.9.3 hooks: - id: taplo-format args: [--option, array_auto_collapse=false] - id: taplo-lint args: [--no-schema] - repo: https://github.com/abravalheri/validate-pyproject rev: v0.23 hooks: - id: validate-pyproject xarray-safe-rcm-2024.11.0/.readthedocs.yml000066400000000000000000000006421471564272500201270ustar00rootroot00000000000000version: 2 build: os: ubuntu-22.04 tools: python: mambaforge-4.10 jobs: post_checkout: - (git --no-pager log --pretty="tformat:%s" -1 | grep -vqF "[skip-rtd]") || exit 183 pre_install: - git update-index --assume-unchanged docs/conf.py ci/requirements/docs.yaml conda: environment: ci/requirements/docs.yaml sphinx: fail_on_warning: true configuration: docs/conf.py formats: [] xarray-safe-rcm-2024.11.0/LICENSE000066400000000000000000000020741471564272500160470ustar00rootroot00000000000000MIT License Copyright (c) 2023, xarray-safe-rcm developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. xarray-safe-rcm-2024.11.0/README.md000066400000000000000000000002321471564272500163130ustar00rootroot00000000000000# xarray-safe-rcm Read RCM SAFE files into `xarray.DataTree` objects. ## Usage ```python import safe_rcm tree = safe_rcm.open_rcm(url, chunks={}) ``` xarray-safe-rcm-2024.11.0/ci/000077500000000000000000000000001471564272500154325ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/ci/install-upstream-dev.sh000066400000000000000000000013271471564272500220510ustar00rootroot00000000000000#!/usr/bin/env bash if command -v micromamba >/dev/null; then conda=micromamba elif command -v mamba >/dev/null; then conda=mamba else conda=conda fi conda remove -y --force cytoolz numpy xarray toolz fsspec python-dateutil pandas lxml xmlschema rioxarray python -m pip install \ -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ --no-deps \ --pre \ --upgrade \ numpy \ pandas \ xarray python -m pip install --upgrade \ git+https://github.com/pytoolz/toolz \ git+https://github.com/lxml/lxml \ git+https://github.com/sissaschool/xmlschema \ git+https://github.com/fsspec/filesystem_spec \ git+https://github.com/dateutil/dateutil \ git+https://github.com/corteva/rioxarray xarray-safe-rcm-2024.11.0/ci/requirements/000077500000000000000000000000001471564272500201555ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/ci/requirements/docs.yaml000066400000000000000000000002231471564272500217660ustar00rootroot00000000000000name: xarray-safe-rcm-docs channels: - conda-forge dependencies: - python=3.10 - sphinx>=4 - sphinx-book-theme - ipython - myst-parser xarray-safe-rcm-2024.11.0/ci/requirements/environment.yaml000066400000000000000000000007001471564272500234020ustar00rootroot00000000000000name: xarray-safe-rcm-tests channels: - conda-forge dependencies: - python # development - ipython - pre-commit - jupyterlab - jupyterlab_code_formatter - isort - black - dask-labextension # testing - pytest - pytest-reportlog - pytest-cov - hypothesis - coverage # I/O - rioxarray - h5netcdf - zarr - scipy # data - xarray - dask - numpy - pandas # processing - toolz - lxml - xmlschema xarray-safe-rcm-2024.11.0/docs/000077500000000000000000000000001471564272500157675ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/docs/conf.py000066400000000000000000000037531471564272500172760ustar00rootroot00000000000000# -- Project information ----------------------------------------------------- import datetime as dt project = "xarray-safe-rcm" author = f"{project} developers" initial_year = "2023" year = dt.datetime.now().year copyright = f"{initial_year}-{year}, {author}" # The root toctree document. root_doc = "index" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "myst_parser", "sphinx.ext.extlinks", "sphinx.ext.intersphinx", "IPython.sphinxext.ipython_directive", "IPython.sphinxext.ipython_console_highlighting", ] extlinks = { "issue": ("https://github.com/umr-lops/xarray-safe-rcm/issues/%s", "GH%s"), "pull": ("https://github.com/umr-lops/xarray-safe-rcm/pull/%s", "PR%s"), } # Add any paths that contain templates here, relative to this directory. 
templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "directory"] # nitpicky mode: complain if references could not be found nitpicky = True # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_book_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ["_static"] # -- Options for the intersphinx extension ----------------------------------- intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), "sphinx": ("https://www.sphinx-doc.org/en/stable/", None), } xarray-safe-rcm-2024.11.0/docs/index.md000066400000000000000000000000221471564272500174120ustar00rootroot00000000000000# xarray-safe-rcm xarray-safe-rcm-2024.11.0/docs/requirements.txt000066400000000000000000000000441471564272500212510ustar00rootroot00000000000000sphinx>=4 sphinx-book-theme ipython xarray-safe-rcm-2024.11.0/pyproject.toml000066400000000000000000000031111471564272500177470ustar00rootroot00000000000000[project] name = "xarray-safe-rcm" requires-python = ">= 3.10" license = { text = "MIT" } description = "xarray reader for radarsat constellation mission (RCM) SAFE files" readme = "README.md" dependencies = [ "toolz", "numpy", "xarray", "lxml", "xmlschema", "rioxarray", "fsspec", "exceptiongroup; python_version < '3.11'", ] dynamic = ["version"] [build-system] requires = ["setuptools>=64.0", "setuptools-scm"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] include = [ "safe_rcm", "safe_rcm.*", ] [tool.setuptools_scm] fallback_version = "9999" [tool.ruff] target-version = "py310" builtins = ["ellipsis"] exclude = [".git", ".eggs", "build", "dist", "__pycache__"] line-length = 100 [tool.ruff.lint] ignore = [ "E402", # module level import not at top of file "E501", # line too long - let black worry about that "E731", # do not assign a lambda expression, use a def "UP038", # type union instead of tuple for isinstance etc ] select = [ "F", # Pyflakes "E", # Pycodestyle "I", # isort "UP", # Pyupgrade "TID", # flake8-tidy-imports "W", ] extend-safe-fixes = [ "TID252", # absolute imports "UP031", # percent string interpolation ] fixable = ["I", "TID252", "UP"] [tool.ruff.lint.isort] known-first-party = ["safe_rcm"] known-third-party = ["xarray", "tlz"] [tool.ruff.lint.flake8-tidy-imports] # Disallow all relative imports. 
ban-relative-imports = "all" [tool.coverage.run] source = ["safe_rcm"] branch = true [tool.coverage.report] show_missing = true exclude_lines = ["pragma: no cover", "if TYPE_CHECKING"] xarray-safe-rcm-2024.11.0/safe_rcm/000077500000000000000000000000001471564272500166165ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/safe_rcm/__init__.py000066400000000000000000000002661471564272500207330ustar00rootroot00000000000000from importlib.metadata import version from safe_rcm.api import open_rcm # noqa: F401 try: __version__ = version("xarray-safe-rcm") except Exception: __version__ = "9999" xarray-safe-rcm-2024.11.0/safe_rcm/api.py000066400000000000000000000123131471564272500177410ustar00rootroot00000000000000import os import posixpath from fnmatch import fnmatchcase import fsspec import xarray as xr from fsspec.implementations.dirfs import DirFileSystem from tlz.dicttoolz import valmap from tlz.functoolz import compose_left, curry, juxt from safe_rcm.calibrations import read_noise_levels from safe_rcm.manifest import read_manifest from safe_rcm.product.reader import read_product from safe_rcm.product.transformers import extract_dataset from safe_rcm.product.utils import starcall from safe_rcm.xml import read_xml try: ExceptionGroup except NameError: from exceptiongroup import ExceptionGroup @curry def execute(tree, f, path): node = tree[path] return f(node) def ignored_file(path, ignores): ignored = [ fnmatchcase(path, ignore) or fnmatchcase(posixpath.basename(path), ignore) for ignore in ignores ] return any(ignored) def open_rcm( url, *, backend_kwargs=None, manifest_ignores=[ "*.pdf", "*.html", "*.xslt", "*.png", "*.kml", "*.txt", "preview/*", ], **dataset_kwargs, ): """read SAFE files of the radarsat constellation mission (RCM) Parameters ---------- url : str backend_kwargs : mapping manifest_ignores : list of str, default: ["*.pdf", "*.html", "*.xslt", "*.png", \ "*.kml", "*.txt", "preview/*"] Globs that match files from the manifest that are allowed to be missing. **dataset_kwargs Keyword arguments forwarded to `xr.open_dataset`, used to open the contained data files. """ if not isinstance(url, (str, os.PathLike)): raise ValueError(f"cannot deal with object of type {type(url)}: {url}") if backend_kwargs is None: backend_kwargs = {} url = os.fspath(url) storage_options = backend_kwargs.get("storage_options", {}) mapper = fsspec.get_mapper(url, **storage_options) relative_fs = DirFileSystem(path=url, fs=mapper.fs) try: declared_files = read_manifest(mapper, "manifest.safe") except (FileNotFoundError, KeyError): raise ValueError( "cannot find the `manifest.safe` file. Are you sure this is a SAFE dataset?" 
) missing_files = [ path for path in declared_files if not ignored_file(path, manifest_ignores) and not relative_fs.exists(path) ] if missing_files: raise ExceptionGroup( "not all files declared in the manifest are available", [ValueError(f"{p} does not exist") for p in missing_files], ) tree = read_product(mapper, "metadata/product.xml") calibration_root = "metadata/calibration" lookup_table_structure = { "/incidenceAngles": { "path": "/imageReferenceAttributes", "f": compose_left( lambda obj: obj.attrs["incidenceAngleFileName"], curry(posixpath.join, calibration_root), curry(read_xml, mapper), curry(extract_dataset, dims="coefficients"), ), }, "/lookupTables": { "path": "/imageReferenceAttributes/lookupTableFileName", "f": compose_left( lambda obj: obj.stack(stacked=["sarCalibrationType", "pole"]), lambda obj: obj.reset_index("stacked"), juxt( compose_left( lambda obj: obj.to_series().to_dict(), curry(valmap, curry(posixpath.join, calibration_root)), curry(valmap, curry(read_xml)(mapper)), curry(valmap, curry(extract_dataset, dims="coefficients")), curry(valmap, lambda ds: ds["gains"].assign_attrs(ds.attrs)), lambda d: xr.concat(list(d.values()), dim="stacked"), ), lambda obj: obj.coords, ), curry(starcall, lambda arr, coords: arr.assign_coords(coords)), lambda arr: arr.set_index({"stacked": ["sarCalibrationType", "pole"]}), lambda arr: arr.unstack("stacked"), lambda arr: arr.rename("lookup_tables"), lambda arr: arr.to_dataset(), ), }, "/noiseLevels": { "path": "/imageReferenceAttributes/noiseLevelFileName", "f": curry(read_noise_levels, mapper, calibration_root), }, } calibration = valmap( lambda x: execute(**x)(tree), lookup_table_structure, ) imagery_paths = tree["/sceneAttributes/ipdf"].to_series().to_dict() resolved = valmap( compose_left( curry(posixpath.join, "metadata"), posixpath.normpath, ), imagery_paths, ) imagery_dss = valmap( compose_left( curry(relative_fs.open), curry(xr.open_dataset, engine="rasterio", **dataset_kwargs), ), resolved, ) dss = [ds.assign_coords(pole=coord) for coord, ds in imagery_dss.items()] imagery = xr.concat(dss, dim="pole") return tree.assign( { "lookupTables": xr.DataTree.from_dict(calibration), "imagery": xr.DataTree(imagery), } ) xarray-safe-rcm-2024.11.0/safe_rcm/calibrations.py000066400000000000000000000062611471564272500216470ustar00rootroot00000000000000import posixpath import numpy as np import xarray as xr from tlz.dicttoolz import itemmap, merge_with, valfilter, valmap from tlz.functoolz import compose_left, curry, flip from tlz.itertoolz import first from safe_rcm.product.dicttoolz import keysplit from safe_rcm.product.reader import execute from safe_rcm.product.transformers import extract_dataset from safe_rcm.xml import read_xml def move_attrs_to_coords(ds, names): coords, attrs = keysplit(lambda k: k in names, ds.attrs) new = ds.copy() new.attrs = attrs return new.assign_coords(coords) def pad_common(dss): def compute_padding(item, maximum): key, value = item return key, (0, maximum[key] - value) sizes = [dict(ds.sizes) for ds in dss] maximum_sizes = valmap(max, merge_with(list, *sizes)) pad_widths = [itemmap(flip(compute_padding, maximum_sizes), _) for _ in sizes] return [ ds.pad(padding, mode="constant", constant_values=np.nan) for ds, padding in zip(dss, pad_widths) ] def _read_level(mapping): return ( extract_dataset(mapping) .pipe( lambda ds: ds.swap_dims( {first(valfilter(lambda v: v > 1, ds.sizes)): "coefficients"} ) ) .pipe(lambda ds: ds.reset_coords()) .pipe( move_attrs_to_coords, ["sarCalibrationType", "pixelFirstNoiseValue", 
"stepSize"], ) ) def read_noise_level_file(mapper, path): layout = { "/referenceNoiseLevel": { "path": "/referenceNoiseLevel", "f": compose_left( curry(map, _read_level), curry(map, lambda ds: ds.expand_dims("sarCalibrationType")), list, curry(xr.combine_by_coords, combine_attrs="drop_conflicts"), ), }, "/perBeamReferenceNoiseLevel": { "path": "/perBeamReferenceNoiseLevel", "f": compose_left( curry(map, _read_level), curry(map, lambda ds: ds.expand_dims("sarCalibrationType")), list, pad_common, curry(xr.combine_by_coords, combine_attrs="drop_conflicts"), ), }, "/azimuthNoiseLevelScaling": { "path": "/azimuthNoiseLevelScaling", "f": compose_left( curry(map, _read_level), list, pad_common, curry(xr.combine_by_coords, combine_attrs="drop_conflicts"), ), }, } decoded = read_xml(mapper, path) converted = valmap(lambda x: execute(**x)(decoded), layout) return converted def read_noise_levels(mapper, root, fnames): fnames = fnames.data.tolist() paths = [posixpath.join(root, name) for name in fnames] poles = [path.removesuffix(".xml").split("_")[1] for path in paths] trees = [read_noise_level_file(mapper, path) for path in paths] merged = merge_with(list, *trees) combined = valmap( compose_left( curry(xr.concat, dim="pole", combine_attrs="no_conflicts"), lambda x: x.assign_coords(pole=poles), ), merged, ) return xr.DataTree.from_dict(combined) xarray-safe-rcm-2024.11.0/safe_rcm/manifest.py000066400000000000000000000024121471564272500207750ustar00rootroot00000000000000from tlz import filter from tlz.functoolz import compose_left, curry from tlz.itertoolz import concat, get from safe_rcm.product.dicttoolz import query from safe_rcm.xml import read_xml def merge_location(loc): locator = loc["@locator"] href = loc["@href"] return f"{locator}/{href}".lstrip("/") def read_manifest(mapper, path): structure = { "/dataObjectSection/dataObject": compose_left( curry( map, compose_left( curry(get, "byteStream"), curry( map, compose_left( curry(get, "fileLocation"), curry(map, merge_location) ), ), concat, ), ), concat, ), "/metadataSection/metadataObject": compose_left( curry( filter, compose_left(curry(get, "@classification"), lambda x: x == "SYNTAX"), ), curry(map, compose_left(curry(get, "metadataReference"), merge_location)), ), } manifest = read_xml(mapper, path) return list(concat(func(query(path, manifest)) for path, func in structure.items())) xarray-safe-rcm-2024.11.0/safe_rcm/product/000077500000000000000000000000001471564272500202765ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/safe_rcm/product/dicttoolz.py000066400000000000000000000012551471564272500226660ustar00rootroot00000000000000from tlz.dicttoolz import get_in from tlz.itertoolz import first, groupby def query(path, mapping): if path == "/": return mapping keys = path.lstrip("/").split("/") return get_in(keys, mapping, no_default=True) def itemsplit(predicate, d): groups = groupby(predicate, d.items()) first = dict(groups.get(True, ())) second = dict(groups.get(False, ())) return first, second def valsplit(predicate, d): wrapper = lambda item: predicate(item[1]) return itemsplit(wrapper, d) def keysplit(predicate, d): wrapper = lambda item: predicate(item[0]) return itemsplit(wrapper, d) def first_values(d): return first(d.values()) xarray-safe-rcm-2024.11.0/safe_rcm/product/predicates.py000066400000000000000000000047421471564272500230020ustar00rootroot00000000000000import numpy as np from tlz.functoolz import compose, juxt from tlz.itertoolz import isiterable def disjunction(*predicates): return compose(any, juxt(predicates)) def 
conjunction(*predicates): return compose(all, juxt(predicates)) def is_scalar(x): return not isiterable(x) or isinstance(x, (str, bytes)) def is_composite_value(obj): if not isinstance(obj, list) or len(obj) not in [1, 2]: return False if any(not isinstance(el, dict) or list(el) != ["@dataStream", "$"] for el in obj): return False data_stream_values = [el["@dataStream"].lower() for el in obj] return data_stream_values in (["real", "imaginary"], ["magnitude"]) def is_complex(obj): return is_composite_value(obj) and len(obj) == 2 def is_magnitude(obj): return is_composite_value(obj) and len(obj) == 1 def is_array(obj): # definition of a array: # - list of scalars # - list of 1d lists # - complex array: # - complex parts # - list of complex values if not isinstance(obj, list): return False if len(obj) == 0: # zero-sized list, not sure what to do here return False elem = obj[0] if is_complex(obj): return not is_scalar(elem["$"]) elif is_scalar(elem): return True elif isinstance(elem, list): if len(elem) == 1 and is_scalar(elem[0]): return True elif is_complex(elem): # array of imaginary values return True elif all(map(is_scalar, elem)): return True return False def is_scalar_variable(obj): if not isinstance(obj, dict): return False if not all(is_scalar(v) for v in obj.values()): return False return all(k == "$" or k.startswith("@") for k in obj) is_scalar_valued = disjunction( is_scalar, lambda x: is_array(x) and len(x) == 1, is_scalar_variable ) def is_nested(obj): """nested means: list of dict, but all dict values are scalar or 1-valued""" if not isinstance(obj, list) or len(obj) == 0: return False elem = obj[0] if not isinstance(elem, dict): return False if all(map(is_scalar_valued, elem.values())): return True return False def is_nested_array(obj): return is_nested(obj) and "$" in obj[0] def is_nested_dataset(obj): return is_nested(obj) and "$" not in obj[0] def is_attr(column): """an attribute is a index if it has multiple unique values""" return np.unique(column).size == 1 xarray-safe-rcm-2024.11.0/safe_rcm/product/reader.py000066400000000000000000000252471471564272500221240ustar00rootroot00000000000000import xarray as xr from tlz.dicttoolz import keyfilter, merge, merge_with, valfilter, valmap from tlz.functoolz import compose_left, curry, juxt from tlz.itertoolz import first, second from safe_rcm.product import transformers from safe_rcm.product.dicttoolz import keysplit, query from safe_rcm.product.predicates import disjunction, is_nested_array, is_scalar_valued from safe_rcm.product.utils import dictfirst, starcall from safe_rcm.xml import read_xml @curry def attach_path(obj, path): if not hasattr(obj, "encoding"): raise ValueError( "cannot attach source path: `obj` does not have a `encoding` attribute." 
) new = obj.copy() new.encoding["xpath"] = path return new @curry def execute(mapping, f, path): subset = query(path, mapping) return compose_left(f, attach_path(path=path))(subset) def read_product(mapper, product_path): decoded = read_xml(mapper, product_path) layout = { "/": { "path": "/", "f": curry(transformers.extract_metadata)(collapse=["securityAttributes"]), }, "/sourceAttributes": { "path": "/sourceAttributes", "f": transformers.extract_metadata, }, "/sourceAttributes/radarParameters": { "path": "/sourceAttributes/radarParameters", "f": transformers.extract_dataset, }, "/sourceAttributes/radarParameters/prfInformation": { "path": "/sourceAttributes/radarParameters/prfInformation", "f": transformers.extract_nested_dataset, }, "/sourceAttributes/orbitAndAttitude/orbitInformation": { "path": "/sourceAttributes/orbitAndAttitude/orbitInformation", "f": compose_left( curry(transformers.extract_dataset)(dims="timeStamp"), lambda ds: ds.assign_coords( {"timeStamp": ds["timeStamp"].astype("datetime64")} ), ), }, "/sourceAttributes/orbitAndAttitude/attitudeInformation": { "path": "/sourceAttributes/orbitAndAttitude/attitudeInformation", "f": compose_left( curry(transformers.extract_dataset)(dims="timeStamp"), lambda ds: ds.assign_coords( {"timeStamp": ds["timeStamp"].astype("datetime64")} ), ), }, "/sourceAttributes/rawDataAttributes": { "path": "/sourceAttributes/rawDataAttributes", "f": compose_left( curry(keysplit, lambda k: k != "rawDataAnalysis"), juxt( compose_left(first, transformers.extract_dataset), compose_left( second, dictfirst, curry(starcall, curry(merge_with, list)), curry( transformers.extract_dataset, dims={"rawDataHistogram": ["stacked", "histogram"]}, default_dims=["stacked"], ), lambda obj: obj.set_index({"stacked": ["pole", "beam"]}), lambda obj: obj.unstack("stacked"), ), ), curry(xr.merge), ), }, "/imageGenerationParameters/generalProcessingInformation": { "path": "/imageGenerationParameters/generalProcessingInformation", "f": transformers.extract_metadata, }, "/imageGenerationParameters/sarProcessingInformation": { "path": "/imageGenerationParameters/sarProcessingInformation", "f": compose_left( curry(keyfilter, lambda k: k not in {"azimuthWindow", "rangeWindow"}), transformers.extract_dataset, ), }, "/imageGenerationParameters/chirps": { "path": "/imageGenerationParameters/chirp", "f": compose_left( lambda el: merge_with(list, *el), curry(keysplit, lambda k: k != "chirpQuality"), juxt( first, compose_left( second, dictfirst, lambda el: merge_with(list, *el), ), ), lambda x: merge(*x), curry( transformers.extract_dataset, dims={ "amplitudeCoefficients": ["stacked", "coefficients"], "phaseCoefficients": ["stacked", "coefficients"], }, default_dims=["stacked"], ), lambda obj: obj.set_index({"stacked": ["pole", "pulse"]}), lambda obj: obj.drop_duplicates("stacked", keep="last"), lambda obj: obj.unstack("stacked"), ), }, "/imageGenerationParameters/slantRangeToGroundRange": { "path": "/imageGenerationParameters/slantRangeToGroundRange", "f": compose_left( lambda el: merge_with(list, *el), curry( transformers.extract_dataset, dims={ "groundToSlantRangeCoefficients": [ "zeroDopplerAzimuthTime", "coefficients", ], }, default_dims=["zeroDopplerAzimuthTime"], ), ), }, "/imageReferenceAttributes": { "path": "/imageReferenceAttributes", "f": compose_left( curry(valfilter)(disjunction(is_scalar_valued, is_nested_array)), transformers.extract_dataset, ), }, "/imageReferenceAttributes/rasterAttributes": { "path": "/imageReferenceAttributes/rasterAttributes", "f": 
transformers.extract_dataset, }, "/imageReferenceAttributes/geographicInformation/ellipsoidParameters": { "path": "/imageReferenceAttributes/geographicInformation/ellipsoidParameters", "f": curry(transformers.extract_dataset)(dims="params"), }, "/imageReferenceAttributes/geographicInformation/geolocationGrid": { "path": "/imageReferenceAttributes/geographicInformation/geolocationGrid/imageTiePoint", "f": compose_left( curry(transformers.extract_nested_datatree)(dims="tie_points"), lambda tree: xr.merge([node.ds for node in tree.subtree]), lambda ds: ds.set_index(tie_points=["line", "pixel"]), lambda ds: ds.unstack("tie_points"), ), }, "/imageReferenceAttributes/geographicInformation/rationalFunctions": { "path": "/imageReferenceAttributes/geographicInformation/rationalFunctions", "f": curry(transformers.extract_dataset)(dims="coefficients"), }, "/sceneAttributes": { "path": "/sceneAttributes/imageAttributes", "f": compose_left( first, # GRD datasets only have 1 curry(keyfilter)(lambda x: not x.startswith("@")), transformers.extract_dataset, ), }, "/grdBurstMap": { "path": "/grdBurstMap", "f": compose_left( curry( map, compose_left( curry(keysplit, lambda k: k != "burstAttributes"), juxt( first, compose_left( second, dictfirst, curry(starcall, curry(merge_with, list)), ), ), curry(starcall, merge), curry( transformers.extract_dataset, dims=["stacked"], ), lambda obj: obj.set_index({"stacked": ["burst", "beam"]}), lambda obj: obj.unstack("stacked"), ), ), list, curry(xr.concat, dim="burst_maps"), ), }, "/dopplerCentroid": { "path": "/dopplerCentroid", "f": compose_left( curry( map, compose_left( curry(keysplit, lambda k: k != "dopplerCentroidEstimate"), juxt( first, compose_left( second, dictfirst, curry(starcall, curry(merge_with, list)), ), ), curry(starcall, merge), curry( transformers.extract_dataset, dims={ "dopplerCentroidCoefficients": [ "burst", "coefficients", ], }, default_dims=["burst"], ), ), ), list, curry(xr.concat, dim="burst_maps"), ), }, "/dopplerRate": { "path": "/dopplerRate", "f": compose_left( curry( map, compose_left( curry(keysplit, lambda k: k != "dopplerRateEstimate"), juxt( first, compose_left( second, dictfirst, curry(starcall, curry(merge_with, list)), ), ), curry(starcall, merge), curry( transformers.extract_dataset, dims={ "dopplerRateCoefficients": ["burst", "coefficients"], }, default_dims=["burst"], ), ), ), list, curry(xr.concat, dim="burst_maps"), ), }, } converted = valmap( lambda x: execute(**x)(decoded), layout, ) return xr.DataTree.from_dict(converted) xarray-safe-rcm-2024.11.0/safe_rcm/product/transformers.py000066400000000000000000000156161471564272500234060ustar00rootroot00000000000000import numpy as np import xarray as xr from tlz.dicttoolz import ( itemfilter, itemmap, keyfilter, keymap, merge_with, valfilter, valmap, ) from tlz.functoolz import compose_left, curry, flip from tlz.itertoolz import concat, first, second from safe_rcm.product.dicttoolz import first_values, keysplit, valsplit from safe_rcm.product.predicates import ( is_array, is_attr, is_composite_value, is_nested_array, is_nested_dataset, is_scalar, ) ignore = ("@xmlns", "@xmlns:xsi", "@xsi:schemaLocation") def convert_composite(value): if not is_composite_value(value): raise ValueError(f"not a composite: {value}") converted = {part["@dataStream"].lower(): np.array(part["$"]) for part in value} if list(converted) == ["magnitude"]: return "magnitude", converted["magnitude"] else: return "complex", converted["real"] + 1j * converted["imaginary"] def extract_metadata( mapping, 
collapse=(), ignore=ignore, ): without_ignores = keyfilter(lambda k: k not in ignore, mapping) # extract the metadata metadata_ = itemfilter( lambda it: it[0].startswith("@") or is_scalar(it[1]), without_ignores, ) metadata = keymap(flip(str.lstrip, "@"), metadata_) # collapse the selected items to_collapse = keyfilter(lambda x: x in collapse, mapping) collapsed = dict(concat(v.items() for v in to_collapse.values())) attrs = metadata | collapsed return xr.Dataset(attrs=attrs) # return dataset to avoid bug in datatree def extract_array(obj, dims): if isinstance(dims, str): dims = [dims] # special case for pulses: if "pulses" in dims and len(obj) == 1 and isinstance(obj[0], str): obj = obj[0].split() elif len(obj) >= 1 and is_composite_value(obj[0]): obj = list(map(compose_left(convert_composite, second), obj)) data = np.array(obj) if data.size > 1: data = np.squeeze(data) return xr.Variable(dims, data) def extract_composite(obj, dims=()): type_, value = convert_composite(obj) if is_scalar(value): dims = () return xr.Variable(dims, value, {"type": type_}) def extract_variable(obj, dims=()): attributes, data = keysplit(lambda k: k.startswith("@"), obj) if list(data) != ["$"]: raise ValueError("not a variable") values = data["$"] if is_scalar(values): dims = () attrs = keymap(lambda k: k.lstrip("@"), attributes) return xr.Variable(dims, values, attrs) def extract_entry(name, obj, dims=None, default_dims=None): if default_dims is None: default_dims = [name] if isinstance(dims, dict): dims = dims.get(name, default_dims) elif dims is None: dims = default_dims if is_array(obj): # dimension coordinate return extract_array(obj, dims=dims) elif is_composite_value(obj): return extract_composite(obj, dims=dims) elif isinstance(obj, dict): return extract_variable(obj, dims=dims) elif is_nested_array(obj): return extract_nested_array(obj, dims=dims).pipe(rename, name) else: raise ValueError(f"unknown datastructure:\n{obj}") def extract_dataset(obj, dims=None, default_dims=None): filtered = keyfilter(lambda x: x not in ignore, obj) attrs, variables = valsplit(is_scalar, filtered) if len(variables) == 1 and is_nested_dataset(first_values(variables)): return extract_nested_dataset(first_values(variables), dims=dims).assign_attrs( attrs ) variables_ = keymap(lambda k: k.lstrip("@"), variables) filtered_variables = valfilter(lambda x: not is_nested_dataset(x), variables_) data_vars = itemmap( lambda item: ( item[0], extract_entry(*item, dims=dims, default_dims=default_dims), ), filtered_variables, ) return xr.Dataset(data_vars=data_vars, attrs=attrs) def extract_nested_variable(obj, dims=None): if is_array(obj): return xr.Variable(dims, obj) columns = merge_with(list, *obj) attributes, data = keysplit(lambda k: k.startswith("@"), columns) renamed = keymap(lambda k: k.lstrip("@"), attributes) attrs = valmap(first, renamed) return xr.Variable(dims, data["$"], attrs) def unstack(obj, dim="stacked"): if dim not in obj.dims: return obj stacked_coords = [name for name, arr in obj.coords.items() if dim in arr.dims] return obj.set_index({dim: stacked_coords}).unstack(dim) def rename(obj, name): renamed = obj.rename(name) if "$" not in obj.dims: return renamed if len(obj.dims) != 1: raise ValueError(f"unexpected number of dimensions: {list(obj.dims)}") return renamed.swap_dims({"$": name}) def to_variable_tuple(name, value, dims): if name in dims: dims_ = [name] else: dims_ = dims return (dims_, value) def extract_nested_array(obj, dims=None): columns = merge_with(list, *obj) attributes, data = 
keysplit(flip(str.startswith, "@"), columns) renamed = keymap(flip(str.lstrip, "@"), attributes) preprocessed_attrs = valmap(np.squeeze, renamed) attrs_, indexes = valsplit(is_attr, preprocessed_attrs) preprocessed_data = valmap(np.squeeze, data) originally_stacked = isinstance(dims, (tuple, list)) and "stacked" in dims if len(indexes) == 1: dims = list(indexes) elif len(indexes) >= 2: dims = ["stacked"] elif dims is None: dims = ["$"] coords = itemmap( lambda it: (it[0], to_variable_tuple(*it, dims=dims)), indexes, ) arr = xr.DataArray( data=preprocessed_data["$"], attrs=valmap(first, attrs_), dims=dims, coords=coords, ) if originally_stacked: return arr return arr.pipe(unstack, dim="stacked") def extract_nested_dataset(obj, dims=None): if not isinstance(obj, list): raise ValueError(f"unknown type: {type(obj)}") columns = merge_with(list, *obj) attributes, data = keysplit(flip(str.startswith, "@"), columns) renamed = keymap(flip(str.lstrip, "@"), attributes) preprocessed = valmap(np.squeeze, renamed) attrs_, indexes = valsplit(is_attr, preprocessed) attrs = valmap(first, attrs_) if dims is None: if len(indexes) <= 1: dims = list(indexes) else: dims = ["stacked"] data_vars = valmap(curry(extract_nested_variable)(dims=dims), data) coords = itemmap( lambda it: (it[0], to_variable_tuple(*it, dims=dims)), indexes, ) return xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs).pipe( unstack, dim="stacked" ) def extract_nested_datatree(obj, dims=None): if not isinstance(obj, list): raise ValueError(f"unknown type: {type(obj)}") datasets = merge_with(list, *obj) tree = valmap(curry(extract_nested_dataset)(dims=dims), datasets) return xr.DataTree.from_dict(tree) xarray-safe-rcm-2024.11.0/safe_rcm/product/utils.py000066400000000000000000000017321471564272500220130ustar00rootroot00000000000000from tlz.functoolz import flip, pipe from tlz.itertoolz import first, groupby def split_marked(mapping, marker="@"): groups = groupby(lambda item: item[0].startswith(marker), mapping.items()) attrs = {key.lstrip(marker): value for key, value in groups.get(True, {})} data = {key: value for key, value in groups.get(False, {})} return attrs, data def strip_namespaces(name, namespaces): """remove the given namespaces from a name Parameters ---------- name : str The string to trim namespaces : sequence of str The list of namespaces. Returns ------- trimmed : str The string without prefix and without leading colon. 
""" funcs = [ flip(str.removeprefix, ns) for ns in sorted(namespaces, key=len, reverse=True) ] return pipe(name, *funcs).lstrip(":") def starcall(func, args, **kwargs): return func(*args, **kwargs) def dictfirst(mapping): return first(mapping.values()) xarray-safe-rcm-2024.11.0/safe_rcm/tests/000077500000000000000000000000001471564272500177605ustar00rootroot00000000000000xarray-safe-rcm-2024.11.0/safe_rcm/tests/test_product_utils.py000066400000000000000000000033361471564272500242760ustar00rootroot00000000000000import functools import string import hypothesis.strategies as st from hypothesis import given from safe_rcm.product import utils def shared(*, key): def outer(func): @functools.wraps(func) def inner(*args, **kwargs): result = func(*args, **kwargs) return st.shared(result, key=key) return inner return outer markers = st.characters() marker = st.shared(markers, key="marker") def marked_mapping(marker): values = st.just(None) unmarked_keys = st.text() marked_keys = st.builds(lambda k, m: m + k, unmarked_keys, marker) keys = st.one_of(unmarked_keys, marked_keys) return st.dictionaries(keys, values) @given(marked_mapping(marker), marker) def test_split_marked(mapping, marker): marked, unmarked = utils.split_marked(mapping, marker=marker) assert list(unmarked) == [key for key in mapping if not key.startswith(marker)] @shared(key="namespaces") def namespaces(): values = st.just(None) keys = st.text(string.ascii_letters, min_size=1, max_size=4) return st.dictionaries(keys, values) @st.composite def draw_from(draw, elements): elements = draw(elements) if not elements: return "" return draw(st.sampled_from(elements)) def prefixed_names(namespaces): def builder(base, prefix): return f"{prefix}:{base}" if prefix != "" else base bases = st.text(string.ascii_letters, min_size=1) all_prefixes = namespaces.map(list) prefixes = draw_from(all_prefixes) return st.builds(builder, bases, prefixes) @given(prefixed_names(namespaces()), namespaces()) def test_strip_namespaces(name, namespaces): stripped = utils.strip_namespaces(name, namespaces) assert ":" not in stripped xarray-safe-rcm-2024.11.0/safe_rcm/tests/test_xml.py000066400000000000000000000211021471564272500221650ustar00rootroot00000000000000import collections import textwrap import fsspec import pytest from safe_rcm import xml def dedent(text): return textwrap.dedent(text.removeprefix("\n").rstrip()) schemas = [ dedent( """ """ ), dedent( """ """ ), dedent( """ """ ), ] Container = collections.namedtuple("SchemaSetup", ["mapper", "path", "expected"]) SchemaProperties = collections.namedtuple( "SchemaProperties", ["root_elements", "simple_types", "complex_types"] ) @pytest.fixture(params=enumerate(schemas)) def schema_setup(request): schema_index, schema = request.param mapper = fsspec.get_mapper("memory") mapper["schemas/root.xsd"] = schema.encode() mapper["schemas/schema1.xsd"] = dedent( """ """ ).encode() mapper["schemas/schema2.xsd"] = dedent( """ """ ).encode() mapper["schemas/schema3.xsd"] = dedent( """ """ ).encode() mapper["schemas/schema4.xsd"] = dedent( """ """ ).encode() return schema_index, mapper @pytest.fixture def schema_paths_setup(schema_setup): schema_index, mapper = schema_setup expected = [ ["schemas/root.xsd"], ["schemas/root.xsd", "schemas/schema2.xsd", "schemas/schema4.xsd"], [ "schemas/root.xsd", "schemas/schema1.xsd", "schemas/schema2.xsd", "schemas/schema3.xsd", "schemas/schema4.xsd", ], ] return Container(mapper, "schemas/root.xsd", expected[schema_index]) @pytest.fixture def schema_content_setup(schema_setup): 
schema_index, mapper = schema_setup count_type = {"name": "count", "type": "simple", "base_type": "integer"} manifest_type = {"name": "manifest", "type": "complex"} manifest_element = {"name": "manifest", "type": manifest_type} count_element = {"name": "count", "type": count_type} expected = [ SchemaProperties([], [], []), SchemaProperties([count_element], [count_type], []), SchemaProperties( [manifest_element, count_element], [count_type], [manifest_type] ), ] return Container(mapper, "schemas/root.xsd", expected[schema_index]) @pytest.fixture(params=["data.xml", "data/file.xml"]) def data_file_setup(request): path = request.param mapper = fsspec.get_mapper("memory") mapper["schemas/root.xsd"] = dedent( """ """ ).encode() mapper["schemas/schema1.xsd"] = dedent( """ """ ).encode() mapper["schemas/schema2.xsd"] = dedent( """ """ ).encode() schema_path = "schemas/root.xsd" if "/" not in path else "../schemas/root.xsd" mapper[path] = dedent( f""" 1 2 3 """ ).encode() expected = { "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "@xsi:schemaLocation": f"schema {schema_path}", "summary": {"quantity_a": 1, "quantity_b": 2}, "count": 3, } return Container(mapper, path, expected) def convert_type(t): def strip_namespace(name): return name.split("}", maxsplit=1)[1] if hasattr(t, "content"): # complex type return {"name": t.name, "type": "complex"} elif hasattr(t, "base_type"): # simple type, only restriction return { "name": t.name, "base_type": strip_namespace(t.base_type.name), "type": "simple", } def convert_element(el): return {"name": el.name, "type": convert_type(el.type)} def extract_schema_properties(schema): return SchemaProperties( [convert_element(v) for v in schema.root_elements], [convert_type(v) for v in schema.simple_types], [convert_type(v) for v in schema.complex_types], ) def test_remove_includes(): expected = schemas[0] actual = xml.remove_includes(schemas[1]) assert actual == expected @pytest.mark.parametrize( ["schema", "expected"], ( (schemas[0], []), (schemas[1], ["schema2.xsd"]), (schemas[2], ["schema1.xsd", "schema2.xsd"]), ), ) def test_extract_includes(schema, expected): actual = xml.extract_includes(schema) assert actual == expected @pytest.mark.parametrize( ["root", "path", "expected"], ( ("", "file.xml", "file.xml"), ("/root", "file.xml", "/root/file.xml"), ("/root", "/other_root/file.xml", "/other_root/file.xml"), ), ) def test_normalize(root, path, expected): actual = xml.normalize(root, path) assert actual == expected def test_schema_paths(schema_paths_setup): actual = xml.schema_paths(schema_paths_setup.mapper, schema_paths_setup.path) expected = schema_paths_setup.expected assert actual == expected def test_open_schemas(schema_content_setup): container = schema_content_setup actual = xml.open_schema(container.mapper, container.path) expected = container.expected assert extract_schema_properties(actual) == expected def test_read_xml(data_file_setup): container = data_file_setup actual = xml.read_xml(container.mapper, container.path) assert actual == container.expected xarray-safe-rcm-2024.11.0/safe_rcm/xml.py000066400000000000000000000042221471564272500177700ustar00rootroot00000000000000import io import posixpath import re from collections import deque import xmlschema from lxml import etree from tlz.dicttoolz import keymap include_re = re.compile(r'\s*') def remove_includes(text): return include_re.sub("", text) def extract_includes(text): return include_re.findall(text) def normalize(root, path): if posixpath.isabs(path) or posixpath.dirname(path): 
        return path
    return posixpath.join(root, path)


def schema_paths(mapper, root_schema):
    unvisited = deque([root_schema])
    visited = []
    while unvisited:
        path = unvisited.popleft()
        if path not in visited:
            visited.append(path)

            text = mapper[path].decode()
            includes = extract_includes(text)

            current_root = posixpath.dirname(path)
            normalized = [normalize(current_root, p) for p in includes]

            unvisited.extend([p for p in normalized if p not in visited])

    return visited


def open_schema(mapper, schema):
    """fsspec-compatible way to open remote schema files

    Parameters
    ----------
    mapper : mapping
        fsspec mapper (key-value interface) rooted at the product directory.
    schema : str
        Path of the root schema file, relative to the mapper root. Included
        schemas are resolved recursively relative to this file.

    Returns
    -------
    xmlschema.XMLSchema
        The opened schema object
    """
    paths = schema_paths(mapper, schema)
    preprocessed = [io.StringIO(remove_includes(mapper[p].decode())) for p in paths]

    return xmlschema.XMLSchema(preprocessed)


def read_xml(mapper, path):
    raw_data = mapper[path]

    tree = etree.fromstring(raw_data)
    namespaces = keymap(lambda x: x if x is not None else "rcm", tree.nsmap)

    schema_location = tree.xpath("./@xsi:schemaLocation", namespaces=namespaces)[0]
    _, schema_path_ = schema_location.split(" ")
    schema_path = posixpath.normpath(
        posixpath.join(posixpath.dirname(path), schema_path_)
    )

    schema = open_schema(mapper, schema_path)

    decoded = schema.decode(tree)

    return decoded
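

# ---------------------------------------------------------------------------
# Usage sketch (editorial addition, not part of the released package): a
# minimal example of how the readers above fit together. The product path is
# hypothetical; any fsspec-compatible URL pointing at an extracted RCM SAFE
# directory should work.
if __name__ == "__main__":
    import fsspec

    import safe_rcm
    from safe_rcm.product.reader import read_product

    # hypothetical location of an extracted RCM SAFE product
    url = "data/RCM1_OK1234567_PK1234567_1_SC50MB_20241101_120000_HH_HV_GRD"

    # high-level entry point: combines the XML metadata, the calibration
    # lookup tables and the measurement rasters into a single xarray.DataTree
    tree = safe_rcm.open_rcm(url, chunks={})
    print(tree)

    # lower-level access: decode only the product annotation file against its
    # bundled XSD schemas, without touching the imagery
    mapper = fsspec.get_mapper(url)
    product = read_product(mapper, "metadata/product.xml")
    print(product["sourceAttributes"])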