pax_global_header00006660000000000000000000000064150037063220014510gustar00rootroot0000000000000052 comment=09b201e2936d4d794d08363439fc50c6e38d5e7d python-anndata-0.12.0~rc1/000077500000000000000000000000001500370632200153215ustar00rootroot00000000000000python-anndata-0.12.0~rc1/.cirun.yml000066400000000000000000000003361500370632200172440ustar00rootroot00000000000000runners: - name: aws-gpu-runner cloud: aws instance_type: g4dn.xlarge machine_image: ami-067a4ba2816407ee9 region: eu-north-1 preemptible: - true - false labels: - cirun-aws-gpu python-anndata-0.12.0~rc1/.codecov.yml000066400000000000000000000004041500370632200175420ustar00rootroot00000000000000# Based on pydata/xarray codecov: require_ci_to_pass: false coverage: status: project: default: # Require 80% coverage target: 80 changes: false comment: layout: "diff, flags, files" behavior: once require_base: false python-anndata-0.12.0~rc1/.editorconfig000066400000000000000000000003101500370632200177700ustar00rootroot00000000000000root = true [*] charset = utf-8 end_of_line = lf insert_final_newline = true trim_trailing_whitespace = true max_line_length = 88 indent_size = 4 indent_style = space [*.{yml,yaml}] indent_size = 2 python-anndata-0.12.0~rc1/.github/000077500000000000000000000000001500370632200166615ustar00rootroot00000000000000python-anndata-0.12.0~rc1/.github/ISSUE_TEMPLATE/000077500000000000000000000000001500370632200210445ustar00rootroot00000000000000python-anndata-0.12.0~rc1/.github/ISSUE_TEMPLATE/bug-report.yml000066400000000000000000000042141500370632200236560ustar00rootroot00000000000000name: Bug report description: anndata doesn’t do what it should? Please help us fix it! #title: ... labels: - Bug 🐛 - Triage 🩺 #assignees: [] body: - type: checkboxes id: terms attributes: label: Please make sure these conditions are met # description: ... options: - label: I have checked that this issue has not already been reported. required: true - label: I have confirmed this bug exists on the latest version of anndata. required: true - label: (optional) I have confirmed this bug exists on the master branch of anndata. required: false - type: markdown attributes: value: | **Note**: Please read [this guide][] detailing how to provide the necessary information for us to reproduce your bug. [this guide]: https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports - type: textarea id: Report attributes: label: Report description: | Describe the bug you encountered, and what you were trying to do. Please use [github markdown][] features for readability. [github markdown]: https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax value: | Code: ```python ``` Traceback: ```pytb ``` validations: required: true - type: textarea id: versions attributes: label: Versions description: | Which version of anndata and other related software you used. Please install `session-info2`, run the following command in a notebook, click the “Copy as Markdown” button, then paste the results into the text box below. 
```python In[1]: import anndata, session_info2; session_info2.session_info(dependencies=True) ``` Alternatively, run this in a console: ```python >>> import session_info2; print(session_info2.session_info(dependencies=True)._repr_mimebundle_()["text/markdown"]) ``` render: python validations: required: true python-anndata-0.12.0~rc1/.github/ISSUE_TEMPLATE/config.yml000066400000000000000000000006631500370632200230410ustar00rootroot00000000000000blank_issues_enabled: true contact_links: - name: Scverse Community Forum url: https://discourse.scverse.org/ about: If you have questions about “How to do X”, please ask them here. - name: Blank issue url: https://github.com/scverse/anndata/issues/new about: For things that don't quite fit elsewhere. Please note that other templates should be used in most cases – this is mainly for use by the developers. python-anndata-0.12.0~rc1/.github/ISSUE_TEMPLATE/enhancement-request.yml000066400000000000000000000005421500370632200255430ustar00rootroot00000000000000name: Enhancement request description: Anything you’d like to see in anndata? #title: ... labels: - enhancement - Triage 🩺 #assignees: [] body: - type: textarea id: description attributes: label: | Please describe your wishes and possible alternatives to achieve the desired result. validations: required: true python-anndata-0.12.0~rc1/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000002661500370632200224660ustar00rootroot00000000000000 - [ ] Closes # - [ ] Tests added - [ ] Release note added (or unnecessary) python-anndata-0.12.0~rc1/.github/workflows/000077500000000000000000000000001500370632200207165ustar00rootroot00000000000000python-anndata-0.12.0~rc1/.github/workflows/benchmark.yml000066400000000000000000000026071500370632200234000ustar00rootroot00000000000000name: Benchmark on: push: branches: [main, "[0-9]+.[0-9]+.x"] pull_request: branches: [main] env: FORCE_COLOR: "1" defaults: run: shell: bash -el {0} jobs: benchmark: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: python: ["3.12"] os: [ubuntu-latest] env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} ASV_DIR: "./benchmarks" steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - run: git fetch origin main:main if: ${{ github.ref_name != 'main' }} # Errors on main branch - uses: mamba-org/setup-micromamba@v2 with: environment-name: asv cache-environment: true # Deps documented in https://asv.readthedocs.io/en/latest/installing.html # libmambapy upper bound: https://github.com/airspeed-velocity/asv/issues/1438 create-args: >- python=${{ matrix.python }} asv libmambapy<2 conda-build - name: Cache datasets uses: actions/cache@v4 with: path: | ~/.cache key: benchmark-state-${{ hashFiles('benchmarks/**') }} - name: Quick benchmark run working-directory: ${{ env.ASV_DIR }} run: | asv machine --yes asv run --quick --show-stderr --verbose python-anndata-0.12.0~rc1/.github/workflows/check-pr-milestoned.yml000066400000000000000000000016471500370632200253060ustar00rootroot00000000000000name: Pull Request Validation on: pull_request: branches: - main - master types: # milestone changes - milestoned - demilestoned # label changes for “no milestone” - labeled - unlabeled # initial check - opened - edited - reopened # code change (e.g. this workflow) - synchronize env: LABELS: ${{ join(github.event.pull_request.labels.*.name, '|') }} jobs: check-milestone: name: "Triage: Check Milestone" runs-on: ubuntu-latest steps: - name: Check if merging isn’t blocked uses: flying-sheep/check@v1 with: success: ${{ ! 
contains(env.LABELS, 'DON’T MERGE') }} - name: Check if a milestone is necessary and exists uses: flying-sheep/check@v1 with: success: ${{ github.event.pull_request.milestone != null || contains(env.LABELS, 'no milestone') }} python-anndata-0.12.0~rc1/.github/workflows/close-stale.yml000066400000000000000000000011261500370632200236540ustar00rootroot00000000000000name: "Close stale issues" on: schedule: - cron: "0 2 * * *" workflow_dispatch: jobs: stale: runs-on: ubuntu-latest steps: - uses: actions/stale@v5 with: days-before-issue-stale: -1 # We don't want to mark issues as stale in this action days-before-issue-close: 14 days-before-pr-close: -1 # don't close PRs days-before-pr-stale: -1 # don't mark PRs as stale stale-issue-label: stale any-of-labels: "needs info" debug-only: true # enable dry-run, remove when we know from the logs it's working. python-anndata-0.12.0~rc1/.github/workflows/codespell.yml000066400000000000000000000005631500370632200234170ustar00rootroot00000000000000--- name: Codespell on: push: branches: [main] pull_request: branches: [main] permissions: contents: read jobs: codespell: name: Check for spelling errors runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 filter: blob:none - uses: codespell-project/actions-codespell@v2 python-anndata-0.12.0~rc1/.github/workflows/label-stale.yml000066400000000000000000000014021500370632200236230ustar00rootroot00000000000000name: "Label stale issues" on: schedule: - cron: "30 1 * * 1,2,3,4,5" workflow_dispatch: jobs: stale: runs-on: ubuntu-latest steps: - uses: actions/stale@v5 with: days-before-issue-stale: 60 days-before-pr-stale: -1 # We don't want to mark PRs as stale days-before-close: -1 # We don't want to close issues in this action stale-issue-label: stale exempt-issue-labels: pinned,enhancement stale-issue-message: | This issue has been automatically marked as stale because it has not had recent activity. Please add a comment if you want to keep the issue open. Thank you for your contributions! 
debug-only: false # set to `true` to enable dry-run python-anndata-0.12.0~rc1/.github/workflows/publish.yml000066400000000000000000000010551500370632200231100ustar00rootroot00000000000000name: Publish Python Package on: release: types: [published] jobs: publish: runs-on: ubuntu-latest environment: pypi permissions: id-token: write # to authenticate as Trusted Publisher to pypi.org steps: - uses: actions/checkout@v4 with: fetch-depth: 0 filter: blob:none - uses: actions/setup-python@v5 with: python-version: "3.x" cache: "pip" - run: pip install build - run: python -m build - uses: pypa/gh-action-pypi-publish@release/v1 python-anndata-0.12.0~rc1/.github/workflows/test-cpu.yml000066400000000000000000000067021500370632200232120ustar00rootroot00000000000000name: CI on: push: branches: - main - "[0-9]+.[0-9]+.x" pull_request: env: PYTEST_ADDOPTS: "-v --color=yes" FORCE_COLOR: "1" defaults: run: shell: bash -el {0} # Cancel the job if new commits are pushed # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: pytest: runs-on: ubuntu-latest strategy: matrix: include: - python-version: '3.13' test-type: coverage - python-version: '3.11' test-type: standard - python-version: '3.13' dependencies-version: pre-release test-type: strict-warning - python-version: '3.11' dependencies-version: minimum test-type: coverage steps: - uses: actions/checkout@v4 with: fetch-depth: 0 filter: blob:none - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install UV uses: astral-sh/setup-uv@v5 with: enable-cache: true cache-dependency-glob: pyproject.toml - name: Install dependencies if: matrix.dependencies-version == null run: uv pip install --system --compile "anndata[dev,test-full] @ ." -c ci/constraints.txt - name: Install minimum dependencies if: matrix.dependencies-version == 'minimum' run: | uv pip install --system --compile tomli packaging deps=$(python3 ci/scripts/min-deps.py pyproject.toml --extra dev test) uv pip install --system --compile $deps "anndata @ ." - name: Install dependencies release candidates if: matrix.dependencies-version == 'pre-release' run: uv pip install -v --system --compile --pre "anndata[dev,test-full] @ ." -c ci/constraints.txt - name: Display installed versions run: uv pip list - name: Run Pytest if: matrix.test-type == 'standard' run: pytest -n auto - name: Run Pytest (coverage) if: matrix.test-type == 'coverage' run: coverage run -m pytest -n auto --cov --cov-report=xml - name: Run Pytest (treat warnings as errors) if: matrix.test-type == 'strict-warning' run: pytest --strict-warnings -n auto - uses: codecov/codecov-action@v4 if: matrix.test-type == 'coverage' with: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true files: test-data/coverage.xml check-build: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: fetch-depth: 0 filter: blob:none - name: Set up Python 3.13 uses: actions/setup-python@v5 with: python-version: '3.13' - name: Install build tools and requirements run: | python -m pip install --upgrade pip pip install build twine - name: Display installed versions run: pip list - name: Build & Twine check run: | python -m build --sdist --wheel . 
twine check dist/* - name: Check runtime version run: | pip install dist/*.whl python -c 'import anndata; print(anndata.__version__)' python-anndata-0.12.0~rc1/.github/workflows/test-gpu.yml000066400000000000000000000063371500370632200232220ustar00rootroot00000000000000name: AWS GPU on: push: branches: [main, "[0-9]+.[0-9]+.x"] pull_request: types: - labeled - opened - synchronize env: PYTEST_ADDOPTS: "-v --color=yes" FORCE_COLOR: "1" defaults: run: shell: bash -el {0} # Cancel the job if new commits are pushed # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true # There are two jobs: # 1. `check` determines if the second job (`test`) will be run (through a job dependency). # 2. `test` runs on an AWS runner and executes the GPU tests. jobs: # If the `skip-gpu-ci` label is set, this job is skipped, and consequently the `test` job too. # If the `run-gpu-ci` label is set or we reacted to a `push` event, this job succeeds (and `test` is run). # If neither is set, this job fails, `test` is skipped, and the whole workflow fails. check: name: "Triage: Check if GPU tests are allowed to run" if: (!contains(github.event.pull_request.labels.*.name, 'skip-gpu-ci')) runs-on: ubuntu-latest steps: - uses: flying-sheep/check@v1 with: success: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'run-gpu-ci') }} # If `check` wasn’t skipped or failed, start an AWS runner and run the GPU tests on it. test: name: GPU Tests needs: check runs-on: "cirun-aws-gpu--${{ github.run_id }}" # Setting a timeout of 30 minutes, as the AWS costs money # At time of writing, a typical run takes about 5 minutes timeout-minutes: 30 steps: - uses: actions/checkout@v4 with: fetch-depth: 0 filter: blob:none - name: Nvidia SMI sanity check run: nvidia-smi - name: Install yq run: | sudo snap install yq - name: Extract max Python version from classifiers run: | classifiers=$(yq .project.classifiers pyproject.toml -oy | grep --only-matching --perl-regexp '(?<=Python :: )(\d\.\d+)') max_version=$(echo "$classifiers" | sort -V | tail -1) echo "max_python_version=$max_version" >> $GITHUB_ENV - name: Install Python uses: actions/setup-python@v5 with: # https://github.com/cupy/cupy/issues/8651 cupy does not support python3.13 yet python-version: "3.12" - name: Install UV uses: astral-sh/setup-uv@v5 with: enable-cache: true cache-dependency-glob: pyproject.toml - name: Install AnnData run: uv pip install --system -e ".[dev,test,cu12]" -c ci/constraints.txt - name: Env list run: pip list - name: Run test run: coverage run -m pytest -m gpu -n auto --cov --cov-report=xml - uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true files: test-data/coverage.xml - name: Remove 'run-gpu-ci' Label if: always() uses: actions-ecosystem/action-remove-labels@v1 with: labels: "run-gpu-ci" github_token: ${{ secrets.GITHUB_TOKEN }} python-anndata-0.12.0~rc1/.gitignore000066400000000000000000000007011500370632200173070ustar00rootroot00000000000000# Temp files .DS_Store *~ # Caches for compiled and downloaded files __pycache__/ /*cache/ /node_modules/ /data/ /venv/ # Distribution / packaging /dist/ /ci/min-deps.txt /requirements*.lock /.python-version # Test results (nunit/junit) and coverage /test-data/ /*coverage* # jupyter .ipynb_checkpoints # docs /docs/generated/ /docs/_build/ # IDEs 
/.idea/ # Benchmark .asv benchmark/benchmarks/data benchmarks/benchmarks/data benchmarks/pkgs python-anndata-0.12.0~rc1/.gitmodules000066400000000000000000000001741500370632200175000ustar00rootroot00000000000000[submodule "docs/tutorials/notebooks"] path = docs/tutorials/notebooks url = https://github.com/scverse/anndata-tutorials python-anndata-0.12.0~rc1/.pre-commit-config.yaml000066400000000000000000000016111500370632200216010ustar00rootroot00000000000000repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.11.4 hooks: - id: ruff args: ["--fix"] - id: ruff-format - repo: https://github.com/biomejs/pre-commit rev: v1.9.4 hooks: - id: biome-format - repo: https://github.com/ComPWA/taplo-pre-commit rev: v0.9.3 hooks: - id: taplo-format - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-added-large-files - id: check-case-conflict - id: check-toml - id: check-yaml - id: check-merge-conflict - id: detect-private-key - id: no-commit-to-branch args: ["--branch=main"] - repo: https://github.com/codespell-project/codespell rev: v2.4.1 hooks: - id: codespell additional_dependencies: - tomli python-anndata-0.12.0~rc1/.prettierignore000066400000000000000000000000051500370632200203570ustar00rootroot00000000000000*.md python-anndata-0.12.0~rc1/.prettierrc.yaml000066400000000000000000000002571500370632200204520ustar00rootroot00000000000000overrides: # JSON with comments and trailing commas - files: benchmarks/asv.conf.json options: parser: json5 quoteProps: preserve singleQuote: false python-anndata-0.12.0~rc1/.readthedocs.yml000066400000000000000000000011761500370632200204140ustar00rootroot00000000000000version: 2 build: os: ubuntu-20.04 tools: python: "3.13" jobs: post_checkout: # unshallow so version can be derived from tag - git fetch --unshallow || true pre_build: # run towncrier to preview the next version’s release notes - ( find docs/release-notes -regex '[^.]+[.][^.]+.md' | grep -q . ) && towncrier build --keep || true sphinx: configuration: docs/conf.py fail_on_warning: true # do not change or you will be fired python: install: - method: pip path: . 
extra_requirements: - doc submodules: include: - "docs/tutorials/notebooks" recursive: true python-anndata-0.12.0~rc1/.taplo.toml000066400000000000000000000001521500370632200174110ustar00rootroot00000000000000[formatting] array_auto_collapse = false column_width = 120 compact_arrays = false indent_string = ' ' python-anndata-0.12.0~rc1/.vscode/000077500000000000000000000000001500370632200166625ustar00rootroot00000000000000python-anndata-0.12.0~rc1/.vscode/launch.json000066400000000000000000000014011500370632200210230ustar00rootroot00000000000000{ "version": "0.2.0", "configurations": [ { "name": "Python: Build Docs", "type": "debugpy", "request": "launch", "module": "sphinx", "args": ["-M", "html", ".", "_build"], "cwd": "${workspaceFolder}/docs", "console": "internalConsole", "justMyCode": false, }, { "name": "Python: Debug Test", "type": "debugpy", "request": "launch", "program": "${file}", "purpose": ["debug-test"], "console": "internalConsole", "justMyCode": false, "env": { "PYTEST_ADDOPTS": "--color=yes" }, "presentation": { "hidden": true }, }, ], } python-anndata-0.12.0~rc1/.vscode/settings.json000066400000000000000000000013501500370632200214140ustar00rootroot00000000000000{ "[python][toml][json][jsonc]": { "editor.formatOnSave": true, "editor.codeActionsOnSave": { "source.organizeImports": "explicit", "source.fixAll": "explicit", }, }, "[python]": { "editor.defaultFormatter": "charliermarsh.ruff", }, "[toml]": { "editor.defaultFormatter": "tamasfe.even-better-toml", }, "[json][jsonc]": { "editor.defaultFormatter": "biomejs.biome", }, "python.analysis.typeCheckingMode": "basic", "python.testing.pytestEnabled": true, "python.testing.pytestArgs": [ "--color=yes", "-vv", "--strict-warnings", //"-nauto", ], "python.terminal.activateEnvironment": true, } python-anndata-0.12.0~rc1/LICENSE000066400000000000000000000030471500370632200163320ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2025, scverse® Copyright (c) 2017-2018, P. Angerer, F. Alexander Wolf, Theis Lab All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
python-anndata-0.12.0~rc1/README.md000066400000000000000000000106711500370632200166050ustar00rootroot00000000000000[![Tests](https://github.com/scverse/anndata/actions/workflows/test-cpu.yml/badge.svg)](https://github.com/scverse/anndata/actions) [![Conda](https://img.shields.io/conda/vn/conda-forge/anndata.svg)](https://anaconda.org/conda-forge/anndata) [![Coverage](https://codecov.io/gh/scverse/anndata/branch/main/graph/badge.svg?token=IN1mJN1Wi8)](https://codecov.io/gh/scverse/anndata) [![Docs](https://readthedocs.com/projects/icb-anndata/badge/?version=latest)](https://anndata.readthedocs.io) [![PyPI](https://img.shields.io/pypi/v/anndata.svg)](https://pypi.org/project/anndata) [![Downloads](https://static.pepy.tech/badge/anndata/month)](https://pepy.tech/project/anndata) [![Downloads](https://static.pepy.tech/badge/anndata)](https://pepy.tech/project/anndata) [![Stars](https://img.shields.io/github/stars/scverse/anndata?style=flat&logo=github&color=yellow)](https://github.com/scverse/anndata/stargazers) [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](http://numfocus.org) image # anndata - Annotated data anndata is a Python package for handling annotated data matrices in memory and on disk, positioned between pandas and xarray. anndata offers a broad range of computationally efficient features including, among others, sparse data support, lazy operations, and a PyTorch interface. - Discuss development on [GitHub](https://github.com/scverse/anndata). - Read the [documentation](https://anndata.readthedocs.io). - Ask questions on the [scverse Discourse](https://discourse.scverse.org). - Install via `pip install anndata` or `conda install anndata -c conda-forge`. - See [Scanpy's documentation](https://scanpy.readthedocs.io/) for usage related to single cell data. anndata was initially built for Scanpy. [//]: # (numfocus-fiscal-sponsor-attribution) anndata is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
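To give a quick sense of the annotated-matrix model described above, here is a minimal usage sketch. It is not taken from the project's documentation: the toy dimensions, column names, and file name are illustrative assumptions, and only the stable public API (`anndata.AnnData`, `AnnData.write_h5ad`, `anndata.read_h5ad`) is used.

```python
# Minimal sketch: an annotated data matrix X with aligned obs/var metadata.
import anndata as ad
import numpy as np
import pandas as pd
from scipy import sparse

n_obs, n_var = 100, 50
# Sparse expression matrix (anndata supports sparse X natively).
X = sparse.random(n_obs, n_var, density=0.1, format="csr")
obs = pd.DataFrame(
    {"cell_type": np.random.choice(["B", "T"], n_obs)},
    index=[f"cell{i}" for i in range(n_obs)],
)
var = pd.DataFrame(index=[f"gene{i}" for i in range(n_var)])

adata = ad.AnnData(X, obs=obs, var=var)

# Subsetting keeps X and its annotations aligned; the result is a lightweight view.
t_cells = adata[adata.obs["cell_type"] == "T"]

# Round-trip through anndata's native on-disk format.
adata.write_h5ad("example.h5ad")
adata_again = ad.read_h5ad("example.h5ad")
```

The subset above is a view rather than a copy, one of the lazy, memory-efficient behaviors the feature list refers to.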
## Public API Our public API is documented in the [API section][] of these docs. We cannot guarantee the stability of our internal APIs, whether it's the location of a function, its arguments, or something else. In other words, we do not officially support (or encourage users to do) something like `from anndata._core import AnnData` as `_core` is both not documented and contains a [leading underscore][]. However, we are aware that [many users do use these internal APIs][] and thus encourage them to [open an issue][] or migrate to the public API. That is, if something is missing from our public API as documented, for example a feature you wish to be exported publicly, please open an issue. [api section]: https://anndata.readthedocs.io/en/stable/api.html [leading underscore]: https://peps.python.org/pep-0008/#public-and-internal-interfaces [many users do use these internal APIs]: https://github.com/search?q=%22anndata._io%22&type=code [open an issue]: https://github.com/scverse/anndata/issues/new/choose ## Citation If you use `anndata` in your work, please cite the `anndata` publication as follows: > **anndata: Annotated data** > > Isaac Virshup, Sergei Rybakov, Fabian J. Theis, Philipp Angerer, F. Alexander Wolf > > _JOSS_ 2024 Sep 16. doi: [10.21105/joss.04371](https://doi.org/10.21105/joss.04371). You can cite the scverse publication as follows: > **The scverse project provides a computational ecosystem for single-cell omics data analysis** > > Isaac Virshup, Danila Bredikhin, Lukas Heumos, Giovanni Palla, Gregor Sturm, Adam Gayoso, Ilia Kats, Mikaela Koutrouli, Scverse Community, Bonnie Berger, Dana Pe’er, Aviv Regev, Sarah A. Teichmann, Francesca Finotello, F. Alexander Wolf, Nir Yosef, Oliver Stegle & Fabian J. Theis > > _Nat Biotechnol._ 2023 Apr 10. doi: [10.1038/s41587-023-01733-8](https://doi.org/10.1038/s41587-023-01733-8). python-anndata-0.12.0~rc1/benchmarks/000077500000000000000000000000001500370632200174365ustar00rootroot00000000000000python-anndata-0.12.0~rc1/benchmarks/README.md000066400000000000000000000074331500370632200207240ustar00rootroot00000000000000# AnnData Benchmarks This repo contains some work-in-progress benchmarks for [AnnData](https://github.com/theislab/anndata) using [asv](https://asv.readthedocs.io). ## Setup I definitely recommend reading through the asv docs. Currently, this assumes the benchmark suite can reach the `anndata` repo via the path `../anndata`. Otherwise, all you'll need to do is create a [machine file](https://asv.readthedocs.io/en/stable/commands.html#asv-machine) for your system and make sure `anndata`'s dependencies are installable via `conda`. ### Data Data will need to be retrieved for these benchmarks. This can be downloaded using the script fetch_datasets.py. Note that the `h5ad` format has changed since its inception. While the `anndata` package maintains backwards compatibility, older versions of `anndata` will not be able to read files written by more recent versions. To get around this for the benchmarks, datasets have to be readable by all versions, which can require a setup function that creates the anndata object. ## Usage ### Running the benchmarks: To run benchmarks for a particular commit: `asv run {commit} --steps 1 -b` To run benchmarks for a range of commits: `asv run {commit1}..{commit2}` You can filter which benchmarks are run with the `-b {pattern}` flag. ### Accessing the benchmarks You can see what benchmarks you've already run using `asv show`. 
If you don't specify a commit, it will search for the available commits. If you specify a commit it'll show you those results. For example: ```bash $ asv show -b "views" Commits with results: Machine : mimir.mobility.unimelb.net.au Environment: conda-py3.7-h5py-memory_profiler-natsort-numpy-pandas-scipy 61eb5bb7 e9ccfc33 22f12994 0ebe187e ``` ```bash $ asv show -b "views" 0ebe187e Commit: 0ebe187e views.SubsetMemorySuite.track_repeated_subset_memratio [mimir.mobility.unimelb.net.au/conda-py3.7-h5py-memory_profiler-natsort-numpy-pandas-scipy] ok ======= ======= ========== ============ ===================== ====================== ====================== -- index_kind --------------------------------------- ------------------------------------------------------------------- n_obs n_var attr_set subset_dim intarray boolarray slice ======= ======= ========== ============ ===================== ====================== ====================== 100 100 X-csr obs 2.84 1.7916666666666667 0.5 100 100 X-csr var 2.5357142857142856 1.8695652173913044 0.5652173913043478 100 100 X-dense obs 3.1739130434782608 1.6538461538461537 0.6 ... ``` You can compare two commits with `asv compare` ```bash $ asv compare e9ccfc 0ebe187e All benchmarks: before after ratio [e9ccfc33] [0ebe187e] - 2.16 1.7916666666666667 0.83 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'boolarray') + 2.533333333333333 2.84 1.12 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'intarray') - 1.1923076923076923 0.5 0.42 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'slice') 1.9615384615384615 1.8695652173913044 0.95 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'var', 'boolarray') ``` ### View in the browser: You can view the benchmarks in the browser with `asv publish` followed by `asv preview`. If you want to include benchmarks of a local branch, I think you'll have to add that branch to the `"branches"` list in `asv.conf.json`. python-anndata-0.12.0~rc1/benchmarks/asv.conf.json000066400000000000000000000153751500370632200220610ustar00rootroot00000000000000{ // The version of the config file format. Do not change, unless // you know what you are doing. "version": 1, // The name of the project being benchmarked "project": "anndata", // The project's homepage "project_url": "https://anndata.readthedocs.io/", // The URL or local path of the source code repository for the // project being benchmarked "repo": "../../anndata", // The Python project's subdirectory in your repo. If missing or // the empty string, the project is assumed to be located at the root // of the repository. // "repo_subdir": "", // Customizable commands for building, installing, and // uninstalling the project. See asv.conf.json documentation. // // "install_command": ["python -mpip install {wheel_file}"], // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], "build_command": [ "python -m pip install build", "python -m build --wheel -o {build_cache_dir} {build_dir}", ], // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). "branches": ["main"], // for git // "branches": ["default"], // for mercurial // The DVCS being used. If not set, it will be automatically // determined from "repo" by looking at the protocol in the URL // (if remote), or by looking for special directories, such as // ".git" (if local). 
"dvcs": "git", // The tool to use to create environments. May be "conda", // "virtualenv" or other value depending on the plugins in use. // If missing or the empty string, the tool will be automatically // determined by looking for tools on the PATH environment // variable. "environment_type": "mamba", // timeout in seconds for installing any dependencies in environment // defaults to 10 min //"install_timeout": 600, // the base URL to show a commit for the project. "show_commit_url": "https://github.com/theislab/anndata/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. // "pythons": ["2.7", "3.6"], // The list of conda channel names to be searched for benchmark // dependency packages in the specified order "conda_channels": ["conda-forge", "defaults"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty // list or empty string indicates to just test against the default // (latest) version. null indicates that the package is to not be // installed. If the package to be tested is only available from // PyPi, and the 'environment_type' is conda, then you can preface // the package name by 'pip+', and the package will be installed via // pip (with all the conda available packages installed first, // followed by the pip installed packages). // "matrix": { "numpy": [""], // "scipy": ["1.2", ""], "scipy": [""], "h5py": [""], "natsort": [""], "pandas": [""], "memory_profiler": [""], "zarr": [""], "pytoml": [""], "pytest": [""], "pooch": [""], // "scanpy": [""], // "psutil": [""] }, // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. // // An exclude entry excludes entries where all values match. The // values are regexps that should match the whole string. // // An include entry adds an environment. Only the packages listed // are installed. The 'python' key is required. The exclude rules // do not apply to includes. // // In addition to package names, the following keys are available: // // - python // Python version, as in the *pythons* variable above. // - environment_type // Environment type, as above. // - sys_platform // Platform, as in sys.platform. Possible values for the common // cases: 'linux2', 'win32', 'cygwin', 'darwin'. // // "exclude": [ // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows // {"environment_type": "conda", "six": null}, // don't run without six on conda // ], // // "include": [ // // additional env for python2.7 // {"python": "2.7", "numpy": "1.8"}, // // additional env if run on windows+conda // {"platform": "win32", "environment_type": "mamba", "python": "2.7", "libpython": ""}, // ], // The directory (relative to the current directory) that benchmarks are // stored in. If not provided, defaults to "benchmarks" // "benchmark_dir": "benchmarks", // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" "env_dir": ".asv/env", // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". "results_dir": ".asv/results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". 
"html_dir": ".asv/html", // The number of characters to retain in the commit hashes. // "hash_length": 8, // `asv` will cache results of the recent builds in each // environment, making them faster to install next time. This is // the number of builds to keep, per environment. // "build_cache_size": 2, // The commits after which the regression search in `asv publish` // should start looking for regressions. Dictionary whose keys are // regexps matching to benchmark names, and values corresponding to // the commit (exclusive) after which to start looking for // regressions. The default is to start from the first commit // with results. If the commit is `null`, regression detection is // skipped for the matching benchmark. // // "regressions_first_commits": { // "some_benchmark": "352cdf", // Consider regressions only after this commit // "another_benchmark": null, // Skip regression detection altogether // }, // The thresholds for relative change in results, after which `asv // publish` starts reporting regressions. Dictionary of the same // form as in ``regressions_first_commits``, with values // indicating the thresholds. If multiple entries match, the // maximum is taken. If no entry matches, the default is 5%. // // "regressions_thresholds": { // "some_benchmark": 0.01, // Threshold of 1% // "another_benchmark": 0.5, // Threshold of 50% // }, } python-anndata-0.12.0~rc1/benchmarks/benchmarks/000077500000000000000000000000001500370632200215535ustar00rootroot00000000000000python-anndata-0.12.0~rc1/benchmarks/benchmarks/__init__.py000066400000000000000000000000001500370632200236520ustar00rootroot00000000000000python-anndata-0.12.0~rc1/benchmarks/benchmarks/anndata.py000066400000000000000000000023771500370632200235440ustar00rootroot00000000000000from __future__ import annotations import tracemalloc import numpy as np from .utils import gen_adata class GarbargeCollectionSuite: runs = 10 # custom because `memory_profiler` is a line-by-line profiler (also: https://github.com/pythonprofilers/memory_profiler/issues/402) def track_peakmem_garbage_collection(self, *_): def display_top(snapshot, key_type="lineno"): snapshot = snapshot.filter_traces( ( tracemalloc.Filter( inclusive=False, filename_pattern="", ), tracemalloc.Filter( inclusive=False, filename_pattern="", ), ) ) top_stats = snapshot.statistics(key_type) total = sum(stat.size for stat in top_stats) return total total = np.zeros(self.runs) tracemalloc.start() for i in range(self.runs): data = gen_adata(10000, 10000, "X-csc") # noqa: F841 snapshot = tracemalloc.take_snapshot() total[i] = display_top(snapshot) tracemalloc.stop() return max(total) python-anndata-0.12.0~rc1/benchmarks/benchmarks/readwrite.py000066400000000000000000000135161500370632200241210ustar00rootroot00000000000000""" This module will benchmark io of AnnData objects Things to test: * Read time, write time * Peak memory during io * File sizes Parameterized by: * What method is being used * What data is being included * Size of data being used Also interesting: * io for views * io for backed objects * Reading dense as sparse, writing sparse as dense """ from __future__ import annotations import sys import tempfile from pathlib import Path import numpy as np import pooch from memory_profiler import memory_usage # from . 
import datasets import anndata from .utils import get_actualsize, get_peak_mem, sedate PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad" # PBMC_3K_PATH = Path(__file__).parent / "data/pbmc3k_raw.h5ad" # PBMC_REDUCED_PATH = Path(__file__).parent / "10x_pbmc68k_reduced.h5ad" # BM_43K_CSR_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells.h5ad" # BM_43K_CSC_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells_CSC.h5ad" # class ZarrReadSuite: # params = [] # param_names = ["input_url"] # def setup(self, input_url): # self.filepath = pooch.retrieve(url=input_url, known_hash=None) # def time_read_full(self, input_url): # anndata.read_zarr(self.filepath) # def peakmem_read_full(self, input_url): # anndata.read_zarr(self.filepath) # def mem_readfull_object(self, input_url): # return anndata.read_zarr(self.filepath) # def track_read_full_memratio(self, input_url): # mem_recording = memory_usage( # (sedate(anndata.read_zarr, 0.005), (self.filepath,)), interval=0.001 # ) # adata = anndata.read_zarr(self.filepath) # base_size = mem_recording[-1] - mem_recording[0] # print(np.max(mem_recording) - np.min(mem_recording)) # print(base_size) # return (np.max(mem_recording) - np.min(mem_recording)) / base_size # def peakmem_read_backed(self, input_url): # anndata.read_zarr(self.filepath, backed="r") # def mem_read_backed_object(self, input_url): # return anndata.read_zarr(self.filepath, backed="r") class H5ADInMemorySizeSuite: _urls = dict(pbmc3k=PBMC_3K_URL) params = _urls.keys() param_names = ["input_data"] def setup(self, input_data: str): self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None) def track_in_memory_size(self, *_): adata = anndata.read_h5ad(self.filepath) adata_size = sys.getsizeof(adata) return adata_size def track_actual_in_memory_size(self, *_): adata = anndata.read_h5ad(self.filepath) adata_size = get_actualsize(adata) return adata_size class H5ADReadSuite: _urls = dict(pbmc3k=PBMC_3K_URL) params = _urls.keys() param_names = ["input_data"] def setup(self, input_data: str): self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None) def time_read_full(self, *_): anndata.read_h5ad(self.filepath) def peakmem_read_full(self, *_): anndata.read_h5ad(self.filepath) def mem_readfull_object(self, *_): return anndata.read_h5ad(self.filepath) def track_read_full_memratio(self, *_): mem_recording = memory_usage( (sedate(anndata.read_h5ad, 0.005), (self.filepath,)), interval=0.001 ) # adata = anndata.read_h5ad(self.filepath) base_size = mem_recording[-1] - mem_recording[0] print(np.max(mem_recording) - np.min(mem_recording)) print(base_size) return (np.max(mem_recording) - np.min(mem_recording)) / base_size def peakmem_read_backed(self, *_): anndata.read_h5ad(self.filepath, backed="r") # causes benchmarking to break from: https://github.com/pympler/pympler/issues/151 # def mem_read_backed_object(self, *_): # return anndata.read_h5ad(self.filepath, backed="r") class H5ADWriteSuite: _urls = dict(pbmc3k=PBMC_3K_URL) params = _urls.keys() param_names = ["input_data"] def setup(self, input_data: str): mem_recording, adata = memory_usage( ( sedate(anndata.read_h5ad, 0.005), (pooch.retrieve(self._urls[input_data], known_hash=None),), ), retval=True, interval=0.001, ) self.adata = adata self.base_size = mem_recording[-1] - mem_recording[0] self.tmpdir = tempfile.TemporaryDirectory() self.writepth = Path(self.tmpdir.name) / "out.h5ad" def teardown(self, *_): self.tmpdir.cleanup() def time_write_full(self, *_): 
self.adata.write_h5ad(self.writepth, compression=None) def peakmem_write_full(self, *_): self.adata.write_h5ad(self.writepth) def track_peakmem_write_full(self, *_): return get_peak_mem((sedate(self.adata.write_h5ad), (self.writepth,))) def time_write_compressed(self, *_): self.adata.write_h5ad(self.writepth, compression="gzip") def peakmem_write_compressed(self, *_): self.adata.write_h5ad(self.writepth, compression="gzip") def track_peakmem_write_compressed(self, *_): return get_peak_mem( (sedate(self.adata.write_h5ad), (self.writepth,), {"compression": "gzip"}) ) class H5ADBackedWriteSuite(H5ADWriteSuite): _urls = dict(pbmc3k=PBMC_3K_URL) params = _urls.keys() param_names = ["input_data"] def setup(self, input_data): mem_recording, adata = memory_usage( ( sedate(anndata.read_h5ad, 0.005), (pooch.retrieve(self._urls[input_data], known_hash=None),), {"backed": "r"}, ), retval=True, interval=0.001, ) self.adata = adata self.base_size = mem_recording[-1] - mem_recording[0] self.tmpdir = tempfile.TemporaryDirectory() self.writepth = Path(self.tmpdir.name) / "out.h5ad" python-anndata-0.12.0~rc1/benchmarks/benchmarks/sparse_dataset.py000066400000000000000000000030021500370632200251220ustar00rootroot00000000000000from __future__ import annotations import numpy as np import zarr from scipy import sparse from anndata import AnnData from anndata._core.sparse_dataset import sparse_dataset from anndata._io.specs import write_elem def make_alternating_mask(n): mask_alternating = np.ones(10_000, dtype=bool) for i in range(0, 10_000, n): mask_alternating[i] = False return mask_alternating class SparseCSRContiguousSlice: _slices = { "0:1000": slice(0, 1000), "0:9000": slice(0, 9000), ":9000:-1": slice(None, 9000, -1), "::-2": slice(None, None, 2), "array": np.array([0, 5000, 9999]), "arange": np.arange(0, 1000), "first": 0, "alternating": make_alternating_mask(10), } params = ( [ (10_000, 10_000), # (10_000, 500) ], _slices.keys(), ) param_names = ["shape", "slice"] def setup(self, shape: tuple[int, int], slice: str): X = sparse.random( *shape, density=0.01, format="csr", random_state=np.random.default_rng(42) ) self.slice = self._slices[slice] g = zarr.group() write_elem(g, "X", X) self.x = sparse_dataset(g["X"]) self.adata = AnnData(self.x) def time_getitem(self, *_): self.x[self.slice] def peakmem_getitem(self, *_): self.x[self.slice] def time_getitem_adata(self, *_): self.adata[self.slice] def peakmem_getitem_adata(self, *_): self.adata[self.slice] python-anndata-0.12.0~rc1/benchmarks/benchmarks/utils.py000066400000000000000000000074021500370632200232700ustar00rootroot00000000000000from __future__ import annotations import gc import sys from string import ascii_lowercase from time import sleep import numpy as np import pandas as pd from memory_profiler import memory_usage from scipy import sparse from anndata import AnnData def get_actualsize(input_obj): """Using Python Garbage Collector to calculate the size of all elements attached to an object""" memory_size = 0 ids = set() objects = [input_obj] while objects: new = [] for obj in objects: if id(obj) not in ids: ids.add(id(obj)) memory_size += sys.getsizeof(obj) new.append(obj) objects = gc.get_referents(*new) return memory_size def get_anndata_memsize(adata): recording = memory_usage( (sedate(adata.copy, naplength=0.005), (adata,)), interval=0.001 ) diff = recording[-1] - recording[0] return diff def get_peak_mem(op, interval=0.001): recording = memory_usage(op, interval=interval) return np.max(recording) - np.min(recording) def sedate(func, 
naplength=0.05): """Make a function sleepy, so we can sample the start and end state.""" def wrapped_function(*args, **kwargs): sleep(naplength) val = func(*args, **kwargs) sleep(naplength) return val return wrapped_function # TODO: Factor out the time it takes to generate these def gen_indexer(adata, dim, index_kind, ratio): dimnames = ("obs", "var") index_kinds = {"slice", "intarray", "boolarray", "strarray"} if index_kind not in index_kinds: msg = f"Argument 'index_kind' must be one of {index_kinds}. Was {index_kind}." raise ValueError(msg) axis = dimnames.index(dim) subset = [slice(None), slice(None)] axis_size = adata.shape[axis] if index_kind == "slice": subset[axis] = slice(0, int(np.round(axis_size * ratio))) elif index_kind == "intarray": subset[axis] = np.random.choice( np.arange(axis_size), int(np.round(axis_size * ratio)), replace=False ) subset[axis].sort() elif index_kind == "boolarray": pos = np.random.choice( np.arange(axis_size), int(np.round(axis_size * ratio)), replace=False ) a = np.zeros(axis_size, dtype=bool) a[pos] = True subset[axis] = a elif index_kind == "strarray": subset[axis] = np.random.choice( getattr(adata, dim).index, int(np.round(axis_size * ratio)), replace=False ) else: raise ValueError() return tuple(subset) def take_view(adata, *, dim, index_kind, ratio=0.5, nviews=100): subset = gen_indexer(adata, dim, index_kind, ratio) views = [] for i in range(nviews): views.append(adata[subset]) def take_repeated_view(adata, *, dim, index_kind, ratio=0.9, nviews=10): v = adata views = [] for i in range(nviews): subset = gen_indexer(v, dim, index_kind, ratio) v = v[subset] views.append(v) def gen_adata(n_obs, n_var, attr_set): if "X-csr" in attr_set: X = sparse.random(n_obs, n_var, density=0.1, format="csr") elif "X-dense" in attr_set: X = sparse.random(n_obs, n_var, density=0.1, format="csr") X = X.toarray() else: # TODO: There's probably a better way to do this X = sparse.random(n_obs, n_var, density=0, format="csr") adata = AnnData(X) if "obs,var" in attr_set: adata.obs = pd.DataFrame( {k: np.random.randint(0, 100, n_obs) for k in ascii_lowercase}, index=[f"cell{i}" for i in range(n_obs)], ) adata.var = pd.DataFrame( {k: np.random.randint(0, 100, n_var) for k in ascii_lowercase}, index=[f"gene{i}" for i in range(n_var)], ) return adata python-anndata-0.12.0~rc1/biome.jsonc000066400000000000000000000007771500370632200174650ustar00rootroot00000000000000{ "$schema": "https://biomejs.dev/schemas/1.9.4/schema.json", "formatter": { "useEditorconfig": true }, "overrides": [ { "include": ["./.vscode/*.json", "**/*.jsonc", "**/asv.conf.json"], "json": { "formatter": { "trailingCommas": "all", }, "parser": { "allowComments": true, "allowTrailingCommas": true, }, }, }, ], } python-anndata-0.12.0~rc1/ci/000077500000000000000000000000001500370632200157145ustar00rootroot00000000000000python-anndata-0.12.0~rc1/ci/constraints.txt000066400000000000000000000000141500370632200210170ustar00rootroot00000000000000numba>=0.56 python-anndata-0.12.0~rc1/ci/scripts/000077500000000000000000000000001500370632200174035ustar00rootroot00000000000000python-anndata-0.12.0~rc1/ci/scripts/min-deps.py000077500000000000000000000121421500370632200214740ustar00rootroot00000000000000#!/usr/bin/env python3 # /// script # dependencies = [ # "tomli; python_version < '3.11'", # "packaging", # ] # /// from __future__ import annotations import argparse import sys import tomllib from collections import deque from contextlib import ExitStack from functools import cached_property from pathlib import Path from 
typing import TYPE_CHECKING from packaging.requirements import Requirement from packaging.version import Version if TYPE_CHECKING: from collections.abc import Generator, Iterable, Sequence from collections.abc import Set as AbstractSet from typing import Any, Self def min_dep(req: Requirement) -> Requirement: """ Given a requirement, return the minimum version specifier. Example ------- >>> min_dep(Requirement("numpy>=1.0")) >>> min_dep(Requirement("numpy<3.0")) """ req_name = req.name if req.extras: req_name = f"{req_name}[{','.join(req.extras)}]" filter_specs = [ spec for spec in req.specifier if spec.operator in {"==", "~=", ">=", ">"} ] if not filter_specs: # TODO: handle markers return Requirement(f"{req_name}{req.specifier}") min_version = Version("0.0.0.a1") for spec in filter_specs: if spec.operator in {">", ">=", "~="}: min_version = max(min_version, Version(spec.version)) elif spec.operator == "==": min_version = Version(spec.version) return Requirement(f"{req_name}=={min_version}.*") def extract_min_deps( dependencies: Iterable[Requirement], *, pyproject ) -> Generator[Requirement, None, None]: dependencies = deque(dependencies) # We'll be mutating this project_name = pyproject["project"]["name"] while len(dependencies) > 0: req = dependencies.pop() # If we are referring to other optional dependency lists, resolve them if req.name == project_name: assert req.extras, ( f"Project included itself as dependency, without specifying extras: {req}" ) for extra in req.extras: extra_deps = pyproject["project"]["optional-dependencies"][extra] dependencies += map(Requirement, extra_deps) else: yield min_dep(req) class Args(argparse.Namespace): """\ Parse a pyproject.toml file and output a list of minimum dependencies. Output is optimized for `[uv] pip install` (see `-o`/`--output` for details). """ _path: Path output: Path | None _extras: list[str] _all_extras: bool @classmethod def parse(cls, argv: Sequence[str] | None = None) -> Self: return cls.parser().parse_args(argv, cls()) @classmethod def parser(cls) -> argparse.ArgumentParser: parser = argparse.ArgumentParser( prog="min-deps", description=cls.__doc__, usage="pip install `python min-deps.py pyproject.toml`", ) parser.add_argument( "_path", metavar="pyproject.toml", type=Path, help="Path to pyproject.toml to parse minimum dependencies from", ) parser.add_argument( "--extras", dest="_extras", metavar="EXTRA", type=str, nargs="*", default=(), help="extras to install", ) parser.add_argument( "--all-extras", dest="_all_extras", action="store_true", help="get all extras", ) parser.add_argument( *("--output", "-o"), metavar="FILE", type=Path, default=None, help=( "output file (default: stdout). " "Without this option, output is space-separated for direct passing to `pip install`. " "With this option, output written to a file newline-separated file usable as `requirements.txt` or `constraints.txt`." 
), ) return parser @cached_property def pyproject(self) -> dict[str, Any]: return tomllib.loads(self._path.read_text()) @cached_property def extras(self) -> AbstractSet[str]: if self._extras: if self._all_extras: sys.exit("Cannot specify both --extras and --all-extras") return dict.fromkeys(self._extras).keys() if not self._all_extras: return set() return self.pyproject["project"]["optional-dependencies"].keys() def main(argv: Sequence[str] | None = None) -> None: args = Args.parse(argv) project_name = args.pyproject["project"]["name"] deps = [ *map(Requirement, args.pyproject["project"]["dependencies"]), *(Requirement(f"{project_name}[{extra}]") for extra in args.extras), ] min_deps = extract_min_deps(deps, pyproject=args.pyproject) sep = "\n" if args.output else " " with ExitStack() as stack: f = stack.enter_context(args.output.open("w")) if args.output else sys.stdout print(sep.join(map(str, min_deps)), file=f) if __name__ == "__main__": main() python-anndata-0.12.0~rc1/ci/scripts/towncrier_automation.py000077500000000000000000000101431500370632200242330ustar00rootroot00000000000000#!/usr/bin/env python3 # /// script # dependencies = [ "towncrier", "packaging" ] # /// from __future__ import annotations import argparse import re import subprocess from functools import cache from typing import TYPE_CHECKING from packaging.version import Version if TYPE_CHECKING: from collections.abc import Sequence class BumpVersion(Version): def __init__(self, version: str) -> None: super().__init__(version) if len(self.release) != 3: msg = f"{version} must contain major, minor, and patch version." raise argparse.ArgumentTypeError(msg) base_branch = get_base_branch() patch_branch_pattern = re.compile(r"\d+\.\d+\.x") if self.micro != 0 and not patch_branch_pattern.fullmatch(base_branch): msg = ( f"{version} is a patch release, but " f"you are trying to release from a non-patch release branch: {base_branch}." ) raise argparse.ArgumentTypeError(msg) if self.micro == 0 and base_branch != "main": msg = ( f"{version} is a minor or major release, " f"but you are trying to release not from main: {base_branch}." ) raise argparse.ArgumentTypeError(msg) class Args(argparse.Namespace): version: BumpVersion dry_run: bool def parse_args(argv: Sequence[str] | None = None) -> Args: parser = argparse.ArgumentParser( prog="towncrier-automation", description=( "This script runs towncrier for a given version, " "creates a branch off of the current one, " "and then creates a PR into the original branch with the changes. " "The PR will be backported to main if the current branch is not main." ), ) parser.add_argument( "version", type=BumpVersion, help=( "The new version for the release must have at least three parts, like `major.minor.patch` and no `major.minor`. " "It can have a suffix like `major.minor.patch.dev0` or `major.minor.0rc1`." 
), ) parser.add_argument( "--dry-run", help="Whether or not to dry-run the actual creation of the pull request", action="store_true", ) args = parser.parse_args(argv, Args()) return args def main(argv: Sequence[str] | None = None) -> None: args = parse_args(argv) # Run towncrier subprocess.run( ["towncrier", "build", f"--version={args.version}", "--yes"], check=True ) # Check if we are on the main branch to know if we need to backport base_branch = get_base_branch() pr_description = "" if base_branch == "main" else "@meeseeksdev backport to main" branch_name = f"release_notes_{args.version}" # Create a new branch + commit subprocess.run(["git", "switch", "-c", branch_name], check=True) subprocess.run(["git", "add", "docs/release-notes"], check=True) pr_title = f"(chore): generate {args.version} release notes" subprocess.run(["git", "commit", "-m", pr_title], check=True) # push if not args.dry_run: subprocess.run( ["git", "push", "--set-upstream", "origin", branch_name], check=True ) else: print("Dry run, not pushing") # Create a PR subprocess.run( [ "gh", "pr", "create", f"--base={base_branch}", f"--title={pr_title}", f"--body={pr_description}", "--label=skip-gpu-ci", *(["--label=no milestone"] if base_branch == "main" else []), *(["--dry-run"] if args.dry_run else []), ], check=True, ) # Enable auto-merge if not args.dry_run: subprocess.run( ["gh", "pr", "merge", branch_name, "--auto", "--squash"], check=True ) else: print("Dry run, not merging") @cache def get_base_branch(): return subprocess.run( ["git", "rev-parse", "--abbrev-ref", "HEAD"], capture_output=True, text=True, check=True, ).stdout.strip() if __name__ == "__main__": main() python-anndata-0.12.0~rc1/docs/000077500000000000000000000000001500370632200162515ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/Makefile000066400000000000000000000012641500370632200177140ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python3 -msphinx SPHINXPROJ = Scanpy SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile clean: rm -r "$(BUILDDIR)" rm -r "generated" find . -name anndata.*.rst -delete # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) python-anndata-0.12.0~rc1/docs/_key_contributors.rst000066400000000000000000000006561500370632200225560ustar00rootroot00000000000000.. sidebar:: Key Contributors * `Isaac Virshup`_: anndata >= 0.7, diverse contributions * Sergei Rybakov: diverse contributions * `Alex Wolf`_: initial conception/development * Philipp Angerer: initial conception/development, software quality .. _contributions graph: https://github.com/scverse/anndata/graphs/contributors .. _Isaac Virshup: https://twitter.com/ivirshup .. 
_Alex Wolf: https://twitter.com/falexwolf python-anndata-0.12.0~rc1/docs/_static/000077500000000000000000000000001500370632200176775ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/_static/img/000077500000000000000000000000001500370632200204535ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/_static/img/anndata_schema.svg000066400000000000000000002121141500370632200241230ustar00rootroot00000000000000 python-anndata-0.12.0~rc1/docs/_templates/000077500000000000000000000000001500370632200204065ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/_templates/autosummary/000077500000000000000000000000001500370632200227745ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/_templates/autosummary/class.rst000066400000000000000000000012131500370632200246300ustar00rootroot00000000000000{{ fullname | escape | underline}} .. currentmodule:: {{ module }} .. add toctree option to make autodoc generate the pages .. autoclass:: {{ objname }} {% block attributes %} {% if attributes %} .. rubric:: Attributes .. autosummary:: :toctree: . {% for item in attributes %} ~{{ name }}.{{ item }} {%- endfor %} {% endif %} {% endblock %} {% block methods %} {% if methods %} .. rubric:: Methods .. autosummary:: :toctree: . {% for item in methods %} {%- if item != '__init__' %} ~{{ name }}.{{ item }} {%- endif -%} {%- endfor %} {% endif %} {% endblock %} python-anndata-0.12.0~rc1/docs/api.md000066400000000000000000000076451500370632200173600ustar00rootroot00000000000000# API ```{eval-rst} .. module:: anndata ``` The central class: ```{eval-rst} .. autosummary:: :toctree: generated/ AnnData ``` (combining-api)= ## Combining Combining {class}`AnnData` objects. See also the section on concatenation. ```{eval-rst} .. autosummary:: :toctree: generated/ concat ``` (reading-api)= ## Reading Reading anndata’s native formats `.h5ad` and `zarr`. ```{eval-rst} .. autosummary:: :toctree: generated/ io.read_h5ad io.read_zarr ``` Reading individual portions ({attr}`~AnnData.obs`, {attr}`~AnnData.varm` etc.) of the {class}`AnnData` object. ```{eval-rst} .. autosummary:: :toctree: generated/ io.read_elem io.sparse_dataset ``` Reading file formats that cannot represent all aspects of {class}`AnnData` objects. ```{tip} You might have more success by assembling the {class}`AnnData` object yourself from the individual parts. ``` ```{eval-rst} .. autosummary:: :toctree: generated/ io.read_csv io.read_excel io.read_hdf io.read_loom io.read_mtx io.read_text io.read_umi_tools ``` (writing-api)= ## Writing Writing a complete {class}`AnnData` object to disk in anndata’s native formats `.h5ad` and `zarr`. (These functions are also exported as {func}`io.write_h5ad` and {func}`io.write_zarr`.) ```{eval-rst} .. autosummary:: :toctree: generated/ AnnData.write_h5ad AnnData.write_zarr .. .. autosummary:: :toctree: generated/ io.write_h5ad io.write_zarr .. toctree:: :hidden: generated/anndata.io.write_h5ad generated/anndata.io.write_zarr ``` Writing individual portions ({attr}`~AnnData.obs`, {attr}`~AnnData.varm` etc.) of the {class}`AnnData` object. ```{eval-rst} .. autosummary:: :toctree: generated/ io.write_elem ``` Writing formats that cannot represent all aspects of {class}`AnnData` objects. ```{eval-rst} .. autosummary:: :toctree: generated/ AnnData.write_csvs AnnData.write_loom ``` (experimental-api)= ## Experimental API ```{warning} APIs in the experimental module are currently in development and subject to change at any time. 
``` Two classes for working with batched access to collections of many {class}`AnnData` objects or `.h5ad` files. In particular, for pytorch-based models. ```{eval-rst} .. autosummary:: :toctree: generated/ experimental.AnnCollection experimental.AnnLoader ``` Out of core concatenation ```{eval-rst} .. autosummary:: :toctree: generated/ experimental.concat_on_disk ``` Low level methods for reading and writing elements of an {class}`AnnData` object to a store: ```{eval-rst} .. autosummary:: :toctree: generated/ experimental.read_elem_lazy experimental.read_lazy ``` Utilities for customizing the IO process: ```{eval-rst} .. autosummary:: :toctree: generated/ experimental.read_dispatched experimental.write_dispatched ``` Types used by the former: ```{eval-rst} .. autosummary:: :toctree: generated/ experimental.IOSpec experimental.Read experimental.Write experimental.ReadCallback experimental.WriteCallback experimental.StorageType experimental.backed._lazy_arrays.MaskedArray experimental.backed._lazy_arrays.CategoricalArray experimental.backed._xarray.Dataset2D ``` (extensions-api)= ## Extensions ```{eval-rst} .. autosummary:: :toctree: generated/ register_anndata_namespace ``` Types used by the former: ```{eval-rst} .. autosummary:: :toctree: generated/ types.ExtensionNamespace ``` (errors-api)= ## Errors and warnings ```{eval-rst} .. autosummary:: :toctree: generated/ ImplicitModificationWarning ``` (settings-api)= ## Settings ```{eval-rst} .. autosummary:: :toctree: generated/ settings settings.override ``` (types-api)= ## Custom Types/Classes for Readable/Writeable Elements ```{eval-rst} .. autosummary:: :toctree: generated/ abc.CSRDataset abc.CSCDataset typing.Index typing.AxisStorable typing.RWAble ``` python-anndata-0.12.0~rc1/docs/benchmark-read-write.ipynb000066400000000000000000000070571500370632200233200ustar00rootroot00000000000000{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Simple benchmarks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here, we perform simple benchmarks to demonstrate basic performance." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from __future__ import annotations\n", "\n", "import scanpy as sc\n", "\n", "import anndata as ad" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "adata = sc.datasets.pbmc3k()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 2700 × 32738\n", " var: 'gene_ids'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading & writing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us start by writing & reading anndata's native HDF5 file format: `.h5ad`:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 93.9 ms, sys: 17.4 ms, total: 111 ms\n", "Wall time: 118 ms\n" ] } ], "source": [ "%%time\n", "adata.write(\"test.h5ad\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 51.2 ms, sys: 13.3 ms, total: 64.5 ms\n", "Wall time: 64.1 ms\n" ] } ], "source": [ "%%time\n", "adata = ad.read_h5ad(\"test.h5ad\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that reading and writing is much faster than for loom files. The efficiency gain here is due to explicit storage of the sparse matrix structure." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.82 s, sys: 457 ms, total: 3.27 s\n", "Wall time: 3.31 s\n" ] } ], "source": [ "%%time\n", "adata.write_loom(\"test.loom\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.05 s, sys: 221 ms, total: 1.28 s\n", "Wall time: 1.28 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/alexwolf/repos/anndata/anndata/_core/anndata.py:120: ImplicitModificationWarning: Transforming to str index.\n", " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" ] } ], "source": [ "%%time\n", "adata = ad.io.read_loom(\"test.loom\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 } python-anndata-0.12.0~rc1/docs/benchmarks.md000066400000000000000000000003751500370632200207150ustar00rootroot00000000000000# Benchmarks Computational operations in anndata are consistently benchmarked [here](https://github.com/ivirshup/anndata-benchmarks). Below follows a simple benchmark showing read-write efficiency. ```{toctree} :maxdepth: 1 benchmark-read-write ``` python-anndata-0.12.0~rc1/docs/concatenation.rst000066400000000000000000000311031500370632200216260ustar00rootroot00000000000000Concatenation ============= With :func:`~anndata.concat`, :class:`~anndata.AnnData` objects can be combined via a composition of two operations: concatenation and merging. * Concatenation is when we keep all sub elements of each object, and stack these elements in an ordered way. 
* Merging is combining a set of collections into one resulting collection which contains elements from the objects. .. note:: This function borrows from similar functions in pandas_ and xarray_. Argument which are used to control concatenation are modeled after :func:`pandas.concat` while strategies for merging are inspired by :func:`xarray.merge`'s `compat` argument. .. _pandas: https://pandas.pydata.org .. _xarray: http://xarray.pydata.org Concatenation ------------- Let's start off with an example: >>> import scanpy as sc, anndata as ad, numpy as np, pandas as pd >>> from scipy import sparse >>> from anndata import AnnData >>> pbmc = sc.datasets.pbmc68k_reduced() >>> pbmc AnnData object with n_obs × n_vars = 700 × 765 obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain' var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable' uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups' obsm: 'X_pca', 'X_umap' varm: 'PCs' obsp: 'distances', 'connectivities' If we split this object up by clusters of observations, then stack those subsets we'll obtain the same values – just ordered differently. >>> groups = pbmc.obs.groupby("louvain", observed=True).indices >>> pbmc_concat = ad.concat([pbmc[inds] for inds in groups.values()], merge="same") >>> assert np.array_equal(pbmc.X, pbmc_concat[pbmc.obs_names].X) >>> pbmc_concat AnnData object with n_obs × n_vars = 700 × 765 obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain' var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable' obsm: 'X_pca', 'X_umap' varm: 'PCs' Note that we concatenated along the observations by default, and that most elements aligned to the observations were concatenated as well. A notable exception is :attr:`~anndata.AnnData.obsp`, which can be re-enabled with the `pairwise` keyword argument. This is because it's not obvious that combining graphs or distance matrices padded with 0s is particularly useful, and may be unintuitive. Inner and outer joins ~~~~~~~~~~~~~~~~~~~~~ When the variables present in the objects to be concatenated aren't exactly the same, you can choose to take either the intersection or union of these variables. This is otherwise called taking the `"inner"` (intersection) or `"outer"` (union) join. For example, given two anndata objects with differing variables: >>> a = AnnData(sparse.eye(3, format="csr"), var=pd.DataFrame(index=list("abc"))) >>> b = AnnData(sparse.eye(2, format="csr"), var=pd.DataFrame(index=list("ba"))) >>> ad.concat([a, b], join="inner").X.toarray() array([[1., 0.], [0., 1.], [0., 0.], [0., 1.], [1., 0.]]) >>> ad.concat([a, b], join="outer").X.toarray() array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]) The join argument is used for any element which has both (1) an axis being concatenated and (2) an axis not being concatenated. When concatenating along the `obs` dimension, this means elements of `.X`, `obs`, `.layers`, and `.obsm` will be affected by the choice of `join`. To demonstrate this, let's say we're trying to combine a droplet based experiment with a spatial one. When building a joint anndata object, we would still like to store the coordinates for the spatial samples. >>> coords = np.hstack([np.repeat(np.arange(10), 10), np.tile(np.arange(10), 10)]).T >>> spatial = AnnData( ... sparse.random(5000, 10000, format="csr"), ... obsm={"coords": np.random.randn(5000, 2)} ... 
) >>> droplet = AnnData(sparse.random(5000, 10000, format="csr")) >>> combined = ad.concat([spatial, droplet], join="outer") >>> sc.pl.embedding(combined, "coords") # doctest: +SKIP .. TODO: Get the above plot to show up Annotating data source (`label`, `keys`, and `index_unique`) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Often, you'd like to be able to tell which values came from which object. This can be accomplished with the `label`, `keys`, and `index_unique` keyword arguments. For an example, we'll show how you can keep track of the original dataset by passing a `Mapping` of dataset names to `AnnData` objects to `concat`: >>> adatas = { ... "a": ad.AnnData( ... sparse.random(3, 50, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"a-{i}" for i in range(3)]) ... ), ... "b": ad.AnnData( ... sparse.random(5, 50, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"b-{i}" for i in range(5)]) ... ), ... } >>> ad.concat(adatas, label="dataset").obs dataset a-0 a a-1 a a-2 a b-0 b b-1 b b-2 b b-3 b b-4 b Here, a categorical column (with the name specified by `label`) was added to the result. As an alternative to passing a `Mapping`, you can also specify dataset names with the `keys` argument. In some cases, your objects may share names along the axes being concatenated. These values can be made unique by appending the relevant key using the `index_unique` argument: .. TODO: skipping example since doctest does not capture stderr, but it's relevant to show the unique message >>> adatas = { ... "a": ad.AnnData( ... sparse.random(3, 10, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"cell-{i}" for i in range(3)]) ... ), ... "b": ad.AnnData( ... sparse.random(5, 10, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"cell-{i}" for i in range(5)]) ... ), ... } >>> ad.concat(adatas).obs # doctest: +SKIP Observation names are not unique. To make them unique, call `.obs_names_make_unique`. Empty DataFrame Columns: [] Index: [cell-0, cell-1, cell-2, cell-0, cell-1, cell-2, cell-3, cell-4] >>> ad.concat(adatas, index_unique="_").obs Empty DataFrame Columns: [] Index: [cell-0_a, cell-1_a, cell-2_a, cell-0_b, cell-1_b, cell-2_b, cell-3_b, cell-4_b] Merging ------- Combining elements not aligned to the axis of concatenation is controlled through the `merge` arguments. We provide a few strategies for merging elements aligned to the alternative axes: * `None`: No elements aligned to alternative axes are present in the result object. * `"same"`: Elements that are the same in each of the objects. * `"unique"`: Elements for which there is only one possible value. * `"first"`: The first element seen in each from each position. * `"only"`: Elements that show up in only one of the objects. We'll show how this works with elements aligned to the alternative axis, and then how merging works with `.uns`. First, our example case: >>> import scanpy as sc >>> blobs = sc.datasets.blobs(n_variables=30, n_centers=5) >>> sc.pp.pca(blobs) >>> blobs AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' uns: 'pca' obsm: 'X_pca' varm: 'PCs' Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies. >>> adatas = [] >>> for group, idx in blobs.obs.groupby("blobs").indices.items(): ... sub_adata = blobs[idx].copy() ... sub_adata.obsm["qc"], sub_adata.varm[f"{group}_qc"] = sc.pp.calculate_qc_metrics( ... sub_adata, percent_top=(), inplace=False, log1p=False ... ) ... 
adatas.append(sub_adata) >>> adatas[0] AnnData object with n_obs × n_vars = 128 × 30 obs: 'blobs' uns: 'pca' obsm: 'X_pca', 'qc' varm: 'PCs', '0_qc' `adatas` is now a list of datasets with disjoint sets of observations and a common set of variables. Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset. Taking a look at how this affects concatenation: >>> ad.concat(adatas) AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' >>> ad.concat(adatas, merge="same") AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' varm: 'PCs' >>> ad.concat(adatas, merge="unique") AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' varm: 'PCs', '0_qc', '1_qc', '2_qc', '3_qc', '4_qc' Note that comparisons are made after indices are aligned. That is, if the objects only share a subset of indices on the alternative axis, it's only required that values for those indices match when using a strategy like `"same"`. >>> a = AnnData( ... sparse.eye(3, format="csr"), ... var=pd.DataFrame({"nums": [1, 2, 3]}, index=list("abc")) ... ) >>> b = AnnData( ... sparse.eye(2, format="csr"), ... var=pd.DataFrame({"nums": [2, 1]}, index=list("ba")) ... ) >>> ad.concat([a, b], merge="same").var nums a 1 b 2 Merging `.uns` ~~~~~~~~~~~~~~ We use the same set of strategies for merging `uns` as we do for entries aligned to an axis, but these strategies are applied recursively. This is a little abstract, so we'll look at some examples of this. Here's our setup: >>> from anndata import AnnData >>> import numpy as np >>> a = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4}}) >>> b = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 3, "c": {"c.b": 4}}) >>> c = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 4, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}) For quick reference, these are the results from each of the merge strategies. These are discussed in more depth below: =========== ======================================================= `uns_merge` Result =========== ======================================================= `None` `{}` `"same"` `{"a": 1, "c": {"c.b": 4}}` `"unique"` `{"a": 1, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}` `"only"` `{"c": {"c.c": 5}}` `"first"` `{"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}` =========== ======================================================= The default returns a fairly obvious result: >>> ad.concat([a, b, c]).uns == {} True But let's take a look at the others in a bit more depth. Here, we'll wrap the output data in a `dict` for simplicity of display. >>> dict(ad.concat([a, b, c], uns_merge="same").uns) {'a': 1, 'c': {'c.b': 4}} Here only the values for `uns["a"]` and `uns["c"]["c.b"]` were exactly the same, so only they were kept. `uns["b"]` takes a different value in each object, and neither `uns["c"]["c.a"]` nor `uns["c"]["c.c"]` appears in every `uns`. A key feature to note is that comparisons are aware of the nested structure of `uns` and will be applied at any depth. This is why `uns["c"]["c.b"]` was kept. Merging `uns` in this way can be useful when there is some shared data between the objects being concatenated. For example, if each was put through the same pipeline with the same parameters, those parameters used would still be present in the resulting object.
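As a concrete sketch of that pipeline-parameter use case (the `uns` contents below are invented purely for illustration), the entries shared by every object survive the merge while the run-specific ones are dropped:

>>> p1 = AnnData(np.zeros((10, 10)), uns={"pipeline": {"version": "1.0", "run": 1}})
>>> p2 = AnnData(np.zeros((10, 10)), uns={"pipeline": {"version": "1.0", "run": 2}})
>>> dict(ad.concat([p1, p2], uns_merge="same").uns)
{'pipeline': {'version': '1.0'}}
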
Now let's look at the behaviour of `unique`: >>> dict(ad.concat([a, b, c], uns_merge="unique").uns) {'a': 1, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} The results here are a super-set of those from `"same"`. Note that there was only one possible value at each position in the resulting mapping. That is, there were not alternative values present for `uns["c"]["c.c"]` even though it appeared only once. This can be useful when the object's were both run through the same pipeline but contain specific metadata per object. An example of this would be a spatial dataset, where the images are stored in `uns`. >>> dict(ad.concat([a, b, c], uns_merge="only").uns) {'c': {'c.c': 5}} `uns["c"]["c.c"]` is the only value that is kept, since it is the only one which was specified in only one `uns`. >>> dict(ad.concat([a, b, c], uns_merge="first").uns) {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} In this case, the result has the union of the keys from all the starting dictionaries. The value is taken from the first object to have a value at this key. python-anndata-0.12.0~rc1/docs/conf.py000066400000000000000000000156341500370632200175610ustar00rootroot00000000000000from __future__ import annotations import sys from datetime import datetime from functools import partial from importlib import metadata from pathlib import Path, PurePosixPath from typing import TYPE_CHECKING from docutils import nodes if TYPE_CHECKING: from sphinx.application import Sphinx HERE = Path(__file__).parent _extension_dir = HERE / "extensions" sys.path[:0] = [str(_extension_dir)] # -- General configuration ------------------------------------------------ # General information project = "anndata" author = f"{project} developers" copyright = f"{datetime.now():%Y}, scverse" release = version = metadata.version("anndata") # default settings templates_path = ["_templates"] html_static_path = ["_static"] source_suffix = {".rst": "restructuredtext", ".md": "myst-nb"} master_doc = "index" default_role = "literal" exclude_patterns = [ "_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints", "tutorials/notebooks/*.rst", # exclude all 0.x.y.md files, but not index.md "release-notes/[!i]*.md", "news.md", # is `include`d into index.md ] pygments_style = "sphinx" extensions = [ "myst_nb", "sphinx_copybutton", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest", "sphinx.ext.coverage", "sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.autosummary", "sphinx_autodoc_typehints", # needs to be after napoleon "sphinx_issues", "sphinx_design", "sphinx_search.extension", "sphinxext.opengraph", "scanpydoc", # needs to be before linkcode "sphinx.ext.linkcode", "IPython.sphinxext.ipython_console_highlighting", "sphinx_toolbox.more_autodoc.autoprotocol", *(p.stem for p in _extension_dir.glob("*.py")), ] myst_enable_extensions = [ "html_image", # So README.md can be used on github and sphinx docs "colon_fence", "dollarmath", ] myst_heading_anchors = 3 nb_execution_mode = "off" # Generate the API documentation when building autosummary_generate = True autodoc_member_order = "bysource" autodoc_mock_imports = ["torch"] # autodoc_default_flags = ['members'] issues_github_path = "scverse/anndata" rtd_links_prefix = PurePosixPath("src") napoleon_google_docstring = False napoleon_numpy_docstring = True napoleon_include_init_with_doc = False napoleon_use_rtype = True # having a separate entry generally helps readability napoleon_use_param = True napoleon_custom_sections = [("Params", "Parameters")] typehints_defaults = "braces" 
todo_include_todos = False nitpicky = True # Report broken links nitpick_ignore = [ # APIs without an intersphinx entry # This API isn’t actually documented ("py:class", "anndata._core.raw.Raw"), # TODO: remove zappy support; the zappy repo is archived ("py:class", "anndata.compat.ZappyArray"), ] def setup(app: Sphinx): app.add_generic_role("small", partial(nodes.inline, classes=["small"])) app.add_generic_role("smaller", partial(nodes.inline, classes=["smaller"])) intersphinx_mapping = dict( awkward=("https://awkward-array.org/doc/stable", None), cupy=("https://docs.cupy.dev/en/stable", None), dask=("https://docs.dask.org/en/stable", None), h5py=("https://docs.h5py.org/en/latest", None), hdf5plugin=("https://hdf5plugin.readthedocs.io/en/latest", None), loompy=("https://linnarssonlab.org/loompy", None), numpy=("https://numpy.org/doc/stable", None), pandas=("https://pandas.pydata.org/pandas-docs/stable", None), python=("https://docs.python.org/3", None), scipy=("https://docs.scipy.org/doc/scipy", None), sklearn=("https://scikit-learn.org/stable", None), # TODO: move back to stable once `ObjectStore` is released zarr=("https://zarr.readthedocs.io/en/latest/", None), xarray=("https://docs.xarray.dev/en/stable", None), obstore=("https://developmentseed.org/obstore/latest/", None), kvikio=("https://docs.rapids.ai/api/kvikio/stable/", None), zarrs=("https://zarrs-python.readthedocs.io/en/stable/", None), ) qualname_overrides = { "h5py._hl.group.Group": "h5py.Group", "h5py._hl.files.File": "h5py.File", "h5py._hl.dataset.Dataset": "h5py.Dataset", "anndata._core.anndata.AnnData": "anndata.AnnData", **{ f"anndata._core.aligned_mapping.{cls}{kind}": "collections.abc.Mapping" for cls in "Layers AxisArrays PairwiseArrays".split() for kind in ["", "View"] }, "anndata._types.ReadCallback": "anndata.experimental.ReadCallback", "anndata._types.WriteCallback": "anndata.experimental.WriteCallback", "anndata._types.Read": "anndata.experimental.Read", "anndata._types.Write": "anndata.experimental.Write", "zarr.core.array.Array": "zarr.Array", "zarr.core.group.Group": "zarr.Group", # Buffer is not yet exported, so the buffer class registry is the closest thing "zarr.core.buffer.core.Buffer": "zarr.registry.Registry", "zarr.storage._common.StorePath": "zarr.storage.StorePath", "anndata.compat.DaskArray": "dask.array.Array", "anndata.compat.CupyArray": "cupy.ndarray", "anndata.compat.CupySparseMatrix": "cupyx.scipy.sparse.spmatrix", "awkward.highlevel.Array": "ak.Array", "numpy.int64": ("py:attr", "numpy.int64"), "pandas.DataFrame.iloc": ("py:attr", "pandas.DataFrame.iloc"), "pandas.DataFrame.loc": ("py:attr", "pandas.DataFrame.loc"), # should be fixed soon: https://github.com/tox-dev/sphinx-autodoc-typehints/pull/516 "types.EllipsisType": ("py:data", "types.EllipsisType"), "pathlib._local.Path": "pathlib.Path", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", AxisStorable=":data:`~anndata.typing.AxisStorable`", **{ f"{v}variantRWAble": ":data:`~anndata.typing.RWAble`" for v in ["In", "Co", "Contra"] }, ) # -- Social cards --------------------------------------------------------- ogp_site_url = "https://anndata.readthedocs.io/" ogp_image = "https://anndata.readthedocs.io/en/latest/_static/img/anndata_schema.svg" # -- Options for HTML output ---------------------------------------------- # The theme is sphinx-book-theme, with patches for readthedocs-sphinx-search html_theme = "scanpydoc" html_theme_options = dict( use_repository_button=True, 
repository_url="https://github.com/scverse/anndata", repository_branch="main", navigation_with_keys=False, # https://github.com/pydata/pydata-sphinx-theme/issues/1492 ) html_logo = "_static/img/anndata_schema.svg" issues_github_path = "scverse/anndata" html_show_sphinx = False # -- Options for other output formats ------------------------------------------ htmlhelp_basename = f"{project}doc" doc_title = f"{project} Documentation" latex_documents = [(master_doc, f"{project}.tex", doc_title, author, "manual")] man_pages = [(master_doc, project, doc_title, [author], 1)] texinfo_documents = [ ( master_doc, project, doc_title, author, project, "One line description of project.", "Miscellaneous", ) ] python-anndata-0.12.0~rc1/docs/contributing.md000066400000000000000000000006761500370632200213130ustar00rootroot00000000000000# Contributing AnnData follows the development practices outlined in the [Scanpy contribution guide](https://scanpy.readthedocs.io/en/latest/dev/release.html). ```{eval-rst} .. include:: _key_contributors.rst ``` ## CI ### GPU CI To test GPU specific code we have a paid self-hosted runner to run the gpu specific tests on. This CI runs by default on the main branch, but for PRs requires the `run-gpu-ci` label to prevent unnecessary runs. python-anndata-0.12.0~rc1/docs/extensions/000077500000000000000000000000001500370632200204505ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/extensions/no_skip_abc_members.py000066400000000000000000000012421500370632200250020ustar00rootroot00000000000000"""Sphinx extension to not skip abstract methods.""" from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: from typing import Literal from sphinx.application import Sphinx from sphinx.ext.autodoc import Options def autodoc_skip_member( app: Sphinx, what: Literal["module", "class", "exception", "function", "method", "attribute"], name: str, obj: object, skip: bool, # noqa: FBT001 options: Options, ): if what == "method" and getattr(obj, "__isabstractmethod__", False): return False return None def setup(app: Sphinx): app.connect("autodoc-skip-member", autodoc_skip_member) python-anndata-0.12.0~rc1/docs/extensions/patch_myst_cite.py000066400000000000000000000015071500370632200242040ustar00rootroot00000000000000"""Override MyST’s cite role with one that works.""" from __future__ import annotations from types import MappingProxyType from typing import TYPE_CHECKING from docutils import nodes, utils if TYPE_CHECKING: from collections.abc import Mapping, Sequence from typing import Any from docutils.parsers.rst.states import Inliner from sphinx.application import Sphinx def cite_role( # noqa: PLR0917 name: str, rawsource: str, text: str, lineno: int, inliner: Inliner, options: Mapping[str, Any] = MappingProxyType({}), content: Sequence[str] = (), ) -> tuple[list[nodes.Node], list[nodes.system_message]]: key = utils.unescape(text) node = nodes.citation_reference(f"[{key}]_", key) return [node], [] def setup(app: Sphinx): app.add_role("cite", cite_role, override=True) python-anndata-0.12.0~rc1/docs/fileformat-prose.md000066400000000000000000000554701500370632200220640ustar00rootroot00000000000000# On-disk format ```{note} These docs are written for anndata 0.8+. Files written before this version may differ in some conventions, but will still be read by newer versions of the library. ``` AnnData objects are saved on disk to hierarchical array stores like [HDF5] (via {doc}`H5py `) and {doc}`zarr:index`. 
This allows us to have very similar structures in disk and on memory. As an example we’ll look into a typical `.h5ad`/ `.zarr` object that’s been through an analysis. The structures are largely equivalent, though there are a few minor differences when it comes to type encoding. ## Elements `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> import h5py >>> store = h5py.File("for-ondisk-docs/cart-164k-processed.h5ad", mode="r") >>> list(store.keys()) ['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp'] ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> import zarr >>> store = zarr.open("for-ondisk-docs/cart-164k-processed.zarr", mode="r") >>> list(store.keys()) ['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp'] ``` ```` ````` In general, `AnnData` objects are comprised of various types of elements. Each element is encoded as either an Array (or Dataset in hdf5 terminology) or a collection of elements (e.g. Group) in the store. We record the type of an element using the `encoding-type` and `encoding-version` keys in its attributes. For example, we can see that this file represents an `AnnData` object from its metadata: ```python >>> dict(store.attrs) {'encoding-type': 'anndata', 'encoding-version': '0.1.0'} ``` Using this information, we're able to dispatch onto readers for the different element types that you'd find in an anndata. ### Element Specification * An element can be any object within the storage hierarchy (typically an array or group) with associated metadata * An element MUST have a string-valued field `"encoding-type"` in its metadata * An element MUST have a string-valued field `"encoding-version"` in its metadata that can be evaluated to a version ### AnnData specification (v0.1.0) * An `AnnData` object MUST be a group. * The group's metadata MUST include entries: `"encoding-type": "anndata"`, `"encoding-version": "0.1.0"`. * An `AnnData` group MUST contain entries `"obs"` and `"var"`, which MUST be dataframes (though this may only have an index with no columns). * The group MAY contain an entry `X`, which MUST be either a dense or sparse array and whose shape MUST be (`n_obs`, `n_var`) * The group MAY contain a mapping `layers`. Entries in `layers` MUST be dense or sparse arrays which have shapes (`n_obs`, `n_var`) * The group MAY contain a mapping `obsm`. Entries in `obsm` MUST be sparse arrays, dense arrays, or dataframes. These entries MUST have a first dimension of size `n_obs` * The group MAY contain a mapping `varm`. Entries in `varm` MUST be sparse arrays, dense arrays, or dataframes. These entries MUST have a first dimension of size `n_var` * The group MAY contain a mapping `obsp`. Entries in `obsp` MUST be sparse or dense arrays. The entries first two dimensions MUST be of size `n_obs` * The group MAY contain a mapping `varp`. Entries in `varp` MUST be sparse or dense arrays. The entries first two dimensions MUST be of size `n_var` * The group MAY contain a mapping `uns`. Entries in `uns` MUST be an anndata encoded type. ## Dense arrays Dense numeric arrays have the most simple representation on disk, as they have native equivalents in H5py {doc}`h5py:high/dataset` and Zarr {doc}`Arrays `. 
We can see an example of this with dimensionality reductions stored in the `obsm` group: `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> store["obsm/X_pca"] ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> store["obsm/X_pca"] ``` ```` ````` ```python >>> dict(store["obsm"]["X_pca"].attrs) {'encoding-type': 'array', 'encoding-version': '0.2.0'} ``` ### Dense arrays specification (v0.2.0) * Dense arrays MUST be stored in an Array object * Dense arrays MUST have the entries `'encoding-type': 'array'` and `'encoding-version': '0.2.0'` in their metadata ## Sparse arrays Sparse arrays don’t have a native representations in HDF5 or Zarr, so we've defined our own based on their in-memory structure. Currently two sparse data formats are supported by `AnnData` objects, CSC and CSR (corresponding to {class}`scipy.sparse.csc_matrix` and {class}`scipy.sparse.csr_matrix` respectively). These formats represent a two-dimensional sparse array with three one-dimensional arrays, `indptr`, `indices`, and `data`. ```{note} A full description of these formats is out of scope for this document, but are [easy to find]. ``` We represent a sparse array as a `Group` on-disk, where the kind and shape of the sparse array is defined in the `Group`'s attributes: ```python >>> dict(store["X"].attrs) {'encoding-type': 'csr_matrix', 'encoding-version': '0.1.0', 'shape': [164114, 40145]} ``` The group contains three arrays: `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> store["X"].visititems(print) data indices indptr ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> store["X"].visititems(print) data indices indptr ``` ```` ````` ### Sparse array specification (v0.1.0) * Each sparse array MUST be its own group * The group MUST contain arrays `indices`, `indptr`, and `data` * The group's metadata MUST contain: * `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively. * `"encoding-version"`, which is set to `"0.1.0"` * `"shape"` which is an integer array of length 2 whose values are the sizes of the array's dimensions ## DataFrames DataFrames are saved as a columnar format in a group, so each column of a DataFrame is saved as a separate array. We save a little more information in the attributes here. ```python >>> dict(store["var"].attrs) {'_index': 'ensembl_id', 'column-order': ['highly_variable', 'means', 'variances', 'variances_norm', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'mito'], 'encoding-type': 'dataframe', 'encoding-version': '0.2.0'} ``` These attributes identify the index of the dataframe, as well as the original order of the columns. Each column in this dataframe is encoded as its own array. `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> store["var"].visititems(print) ensembl_id feature_biotype feature_biotype/categories feature_biotype/codes feature_is_filtered ... ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> store["var"].visititems(print) ensembl_id feature_biotype feature_biotype/categories feature_biotype/codes feature_is_filtered ... 
``` ```` ````` ```python >>> dict(store["var"]["feature_name"].attrs) {'encoding-type': 'categorical', 'encoding-version': '0.2.0', 'ordered': False} >>> dict(store["var"]["feature_is_filtered"].attrs) {'encoding-type': 'array', 'encoding-version': '0.2.0'} ``` ### Dataframe Specification (v0.2.0) * A dataframe MUST be stored as a group * The group's metadata: * MUST contain the field `"_index"`, whose value is the key of the array to be used as an index/ row labels * MUST contain encoding metadata `"encoding-type": "dataframe"`, `"encoding-version": "0.2.0"` * MUST contain `"column-order"` an array of strings denoting the order of column entries * The group MUST contain an array for the index * Each entry in the group MUST correspond to an array with equivalent first dimensions * Each entry SHOULD share chunk sizes (in the HDF5 or zarr container) ## Mappings Mappings are simply stored as `Group`s on disk. These are distinct from DataFrames and sparse arrays since they don’t have any special attributes. A `Group` is created for any `Mapping` in the AnnData object, including the standard `obsm`, `varm`, `layers`, and `uns`. Notably, this definition is used recursively within `uns`: `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> store["uns"].visititems(print) [...] pca pca/variance pca/variance_ratio [...] ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> store["uns"].visititems(print) [...] pca pca/variance pca/variance_ratio [...] ``` ```` ````` ### Mapping specifications (v0.1.0) * Each mapping MUST be its own group * The group's metadata MUST contain the encoding metadata `"encoding-type": "dict"`, `"encoding-version": "0.1.0"` ## Scalars Zero dimensional arrays are used for scalar values (i.e. single values like strings, numbers or booleans). These should only occur inside of `uns`, and are commonly saved parameters: `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> store["uns/neighbors/params"].visititems(print) method metric n_neighbors random_state ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> store["uns/neighbors/params"].visititems(print) method metric n_neighbors random_state ``` ```` ````` ```python >>> store["uns/neighbors/params/metric"][()] 'euclidean' >>> dict(store["uns/neighbors/params/metric"].attrs) {'encoding-type': 'string', 'encoding-version': '0.2.0'} ``` ### Scalar specification (v0.2.0) * Scalars MUST be written as a 0 dimensional array * Numeric scalars * MUST have `"encoding-type": "numeric-scalar"`, `"encoding-version": "0.2.0"` in their metadata * MUST be a single numeric value, including boolean, unsigned integer, signed integer, floating point, or complex floating point * String scalars * MUST have `"encoding-type": "string"`, `"encoding-version": "0.2.0"` in their metadata * In zarr, scalar strings MUST be stored as a fixed length unicode dtype * In HDF5, scalar strings MUST be stored as a variable length utf-8 encoded string dtype ## Categorical arrays ```python >>> categorical = store["obs"]["development_stage"] >>> dict(categorical.attrs) {'encoding-type': 'categorical', 'encoding-version': '0.2.0', 'ordered': False} ``` Discrete values can be efficiently represented with categorical arrays (similar to `factors` in `R`). These arrays encode the values as small width integers (`codes`), which map to the original label set (`categories`). Each entry in the `codes` array is the zero-based index of the encoded value in the `categories` array. To represent a missing value, a code of `-1` is used. 
We store these two arrays separately. `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> categorical.visititems(print) categories codes ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> categorical.visititems(print) categories codes ``` ```` ````` ### Categorical array specification (v0.2.0) * Categorical arrays MUST be stored as a group * The group's metadata MUST contain the encoding metadata `"encoding-type": "categorical"`, `"encoding-version": "0.2.0"` * The group's metadata MUST contain the boolean valued field `"ordered"`, which indicates whether the categories are ordered * The group MUST contain an integer valued array named `"codes"` whose maximum value is the number of categories - 1 * The `"codes"` array MAY contain signed integer values. If so, the code `-1` denotes a missing value * The group MUST contain an array called `"categories"` ## String arrays Arrays of strings are handled differently than numeric arrays since numpy doesn't really have a good way of representing arrays of unicode strings. `anndata` assumes strings are text-like data, so it uses a variable length encoding. `````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> store["var"][store["var"].attrs["_index"]] ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> store["var"][store["var"].attrs["_index"]] ``` ```` ````` ```python >>> dict(categorical["categories"].attrs) {'encoding-type': 'string-array', 'encoding-version': '0.2.0'} ``` ### String array specifications (v0.2.0) * String arrays MUST be stored in arrays * The arrays's metadata MUST contain the encoding metadata `"encoding-type": "string-array"`, `"encoding-version": "0.2.0"` * In `zarr`, string arrays MUST be stored using `numcodecs`' `VLenUTF8` codec * In `HDF5`, string arrays MUST be stored using the variable length string data type, with a utf-8 encoding ## Nullable integers and booleans We support IO with Pandas nullable integer and boolean arrays. We represent these on disk similar to `numpy` masked arrays, `julia` nullable arrays, or `arrow` validity bitmaps (see {issue}`504` for more discussion). That is, we store an indicator array (or mask) of null values alongside the array of all values. 
`````{tab-set} ````{tab-item} HDF5 :sync: hdf5 ```python >>> from anndata import write_elem >>> null_store = h5py.File("tmp.h5", mode="w") >>> int_array = pd.array([1, None, 3, 4]) >>> int_array [1, <NA>, 3, 4] Length: 4, dtype: Int64 >>> write_elem(null_store, "nullable_integer", int_array) >>> null_store.visititems(print) nullable_integer nullable_integer/mask nullable_integer/values ``` ```` ````{tab-item} Zarr :sync: zarr ```python >>> from anndata import write_elem >>> null_store = zarr.open() >>> int_array = pd.array([1, None, 3, 4]) >>> int_array [1, <NA>, 3, 4] Length: 4, dtype: Int64 >>> write_elem(null_store, "nullable_integer", int_array) >>> null_store.visititems(print) nullable_integer nullable_integer/mask nullable_integer/values ``` ```` ````` ```python >>> dict(null_store["nullable_integer"].attrs) {'encoding-type': 'nullable-integer', 'encoding-version': '0.1.0'} ``` ### Nullable integer specifications (v0.1.0) * Nullable integers MUST be stored as a group * The group's attributes MUST contain the encoding metadata `"encoding-type": "nullable-integer"`, `"encoding-version": "0.1.0"` * The group MUST contain an integer valued array under the key `"values"` * The group MUST contain a boolean valued array under the key `"mask"` ### Nullable boolean specifications (v0.1.0) * Nullable booleans MUST be stored as a group * The group's attributes MUST contain the encoding metadata `"encoding-type": "nullable-boolean"`, `"encoding-version": "0.1.0"` * The group MUST contain a boolean valued array under the key `"values"` * The group MUST contain a boolean valued array under the key `"mask"` * The `"values"` and `"mask"` arrays MUST be the same shape ## AwkwardArrays ```{warning} **Experimental** Support for ragged arrays via awkward array is considered experimental under the 0.9.0 release series. Please direct feedback on its implementation to [https://github.com/scverse/anndata](https://github.com/scverse/anndata). ``` Ragged arrays are supported in `anndata` through the [Awkward Array](https://awkward-array.org/) library. For storage on disk, we break down the awkward array into its constituent arrays using [`ak.to_buffers`](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html) and then write these arrays using `anndata`’s methods.
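Here is a rough sketch of where those stored arrays come from; the toy array and the resulting buffer names below are purely illustrative, since the exact keys depend on the array's layout and on awkward's default `{form_key}-{attribute}` naming:

```python
>>> import awkward as ak
>>> ragged = ak.Array([[1, 2, 3], [], [4, 5]])     # a small ragged array
>>> form, length, buffers = ak.to_buffers(ragged)  # decompose into flat buffers
>>> length
3
>>> sorted(buffers)                                # one flat array per layout node
['node0-offsets', 'node1-data']
```

Each of these flat buffers is then written with `anndata`’s regular array writers, which is what produces the `node*` arrays shown below.
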
```python >>> dict(store["varm/transcript"].attrs) {'encoding-type': 'awkward-array', 'encoding-version': '0.1.0', 'form': '{"class": "RecordArray", "fields": ["tx_id", "seq_name", ' '"exon_seq_start", "exon_seq_end", "ensembl_id"], "contents": ' '[{"class": "BitMaskedArray", "mask": "u8", "valid_when": true, ' '"lsb_order": true, "content": {"class": "ListOffsetArray", ' '"offsets": "i64", "content": {"class": "NumpyArray", "primitive": ' '"uint8", "inner_shape": [], "parameters": {"__array__": "char"}, ' '"form_key": "node3"}, "parameters": {"__array__": "string"}, ' '"form_key": "node2"}, "parameters": {}, "form_key": "node1"}, ' ... 'length': 40145} ``` These can be read back as awkward arrays using the [`ak.from_buffers`](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_buffers.html) function: ```python >>> import awkward as ak >>> from anndata.io import read_elem >>> awkward_group = store["varm/transcript"] >>> ak.from_buffers( ... awkward_group.attrs["form"], ... awkward_group.attrs["length"], ... {k: read_elem(v) for k, v in awkward_group.items()} ... ) >>> transcript_models[:5] [{tx_id: 'ENST00000450305', seq_name: '1', exon_seq_start: [...], ...}, {tx_id: 'ENST00000488147', seq_name: '1', exon_seq_start: [...], ...}, {tx_id: 'ENST00000473358', seq_name: '1', exon_seq_start: [...], ...}, {tx_id: 'ENST00000477740', seq_name: '1', exon_seq_start: [...], ...}, {tx_id: 'ENST00000495576', seq_name: '1', exon_seq_start: [...], ...}] ----------------------------------------------------------------------- type: 5 * { tx_id: ?string, seq_name: ?string, exon_seq_start: option[var * ?int64], exon_seq_end: option[var * ?int64], ensembl_id: ?string } >>> transcript_models[0] {tx_id: 'ENST00000450305', seq_name: '1', exon_seq_start: [12010, 12179, 12613, 12975, 13221, 13453], exon_seq_end: [12057, 12227, 12697, 13052, 13374, 13670], ensembl_id: 'ENSG00000223972'} ------------------------------------------------------------ type: { tx_id: ?string, seq_name: ?string, exon_seq_start: option[var * ?int64], exon_seq_end: option[var * ?int64], ensembl_id: ?string } ``` [easy to find]: https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format) [hdf5]: https://en.wikipedia.org/wiki/Hierarchical_Data_Format python-anndata-0.12.0~rc1/docs/index.md000066400000000000000000000006761500370632200177130ustar00rootroot00000000000000```{include} ../README.md ``` # Latest additions See {doc}`/release-notes/index`, particularly {ref}`v0.10` for the current release, and [the `.feature` fragments](https://github.com/scverse/anndata/tree/main/docs) for the upcoming release, ```{toctree} :hidden: true :maxdepth: 1 tutorials/index api concatenation fileformat-prose interoperability benchmarks contributing release-notes/index references ``` # News ```{include} news.md ``` python-anndata-0.12.0~rc1/docs/interoperability.md000066400000000000000000000027151500370632200221650ustar00rootroot00000000000000# Interoperability The on-disk representation of anndata files can be read from other languages. Here we list interfaces for working with AnnData from your language of choice: ## R - [zellkonverter](https://bioconductor.org/packages/release/bioc/html/zellkonverter.html) zellkonverter provides basilisk based tooling for loading from `h5ad` files to `SingleCellExperiment` - [anndata](https://anndata.dynverse.org) provides an R implementation of `AnnData` as well as IO for the HDF5 format. 
- [MuData](https://bioconductor.org/packages/release/bioc/html/MuData.html) provides IO for `AnnData` and `MuData` stored in HDF5 to Bioconductor's `SingleCellExperiment` and `MultiAssayExperiment` objects. - [MuDataSeurat](https://pmbio.github.io/MuDataSeurat/) provides IO from `AnnData` and `MuData` stored in HDF5 to `Seurat` objects. ## Julia - [Muon.jl](https://docs.juliahub.com/Muon/QfqCh/0.1.1/objects/) provides Julia implementations of `AnnData` and `MuData` objects, as well as IO for the HDF5 format - [scVI.jl](https://maren-ha.github.io/scVI.jl/index.html) provides a Julia implementation of `AnnData` as well as IO for the HDF5 format. ## Javascript - [Vitessce](https://github.com/vitessce/vitessce) contains loaders from `AnnData`s stored as Zarr, and uses this to provide interactive visualization ## Rust - [anndata-rs](https://github.com/kaizhang/anndata-rs) provides a Rust implementation of `AnnData` as well as advanced IO support for the HDF5 storage format. python-anndata-0.12.0~rc1/docs/news.md000066400000000000000000000011111500370632200175410ustar00rootroot00000000000000# Muon paper published {small}`2022-02-02` Muon has been published in Genome Biology {cite}`Bredikhin22`. Muon is a framework for multimodal data built on top of `AnnData`. Check out [Muon](https://muon.readthedocs.io/en/latest/) and its datastructure [MuData](https://mudata.readthedocs.io/en/latest/). # COVID-19 datasets distributed as `h5ad` {small}`2020-04-01` In a joint initiative, the Wellcome Sanger Institute, the Human Cell Atlas, and the CZI distribute datasets related to COVID-19 via anndata's `h5ad` files: [covid19cellatlas.org](https://www.covid19cellatlas.org/). python-anndata-0.12.0~rc1/docs/references.rst000066400000000000000000000012061500370632200211230ustar00rootroot00000000000000References ---------- .. [Bredikhin22] Bredikhin *et al.* (2022), *MUON: multimodal omics analysis framework*, Genome Biology https://doi.org/10.1186/s13059-021-02577-8. .. [Hastie09] Hastie *et al.* (2009), *The Elements of Statistical Learning*, Springer https://web.stanford.edu/~hastie/ElemStatLearn/. .. [Huber15] Huber *et al.* (2015), *Orchestrating high-throughput genomic analysis with Bioconductor*, Nature Methods https://doi.org/10.1038/nmeth.3252. .. [Murphy12] Murphy (2012, *Machine Learning: A Probabilistic Perspective*, MIT Press https://mitpress.mit.edu/9780262018029/machine-learning/. python-anndata-0.12.0~rc1/docs/release-notes/000077500000000000000000000000001500370632200210175ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/release-notes/0.10.0.md000066400000000000000000000042651500370632200220640ustar00rootroot00000000000000(v0.10.0)= ### 0.10.0 {small}`2023-10-06` #### Features **GPU Support** * Dense and sparse [`CuPy`](https://docs.cupy.dev/) arrays are now supported {pr}`1066` {user}`ivirshup` * Once you have `CuPy` arrays in your anndata, use it with: [`rapids-singlecell`](https://rapids-singlecell.readthedocs.io/en/latest/index.html) from v0.9+ * anndata now has GPU enabled CI. 
Made possibly by a grant from [CZI's EOSS program](https://chanzuckerberg.com/eoss/) and managed via [Cirun](https://Cirun.io) {pr}`1066` {pr}`1084` {user}`Zethson` {user}`ivirshup` **Out of core** * Concatenate on-disk anndata objects with {func}`anndata.experimental.concat_on_disk` {pr}`955` {user}`selmanozleyen` * AnnData can now hold dask arrays with `scipy.sparse.spmatrix` chunks {pr}`1114` {user}`ivirshup` * Public API for interacting with on disk sparse arrays: {func}`~anndata.io.sparse_dataset`, {class}`~anndata.abc.CSRDataset`, and {class}`~anndata.abc.CSCDataset` {pr}`765` {user}`ilan-gold` {user}`ivirshup` * Improved performance for simple slices of OOC sparse arrays {pr}`1131` {user}`ivirshup` **Improved errors and warnings** * Improved error messages when combining dataframes with duplicated column names {pr}`1029` {user}`ivirshup` * Improved warnings when modifying views of `AlingedMappings` {pr}`1016` {user}`flying-sheep` {user}`ivirshup` * `AnnDataReadError`s have been removed. The original error is now thrown with additional information in a note {pr}`1055` {user}`ivirshup` #### Documentation * Added zarr examples to {doc}`file format docs` {pr}`1162` {user}`ivirshup` #### Breaking changes * {meth}`anndata.AnnData.transpose` no longer copies unnecessarily. If you rely on the copying behavior, call `.copy` on the resulting object. {pr}`1114` {user}`ivirshup` #### Other updates * Bump minimum python version to 3.9 {pr}`1117` {user}`flying-sheep` #### Deprecations * Deprecate `anndata.read`, which was just an alias for {func}`anndata.io.read_h5ad` {pr}`1108` {user}`ivirshup`. * `dtype` argument to `AnnData` constructor is now deprecated {pr}`1153` {user}`ivirshup` #### Bug fixes * Fix shape inference on initialization when `X=None` is specified {pr}`1121` {user}`flying-sheep` python-anndata-0.12.0~rc1/docs/release-notes/0.10.1.md000066400000000000000000000002451500370632200220570ustar00rootroot00000000000000(v0.10.1)= ### 0.10.1 {small}`2023-10-08` #### Bug fixes * Fix `ad.concat` erroring when concatenating a categorical and object column {pr}`1171` {user}`ivirshup` python-anndata-0.12.0~rc1/docs/release-notes/0.10.2.md000066400000000000000000000014401500370632200220560ustar00rootroot00000000000000(v0.10.2)= ### 0.10.2 {small}`2023-10-11` #### Bug fixes * Added compatibility layer for packages relying on `anndata._core.sparse_dataset.SparseDataset`. Note that this API is *deprecated* and new code should use `anndata.CSRDataset`, `~anndata.CSCDataset`, and `anndata.sparse_dataset` instead. 
{pr}`1185` {user}`ivirshup` * Handle deprecation warning from `pd.Categorical.map` thrown during `anndata.concat` {pr}`1189` {user}`flying-sheep` {user}`ivirshup` * Fixed extra steps being included in IO tracebacks {pr}`1193` {user}`flying-sheep` * `as_dense` argument of `write_h5ad` no longer writes an array without encoding metadata {pr}`1193` {user}`flying-sheep` #### Performance * Improved performance of `concat_on_disk` with dense arrays in some cases {pr}`1169` {user}`selmanozleyen` python-anndata-0.12.0~rc1/docs/release-notes/0.10.3.md000066400000000000000000000007651500370632200220700ustar00rootroot00000000000000(v0.10.3)= ### 0.10.3 {small}`2023-10-31` #### Bug fixes * Prevent pandas from causing infinite recursion when setting a slice of a categorical column {pr}`1211` {user}`flying-sheep` #### Documentation * Stop showing “Support for Awkward Arrays is currently experimental” warnings when reading, concatenating, slicing, or transposing AnnData objects {pr}`1182` {user}`flying-sheep` #### Other updates * Fail canary CI job when tests raise unexpected warnings. {pr}`1182` {user}`flying-sheep` python-anndata-0.12.0~rc1/docs/release-notes/0.10.4.md000066400000000000000000000014001500370632200220540ustar00rootroot00000000000000(v0.10.4)= ### 0.10.4 {small}`2024-01-04` #### Bug fixes * Only try to use `Categorical.map(na_action=…)` in actually supported Pandas ≥2.1 {pr}`1226` {user}`flying-sheep` * `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko` * `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold` * `adata.X[mask]` works in newer `numpy` versions when `X` is `backed` {pr}`1255` {user}`ilan-gold` * `adata.X[...]` fixed for `X` as a `BaseCompressedSparseDataset` with `zarr` backend {pr}`1265` {user}`ilan-gold` * Improve read/write error reporting {pr}`1273` {user}`flying-sheep` #### Documentation * Improve aligned mapping error messages {pr}`1252` {user}`flying-sheep` python-anndata-0.12.0~rc1/docs/release-notes/0.10.5.md000066400000000000000000000014771500370632200220730ustar00rootroot00000000000000(v0.10.5)= ### 0.10.5 {small}`2024-01-25` #### Bug fixes * Fix outer concatenation along variables when only a subset of objects had an entry in layers {pr}`1291` {user}`ivirshup` * Fix comparison of >2d arrays in `uns` during concatenation {pr}`1300` {user}`ivirshup` * Fix IO with awkward array version 2.5.2 {pr}`1328` {user}`ivirshup` * Fix bug (introduced in 0.10.4) where indexing an AnnData with `list[bool]` would return the wrong result {pr}`1332` {user}`ivirshup` #### Documentation * Re-add search-as-you-type, this time via `readthedocs-sphinx-search` {pr}`1311` {user}`flying-sheep` #### Performance * `BaseCompressedSparseDataset`'s `indptr` is cached {pr}`1266` {user}`ilan-gold` * Improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1233` {user}`ilan-gold` python-anndata-0.12.0~rc1/docs/release-notes/0.10.6.md000066400000000000000000000027451500370632200220730ustar00rootroot00000000000000(v0.10.6)= ### 0.10.6 {small}`2024-03-11` #### Bug fixes * Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold` * Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup` * Bring optimization from {pr}`1233` to indexing on the whole `AnnData` object, not just the sparse dataset itself {pr}`1365` {user}`ilan-gold` * Fix mean 
slice length checking to use improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1366` {user}`ilan-gold` * Fixed overflow occurring when writing dask arrays with sparse chunks by always writing dask arrays with 64 bit indptr and indices, and adding an overflow check to `.append` method of sparse on disk structures {pr}`1348` {user}`ivirshup` * Modified `ValueError` message for invalid `.X` during construction to show more helpful list instead of ambiguous `__name__` {pr}`1395` {user}`eroell` * Pin `array-api-compat!=1.5` to avoid incorrect implementation of `asarray` {pr}`1411` {user}`ivirshup` #### Documentation * Type hints and docstrings for `.to_df` method are updated and fixed {pr}`1402` {user}`WeilerP` #### Development * `anndata`'s CI now tests against minimum versions of it's dependencies. As a result, several dependencies had their minimum required version bumped. See diff for details {pr}`1314` {user}`ivirshup` * `anndata` now tests against Python 3.12 {pr}`1373` {user}`ivirshup` python-anndata-0.12.0~rc1/docs/release-notes/0.10.7.md000066400000000000000000000006511500370632200220660ustar00rootroot00000000000000(v0.10.7)= ### 0.10.7 {small}`2024-04-09` #### Bug fixes * Handle upstream `numcodecs` bug where read-only string arrays cannot be encoded {user}`ivirshup` {pr}`1421` * Use in-memory sparse matrix directly to fix compatibility with `scipy` `1.13` {user}`ilan-gold` {pr}`1435` #### Performance * Remove `vindex` for subsetting `dask.array.Array` because of its slowness and memory consumption {user}`ilan-gold` {pr}`1432` python-anndata-0.12.0~rc1/docs/release-notes/0.10.8.md000066400000000000000000000007221500370632200220660ustar00rootroot00000000000000(v0.10.8)= ### 0.10.8 {small}`2024-06-20` #### Bug fixes * Write out `64bit` indptr when appropriate for {func}`~anndata.experimental.concat_on_disk` {pr}`1493` {user}`ilan-gold` * Support for Numpy 2 {pr}`1499` {user}`flying-sheep` * Fix {func}`~anndata.io.sparse_dataset` docstring test on account of new {mod}`scipy` version {pr}`1514` {user}`ilan-gold` #### Documentation * Improved example for {func}`~anndata.io.sparse_dataset` {pr}`1468` {user}`ivirshup` python-anndata-0.12.0~rc1/docs/release-notes/0.10.9.md000066400000000000000000000025071500370632200220720ustar00rootroot00000000000000(v0.10.9)= ### 0.10.9 {small}`2024-08-28` #### Bug fixes - Fix writing large number of columns for `h5` files {user}`ilan-gold` {user}`selmanozleyen` ({pr}`1147`) - Add warning for setting `X` on a view with repeated indices {user}`ilan-gold` ({pr}`1501`) - Coerce {class}`numpy.matrix` classes to arrays when trying to store them in `AnnData` {user}`flying-sheep` ({pr}`1516`) - Fix for setting a dense `X` view with a sparse matrix {user}`ilan-gold` ({pr}`1532`) - Upper bound {mod}`numpy` for `gpu` installation on account of {issue}`cupy/cupy#8391` {user}`ilan-gold` ({pr}`1540`) - Upper bound dask on account of {issue}`1579` {user}`ilan-gold` ({pr}`1580`) - Ensure setting {attr}`pandas.DataFrame.index` on a view of a {class}`~anndata.AnnData` instantiates the {class}`~pandas.DataFrame` from the view {user}`ilan-gold` ({pr}`1586`) - Disallow using {class}`~pandas.DataFrame`s with multi-index columns {user}`ilan-gold` ({pr}`1589`) #### Development Process - create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`) #### Documentation - add `callback` typing for {func}`~anndata.experimental.read_dispatched` and 
{func}`~anndata.experimental.write_dispatched` {user}`ilan-gold` ({pr}`1557`) #### Performance - Support for `concat_on_disk` outer join {user}`ilan-gold` ({pr}`1504`) python-anndata-0.12.0~rc1/docs/release-notes/0.11.0.md000066400000000000000000000107641500370632200220660ustar00rootroot00000000000000(v0.11.0)= ### 0.11.0 {small}`2024-11-07` Release candidates: - (v0.11.0rc3)= {guilabel}`rc3` 2024-10-14 - (v0.11.0rc2)= {guilabel}`rc2` 2024-09-24 - (v0.11.0rc1)= {guilabel}`rc1` 2024-09-04 #### Bug fixes - Ensure {func}`anndata.concat` of {class}`~anndata.AnnData` object with {class}`scipy.sparse.spmatrix` and {class}`scipy.sparse.sparray` dask arrays uses the correct fill value of 0. {user}`ilan-gold` ({pr}`1719`) - Ensure that views of AwkwardArrays have their "view" attributes removed on saving an {class}`~anndata.AnnData` object to disk. {user}`grst` ({pr}`1736`) #### Breaking changes - {guilabel}`rc3` Drop support for `python` 3.9 {user}`ilan-gold` ({pr}`1712`) - {guilabel}`rc2` A new `anndata.io` module contains all `read_*` and `write_*` functions, and all imports of such functions should go through this module. Old ways of importing these functions i.e., `from anndata import read_csv` or `from anndata._io.specs import read_elem` will still work, but are now considered deprecated and give a warning on import with the exception of {func}`anndata.io.read_zarr` and {func}`anndata.io.read_h5ad`, which will remain at the top-level `anndata` without warning. {user}`ilan-gold ({pr}`1682`) - {guilabel}`rc1` Removed deprecated modules `anndata.core` and `anndata.readwrite` {user}`ivirshup` ({pr}`1197`) - {guilabel}`rc1` No longer export `sparse_dataset` from `anndata.experimental`, instead exporting {func}`anndata.io.sparse_dataset` {user}`ilan-gold` ({pr}`1642`) - {guilabel}`rc1` Move `RWAble` and `InMemoryElem` out of `experimental`, renaming `RWAble` to {type}`~anndata.typing.AxisStorable` and `InMemoryElem` to {type}`~anndata.typing.RWAble` {user}`ilan-gold` ({pr}`1643`) #### Development Process - {guilabel}`rc2` Add extra `dask` dependency for installation i.e., `pip install anndata[dask]` {user}`ilan-gold` ({pr}`1677`) - {guilabel}`rc2` Remove `shall_` from variable names in `settings` {user}`ilan-gold` ({pr}`1685`) - {guilabel}`rc1` Create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`) #### Documentation - {guilabel}`rc1` Correct {attr}`anndata.AnnData.X` type to include {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` as possible types and being deprecation process for non-csr/csc {class}`scipy.sparse.spmatrix` types in {attr}`anndata.AnnData.X` {user}`ilan-gold` ({pr}`1616`) #### Features - Add support for ellipsis indexing of the {class}`~anndata.AnnData` object {user}`ilan-gold` ({pr}`1729`) - {guilabel}`rc1` `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {user}`ilan-gold` {user}`isaac-virshup` ({pr}`1028`) - {guilabel}`rc1` Allow `axis` parameter of e.g. 
{func}`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` ({pr}`1244`) - {guilabel}`rc1` Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` ({pr}`1270`) - {guilabel}`rc1` Add {attr}`~anndata.settings.remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1340`) - {guilabel}`rc1` Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`) - {guilabel}`rc1` Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}` falexwolf` ({pr}`1474`) - {guilabel}`rc1` Add {attr}`~anndata.settings.check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1507`) - {guilabel}`rc1` Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` ({pr}`1550`) - {guilabel}`rc1` Read and write support for nullable string arrays ({class}`pandas.arrays.StringArray`). Use pandas’ {doc}`pandas:user_guide/options` `mode.string_storage` to control which storage mode is used when reading `dtype="string"` columns. {user}`flying-sheep` ({pr}`1558`) - {guilabel}`rc1` Export {func}`~anndata.io.write_elem` and {func}`~anndata.io.read_elem` directly from the main package instead of `experimental` {user}`ilan-gold` ({pr}`1598`) - {guilabel}`rc1` Allow reading sparse data (via {func}`~anndata.io.read_elem` or {func}`~anndata.io.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` via {attr}`anndata.settings.use_sparse_array_on_read` {user}`ilan-gold` ({pr}`1633`) python-anndata-0.12.0~rc1/docs/release-notes/0.11.1.md000066400000000000000000000005601500370632200220600ustar00rootroot00000000000000(v0.11.1)= ### 0.11.1 {small}`2024-11-12` ### Bug fixes - Remove upper pin on `dask` and exclude versions broken with sparse indexing {user}`ilan-gold` ({pr}`1725`) - Fix chunking with -1 in `chunks` argument of `~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` ({pr}`1743`) - Fix `cupy<0.13` imports in non-gpu environments {user}`ilan-gold` ({pr}`1754`) python-anndata-0.12.0~rc1/docs/release-notes/0.11.2.md000066400000000000000000000012661500370632200220650ustar00rootroot00000000000000(v0.11.2)= ### 0.11.2 {small}`2025-01-07` ### Bug fixes - Cache accesses to the `data` and `indices` arrays in {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` {user}`ilan-gold` ({pr}`1744`) - Error out on floating point indices that are not actually integers {user}`ilan-gold` ({pr}`1746`) - `write_elem` now filters out incompatible `dataset_kwargs` when saving zero-dimensional arrays {user}`ilia-kats` ({pr}`1783`) - Add {mod}`scipy` 1.5 compatibility {user}`flying-sheep` ({pr}`1806`) ### Performance - Batch slice-based indexing in {class}`anndata.abc.CSRDataset` and {class}`anndata.abc.CSCDataset` for performance boost in `zarr` {user}`ilan-gold` ({pr}`1790`) python-anndata-0.12.0~rc1/docs/release-notes/0.11.3.md000066400000000000000000000001711500370632200220600ustar00rootroot00000000000000(v0.11.3)= ### 0.11.3 {small}`2025-01-10` ### Bug fixes - Upper bound `zarr` at runtime {user}`ilan-gold` ({pr}`1819`) python-anndata-0.12.0~rc1/docs/release-notes/0.11.4.md000066400000000000000000000015501500370632200220630ustar00rootroot00000000000000(v0.11.4)= ### 
0.11.4 {small}`2025-03-26` ### Bug fixes - Raise {class}`~anndata.ImplicitModificationWarning` when setting `X` on a view. {user}`ilan-gold` ({pr}`1853`) - Bound `dask` due to {issue}`dask/dask#11752` {user}`ilan-gold` ({pr}`1859`) - Fix concatenation of {class}`anndata.AnnData` objects along `var` using `join="outer"` when `varm` is not empty. {user}`ilia-kats` ({pr}`1911`) - Add `convert_strings_to_categoricals` parameter also to {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` as intended {user}`flying-sheep` ({pr}`1914`) - Allow initialization of {class}`anndata.AnnData` objects without `X` (since they could be constructed previously by deleting `X`) {user}`ilan-gold` ({pr}`1941`) ### Development Process - Fix version number inference in development environments (CI and local) {user}`flying-sheep` ({pr}`1831`) python-anndata-0.12.0~rc1/docs/release-notes/0.12.0rc1.md000066400000000000000000000001111500370632200224560ustar00rootroot00000000000000(v0.12.0rc1)= ### 0.12.0rc1 {small}`2025-04-09` No significant changes. python-anndata-0.12.0~rc1/docs/release-notes/0.4.0.md000066400000000000000000000010071500370632200217760ustar00rootroot00000000000000(v0.4.0)= ### 0.4.0 {small}`23 December, 2017` - read/write [.loom](https://loompy.org) files - scalability beyond dataset sizes that fit into memory: see this [blog post] - {class}`~anndata.AnnData` has a {class}`~anndata.AnnData.raw` attribute, which simplifies storing the data matrix when you consider it *raw*: see the [clustering tutorial] [blog post]: http://falexwolf.de/blog/171223_AnnData_indexing_views_HDF5-backing/ [clustering tutorial]: https://github.com/scverse/scanpy_usage/tree/master/170505_seurat python-anndata-0.12.0~rc1/docs/release-notes/0.5.0.md000066400000000000000000000007151500370632200220040ustar00rootroot00000000000000(v0.5.0)= ### 0.5.0 {small}`9 February, 2018` - inform about duplicates in {class}`~anndata.AnnData.var_names` and resolve them using {func}`~anndata.AnnData.var_names_make_unique` - automatically remove unused categories after slicing - read/write [.loom](https://loompy.org) files using loompy 2 - fixed read/write for a few text file formats - read [UMI tools] files: {func}`~anndata.io.read_umi_tools` [umi tools]: https://github.com/CGATOxford/UMI-tools python-anndata-0.12.0~rc1/docs/release-notes/0.6.0.md000066400000000000000000000006341500370632200220050ustar00rootroot00000000000000(v0.6.0)= ### 0.6.0 {small}`1 May, 2018` - compatibility with Seurat converter - tremendous speedup for {meth}`~anndata.AnnData.concatenate` - bug fix for deep copy of unstructured annotation after slicing - bug fix for reading HDF5 stored single-category annotations - `'outer join'` concatenation: adds zeros for concatenation of sparse data and nans for dense data - better memory efficiency in loom exports python-anndata-0.12.0~rc1/docs/release-notes/0.6.x.md000066400000000000000000000024261500370632200221160ustar00rootroot00000000000000(v0.6.x)= ### 0.6.\* {small}`2019-*-*` - better support for aligned mappings (obsm, varm, layers) `0.6.22` {pr}`155` {smaller}`I Virshup` - convenience accessors {func}`~anndata.AnnData.obs_vector`, {func}`~anndata.AnnData.var_vector` for 1d arrays. `0.6.21` {pr}`144` {smaller}`I Virshup` - compatibility with Scipy >=1.3 by removing `IndexMixin` dependency. `0.6.20` {pr}`151` {smaller}`P Angerer` - bug fix for second-indexing into views. `0.6.19` {smaller}`P Angerer` - bug fix for reading excel files. 
`0.6.19` {smaller}`A Wolf` - changed default compression to `None` in {func}`~anndata.AnnData.write_h5ad` to speed up read and write, disk space use is usually less critical. `0.6.16` {smaller}`A Wolf` - maintain dtype upon copy. `0.6.13` {smaller}`A Wolf` - {attr}`~anndata.AnnData.layers` inspired by [.loom](https://loompy.org) files allows their information lossless reading via {func}`~anndata.io.read_loom`. `0.6.7`–`0.6.9` {pr}`46` & {pr}`48` {smaller}`S Rybakov` - support for reading zarr files: {func}`~anndata.io.read_zarr` `0.6.7` {pr}`38` {smaller}`T White` - initialization from pandas DataFrames `0.6.` {smaller}`A Wolf` - iteration over chunks {func}`~anndata.AnnData.chunked_X` and {func}`~anndata.AnnData.chunk_X` `0.6.1` {pr}`20` {smaller}`S Rybakov` python-anndata-0.12.0~rc1/docs/release-notes/0.7.0.md000066400000000000000000000046461500370632200220150ustar00rootroot00000000000000(v0.7.0)= ### 0.7.0 {small}`22 January, 2020` ```{warning} Breaking changes introduced between `0.6.22.post1` and `0.7`: - Elements of {class}`~anndata.AnnData`s don’t have their dimensionality reduced when the main object is subset. This is to maintain consistency when subsetting. See discussion in {issue}`145`. - Internal modules like `anndata.core` are private and their contents are not stable: See {issue}`174`. - The old deprecated attributes `.smp*`. `.add` and `.data` have been removed. ``` #### View overhaul {pr}`164` - Indexing into a view no longer keeps a reference to intermediate view, see {issue}`62`. - Views are now lazy. Elements of view of AnnData are not indexed until they’re accessed. - Indexing with scalars no longer reduces dimensionality of contained arrays, see {issue}`145`. - All elements of AnnData should now follow the same rules about how they’re subset, see {issue}`145`. - Can now index by observations and variables at the same time. #### IO overhaul {pr}`167` - Reading and writing has been overhauled for simplification and speed. - Time and memory usage can be half of previous in typical use cases - Zarr backend now supports sparse arrays, and generally is closer to having the same features as HDF5. - Backed mode should see significant speed and memory improvements for access along compressed dimensions and IO. PR {pr}`241`. - {class}`~pandas.Categorical`s can now be ordered (PR {pr}`230`) and written to disk with a large number of categories (PR {pr}`217`). #### Mapping attributes overhaul {smaller}`(obsm, varm, layers, ...)` - New attributes {attr}`~anndata.AnnData.obsp` and {attr}`~anndata.AnnData.varp` have been added for two dimensional arrays where each axis corresponds to a single axis of the AnnData object. PR {pr}`207`. - These are intended to store values like cell-by-cell graphs, which are currently stored in {attr}`~anndata.AnnData.uns`. - Sparse arrays are now allowed as values in all mapping attributes. - DataFrames are now allowed as values in {attr}`~anndata.AnnData.obsm` and {attr}`~anndata.AnnData.varm`. - All mapping attributes now share an implementation and will have the same behaviour. PR {pr}`164`. #### Miscellaneous improvements - Mapping attributes now have ipython tab completion (e.g. `adata.obsm["\\t` can provide suggestions) PR {pr}`183`. - {class}`~anndata.AnnData` attributes are now delete-able (e.g. `del adata.raw`) PR {pr}`242`. 
- Many many bug fixes python-anndata-0.12.0~rc1/docs/release-notes/0.7.2.md000066400000000000000000000035021500370632200220050ustar00rootroot00000000000000(v0.7.2)= ### 0.7.2 {small}`15 May, 2020` #### Concatenation overhaul {smaller}`I Virshup` - Elements of `uns` can now be merged, see {pr}`350` - Outer joins now work for `layers` and `obsm`, see {pr}`352` - Fill value for outer joins can now be specified - Expect improvements in performance, see {issue}`303` #### Functionality - {attr}`~anndata.AnnData.obsp` and {attr}`~anndata.AnnData.varp` can now be transposed {pr}`370` {smaller}`A Wolf` - {meth}`~anndata.AnnData.obs_names_make_unique` is now better at making values unique, and will warn if ambiguities arise {pr}`345` {smaller}`M Weiden` - {attr}`~anndata.AnnData.obsp` is now preferred for storing pairwise relationships between observations. In practice, this means there will be deprecation warnings and reformatting applied to objects which stored connectivities under `uns["neighbors"]`. Square matrices in {attr}`~anndata.AnnData.uns` will no longer be sliced (use `.{obs,var}p` instead). {pr}`337` {smaller}`I Virshup` - {class}`~anndata.ImplicitModificationWarning` is now exported {pr}`315` {smaller}`P Angerer` - Better support for {class}`~numpy.ndarray` subclasses stored in `AnnData` objects {pr}`335` {smaller}`michalk8` #### Bug fixes - Fixed inplace modification of {class}`~pandas.Index` objects by the make unique function {pr}`348` {smaller}`I Virshup` - Passing ambiguous keys to {meth}`~anndata.AnnData.obs_vector` and {meth}`~anndata.AnnData.var_vector` now throws errors {pr}`340` {smaller}`I Virshup` - Fix instantiating {class}`~anndata.AnnData` objects from {class}`~pandas.DataFrame` {pr}`316` {smaller}`P Angerer` - Fixed indexing into `AnnData` objects with arrays like `adata[adata[:, gene].X > 0]` {pr}`332` {smaller}`I Virshup` - Fixed type of version {pr}`315` {smaller}`P Angerer` - Fixed deprecated import from {mod}`pandas` {pr}`319` {smaller}`P Angerer` python-anndata-0.12.0~rc1/docs/release-notes/0.7.3.md000066400000000000000000000002251500370632200220050ustar00rootroot00000000000000(v0.7.3)= ### 0.7.3 {small}`20 May, 2020` #### Bug fixes - Fixed bug where graphs used too much memory when copying {pr}`381` {smaller}`I Virshup` python-anndata-0.12.0~rc1/docs/release-notes/0.7.4.md000066400000000000000000000011351500370632200220070ustar00rootroot00000000000000(v0.7.4)= ### 0.7.4 {small}`10 July, 2020` #### Concatenation overhaul {pr}`378` {smaller}`I Virshup` - New function {func}`anndata.concat` for concatenating `AnnData` objects along either observations or variables - New documentation section: {doc}`/concatenation` #### Functionality - AnnData object created from dataframes with sparse values will have sparse `.X` {pr}`395` {smaller}`I Virshup` #### Bug fixes - Fixed error from `AnnData.concatenate` by bumping minimum versions of numpy and pandas {issue}`385` - Fixed colors being incorrectly changed when `AnnData` object was subset {pr}`388` python-anndata-0.12.0~rc1/docs/release-notes/0.7.5.md000066400000000000000000000006231500370632200220110ustar00rootroot00000000000000(v0.7.5)= ### 0.7.5 {small}`12 November, 2020` #### Functionality - Added ipython tab completion and a useful return from `.keys` to `adata.uns` {pr}`415` {smaller}`I Virshup` #### Bug fixes - Compatibility with `h5py>=3` strings {pr}`444` {smaller}`I Virshup` - Allow `adata.raw = None`, as is documented {pr}`447` {smaller}`I Virshup` - Fix warnings from pandas 1.1 {pr}`425` {smaller}`I Virshup` 
python-anndata-0.12.0~rc1/docs/release-notes/0.7.6.md000066400000000000000000000027501500370632200220150ustar00rootroot00000000000000(v0.7.6)= ### 0.7.6 {small}`11 April, 2021` #### Features - Added {meth}`anndata.AnnData.to_memory` for returning an in memory object from a backed one {pr}`470` {pr}`542` {smaller}`V Bergen` {smaller}`I Virshup` - {meth}`anndata.AnnData.write_loom` now writes `obs_names` and `var_names` using the `Index`'s `.name` attribute, if set {pr}`538` {smaller}`I Virshup` #### Bug fixes - Fixed bug where `np.str_` column names errored at write time {pr}`457` {smaller}`I Virshup` - Fixed "value.index does not match parent’s axis 0/1 names" error triggered when a data frame is stored in obsm/varm after obs_names/var_names is updated {pr}`461` {smaller}`G Eraslan` - Fixed `adata.write_csvs` when `adata` is a view {pr}`462` {smaller}`I Virshup` - Fixed null values being converted to strings when strings are converted to categorical {pr}`529` {smaller}`I Virshup` - Fixed handling of compression key word arguments {pr}`536` {smaller}`I Virshup` - Fixed copying a backed `AnnData` from changing which file the original object points at {pr}`533` {smaller}`ilia-kats` - Fixed a bug where calling `AnnData.concatenate` an `AnnData` with no variables would error {pr}`537` {smaller}`I Virshup` #### Deprecations - Passing positional arguments to {func}`anndata.io.read_loom` besides the path is now deprecated {pr}`538` {smaller}`I Virshup` - {func}`anndata.io.read_loom` arguments `obsm_names` and `varm_names` are now deprecated in favour of `obsm_mapping` and `varm_mapping` {pr}`538` {smaller}`I Virshup` python-anndata-0.12.0~rc1/docs/release-notes/0.7.7.md000066400000000000000000000012321500370632200220100ustar00rootroot00000000000000(v0.7.7)= ### 0.7.7 {small}`9 November, 2021` #### Bug fixes - Fixed propagation of import error when importing `write_zarr` but not all dependencies are installed {pr}`579` {smaller}`R Hillje` - Fixed issue with `.uns` sub-dictionaries being referenced by copies {pr}`576` {smaller}`I Virshup` - Fixed out-of-bounds integer indices not raising {class}`IndexError` {pr}`630` {smaller}`M Klein` - Fixed backed `SparseDataset` indexing with scipy 1.7.2 {pr}`638` {smaller}`I Virshup` #### Development processes - Use PEPs 621 (standardized project metadata), 631 (standardized dependencies), and 660 (standardized editable installs) {pr}`639` {smaller}`I Virshup` python-anndata-0.12.0~rc1/docs/release-notes/0.7.8.md000066400000000000000000000001701500370632200220110ustar00rootroot00000000000000(v0.7.8)= ### 0.7.8 {small}`9 November, 2021` #### Bug fixes - Re-include test helpers {pr}`641` {smaller}`I Virshup` python-anndata-0.12.0~rc1/docs/release-notes/0.8.0.md000066400000000000000000000043721500370632200220120ustar00rootroot00000000000000(v0.8.0)= ### 0.8.0 {small}`14th March, 2022` #### IO Specification ```{warning} The on disk format of AnnData objects has been updated with this release. Previous releases of `anndata` will not be able to read all files written by this version. For discussion of possible future solutions to this issue, see {issue}`698` ``` Internal handling of IO has been overhauled. This should make it much easier to support new datatypes, use partial access, and use `AnnData` internally in other formats. - Each element should be tagged with an `encoding_type` and `encoding_version`. See updated docs on the {doc}`file format ` - Support for nullable integer and boolean data arrays. More data types to come! 
- Experimental support for low level access to the IO API via {func}`~anndata.io.read_elem` and {func}`~anndata.io.write_elem` #### Features - Added PyTorch dataloader {class}`~anndata.experimental.AnnLoader` and lazy concatenation object {class}`~anndata.experimental.AnnCollection`. See the [tutorials] {pr}`416` {smaller}`S Rybakov` - Compatibility with `h5ad` files written from Julia {pr}`569` {smaller}`I Kats` - Many logging messages that should have been warnings are now warnings {pr}`650` {smaller}`I Virshup` - Significantly more efficient {func}`anndata.io.read_umi_tools` {pr}`661` {smaller}`I Virshup` - Fixed deepcopy of a copy of a view retaining sparse matrix view mixin type {pr}`670` {smaller}`M Klein` - In many cases {attr}`~anndata.AnnData.X` can now be `None` {pr}`463` {smaller}`R Cannoodt` {pr}`677` {smaller}`I Virshup`. Remaining work is documented in {issue}`467`. - Removed hard `xlrd` dependency {smaller}`I Virshup` - `obs` and `var` dataframes are no longer copied by default on `AnnData` instantiation {issue}`371` {smaller}`I Virshup` #### Bug fixes - Fixed issue where `.copy` was creating sparse matrices views when copying {pr}`670` {smaller}`michalk8` - Fixed issue where `.X` matrix read in from `zarr` would always have `float32` values {pr}`701` {smaller}`I Virshup` - `` Raw.to_adata` `` now includes `obsp` in the output {pr}`404` {smaller}`G Eraslan` #### Dependencies - `xlrd` dropped as a hard dependency - Now requires `h5py` `v3.0.0` or newer [tutorials]: https://anndata-tutorials.readthedocs.io/en/latest/index.html python-anndata-0.12.0~rc1/docs/release-notes/0.9.0.md000066400000000000000000000057541500370632200220200ustar00rootroot00000000000000(v0.9.0)= ### 0.9.0 {small}`2023-04-11` #### Features - Added experimental support for dask arrays {pr}`813` {user}`syelman` {user}`rahulbshrestha` - `obsm`, `varm` and `uns` can now hold [AwkwardArrays](https://awkward-array.org/quickstart.html) {pr}`647` {user}`giovp`, {user}`grst`, {user}`ivirshup` - Added experimental functions {func}`anndata.experimental.read_dispatched` and {func}`anndata.experimental.write_dispatched` which allow customizing IO with a callback {pr}`873` {user}`ilan-gold` {user}`ivirshup` - Better error messages during IO {pr}`734` {user}`flying-sheep`, {user}`ivirshup` - Unordered categorical columns are no longer cast to object during {func}`anndata.concat` {pr}`763` {user}`ivirshup` #### Documentation - New tutorials for experimental features > - {doc}`/tutorials/notebooks/anndata_dask_array` – {pr}`886` {user}`syelman` > - {doc}`/tutorials/notebooks/{read,write}_dispatched` – {pr}`scverse/anndata-tutorials#17` {user}`ilan-gold` > - {doc}`/tutorials/notebooks/awkward-arrays` – {pr}`scverse/anndata-tutorials#15` {user}`grst` - {doc}`File format description ` now includes a more formal specification {pr}`882` {user}`ivirshup` - {doc}`/interoperability`: new page on interoperability with other packages {pr}`831` {user}`ivirshup` - Expanded docstring more documentation for `backed` argument of {func}`anndata.io.read_h5ad` {pr}`812` {user}`jeskowagner` - Documented how to use alternative compression methods for the `h5ad` file format, see {meth}`AnnData.write_h5ad() ` {pr}`857` {user}`nigeil` - General typo corrections 😅 {pr}`870` {user}`folded` #### Breaking changes - The `AnnData` `dtype` argument no longer defaults to `float32` {pr}`854` {user}`ivirshup` - Previously deprecated `force_dense` argument {meth}`AnnData.write_h5ad() ` has been removed. 
{pr}`855` {user}`ivirshup` - Previously deprecated behaviour around storing adjacency matrices in `uns` has been removed {pr}`866` {user}`ivirshup` #### Other updates - Bump minimum python version to 3.8 {pr}`820` {user}`ivirshup` #### Deprecations - {meth}`AnnData.concatenate() ` is now deprecated in favour of {func}`anndata.concat` {pr}`845` {user}`ivirshup` #### Bug fixes - Fix warning from `rename_categories` {pr}`790` {smaller}`I Virshup` - Remove backwards compat checks for categories in `uns` when we can tell the file is new enough {pr}`790` {smaller}`I Virshup` - Categorical arrays are now created with a python `bool` instead of a `numpy.bool_` {pr}`856` - Fixed order dependent outer concatenation bug {pr}`904` {user}`ivirshup`, reported by {user}`szalata` - Fixed bug in renaming categories {pr}`790` {user}`ivirshup`, reported by {user}`perrin-isir` - Fixed IO bug when keys in `uns` ended in `_categories` {pr}`806` {user}`ivirshup`, reported by {user}`Hrovatin` - Fixed `raw.to_adata` not populating `obs` aligned values when `raw` was assigned through the setter {pr}`939` {user}`ivirshup` python-anndata-0.12.0~rc1/docs/release-notes/0.9.1.md000066400000000000000000000001541500370632200220060ustar00rootroot00000000000000(v0.9.1)= ### 0.9.1 {small}`2023-04-11` #### Bug fixes * Fixing windows support {pr}`958` {user}`Koncopd` python-anndata-0.12.0~rc1/docs/release-notes/0.9.2.md000066400000000000000000000007041500370632200220100ustar00rootroot00000000000000(v0.9.2)= ### 0.9.2 {small}`2023-07-25` #### Bug fixes * Views of `awkward.Array`s now work with `awkward>=2.3` {pr}`1040` {user}`ivirshup` * Fix ufuncs of views like `adata.X[:10].cov(axis=0)` returning views {pr}`1043` {user}`flying-sheep` * Fix instantiating AnnData where `.X` is a `DataFrame` with an integer valued index {pr}`1002` {user}`flying-sheep` * Fix {func}`~anndata.io.read_zarr` when used on `zarr.Group` {pr}`1057` {user}`ivirshup` python-anndata-0.12.0~rc1/docs/release-notes/index.md000066400000000000000000000000521500370632200224450ustar00rootroot00000000000000# Release notes ```{release-notes} . ``` python-anndata-0.12.0~rc1/docs/tutorials/000077500000000000000000000000001500370632200202775ustar00rootroot00000000000000python-anndata-0.12.0~rc1/docs/tutorials/index.md000066400000000000000000000007151500370632200217330ustar00rootroot00000000000000# Tutorials For a quick introduction to `AnnData`, check out {doc}`Getting Started with AnnData `. For working with the experimental data loaders also see {ref}`experimental-api`. ```{toctree} :maxdepth: 1 notebooks/getting-started notebooks/annloader notebooks/anncollection notebooks/anncollection-annloader notebooks/anndata_dask_array notebooks/awkward-arrays notebooks/{read,write}_dispatched notebooks/read_lazy zarr-v3 ``` python-anndata-0.12.0~rc1/docs/tutorials/zarr-v3.md000066400000000000000000000160161500370632200221310ustar00rootroot00000000000000# zarr-v3 Guide/Roadmap `anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format via {attr}`anndata.settings.zarr_write_format`, with the exception of structured arrays. Users should notice a significant performance improvement, especially for cloud data, but also likely for local data as well. Here is a quick guide on some of our learnings so far: ## Remote data We now provide the {func}`anndata.experimental.read_lazy` feature for reading as much of the {class}`~anndata.AnnData` object as lazily as possible, using `dask` and {mod}`xarray`. 
Please note that this feature is experimental and subject to change. To enable this functionality in a performant and feature-complete way for remote data sources, we use {doc}`zarr:user-guide/consolidated_metadata` on the `zarr` store (written by default). Please note that this introduces consistency issues – if you update the structure of the underlying `zarr` store i.e., remove a column from `obs`, the consolidated metadata will no longer be valid. Further, note that without consolidated metadata, we cannot guarantee your stored `AnnData` object will be fully readable. And even if it is fully readable, it will almost certainly be much slower to read. There are two ways of opening remote `zarr` stores from the `zarr-python` package, {class}`zarr.storage.FsspecStore` and {class}`zarr.storage.ObjectStore`, and both can be used with `read_lazy`. [`obstore` claims] to be more performant out-of-the-box, but notes that this claim has not been benchmarked with the `uvloop` event loop, which itself claims to be 2× more performant than the default event loop for `python`. ## Local data Local data generally poses a different set of challenges. First, write speeds can be somewhat slow and second, the creation of many small files on a file system can slow down a filesystem. For the "many small files" problem, `zarr` has introduced {ref}`sharding ` in the v3 file format. Sharding requires knowledge of the array element you are writing (such as shape or data type), though, and therefore you will need to use {func}`anndata.experimental.write_dispatched` to use sharding. For example, you cannot shard a 1D array with `shard` sizes `(256, 256)`. Here is a short example, although you should tune the sizes to your own use-case and also use the compression that makes the most sense for you: ```python import zarr import anndata as ad from collections.abc import Mapping from typing import Any ad.settings.zarr_write_format = 3 # Absolutely crucial! Sharding is only for the v3 file format! def write_sharded(group: zarr.Group, adata: ad.AnnData): def callback( func: ad.experimental.Write, g: zarr.Group, k: str, elem: ad.typing.RWAble, dataset_kwargs: Mapping[str, Any], iospec: ad.experimental.IOSpec, ): if iospec.encoding_type in {"array"}: dataset_kwargs = { "shards": tuple(int(2 ** (16 / len(elem.shape))) for _ in elem.shape), **dataset_kwargs, } dataset_kwargs["chunks"] = tuple(i // 2 for i in dataset_kwargs["shards"]) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: dataset_kwargs = {"shards": (2**16,), "chunks": (2**8,), **dataset_kwargs} func(g, k, elem, dataset_kwargs=dataset_kwargs) return ad.experimental.write_dispatched(group, "/", adata, callback=callback) ``` However, `zarr-python` can be slow with sharding throughput as well as writing throughput. Thus if you wish to speed up either writing, sharding, or both (or receive a modest speed-boost for reading), a bridge to the `zarr` implementation in Rust {doc}`zarrs-python ` can help with that (see the [zarr-benchmarks]): ``` uv pip install zarrs ``` ```python import zarr import zarrs zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"}) ``` However, this pipeline is not compatible with all types of zarr store, especially remote stores and there are limitations on where rust can give a performance boost for indexing. We therefore recommend this pipeline for writing full datasets and reading contiguous regions of said written data. ## Codecs The default `zarr-python` v3 codec for the v3 format is no longer `blosc` but `zstd`. 
While `zstd` is more widespread, you may find that its performance does not meet your old expectations.
Therefore, we recommend passing in the {class}`zarr.codecs.BloscCodec` to `compressor` on {func}`~anndata.AnnData.write_zarr` if you wish to return to the old behavior.
There is currently a bug with `numcodecs` that prevents data written from other non-numcodecs `zstd` implementations from being read in by the default zarr pipeline (to which the above rust pipeline falls back if it cannot handle a datatype or indexing scheme, like `vlen-string`): {issue}`zarr-developers/numcodecs#424`.
Thus it may be advisable to use `BloscCodec` with `zarr` v3 file format data if you wish to use the rust-accelerated pipeline until this issue is resolved.
The same issue with `zstd` applies to data that may eventually be written by the GPU `zstd` implementation (see below).

## Dask

Zarr v3 should be compatible with dask, although the default behavior is to use zarr's chunking as dask's own.
With sharding, this behavior may be undesirable, as shards can often contain many small chunks, which slows down i/o because dask will need to index into the zarr store for every chunk.
Therefore, it may be better to customize this behavior by passing `chunks=my_zarr_array.shards` as an argument to {func}`dask.array.from_zarr` or similar.

## GPU i/o

At the moment, it is unlikely your `anndata` i/o will work if you use {ref}`zarr.config.enable_gpu `.
It's *possible* that dense data i/o (i.e., using {func}`anndata.io.read_elem`) will work as expected, but this functionality is untested – sparse data, awkward arrays, and dataframes will not.
`kvikio` currently provides a {class}`kvikio.zarr.GDSStore`, although there are no working compressors exported from the `zarr-python` package at the moment (work is underway for `Zstd`: {pr}`zarr-developers/zarr-python#2863`).
We anticipate officially supporting this functionality for dense data, sparse data, and possibly awkward arrays in the next minor release, 0.13.

## Asynchronous i/o

At the moment, `anndata` exports no `async` functions.
However, `zarr-python` has a fully `async` API and provides its own event loop so that users like `anndata` can interact with a synchronous API while still benefitting from `zarr-python`'s asynchronous functionality under that API.
We anticipate providing `async` versions of {func}`anndata.io.read_elem` and {func}`anndata.experimental.read_dispatched` so that users can download data asynchronously without using the `zarr-python` event loop.
We also would like to create an asynchronous partial reader to enable iterative streaming of a dataset.
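## Worked examples

The sketch below ties the "Remote data" section above to concrete code: it opens a hypothetical consolidated `zarr` store on object storage and builds a lazy `AnnData` with {func}`anndata.experimental.read_lazy`.
The bucket URL `s3://example-bucket/pbmc.zarr` and the `{"anon": True}` storage option are placeholders, and the exact set of store types accepted may vary between releases; treat this as a starting point rather than a canonical recipe.

```python
from __future__ import annotations

import anndata as ad
import zarr

# Placeholder URL: point this at a real, consolidated zarr store you can access.
url = "s3://example-bucket/pbmc.zarr"

# zarr.open_group resolves fsspec-style URLs; storage_options are forwarded to
# the underlying filesystem (here: anonymous S3 access, an s3fs option).
group = zarr.open_group(url, mode="r", storage_options={"anon": True})

# Build the AnnData lazily: annotations and arrays are wrapped in dask/xarray
# containers and only materialized when accessed.
adata = ad.experimental.read_lazy(group)
print(adata)

# Pulling a small slice into memory is what triggers the actual remote reads.
subset = adata[:100].to_memory()
```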
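Similarly, here is a minimal sketch combining the "Codecs" and "Dask" recommendations: write with an explicit {class}`zarr.codecs.BloscCodec` instead of the default `zstd` codec, then open the dense `X` array through dask using the on-disk shard (or chunk) layout.
The path `example.zarr`, the toy array size, and the codec settings are arbitrary, and the snippet assumes the `compressor` keyword of {func}`~anndata.AnnData.write_zarr` described in the Codecs section; adapt it to your own data and compression needs.

```python
from __future__ import annotations

import anndata as ad
import dask.array as da
import numpy as np
import zarr
from zarr.codecs import BloscCodec

ad.settings.zarr_write_format = 3  # write the zarr v3 file format

# Toy data; real datasets will be much larger (and often sparse).
rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.random((1_000, 200), dtype=np.float32))

# Write with an explicit Blosc compressor rather than the default zstd codec.
adata.write_zarr("example.zarr", compressor=BloscCodec(cname="lz4", clevel=5))

# Re-open the dense X array and let dask use the shard layout (if any) as its
# chunking, falling back to the zarr chunks for unsharded arrays.
x_arr = zarr.open_group("example.zarr", mode="r")["X"]
x_dask = da.from_zarr(x_arr, chunks=x_arr.shards or x_arr.chunks)

print(x_dask.mean().compute())
```

For arrays written with a sharding callback like `write_sharded` above, `x_arr.shards` is set and dask will then schedule one task per shard rather than one per (much smaller) chunk.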
[`obstore` claims]: https://developmentseed.org/obstore/latest/performance [zarr-benchmarks]: https://github.com/LDeakin/zarr_benchmarks python-anndata-0.12.0~rc1/hatch.toml000066400000000000000000000022501500370632200173040ustar00rootroot00000000000000[envs.default] installer = "uv" features = [ "dev" ] [envs.docs] features = [ "doc" ] scripts.build = "sphinx-build -M html docs docs/_build -W --keep-going {args}" scripts.open = "python3 -m webbrowser -t docs/_build/html/index.html" scripts.clean = "git clean -fdX -- {args:docs}" [envs.towncrier] scripts.create = "towncrier create {args}" scripts.build = "python3 ci/scripts/towncrier_automation.py {args}" scripts.clean = "git restore --source=HEAD --staged --worktree -- docs/release-notes" [envs.hatch-test] default-args = [ ] features = [ "dev", "test" ] extra-dependencies = [ "ipykernel" ] env-vars.UV_CONSTRAINT = "ci/constraints.txt" overrides.matrix.deps.env-vars = [ { if = [ "pre" ], key = "UV_PRERELEASE", value = "allow" }, { if = [ "min" ], key = "UV_CONSTRAINT", value = "ci/constraints.txt ci/min-deps.txt" }, ] overrides.matrix.deps.pre-install-commands = [ { if = [ "min" ], value = "uv run ci/scripts/min-deps.py pyproject.toml --all-extras -o ci/min-deps.txt" }, ] overrides.matrix.deps.python = [ { if = [ "min" ], value = "3.11" }, { if = [ "stable", "pre" ], value = "3.13" }, ] [[envs.hatch-test.matrix]] deps = [ "stable", "pre", "min" ] python-anndata-0.12.0~rc1/pyproject.toml000066400000000000000000000170351500370632200202430ustar00rootroot00000000000000[build-system] build-backend = "hatchling.build" requires = [ "hatchling", "hatch-vcs" ] [project] name = "anndata" description = "Annotated data." requires-python = ">=3.11" license = "BSD-3-Clause" authors = [ { name = "Philipp Angerer" }, { name = "Alex Wolf" }, { name = "Isaac Virshup" }, { name = "Sergei Rybakov" }, ] maintainers = [ { name = "Isaac Virshup", email = "ivirshup@gmail.com" }, { name = "Philipp Angerer", email = "philipp.angerer@helmholtz-munich.de" }, { name = "Ilan Gold", email = "ilan.gold@helmholtz-munich.de" }, ] readme = "README.md" classifiers = [ "Environment :: Console", "Framework :: Jupyter", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "Natural Language :: English", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Visualization", ] dependencies = [ # pandas 2.1.0rc0 has pandas/issues/54622 "pandas >=2.0.0, !=2.1.0rc0, !=2.1.2", "numpy>=1.25", # https://github.com/scverse/anndata/issues/1434 "scipy >1.11", "h5py>=3.8", "natsort", "packaging>=24.2", # array-api-compat 1.5 has https://github.com/scverse/anndata/issues/1410 "array_api_compat>1.4,!=1.5", "legacy-api-wrap", "zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3", ] dynamic = [ "version" ] [project.urls] Documentation = "https://anndata.readthedocs.io/" Source = "https://github.com/scverse/anndata" Home-page = "https://github.com/scverse/anndata" [project.optional-dependencies] dev = [ # runtime dev version generation "hatch-vcs", "anndata[dev-doc,test]", ] doc = [ "sphinx>=8.2.1", "sphinx-book-theme>=1.1.0", "sphinx-autodoc-typehints>=2.2.0", "sphinx-issues", "sphinx-copybutton", "sphinx-toolbox>=3.8.0", "sphinxext.opengraph", "myst-nb", 
"scanpydoc[theme,typehints] >=0.15.1", "awkward>=2.3", "IPython", # For syntax highlighting in notebooks "myst_parser", "sphinx_design>=0.5.0", "readthedocs-sphinx-search", # for unreleased changes "anndata[dev-doc,dask]", "awkward>=2.3", ] dev-doc = [ "towncrier>=24.8.0" ] # release notes tool test-full = [ "anndata[test,lazy]" ] test = [ "loompy>=3.0.5", "pytest>=8.2,<8.3.4", "pytest-cov", "pytest-randomly", "pytest-memray", "pytest-mock", "pytest-xdist[psutil]", "filelock", "matplotlib", "scikit-learn", "openpyxl", "joblib", "boltons", "scanpy>=1.10", "httpx", # For data downloading "dask[distributed]", "awkward>=2.3", "pyarrow", "anndata[dask]", ] gpu = [ "cupy" ] cu12 = [ "cupy-cuda12x" ] cu11 = [ "cupy-cuda11x" ] # requests and aiohttp needed for zarr remote data lazy = [ "xarray>=2024.06.0", "aiohttp", "requests", "anndata[dask]" ] # https://github.com/dask/dask/issues/11290 # https://github.com/dask/dask/issues/11752 dask = [ "dask[array]>=2023.5.1,!=2024.8.*,!=2024.9.*,<2025.2.0" ] [tool.hatch.version] source = "vcs" raw-options.version_scheme = "release-branch-semver" [tool.hatch.build.targets.wheel] packages = [ "src/anndata", "src/testing" ] [tool.coverage.run] data_file = "test-data/coverage" source_pkgs = [ "anndata" ] omit = [ "src/anndata/_version.py", "**/test_*.py" ] concurrency = [ "multiprocessing" ] parallel = "true" [tool.coverage.xml] output = "test-data/coverage.xml" [tool.coverage.paths] source = [ "./src", "**/site-packages" ] [tool.coverage.report] exclude_also = [ "if TYPE_CHECKING:", ] [tool.pytest.ini_options] addopts = [ "--import-mode=importlib", "--strict-markers", "--doctest-modules", "--pyargs", "-ptesting.anndata._pytest", "--dist=loadgroup", ] filterwarnings = [ "ignore::anndata._warnings.OldFormatWarning", "ignore::anndata._warnings.ExperimentalFeatureWarning", ] # When `--strict-warnings` is used, all warnings are treated as errors, except those: filterwarnings_when_strict = [ "default::anndata._warnings.ImplicitModificationWarning", "default:Transforming to str index:UserWarning", "default:(Observation|Variable) names are not unique. 
To make them unique:UserWarning", "default::scipy.sparse.SparseEfficiencyWarning", "default::dask.array.core.PerformanceWarning", "default:anndata will no longer support zarr v2:DeprecationWarning", "default:The codec `vlen-utf8:UserWarning", "default:The dtype `StringDType():UserWarning", "default:Consolidated metadata is:UserWarning", ] python_files = "test_*.py" testpaths = [ "anndata", # docstrings (module name due to --pyargs) "./tests", # unit tests "./ci/scripts", # CI script tests "./docs/concatenation.rst", # further doctests ] # For some reason this effects how logging is shown when tests are run xfail_strict = true markers = [ "gpu: mark test to run on GPU" ] [tool.ruff] src = [ "src" ] [tool.ruff.format] docstring-code-format = true [tool.ruff.lint] select = [ "E", # Error detected by Pycodestyle "EM", # Traceback-friendly error messages "F", # Errors detected by Pyflakes "FBT", # Boolean positional arguments "W", # Warning detected by Pycodestyle "PLW", # Pylint "UP", # pyupgrade "I", # isort "TC", # manage type checking blocks "TID", # Banned imports "ICN", # Follow import conventions "PTH", # Pathlib instead of os.path "PT", # Pytest conventions "PYI", # Typing ] ignore = [ # line too long -> we accept long comment lines; formatter gets rid of long code lines "E501", # Do not assign a lambda expression, use a def -> AnnData allows lambda expression assignments, "E731", # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation "E741", # We use relative imports from parent modules "TID252", # Shadowing loop variables isn’t a big deal "PLW2901", ] [tool.ruff.lint.per-file-ignores] # E721 comparing types, but we specifically are checking that we aren't getting subtypes (views) "tests/test_readwrite.py" = [ "E721" ] [tool.ruff.lint.isort] known-first-party = [ "anndata" ] required-imports = [ "from __future__ import annotations" ] [tool.ruff.lint.flake8-tidy-imports.banned-api] "subprocess.call".msg = "Use `subprocess.run([…])` instead" "subprocess.check_call".msg = "Use `subprocess.run([…], check=True)` instead" "subprocess.check_output".msg = "Use `subprocess.run([…], check=True, capture_output=True)` instead" "legacy_api_wrap.legacy_api".msg = "Use anndata.compat.old_positionals instead" [tool.ruff.lint.flake8-type-checking] exempt-modules = [ ] strict = true [tool.codespell] skip = ".git,*.pdf,*.svg" ignore-words-list = "theis,coo,homogenous" [tool.towncrier] package = "anndata" directory = "docs/release-notes" filename = "docs/release-notes/{version}.md" single_file = false package_dir = "src" issue_format = "{{pr}}`{issue}`" title_format = "(v{version})=\n### {version} {{small}}`{project_date}`" fragment.bugfix.name = "Bug fixes" fragment.doc.name = "Documentation" fragment.feature.name = "Features" fragment.misc.name = "Miscellaneous improvements" fragment.performance.name = "Performance" fragment.breaking.name = "Breaking changes" fragment.dev.name = "Development Process" python-anndata-0.12.0~rc1/src/000077500000000000000000000000001500370632200161105ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/000077500000000000000000000000001500370632200175165ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/__init__.py000066400000000000000000000030371500370632200216320ustar00rootroot00000000000000"""Annotated multivariate observation data.""" from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: from typing import Any from ._core.anndata import AnnData from 
._core.extensions import register_anndata_namespace from ._core.merge import concat from ._core.raw import Raw from ._settings import settings from ._version import __version__ from ._warnings import ( ExperimentalFeatureWarning, ImplicitModificationWarning, OldFormatWarning, WriteWarning, ) from .io import read_h5ad, read_zarr from .utils import module_get_attr_redirect # Submodules need to be imported last from . import abc, experimental, typing, io, types # noqa: E402 isort: skip # We use these in tests by attribute access from . import logging # noqa: F401, E402 isort: skip _DEPRECATED_IO = ( "read_loom", "read_hdf", "read_excel", "read_umi_tools", "read_csv", "read_text", "read_mtx", ) _DEPRECATED = dict((method, f"io.{method}") for method in _DEPRECATED_IO) def __getattr__(attr_name: str) -> Any: return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED) __all__ = [ # Attributes "__version__", "settings", # Submodules "abc", "experimental", "typing", "types", "io", # Classes "AnnData", "Raw", # Functions "concat", "read_zarr", "read_h5ad", "register_anndata_namespace", # Warnings "OldFormatWarning", "WriteWarning", "ImplicitModificationWarning", "ExperimentalFeatureWarning", ] python-anndata-0.12.0~rc1/src/anndata/_core/000077500000000000000000000000001500370632200206055ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/_core/__init__.py000066400000000000000000000000001500370632200227040ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/_core/access.py000066400000000000000000000015601500370632200224220ustar00rootroot00000000000000from __future__ import annotations from functools import reduce from typing import TYPE_CHECKING, NamedTuple if TYPE_CHECKING: from anndata import AnnData class ElementRef(NamedTuple): parent: AnnData attrname: str keys: tuple[str, ...] 
= () def __str__(self) -> str: return f".{self.attrname}" + "".join(map(lambda x: f"['{x}']", self.keys)) @property def _parent_el(self): return reduce( lambda d, k: d[k], self.keys[:-1], getattr(self.parent, self.attrname) ) def get(self): """Get referenced value in self.parent.""" return reduce(lambda d, k: d[k], self.keys, getattr(self.parent, self.attrname)) def set(self, val): """Set referenced value in self.parent.""" self._parent_el[self.keys[-1]] = val def delete(self): del self._parent_el[self.keys[-1]] python-anndata-0.12.0~rc1/src/anndata/_core/aligned_df.py000066400000000000000000000062771500370632200232470ustar00rootroot00000000000000from __future__ import annotations import warnings from collections.abc import Mapping from functools import singledispatch from typing import TYPE_CHECKING import pandas as pd from pandas.api.types import is_string_dtype from .._warnings import ImplicitModificationWarning if TYPE_CHECKING: from collections.abc import Iterable from typing import Any, Literal @singledispatch def _gen_dataframe( anno: Any, index_names: Iterable[str], *, source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, ) -> pd.DataFrame: # pragma: no cover msg = f"Cannot convert {type(anno)} to {attr} DataFrame" raise ValueError(msg) @_gen_dataframe.register(Mapping) @_gen_dataframe.register(type(None)) def _gen_dataframe_mapping( anno: Mapping[str, Any] | None, index_names: Iterable[str], *, source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, ) -> pd.DataFrame: if anno is None or len(anno) == 0: anno = {} def mk_index(l: int) -> pd.Index: return pd.RangeIndex(0, l, name=None).astype(str) for index_name in index_names: if index_name not in anno: continue df = pd.DataFrame( anno, index=anno[index_name], columns=[k for k in anno.keys() if k != index_name], ) break else: df = pd.DataFrame( anno, index=None if length is None else mk_index(length), columns=None if len(anno) else [], ) if length is None: df.index = mk_index(len(df)) elif length != len(df): raise _mk_df_error(source, attr, length, len(df)) return df @_gen_dataframe.register(pd.DataFrame) def _gen_dataframe_df( anno: pd.DataFrame, index_names: Iterable[str], *, source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, ): if length is not None and length != len(anno): raise _mk_df_error(source, attr, length, len(anno)) anno = anno.copy(deep=False) if not is_string_dtype(anno.index): warnings.warn("Transforming to str index.", ImplicitModificationWarning) anno.index = anno.index.astype(str) if not len(anno.columns): anno.columns = anno.columns.astype(str) return anno @_gen_dataframe.register(pd.Series) @_gen_dataframe.register(pd.Index) def _gen_dataframe_1d( anno: pd.Series | pd.Index, index_names: Iterable[str], *, source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, ): msg = f"Cannot convert {type(anno)} to {attr} DataFrame" raise ValueError(msg) def _mk_df_error( source: Literal["X", "shape"], attr: Literal["obs", "var"], expected: int, actual: int, ): if source == "X": what = "row" if attr == "obs" else "column" msg = ( f"Observations annot. `{attr}` must have as many rows as `X` has {what}s " f"({expected}), but has {actual} rows." 
) else: msg = ( f"`shape` is inconsistent with `{attr}` " "({actual} {what}s instead of {expected})" ) return ValueError(msg) python-anndata-0.12.0~rc1/src/anndata/_core/aligned_mapping.py000066400000000000000000000330371500370632200243030ustar00rootroot00000000000000from __future__ import annotations import warnings from abc import ABC, abstractmethod from collections.abc import MutableMapping, Sequence from copy import copy from dataclasses import dataclass from typing import TYPE_CHECKING, Generic, TypeVar import numpy as np import pandas as pd from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning from ..compat import AwkArray, CSArray, CSMatrix from ..utils import ( axis_len, convert_to_dict, deprecated, raise_value_error_if_multiindex_columns, warn_once, ) from .access import ElementRef from .index import _subset from .storage import coerce_array from .views import as_view, view_update if TYPE_CHECKING: from collections.abc import Callable, Iterable, Iterator, Mapping from typing import ClassVar, Literal, Self from .anndata import AnnData from .raw import Raw OneDIdx = Sequence[int] | Sequence[bool] | slice TwoDIdx = tuple[OneDIdx, OneDIdx] # TODO: pd.DataFrame only allowed in AxisArrays? Value = pd.DataFrame | CSMatrix | CSArray | np.ndarray P = TypeVar("P", bound="AlignedMappingBase") """Parent mapping an AlignedView is based on.""" I = TypeVar("I", OneDIdx, TwoDIdx) class AlignedMappingBase(MutableMapping[str, Value], ABC): """\ An abstract base class for Mappings containing array-like values aligned to either one or both AnnData axes. """ _allow_df: ClassVar[bool] """If this mapping supports heterogeneous DataFrames""" _view_class: ClassVar[type[AlignedView]] """The view class for this aligned mapping.""" _actual_class: ClassVar[type[AlignedActual]] """The actual class (which has it’s own data) for this aligned mapping.""" _parent: AnnData | Raw """The parent object that this mapping is aligned to.""" def __repr__(self): return f"{type(self).__name__} with keys: {', '.join(self.keys())}" def _ipython_key_completions_(self) -> list[str]: return list(self.keys()) def _validate_value(self, val: Value, key: str) -> Value: """Raises an error if value is invalid""" if isinstance(val, AwkArray): warn_once( "Support for Awkward Arrays is currently experimental. " "Behavior may change in the future. Please report any issues you may encounter!", ExperimentalFeatureWarning, # stacklevel=3, ) for i, axis in enumerate(self.axes): if self.parent.shape[axis] == axis_len(val, i): continue right_shape = tuple(self.parent.shape[a] for a in self.axes) actual_shape = tuple(axis_len(val, a) for a, _ in enumerate(self.axes)) if actual_shape[i] is None and isinstance(val, AwkArray): dim = ("obs", "var")[i] msg = ( f"The AwkwardArray is of variable length in dimension {dim}.", f"Try ak.to_regular(array, {i}) before including the array in AnnData", ) else: dims = tuple(("obs", "var")[ax] for ax in self.axes) msg = ( f"Value passed for key {key!r} is of incorrect shape. " f"Values of {self.attrname} must match dimensions {dims} of parent. " f"Value had shape {actual_shape} while it should have had {right_shape}." 
) raise ValueError(msg) name = f"{self.attrname.title().rstrip('s')} {key!r}" return coerce_array(val, name=name, allow_df=self._allow_df) @property @abstractmethod def attrname(self) -> str: """What attr for the AnnData is this?""" @property @abstractmethod def axes(self) -> tuple[Literal[0, 1], ...]: """Which axes of the parent is this aligned to?""" @property @abstractmethod def is_view(self) -> bool: ... @property def parent(self) -> AnnData | Raw: return self._parent def copy(self) -> dict[str, Value]: # Shallow copy for awkward array since their buffers are immutable return { k: copy(v) if isinstance(v, AwkArray) else v.copy() for k, v in self.items() } def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[Self, I]: """Returns a subset copy-on-write view of the object.""" return self._view_class(self, parent, subset_idx) @deprecated("dict(obj)") def as_dict(self) -> dict: return dict(self) class AlignedView(AlignedMappingBase, Generic[P, I]): is_view: ClassVar[Literal[True]] = True # override docstring parent: AnnData """Reference to parent AnnData view""" attrname: str """What attribute in the parent is this?""" parent_mapping: P """The object this is a view of.""" subset_idx: I """The subset of the parent to view.""" def __init__(self, parent_mapping: P, parent_view: AnnData, subset_idx: I): self.parent_mapping = parent_mapping self._parent = parent_view self.subset_idx = subset_idx if hasattr(parent_mapping, "_axis"): # LayersBase has no _axis, the rest does self._axis = parent_mapping._axis # type: ignore def __getitem__(self, key: str) -> Value: return as_view( _subset(self.parent_mapping[key], self.subset_idx), ElementRef(self.parent, self.attrname, (key,)), ) def __setitem__(self, key: str, value: Value) -> None: value = self._validate_value(value, key) # Validate before mutating warnings.warn( f"Setting element `.{self.attrname}['{key}']` of view, " "initializing view as actual.", ImplicitModificationWarning, stacklevel=2, ) with view_update(self.parent, self.attrname, ()) as new_mapping: new_mapping[key] = value def __delitem__(self, key: str) -> None: if key not in self: msg = f"{key!r} not found in view of {self.attrname}" raise KeyError(msg) # Make sure it exists before bothering with a copy warnings.warn( f"Removing element `.{self.attrname}['{key}']` of view, " "initializing view as actual.", ImplicitModificationWarning, stacklevel=2, ) with view_update(self.parent, self.attrname, ()) as new_mapping: del new_mapping[key] def __contains__(self, key: str) -> bool: return key in self.parent_mapping def __iter__(self) -> Iterator[str]: return iter(self.parent_mapping) def __len__(self) -> int: return len(self.parent_mapping) class AlignedActual(AlignedMappingBase): is_view: ClassVar[Literal[False]] = False _data: MutableMapping[str, Value] """Underlying mapping to the data""" def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[str, Value]): self._parent = parent self._data = store for k, v in self._data.items(): self._data[k] = self._validate_value(v, k) def __getitem__(self, key: str) -> Value: return self._data[key] def __setitem__(self, key: str, value: Value): value = self._validate_value(value, key) self._data[key] = value def __contains__(self, key: str) -> bool: return key in self._data def __delitem__(self, key: str): del self._data[key] def __iter__(self) -> Iterator[str]: return iter(self._data) def __len__(self) -> int: return len(self._data) class AxisArraysBase(AlignedMappingBase): """\ Mapping of key→array-like, where array-like is 
aligned to an axis of parent AnnData. """ _allow_df: ClassVar = True _dimnames: ClassVar = ("obs", "var") _axis: Literal[0, 1] @property def attrname(self) -> str: return f"{self.dim}m" @property def axes(self) -> tuple[Literal[0, 1]]: """Axes of the parent this is aligned to""" return (self._axis,) @property def dim(self) -> str: """Name of the dimension this aligned to.""" return self._dimnames[self._axis] def to_df(self) -> pd.DataFrame: """Convert to pandas dataframe.""" df = pd.DataFrame(index=self.dim_names) for key in self.keys(): value = self[key] for icolumn, column in enumerate(value.T): df[f"{key}{icolumn + 1}"] = column return df def _validate_value(self, val: Value, key: str) -> Value: if isinstance(val, pd.DataFrame): raise_value_error_if_multiindex_columns(val, f"{self.attrname}[{key!r}]") if not val.index.equals(self.dim_names): # Could probably also re-order index if it’s contained try: pd.testing.assert_index_equal(val.index, self.dim_names) except AssertionError as e: msg = f"value.index does not match parent’s {self.dim} names:\n{e}" raise ValueError(msg) from None else: msg = "Index.equals and pd.testing.assert_index_equal disagree" raise AssertionError(msg) return super()._validate_value(val, key) @property def dim_names(self) -> pd.Index: return (self.parent.obs_names, self.parent.var_names)[self._axis] class AxisArrays(AlignedActual, AxisArraysBase): def __init__( self, parent: AnnData | Raw, *, axis: Literal[0, 1], store: MutableMapping[str, Value] | AxisArraysBase, ): if axis not in {0, 1}: raise ValueError() self._axis = axis super().__init__(parent, store=store) class AxisArraysView(AlignedView[AxisArraysBase, OneDIdx], AxisArraysBase): pass AxisArraysBase._view_class = AxisArraysView AxisArraysBase._actual_class = AxisArrays class LayersBase(AlignedMappingBase): """\ Mapping of key: array-like, where array-like is aligned to both axes of the parent anndata. """ _allow_df: ClassVar = False attrname: ClassVar[Literal["layers"]] = "layers" axes: ClassVar[tuple[Literal[0], Literal[1]]] = (0, 1) class Layers(AlignedActual, LayersBase): pass class LayersView(AlignedView[LayersBase, TwoDIdx], LayersBase): pass LayersBase._view_class = LayersView LayersBase._actual_class = Layers class PairwiseArraysBase(AlignedMappingBase): """\ Mapping of key: array-like, where both axes of array-like are aligned to one axis of the parent anndata. 
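    For example, every entry of a pairwise mapping such as :attr:`~anndata.AnnData.obsp`
    must be square in the observation dimension. A minimal sketch (the key
    ``"distances"`` and the toy shapes are illustrative only, not part of this module)::

        import numpy as np
        from anndata import AnnData

        adata = AnnData(np.zeros((3, 2)))
        adata.obsp["distances"] = np.ones((3, 3))  # both axes align to obs -> accepted
        # adata.obsp["distances"] = np.ones((3, 2)) would raise a ValueError,
        # since the second axis does not match the number of observations.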
""" _allow_df: ClassVar = False _dimnames: ClassVar = ("obs", "var") _axis: Literal[0, 1] @property def attrname(self) -> str: return f"{self.dim}p" @property def axes(self) -> tuple[Literal[0], Literal[0]] | tuple[Literal[1], Literal[1]]: """Axes of the parent this is aligned to""" return self._axis, self._axis # type: ignore @property def dim(self) -> str: """Name of the dimension this aligned to.""" return self._dimnames[self._axis] class PairwiseArrays(AlignedActual, PairwiseArraysBase): def __init__( self, parent: AnnData, *, axis: Literal[0, 1], store: MutableMapping[str, Value], ): if axis not in {0, 1}: raise ValueError() self._axis = axis super().__init__(parent, store=store) class PairwiseArraysView(AlignedView[PairwiseArraysBase, OneDIdx], PairwiseArraysBase): pass PairwiseArraysBase._view_class = PairwiseArraysView PairwiseArraysBase._actual_class = PairwiseArrays AlignedMapping = ( AxisArrays | AxisArraysView | Layers | LayersView | PairwiseArrays | PairwiseArraysView ) T = TypeVar("T", bound=AlignedMapping) """Pair of types to be aligned.""" @dataclass class AlignedMappingProperty(property, Generic[T]): """A :class:`property` that creates an ephemeral AlignedMapping. The actual data is stored as `f'_{self.name}'` in the parent object. """ name: str """Name of the attribute in the parent object.""" cls: type[T] """Concrete type that will be constructed.""" axis: Literal[0, 1] | None = None """Axis of the parent to align to.""" def construct(self, obj: AnnData, *, store: MutableMapping[str, Value]) -> T: if self.axis is None: return self.cls(obj, store=store) return self.cls(obj, axis=self.axis, store=store) @property def fget(self) -> Callable[[], None]: """Fake fget for sphinx-autodoc-typehints.""" def fake(): ... fake.__annotations__ = {"return": self.cls._actual_class | self.cls._view_class} return fake def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T: if obj is None: # When accessed from the class, e.g. via `AnnData.obs`, # this needs to return a `property` instance, e.g. for Sphinx return self # type: ignore if not obj.is_view: return self.construct(obj, store=getattr(obj, f"_{self.name}")) parent_anndata = obj._adata_ref idxs = (obj._oidx, obj._vidx) parent: AlignedMapping = getattr(parent_anndata, self.name) return parent._view(obj, tuple(idxs[ax] for ax in parent.axes)) def __set__( self, obj: AnnData, value: Mapping[str, Value] | Iterable[tuple[str, Value]] ) -> None: value = convert_to_dict(value) _ = self.construct(obj, store=value) # Validate if obj.is_view: obj._init_as_actual(obj.copy()) setattr(obj, f"_{self.name}", value) def __delete__(self, obj) -> None: setattr(obj, self.name, dict()) python-anndata-0.12.0~rc1/src/anndata/_core/anndata.py000066400000000000000000002271031500370632200225720ustar00rootroot00000000000000"""\ Main class and helper functions. """ from __future__ import annotations import warnings from collections import OrderedDict from collections.abc import Mapping, MutableMapping, Sequence from copy import copy, deepcopy from functools import partial, singledispatch from pathlib import Path from textwrap import dedent from typing import TYPE_CHECKING, cast import h5py import numpy as np import pandas as pd from natsort import natsorted from numpy import ma from pandas.api.types import infer_dtype from scipy import sparse from scipy.sparse import issparse from anndata._warnings import ImplicitModificationWarning from .. 
import utils from .._settings import settings from ..compat import CSArray, DaskArray, ZarrArray, _move_adj_mtx, old_positionals from ..logging import anndata_logger as logger from ..utils import ( axis_len, deprecated, ensure_df_homogeneous, raise_value_error_if_multiindex_columns, ) from .access import ElementRef from .aligned_df import _gen_dataframe from .aligned_mapping import AlignedMappingProperty, AxisArrays, Layers, PairwiseArrays from .file_backing import AnnDataFileManager, to_memory from .index import _normalize_indices, _subset, get_vector from .raw import Raw from .sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from .storage import coerce_array from .views import ( DictView, _resolve_idxs, as_view, ) if TYPE_CHECKING: from collections.abc import Iterable from os import PathLike from typing import Any, ClassVar, Literal from zarr.storage import StoreLike from ..compat import Index1D from ..typing import XDataType from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView from .index import Index class AnnData(metaclass=utils.DeprecationMixinMeta): """\ An annotated data matrix. .. figure:: ../_static/img/anndata_schema.svg :width: 260px :align: right :class: dark-light :class:`~anndata.AnnData` stores a data matrix :attr:`X` together with annotations of observations :attr:`obs` (:attr:`obsm`, :attr:`obsp`), variables :attr:`var` (:attr:`varm`, :attr:`varp`), and unstructured annotations :attr:`uns`. An :class:`~anndata.AnnData` object `adata` can be sliced like a :class:`~pandas.DataFrame`, for instance `adata_subset = adata[:, list_of_variable_names]`. :class:`~anndata.AnnData`’s basic structure is similar to R’s ExpressionSet [Huber15]_. If setting an `.h5ad`-formatted HDF5 backing file `.filename`, data remains on the disk but is automatically loaded into memory if needed. Parameters ---------- X A #observations × #variables data matrix. A view of the data is used if the data type matches, otherwise, a copy is made. obs Key-indexed one-dimensional observations annotation of length #observations. var Key-indexed one-dimensional variables annotation of length #variables. uns Key-indexed unstructured annotation. obsm Key-indexed multi-dimensional observations annotation of length #observations. If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype. varm Key-indexed multi-dimensional variables annotation of length #variables. If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype. layers Key-indexed multi-dimensional arrays aligned to dimensions of `X`. shape Shape tuple (#observations, #variables). Can only be provided if `X` is `None`. filename Name of backing file. See :class:`h5py.File`. filemode Open mode of backing file. See :class:`h5py.File`. See Also -------- io.read_h5ad io.read_csv io.read_excel io.read_hdf io.read_loom io.read_zarr io.read_mtx io.read_text io.read_umi_tools Notes ----- :class:`~anndata.AnnData` stores observations (samples) of variables/features in the rows of a matrix. This is the convention of the modern classics of statistics [Hastie09]_ and machine learning [Murphy12]_, the convention of dataframes both in R and Python and the established statistics and machine learning packages in Python (statsmodels_, scikit-learn_). Single dimensional annotations of the observation and variables are stored in the :attr:`obs` and :attr:`var` attributes as :class:`~pandas.DataFrame`\\ s. This is intended for metrics calculated over their axes. 
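    For example, a per-observation summary statistic can be stored as a column of
    :attr:`obs` (a minimal sketch; the key ``"total_counts"`` is just an illustrative
    name, and ``numpy`` is assumed to be imported as ``np``)::

        adata.obs["total_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()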
Multi-dimensional annotations are stored in :attr:`obsm` and :attr:`varm`, which are aligned to the objects observation and variable dimensions respectively. Square matrices representing graphs are stored in :attr:`obsp` and :attr:`varp`, with both of their own dimensions aligned to their associated axis. Additional measurements across both observations and variables are stored in :attr:`layers`. Indexing into an AnnData object can be performed by relative position with numeric indices (like pandas’ :meth:`~pandas.DataFrame.iloc`), or by labels (like :meth:`~pandas.DataFrame.loc`). To avoid ambiguity with numeric indexing into observations or variables, indexes of the AnnData object are converted to strings by the constructor. Subsetting an AnnData object by indexing into it will also subset its elements according to the dimensions they were aligned to. This means an operation like `adata[list_of_obs, :]` will also subset :attr:`obs`, :attr:`obsm`, and :attr:`layers`. Subsetting an AnnData object returns a view into the original object, meaning very little additional memory is used upon subsetting. This is achieved lazily, meaning that the constituent arrays are subset on access. Copying a view causes an equivalent “real” AnnData object to be generated. Attempting to modify a view (at any attribute except X) is handled in a copy-on-modify manner, meaning the object is initialized in place. Here’s an example:: batch1 = adata[adata.obs["batch"] == "batch1", :] batch1.obs["value"] = 0 # This makes batch1 a “real” AnnData object At the end of this snippet: `adata` was not modified, and `batch1` is its own AnnData object with its own data. Similar to Bioconductor’s `ExpressionSet` and :mod:`scipy.sparse` matrices, subsetting an AnnData object retains the dimensionality of its constituent arrays. Therefore, unlike with the classes exposed by :mod:`pandas`, :mod:`numpy`, and `xarray`, there is no concept of a one dimensional AnnData object. AnnDatas always have two inherent dimensions, :attr:`obs` and :attr:`var`. Additionally, maintaining the dimensionality of the AnnData object allows for consistent handling of :mod:`scipy.sparse` matrices and :mod:`numpy` arrays. .. _statsmodels: http://www.statsmodels.org/stable/index.html .. 
_scikit-learn: http://scikit-learn.org/ """ _BACKED_ATTRS = ["X", "raw.X"] # backwards compat _H5_ALIASES = dict( X={"X", "_X", "data", "_data"}, obs={"obs", "_obs", "smp", "_smp"}, var={"var", "_var"}, uns={"uns"}, obsm={"obsm", "_obsm", "smpm", "_smpm"}, varm={"varm", "_varm"}, layers={"layers", "_layers"}, ) _H5_ALIASES_NAMES = dict( obs={"obs_names", "smp_names", "row_names", "index"}, var={"var_names", "col_names", "index"}, ) _accessors: ClassVar[set[str]] = set() @old_positionals( "obsm", "varm", "layers", "raw", "dtype", "shape", "filename", "filemode", "asview", ) def __init__( self, X: XDataType | pd.DataFrame | None = None, obs: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None, var: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None, uns: Mapping[str, Any] | None = None, *, obsm: np.ndarray | Mapping[str, Sequence[Any]] | None = None, varm: np.ndarray | Mapping[str, Sequence[Any]] | None = None, layers: Mapping[str, XDataType] | None = None, raw: Mapping[str, Any] | None = None, dtype: np.dtype | type | str | None = None, shape: tuple[int, int] | None = None, filename: PathLike[str] | str | None = None, filemode: Literal["r", "r+"] | None = None, asview: bool = False, obsp: np.ndarray | Mapping[str, Sequence[Any]] | None = None, varp: np.ndarray | Mapping[str, Sequence[Any]] | None = None, oidx: Index1D | None = None, vidx: Index1D | None = None, ): # check for any multi-indices that aren’t later checked in coerce_array for attr, key in [(obs, "obs"), (var, "var"), (X, "X")]: if isinstance(attr, pd.DataFrame): raise_value_error_if_multiindex_columns(attr, key) if asview: if not isinstance(X, AnnData): msg = "`X` has to be an AnnData object." raise ValueError(msg) self._init_as_view(X, oidx, vidx) else: self._init_as_actual( X=X, obs=obs, var=var, uns=uns, obsm=obsm, varm=varm, raw=raw, layers=layers, dtype=dtype, shape=shape, obsp=obsp, varp=varp, filename=filename, filemode=filemode, ) def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): if adata_ref.isbacked and adata_ref.is_view: msg = ( "Currently, you cannot index repeatedly into a backed AnnData, " "that is, you cannot make a view of a view." ) raise ValueError(msg) self._is_view = True if isinstance(oidx, int | np.integer): if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): msg = f"Observation index `{oidx}` is out of range." raise IndexError(msg) oidx += adata_ref.n_obs * (oidx < 0) oidx = slice(oidx, oidx + 1, 1) if isinstance(vidx, int | np.integer): if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars): msg = f"Variable index `{vidx}` is out of range." 
raise IndexError(msg) vidx += adata_ref.n_vars * (vidx < 0) vidx = slice(vidx, vidx + 1, 1) if adata_ref.is_view: prev_oidx, prev_vidx = adata_ref._oidx, adata_ref._vidx adata_ref = adata_ref._adata_ref oidx, vidx = _resolve_idxs((prev_oidx, prev_vidx), (oidx, vidx), adata_ref) # self._adata_ref is never a view self._adata_ref = adata_ref self._oidx = oidx self._vidx = vidx # the file is the same as of the reference object self.file = adata_ref.file # views on attributes of adata_ref obs_sub = adata_ref.obs.iloc[oidx] var_sub = adata_ref.var.iloc[vidx] # fix categories uns = copy(adata_ref._uns) if settings.remove_unused_categories: self._remove_unused_categories(adata_ref.obs, obs_sub, uns) self._remove_unused_categories(adata_ref.var, var_sub, uns) # set attributes self._obs = as_view(obs_sub, view_args=(self, "obs")) self._var = as_view(var_sub, view_args=(self, "var")) self._uns = uns # set data if self.isbacked: self._X = None # set raw, easy, as it’s immutable anyways... if adata_ref._raw is not None: # slicing along variables axis is ignored self._raw = adata_ref.raw[oidx] self._raw._adata = self else: self._raw = None def _init_as_actual( self, X=None, obs=None, var=None, uns=None, obsm=None, varm=None, varp=None, obsp=None, raw=None, layers=None, dtype=None, shape=None, filename=None, filemode=None, ): # view attributes self._is_view = False self._adata_ref = None self._oidx = None self._vidx = None # ---------------------------------------------------------------------- # various ways of initializing the data # ---------------------------------------------------------------------- # If X is a data frame, we store its indices for verification x_indices = [] # init from file if filename is not None: self.file = AnnDataFileManager(self, filename, filemode) else: self.file = AnnDataFileManager(self, None) # init from AnnData if isinstance(X, AnnData): if any((obs, var, uns, obsm, varm, obsp, varp)): msg = "If `X` is a dict no further arguments must be provided." raise ValueError(msg) X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( X._X, X.obs, X.var, X.uns, X.obsm, X.varm, X.obsp, X.varp, X.layers, X.raw, ) # init from DataFrame elif isinstance(X, pd.DataFrame): # to verify index matching, we wait until obs and var are DataFrames if obs is None: obs = pd.DataFrame(index=X.index) elif not isinstance(X.index, pd.RangeIndex): x_indices.append(("obs", "index", X.index.astype(str))) if var is None: var = pd.DataFrame(index=X.columns) elif not isinstance(X.columns, pd.RangeIndex): x_indices.append(("var", "columns", X.columns.astype(str))) X = ensure_df_homogeneous(X, "X") # ---------------------------------------------------------------------- # actually process the data # ---------------------------------------------------------------------- # check data type of X if X is not None: X = coerce_array(X, name="X") if shape is not None: msg = "`shape` needs to be `None` if `X` is not `None`." 
raise ValueError(msg) _check_2d_shape(X) # if type doesn’t match, a copy is made, otherwise, use a view if dtype is not None: warnings.warn( "The dtype argument is deprecated and will be removed in late 2024.", FutureWarning, ) if issparse(X) or isinstance(X, ma.MaskedArray): # TODO: maybe use view on data attribute of sparse matrix # as in readwrite.read_10x_h5 if X.dtype != np.dtype(dtype): X = X.astype(dtype) elif isinstance(X, ZarrArray | DaskArray): X = X.astype(dtype) else: # is np.ndarray or a subclass, convert to true np.ndarray X = np.asarray(X, dtype) # data matrix and shape self._X = X n_obs, n_vars = X.shape source = "X" else: self._X = None n_obs, n_vars = ( shape if shape is not None else _infer_shape(obs, var, obsm, varm, layers, obsp, varp) ) source = "shape" # annotations self._obs = _gen_dataframe( obs, ["obs_names", "row_names"], source=source, attr="obs", length=n_obs ) self._var = _gen_dataframe( var, ["var_names", "col_names"], source=source, attr="var", length=n_vars ) # now we can verify if indices match! for attr_name, x_name, idx in x_indices: attr = getattr(self, attr_name) if isinstance(attr.index, pd.RangeIndex): attr.index = idx elif not idx.equals(attr.index): msg = f"Index of {attr_name} must match {x_name} of X." raise ValueError(msg) # unstructured annotations self.uns = uns or OrderedDict() self.obsm = obsm self.varm = varm self.obsp = obsp self.varp = varp # Backwards compat for connectivities matrices in uns["neighbors"] _move_adj_mtx({"uns": self._uns, "obsp": self._obsp}) self._check_dimensions() if settings.check_uniqueness: self._check_uniqueness() if self.filename: assert not isinstance(raw, Raw), ( "got raw from other adata but also filename?" ) if {"raw", "raw.X"} & set(self.file): raw = dict(X=None, **raw) if not raw: self._raw = None elif isinstance(raw, Mapping): self._raw = Raw(self, **raw) else: # is a Raw from another AnnData self._raw = Raw(self, raw._X, raw.var, raw.varm) # clean up old formats self._clean_up_old_format(uns) # layers self.layers = layers @old_positionals("show_stratified", "with_disk") def __sizeof__( self, *, show_stratified: bool = False, with_disk: bool = False ) -> int: def get_size(X) -> int: def cs_to_bytes(X) -> int: return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes) if isinstance(X, h5py.Dataset) and with_disk: return int(np.array(X.shape).prod() * X.dtype.itemsize) elif isinstance(X, BaseCompressedSparseDataset) and with_disk: return cs_to_bytes(X._to_backed()) elif issparse(X): return cs_to_bytes(X) else: return X.__sizeof__() sizes = {} attrs = ["X", "_obs", "_var"] attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"] for attr in attrs + attrs_multi: if attr in attrs_multi: keys = getattr(self, attr).keys() s = sum(get_size(getattr(self, attr)[k]) for k in keys) else: s = get_size(getattr(self, attr)) if s > 0 and show_stratified: from tqdm import tqdm print( f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}" ) sizes[attr] = s return sum(sizes.values()) def _gen_repr(self, n_obs, n_vars) -> str: if self.isbacked: backed_at = f" backed at {str(self.filename)!r}" else: backed_at = "" descr = f"AnnData object with n_obs × n_vars = {n_obs} × {n_vars}{backed_at}" for attr in [ "obs", "var", "uns", "obsm", "varm", "layers", "obsp", "varp", ]: keys = getattr(self, attr).keys() if len(keys) > 0: descr += f"\n {attr}: {str(list(keys))[1:-1]}" return descr def __repr__(self) -> str: if self.is_view: return "View of " + self._gen_repr(self.n_obs, self.n_vars) else: return 
self._gen_repr(self.n_obs, self.n_vars) def __eq__(self, other): """Equality testing""" msg = ( "Equality comparisons are not supported for AnnData objects, " "instead compare the desired attributes." ) raise NotImplementedError(msg) @property def shape(self) -> tuple[int, int]: """Shape of data matrix (:attr:`n_obs`, :attr:`n_vars`).""" return self.n_obs, self.n_vars @property def X(self) -> XDataType | None: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" if self.isbacked: if not self.file.is_open: self.file.open() X = self.file["X"] if isinstance(X, h5py.Group): X = sparse_dataset(X) # This is so that we can index into a backed dense dataset with # indices that aren’t strictly increasing if self.is_view: X = _subset(X, (self._oidx, self._vidx)) elif self.is_view and self._adata_ref.X is None: X = None elif self.is_view: X = as_view( _subset(self._adata_ref.X, (self._oidx, self._vidx)), ElementRef(self, "X"), ) else: X = self._X return X # if self.n_obs == 1 and self.n_vars == 1: # return X[0, 0] # elif self.n_obs == 1 or self.n_vars == 1: # if issparse(X): X = X.toarray() # return X.flatten() # else: # return X @X.setter def X(self, value: XDataType | None): if value is None: if self.isbacked: msg = "Cannot currently remove data matrix from backed object." raise NotImplementedError(msg) if self.is_view: self._init_as_actual(self.copy()) self._X = None return value = coerce_array(value, name="X", allow_array_like=True) # If indices are both arrays, we need to modify them # so we don’t set values like coordinates # This can occur if there are successive views if ( self.is_view and isinstance(self._oidx, np.ndarray) and isinstance(self._vidx, np.ndarray) ): oidx, vidx = np.ix_(self._oidx, self._vidx) else: oidx, vidx = self._oidx, self._vidx if ( np.isscalar(value) or (hasattr(value, "shape") and (self.shape == value.shape)) or (self.n_vars == 1 and self.n_obs == len(value)) or (self.n_obs == 1 and self.n_vars == len(value)) ): if not np.isscalar(value): if self.is_view and any( isinstance(idx, np.ndarray) and len(np.unique(idx)) != len(idx.ravel()) for idx in [oidx, vidx] ): msg = ( "You are attempting to set `X` to a matrix on a view which has non-unique indices. " "The resulting `adata.X` will likely not equal the value to which you set it. " "To avoid this potential issue, please make a copy of the data first. " "In the future, this operation will throw an error." ) warnings.warn(msg, FutureWarning, stacklevel=1) if self.shape != value.shape: # For assigning vector of values to 2d array or matrix # Not necessary for row of 2d array value = value.reshape(self.shape) if self.isbacked: if self.is_view: X = self.file["X"] if isinstance(X, h5py.Group): X = sparse_dataset(X) X[oidx, vidx] = value else: self._set_backed("X", value) else: if self.is_view: if sparse.issparse(self._adata_ref._X) and isinstance( value, np.ndarray ): if isinstance(self._adata_ref.X, CSArray): memory_class = sparse.coo_array else: memory_class = sparse.coo_matrix value = memory_class(value) elif sparse.issparse(value) and isinstance( self._adata_ref._X, np.ndarray ): warnings.warn( "Trying to set a dense array with a sparse array on a view." "Densifying the sparse array." 
"This may incur excessive memory usage", stacklevel=2, ) value = value.toarray() warnings.warn( "Modifying `X` on a view results in data being overridden", ImplicitModificationWarning, stacklevel=2, ) self._adata_ref._X[oidx, vidx] = value else: self._X = value else: msg = f"Data matrix has wrong shape {value.shape}, need to be {self.shape}." raise ValueError(msg) @X.deleter def X(self): self.X = None layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty( "layers", Layers ) """\ Dictionary-like object with values of the same dimensions as :attr:`X`. Layers in AnnData are inspired by loompy’s :ref:`loomlayers`. Return the layer named `"unspliced"`:: adata.layers["unspliced"] Create or replace the `"spliced"` layer:: adata.layers["spliced"] = ... Assign the 10th column of layer `"spliced"` to the variable a:: a = adata.layers["spliced"][:, 10] Delete the `"spliced"` layer:: del adata.layers["spliced"] Return layers’ names:: adata.layers.keys() """ @property def raw(self) -> Raw: """\ Store raw version of :attr:`X` and :attr:`var` as `.raw.X` and `.raw.var`. The :attr:`raw` attribute is initialized with the current content of an object by setting:: adata.raw = adata.copy() Its content can be deleted:: adata.raw = None # or del adata.raw Upon slicing an AnnData object along the obs (row) axis, :attr:`raw` is also sliced. Slicing an AnnData object along the vars (columns) axis leaves :attr:`raw` unaffected. Note that you can call:: adata.raw[:, 'orig_variable_name'].X to retrieve the data associated with a variable that might have been filtered out or "compressed away" in :attr:`X`. """ return self._raw @raw.setter def raw(self, value: AnnData): if value is None: del self.raw elif not isinstance(value, AnnData): msg = "Can only init raw attribute with an AnnData object." raise ValueError(msg) else: if self.is_view: self._init_as_actual(self.copy()) self._raw = Raw(self, X=value.X, var=value.var, varm=value.varm) @raw.deleter def raw(self): if self.is_view: self._init_as_actual(self.copy()) self._raw = None @property def n_obs(self) -> int: """Number of observations.""" return len(self.obs_names) @property def n_vars(self) -> int: """Number of variables/features.""" return len(self.var_names) def _set_dim_df(self, value: pd.DataFrame, attr: Literal["obs", "var"]): if not isinstance(value, pd.DataFrame): msg = f"Can only assign pd.DataFrame to {attr}." raise ValueError(msg) raise_value_error_if_multiindex_columns(value, attr) value_idx = self._prep_dim_index(value.index, attr) if self.is_view: self._init_as_actual(self.copy()) setattr(self, f"_{attr}", value) self._set_dim_index(value_idx, attr) if not len(value.columns): value.columns = value.columns.astype(str) def _prep_dim_index(self, value, attr: str) -> pd.Index: """Prepares index to be uses as obs_names or var_names for AnnData object.AssertionError If a pd.Index is passed, this will use a reference, otherwise a new index object is created. 
""" if self.shape[attr == "var"] != len(value): msg = f"Length of passed value for {attr}_names is {len(value)}, but this AnnData has shape: {self.shape}" raise ValueError(msg) if isinstance(value, pd.Index) and not isinstance(value.name, str | type(None)): msg = ( f"AnnData expects .{attr}.index.name to be a string or None, " f"but you passed a name of type {type(value.name).__name__!r}" ) raise ValueError(msg) else: value = pd.Index(value) if not isinstance(value.name, str | type(None)): value.name = None if ( len(value) > 0 and not isinstance(value, pd.RangeIndex) and infer_dtype(value) not in {"string", "bytes"} ): sample = list(value[: min(len(value), 5)]) msg = dedent( f""" AnnData expects .{attr}.index to contain strings, but got values like: {sample} Inferred to be: {infer_dtype(value)} """ ) warnings.warn(msg, stacklevel=2) return value def _set_dim_index(self, value: pd.Index, attr: str): # Assumes _prep_dim_index has been run if self.is_view: self._init_as_actual(self.copy()) getattr(self, attr).index = value for v in getattr(self, f"_{attr}m").values(): if isinstance(v, pd.DataFrame): v.index = value @property def obs(self) -> pd.DataFrame: """One-dimensional annotation of observations (`pd.DataFrame`).""" return self._obs @obs.setter def obs(self, value: pd.DataFrame): self._set_dim_df(value, "obs") @obs.deleter def obs(self): self.obs = pd.DataFrame({}, index=self.obs_names) @property def obs_names(self) -> pd.Index: """Names of observations (alias for `.obs.index`).""" return self.obs.index @obs_names.setter def obs_names(self, names: Sequence[str]): names = self._prep_dim_index(names, "obs") self._set_dim_index(names, "obs") @property def var(self) -> pd.DataFrame: """One-dimensional annotation of variables/ features (`pd.DataFrame`).""" return self._var @var.setter def var(self, value: pd.DataFrame): self._set_dim_df(value, "var") @var.deleter def var(self): self.var = pd.DataFrame({}, index=self.var_names) @property def var_names(self) -> pd.Index: """Names of variables (alias for `.var.index`).""" return self.var.index @var_names.setter def var_names(self, names: Sequence[str]): names = self._prep_dim_index(names, "var") self._set_dim_index(names, "var") @property def uns(self) -> MutableMapping: """Unstructured annotation (ordered dictionary).""" uns = self._uns if self.is_view: uns = DictView(uns, view_args=(self, "_uns")) return uns @uns.setter def uns(self, value: MutableMapping): if not isinstance(value, MutableMapping): msg = "Only mutable mapping types (e.g. dict) are allowed for `.uns`." raise ValueError(msg) if isinstance(value, DictView): value = value.copy() if self.is_view: self._init_as_actual(self.copy()) self._uns = value @uns.deleter def uns(self): self.uns = OrderedDict() obsm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( "obsm", AxisArrays, 0 ) """\ Multi-dimensional annotation of observations (mutable structured :class:`~numpy.ndarray`). Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` of length `n_obs`. Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( "varm", AxisArrays, 1 ) """\ Multi-dimensional annotation of variables/features (mutable structured :class:`~numpy.ndarray`). Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` of length `n_vars`. Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. 
""" obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( AlignedMappingProperty("obsp", PairwiseArrays, 0) ) """\ Pairwise annotation of observations, a mutable mapping with array-like values. Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` whose first two dimensions are of length `n_obs`. Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( AlignedMappingProperty("varp", PairwiseArrays, 1) ) """\ Pairwise annotation of variables/features, a mutable mapping with array-like values. Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` whose first two dimensions are of length `n_var`. Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. """ def obs_keys(self) -> list[str]: """List keys of observation annotation :attr:`obs`.""" return self._obs.keys().tolist() def var_keys(self) -> list[str]: """List keys of variable annotation :attr:`var`.""" return self._var.keys().tolist() def obsm_keys(self) -> list[str]: """List keys of observation annotation :attr:`obsm`.""" return list(self.obsm.keys()) def varm_keys(self) -> list[str]: """List keys of variable annotation :attr:`varm`.""" return list(self.varm.keys()) def uns_keys(self) -> list[str]: """List keys of unstructured annotation.""" return sorted(list(self._uns.keys())) @property def isbacked(self) -> bool: """`True` if object is backed on disk, `False` otherwise.""" return self.filename is not None @property def is_view(self) -> bool: """`True` if object is view of another AnnData object, `False` otherwise.""" return self._is_view @property def filename(self) -> Path | None: """\ Change to backing mode by setting the filename of a `.h5ad` file. - Setting the filename writes the stored data to disk. - Setting the filename when the filename was previously another name moves the backing file from the previous file to the new file. If you want to copy the previous file, use `copy(filename='new_filename')`. """ return self.file.filename @filename.setter def filename(self, filename: PathLike[str] | str | None): # convert early for later comparison filename = None if filename is None else Path(filename) # change from backing-mode back to full loading into memory if filename is None: if self.filename is not None: self.file._to_memory_mode() else: # both filename and self.filename are None # do nothing return else: if self.filename is not None: if self.filename != filename: # write the content of self to the old file # and close the file self.write() self.filename.rename(filename) else: # do nothing return else: # change from memory to backing-mode # write the content of self to disk if self.raw is not None: as_dense = ("X", "raw/X") else: as_dense = ("X",) self.write(filename, as_dense=as_dense) # open new file for accessing self.file.open(filename, "r+") # as the data is stored on disk, we can safely set self._X to None self._X = None def _set_backed(self, attr, value): from .._io.utils import write_attribute write_attribute(self.file._file, attr, value) def _normalize_indices(self, index: Index | None) -> tuple[slice, slice]: return _normalize_indices(index, self.obs_names, self.var_names) # TODO: this is not quite complete... def __delitem__(self, index: Index): obs, var = self._normalize_indices(index) # TODO: does this really work? 
if not self.isbacked: del self._X[obs, var] else: X = self.file["X"] del X[obs, var] self._set_backed("X", X) if var == slice(None): del self._obs.iloc[obs, :] if obs == slice(None): del self._var.iloc[var, :] def __getitem__(self, index: Index) -> AnnData: """Returns a sliced view of the object.""" oidx, vidx = self._normalize_indices(index) return AnnData(self, oidx=oidx, vidx=vidx, asview=True) @staticmethod @singledispatch def _remove_unused_categories( df_full: pd.DataFrame, df_sub: pd.DataFrame, uns: dict[str, Any] ): for k in df_full: if not isinstance(df_full[k].dtype, pd.CategoricalDtype): continue all_categories = df_full[k].cat.categories with pd.option_context("mode.chained_assignment", None): df_sub[k] = df_sub[k].cat.remove_unused_categories() # also correct the colors... color_key = f"{k}_colors" if color_key not in uns: continue color_vec = uns[color_key] if np.array(color_vec).ndim == 0: # Make 0D arrays into 1D ones uns[color_key] = np.array(color_vec)[(None,)] elif len(color_vec) != len(all_categories): # Reset colors del uns[color_key] else: idx = np.where(np.isin(all_categories, df_sub[k].cat.categories))[0] uns[color_key] = np.array(color_vec)[(idx,)] def rename_categories(self, key: str, categories: Sequence[Any]): """\ Rename categories of annotation `key` in :attr:`obs`, :attr:`var`, and :attr:`uns`. Only supports passing a list/array-like `categories` argument. Besides calling `self.obs[key].cat.categories = categories` – similar for :attr:`var` - this also renames categories in unstructured annotation that uses the categorical annotation `key`. Parameters ---------- key Key for observations or variables annotation. categories New categories, the same number as the old categories. """ if isinstance(categories, Mapping): msg = "Only list-like `categories` is supported." raise ValueError(msg) if key in self.obs: old_categories = self.obs[key].cat.categories.tolist() self.obs[key] = self.obs[key].cat.rename_categories(categories) elif key in self.var: old_categories = self.var[key].cat.categories.tolist() self.var[key] = self.var[key].cat.rename_categories(categories) else: msg = f"{key} is neither in `.obs` nor in `.var`." raise ValueError(msg) # this is not a good solution # but depends on the scanpy conventions for storing the categorical key # as `groupby` in the `params` slot for k1, v1 in self.uns.items(): if not ( isinstance(v1, Mapping) and "params" in v1 and "groupby" in v1["params"] and v1["params"]["groupby"] == key ): continue for k2, v2 in v1.items(): # picks out the recarrays that are named according to the old # categories if isinstance(v2, np.ndarray) and v2.dtype.names is not None: if list(v2.dtype.names) == old_categories: self.uns[k1][k2].dtype.names = categories else: logger.warning( f"Omitting {k1}/{k2} as old categories do not match." ) def strings_to_categoricals(self, df: pd.DataFrame | None = None): """\ Transform string annotations to categoricals. Only affects string annotations that lead to less categories than the total number of observations. Params ------ df If `df` is `None`, modifies both :attr:`obs` and :attr:`var`, otherwise modifies `df` inplace. Notes ----- Turns the view of an :class:`~anndata.AnnData` into an actual :class:`~anndata.AnnData`. 
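        A minimal sketch (assuming ``adata`` has exactly three observations;
        the column name is illustrative)::

            adata.obs["cell_type"] = ["a", "b", "a"]  # plain strings
            adata.strings_to_categoricals()           # "cell_type" becomes categorical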
""" dont_modify = False # only necessary for backed views if df is None: dfs = [self.obs, self.var] if self.is_view and self.isbacked: dont_modify = True else: dfs = [df] for df in dfs: string_cols = [ key for key in df.columns if infer_dtype(df[key]) == "string" ] for key in string_cols: c = pd.Categorical(df[key]) # TODO: We should only check if non-null values are unique, but # this would break cases where string columns with nulls could # be written as categorical, but not as string. # Possible solution: https://github.com/scverse/anndata/issues/504 if len(c.categories) >= len(c): continue # Ideally this could be done inplace sorted_categories = natsorted(c.categories) if not np.array_equal(c.categories, sorted_categories): c = c.reorder_categories(sorted_categories) if dont_modify: msg = ( "Please call `.strings_to_categoricals()` on full " "AnnData, not on this view. You might encounter this" "error message while copying or writing to disk." ) raise RuntimeError(msg) df[key] = c logger.info(f"... storing {key!r} as categorical") _sanitize = strings_to_categoricals # backwards compat def _inplace_subset_var(self, index: Index1D): """\ Inplace subsetting along variables dimension. Same as `adata = adata[:, index]`, but inplace. """ adata_subset = self[:, index].copy() self._init_as_actual(adata_subset) def _inplace_subset_obs(self, index: Index1D): """\ Inplace subsetting along variables dimension. Same as `adata = adata[index, :]`, but inplace. """ adata_subset = self[index].copy() self._init_as_actual(adata_subset) # TODO: Update, possibly remove def __setitem__(self, index: Index, val: float | XDataType): if self.is_view: msg = "Object is view and cannot be accessed with `[]`." raise ValueError(msg) obs, var = self._normalize_indices(index) if not self.isbacked: self._X[obs, var] = val else: X = self.file["X"] X[obs, var] = val self._set_backed("X", X) def __len__(self) -> int: return self.shape[0] def transpose(self) -> AnnData: """\ Transpose whole object. Data matrix is transposed, observations and variables are interchanged. Ignores `.raw`. """ from anndata.compat import _safe_transpose if not self.isbacked: X = self.X else: X = self.file["X"] if self.is_view: msg = ( "You’re trying to transpose a view of an `AnnData`, " "which is currently not implemented. Call `.copy()` before transposing." ) raise ValueError(msg) return AnnData( X=_safe_transpose(X) if X is not None else None, layers={k: _safe_transpose(v) for k, v in self.layers.items()}, obs=self.var, var=self.obs, uns=self._uns, obsm=self.varm, varm=self.obsm, obsp=self.varp, varp=self.obsp, filename=self.filename, ) T = property(transpose) def to_df(self, layer: str | None = None) -> pd.DataFrame: """\ Generate shallow :class:`~pandas.DataFrame`. The data matrix :attr:`X` is returned as :class:`~pandas.DataFrame`, where :attr:`obs_names` initializes the index, and :attr:`var_names` the columns. * No annotations are maintained in the returned object. * The data matrix is densified in case it is sparse. Params ------ layer Key for `.layers`. Returns ------- Pandas DataFrame of specified data matrix. """ if layer is not None: X = self.layers[layer] elif not self._has_X(): msg = "X is None, cannot convert to dataframe." raise ValueError(msg) else: X = self.X if issparse(X): X = X.toarray() return pd.DataFrame(X, index=self.obs_names, columns=self.var_names) def _get_X(self, *, use_raw: bool = False, layer: str | None = None): """\ Convenience method for getting expression values with common arguments and error handling. 
""" is_layer = layer is not None if use_raw and is_layer: msg = ( "Cannot use expression from both layer and raw. You provided:" f"`use_raw={use_raw}` and `layer={layer}`" ) raise ValueError(msg) if is_layer: return self.layers[layer] elif use_raw: if self.raw is None: msg = "This AnnData doesn’t have a value in `.raw`." raise ValueError(msg) return self.raw.X else: return self.X def obs_vector(self, k: str, *, layer: str | None = None) -> np.ndarray: """\ Convenience function for returning a 1 dimensional ndarray of values from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. Made for convenience, not performance. Intentionally permissive about arguments, for easy iterative use. Params ------ k Key to use. Should be in :attr:`var_names` or :attr:`obs`\\ `.columns`. layer What layer values should be returned from. If `None`, :attr:`X` is used. Returns ------- A one dimensional ndarray, with values for each obs in the same order as :attr:`obs_names`. """ if layer == "X": if "X" in self.layers: pass else: warnings.warn( "In a future version of AnnData, access to `.X` by passing" " `layer='X'` will be removed. Instead pass `layer=None`.", FutureWarning, ) layer = None return get_vector(self, k, "obs", "var", layer=layer) def var_vector(self, k, *, layer: str | None = None) -> np.ndarray: """\ Convenience function for returning a 1 dimensional ndarray of values from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. Made for convenience, not performance. Intentionally permissive about arguments, for easy iterative use. Params ------ k Key to use. Should be in :attr:`obs_names` or :attr:`var`\\ `.columns`. layer What layer values should be returned from. If `None`, :attr:`X` is used. Returns ------- A one dimensional ndarray, with values for each var in the same order as :attr:`var_names`. """ if layer == "X": if "X" in self.layers: pass else: warnings.warn( "In a future version of AnnData, access to `.X` by passing " "`layer='X'` will be removed. Instead pass `layer=None`.", FutureWarning, ) layer = None return get_vector(self, k, "var", "obs", layer=layer) @deprecated("obs_vector") def _get_obs_array(self, k, use_raw=False, layer=None): # noqa: FBT002 """\ Get an array from the layer (default layer='X') along the :attr:`obs` dimension by first looking up `obs.keys` and then :attr:`obs_names`. """ if not use_raw or k in self.obs.columns: return self.obs_vector(k=k, layer=layer) else: return self.raw.obs_vector(k) @deprecated("var_vector") def _get_var_array(self, k, use_raw=False, layer=None): # noqa: FBT002 """\ Get an array from the layer (default layer='X') along the :attr:`var` dimension by first looking up `var.keys` and then :attr:`var_names`. """ if not use_raw or k in self.var.columns: return self.var_vector(k=k, layer=layer) else: return self.raw.var_vector(k) def _mutated_copy(self, **kwargs): """Creating AnnData with attributes optionally specified via kwargs.""" if self.isbacked: if "X" not in kwargs or (self.raw is not None and "raw" not in kwargs): msg = ( "This function does not currently handle backed objects " "internally, this should be dealt with before." 
) raise NotImplementedError(msg) new = {} for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "layers"]: if key in kwargs: new[key] = kwargs[key] else: new[key] = getattr(self, key).copy() if "X" in kwargs: new["X"] = kwargs["X"] elif self._has_X(): new["X"] = self.X.copy() if "uns" in kwargs: new["uns"] = kwargs["uns"] else: new["uns"] = deepcopy(self._uns) if "raw" in kwargs: new["raw"] = kwargs["raw"] elif self.raw is not None: new["raw"] = self.raw.copy() return AnnData(**new) @old_positionals("copy") def to_memory(self, *, copy: bool = False) -> AnnData: """Return a new AnnData object with all backed arrays loaded into memory. Params ------ copy Whether the arrays that are already in-memory should be copied. Example ------- .. code:: python import anndata backed = anndata.io.read_h5ad("file.h5ad", backed="r") mem = backed[backed.obs["cluster"] == "a", :].to_memory() """ new = {} for attr_name in [ "X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns", ]: attr = getattr(self, attr_name, None) if attr is not None: new[attr_name] = to_memory(attr, copy=copy) if self.raw is not None: new["raw"] = { "X": to_memory(self.raw.X, copy=copy), "var": to_memory(self.raw.var, copy=copy), "varm": to_memory(self.raw.varm, copy=copy), } if self.isbacked: self.file.close() return AnnData(**new) def copy(self, filename: PathLike[str] | str | None = None) -> AnnData: """Full copy, optionally on disk.""" if not self.isbacked: if self.is_view and self._has_X(): # TODO: How do I unambiguously check if this is a copy? # Subsetting this way means we don’t have to have a view type # defined for the matrix, which is needed for some of the # current distributed backend. Specifically Dask. return self._mutated_copy( X=_subset(self._adata_ref.X, (self._oidx, self._vidx)).copy() ) else: return self._mutated_copy() else: from ..io import read_h5ad, write_h5ad if filename is None: msg = ( "To copy an AnnData object in backed mode, " "pass a filename: `.copy(filename='myfilename.h5ad')`. " "To load the object into memory, use `.to_memory()`." ) raise ValueError(msg) mode = self.file._filemode write_h5ad(filename, self) return read_h5ad(filename, backed=mode) @deprecated( "anndata.concat", add_msg="See the tutorial for concat at: " "https://anndata.readthedocs.io/en/latest/concatenation.html", hide=False, ) def concatenate( self, *adatas: AnnData, join: str = "inner", batch_key: str = "batch", batch_categories: Sequence[Any] = None, uns_merge: str | None = None, index_unique: str | None = "-", fill_value=None, ) -> AnnData: """\ Concatenate along the observations axis. The :attr:`uns`, :attr:`varm` and :attr:`obsm` attributes are ignored. Currently, this works only in `'memory'` mode. .. note:: For more flexible and efficient concatenation, see: :func:`~anndata.concat`. Parameters ---------- adatas AnnData matrices to concatenate with. Each matrix is referred to as a “batch”. join Use intersection (`'inner'`) or union (`'outer'`) of variables. batch_key Add the batch annotation to :attr:`obs` using this key. batch_categories Use these as categories for the batch annotation. By default, use increasing numbers. uns_merge Strategy to use for merging entries of uns. These strategies are applied recusivley. Currently implemented strategies include: * `None`: The default. The concatenated object will just have an empty dict for `uns`. * `"same"`: Only entries which have the same value in all AnnData objects are kept. * `"unique"`: Only entries which have one unique value in all AnnData objects are kept. 
* `"first"`: The first non-missing value is used. * `"only"`: A value is included if only one of the AnnData objects has a value at this path. index_unique Make the index unique by joining the existing index names with the batch category, using `index_unique='-'`, for instance. Provide `None` to keep existing indices. fill_value Scalar value to fill newly missing values in arrays with. Note: only applies to arrays and sparse matrices (not dataframes) and will only be used if `join="outer"`. .. note:: If not provided, the default value is `0` for sparse matrices and `np.nan` for numpy arrays. See the examples below for more information. Returns ------- :class:`~anndata.AnnData` The concatenated :class:`~anndata.AnnData`, where `adata.obs[batch_key]` stores a categorical variable labeling the batch. Notes ----- .. warning:: If you use `join='outer'` this fills 0s for sparse data when variables are absent in a batch. Use this with care. Dense data is filled with `NaN`. See the examples. Examples -------- Joining on intersection of variables. >>> adata1 = AnnData( ... np.array([[1, 2, 3], [4, 5, 6]]), ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), ... dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]), ... ) >>> adata2 = AnnData( ... np.array([[1, 2, 3], [4, 5, 6]]), ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), ... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]), ... ) >>> adata3 = AnnData( ... np.array([[1, 2, 3], [4, 5, 6]]), ... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']), ... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]), ... ) >>> adata = adata1.concatenate(adata2, adata3) >>> adata AnnData object with n_obs × n_vars = 6 × 2 obs: 'anno1', 'anno2', 'batch' var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' >>> adata.X array([[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]) >>> adata.obs anno1 anno2 batch s1-0 c1 NaN 0 s2-0 c2 NaN 0 s3-1 c3 NaN 1 s4-1 c4 NaN 1 s1-2 NaN d3 2 s2-2 NaN d4 2 >>> adata.var.T b c annoA-0 1 2 annoA-1 2 1 annoA-2 3 2 annoB-2 2 1 Joining on the union of variables. >>> outer = adata1.concatenate(adata2, adata3, join='outer') >>> outer AnnData object with n_obs × n_vars = 6 × 4 obs: 'anno1', 'anno2', 'batch' var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' >>> outer.var.T a b c d annoA-0 0.0 1.0 2.0 NaN annoA-1 NaN 2.0 1.0 0.0 annoA-2 NaN 3.0 2.0 0.0 annoB-2 NaN 2.0 1.0 0.0 >>> outer.var_names Index(['a', 'b', 'c', 'd'], dtype='object') >>> outer.X array([[ 1., 2., 3., nan], [ 4., 5., 6., nan], [nan, 3., 2., 1.], [nan, 6., 5., 4.], [nan, 3., 2., 1.], [nan, 6., 5., 4.]]) >>> outer.X.sum(axis=0) array([nan, 25., 23., nan]) >>> import pandas as pd >>> Xdf = pd.DataFrame(outer.X, columns=outer.var_names) >>> Xdf a b c d 0 1.0 2.0 3.0 NaN 1 4.0 5.0 6.0 NaN 2 NaN 3.0 2.0 1.0 3 NaN 6.0 5.0 4.0 4 NaN 3.0 2.0 1.0 5 NaN 6.0 5.0 4.0 >>> Xdf.sum() a 5.0 b 25.0 c 23.0 d 10.0 dtype: float64 One way to deal with missing values is to use masked arrays: >>> from numpy import ma >>> outer.X = ma.masked_invalid(outer.X) >>> outer.X masked_array( data=[[1.0, 2.0, 3.0, --], [4.0, 5.0, 6.0, --], [--, 3.0, 2.0, 1.0], [--, 6.0, 5.0, 4.0], [--, 3.0, 2.0, 1.0], [--, 6.0, 5.0, 4.0]], mask=[[False, False, False, True], [False, False, False, True], [ True, False, False, False], [ True, False, False, False], [ True, False, False, False], [ True, False, False, False]], fill_value=1e+20) >>> outer.X.sum(axis=0).data array([ 5., 25., 23., 10.]) The masked array is not saved but has to be reinstantiated after saving. 
>>> outer.write('./test.h5ad') >>> from anndata import read_h5ad >>> outer = read_h5ad('./test.h5ad') >>> outer.X array([[ 1., 2., 3., nan], [ 4., 5., 6., nan], [nan, 3., 2., 1.], [nan, 6., 5., 4.], [nan, 3., 2., 1.], [nan, 6., 5., 4.]]) For sparse data, everything behaves similarly, except that for `join='outer'`, zeros are added. >>> from scipy.sparse import csr_matrix >>> adata1 = AnnData( ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), ... dict(var_names=['a', 'b', 'c']), ... ) >>> adata2 = AnnData( ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), ... dict(var_names=['d', 'c', 'b']), ... ) >>> adata3 = AnnData( ... csr_matrix([[1, 2, 0], [0, 5, 6]], dtype=np.float32), ... dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']), ... dict(var_names=['d', 'c', 'b']), ... ) >>> adata = adata1.concatenate(adata2, adata3, join='outer') >>> adata.var_names Index(['a', 'b', 'c', 'd'], dtype='object') >>> adata.X.toarray() array([[0., 2., 3., 0.], [0., 5., 6., 0.], [0., 3., 2., 0.], [0., 6., 5., 0.], [0., 0., 2., 1.], [0., 6., 5., 0.]], dtype=float32) """ from .merge import concat, merge_dataframes, merge_outer, merge_same if self.isbacked: msg = "Currently, concatenate only works in memory mode." raise ValueError(msg) if len(adatas) == 0: return self.copy() elif len(adatas) == 1 and not isinstance(adatas[0], AnnData): adatas = adatas[0] # backwards compatibility all_adatas = (self,) + tuple(adatas) out = concat( all_adatas, axis=0, join=join, label=batch_key, keys=batch_categories, uns_merge=uns_merge, fill_value=fill_value, index_unique=index_unique, pairwise=False, ) # Backwards compat (some of this could be more efficient) # obs used to always be an outer join sparse_class = sparse.csr_matrix if any(isinstance(a.X, CSArray) for a in all_adatas): sparse_class = sparse.csr_array out.obs = concat( [AnnData(sparse_class(a.shape), obs=a.obs) for a in all_adatas], axis=0, join="outer", label=batch_key, keys=batch_categories, index_unique=index_unique, ).obs # Removing varm del out.varm # Implementing old-style merging of var if batch_categories is None: batch_categories = np.arange(len(all_adatas)).astype(str) pat = rf"-({'|'.join(batch_categories)})$" out.var = merge_dataframes( [a.var for a in all_adatas], out.var_names, partial(merge_outer, batch_keys=batch_categories, merge=merge_same), ) out.var = out.var.iloc[ :, ( out.var.columns.str.extract(pat, expand=False) .fillna("") .argsort(kind="stable") ), ] return out def var_names_make_unique(self, join: str = "-"): # Important to go through the setter so obsm dataframes are updated too self.var_names = utils.make_index_unique(self.var.index, join) var_names_make_unique.__doc__ = utils.make_index_unique.__doc__ def obs_names_make_unique(self, join: str = "-"): # Important to go through the setter so obsm dataframes are updated too self.obs_names = utils.make_index_unique(self.obs.index, join) obs_names_make_unique.__doc__ = utils.make_index_unique.__doc__ def _check_uniqueness(self): if not self.obs.index.is_unique: utils.warn_names_duplicates("obs") if not self.var.index.is_unique: utils.warn_names_duplicates("var") def __contains__(self, key: Any): msg = "AnnData has no attribute __contains__, don’t check `in adata`." 
raise AttributeError(msg) def _check_dimensions(self, key=None): if key is None: key = {"obsm", "varm"} else: key = {key} if "obsm" in key: if ( not all([axis_len(o, 0) == self.n_obs for o in self.obsm.values()]) and len(self.obsm.dim_names) != self.n_obs ): msg = ( "Observations annot. `obsm` must have number of rows of `X`" f" ({self.n_obs}), but has {len(self.obsm)} rows." ) raise ValueError(msg) if "varm" in key: if ( not all([axis_len(v, 0) == self.n_vars for v in self.varm.values()]) and len(self.varm.dim_names) != self.n_vars ): msg = ( "Variables annot. `varm` must have number of columns of `X`" f" ({self.n_vars}), but has {len(self.varm)} rows." ) raise ValueError(msg) @old_positionals("compression", "compression_opts", "as_dense") def write_h5ad( self, filename: PathLike[str] | str | None = None, *, convert_strings_to_categoricals: bool = True, compression: Literal["gzip", "lzf"] | None = None, compression_opts: int | Any = None, as_dense: Sequence[str] = (), ): """\ Write `.h5ad`-formatted hdf5 file. .. note:: Setting compression to `'gzip'` can save disk space but will slow down writing and subsequent reading. Prior to v0.6.16, this was the default for parameter `compression`. Generally, if you have sparse data that are stored as a dense matrix, you can dramatically improve performance and reduce disk space by converting to a :class:`~scipy.sparse.csr_matrix`:: from scipy.sparse import csr_matrix adata.X = csr_matrix(adata.X) Parameters ---------- filename Filename of data file. Defaults to backing file. convert_strings_to_categoricals Convert string columns to categorical. compression For [`lzf`, `gzip`], see the h5py :ref:`dataset_compression`. Alternative compression filters such as `zstd` can be passed from the :doc:`hdf5plugin ` library. Experimental. Usage example:: import hdf5plugin adata.write_h5ad( filename, compression=hdf5plugin.FILTERS["zstd"] ) .. note:: Datasets written with hdf5plugin-provided compressors cannot be opened without first loading the hdf5plugin library using `import hdf5plugin`. When using alternative compression filters such as `zstd`, consider writing to `zarr` format instead of `h5ad`, as the `zarr` library provides a more transparent compression pipeline. compression_opts For [`lzf`, `gzip`], see the h5py :ref:`dataset_compression`. Alternative compression filters such as `zstd` can be configured using helpers from the :doc:`hdf5plugin ` library. Experimental. Usage example (setting `zstd` compression level to 5):: import hdf5plugin adata.write_h5ad( filename, compression=hdf5plugin.FILTERS["zstd"], compression_opts=hdf5plugin.Zstd(clevel=5).filter_options ) as_dense Sparse arrays in AnnData object to write as dense. Currently only supports `X` and `raw/X`. """ from ..io import write_h5ad if filename is None and not self.isbacked: msg = "Provide a filename!" raise ValueError(msg) if filename is None: filename = self.filename write_h5ad( Path(filename), self, convert_strings_to_categoricals=convert_strings_to_categoricals, compression=compression, compression_opts=compression_opts, as_dense=as_dense, ) if self.isbacked: self.file.filename = filename write = write_h5ad # a shortcut and backwards compat @old_positionals("skip_data", "sep") def write_csvs( self, dirname: PathLike[str] | str, *, skip_data: bool = True, sep: str = "," ): """\ Write annotation to `.csv` files. It is not possible to recover the full :class:`~anndata.AnnData` from these files. Use :meth:`write` for this. Parameters ---------- dirname Name of directory to which to export. 
skip_data Skip the data matrix :attr:`X`. sep Separator for the data. """ from ..io import write_csvs write_csvs(dirname, self, skip_data=skip_data, sep=sep) @old_positionals("write_obsm_varm") def write_loom( self, filename: PathLike[str] | str, *, write_obsm_varm: bool = False ): """\ Write `.loom`-formatted hdf5 file. Parameters ---------- filename The filename. """ from ..io import write_loom write_loom(filename, self, write_obsm_varm=write_obsm_varm) @old_positionals("chunks") def write_zarr( self, store: StoreLike, *, chunks: tuple[int, ...] | None = None, convert_strings_to_categoricals: bool = True, ): """\ Write a hierarchical Zarr array store. Parameters ---------- store The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class. chunks Chunk shape. convert_strings_to_categoricals Convert string columns to categorical. """ from ..io import write_zarr # TODO: What is a bool for chunks supposed to do? if isinstance(chunks, bool): msg = ( "Passing `write_zarr(adata, chunks=True)` is no longer supported. " "Please pass `write_zarr(adata)` instead." ) raise ValueError(msg) write_zarr( store, self, chunks=chunks, convert_strings_to_categoricals=convert_strings_to_categoricals, ) def chunked_X(self, chunk_size: int | None = None): """\ Return an iterator over the rows of the data matrix :attr:`X`. Parameters ---------- chunk_size Row size of a single chunk. """ if chunk_size is None: # Should be some adaptive code chunk_size = 6000 start = 0 n = self.n_obs for _ in range(int(n // chunk_size)): end = start + chunk_size yield (self.X[start:end], start, end) start = end if start < n: yield (self.X[start:n], start, n) @old_positionals("replace") def chunk_X( self, select: int | Sequence[int] | np.ndarray = 1000, *, replace: bool = True, ): """\ Return a chunk of the data matrix :attr:`X` with random or specified indices. Parameters ---------- select Depending on the type: :class:`int` A random chunk with `select` rows will be returned. :term:`sequence` (e.g. a list, tuple or numpy array) of :class:`int` A chunk with these indices will be returned. replace If `select` is an integer then `True` means random sampling of indices with replacement, `False` without replacement. """ if isinstance(select, int): select = select if select < self.n_obs else self.n_obs choice = np.random.choice(self.n_obs, select, replace) elif isinstance(select, np.ndarray | Sequence): choice = np.asarray(select) else: msg = "select should be int or array" raise ValueError(msg) reverse = None if self.isbacked: # h5py can only slice with a sorted list of unique index values # so random batch with indices [2, 2, 5, 3, 8, 10, 8] will fail # this fixes the problem indices, reverse = np.unique(choice, return_inverse=True) selection = self.X[indices.tolist()] else: selection = self.X[choice] selection = selection.toarray() if issparse(selection) else selection return selection if reverse is None else selection[reverse] def _has_X(self) -> bool: """ Check if X is None. This is more efficient than trying `adata.X is None` for views, since creating views (at least anndata's kind) can be expensive. 
""" if not self.is_view: return self.X is not None else: return self._adata_ref.X is not None # -------------------------------------------------------------------------- # all of the following is for backwards compat # -------------------------------------------------------------------------- @property @deprecated("is_view") def isview(self): return self.is_view def _clean_up_old_format(self, uns): # multicolumn keys # all of the rest is only for backwards compat for bases in [["obs", "smp"], ["var"]]: axis = bases[0] for k in [f"{p}{base}_keys_multicol" for p in ["", "_"] for base in bases]: if uns and k in uns: keys = list(uns[k]) del uns[k] break else: keys = [] # now, for compat, fill the old multicolumn entries into obsm and varm # and remove them from obs and var m_attr = getattr(self, f"_{axis}m") for key in keys: m_attr[key] = self._get_and_delete_multicol_field(axis, key) def _get_and_delete_multicol_field(self, a, key_multicol): keys = [] for k in getattr(self, a).columns: if k.startswith(key_multicol): keys.append(k) values = getattr(self, a)[keys].values getattr(self, a).drop(keys, axis=1, inplace=True) return values def _check_2d_shape(X): """\ Check shape of array or sparse matrix. Assure that X is always 2D: Unlike numpy we always deal with 2D arrays. """ if X.dtype.names is None and len(X.shape) != 2: msg = f"X needs to be 2-dimensional, not {len(X.shape)}-dimensional." raise ValueError(msg) def _infer_shape_for_axis( xxx: pd.DataFrame | Mapping[str, Iterable[Any]] | None, xxxm: np.ndarray | Mapping[str, Sequence[Any]] | None, layers: Mapping[str, np.ndarray | sparse.spmatrix] | None, xxxp: np.ndarray | Mapping[str, Sequence[Any]] | None, axis: Literal[0, 1], ) -> int | None: for elem in [xxx, xxxm, xxxp]: if elem is not None and hasattr(elem, "shape"): return elem.shape[0] for elem, id in zip([layers, xxxm, xxxp], ["layers", "xxxm", "xxxp"]): if elem is not None: elem = cast("Mapping", elem) for sub_elem in elem.values(): if hasattr(sub_elem, "shape"): size = cast("int", sub_elem.shape[axis if id == "layers" else 0]) return size return None def _infer_shape( obs: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None, var: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None, obsm: np.ndarray | Mapping[str, Sequence[Any]] | None = None, varm: np.ndarray | Mapping[str, Sequence[Any]] | None = None, layers: Mapping[str, np.ndarray | sparse.spmatrix] | None = None, obsp: np.ndarray | Mapping[str, Sequence[Any]] | None = None, varp: np.ndarray | Mapping[str, Sequence[Any]] | None = None, ): return ( _infer_shape_for_axis(obs, obsm, layers, obsp, 0), _infer_shape_for_axis(var, varm, layers, varp, 1), ) python-anndata-0.12.0~rc1/src/anndata/_core/extensions.py000066400000000000000000000242711500370632200233640ustar00rootroot00000000000000from __future__ import annotations import inspect from pathlib import Path from typing import TYPE_CHECKING, Generic, TypeVar, get_type_hints, overload from warnings import warn from ..types import ExtensionNamespace from .anndata import AnnData if TYPE_CHECKING: from collections.abc import Callable # Based off of the extension framework in Polars # https://github.com/pola-rs/polars/blob/main/py-polars/polars/api.py __all__ = ["register_anndata_namespace"] def find_stacklevel() -> int: """ Find the first place in the stack that is not inside AnnData. 
Taken from: https://github.com/pola-rs/polars/blob/main/py-polars/polars/_utils/various.py#L447 """ pkg_dir = str(Path(__file__).parent.parent) # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow frame = inspect.currentframe() n = 0 try: while frame: fname = inspect.getfile(frame) if fname.startswith(pkg_dir) or ( (qualname := getattr(frame.f_code, "co_qualname", None)) # ignore @singledispatch wrappers and qualname.startswith("singledispatch.") ): frame = frame.f_back n += 1 else: break finally: # https://docs.python.org/3/library/inspect.html # > Though the cycle detector will catch these, destruction of the frames # > (and local variables) can be made deterministic by removing the cycle # > in a finally clause. del frame return n # Reserved namespaces include accessors built into AnnData (currently there are none) # and all current attributes of AnnData _reserved_namespaces: set[str] = set(dir(AnnData)) NameSpT = TypeVar("NameSpT", bound=ExtensionNamespace) T = TypeVar("T") class AccessorNameSpace(ExtensionNamespace, Generic[NameSpT]): """Establish property-like namespace object for user-defined functionality.""" def __init__(self, name: str, namespace: type[NameSpT]) -> None: self._accessor = name self._ns = namespace @overload def __get__(self, instance: None, cls: type[T]) -> type[NameSpT]: ... @overload def __get__(self, instance: T, cls: type[T]) -> NameSpT: ... def __get__(self, instance: T | None, cls: type[T]) -> NameSpT | type[NameSpT]: if instance is None: return self._ns ns_instance = self._ns(instance) # type: ignore[call-arg] setattr(instance, self._accessor, ns_instance) return ns_instance def _check_namespace_signature(ns_class: type) -> None: """Validate the signature of a namespace class for AnnData extensions. This function ensures that any class intended to be used as an extension namespace has a properly formatted `__init__` method such that: 1. Accepts at least two parameters (self and adata) 2. Has 'adata' as the name of the second parameter 3. Has the second parameter properly type-annotated as 'AnnData' or any equivalent import alias The function performs runtime validation of these requirements before a namespace can be registered through the `register_anndata_namespace` decorator. Parameters ---------- ns_class The namespace class to validate. Raises ------ TypeError If the `__init__` method has fewer than 2 parameters (missing the AnnData parameter). AttributeError If the second parameter of `__init__` lacks a type annotation. TypeError If the second parameter of `__init__` is not named 'adata'. TypeError If the second parameter of `__init__` is not annotated as the 'AnnData' class. TypeError If both the name and type annotation of the second parameter are incorrect. """ sig = inspect.signature(ns_class.__init__) params = list(sig.parameters.values()) # Ensure there are at least two parameters (self and adata) if len(params) < 2: error_msg = "Namespace initializer must accept an AnnData instance as the second parameter." raise TypeError(error_msg) # Get the second parameter (expected to be 'adata') param = params[1] if param.annotation is inspect._empty: err_msg = "Namespace initializer's second parameter must be annotated as the 'AnnData' class, got empty annotation." raise AttributeError(err_msg) name_ok = param.name == "adata" # Resolve the annotation using get_type_hints to handle forward references and aliases. 
try: type_hints = get_type_hints(ns_class.__init__) resolved_type = type_hints.get(param.name, param.annotation) except NameError as e: err_msg = f"Namespace initializer's second parameter must be named 'adata', got '{param.name}'." raise NameError(err_msg) from e type_ok = resolved_type is AnnData match (name_ok, type_ok): case (True, True): return # Signature is correct. case (False, True): msg = f"Namespace initializer's second parameter must be named 'adata', got {param.name!r}." raise TypeError(msg) case (True, False): type_repr = getattr(resolved_type, "__name__", str(resolved_type)) msg = f"Namespace initializer's second parameter must be annotated as the 'AnnData' class, got '{type_repr}'." raise TypeError(msg) case _: type_repr = getattr(resolved_type, "__name__", str(resolved_type)) msg = ( f"Namespace initializer's second parameter must be named 'adata', got {param.name!r}. " f"And must be annotated as 'AnnData', got {type_repr!r}." ) raise TypeError(msg) def _create_namespace( name: str, cls: type[AnnData] ) -> Callable[[type[NameSpT]], type[NameSpT]]: """Register custom namespace against the underlying AnnData class.""" def namespace(ns_class: type[NameSpT]) -> type[NameSpT]: _check_namespace_signature(ns_class) # Perform the runtime signature check if name in _reserved_namespaces: msg = f"cannot override reserved attribute {name!r}" raise AttributeError(msg) elif name in cls._accessors: warn( f"Overriding existing custom namespace {name!r} (on {cls.__name__!r})", UserWarning, stacklevel=find_stacklevel(), ) setattr(cls, name, AccessorNameSpace(name, ns_class)) cls._accessors.add(name) return ns_class return namespace def register_anndata_namespace( name: str, ) -> Callable[[type[NameSpT]], type[NameSpT]]: """Decorator for registering custom functionality with an :class:`~anndata.AnnData` object. This decorator allows you to extend AnnData objects with custom methods and properties organized under a namespace. The namespace becomes accessible as an attribute on AnnData instances, providing a clean way to you to add domain-specific functionality without modifying the AnnData class itself, or extending the class with additional methods as you see fit in your workflow. Parameters ---------- name Name under which the accessor should be registered. This will be the attribute name used to access your namespace's functionality on AnnData objects (e.g., `adata.{name}`). Cannot conflict with existing AnnData attributes like `obs`, `var`, `X`, etc. The list of reserved attributes includes everything outputted by `dir(AnnData)`. Returns ------- A decorator that registers the decorated class as a custom namespace. Notes ----- Implementation requirements: 1. The decorated class must have an `__init__` method that accepts exactly one parameter (besides `self`) named `adata` and annotated with type :class:`~anndata.AnnData`. 2. The namespace will be initialized with the AnnData object on first access and then cached on the instance. 3. If the namespace name conflicts with an existing namespace, a warning is issued. 4. If the namespace name conflicts with a built-in AnnData attribute, an AttributeError is raised. Examples -------- Simple transformation namespace with two methods: >>> import anndata as ad >>> import numpy as np >>> >>> @ad.register_anndata_namespace("transform") ... class TransformX: ... def __init__(self, adata: ad.AnnData): ... self._adata = adata ... ... def log1p( ... self, layer: str = None, inplace: bool = False ... ) -> ad.AnnData | None: ... 
'''Log1p transform the data.''' ... data = self._adata.layers[layer] if layer else self._adata.X ... log1p_data = np.log1p(data) ... ... if layer: ... layer_name = f"{layer}_log1p" if not inplace else layer ... else: ... layer_name = "log1p" ... ... self._adata.layers[layer_name] = log1p_data ... ... if not inplace: ... return self._adata ... ... def arcsinh( ... self, layer: str = None, scale: float = 1.0, inplace: bool = False ... ) -> ad.AnnData | None: ... '''Arcsinh transform the data with optional scaling.''' ... data = self._adata.layers[layer] if layer else self._adata.X ... asinh_data = np.arcsinh(data / scale) ... ... if layer: ... layer_name = f"{layer}_arcsinh" if not inplace else layer ... else: ... layer_name = "arcsinh" ... ... self._adata.layers[layer_name] = asinh_data ... ... if not inplace: ... return self._adata >>> >>> # Create an AnnData object >>> rng = np.random.default_rng(42) >>> adata = ad.AnnData(X=rng.poisson(1, size=(100, 2000))) >>> >>> # Use the registered namespace >>> adata.transform.log1p() # Transforms X and returns the AnnData object AnnData object with n_obs × n_vars = 100 × 2000 layers: 'log1p' >>> adata.transform.arcsinh() # Transforms X and returns the AnnData object AnnData object with n_obs × n_vars = 100 × 2000 layers: 'log1p', 'arcsinh' """ return _create_namespace(name, AnnData) python-anndata-0.12.0~rc1/src/anndata/_core/file_backing.py000066400000000000000000000115541500370632200235620ustar00rootroot00000000000000from __future__ import annotations import weakref from collections.abc import Mapping from functools import singledispatch from pathlib import Path, PurePosixPath from typing import TYPE_CHECKING import h5py from ..compat import AwkArray, DaskArray, ZarrArray, ZarrGroup from .sparse_dataset import BaseCompressedSparseDataset if TYPE_CHECKING: from collections.abc import Iterator from os import PathLike from typing import Literal from .._types import ArrayStorageType from . import anndata class AnnDataFileManager: """Backing file manager for AnnData.""" def __init__( self, adata: anndata.AnnData, filename: PathLike[str] | str | None = None, filemode: Literal["r", "r+"] | None = None, ): self._adata_ref = weakref.ref(adata) self.filename = filename self._filemode = filemode self._file = None if filename: self.open() def __getstate__(self): state = self.__dict__.copy() state["_adata_ref"] = state["_adata_ref"]() return state def __setstate__(self, state): self.__dict__ = state.copy() self.__dict__["_adata_ref"] = weakref.ref(state["_adata_ref"]) @property def _adata(self): return self._adata_ref() def __repr__(self) -> str: if self.filename is None: return "Backing file manager: no file is set." else: return f"Backing file manager of file {self.filename}." 
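    # Note on the methods that follow: the mapping-style dunder methods
    # (`__contains__` through `__delitem__`) simply delegate to the underlying
    # `h5py.File` handle, so the manager can be used like the file it wraps.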
    def __contains__(self, x) -> bool:
        return x in self._file

    def __iter__(self) -> Iterator[str]:
        return iter(self._file)

    def __getitem__(
        self, key: str
    ) -> h5py.Group | h5py.Dataset | BaseCompressedSparseDataset:
        return self._file[key]

    def __setitem__(
        self,
        key: str,
        value: h5py.Group | h5py.Dataset | BaseCompressedSparseDataset,
    ):
        self._file[key] = value

    def __delitem__(self, key: str):
        del self._file[key]

    @property
    def filename(self) -> Path:
        return self._filename

    @filename.setter
    def filename(self, filename: PathLike[str] | str | None):
        self._filename = None if filename is None else Path(filename)

    def open(
        self,
        filename: PathLike[str] | str | None = None,
        filemode: Literal["r", "r+"] | None = None,
    ):
        if filename is not None:
            self.filename = filename
        if filemode is not None:
            self._filemode = filemode
        if self.filename is None:
            msg = "Cannot open backing file if backing not initialized."
            raise ValueError(msg)
        self._file = h5py.File(self.filename, self._filemode)

    def close(self):
        """Close the backing file, remember filename, do *not* change to memory mode."""
        if self._file is not None:
            self._file.close()

    def _to_memory_mode(self):
        """Close the backing file, forget filename, *do* change to memory mode."""
        self._adata._X = self._adata.X[()]
        self._file.close()
        self._file = None
        self._filename = None

    @property
    def is_open(self) -> bool:
        """State of backing file."""
        if self._file is None:
            return False
        # try accessing the id attribute to see if the file is open
        return bool(self._file.id)


@singledispatch
def to_memory(x, *, copy: bool = False):
    """Permissively convert objects to an in-memory representation.

    Objects that are already in memory (or are simply unrecognized) are passed
    through, copied only if `copy=True`.
    """
    if copy and hasattr(x, "copy"):
        return x.copy()
    else:
        return x


@to_memory.register(ZarrArray)
@to_memory.register(h5py.Dataset)
def _(x: ArrayStorageType, *, copy: bool = False):
    return x[...]
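# --- Illustrative sketch (not part of the anndata API) ----------------------
# A minimal demonstration of the generic `to_memory` fallback defined above:
# objects that are already in memory (or unrecognized) are returned as-is, and
# are only copied when `copy=True`. The function below is hypothetical, is
# never called from this module, and exists purely as documentation of that
# behaviour.
def _demo_to_memory_fallback() -> None:
    import numpy as np

    arr = np.arange(6).reshape(2, 3)
    # No copy requested: the very same object is passed straight through.
    assert to_memory(arr) is arr
    # Copy requested: anything exposing `.copy()` is duplicated.
    assert to_memory(arr, copy=True) is not arr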
@to_memory.register(BaseCompressedSparseDataset) def _(x: BaseCompressedSparseDataset, *, copy: bool = False): return x.to_memory() @to_memory.register(DaskArray) def _(x: DaskArray, *, copy: bool = False): return x.compute() @to_memory.register(Mapping) def _(x: Mapping, *, copy: bool = False): return {k: to_memory(v, copy=copy) for k, v in x.items()} @to_memory.register(AwkArray) def _(x: AwkArray, *, copy: bool = False): from copy import copy as _copy if copy: return _copy(x) else: return x @singledispatch def filename(x): msg = f"Not implemented for {type(x)}" raise NotImplementedError(msg) @filename.register(h5py.Group) @filename.register(h5py.Dataset) def _(x): return x.file.filename @filename.register(ZarrArray) @filename.register(ZarrGroup) def _(x): return x.store.path @singledispatch def get_elem_name(x): msg = f"Not implemented for {type(x)}" raise NotImplementedError(msg) @get_elem_name.register(h5py.Group) def _(x): return x.name @get_elem_name.register(ZarrGroup) def _(x): return PurePosixPath(x.path).name python-anndata-0.12.0~rc1/src/anndata/_core/index.py000066400000000000000000000221511500370632200222670ustar00rootroot00000000000000from __future__ import annotations from collections.abc import Iterable, Sequence from functools import singledispatch from itertools import repeat from typing import TYPE_CHECKING import h5py import numpy as np import pandas as pd from scipy.sparse import issparse from ..compat import AwkArray, CSArray, CSMatrix, DaskArray if TYPE_CHECKING: from ..compat import Index, Index1D def _normalize_indices( index: Index | None, names0: pd.Index, names1: pd.Index ) -> tuple[slice, slice]: # deal with tuples of length 1 if isinstance(index, tuple) and len(index) == 1: index = index[0] # deal with pd.Series if isinstance(index, pd.Series): index: Index = index.values if isinstance(index, tuple): # TODO: The series should probably be aligned first index = tuple(i.values if isinstance(i, pd.Series) else i for i in index) ax0, ax1 = unpack_index(index) ax0 = _normalize_index(ax0, names0) ax1 = _normalize_index(ax1, names1) return ax0, ax1 def _normalize_index( indexer: slice | np.integer | int | str | Sequence[bool | int | np.integer] | np.ndarray | pd.Index, index: pd.Index, ) -> slice | int | np.ndarray: # ndarray of int or bool from ..experimental.backed._compat import DataArray # TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough. 
    if not isinstance(index, pd.RangeIndex) and (
        index.dtype == float or index.dtype == int
    ):
        msg = f"Don’t call _normalize_index with non-categorical/string names and non-range index {index}"
        raise TypeError(msg)

    # the following is insanely slow for sequences,
    # we replaced it using pandas below
    def name_idx(i):
        if isinstance(i, str):
            i = index.get_loc(i)
        return i

    if isinstance(indexer, slice):
        start = name_idx(indexer.start)
        stop = name_idx(indexer.stop)
        # string slices can only be inclusive, so +1 in that case
        if isinstance(indexer.stop, str):
            stop = None if stop is None else stop + 1
        step = indexer.step
        return slice(start, stop, step)
    elif isinstance(indexer, np.integer | int):
        return indexer
    elif isinstance(indexer, str):
        return index.get_loc(indexer)  # int
    elif isinstance(
        indexer, Sequence | np.ndarray | pd.Index | CSMatrix | np.matrix | CSArray
    ):
        if hasattr(indexer, "shape") and (
            (indexer.shape == (index.shape[0], 1))
            or (indexer.shape == (1, index.shape[0]))
        ):
            if isinstance(indexer, CSMatrix | CSArray):
                indexer = indexer.toarray()
            indexer = np.ravel(indexer)
        if not isinstance(indexer, np.ndarray | pd.Index):
            indexer = np.array(indexer)
            if len(indexer) == 0:
                indexer = indexer.astype(int)
        if isinstance(indexer, np.ndarray) and np.issubdtype(
            indexer.dtype, np.floating
        ):
            indexer_int = indexer.astype(int)
            if np.all((indexer - indexer_int) != 0):
                msg = f"Indexer {indexer!r} has floating point values."
                raise IndexError(msg)
        if issubclass(indexer.dtype.type, np.integer | np.floating):
            return indexer  # Might not work for range indexes
        elif issubclass(indexer.dtype.type, np.bool_):
            if indexer.shape != index.shape:
                msg = (
                    f"Boolean index does not match AnnData’s shape along this "
                    f"dimension. Boolean index has shape {indexer.shape} while "
                    f"AnnData index has shape {index.shape}."
                )
                raise IndexError(msg)
            return indexer
        else:  # indexer should be string array
            positions = index.get_indexer(indexer)
            if np.any(positions < 0):
                not_found = indexer[positions < 0]
                msg = (
                    f"Values {list(not_found)}, from {list(indexer)}, "
                    "are not valid obs/ var names or indices."
                )
                raise KeyError(msg)
            return positions  # np.ndarray[int]
    elif isinstance(indexer, DataArray):
        if isinstance(indexer.data, DaskArray):
            return indexer.data.compute()
        return indexer.data
    msg = f"Unknown indexer {indexer!r} of type {type(indexer)}"
    raise IndexError(msg)


def _fix_slice_bounds(s: slice, length: int) -> slice:
    """The slice will be clipped to length, and the step won't be None.

    I.e. `None` values for start, stop, and step are replaced with concrete bounds.
""" step = s.step if s.step is not None else 1 # slice constructor would have errored if step was 0 if step > 0: start = s.start if s.start is not None else 0 stop = s.stop if s.stop is not None else length elif step < 0: # Reverse start = s.start if s.start is not None else length stop = s.stop if s.stop is not None else 0 return slice(start, stop, step) def unpack_index(index: Index) -> tuple[Index1D, Index1D]: if not isinstance(index, tuple): if index is Ellipsis: index = slice(None) return index, slice(None) num_ellipsis = sum(i is Ellipsis for i in index) if num_ellipsis > 1: msg = "an index can only have a single ellipsis ('...')" raise IndexError(msg) # If index has Ellipsis, filter it out (and if not, error) if len(index) > 2: if not num_ellipsis: msg = "Received a length 3 index without an ellipsis" raise IndexError(msg) index = tuple(i for i in index if i is not Ellipsis) return index # If index has Ellipsis, replace it with slice if len(index) == 2: index = tuple(slice(None) if i is Ellipsis else i for i in index) return index if len(index) == 1: index = index[0] if index is Ellipsis: index = slice(None) return index, slice(None) msg = "invalid number of indices" raise IndexError(msg) @singledispatch def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index): # Select as combination of indexes, not coordinates # Correcting for indexing behaviour of np.ndarray if all(isinstance(x, Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) return a[subset_idx] @_subset.register(DaskArray) def _subset_dask(a: DaskArray, subset_idx: Index): if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx): if issparse(a._meta) and a._meta.format == "csc": return a[:, subset_idx[1]][subset_idx[0], :] return a[subset_idx[0], :][:, subset_idx[1]] return a[subset_idx] @_subset.register(CSMatrix) @_subset.register(CSArray) def _subset_sparse(a: CSMatrix | CSArray, subset_idx: Index): # Correcting for indexing behaviour of sparse.spmatrix if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx): first_idx = subset_idx[0] if issubclass(first_idx.dtype.type, np.bool_): first_idx = np.where(first_idx)[0] subset_idx = (first_idx.reshape(-1, 1), *subset_idx[1:]) return a[subset_idx] @_subset.register(pd.DataFrame) def _subset_df(df: pd.DataFrame, subset_idx: Index): return df.iloc[subset_idx] @_subset.register(AwkArray) def _subset_awkarray(a: AwkArray, subset_idx: Index): if all(isinstance(x, Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) return a[subset_idx] # Registration for SparseDataset occurs in sparse_dataset.py @_subset.register(h5py.Dataset) def _subset_dataset(d, subset_idx): if not isinstance(subset_idx, tuple): subset_idx = (subset_idx,) ordered = list(subset_idx) rev_order = [slice(None) for _ in range(len(subset_idx))] for axis, axis_idx in enumerate(ordered.copy()): if isinstance(axis_idx, np.ndarray): if axis_idx.dtype == bool: axis_idx = np.where(axis_idx)[0] order = np.argsort(axis_idx) ordered[axis] = axis_idx[order] rev_order[axis] = np.argsort(order) # from hdf5, then to real order return d[tuple(ordered)][tuple(rev_order)] def make_slice(idx, dimidx, n=2): mut = list(repeat(slice(None), n)) mut[dimidx] = idx return tuple(mut) def get_vector(adata, k, coldim, idxdim, layer=None): # adata could be self if Raw and AnnData shared a parent dims = ("obs", "var") col = getattr(adata, coldim).columns idx = getattr(adata, f"{idxdim}_names") in_col = k in col in_idx = k in idx if (in_col + in_idx) == 2: msg = f"Key {k} could 
be found in both .{idxdim}_names and .{coldim}.columns" raise ValueError(msg) elif (in_col + in_idx) == 0: msg = f"Could not find key {k} in .{idxdim}_names or .{coldim}.columns." raise KeyError(msg) elif in_col: return getattr(adata, coldim)[k].values elif in_idx: selected_dim = dims.index(idxdim) idx = adata._normalize_indices(make_slice(k, selected_dim)) a = adata._get_X(layer=layer)[idx] if issparse(a): a = a.toarray() return np.ravel(a) python-anndata-0.12.0~rc1/src/anndata/_core/merge.py000066400000000000000000001557341500370632200222750ustar00rootroot00000000000000""" Code for merging/ concatenating AnnData objects. """ from __future__ import annotations from collections import OrderedDict from collections.abc import Callable, Mapping, MutableSet from functools import partial, reduce, singledispatch from itertools import repeat from operator import and_, or_, sub from typing import TYPE_CHECKING, Literal, TypeVar from warnings import warn import numpy as np import pandas as pd import scipy from natsort import natsorted from packaging.version import Version from scipy import sparse from anndata._core.file_backing import to_memory from anndata._warnings import ExperimentalFeatureWarning from ..compat import ( AwkArray, CSArray, CSMatrix, CupyArray, CupyCSRMatrix, CupySparseMatrix, DaskArray, _map_cat_to_str, ) from ..utils import asarray, axis_len, warn_once from .anndata import AnnData from .index import _subset, make_slice if TYPE_CHECKING: from collections.abc import Collection, Generator, Iterable, Sequence from typing import Any from pandas.api.extensions import ExtensionDtype from anndata._types import Join_T from anndata.experimental.backed._compat import DataArray, Dataset2D T = TypeVar("T") ################### # Utilities ################### # Pretty much just for maintaining order of keys class OrderedSet(MutableSet): def __init__(self, vals=()): self.dict = OrderedDict(zip(vals, repeat(None))) def __contains__(self, val): return val in self.dict def __iter__(self): return iter(self.dict) def __len__(self): return len(self.dict) def __repr__(self): return "OrderedSet: {" + ", ".join(map(str, self)) + "}" def copy(self): return OrderedSet(self.dict.copy()) def add(self, val): self.dict[val] = None def union(self, *vals) -> OrderedSet: return reduce(or_, vals, self) def discard(self, val): if val in self: del self.dict[val] def difference(self, *vals) -> OrderedSet: return reduce(sub, vals, self) def union_keys(ds: Collection) -> OrderedSet: return reduce(or_, ds, OrderedSet()) def intersect_keys(ds: Collection) -> OrderedSet: return reduce(and_, map(OrderedSet, ds)) class MissingVal: """Represents a missing value.""" def is_missing(v) -> bool: return v is MissingVal def not_missing(v) -> bool: return v is not MissingVal # We need to be able to check for equality of arrays to know which are the same. # Unfortunately equality of arrays is poorly defined. # * `np.array_equal` does not work for sparse arrays # * `np.array_equal(..., equal_nan=True)` does not work for null values at the moment # (see https://github.com/numpy/numpy/issues/16377) # So we have to define it ourselves with these two issues in mind. # TODO: Hopefully this will stop being an issue in the future and this code can be removed. 
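# The single-dispatch `equal` defined below therefore picks a comparison that is
# meaningful for each container type (numpy, pandas, sparse, dask, cupy, awkward),
# treating NaNs on both sides as equal where the underlying library allows it.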
@singledispatch def equal(a, b) -> bool: a = asarray(a) b = asarray(b) if a.ndim == b.ndim == 0: return bool(a == b) return np.array_equal(a, b) @equal.register(pd.DataFrame) def equal_dataframe(a, b) -> bool: return a.equals(b) @equal.register(DaskArray) def equal_dask_array(a, b) -> bool: import dask.array as da from dask.base import tokenize if a is b: return True if a.shape != b.shape: return False if isinstance(b, DaskArray): if tokenize(a) == tokenize(b): return True if isinstance(a._meta, CSMatrix): # TODO: Maybe also do this in the other case? return da.map_blocks(equal, a, b, drop_axis=(0, 1)).all() else: return da.equal(a, b, where=~(da.isnan(a) == da.isnan(b))).all() @equal.register(np.ndarray) def equal_array(a, b) -> bool: # Reshaping allows us to compare inputs with >2 dimensions # We cast to pandas since it will still work with non-numeric types b = asarray(b) if a.shape != b.shape: return False return equal(pd.DataFrame(a.reshape(-1)), pd.DataFrame(b.reshape(-1))) @equal.register(CupyArray) def equal_cupyarray(a, b) -> bool: import cupy as cp return bool(cp.array_equal(a, b, equal_nan=True)) @equal.register(pd.Series) def equal_series(a, b) -> bool: return a.equals(b) @equal.register(CSMatrix) @equal.register(CSArray) @equal.register(CupySparseMatrix) def equal_sparse(a, b) -> bool: # It's a weird api, don't blame me import array_api_compat xp = array_api_compat.array_namespace(a.data) if isinstance(b, CupySparseMatrix | CSMatrix | CSArray): if isinstance(a, CupySparseMatrix): # Comparison broken for CSC matrices # https://github.com/cupy/cupy/issues/7757 a, b = CupyCSRMatrix(a), CupyCSRMatrix(b) comp = a != b if isinstance(comp, bool): return not comp if isinstance(comp, CupySparseMatrix): # https://github.com/cupy/cupy/issues/7751 comp = comp.get() # fmt: off return ( (len(comp.data) == 0) or ( xp.isnan(a[comp]).all() and xp.isnan(b[comp]).all() ) ) # fmt: on else: return False @equal.register(AwkArray) def equal_awkward(a, b) -> bool: from ..compat import awkward as ak return ak.almost_equal(a, b) def as_sparse(x, *, use_sparse_array: bool = False) -> CSMatrix | CSArray: if not isinstance(x, CSMatrix | CSArray): in_memory_array_class = ( sparse.csr_array if use_sparse_array else sparse.csr_matrix ) if isinstance(x, DaskArray): x = x.map_blocks( sparse.csr_matrix, meta=sparse.csr_matrix(x._meta), dtype=x.dtype, ).compute() return in_memory_array_class(x) return x def as_cp_sparse(x) -> CupySparseMatrix: import cupyx.scipy.sparse as cpsparse if isinstance(x, cpsparse.spmatrix): return x elif isinstance(x, np.ndarray): return cpsparse.csr_matrix(as_sparse(x)) else: return cpsparse.csr_matrix(x) def unify_dtypes( dfs: Iterable[pd.DataFrame | Dataset2D], ) -> list[pd.DataFrame | Dataset2D]: """ Attempts to unify datatypes from multiple dataframes. For catching cases where pandas would convert to object dtype. 
""" dfs = list(dfs) # Get shared categorical columns df_dtypes = [dict(df.dtypes) for df in dfs] columns = reduce(lambda x, y: x.union(y), [df.columns for df in dfs]) dtypes: dict[str, list[np.dtype | ExtensionDtype]] = {col: [] for col in columns} for col in columns: for df in df_dtypes: dtypes[col].append(df.get(col, None)) if len(dtypes) == 0: return dfs else: dfs = [df.copy(deep=False) for df in dfs] new_dtypes = {} for col in dtypes.keys(): target_dtype = try_unifying_dtype(dtypes[col]) if target_dtype is not None: new_dtypes[col] = target_dtype for df in dfs: for col, dtype in new_dtypes.items(): if col in df: df[col] = df[col].astype(dtype) return dfs def try_unifying_dtype( col: Sequence[np.dtype | ExtensionDtype], ) -> pd.core.dtypes.base.ExtensionDtype | None: """ If dtypes can be unified, returns the dtype they would be unified to. Returns None if they can't be unified, or if we can expect pandas to unify them for us. Params ------ col: A list of dtypes to unify. Can be numpy/ pandas dtypes, or None (which denotes a missing value) """ dtypes: set[pd.CategoricalDtype] = set() # Categorical if any(isinstance(dtype, pd.CategoricalDtype) for dtype in col): ordered = False for dtype in col: if isinstance(dtype, pd.CategoricalDtype): dtypes.add(dtype) ordered = ordered | dtype.ordered elif not pd.isnull(dtype): return None if len(dtypes) > 0 and not ordered: categories = reduce( lambda x, y: x.union(y), [dtype.categories for dtype in dtypes if not pd.isnull(dtype)], ) return pd.CategoricalDtype(natsorted(categories), ordered=False) # Boolean elif all(pd.api.types.is_bool_dtype(dtype) or dtype is None for dtype in col): if any(dtype is None for dtype in col): return pd.BooleanDtype() else: return None else: return None def check_combinable_cols(cols: list[pd.Index], join: Join_T): """Given columns for a set of dataframes, checks if the can be combined. Looks for if there are duplicated column names that would show up in the result. """ repeated_cols = reduce(lambda x, y: x.union(y[y.duplicated()]), cols, set()) if join == "inner": intersecting_cols = intersect_keys(cols) problem_cols = repeated_cols.intersection(intersecting_cols) elif join == "outer": problem_cols = repeated_cols else: raise ValueError() if len(problem_cols) > 0: problem_cols = list(problem_cols) msg = ( f"Cannot combine dataframes as some contained duplicated column names - " "causing ambiguity.\n\n" f"The problem columns are: {problem_cols}" ) raise pd.errors.InvalidIndexError(msg) # TODO: open PR or feature request to cupy def _cp_block_diag(mats, format=None, dtype=None): """ Modified version of scipy.sparse.block_diag for cupy sparse. 
""" import cupy as cp from cupyx.scipy import sparse as cpsparse row = [] col = [] data = [] r_idx = 0 c_idx = 0 for a in mats: # if isinstance(a, (list, numbers.Number)): # a = cpsparse.coo_matrix(a) nrows, ncols = a.shape if cpsparse.issparse(a): a = a.tocoo() row.append(a.row + r_idx) col.append(a.col + c_idx) data.append(a.data) else: a_row, a_col = cp.divmod(cp.arange(nrows * ncols), ncols) row.append(a_row + r_idx) col.append(a_col + c_idx) data.append(a.reshape(-1)) r_idx += nrows c_idx += ncols row = cp.concatenate(row) col = cp.concatenate(col) data = cp.concatenate(data) return cpsparse.coo_matrix( (data, (row, col)), shape=(r_idx, c_idx), dtype=dtype ).asformat(format) def _dask_block_diag(mats): from itertools import permutations import dask.array as da blocks = np.zeros((len(mats), len(mats)), dtype=object) for i, j in permutations(range(len(mats)), 2): blocks[i, j] = da.from_array( sparse.csr_matrix((mats[i].shape[0], mats[j].shape[1])) ) for i, x in enumerate(mats): if not isinstance(x._meta, sparse.csr_matrix): x = x.map_blocks(sparse.csr_matrix) blocks[i, i] = x return da.block(blocks.tolist()) ################### # Per element logic ################### def unique_value(vals: Collection[T]) -> T | MissingVal: """ Given a collection vals, returns the unique value (if one exists), otherwise returns MissingValue. """ unique_val = vals[0] for v in vals[1:]: if not equal(v, unique_val): return MissingVal return unique_val def first(vals: Collection[T]) -> T | MissingVal: """ Given a collection of vals, return the first non-missing one.If they're all missing, return MissingVal. """ for val in vals: if not_missing(val): return val return MissingVal def only(vals: Collection[T]) -> T | MissingVal: """Return the only value in the collection, otherwise MissingVal.""" if len(vals) == 1: return vals[0] else: return MissingVal ################### # Merging ################### def merge_nested(ds: Collection[Mapping], keys_join: Callable, value_join: Callable): out = {} for k in keys_join(ds): v = _merge_nested(ds, k, keys_join, value_join) if not_missing(v): out[k] = v return out def _merge_nested( ds: Collection[Mapping], k, keys_join: Callable, value_join: Callable ): vals = [d[k] for d in ds if k in d] if len(vals) == 0: return MissingVal elif all(isinstance(v, Mapping) for v in vals): new_map = merge_nested(vals, keys_join, value_join) if len(new_map) == 0: return MissingVal else: return new_map else: return value_join(vals) def merge_unique(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, union_keys, unique_value) def merge_same(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, intersect_keys, unique_value) def merge_first(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, union_keys, first) def merge_only(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, union_keys, only) ################### # Interface ################### # Leaving out for now, it's ugly in the rendered docs and would be adding a dependency. 
# from typing_extensions import Literal # UNS_STRATEGIES_TYPE = Literal[None, "same", "unique", "first", "only"] MERGE_STRATEGIES = { None: lambda x: {}, "same": merge_same, "unique": merge_unique, "first": merge_first, "only": merge_only, } StrategiesLiteral = Literal["same", "unique", "first", "only"] def resolve_merge_strategy( strategy: str | Callable | None, ) -> Callable[[Collection[Mapping]], Mapping]: if not isinstance(strategy, Callable): strategy = MERGE_STRATEGIES[strategy] return strategy ##################### # Concatenation ##################### class Reindexer: """ Indexing to be applied to axis of 2d array orthogonal to the axis being concatenated. Attrs ----- old_idx Original index new_idx Target index old_pos Indices of original index which will be kept new_pos Indices of new index which data from old_pos will be placed in. Together with `old_pos` this forms a mapping. """ def __init__(self, old_idx, new_idx): self.old_idx = old_idx self.new_idx = new_idx self.no_change = new_idx.equals(old_idx) new_pos = new_idx.get_indexer(old_idx) old_pos = np.arange(len(new_pos)) mask = new_pos != -1 self.new_pos = new_pos[mask] self.old_pos = old_pos[mask] def __call__(self, el, *, axis=1, fill_value=None): return self.apply(el, axis=axis, fill_value=fill_value) def apply(self, el, *, axis, fill_value=None): """ Reindex element so el[axis] is aligned to self.new_idx. Missing values are to be replaced with `fill_value`. """ if self.no_change and (axis_len(el, axis) == len(self.old_idx)): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) elif isinstance(el, CSMatrix | CSArray | CupySparseMatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) elif isinstance(el, AwkArray): return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) elif isinstance(el, DaskArray): return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value) elif isinstance(el, CupyArray): return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value) else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) def _apply_to_df(self, el: pd.DataFrame, *, axis, fill_value=None): if fill_value is None: fill_value = np.nan return el.reindex(self.new_idx, axis=axis, fill_value=fill_value) def _apply_to_dask_array(self, el: DaskArray, *, axis, fill_value=None): import dask.array as da if fill_value is None: fill_value = default_fill_value([el]) shape = list(el.shape) if el.shape[axis] == 0: # Presumably faster since it won't allocate the full array shape[axis] = len(self.new_idx) return da.broadcast_to(fill_value, tuple(shape)) indexer = self.idx sub_el = _subset(el, make_slice(indexer, axis, len(shape))) if any(indexer == -1): sub_el[make_slice(indexer == -1, axis, len(shape))] = fill_value return sub_el def _apply_to_cupy_array(self, el, *, axis, fill_value=None): import cupy as cp if fill_value is None: fill_value = default_fill_value([el]) if el.shape[axis] == 0: # Presumably faster since it won't allocate the full array shape = list(el.shape) shape[axis] = len(self.new_idx) return cp.broadcast_to(cp.asarray(fill_value), tuple(shape)) old_idx_tuple = [slice(None)] * len(el.shape) old_idx_tuple[axis] = self.old_pos old_idx_tuple = tuple(old_idx_tuple) new_idx_tuple = [slice(None)] * len(el.shape) new_idx_tuple[axis] = self.new_pos new_idx_tuple = tuple(new_idx_tuple) out_shape = list(el.shape) out_shape[axis] = len(self.new_idx) out = cp.full(tuple(out_shape), fill_value) out[new_idx_tuple] = el[old_idx_tuple] 
return out def _apply_to_array(self, el, *, axis, fill_value=None): if fill_value is None: fill_value = default_fill_value([el]) if el.shape[axis] == 0: # Presumably faster since it won't allocate the full array shape = list(el.shape) shape[axis] = len(self.new_idx) return np.broadcast_to(fill_value, tuple(shape)) indexer = self.idx # Indexes real fast, and does outer indexing return pd.api.extensions.take( el, indexer, axis=axis, allow_fill=True, fill_value=fill_value ) def _apply_to_sparse( self, el: CSMatrix | CSArray, *, axis, fill_value=None ) -> CSMatrix: if isinstance(el, CupySparseMatrix): from cupyx.scipy import sparse else: from scipy import sparse import array_api_compat xp = array_api_compat.array_namespace(el.data) if fill_value is None: fill_value = default_fill_value([el]) if fill_value != 0: to_fill = self.new_idx.get_indexer(self.new_idx.difference(self.old_idx)) else: to_fill = xp.array([]) # Fixing outer indexing for missing values if el.shape[axis] == 0: shape = list(el.shape) shape[axis] = len(self.new_idx) shape = tuple(shape) if fill_value == 0: if isinstance(el, CSArray): memory_class = sparse.csr_array else: memory_class = sparse.csr_matrix return memory_class(shape) else: return type(el)(xp.broadcast_to(xp.asarray(fill_value), shape)) fill_idxer = None if len(to_fill) > 0 or isinstance(el, CupySparseMatrix): idxmtx_dtype = xp.promote_types(el.dtype, xp.array(fill_value).dtype) else: idxmtx_dtype = bool if isinstance(el, CSArray): memory_class = sparse.coo_array else: memory_class = sparse.coo_matrix if axis == 1: idxmtx = memory_class( ( xp.ones(len(self.new_pos), dtype=idxmtx_dtype), (xp.asarray(self.old_pos), xp.asarray(self.new_pos)), ), shape=(len(self.old_idx), len(self.new_idx)), dtype=idxmtx_dtype, ) out = el @ idxmtx if len(to_fill) > 0: out = out.tocsc() fill_idxer = (slice(None), to_fill) elif axis == 0: idxmtx = memory_class( ( xp.ones(len(self.new_pos), dtype=idxmtx_dtype), (xp.asarray(self.new_pos), xp.asarray(self.old_pos)), ), shape=(len(self.new_idx), len(self.old_idx)), dtype=idxmtx_dtype, ) out = idxmtx @ el if len(to_fill) > 0: out = out.tocsr() fill_idxer = (to_fill, slice(None)) if fill_idxer is not None: out[fill_idxer] = fill_value return out def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): import awkward as ak if self.no_change: return el elif axis == 1: # Indexing by field if self.new_idx.isin(self.old_idx).all(): # inner join return el[self.new_idx] else: # outer join # TODO: this code isn't actually hit, we should refactor msg = "This should be unreachable, please open an issue." raise Exception(msg) else: if len(self.new_idx) > len(self.old_idx): el = ak.pad_none(el, 1, axis=axis) # axis == 0 return el[self.idx] @property def idx(self): return self.old_idx.get_indexer(self.new_idx) def merge_indices(inds: Iterable[pd.Index], join: Join_T) -> pd.Index: if join == "inner": return reduce(lambda x, y: x.intersection(y), inds) elif join == "outer": return reduce(lambda x, y: x.union(y), inds) else: msg = f"`join` must be one of 'inner' or 'outer', got {join!r}" raise ValueError(msg) def default_fill_value(els): """Given some arrays, returns what the default fill value should be. This is largely due to backwards compat, and might not be the ideal solution. 
""" if any( isinstance(el, CSMatrix | CSArray) or (isinstance(el, DaskArray) and isinstance(el._meta, CSMatrix | CSArray)) for el in els ): return 0 else: return np.nan def gen_reindexer(new_var: pd.Index, cur_var: pd.Index): """ Given a new set of var_names, and a current set, generates a function which will reindex a matrix to be aligned with the new set. Usage ----- >>> a = AnnData(sparse.eye(3, format="csr"), var=pd.DataFrame(index=list("abc"))) >>> b = AnnData(sparse.eye(2, format="csr"), var=pd.DataFrame(index=list("ba"))) >>> reindexer = gen_reindexer(a.var_names, b.var_names) >>> sparse.vstack([a.X, reindexer(b.X)]).toarray() array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]) """ return Reindexer(cur_var, new_var) def np_bool_to_pd_bool_array(df: pd.DataFrame): for col_name, col_type in dict(df.dtypes).items(): if col_type is np.dtype(bool): df[col_name] = pd.array(df[col_name].values) return df def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): from anndata.experimental.backed._compat import Dataset2D arrays = list(arrays) if fill_value is None: fill_value = default_fill_value(arrays) if any(isinstance(a, Dataset2D) for a in arrays): if any(isinstance(a, pd.DataFrame) for a in arrays): arrays = [to_memory(a) if isinstance(a, Dataset2D) else a for a in arrays] elif not all(isinstance(a, Dataset2D) for a in arrays): msg = f"Cannot concatenate a Dataset2D with other array types {[type(a) for a in arrays if not isinstance(a, Dataset2D)]}." raise ValueError(msg) else: return concat_dataset2d_on_annot_axis(arrays, join="outer") if any(isinstance(a, pd.DataFrame) for a in arrays): # TODO: This is hacky, 0 is a sentinel for outer_concat_aligned_mapping if not all( isinstance(a, pd.DataFrame) or a is MissingVal or 0 in a.shape for a in arrays ): msg = "Cannot concatenate a dataframe with other array types." raise NotImplementedError(msg) # TODO: behaviour here should be chosen through a merge strategy df = pd.concat( unify_dtypes(f(x) for f, x in zip(reindexers, arrays)), axis=axis, ignore_index=True, ) df.index = index return df elif any(isinstance(a, AwkArray) for a in arrays): from ..compat import awkward as ak if not all( isinstance(a, AwkArray) or a is MissingVal or 0 in a.shape for a in arrays ): msg = "Cannot concatenate an AwkwardArray with other array types." raise NotImplementedError(msg) return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)], axis=axis) elif any(isinstance(a, CupySparseMatrix) for a in arrays): import cupyx.scipy.sparse as cpsparse if not all( isinstance(a, CupySparseMatrix | CupyArray) or 0 in a.shape for a in arrays ): msg = "Cannot concatenate a cupy array with other array types." raise NotImplementedError(msg) sparse_stack = (cpsparse.vstack, cpsparse.hstack)[axis] return sparse_stack( [ f(as_cp_sparse(a), axis=1 - axis, fill_value=fill_value) for f, a in zip(reindexers, arrays) ], format="csr", ) elif any(isinstance(a, CupyArray) for a in arrays): import cupy as cp if not all(isinstance(a, CupyArray) or 0 in a.shape for a in arrays): msg = "Cannot concatenate a cupy array with other array types." 
raise NotImplementedError(msg) return cp.concatenate( [ f(cp.asarray(x), fill_value=fill_value, axis=1 - axis) for f, x in zip(reindexers, arrays) ], axis=axis, ) elif any(isinstance(a, CSMatrix | CSArray) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] use_sparse_array = any(issubclass(type(a), CSArray) for a in arrays) mat = sparse_stack( [ f( as_sparse(a, use_sparse_array=use_sparse_array), axis=1 - axis, fill_value=fill_value, ) for f, a in zip(reindexers, arrays) ], format="csr", ) scipy_version = Version(scipy.__version__) # Bug where xstack produces a matrix not an array in 1.11.* if use_sparse_array and (scipy_version.major, scipy_version.minor) == (1, 11): if mat.format == "csc": return sparse.csc_array(mat) return sparse.csr_array(mat) return mat else: return np.concatenate( [ f(x, fill_value=fill_value, axis=1 - axis) for f, x in zip(reindexers, arrays) ], axis=axis, ) def inner_concat_aligned_mapping( mappings, *, reindexers=None, index=None, axis=0, concat_axis=None ): if concat_axis is None: concat_axis = axis result = {} for k in intersect_keys(mappings): els = [m[k] for m in mappings] if reindexers is None: cur_reindexers = gen_inner_reindexers( els, new_index=index, axis=concat_axis ) else: cur_reindexers = reindexers result[k] = concat_arrays(els, cur_reindexers, index=index, axis=concat_axis) return result def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): alt_axis = 1 - axis if axis == 0: df_indices = lambda x: x.columns elif axis == 1: df_indices = lambda x: x.indices if all(isinstance(el, pd.DataFrame) for el in els if not_missing(el)): common_ind = reduce( lambda x, y: x.intersection(y), (df_indices(el) for el in els) ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] elif any(isinstance(el, AwkArray) for el in els if not_missing(el)): if not all(isinstance(el, AwkArray) for el in els if not_missing(el)): msg = "Cannot concatenate an AwkwardArray with other array types." raise NotImplementedError(msg) common_keys = intersect_keys(el.fields for el in els) reindexers = [ Reindexer(pd.Index(el.fields), pd.Index(list(common_keys))) for el in els ] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ gen_reindexer(pd.RangeIndex(min_ind), pd.RangeIndex(el.shape[alt_axis])) for el in els ] return reindexers def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): if all(isinstance(el, pd.DataFrame) for el in els if not_missing(el)): reindexers = [ (lambda x: x) if not_missing(el) else (lambda _, shape=shape: pd.DataFrame(index=range(shape))) for el, shape in zip(els, shapes) ] elif any(isinstance(el, AwkArray) for el in els if not_missing(el)): import awkward as ak if not all(isinstance(el, AwkArray) for el in els if not_missing(el)): msg = "Cannot concatenate an AwkwardArray with other array types." raise NotImplementedError(msg) warn_once( "Outer joins on awkward.Arrays will have different return values in the future. 
" "For details, and to offer input, please see:\n\n\t" "https://github.com/scverse/anndata/issues/898", ExperimentalFeatureWarning, ) # all_keys = union_keys(el.fields for el in els if not_missing(el)) reindexers = [] for el in els: if not_missing(el): reindexers.append(lambda x: x) else: reindexers.append( lambda x: ak.pad_none( ak.Array([]), len(x), 0, ) ) else: max_col = max(el.shape[1] for el in els if not_missing(el)) orig_cols = [el.shape[1] if not_missing(el) else 0 for el in els] reindexers = [ gen_reindexer(pd.RangeIndex(max_col), pd.RangeIndex(n)) for n in orig_cols ] return reindexers def missing_element( n: int, els: list[CSArray | CSMatrix | np.ndarray | DaskArray], axis: Literal[0, 1] = 0, fill_value: Any | None = None, off_axis_size: int = 0, ) -> np.ndarray | DaskArray: """Generates value to use when there is a missing element.""" should_return_dask = any(isinstance(el, DaskArray) for el in els) # 0 sized array for in-memory prevents allocating unnecessary memory while preserving broadcasting. shape = (n, off_axis_size) if axis == 0 else (off_axis_size, n) if should_return_dask: import dask.array as da return da.full( shape, default_fill_value(els) if fill_value is None else fill_value ) return np.zeros(shape, dtype=bool) def outer_concat_aligned_mapping( mappings, *, reindexers=None, index=None, axis=0, concat_axis=None, fill_value=None ): if concat_axis is None: concat_axis = axis result = {} ns = [m.parent.shape[axis] for m in mappings] for k in union_keys(mappings): els = [m.get(k, MissingVal) for m in mappings] if reindexers is None: cur_reindexers = gen_outer_reindexers( els, ns, new_index=index, axis=concat_axis ) else: cur_reindexers = reindexers # Dask needs to create a full array and can't do the size-0 trick off_axis_size = 0 if any(isinstance(e, DaskArray) for e in els): if not isinstance(cur_reindexers[0], Reindexer): # pragma: no cover msg = "Cannot re-index a dask array without a Reindexer" raise ValueError(msg) off_axis_size = cur_reindexers[0].idx.shape[0] # Handling of missing values here is hacky for dataframes # We should probably just handle missing elements for all types result[k] = concat_arrays( [ el if not_missing(el) else missing_element( n, axis=concat_axis, els=els, fill_value=fill_value, off_axis_size=off_axis_size, ) for el, n in zip(els, ns) ], cur_reindexers, axis=concat_axis, index=index, fill_value=fill_value, ) return result def concat_pairwise_mapping( mappings: Collection[Mapping], shapes: Collection[int], join_keys=intersect_keys ): result = {} if any(any(isinstance(v, CSArray) for v in m.values()) for m in mappings): sparse_class = sparse.csr_array else: sparse_class = sparse.csr_matrix for k in join_keys(mappings): els = [ m.get(k, sparse_class((s, s), dtype=bool)) for m, s in zip(mappings, shapes) ] if all(isinstance(el, CupySparseMatrix | CupyArray) for el in els): result[k] = _cp_block_diag(els, format="csr") elif all(isinstance(el, DaskArray) for el in els): result[k] = _dask_block_diag(els) else: result[k] = sparse.block_diag(els, format="csr") return result def merge_dataframes( dfs: Iterable[pd.DataFrame], new_index, merge_strategy=merge_unique ) -> pd.DataFrame: dfs = [df.reindex(index=new_index) for df in dfs] # New dataframe with all shared data new_df = pd.DataFrame(merge_strategy(dfs), index=new_index) return new_df def merge_outer(mappings, batch_keys, *, join_index="-", merge=merge_unique): """ Combine elements of two mappings, such that non-overlapping entries are added with their batch-key appended. 
    Note: this currently does NOT work for nested mappings. Additionally, values are
    not promised to be unique, and may be overwritten.
    """
    all_keys = union_keys(mappings)
    out = merge(mappings)
    for key in all_keys.difference(out.keys()):
        for b, m in zip(batch_keys, mappings):
            val = m.get(key, None)
            if val is not None:
                out[f"{key}{join_index}{b}"] = val
    return out


def _resolve_axis(
    axis: Literal["obs", 0, "var", 1],
) -> tuple[Literal[0], Literal["obs"]] | tuple[Literal[1], Literal["var"]]:
    if axis in {0, "obs"}:
        return (0, "obs")
    if axis in {1, "var"}:
        return (1, "var")
    msg = f"`axis` must be either 0, 1, 'obs', or 'var', was {axis}"
    raise ValueError(msg)


def axis_indices(adata: AnnData, axis: Literal["obs", 0, "var", 1]) -> pd.Index:
    """Helper function to get adata.{dim}_names."""
    _, axis_name = _resolve_axis(axis)
    return getattr(adata, f"{axis_name}_names")


# TODO: Resolve https://github.com/scverse/anndata/issues/678 and remove this function
def concat_Xs(adatas, reindexers, axis, fill_value):
    """
    Shim until support for concatenating AnnData objects where only some have `.X`
    is implemented.

    Basically just checks if it's one of the two supported cases, or throws an error.

    This is not done inline in `concat` because we don't want to maintain references
    to the values of a.X.
    """
    Xs = [a.X for a in adatas]
    if all(X is None for X in Xs):
        return None
    elif any(X is None for X in Xs):
        msg = (
            "Some (but not all) of the AnnData's to be concatenated had no .X value. "
            "Concatenation is currently only implemented for cases where all or none of"
            " the AnnData's have .X assigned."
        )
        raise NotImplementedError(msg)
    else:
        return concat_arrays(Xs, reindexers, axis=axis, fill_value=fill_value)


def make_dask_col_from_extension_dtype(
    col: DataArray, *, use_only_object_dtype: bool = False
) -> DaskArray:
    """
    Creates dask arrays from :class:`pandas.api.extensions.ExtensionArray` dtype
    :class:`xarray.DataArray`s.

    Parameters
    ----------
    col
        The column to be converted
    use_only_object_dtype
        Whether or not to cast all :class:`pandas.api.extensions.ExtensionArray`
        dtypes to `object` type, by default False

    Returns
    -------
    A :class:`dask.Array` representation of the column.
""" import dask.array as da from anndata._io.specs.lazy_methods import ( compute_chunk_layout_for_axis_size, get_chunksize, maybe_open_h5, ) from anndata.experimental import read_elem_lazy from anndata.experimental.backed._compat import DataArray from anndata.experimental.backed._compat import xarray as xr base_path_or_zarr_group = col.attrs.get("base_path_or_zarr_group") elem_name = col.attrs.get("elem_name") dims = col.dims coords = col.coords.copy() with maybe_open_h5(base_path_or_zarr_group, elem_name) as f: maybe_chunk_size = get_chunksize(read_elem_lazy(f)) chunk_size = ( compute_chunk_layout_for_axis_size( 1000 if maybe_chunk_size is None else maybe_chunk_size[0], col.shape[0] ), ) def get_chunk(block_info=None): # reopening is important to get around h5py's unserializable lock in processes with maybe_open_h5(base_path_or_zarr_group, elem_name) as f: v = read_elem_lazy(f) variable = xr.Variable( data=xr.core.indexing.LazilyIndexedArray(v), dims=dims ) data_array = DataArray( variable, coords=coords, dims=dims, ) idx = tuple( slice(start, stop) for start, stop in block_info[None]["array-location"] ) chunk = np.array(data_array.data[idx].array) return chunk if col.dtype == "category" or col.dtype == "string" or use_only_object_dtype: dtype = "object" else: dtype = col.dtype.numpy_dtype return da.map_blocks( get_chunk, chunks=chunk_size, meta=np.array([], dtype=dtype), dtype=dtype, ) def make_xarray_extension_dtypes_dask( annotations: Iterable[Dataset2D], *, use_only_object_dtype: bool = False ) -> Generator[Dataset2D, None, None]: """ Creates a generator of Dataset2D objects with dask arrays in place of :class:`pandas.api.extensions.ExtensionArray` dtype columns. Parameters ---------- annotations The datasets to be altered use_only_object_dtype Whether or not to cast all :class:`pandas.api.extensions.ExtensionArray` dtypes to `object` type, by default False Yields ------ An altered dataset. """ for a in annotations: extension_cols = { col for col in a.columns if pd.api.types.is_extension_array_dtype(a[col]) } yield a.copy( data={ name: ( make_dask_col_from_extension_dtype( col, use_only_object_dtype=use_only_object_dtype ) if name in extension_cols else col ) for name, col in a.items() } ) DS_CONCAT_DUMMY_INDEX_NAME = "concat_index" def concat_dataset2d_on_annot_axis( annotations: Iterable[Dataset2D], join: Join_T, ) -> Dataset2D: """Create a concatenate dataset from a list of :class:`~anndata.experimental.backed._xarray.Dataset2D` objects. The goal of this function is to mimic `pd.concat(..., ignore_index=True)` so has some complicated logic for handling the "index" to ensure (a) nothing is loaded into memory and (b) the true index is always tracked. Parameters ---------- annotations The :class:`~anndata.experimental.backed._xarray.Dataset2D` objects to be concatenated. 
join Type of join operation Returns ------- Concatenated :class:`~anndata.experimental.backed._xarray.Dataset2D` """ from anndata._io.specs.lazy_methods import DUMMY_RANGE_INDEX_KEY from anndata.experimental.backed._compat import Dataset2D from anndata.experimental.backed._compat import xarray as xr annotations_re_indexed = [] for a in make_xarray_extension_dtypes_dask(annotations): old_key = list(a.coords.keys())[0] # First create a dummy index a.coords[DS_CONCAT_DUMMY_INDEX_NAME] = ( old_key, pd.RangeIndex(a[a.attrs["indexing_key"]].shape[0]).astype("str"), ) # Set all the dimensions to this new dummy index a = a.swap_dims({old_key: DS_CONCAT_DUMMY_INDEX_NAME}) # Move the old coordinate into a variable old_coord = a.coords[old_key] del a.coords[old_key] a[old_key] = old_coord annotations_re_indexed.append(a) # Concat along the dummy index ds = Dataset2D( xr.concat(annotations_re_indexed, join=join, dim=DS_CONCAT_DUMMY_INDEX_NAME), attrs={"indexing_key": f"true_{DS_CONCAT_DUMMY_INDEX_NAME}"}, ) ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = pd.RangeIndex( ds.coords[DS_CONCAT_DUMMY_INDEX_NAME].shape[0] ).astype("str") # Drop any lingering dimensions (swap doesn't delete) ds = ds.drop_dims(d for d in ds.dims if d != DS_CONCAT_DUMMY_INDEX_NAME) # Create a new true index and then delete the columns resulting from the concatenation for each index. # This includes the dummy column (which is neither a dimension nor a true indexing column) index = xr.concat( [a[a.attrs["indexing_key"]] for a in annotations_re_indexed], dim=DS_CONCAT_DUMMY_INDEX_NAME, ) # prevent duplicate values index.coords[DS_CONCAT_DUMMY_INDEX_NAME] = ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] ds[f"true_{DS_CONCAT_DUMMY_INDEX_NAME}"] = index for key in set(a.attrs["indexing_key"] for a in annotations_re_indexed): del ds[key] if DUMMY_RANGE_INDEX_KEY in ds: del ds[DUMMY_RANGE_INDEX_KEY] return ds def concat( adatas: Collection[AnnData] | Mapping[str, AnnData], *, axis: Literal["obs", 0, "var", 1] = "obs", join: Join_T = "inner", merge: StrategiesLiteral | Callable | None = None, uns_merge: StrategiesLiteral | Callable | None = None, label: str | None = None, keys: Collection | None = None, index_unique: str | None = None, fill_value: Any | None = None, pairwise: bool = False, ) -> AnnData: """Concatenates AnnData objects along an axis. See the :doc:`concatenation <../concatenation>` section in the docs for a more in-depth description. Params ------ adatas The objects to be concatenated. If a Mapping is passed, keys are used for the `keys` argument and values are concatenated. axis Which axis to concatenate along. join How to align values when concatenating. If "outer", the union of the other axis is taken. If "inner", the intersection. See :doc:`concatenation <../concatenation>` for more. merge How elements not aligned to the axis being concatenated along are selected. Currently implemented strategies include: * `None`: No elements are kept. * `"same"`: Elements that are the same in each of the objects. * `"unique"`: Elements for which there is only one possible value. * `"first"`: The first element seen at each from each position. * `"only"`: Elements that show up in only one of the objects. For :class:`xarray.Dataset` objects, we use their :func:`xarray.merge` with `override` to stay lazy. uns_merge How the elements of `.uns` are selected. Uses the same set of strategies as the `merge` argument, except applied recursively. label Column in axis annotation (i.e. `.obs` or `.var`) to place batch information in. 
If it's None, no column is added. keys Names for each object being added. These values are used for column values for `label` or appended to the index if `index_unique` is not `None`. Defaults to incrementing integer labels. index_unique Whether to make the index unique by using the keys. If provided, this is the delimiter between "{orig_idx}{index_unique}{key}". When `None`, the original indices are kept. fill_value When `join="outer"`, this is the value that will be used to fill the introduced indices. By default, sparse arrays are padded with zeros, while dense arrays and DataFrames are padded with missing values. pairwise Whether pairwise elements along the concatenated dimension should be included. This is False by default, since the resulting arrays are often not meaningful. Notes ----- .. warning:: If you use `join='outer'` this fills 0s for sparse data when variables are absent in a batch. Use this with care. Dense data is filled with `NaN`. Examples -------- Preparing example objects >>> import anndata as ad, pandas as pd, numpy as np >>> from scipy import sparse >>> a = ad.AnnData( ... X=sparse.csr_matrix(np.array([[0, 1], [2, 3]])), ... obs=pd.DataFrame({"group": ["a", "b"]}, index=["s1", "s2"]), ... var=pd.DataFrame(index=["var1", "var2"]), ... varm={ ... "ones": np.ones((2, 5)), ... "rand": np.random.randn(2, 3), ... "zeros": np.zeros((2, 5)), ... }, ... uns={"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4}}, ... ) >>> b = ad.AnnData( ... X=sparse.csr_matrix(np.array([[4, 5, 6], [7, 8, 9]])), ... obs=pd.DataFrame( ... {"group": ["b", "c"], "measure": [1.2, 4.3]}, index=["s3", "s4"] ... ), ... var=pd.DataFrame(index=["var1", "var2", "var3"]), ... varm={"ones": np.ones((3, 5)), "rand": np.random.randn(3, 5)}, ... uns={"a": 1, "b": 3, "c": {"c.b": 4}}, ... ) >>> c = ad.AnnData( ... X=sparse.csr_matrix(np.array([[10, 11], [12, 13]])), ... obs=pd.DataFrame({"group": ["a", "b"]}, index=["s1", "s2"]), ... var=pd.DataFrame(index=["var3", "var4"]), ... uns={"a": 1, "b": 4, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}, ... 
) Concatenating along different axes >>> ad.concat([a, b]).to_df() var1 var2 s1 0 1 s2 2 3 s3 4 5 s4 7 8 >>> ad.concat([a, c], axis="var").to_df() var1 var2 var3 var4 s1 0 1 10 11 s2 2 3 12 13 Inner and outer joins >>> inner = ad.concat([a, b]) # Joining on intersection of variables >>> inner AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' >>> (inner.obs_names, inner.var_names) # doctest: +NORMALIZE_WHITESPACE (Index(['s1', 's2', 's3', 's4'], dtype='object'), Index(['var1', 'var2'], dtype='object')) >>> outer = ad.concat([a, b], join="outer") # Joining on union of variables >>> outer AnnData object with n_obs × n_vars = 4 × 3 obs: 'group', 'measure' >>> outer.var_names Index(['var1', 'var2', 'var3'], dtype='object') >>> outer.to_df() # Sparse arrays are padded with zeroes by default var1 var2 var3 s1 0 1 0 s2 2 3 0 s3 4 5 6 s4 7 8 9 Using the axis’ index instead of its name >>> ad.concat([a, b], axis=0).to_df() # Equivalent to axis="obs" var1 var2 s1 0 1 s2 2 3 s3 4 5 s4 7 8 >>> ad.concat([a, c], axis=1).to_df() # Equivalent to axis="var" var1 var2 var3 var4 s1 0 1 10 11 s2 2 3 12 13 Keeping track of source objects >>> ad.concat({"a": a, "b": b}, label="batch").obs group batch s1 a a s2 b a s3 b b s4 c b >>> ad.concat([a, b], label="batch", keys=["a", "b"]).obs # Equivalent to previous group batch s1 a a s2 b a s3 b b s4 c b >>> ad.concat({"a": a, "b": b}, index_unique="-").obs group s1-a a s2-a b s3-b b s4-b c Combining values not aligned to axis of concatenation >>> ad.concat([a, b], merge="same") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'ones' >>> ad.concat([a, b], merge="unique") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'ones', 'zeros' >>> ad.concat([a, b], merge="first") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'ones', 'rand', 'zeros' >>> ad.concat([a, b], merge="only") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'zeros' The same merge strategies can be used for elements in `.uns` >>> dict(ad.concat([a, b, c], uns_merge="same").uns) {'a': 1, 'c': {'c.b': 4}} >>> dict(ad.concat([a, b, c], uns_merge="unique").uns) {'a': 1, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} >>> dict(ad.concat([a, b, c], uns_merge="only").uns) {'c': {'c.c': 5}} >>> dict(ad.concat([a, b, c], uns_merge="first").uns) {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} """ from anndata.experimental.backed._compat import Dataset2D from anndata.experimental.backed._compat import xarray as xr # Argument normalization merge = resolve_merge_strategy(merge) uns_merge = resolve_merge_strategy(uns_merge) if isinstance(adatas, Mapping): if keys is not None: msg = ( "Cannot specify categories in both mapping keys and using `keys`. " "Only specify this once." 
) raise TypeError(msg) keys, adatas = list(adatas.keys()), list(adatas.values()) else: adatas = list(adatas) if keys is None: keys = np.arange(len(adatas)).astype(str) axis, axis_name = _resolve_axis(axis) alt_axis, alt_axis_name = _resolve_axis(axis=1 - axis) # Label column label_col = pd.Categorical.from_codes( np.repeat(np.arange(len(adatas)), [a.shape[axis] for a in adatas]), categories=keys, ) # Combining indexes concat_indices = pd.concat( [pd.Series(axis_indices(a, axis=axis)) for a in adatas], ignore_index=True ) if index_unique is not None: concat_indices = concat_indices.str.cat( _map_cat_to_str(label_col), sep=index_unique ) concat_indices = pd.Index(concat_indices) alt_indices = merge_indices( [axis_indices(a, axis=alt_axis) for a in adatas], join=join ) reindexers = [ gen_reindexer(alt_indices, axis_indices(a, axis=alt_axis)) for a in adatas ] # Annotation for concatenation axis check_combinable_cols([getattr(a, axis_name).columns for a in adatas], join=join) annotations = [getattr(a, axis_name) for a in adatas] are_any_annotations_dataframes = any( isinstance(a, pd.DataFrame) for a in annotations ) if are_any_annotations_dataframes: annotations_in_memory = ( to_memory(a) if isinstance(a, Dataset2D) else a for a in annotations ) concat_annot = pd.concat( unify_dtypes(annotations_in_memory), join=join, ignore_index=True, ) concat_annot.index = concat_indices else: concat_annot = concat_dataset2d_on_annot_axis(annotations, join) concat_indices.name = DS_CONCAT_DUMMY_INDEX_NAME if label is not None: concat_annot[label] = label_col # Annotation for other axis alt_annotations = [getattr(a, alt_axis_name) for a in adatas] are_any_alt_annotations_dataframes = any( isinstance(a, pd.DataFrame) for a in alt_annotations ) if are_any_alt_annotations_dataframes: alt_annotations_in_memory = [ to_memory(a) if isinstance(a, Dataset2D) else a for a in alt_annotations ] alt_annot = merge_dataframes(alt_annotations_in_memory, alt_indices, merge) else: # TODO: figure out mapping of our merge to theirs instead of just taking first, although this appears to be # the only "lazy" setting so I'm not sure we really want that. # Because of xarray's merge upcasting, it's safest to simply assume that all dtypes are objects. 
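        # The steps below (1) swap extension-dtype columns for object-dtype dask
        # columns, (2) rename each dataset's true index column to a shared
        # "merge_index" name so xarray can align on it, and (3) lazily merge the
        # datasets, where `compat="override"` lets the first dataset win for
        # overlapping variables.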
annotations_with_only_dask = list( make_xarray_extension_dtypes_dask( alt_annotations, use_only_object_dtype=True ) ) annotations_with_only_dask = [ a.rename({a.attrs["indexing_key"]: "merge_index"}) for a in annotations_with_only_dask ] alt_annot = Dataset2D( xr.merge(annotations_with_only_dask, join=join, compat="override"), attrs={"indexing_key": "merge_index"}, ) X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value) if join == "inner": concat_aligned_mapping = inner_concat_aligned_mapping join_keys = intersect_keys elif join == "outer": concat_aligned_mapping = partial( outer_concat_aligned_mapping, fill_value=fill_value ) join_keys = union_keys else: msg = f"{join=} should have been validated above by pd.concat" raise AssertionError(msg) layers = concat_aligned_mapping( [a.layers for a in adatas], axis=axis, reindexers=reindexers ) concat_mapping = concat_aligned_mapping( [getattr(a, f"{axis_name}m") for a in adatas], axis=axis, concat_axis=0, index=concat_indices, ) if pairwise: concat_pairwise = concat_pairwise_mapping( mappings=[getattr(a, f"{axis_name}p") for a in adatas], shapes=[a.shape[axis] for a in adatas], join_keys=join_keys, ) else: concat_pairwise = {} # TODO: Reindex lazily, so we don't have to make those copies until we're sure we need the element alt_mapping = merge( [ {k: r(v, axis=0) for k, v in getattr(a, f"{alt_axis_name}m").items()} for r, a in zip(reindexers, adatas) ], ) alt_pairwise = merge( [ { k: r(r(v, axis=0), axis=1) for k, v in getattr(a, f"{alt_axis_name}p").items() } for r, a in zip(reindexers, adatas) ] ) uns = uns_merge([a.uns for a in adatas]) raw = None has_raw = [a.raw is not None for a in adatas] if all(has_raw): raw = concat( [ AnnData( X=a.raw.X, obs=pd.DataFrame(index=a.obs_names), var=a.raw.var, varm=a.raw.varm, ) for a in adatas ], join=join, label=label, keys=keys, index_unique=index_unique, fill_value=fill_value, axis=axis, ) elif any(has_raw): warn( "Only some AnnData objects have `.raw` attribute, " "not concatenating `.raw` attributes.", UserWarning, ) return AnnData( **{ "X": X, "layers": layers, axis_name: concat_annot, alt_axis_name: alt_annot, f"{axis_name}m": concat_mapping, f"{alt_axis_name}m": alt_mapping, f"{axis_name}p": concat_pairwise, f"{alt_axis_name}p": alt_pairwise, "uns": uns, "raw": raw, } ) python-anndata-0.12.0~rc1/src/anndata/_core/raw.py000066400000000000000000000172331500370632200217560ustar00rootroot00000000000000from __future__ import annotations from typing import TYPE_CHECKING import h5py import numpy as np import pandas as pd from scipy.sparse import issparse from ..compat import CupyArray, CupySparseMatrix from .aligned_df import _gen_dataframe from .aligned_mapping import AlignedMappingProperty, AxisArrays from .index import _normalize_index, _subset, get_vector, unpack_index from .sparse_dataset import sparse_dataset if TYPE_CHECKING: from collections.abc import Mapping, Sequence from typing import ClassVar from ..compat import CSMatrix from .aligned_mapping import AxisArraysView from .anndata import AnnData from .sparse_dataset import BaseCompressedSparseDataset # TODO: Implement views for Raw class Raw: is_view: ClassVar = False def __init__( self, adata: AnnData, X: np.ndarray | CSMatrix | None = None, var: pd.DataFrame | Mapping[str, Sequence] | None = None, varm: AxisArrays | Mapping[str, np.ndarray] | None = None, ): self._adata = adata self._n_obs = adata.n_obs # construct manually if adata.isbacked == (X is None): # Move from GPU to CPU since it's large and not always used if 
isinstance(X, CupyArray | CupySparseMatrix): self._X = X.get() else: self._X = X n_var = None if self._X is None else self._X.shape[1] self._var = _gen_dataframe( var, ["var_names"], source="X", attr="var", length=n_var ) self.varm = varm elif X is None: # construct from adata # Move from GPU to CPU since it's large and not always used if isinstance(adata.X, CupyArray | CupySparseMatrix): self._X = adata.X.get() else: self._X = adata.X.copy() self._var = adata.var.copy() self.varm = adata.varm.copy() elif adata.isbacked: msg = "Cannot specify X if adata is backed" raise ValueError(msg) def _get_X(self, layer=None): if layer is not None: raise ValueError() return self.X @property def X(self) -> BaseCompressedSparseDataset | np.ndarray | CSMatrix: # TODO: Handle unsorted array of integer indices for h5py.Datasets if not self._adata.isbacked: return self._X if not self._adata.file.is_open: self._adata.file.open() # Handle legacy file formats: if "raw/X" in self._adata.file: X = self._adata.file["raw/X"] elif "raw.X" in self._adata.file: X = self._adata.file["raw.X"] # Backwards compat else: msg = ( f"Could not find dataset for raw X in file: " f"{self._adata.file.filename}." ) raise AttributeError(msg) if isinstance(X, h5py.Group): X = sparse_dataset(X) # Check if we need to subset if self._adata.is_view: # TODO: As noted above, implement views of raw # so we can know if we need to subset by var return _subset(X, (self._adata._oidx, slice(None))) else: return X @property def shape(self) -> tuple[int, int]: return self.n_obs, self.n_vars @property def var(self) -> pd.DataFrame: return self._var @property def n_vars(self) -> int: return self._var.shape[0] @property def n_obs(self) -> int: return self._n_obs varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( "varm", AxisArrays, 1 ) @property def var_names(self) -> pd.Index[str]: return self.var.index @property def obs_names(self) -> pd.Index[str]: return self._adata.obs_names def __getitem__(self, index): oidx, vidx = self._normalize_indices(index) # To preserve two dimensional shape if isinstance(vidx, int | np.integer): vidx = slice(vidx, vidx + 1, 1) if isinstance(oidx, int | np.integer): oidx = slice(oidx, oidx + 1, 1) if not self._adata.isbacked: X = _subset(self.X, (oidx, vidx)) else: X = None var = self._var.iloc[vidx] new = Raw(self._adata, X=X, var=var) if self.varm is not None: # Since there is no view of raws new.varm = self.varm._view(_RawViewHack(self, vidx), (vidx,)).copy() return new def __str__(self) -> str: descr = f"Raw AnnData with n_obs × n_vars = {self.n_obs} × {self.n_vars}" for attr in ["var", "varm"]: keys = getattr(self, attr).keys() if len(keys) > 0: descr += f"\n {attr}: {str(list(keys))[1:-1]}" return descr def copy(self) -> Raw: return Raw( self._adata, X=self.X.copy(), var=self.var.copy(), varm=None if self._varm is None else self._varm.copy(), ) def to_adata(self) -> AnnData: """Create full AnnData object.""" from anndata import AnnData return AnnData( X=self.X.copy(), var=self.var.copy(), varm=None if self._varm is None else self._varm.copy(), obs=self._adata.obs.copy(), obsm=self._adata.obsm.copy(), obsp=self._adata.obsp.copy(), uns=self._adata.uns.copy(), ) def _normalize_indices(self, packed_index): # deal with slicing with pd.Series if isinstance(packed_index, pd.Series): packed_index = packed_index.values if isinstance(packed_index, tuple): if len(packed_index) != 2: raise IndexDimError(len(packed_index)) if isinstance(packed_index[1], pd.Series): packed_index = 
packed_index[0], packed_index[1].values if isinstance(packed_index[0], pd.Series): packed_index = packed_index[0].values, packed_index[1] obs, var = unpack_index(packed_index) obs = _normalize_index(obs, self._adata.obs_names) var = _normalize_index(var, self.var_names) return obs, var def var_vector(self, k: str) -> np.ndarray: # TODO decorator to copy AnnData.var_vector docstring return get_vector(self, k, "var", "obs") def obs_vector(self, k: str) -> np.ndarray: # TODO decorator to copy AnnData.obs_vector docstring idx = self._normalize_indices((slice(None), k)) a = self.X[idx] if issparse(a): a = a.toarray() return np.ravel(a) # This exists to accommodate AlignedMappings, # until we implement a proper RawView or get rid of Raw in favor of modes. class _RawViewHack: def __init__(self, raw: Raw, vidx: slice | np.ndarray): self.parent_raw = raw self.vidx = vidx @property def shape(self) -> tuple[int, int]: return self.parent_raw.n_obs, len(self.var_names) @property def obs_names(self) -> pd.Index: return self.parent_raw.obs_names @property def var_names(self) -> pd.Index: return self.parent_raw.var_names[self.vidx] class IndexDimError(IndexError): MSG = ( "You tried to slice an AnnData(View) object with an" "{}-dimensional index, but only 2 dimensions exist in such an object." ) MSG_1D = ( "\nIf you tried to slice cells using adata[cells, ], " "note that Python (unlike R) uses adata[cells, :] as slicing syntax." ) def __init__(self, n_dims: int): msg = self.MSG.format(n_dims) if n_dims == 1: msg += self.MSG_1D super().__init__(msg) python-anndata-0.12.0~rc1/src/anndata/_core/sparse_dataset.py000066400000000000000000000623571500370632200241760ustar00rootroot00000000000000"""\ This module implements on disk sparse datasets. This code is based on and uses the conventions of h5sparse_ by `Appier Inc.`_. See the copyright and license note in this directory source code. .. _h5sparse: https://github.com/appier/h5sparse .. _Appier Inc.: https://www.appier.com/ """ # TODO: # - think about supporting the COO format from __future__ import annotations import warnings from abc import ABC from collections.abc import Iterable from functools import cached_property from itertools import accumulate, chain, pairwise from math import floor from pathlib import Path from typing import TYPE_CHECKING, NamedTuple import h5py import numpy as np import scipy import scipy.sparse as ss from packaging.version import Version from scipy.sparse import _sparsetools from .. import abc from .._settings import settings from ..compat import ( CSArray, CSMatrix, H5Group, ZarrArray, ZarrGroup, _read_attr, is_zarr_v2, ) from .index import _fix_slice_bounds, _subset, unpack_index if TYPE_CHECKING: from collections.abc import Sequence from typing import Literal from scipy.sparse._compressed import _cs_matrix from .._types import GroupStorageType from ..compat import H5Array from .index import Index, Index1D else: from scipy.sparse import spmatrix as _cs_matrix SCIPY_1_15 = Version(scipy.__version__) >= Version("1.15rc0") class BackedFormat(NamedTuple): format: Literal["csr", "csc"] backed_type: type[BackedSparseMatrix] memory_type: type[_cs_matrix] class BackedSparseMatrix(_cs_matrix): """\ Mixin class for backed sparse matrices. Largely needed for the case `backed_sparse_csr(...)[:]`, since that calls copy on `.data`, `.indices`, and `.indptr`. 
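    Rough sketch of how instances are assembled (mirrors
    ``BaseCompressedSparseDataset._to_backed`` further down in this module;
    shown for illustration only)::

        mtx = backed_csr_matrix(shape, dtype=dtype)  # no data allocated yet
        mtx.data = group["data"]           # h5py/zarr dataset, stays on disk
        mtx.indices = group["indices"]     # likewise kept on disk
        mtx.indptr = group["indptr"][...]  # small, read fully into memory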
""" data: GroupStorageType indices: GroupStorageType indptr: np.ndarray def copy(self) -> CSMatrix: if isinstance(self.data, h5py.Dataset): return sparse_dataset(self.data.parent).to_memory() if isinstance(self.data, ZarrArray): import zarr if is_zarr_v2(): sparse_group = zarr.open( store=self.data.store, mode="r", chunk_store=self.data.chunk_store, # chunk_store is needed, not clear why )[Path(self.data.path).parent] else: anndata_group = zarr.open_group(store=self.data.store, mode="r") sparse_group = anndata_group[ str( Path(str(self.data.store_path)) .relative_to(str(anndata_group.store_path)) .parent ) ] return sparse_dataset(sparse_group).to_memory() return super().copy() def _set_many(self, i: Iterable[int], j: Iterable[int], x): """\ Sets value at each (i, j) to x Here (i,j) index major and minor respectively, and must not contain duplicate entries. """ # Scipy 1.3+ compat n_samples = 1 if np.isscalar(x) else len(x) offsets = self._offsets(i, j, n_samples) if -1 not in offsets: # make a list for interaction with h5py offsets = list(offsets) # only affects existing non-zero cells self.data[offsets] = x return else: msg = "You cannot change the sparsity structure of a SparseDataset." raise ValueError(msg) # replace where possible # mask = offsets > -1 # # offsets[mask] # bool_data_mask = np.zeros(len(self.data), dtype=bool) # bool_data_mask[offsets[mask]] = True # self.data[bool_data_mask] = x[mask] # # self.data[offsets[mask]] = x[mask] # # only insertions remain # mask = ~mask # i = i[mask] # i[i < 0] += M # j = j[mask] # j[j < 0] += N # self._insert_many(i, j, x[mask]) def _zero_many(self, i: Sequence[int], j: Sequence[int]): """\ Sets value at each (i, j) to zero, preserving sparsity structure. Here (i,j) index major and minor respectively. 
""" offsets = self._offsets(i, j, len(i)) # only assign zeros to the existing sparsity structure self.data[list(offsets[offsets > -1])] = 0 def _offsets( self, i: Iterable[int], j: Iterable[int], n_samples: int ) -> np.ndarray: i, j, M, N = self._prepare_indices(i, j) offsets = np.empty(n_samples, dtype=self.indices.dtype) ret = _sparsetools.csr_sample_offsets( M, N, self.indptr, self.indices, n_samples, i, j, offsets ) if ret == 1: # rinse and repeat self.sum_duplicates() _sparsetools.csr_sample_offsets( M, N, self.indptr, self.indices, n_samples, i, j, offsets ) return offsets def _get_contiguous_compressed_slice( self, s: slice ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: new_indptr = self.indptr[s.start : s.stop + 1].copy() start = new_indptr[0] stop = new_indptr[-1] new_indptr -= start new_data = self.data[start:stop] new_indices = self.indices[start:stop] return new_data, new_indices, new_indptr class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix): def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix: return ss.csr_matrix( get_compressed_vector(self, row), shape=(1, self.shape[1]) )[:, col] def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: row = _fix_slice_bounds(row, self.shape[0]) col = _fix_slice_bounds(col, self.shape[1]) out_shape = ( slice_len(row, self.shape[0]), slice_len(col, self.shape[1]), ) if out_shape[0] == 1: return self._get_intXslice(slice_as_int(row, self.shape[0]), col) if row.step != 1: return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col) res = ss.csr_matrix( self._get_contiguous_compressed_slice(row), shape=(out_shape[0], self.shape[1]), ) return res if out_shape[1] == self.shape[1] else res[:, col] def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: idxs = np.asarray(row) if len(idxs) == 0: return ss.csr_matrix((0, self.shape[1])) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csr_matrix( get_compressed_vectors(self, idxs), shape=(len(idxs), self.shape[1]) )[:, col] class backed_csc_matrix(BackedSparseMatrix, ss.csc_matrix): def _get_sliceXint(self, row: slice, col: int) -> ss.csc_matrix: return ss.csc_matrix( get_compressed_vector(self, col), shape=(self.shape[0], 1) )[row, :] def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: row = _fix_slice_bounds(row, self.shape[0]) col = _fix_slice_bounds(col, self.shape[1]) out_shape = ( slice_len(row, self.shape[0]), slice_len(col, self.shape[1]), ) if out_shape[1] == 1: return self._get_sliceXint(row, slice_as_int(col, self.shape[1])) if col.step != 1: return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1]))) res = ss.csc_matrix( self._get_contiguous_compressed_slice(col), shape=(self.shape[0], out_shape[1]), ) return res if out_shape[0] == self.shape[0] else res[row, :] def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: idxs = np.asarray(col) if len(idxs) == 0: return ss.csc_matrix((self.shape[0], 0)) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csc_matrix( get_compressed_vectors(self, idxs), shape=(self.shape[0], len(idxs)) )[row, :] FORMATS = [ BackedFormat("csr", backed_csr_matrix, ss.csr_matrix), BackedFormat("csc", backed_csc_matrix, ss.csc_matrix), BackedFormat("csr", backed_csr_matrix, ss.csr_array), BackedFormat("csc", backed_csc_matrix, ss.csc_array), ] def slice_len(s: slice, l: int) -> int: """Returns length of `a[s]` where `len(a) == l`.""" return len(range(*s.indices(l))) def slice_as_int(s: slice, l: int) -> int: """Converts slices of length 1 
to the integer index they’ll access.""" out = list(range(*s.indices(l))) assert len(out) == 1 return out[0] def get_compressed_vectors( x: BackedSparseMatrix, row_idxs: Iterable[int] ) -> tuple[Sequence, Sequence, Sequence]: indptr_slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs] # HDF5 cannot handle out-of-order integer indexing if isinstance(x.data, ZarrArray): as_np_indptr = np.concatenate( [np.arange(s.start, s.stop) for s in indptr_slices] ) data = x.data[as_np_indptr] indices = x.indices[as_np_indptr] else: data = np.concatenate([x.data[s] for s in indptr_slices]) indices = np.concatenate([x.indices[s] for s in indptr_slices]) indptr = list(accumulate(chain((0,), (s.stop - s.start for s in indptr_slices)))) return data, indices, indptr def get_compressed_vectors_for_slices( x: BackedSparseMatrix, slices: Iterable[slice] ) -> tuple[Sequence, Sequence, Sequence]: indptr_indices = [x.indptr[slice(s.start, s.stop + 1)] for s in slices] indptr_limits = [slice(i[0], i[-1]) for i in indptr_indices] # HDF5 cannot handle out-of-order integer indexing if isinstance(x.data, ZarrArray): indptr_int = np.concatenate([np.arange(s.start, s.stop) for s in indptr_limits]) data = x.data[indptr_int] indices = x.indices[indptr_int] else: data = np.concatenate([x.data[s] for s in indptr_limits]) indices = np.concatenate([x.indices[s] for s in indptr_limits]) # Need to track the size of the gaps in the slices to each indptr subselection gaps = (s1.start - s0.stop for s0, s1 in pairwise(indptr_limits)) offsets = accumulate(chain([indptr_limits[0].start], gaps)) start_indptr = indptr_indices[0] - next(offsets) if len(slices) < 2: # there is only one slice so no need to concatenate return data, indices, start_indptr end_indptr = np.concatenate( [s[1:] - o for s, o in zip(indptr_indices[1:], offsets)] ) indptr = np.concatenate([start_indptr, end_indptr]) return data, indices, indptr def get_compressed_vector( x: BackedSparseMatrix, idx: int ) -> tuple[Sequence, Sequence, Sequence]: s = slice(*(x.indptr[idx : idx + 2])) data = x.data[s] indices = x.indices[s] indptr = [0, len(data)] return data, indices, indptr def subset_by_major_axis_mask( mtx: _cs_matrix, mask: np.ndarray ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: slices = np.ma.extras._ezclump(mask) def mean_slice_length(slices): return floor(sum(s.stop - s.start for s in slices) / len(slices)) # heuristic for whether slicing should be optimized if len(slices) > 0: if mean_slice_length(slices) <= 7: return get_compressed_vectors(mtx, np.where(mask)[0]) else: return get_compressed_vectors_for_slices(mtx, slices) return [], [], [0] def get_memory_class( format: Literal["csr", "csc"], *, use_sparray_in_io: bool = False ) -> type[_cs_matrix]: for fmt, _, memory_class in FORMATS: if format == fmt: if use_sparray_in_io and issubclass(memory_class, CSArray): return memory_class elif not use_sparray_in_io and issubclass(memory_class, CSMatrix): return memory_class msg = f"Format string {format} is not supported." raise ValueError(msg) def get_backed_class( format: Literal["csr", "csc"], *, use_sparray_in_io: bool = False ) -> type[BackedSparseMatrix]: for fmt, backed_class, _ in FORMATS: if format == fmt: if use_sparray_in_io and issubclass(backed_class, CSArray): return backed_class elif not use_sparray_in_io and issubclass(backed_class, CSMatrix): return backed_class msg = f"Format string {format} is not supported." 
raise ValueError(msg) def _get_group_format(group: GroupStorageType) -> str: if "h5sparse_format" in group.attrs: # TODO: Warn about an old format # If this is only just going to be public, I could insist it's not like this return _read_attr(group.attrs, "h5sparse_format") else: # Should this be an extra field? return _read_attr(group.attrs, "encoding-type").replace("_matrix", "") # Check for the overridden few methods above in our BackedSparseMatrix subclasses def is_sparse_indexing_overridden( format: Literal["csr", "csc"], row: Index1D, col: Index1D ): major_indexer, minor_indexer = (row, col) if format == "csr" else (col, row) return isinstance(minor_indexer, slice) and ( (isinstance(major_indexer, int | np.integer)) or (isinstance(major_indexer, slice)) or (isinstance(major_indexer, np.ndarray) and major_indexer.ndim == 1) ) def validate_indices( mtx: BackedSparseMatrix, indices: tuple[Index1D, Index1D] ) -> tuple[Index1D, Index1D]: res = mtx._validate_indices(indices) return res[0] if SCIPY_1_15 else res class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC): _group: GroupStorageType def __init__(self, group: GroupStorageType): type(self)._check_group_format(group) self._group = group @property def group(self) -> GroupStorageType: """The group underlying the backed matrix.""" return self._group @group.setter def group(self, val): msg = f"Do not reset group on a {type(self)} with {val}. Instead use `sparse_dataset` to make a new class." raise AttributeError(msg) @property def backend(self) -> Literal["zarr", "hdf5"]: """Which file type is used on-disk.""" if isinstance(self.group, ZarrGroup): return "zarr" elif isinstance(self.group, H5Group): return "hdf5" else: msg = f"Unknown group type {type(self.group)}" raise ValueError(msg) @property def dtype(self) -> np.dtype: """The :class:`numpy.dtype` of the `data` attribute of the sparse matrix.""" return self._data.dtype @classmethod def _check_group_format(cls, group): group_format = _get_group_format(group) assert group_format == cls.format @property def _name(self) -> str: """Name of the group.""" return self.group.name @property def shape(self) -> tuple[int, int]: """Shape of the matrix read off disk.""" shape = _read_attr(self.group.attrs, "shape", None) if shape is None: # TODO warn shape = self.group.attrs.get("h5sparse_shape") return tuple(map(int, shape)) def __repr__(self) -> str: name = type(self).__name__.removeprefix("_") return f"{name}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}" def __getitem__(self, index: Index | tuple[()]) -> float | CSMatrix | CSArray: indices = self._normalize_index(index) row, col = indices mtx = self._to_backed() row_sp_matrix_validated, col_sp_matrix_validated = validate_indices( mtx, indices ) # Handle masked indexing along major axis if self.format == "csr" and np.array(row).dtype == bool: sub = ss.csr_matrix( subset_by_major_axis_mask(mtx, row), shape=(row.sum(), mtx.shape[1]) )[:, col] elif self.format == "csc" and np.array(col).dtype == bool: sub = ss.csc_matrix( subset_by_major_axis_mask(mtx, col), shape=(mtx.shape[0], col.sum()) )[row, :] # read into memory data if we do not override access methods elif not is_sparse_indexing_overridden( self.format, row_sp_matrix_validated, col_sp_matrix_validated ): sub = self.to_memory()[row_sp_matrix_validated, col_sp_matrix_validated] else: sub = mtx[row, col] # If indexing is array x array it returns a backed_sparse_matrix # Not sure what the performance is on that operation # Also need to check if memory format is not 
matrix mtx_fmt = get_memory_class( self.format, use_sparray_in_io=settings.use_sparse_array_on_read ) must_convert_to_array = issubclass(mtx_fmt, CSArray) and not isinstance( sub, CSArray ) if isinstance(sub, BackedSparseMatrix) or must_convert_to_array: return mtx_fmt(sub) else: return sub def _normalize_index( self, index: Index | tuple[()] ) -> tuple[np.ndarray, np.ndarray]: if isinstance(index, tuple) and not len(index): index = slice(None) row, col = unpack_index(index) if all(isinstance(x, Iterable) for x in (row, col)): row, col = np.ix_(row, col) return row, col def __setitem__(self, index: Index | tuple[()], value) -> None: warnings.warn( "__setitem__ for backed sparse will be removed in the next anndata release.", FutureWarning, ) row, col = self._normalize_index(index) mock_matrix = self._to_backed() mock_matrix[row, col] = value # TODO: split to other classes? def append(self, sparse_matrix: CSMatrix | CSArray) -> None: """Append an in-memory or on-disk sparse matrix to the current object's store. Parameters ---------- sparse_matrix The matrix to append. Raises ------ NotImplementedError If the matrix to append is not one of :class:`~scipy.sparse.csr_array`, :class:`~scipy.sparse.csc_array`, :class:`~scipy.sparse.csr_matrix`, or :class:`~scipy.sparse.csc_matrix`. ValueError If both the on-disk and to-append matrices are not of the same format i.e., `csr` or `csc`. OverflowError If the underlying data store has a 32 bit indptr, and the new matrix is too large to fit in it i.e., would cause a 64 bit `indptr` to be written. AssertionError If the on-disk data does not have `csc` or `csr` format. """ # Prep variables shape = self.shape if isinstance(sparse_matrix, BaseCompressedSparseDataset): sparse_matrix = sparse_matrix._to_backed() # Check input if not ss.issparse(sparse_matrix): msg = ( "Currently, only sparse matrices of equivalent format can be " "appended to a SparseDataset." ) raise NotImplementedError(msg) if self.format not in {"csr", "csc"}: msg = f"The append method for format {self.format} is not implemented." raise NotImplementedError(msg) if self.format != sparse_matrix.format: msg = ( f"Matrices must have same format. Currently are " f"{self.format!r} and {sparse_matrix.format!r}" ) raise ValueError(msg) [indptr_offset] = self.group["indices"].shape if self.group["indptr"].dtype == np.int32: new_nnz = indptr_offset + sparse_matrix.indices.shape[0] if new_nnz >= np.iinfo(np.int32).max: msg = ( "This array was written with a 32 bit intptr, but is now large " "enough to require 64 bit values. Please recreate the array with " "a 64 bit indptr." ) raise OverflowError(msg) # shape if self.format == "csr": assert shape[1] == sparse_matrix.shape[1], ( "CSR matrices must have same size of dimension 1 to be appended." ) new_shape = (shape[0] + sparse_matrix.shape[0], shape[1]) elif self.format == "csc": assert shape[0] == sparse_matrix.shape[0], ( "CSC matrices must have same size of dimension 0 to be appended." 
) new_shape = (shape[0], shape[1] + sparse_matrix.shape[1]) else: msg = "We forgot to update this branching to a new format" raise AssertionError(msg) if "h5sparse_shape" in self.group.attrs: del self.group.attrs["h5sparse_shape"] self.group.attrs["shape"] = new_shape # data data = self.group["data"] orig_data_size = data.shape[0] data.resize((orig_data_size + sparse_matrix.data.shape[0],)) # see https://github.com/zarr-developers/zarr-python/discussions/2712 for why we need to read first append_data = sparse_matrix.data append_indices = sparse_matrix.indices if isinstance(sparse_matrix.data, ZarrArray) and not is_zarr_v2(): data[orig_data_size:] = append_data[...] else: data[orig_data_size:] = append_data # indptr indptr = self.group["indptr"] orig_data_size = indptr.shape[0] indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,)) indptr[orig_data_size:] = ( sparse_matrix.indptr[1:].astype(np.int64) + indptr_offset ) # indices if isinstance(sparse_matrix.data, ZarrArray) and not is_zarr_v2(): append_indices = append_indices[...] indices = self.group["indices"] orig_data_size = indices.shape[0] indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) indices[orig_data_size:] = append_indices # Clear cached property for attr in ["_indptr", "_indices", "_data"]: if hasattr(self, attr): delattr(self, attr) @cached_property def _indptr(self) -> np.ndarray: """\ Other than `data` and `indices`, this is only as long as the major axis It should therefore fit into memory, so we cache it for faster access. """ arr = self.group["indptr"][...] return arr @cached_property def _indices(self) -> H5Array | ZarrArray: """\ Cache access to the indices to prevent unnecessary reads of the zarray """ return self.group["indices"] @cached_property def _data(self) -> H5Array | ZarrArray: """\ Cache access to the data to prevent unnecessary reads of the zarray """ return self.group["data"] def _to_backed(self) -> BackedSparseMatrix: format_class = get_backed_class(self.format) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self._data mtx.indices = self._indices mtx.indptr = self._indptr return mtx def to_memory(self) -> CSMatrix | CSArray: format_class = get_memory_class( self.format, use_sparray_in_io=settings.use_sparse_array_on_read ) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self._data[...] mtx.indices = self._indices[...] mtx.indptr = self._indptr return mtx class _CSRDataset(BaseCompressedSparseDataset, abc.CSRDataset): """Internal concrete version of :class:`anndata.abc.CSRDataset`.""" class _CSCDataset(BaseCompressedSparseDataset, abc.CSCDataset): """Internal concrete version of :class:`anndata.abc.CSRDataset`.""" def sparse_dataset(group: GroupStorageType) -> abc.CSRDataset | abc.CSCDataset: """Generates a backed mode-compatible sparse dataset class. Parameters ---------- group The backing group store. Returns ------- Sparse dataset class. 
Example ------- First we'll need a stored dataset: >>> import scanpy as sc >>> import h5py >>> from anndata.io import sparse_dataset >>> from anndata.io import read_elem >>> sc.datasets.pbmc68k_reduced().raw.to_adata().write_h5ad("pbmc.h5ad") Initialize a sparse dataset from storage >>> f = h5py.File("pbmc.h5ad") >>> X = sparse_dataset(f["X"]) >>> X CSRDataset: backend hdf5, shape (700, 765), data_dtype float32 Indexing returns sparse matrices >>> X[100:200] # doctest: +ELLIPSIS <...sparse matrix of...float32...with 25003 stored elements...> These can also be used inside of an AnnData object, no need for backed mode >>> from anndata import AnnData >>> adata = AnnData( ... layers={"backed": X}, obs=read_elem(f["obs"]), var=read_elem(f["var"]) ... ) >>> adata.layers["backed"] CSRDataset: backend hdf5, shape (700, 765), data_dtype float32 Indexing access (i.e., from views) brings selection into memory >>> adata[adata.obs["bulk_labels"] == "CD56+ NK"].layers[ ... "backed" ... ] # doctest: +ELLIPSIS <...sparse matrix of...float32...with 7340 stored elements...> """ encoding_type = _get_group_format(group) if encoding_type == "csr": return _CSRDataset(group) elif encoding_type == "csc": return _CSCDataset(group) msg = f"Unknown encoding type {encoding_type}" raise ValueError(msg) @_subset.register(BaseCompressedSparseDataset) def subset_sparsedataset(d, subset_idx): return d[subset_idx] python-anndata-0.12.0~rc1/src/anndata/_core/storage.py000066400000000000000000000051431500370632200226260ustar00rootroot00000000000000from __future__ import annotations import warnings from typing import TYPE_CHECKING, get_args import numpy as np import pandas as pd from scipy import sparse from anndata.compat import CSArray, CSMatrix from .._warnings import ImplicitModificationWarning from ..utils import ( ensure_df_homogeneous, join_english, raise_value_error_if_multiindex_columns, ) if TYPE_CHECKING: from typing import Any def coerce_array( value: Any, *, name: str, allow_df: bool = False, allow_array_like: bool = False, ): try: from anndata.experimental.backed._compat import Dataset2D except ImportError: class Dataset2D: @staticmethod def __repr__(): return "mock anndata.experimental.backed._xarray." """Coerce arrays stored in layers/X, and aligned arrays ({obs,var}{m,p}).""" from ..typing import ArrayDataStructureTypes # If value is a scalar and we allow that, return it if allow_array_like and np.isscalar(value): return value # If value is one of the allowed types, return it array_data_structure_types = get_args(ArrayDataStructureTypes) if isinstance(value, (*array_data_structure_types, Dataset2D)): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." warnings.warn(msg, ImplicitModificationWarning) value = value.A return value is_non_csc_r_array_or_matrix = ( (isinstance(value, base) and not isinstance(value, csr_c_format)) for base, csr_c_format in [ (sparse.spmatrix, CSMatrix), (sparse.sparray, CSArray), ] ) if any(is_non_csc_r_array_or_matrix): msg = f"Only CSR and CSC {'matrices' if isinstance(value, sparse.spmatrix) else 'arrays'} are supported." raise ValueError(msg) if isinstance(value, pd.DataFrame): if allow_df: raise_value_error_if_multiindex_columns(value, name) return value if allow_df else ensure_df_homogeneous(value, name) # if value is an array-like object, try to convert it e = None if allow_array_like: try: # TODO: asarray? asanyarray? 
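            # np.array always copies; asarray would avoid a copy where possible,
            # and asanyarray would additionally pass ndarray subclasses through
            # (hence the TODO above).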
return np.array(value) except (ValueError, TypeError) as _e: e = _e # if value isn’t the right type or convertible, raise an error msg = f"{name} needs to be of one of {join_english(map(str, array_data_structure_types))}, not {type(value)}." if e is not None: msg += " (Failed to convert it to an array, see above for details.)" raise ValueError(msg) from e python-anndata-0.12.0~rc1/src/anndata/_core/views.py000066400000000000000000000342321500370632200223200ustar00rootroot00000000000000from __future__ import annotations import warnings from contextlib import contextmanager from copy import deepcopy from functools import reduce, singledispatch, wraps from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd from pandas.api.types import is_bool_dtype from scipy import sparse from anndata._warnings import ImplicitModificationWarning from .._settings import settings from ..compat import ( AwkArray, CupyArray, CupyCSCMatrix, CupyCSRMatrix, DaskArray, ZappyArray, ) from .access import ElementRef if TYPE_CHECKING: from collections.abc import Callable, Iterable, KeysView, Sequence from typing import Any from anndata import AnnData @contextmanager def view_update(adata_view: AnnData, attr_name: str, keys: tuple[str, ...]): """Context manager for updating a view of an AnnData object. Contains logic for "actualizing" a view. Yields the object to be modified in-place. Parameters ---------- adata_view A view of an AnnData attr_name Name of the attribute being updated keys Keys to the attribute being updated Yields ------ `adata.attr[key1][key2][keyn]...` """ new = adata_view.copy() attr = getattr(new, attr_name) container = reduce(lambda d, k: d[k], keys, attr) yield container adata_view._init_as_actual(new) class _SetItemMixin: """\ Class which (when values are being set) lets their parent AnnData view know, so it can make a copy of itself. This implements copy-on-modify semantics for views of AnnData objects. """ _view_args: ElementRef | None def __setitem__(self, idx: Any, value: Any): if self._view_args is None: super().__setitem__(idx, value) else: warnings.warn( f"Trying to modify attribute `.{self._view_args.attrname}` of view, " "initializing view as actual.", ImplicitModificationWarning, stacklevel=2, ) with view_update(*self._view_args) as container: container[idx] = value class _ViewMixin(_SetItemMixin): def __init__( self, *args, view_args: tuple[AnnData, str, tuple[str, ...]] = None, **kwargs, ): if view_args is not None: view_args = ElementRef(*view_args) self._view_args = view_args super().__init__(*args, **kwargs) # TODO: This makes `deepcopy(obj)` return `obj._view_args.parent._adata_ref`, fix it def __deepcopy__(self, memo): parent, attrname, keys = self._view_args return deepcopy(getattr(parent._adata_ref, attrname)) _UFuncMethod = Literal["__call__", "reduce", "reduceat", "accumulate", "outer", "inner"] class ArrayView(_SetItemMixin, np.ndarray): def __new__( cls, input_array: Sequence[Any], view_args: tuple[AnnData, str, tuple[str, ...]] = None, ): arr = np.asanyarray(input_array).view(cls) if view_args is not None: view_args = ElementRef(*view_args) arr._view_args = view_args return arr def __array_finalize__(self, obj: np.ndarray | None): if obj is not None: self._view_args = getattr(obj, "_view_args", None) def __array_ufunc__( self: ArrayView, ufunc: Callable[..., Any], method: _UFuncMethod, *inputs, out: tuple[np.ndarray, ...] | None = None, **kwargs, ) -> np.ndarray: """Makes numpy ufuncs convert all instances of views to plain arrays. 
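        As a consequence, e.g. ``view + 1`` or ``np.log1p(view)`` returns a
        plain :class:`numpy.ndarray` rather than another ``ArrayView``.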
See https://numpy.org/devdocs/user/basics.subclassing.html#array-ufunc-for-ufuncs """ def convert_all(arrs: Iterable[np.ndarray]) -> Iterable[np.ndarray]: return ( arr.view(np.ndarray) if isinstance(arr, ArrayView) else arr for arr in arrs ) if out is None: outputs = (None,) * ufunc.nout else: out = outputs = tuple(convert_all(out)) results = super().__array_ufunc__( ufunc, method, *convert_all(inputs), out=out, **kwargs ) if results is NotImplemented: return NotImplemented if ufunc.nout == 1: results = (results,) results = tuple( (np.asarray(result) if output is None else output) for result, output in zip(results, outputs) ) return results[0] if len(results) == 1 else results def keys(self) -> KeysView[str]: # it’s a structured array return self.dtype.names def copy(self, order: str = "C") -> np.ndarray: # we want a conventional array return np.array(self) def toarray(self) -> np.ndarray: return self.copy() # Extends DaskArray # Calls parent __new__ constructor since # even calling astype on a dask array # needs a .compute() call to actually happen. # So no construction by view casting like ArrayView class DaskArrayView(_SetItemMixin, DaskArray): def __new__( cls, input_array: DaskArray, view_args: tuple[AnnData, str, tuple[str, ...]] = None, ): arr = super().__new__( cls, dask=input_array.dask, name=input_array.name, chunks=input_array.chunks, dtype=input_array.dtype, meta=input_array._meta, shape=input_array.shape, ) if view_args is not None: view_args = ElementRef(*view_args) arr._view_args = view_args return arr def __array_finalize__(self, obj: DaskArray | None): if obj is not None: self._view_args = getattr(obj, "_view_args", None) def keys(self) -> KeysView[str]: # it’s a structured array return self.dtype.names # Unlike array views, SparseCSRMatrixView and SparseCSCMatrixView # do not propagate through subsetting class SparseCSRMatrixView(_ViewMixin, sparse.csr_matrix): # https://github.com/scverse/anndata/issues/656 def copy(self) -> sparse.csr_matrix: return sparse.csr_matrix(self).copy() class SparseCSCMatrixView(_ViewMixin, sparse.csc_matrix): # https://github.com/scverse/anndata/issues/656 def copy(self) -> sparse.csc_matrix: return sparse.csc_matrix(self).copy() class SparseCSRArrayView(_ViewMixin, sparse.csr_array): # https://github.com/scverse/anndata/issues/656 def copy(self) -> sparse.csr_array: return sparse.csr_array(self).copy() class SparseCSCArrayView(_ViewMixin, sparse.csc_array): # https://github.com/scverse/anndata/issues/656 def copy(self) -> sparse.csc_array: return sparse.csc_array(self).copy() class CupySparseCSRView(_ViewMixin, CupyCSRMatrix): def copy(self) -> CupyCSRMatrix: return CupyCSRMatrix(self).copy() class CupySparseCSCView(_ViewMixin, CupyCSCMatrix): def copy(self) -> CupyCSCMatrix: return CupyCSCMatrix(self).copy() class CupyArrayView(_ViewMixin, CupyArray): def __new__( cls, input_array: Sequence[Any], view_args: tuple[AnnData, str, tuple[str, ...]] = None, ): import cupy as cp arr = cp.asarray(input_array).view(type=cls) if view_args is not None: view_args = ElementRef(*view_args) arr._view_args = view_args return arr def copy(self) -> CupyArray: import cupy as cp return cp.array(self).copy() class DictView(_ViewMixin, dict): pass class DataFrameView(_ViewMixin, pd.DataFrame): _metadata = ["_view_args"] @wraps(pd.DataFrame.drop) def drop(self, *args, inplace: bool = False, **kw): if not inplace: return self.copy().drop(*args, **kw) with view_update(*self._view_args) as df: df.drop(*args, inplace=True, **kw) def __setattr__(self, key: str, value: 
Any): if key == "index": warnings.warn( f"Trying to modify {key} of attribute `.{self._view_args.attrname}` of view, " "initializing view as actual.", ImplicitModificationWarning, stacklevel=2, ) with view_update(*self._view_args) as container: setattr(container, key, value) else: super().__setattr__(key, value) @singledispatch def as_view(obj, view_args): msg = f"No view type has been registered for {type(obj)}" raise NotImplementedError(msg) @as_view.register(np.ndarray) def as_view_array(array, view_args): return ArrayView(array, view_args=view_args) @as_view.register(DaskArray) def as_view_dask_array(array, view_args): return DaskArrayView(array, view_args=view_args) @as_view.register(pd.DataFrame) def as_view_df(df, view_args): if settings.remove_unused_categories: for col in df.columns: if isinstance(df[col].dtype, pd.CategoricalDtype): with pd.option_context("mode.chained_assignment", None): df[col] = df[col].cat.remove_unused_categories() return DataFrameView(df, view_args=view_args) @as_view.register(sparse.csr_matrix) def as_view_csr_matrix(mtx, view_args): return SparseCSRMatrixView(mtx, view_args=view_args) @as_view.register(sparse.csc_matrix) def as_view_csc_matrix(mtx, view_args): return SparseCSCMatrixView(mtx, view_args=view_args) @as_view.register(sparse.csr_array) def as_view_csr_array(mtx, view_args): return SparseCSRArrayView(mtx, view_args=view_args) @as_view.register(sparse.csc_array) def as_view_csc_array(mtx, view_args): return SparseCSCArrayView(mtx, view_args=view_args) @as_view.register(dict) def as_view_dict(d, view_args): return DictView(d, view_args=view_args) @as_view.register(ZappyArray) def as_view_zappy(z, view_args): # Previous code says ZappyArray works as view, # but as far as I can tell they’re immutable. return z @as_view.register(CupyArray) def as_view_cupy(array, view_args): return CupyArrayView(array, view_args=view_args) @as_view.register(CupyCSRMatrix) def as_view_cupy_csr(mtx, view_args): return CupySparseCSRView(mtx, view_args=view_args) @as_view.register(CupyCSCMatrix) def as_view_cupy_csc(mtx, view_args): return CupySparseCSCView(mtx, view_args=view_args) try: import weakref from ..compat import awkward as ak # Registry to store weak references from AwkwardArrayViews to their parent AnnData container _registry = weakref.WeakValueDictionary() _PARAM_NAME = "_view_args" class AwkwardArrayView(_ViewMixin, AwkArray): @property def _view_args(self): """Override _view_args to retrieve the values from awkward arrays parameters. Awkward arrays cannot be subclassed like other python objects. Instead subclasses need to be attached as "behavior". These "behaviors" cannot take any additional parameters (as we do for other data types to store `_view_args`). Therefore, we need to store `_view_args` using awkward's parameter mechanism. These parameters need to be json-serializable, which is why we can't store ElementRef directly, but need to replace the reference to the parent AnnDataView container with a weak reference. """ parent_key, attrname, keys = self.layout.parameter(_PARAM_NAME) parent = _registry[parent_key] return ElementRef(parent, attrname, keys) def __copy__(self) -> AwkArray: """ Turn the AwkwardArrayView into an actual AwkwardArray with no special behavior. Need to override __copy__ instead of `.copy()` as awkward arrays don't implement `.copy()` and are copied using python's standard copy mechanism in `aligned_mapping.py`. 
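        Concretely, the copy drops the ``_view_args`` parameter and the
        ``"AwkwardArrayView"`` list behavior, so ``copy.copy(view)`` yields a
        plain awkward ``Array`` detached from the parent AnnData.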
""" array = self # makes a shallow copy and removes the reference to the original AnnData object array = ak.with_parameter(self, _PARAM_NAME, None) array = ak.with_parameter(array, "__list__", None) return array @as_view.register(AwkArray) def as_view_awkarray(array, view_args): parent, attrname, keys = view_args parent_key = f"target-{id(parent)}" _registry[parent_key] = parent # TODO: See https://github.com/scverse/anndata/pull/647#discussion_r963494798_ for more details and # possible strategies to stack behaviors. # A better solution might be based on xarray-style "attrs", once this is implemented # https://github.com/scikit-hep/awkward/issues/1391#issuecomment-1412297114 if type(array).__name__ != "Array": msg = ( "Cannot create a view of an awkward array with __array__ parameter. " "Please open an issue in the AnnData repo and describe your use-case." ) raise NotImplementedError(msg) array = ak.with_parameter(array, _PARAM_NAME, (parent_key, attrname, keys)) array = ak.with_parameter(array, "__list__", "AwkwardArrayView") return array ak.behavior["AwkwardArrayView"] = AwkwardArrayView except ImportError: class AwkwardArrayView: pass def _resolve_idxs(old, new, adata): t = tuple(_resolve_idx(old[i], new[i], adata.shape[i]) for i in (0, 1)) return t @singledispatch def _resolve_idx(old, new, l): return old[new] @_resolve_idx.register(np.ndarray) def _resolve_idx_ndarray(old, new, l): if is_bool_dtype(old) and is_bool_dtype(new): mask_new = np.zeros_like(old) mask_new[np.flatnonzero(old)[new]] = True return mask_new if is_bool_dtype(old): old = np.where(old)[0] return old[new] @_resolve_idx.register(np.integer) @_resolve_idx.register(int) def _resolve_idx_scalar(old, new, l): return np.array([old])[new] @_resolve_idx.register(slice) def _resolve_idx_slice(old, new, l): if isinstance(new, slice): return _resolve_idx_slice_slice(old, new, l) else: return np.arange(*old.indices(l))[new] def _resolve_idx_slice_slice(old, new, l): r = range(*old.indices(l))[new] # Convert back to slice start, stop, step = r.start, r.stop, r.step if len(r) == 0: stop = start elif stop < 0: stop = None return slice(start, stop, step) python-anndata-0.12.0~rc1/src/anndata/_io/000077500000000000000000000000001500370632200202645ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/_io/__init__.py000066400000000000000000000005051500370632200223750ustar00rootroot00000000000000from __future__ import annotations import warnings __all__: list[str] = [] def __getattr__(key: str): from .. import io attr = getattr(io, key) warnings.warn( f"Importing {key} from `anndata._io` is deprecated. 
" "Please use anndata.io instead.", FutureWarning, ) return attr python-anndata-0.12.0~rc1/src/anndata/_io/h5ad.py000066400000000000000000000325051500370632200214640ustar00rootroot00000000000000from __future__ import annotations import re from functools import partial from pathlib import Path from types import MappingProxyType from typing import TYPE_CHECKING, TypeVar from warnings import warn import h5py import numpy as np import pandas as pd from scipy import sparse from anndata._warnings import OldFormatWarning from .._core.anndata import AnnData from .._core.file_backing import filename from .._core.sparse_dataset import BaseCompressedSparseDataset from ..compat import ( CSMatrix, _clean_uns, _decode_structured_array, _from_fixed_length_strings, ) from ..experimental import read_dispatched from .specs import read_elem, write_elem from .specs.registry import IOSpec, write_spec from .utils import ( H5PY_V3, _read_legacy_raw, idx_chunks_along_axis, no_write_dataset_2d, report_read_key_on_error, report_write_key_on_error, ) if TYPE_CHECKING: from collections.abc import Callable, Collection, Mapping, Sequence from os import PathLike from typing import Any, Literal from .._core.file_backing import AnnDataFileManager T = TypeVar("T") @no_write_dataset_2d def write_h5ad( filepath: PathLike[str] | str, adata: AnnData, *, as_dense: Sequence[str] = (), convert_strings_to_categoricals: bool = True, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), **kwargs, ) -> None: """See :meth:`~anndata.AnnData.write_h5ad`.""" if isinstance(as_dense, str): as_dense = [as_dense] if "raw.X" in as_dense: as_dense = list(as_dense) as_dense[as_dense.index("raw.X")] = "raw/X" if any(val not in {"X", "raw/X"} for val in as_dense): msg = "Currently, only `X` and `raw/X` are supported values in `as_dense`" raise NotImplementedError(msg) if "raw/X" in as_dense and adata.raw is None: msg = "Cannot specify writing `raw/X` to dense if it doesn’t exist." 
raise ValueError(msg) if convert_strings_to_categoricals: adata.strings_to_categoricals() if adata.raw is not None: adata.strings_to_categoricals(adata.raw.var) dataset_kwargs = {**dataset_kwargs, **kwargs} filepath = Path(filepath) mode = "a" if adata.isbacked else "w" if adata.isbacked: # close so that we can reopen below adata.file.close() with h5py.File(filepath, mode) as f: # TODO: Use spec writing system for this # Currently can't use write_dispatched here because this function is also called to do an # inplace update of a backed object, which would delete "/" f = f["/"] f.attrs.setdefault("encoding-type", "anndata") f.attrs.setdefault("encoding-version", "0.1.0") if "X" in as_dense and isinstance( adata.X, CSMatrix | BaseCompressedSparseDataset ): write_sparse_as_dense(f, "X", adata.X, dataset_kwargs=dataset_kwargs) elif not (adata.isbacked and Path(adata.filename) == Path(filepath)): # If adata.isbacked, X should already be up to date write_elem(f, "X", adata.X, dataset_kwargs=dataset_kwargs) if "raw/X" in as_dense and isinstance( adata.raw.X, CSMatrix | BaseCompressedSparseDataset ): write_sparse_as_dense( f, "raw/X", adata.raw.X, dataset_kwargs=dataset_kwargs ) write_elem(f, "raw/var", adata.raw.var, dataset_kwargs=dataset_kwargs) write_elem( f, "raw/varm", dict(adata.raw.varm), dataset_kwargs=dataset_kwargs ) elif adata.raw is not None: write_elem(f, "raw", adata.raw, dataset_kwargs=dataset_kwargs) write_elem(f, "obs", adata.obs, dataset_kwargs=dataset_kwargs) write_elem(f, "var", adata.var, dataset_kwargs=dataset_kwargs) write_elem(f, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs) write_elem(f, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs) write_elem(f, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs) write_elem(f, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs) write_elem(f, "layers", dict(adata.layers), dataset_kwargs=dataset_kwargs) write_elem(f, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs) @report_write_key_on_error @write_spec(IOSpec("array", "0.2.0")) def write_sparse_as_dense( f: h5py.Group, key: str, value: CSMatrix | BaseCompressedSparseDataset, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): real_key = None # Flag for if temporary key was used if key in f: if isinstance(value, BaseCompressedSparseDataset) and ( filename(value.group) == filename(f) ): # Write to temporary key before overwriting real_key = key # Transform key to temporary, e.g. 
raw/X -> raw/_X, or X -> _X key = re.sub(r"(.*)(\w(?!.*/))", r"\1_\2", key.rstrip("/")) else: del f[key] # Wipe before write dset = f.create_dataset(key, shape=value.shape, dtype=value.dtype, **dataset_kwargs) compressed_axis = int(isinstance(value, sparse.csc_matrix)) for idx in idx_chunks_along_axis(value.shape, compressed_axis, 1000): dset[idx] = value[idx].toarray() if real_key is not None: del f[real_key] f[real_key] = f[key] del f[key] def read_h5ad_backed( filename: str | PathLike[str], mode: Literal["r", "r+"] ) -> AnnData: d = dict(filename=filename, filemode=mode) f = h5py.File(filename, mode) attributes = ["obsm", "varm", "obsp", "varp", "uns", "layers"] df_attributes = ["obs", "var"] if "encoding-type" in f.attrs: attributes.extend(df_attributes) else: for k in df_attributes: if k in f: # Backwards compat d[k] = read_dataframe(f[k]) d.update({k: read_elem(f[k]) for k in attributes if k in f}) d["raw"] = _read_raw(f, attrs={"var", "varm"}) adata = AnnData(**d) # Backwards compat to <0.7 if isinstance(f["obs"], h5py.Dataset): _clean_uns(adata) return adata def read_h5ad( filename: PathLike[str] | str, backed: Literal["r", "r+"] | bool | None = None, *, as_sparse: Sequence[str] = (), as_sparse_fmt: type[CSMatrix] = sparse.csr_matrix, chunk_size: int = 6000, # TODO, probably make this 2d chunks ) -> AnnData: """\ Read `.h5ad`-formatted hdf5 file. Parameters ---------- filename File name of data file. backed If `'r'`, load :class:`~anndata.AnnData` in `backed` mode instead of fully loading it into memory (`memory` mode). If you want to modify backed attributes of the AnnData object, you need to choose `'r+'`. Currently, `backed` only support updates to `X`. That means any changes to other slots like `obs` will not be written to disk in `backed` mode. If you would like save changes made to these slots of a `backed` :class:`~anndata.AnnData`, write them to a new file (see :meth:`~anndata.AnnData.write`). For an example, see :ref:`read-partial`. as_sparse If an array was saved as dense, passing its name here will read it as a sparse_matrix, by chunk of size `chunk_size`. as_sparse_fmt Sparse format class to read elements from `as_sparse` in as. chunk_size Used only when loading sparse dataset that is stored as dense. Loading iterates through chunks of the dataset of this row size until it reads the whole dataset. Higher size means higher memory consumption and higher (to a point) loading speed. """ if backed not in {None, False}: mode = backed if mode is True: mode = "r+" assert mode in {"r", "r+"} return read_h5ad_backed(filename, mode) if as_sparse_fmt not in (sparse.csr_matrix, sparse.csc_matrix): msg = "Dense formats can only be read to CSR or CSC matrices at this time." raise NotImplementedError(msg) if isinstance(as_sparse, str): as_sparse = [as_sparse] else: as_sparse = list(as_sparse) for i in range(len(as_sparse)): if as_sparse[i] in {("raw", "X"), "raw.X"}: as_sparse[i] = "raw/X" elif as_sparse[i] not in {"raw/X", "X"}: msg = "Currently only `X` and `raw/X` can be read as sparse." 
raise NotImplementedError(msg) rdasp = partial( read_dense_as_sparse, sparse_format=as_sparse_fmt, axis_chunk=chunk_size ) with h5py.File(filename, "r") as f: def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ # This is covering up backwards compat in the anndata initializer # In most cases we should be able to call `func(elen[k])` instead k: read_dispatched(elem[k], callback) for k in elem.keys() if not k.startswith("raw.") } ) elif elem_name.startswith("/raw."): return None elif elem_name == "/X" and "X" in as_sparse: return rdasp(elem) elif elem_name == "/raw": return _read_raw(f, as_sparse, rdasp) elif elem_name in {"/obs", "/var"}: # Backwards compat return read_dataframe(elem) return func(elem) adata = read_dispatched(f, callback=callback) # Backwards compat (should figure out which version) if "raw.X" in f: raw = AnnData(**_read_raw(f, as_sparse, rdasp)) raw.obs_names = adata.obs_names adata.raw = raw # Backwards compat to <0.7 if isinstance(f["obs"], h5py.Dataset): _clean_uns(adata) return adata def _read_raw( f: h5py.File | AnnDataFileManager, as_sparse: Collection[str] = (), rdasp: Callable[[h5py.Dataset], CSMatrix] | None = None, *, attrs: Collection[str] = ("X", "var", "varm"), ) -> dict: if as_sparse: assert rdasp is not None, "must supply rdasp if as_sparse is supplied" raw = {} if "X" in attrs and "raw/X" in f: read_x = rdasp if "raw/X" in as_sparse else read_elem raw["X"] = read_x(f["raw/X"]) for v in ("var", "varm"): if v in attrs and f"raw/{v}" in f: raw[v] = read_elem(f[f"raw/{v}"]) return _read_legacy_raw(f, raw, read_dataframe, read_elem, attrs=attrs) @report_read_key_on_error def read_dataframe_legacy(dataset: h5py.Dataset) -> pd.DataFrame: """Read pre-anndata 0.7 dataframes.""" warn( f"'{dataset.name}' was written with a very old version of AnnData. 
" "Consider rewriting it.", OldFormatWarning, ) if H5PY_V3: df = pd.DataFrame( _decode_structured_array( _from_fixed_length_strings(dataset[()]), dtype=dataset.dtype ) ) else: df = pd.DataFrame(_from_fixed_length_strings(dataset[()])) df.set_index(df.columns[0], inplace=True) return df def read_dataframe(group: h5py.Group | h5py.Dataset) -> pd.DataFrame: """Backwards compat function""" if not isinstance(group, h5py.Group): return read_dataframe_legacy(group) else: return read_elem(group) @report_read_key_on_error def read_dataset(dataset: h5py.Dataset): if H5PY_V3: string_dtype = h5py.check_string_dtype(dataset.dtype) if (string_dtype is not None) and (string_dtype.encoding == "utf-8"): dataset = dataset.asstr() value = dataset[()] if not hasattr(value, "dtype"): return value elif isinstance(value.dtype, str): pass elif issubclass(value.dtype.type, np.bytes_): value = value.astype(str) # Backwards compat, old datasets have strings as one element 1d arrays if len(value) == 1: return value[0] elif len(value.dtype.descr) > 1: # Compound dtype # For backwards compat, now strings are written as variable length dtype = value.dtype value = _from_fixed_length_strings(value) if H5PY_V3: value = _decode_structured_array(value, dtype=dtype) if value.shape == (): value = value[()] return value @report_read_key_on_error def read_dense_as_sparse( dataset: h5py.Dataset, sparse_format: CSMatrix, axis_chunk: int ): if sparse_format == sparse.csr_matrix: return read_dense_as_csr(dataset, axis_chunk) elif sparse_format == sparse.csc_matrix: return read_dense_as_csc(dataset, axis_chunk) else: msg = f"Cannot read dense array as type: {sparse_format}" raise ValueError(msg) def read_dense_as_csr(dataset: h5py.Dataset, axis_chunk: int = 6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 0, axis_chunk): dense_chunk = dataset[idx] sub_matrix = sparse.csr_matrix(dense_chunk) sub_matrices.append(sub_matrix) return sparse.vstack(sub_matrices, format="csr") def read_dense_as_csc(dataset: h5py.Dataset, axis_chunk: int = 6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 1, axis_chunk): sub_matrix = sparse.csc_matrix(dataset[idx]) sub_matrices.append(sub_matrix) return sparse.hstack(sub_matrices, format="csc") python-anndata-0.12.0~rc1/src/anndata/_io/read.py000066400000000000000000000360041500370632200215540ustar00rootroot00000000000000from __future__ import annotations import bz2 import gzip from collections import OrderedDict from os import PathLike, fspath from pathlib import Path from types import MappingProxyType from typing import TYPE_CHECKING from warnings import warn import h5py import numpy as np import pandas as pd from scipy import sparse from .. import AnnData from ..compat import _deprecate_positional_args from .utils import is_float if TYPE_CHECKING: from collections.abc import Generator, Iterable, Iterator, Mapping def read_csv( filename: PathLike[str] | str | Iterator[str], delimiter: str | None = ",", first_column_names: bool | None = None, dtype: str = "float32", ) -> AnnData: """\ Read `.csv` file. Same as :func:`~anndata.io.read_text` but with default delimiter `','`. Parameters ---------- filename Data file. delimiter Delimiter that separates data within text file. If `None`, will split at arbitrary number of white spaces, which is different from enforcing splitting at single white space `' '`. first_column_names Assume the first column stores row names. dtype Numpy data type. 
""" return read_text(filename, delimiter, first_column_names, dtype) def read_excel( filename: PathLike[str] | str, sheet: str | int, dtype: str = "float32" ) -> AnnData: """\ Read `.xlsx` (Excel) file. Assumes that the first columns stores the row names and the first row the column names. Parameters ---------- filename File name to read from. sheet Name of sheet in Excel file. """ # rely on pandas for reading an excel file from pandas import read_excel df = read_excel(fspath(filename), sheet) X = df.values[:, 1:] row = dict(row_names=df.iloc[:, 0].values.astype(str)) col = dict(col_names=np.array(df.columns[1:], dtype=str)) return AnnData(X, row, col) def read_umi_tools(filename: PathLike[str] | str, dtype=None) -> AnnData: """\ Read a gzipped condensed count matrix from umi_tools. Parameters ---------- filename File name to read from. """ # import pandas for conversion of a dict of dicts into a matrix # import gzip to read a gzipped file :-) table = pd.read_table(filename, dtype={"gene": "category", "cell": "category"}) X = sparse.csr_matrix( (table["count"], (table["cell"].cat.codes, table["gene"].cat.codes)), dtype=dtype, ) obs = pd.DataFrame(index=pd.Index(table["cell"].cat.categories, name="cell")) var = pd.DataFrame(index=pd.Index(table["gene"].cat.categories, name="gene")) return AnnData(X=X, obs=obs, var=var) def read_hdf(filename: PathLike[str] | str, key: str) -> AnnData: """\ Read `.h5` (hdf5) file. Note: Also looks for fields `row_names` and `col_names`. Parameters ---------- filename Filename of data file. key Name of dataset in the file. """ with h5py.File(filename, "r") as f: # the following is necessary in Python 3, because only # a view and not a list is returned keys = [k for k in f.keys()] if key == "": msg = ( f"The file {filename} stores the following sheets:\n{keys}\n" f"Call read/read_hdf5 with one of them." ) raise ValueError(msg) # read array X = f[key][()] # try to find row and column names rows_cols = [{}, {}] for iname, name in enumerate(["row_names", "col_names"]): if name in keys: rows_cols[iname][name] = f[name][()] adata = AnnData(X, rows_cols[0], rows_cols[1]) return adata def _fmt_loom_axis_attrs( input: Mapping, idx_name: str, dimm_mapping: Mapping[str, Iterable[str]] ) -> tuple[pd.DataFrame, Mapping[str, np.ndarray]]: axis_df = pd.DataFrame() axis_mapping = {} for key, names in dimm_mapping.items(): axis_mapping[key] = np.array([input.pop(name) for name in names]).T for k, v in input.items(): if v.ndim > 1 and v.shape[1] > 1: axis_mapping[k] = v else: axis_df[k] = v if idx_name in axis_df: axis_df.set_index(idx_name, drop=True, inplace=True) return axis_df, axis_mapping @_deprecate_positional_args(version="0.9") def read_loom( filename: PathLike[str] | str, *, sparse: bool = True, cleanup: bool = False, X_name: str = "spliced", obs_names: str = "CellID", obsm_names: Mapping[str, Iterable[str]] | None = None, var_names: str = "Gene", varm_names: Mapping[str, Iterable[str]] | None = None, dtype: str = "float32", obsm_mapping: Mapping[str, Iterable[str]] = MappingProxyType({}), varm_mapping: Mapping[str, Iterable[str]] = MappingProxyType({}), **kwargs, ) -> AnnData: """\ Read `.loom`-formatted hdf5 file. This reads the whole file into memory. Beware that you have to explicitly state when you want to read the file as sparse data. Parameters ---------- filename The filename. sparse Whether to read the data matrix as sparse. cleanup Whether to collapse all obs/var fields that only store one unique value into `.uns['loom-.']`. 
X_name Loompy key with which the data matrix :attr:`~anndata.AnnData.X` is initialized. obs_names Loompy key where the observation/cell names are stored. obsm_mapping Loompy keys which will be constructed into observation matrices var_names Loompy key where the variable/gene names are stored. varm_mapping Loompy keys which will be constructed into variable matrices **kwargs: Arguments to loompy.connect Example ------- .. code:: python pbmc = anndata.io.read_loom( "pbmc.loom", sparse=True, X_name="lognorm", obs_names="cell_names", var_names="gene_names", obsm_mapping={ "X_umap": ["umap_1", "umap_2"] } ) """ # Deprecations if obsm_names is not None: warn( "Argument obsm_names has been deprecated in favour of `obsm_mapping`. " "In 0.9 this will be an error.", FutureWarning, ) if obsm_mapping != {}: msg = ( "Received values for both `obsm_names` and `obsm_mapping`. This is " "ambiguous, only pass `obsm_mapping`." ) raise ValueError(msg) obsm_mapping = obsm_names if varm_names is not None: warn( "Argument varm_names has been deprecated in favour of `varm_mapping`. " "In 0.9 this will be an error.", FutureWarning, ) if varm_mapping != {}: msg = ( "Received values for both `varm_names` and `varm_mapping`. This is " "ambiguous, only pass `varm_mapping`." ) raise ValueError(msg) varm_mapping = varm_names filename = fspath(filename) # allow passing pathlib.Path objects from loompy import connect with connect(filename, "r", **kwargs) as lc: if X_name not in lc.layers.keys(): X_name = "" X = lc.layers[X_name].sparse().T.tocsr() if sparse else lc.layers[X_name][()].T X = X.astype(dtype, copy=False) layers = OrderedDict() if X_name != "": layers["matrix"] = ( lc.layers[""].sparse().T.tocsr() if sparse else lc.layers[""][()].T ) for key in lc.layers.keys(): if key != "": layers[key] = ( lc.layers[key].sparse().T.tocsr() if sparse else lc.layers[key][()].T ) # TODO: Figure out the singleton obs elements obs, obsm = _fmt_loom_axis_attrs(dict(lc.col_attrs), obs_names, obsm_mapping) var, varm = _fmt_loom_axis_attrs(dict(lc.row_attrs), var_names, varm_mapping) uns = {} if cleanup: uns_obs = {} for key in obs.columns: if len(obs[key].unique()) == 1: uns_obs[key] = obs[key].iloc[0] del obs[key] if uns_obs: uns["loom-obs"] = uns_obs uns_var = {} for key in var.columns: if len(var[key].unique()) == 1: uns_var[key] = var[key].iloc[0] del var[key] if uns_var: uns["loom-var"] = uns_var adata = AnnData( X, obs=obs, var=var, layers=layers, obsm=obsm if obsm else None, varm=varm if varm else None, uns=uns, ) return adata def read_mtx(filename: PathLike[str] | str, dtype: str = "float32") -> AnnData: """\ Read `.mtx` file. Parameters ---------- filename The filename. dtype Numpy data type. """ from scipy.io import mmread # could be rewritten accounting for dtype to be more performant X = mmread(fspath(filename)).astype(dtype) from scipy.sparse import csr_matrix X = csr_matrix(X) return AnnData(X) def read_text( filename: PathLike[str] | str | Iterator[str], delimiter: str | None = None, first_column_names: bool | None = None, dtype: str = "float32", ) -> AnnData: """\ Read `.txt`, `.tab`, `.data` (text) file. Same as :func:`~anndata.io.read_csv` but with default delimiter `None`. Parameters ---------- filename Data file, filename or stream. delimiter Delimiter that separates data within text file. If `None`, will split at arbitrary number of white spaces, which is different from enforcing splitting at single white space `' '`. first_column_names Assume the first column stores row names. dtype Numpy data type. 
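
    Example
    -------
    A minimal usage sketch; ``matrix.txt`` is a hypothetical whitespace-separated
    file with variable names in the first row and observation names in the first
    column.

    .. code:: python

        import anndata as ad

        adata = ad.io.read_text("matrix.txt", first_column_names=True)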
""" if not isinstance(filename, PathLike | str | bytes): return _read_text(filename, delimiter, first_column_names, dtype) filename = Path(filename) if filename.suffix == ".gz": with gzip.open(str(filename), mode="rt") as f: return _read_text(f, delimiter, first_column_names, dtype) elif filename.suffix == ".bz2": with bz2.open(str(filename), mode="rt") as f: return _read_text(f, delimiter, first_column_names, dtype) else: with filename.open() as f: return _read_text(f, delimiter, first_column_names, dtype) def _iter_lines(file_like: Iterable[str]) -> Generator[str, None, None]: """Helper for iterating only nonempty lines without line breaks""" for line in file_like: line = line.rstrip("\r\n") if line: yield line def _read_text( f: Iterator[str], delimiter: str | None, first_column_names: bool | None, dtype: str, ) -> AnnData: comments = [] data = [] lines = _iter_lines(f) col_names = [] row_names = [] # read header and column names for line in lines: if line.startswith("#"): comment = line.lstrip("# ") if comment: comments.append(comment) else: if delimiter is not None and delimiter not in line: msg = f"Did not find delimiter {delimiter!r} in first line." raise ValueError(msg) line_list = line.split(delimiter) # the first column might be row names, so check the last if not is_float(line_list[-1]): col_names = line_list # logg.msg(" assuming first line in file stores column names", v=4) else: if not is_float(line_list[0]) or first_column_names: first_column_names = True row_names.append(line_list[0]) data.append(np.array(line_list[1:], dtype=dtype)) else: data.append(np.array(line_list, dtype=dtype)) break if not col_names: # try reading col_names from the last comment line if len(comments) > 0: # logg.msg(" assuming last comment line stores variable names", v=4) col_names = np.array(comments[-1].split()) # just numbers as col_names else: # logg.msg(" did not find column names in file", v=4) col_names = np.arange(len(data[0])).astype(str) col_names = np.array(col_names, dtype=str) # read another line to check if first column contains row names or not if first_column_names is None: first_column_names = False for line in lines: line_list = line.split(delimiter) if first_column_names or not is_float(line_list[0]): # logg.msg(" assuming first column in file stores row names", v=4) first_column_names = True row_names.append(line_list[0]) data.append(np.array(line_list[1:], dtype=dtype)) else: data.append(np.array(line_list, dtype=dtype)) break # if row names are just integers if len(data) > 1 and data[0].size != data[1].size: # logg.msg( # " assuming first row stores column names and first column row names", # v=4, # ) first_column_names = True col_names = np.array(data[0]).astype(int).astype(str) row_names.append(data[1][0].astype(int).astype(str)) data = [data[1][1:]] # parse the file for line in lines: line_list = line.split(delimiter) if first_column_names: row_names.append(line_list[0]) data.append(np.array(line_list[1:], dtype=dtype)) else: data.append(np.array(line_list, dtype=dtype)) # logg.msg(" read data into list of lists", t=True, v=4) # transform to array, this takes a long time and a lot of memory # but it’s actually the same thing as np.genfromtxt does # - we don’t use the latter as it would involve another slicing step # in the end, to separate row_names from float data, slicing takes # a lot of memory and CPU time if data[0].size != data[-1].size: msg = ( f"Length of first line ({data[0].size}) is different " f"from length of last line ({data[-1].size})." 
) raise ValueError(msg) data = np.array(data, dtype=dtype) # logg.msg(" constructed array from list of list", t=True, v=4) # transform row_names if not row_names: row_names = np.arange(len(data)).astype(str) # logg.msg(" did not find row names in file", v=4) else: row_names = np.array(row_names) for iname, name in enumerate(row_names): row_names[iname] = name.strip('"') # adapt col_names if necessary if col_names.size > data.shape[1]: col_names = col_names[1:] for iname, name in enumerate(col_names): col_names[iname] = name.strip('"') return AnnData( data, obs=dict(obs_names=row_names), var=dict(var_names=col_names), ) python-anndata-0.12.0~rc1/src/anndata/_io/specs/000077500000000000000000000000001500370632200214015ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/_io/specs/__init__.py000066400000000000000000000006531500370632200235160ustar00rootroot00000000000000from __future__ import annotations from . import lazy_methods, methods from .registry import ( _LAZY_REGISTRY, # noqa: F401 _REGISTRY, # noqa: F401 IOSpec, Reader, Writer, get_spec, read_elem, read_elem_lazy, write_elem, ) __all__ = [ "methods", "lazy_methods", "write_elem", "get_spec", "read_elem", "read_elem_lazy", "Reader", "Writer", "IOSpec", ] python-anndata-0.12.0~rc1/src/anndata/_io/specs/lazy_methods.py000066400000000000000000000261411500370632200244610ustar00rootroot00000000000000from __future__ import annotations from contextlib import contextmanager from functools import partial, singledispatch from pathlib import Path from typing import TYPE_CHECKING, overload import h5py import numpy as np import pandas as pd from scipy import sparse import anndata as ad from anndata._core.file_backing import filename, get_elem_name from anndata.abc import CSCDataset, CSRDataset from anndata.compat import DaskArray, H5Array, H5Group, ZarrArray, ZarrGroup from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: from collections.abc import Generator, Mapping, Sequence from typing import Literal, ParamSpec, TypeVar from anndata.experimental.backed._compat import DataArray, Dataset2D from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray from ...compat import CSArray, CSMatrix, H5File from .registry import LazyDataStructures, LazyReader BlockInfo = Mapping[ Literal[None], dict[str, Sequence[tuple[int, int]]], ] P = ParamSpec("P") R = TypeVar("R") D = TypeVar("D") @overload @contextmanager def maybe_open_h5( path_or_other: Path, elem_name: str ) -> Generator[H5File, None, None]: ... @overload @contextmanager def maybe_open_h5(path_or_other: D, elem_name: str) -> Generator[D, None, None]: ... 
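# Illustrative usage of the implementation below (file name and key are
# hypothetical): a ``Path`` is opened as an HDF5 file and the element stored
# under ``elem_name`` is yielded, with the file closed on exit; any other
# store (e.g. an already-open zarr group or sparse dataset) is yielded
# unchanged and ``elem_name`` is ignored.
#
#     with maybe_open_h5(Path("data.h5ad"), "/X") as elem:
#         block = elem[:100]  # e.g. slice a dense dataset while the file is open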
@contextmanager def maybe_open_h5( path_or_other: H5File | D, elem_name: str ) -> Generator[H5File | D, None, None]: if not isinstance(path_or_other, Path): yield path_or_other return file = h5py.File(path_or_other, "r") try: yield file[elem_name] finally: file.close() _DEFAULT_STRIDE = 1000 def compute_chunk_layout_for_axis_size( chunk_axis_size: int, full_axis_size: int ) -> tuple[int, ...]: n_strides, rest = np.divmod(full_axis_size, chunk_axis_size) chunk = (chunk_axis_size,) * n_strides if rest > 0: chunk += (rest,) return chunk def make_dask_chunk( path_or_sparse_dataset: Path | D, elem_name: str, block_info: BlockInfo | None = None, ) -> CSMatrix | CSArray: if block_info is None: msg = "Block info is required" raise ValueError(msg) # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_sparse_dataset, elem_name) as f: mtx = ad.io.sparse_dataset(f) if isinstance(f, H5Group) else f idx = tuple( slice(start, stop) for start, stop in block_info[None]["array-location"] ) chunk = mtx[idx] return chunk @singledispatch def get_chunksize(obj) -> tuple[int, ...]: if hasattr(obj, "chunks"): return obj.chunks msg = "object of type {type(obj)} has no recognized chunks" raise ValueError(msg) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None, # only tuple[int, int] is supported here ) -> DaskArray: import dask.array as da path_or_sparse_dataset = ( Path(filename(elem)) if isinstance(elem, H5Group) else ad.io.sparse_dataset(elem) ) elem_name = get_elem_name(elem) shape: tuple[int, int] = tuple(elem.attrs["shape"]) if isinstance(path_or_sparse_dataset, CSRDataset | CSCDataset): dtype = path_or_sparse_dataset.dtype else: dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" stride: int = _DEFAULT_STRIDE major_dim, minor_dim = (1, 0) if is_csc else (0, 1) if chunks is not None: if len(chunks) != 2: msg = "`chunks` must be a tuple of two integers" raise ValueError(msg) if chunks[minor_dim] not in {shape[minor_dim], -1, None}: msg = ( "Only the major axis can be chunked. 
" f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" ) raise ValueError(msg) stride = ( chunks[major_dim] if chunks[major_dim] not in {None, -1} else shape[major_dim] ) shape_minor, shape_major = shape if is_csc else shape[::-1] chunks_major = compute_chunk_layout_for_axis_size(stride, shape_major) chunks_minor = (shape_minor,) chunk_layout = ( (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) ) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix make_chunk = partial(make_dask_chunk, path_or_sparse_dataset, elem_name) da_mtx = da.map_blocks( make_chunk, dtype=dtype, chunks=chunk_layout, meta=memory_format((0, 0), dtype=dtype), ) return da_mtx @_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) def read_h5_string_array( elem: H5Array, *, _reader: LazyReader, chunks: tuple[int, int] | None = None, ) -> DaskArray: import dask.array as da from anndata._io.h5ad import read_dataset return da.from_array( read_dataset(elem), chunks=chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(elem.shape), ) @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da path = Path(elem.file.filename) elem_name: str = elem.name shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] = ( tuple( c if c not in {None, -1} else s for c, s in zip(chunks, shape, strict=True) ) if chunks is not None else tuple(min(_DEFAULT_STRIDE, s) for s in shape) ) chunk_layout = tuple( compute_chunk_layout_for_axis_size(chunks[i], shape[i]) for i in range(len(shape)) ) make_chunk = partial(make_dask_chunk, path, elem_name) return da.map_blocks( make_chunk, dtype=dtype, chunks=chunk_layout, meta=np.array([]) ) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: chunks: tuple[int, ...] 
= chunks if chunks is not None else elem.chunks import dask.array as da return da.from_zarr(elem, chunks=chunks) def _gen_xarray_dict_iterator_from_elems( elem_dict: dict[str, LazyDataStructures], dim_name: str, index: np.NDArray, ) -> Generator[tuple[str, DataArray], None, None]: from anndata.experimental.backed._compat import DataArray from anndata.experimental.backed._compat import xarray as xr from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray for k, v in elem_dict.items(): if isinstance(v, DaskArray) and k != dim_name: data_array = DataArray(v, coords=[index], dims=[dim_name], name=k) elif isinstance(v, CategoricalArray | MaskedArray) and k != dim_name: variable = xr.Variable( data=xr.core.indexing.LazilyIndexedArray(v), dims=[dim_name] ) data_array = DataArray( variable, coords=[index], dims=[dim_name], name=k, attrs={ "base_path_or_zarr_group": v.base_path_or_zarr_group, "elem_name": v.elem_name, }, ) elif k == dim_name: data_array = DataArray( index, coords=[index], dims=[dim_name], name=dim_name ) else: msg = f"Could not read {k}: {v} from into xarray Dataset2D" raise ValueError(msg) yield k, data_array DUMMY_RANGE_INDEX_KEY = "_anndata_dummy_range_index" @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) def read_dataframe( elem: H5Group | ZarrGroup, *, _reader: LazyReader, use_range_index: bool = False, ) -> Dataset2D: from anndata.experimental.backed._compat import DataArray, Dataset2D elem_dict = { k: _reader.read_elem(elem[k]) for k in [*elem.attrs["column-order"], elem.attrs["_index"]] } # If we use a range index, the coord axis needs to have the special dim name # which is used below as well. if not use_range_index: dim_name = elem.attrs["_index"] # no sense in reading this in multiple times index = elem_dict[dim_name].compute() else: dim_name = DUMMY_RANGE_INDEX_KEY index = pd.RangeIndex(len(elem_dict[elem.attrs["_index"]])).astype("str") elem_xarray_dict = dict( _gen_xarray_dict_iterator_from_elems(elem_dict, dim_name, index) ) if use_range_index: elem_xarray_dict[DUMMY_RANGE_INDEX_KEY] = DataArray( index, coords=[index], dims=[DUMMY_RANGE_INDEX_KEY], name=DUMMY_RANGE_INDEX_KEY, ) # We ensure the indexing_key attr always points to the true index # so that the roundtrip works even for the `use_range_index` `True` case ds = Dataset2D(elem_xarray_dict, attrs={"indexing_key": elem.attrs["_index"]}) return ds @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) def read_categorical( elem: H5Group | ZarrGroup, *, _reader: LazyReader, ) -> CategoricalArray: from anndata.experimental.backed._lazy_arrays import CategoricalArray base_path_or_zarr_group = ( Path(filename(elem)) if isinstance(elem, H5Group) else elem ) elem_name = get_elem_name(elem) return CategoricalArray( codes=elem["codes"], categories=elem["categories"], ordered=elem.attrs["ordered"], base_path_or_zarr_group=base_path_or_zarr_group, elem_name=elem_name, ) def read_nullable( elem: H5Group | ZarrGroup, *, encoding_type: Literal[ "nullable-integer", "nullable-boolean", "nullable-string-array" ], _reader: LazyReader, ) -> MaskedArray: from anndata.experimental.backed._lazy_arrays import MaskedArray base_path_or_zarr_group = ( Path(filename(elem)) if isinstance(elem, H5Group) else elem ) elem_name = get_elem_name(elem) return MaskedArray( values=elem["values"], mask=elem["mask"] if "mask" in elem else None, 
dtype_str=encoding_type, base_path_or_zarr_group=base_path_or_zarr_group, elem_name=elem_name, ) for dtype in ["integer", "boolean", "string-array"]: for group_type in [ZarrGroup, H5Group]: _LAZY_REGISTRY.register_read(group_type, IOSpec(f"nullable-{dtype}", "0.1.0"))( partial(read_nullable, encoding_type=f"nullable-{dtype}") ) python-anndata-0.12.0~rc1/src/anndata/_io/specs/methods.py000066400000000000000000001323101500370632200234160ustar00rootroot00000000000000from __future__ import annotations import warnings from collections.abc import Mapping from copy import copy from functools import partial from itertools import product from types import MappingProxyType from typing import TYPE_CHECKING from warnings import warn import h5py import numpy as np import pandas as pd from packaging.version import Version from scipy import sparse import anndata as ad from anndata import AnnData, Raw from anndata._core import views from anndata._core.index import _normalize_indices from anndata._core.merge import intersect_keys from anndata._core.sparse_dataset import _CSCDataset, _CSRDataset, sparse_dataset from anndata._io.utils import H5PY_V3, check_key, zero_dim_array_as_scalar from anndata._warnings import OldFormatWarning from anndata.compat import ( AwkArray, CupyArray, CupyCSCMatrix, CupyCSRMatrix, DaskArray, H5Array, H5File, H5Group, ZarrArray, ZarrGroup, _decode_structured_array, _from_fixed_length_strings, _read_attr, _require_group_write_dataframe, ) from ..._settings import settings from ...compat import is_zarr_v2 from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial if TYPE_CHECKING: from collections.abc import Callable, Iterator from os import PathLike from typing import Any, Literal from numpy import typing as npt from numpy.typing import NDArray from anndata._types import ArrayStorageType, GroupStorageType from anndata.compat import CSArray, CSMatrix from anndata.typing import AxisStorable, InMemoryArrayOrScalarType from .registry import Reader, Writer #################### # Dask utils # #################### try: from dask.utils import SerializableLock as Lock except ImportError: from threading import Lock # to fix https://github.com/dask/distributed/issues/780 GLOBAL_LOCK = Lock() #################### # Dispatch methods # #################### # def is_full_slice(idx): # if isinstance(idx, tuple)len(idx) == 1: # if isinstance(idx, type(None)): # return True # elif idx is Ellipsis: # return True # elif isinstance(idx, tuple): # for el in idx: # if isinstance(el, type(None)): # pass # elif isinstance(el, slice): # if el != slice(None): # return False # else: # return False # return True # return False def zarr_v3_compressor_compat(dataset_kwargs) -> dict: if not is_zarr_v2() and (compressor := dataset_kwargs.pop("compressor", None)): dataset_kwargs["compressors"] = compressor return dataset_kwargs def _to_cpu_mem_wrapper(write_func): """ Wrapper to bring cupy types into cpu memory before writing. Ideally we do direct writing at some point. 
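
    For example, the cupy variants of the dense and sparse write functions in
    this module are registered through this wrapper::

        _REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
            _to_cpu_mem_wrapper(write_basic)
        )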
""" def wrapper( f, k, cupy_val: CupyArray | CupyCSCMatrix | CupyCSRMatrix, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): return write_func( f, k, cupy_val.get(), _writer=_writer, dataset_kwargs=dataset_kwargs ) return wrapper ################################ # Fallbacks / backwards compat # ################################ # Note: there is no need for writing in a backwards compatible format, maybe @_REGISTRY.register_read(H5File, IOSpec("", "")) @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( elem: H5File | H5Group | H5Array, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | CSMatrix | CSArray: from anndata._io import h5ad warn( f"Element '{elem.name}' was written without encoding metadata.", OldFormatWarning, stacklevel=3, ) if isinstance(elem, Mapping): # Backwards compat sparse arrays if "h5sparse_format" in elem.attrs: return sparse_dataset(elem).to_memory() return {k: _reader.read_elem(v) for k, v in dict(elem).items()} elif isinstance(elem, h5py.Dataset): return h5ad.read_dataset(elem) # TODO: Handle legacy @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( elem: ZarrGroup | ZarrArray, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | CSMatrix | CSArray: from anndata._io import zarr warn( f"Element '{elem.name}' was written without encoding metadata.", OldFormatWarning, stacklevel=3, ) if isinstance(elem, ZarrGroup): # Backwards compat sparse arrays if "h5sparse_format" in elem.attrs: return sparse_dataset(elem).to_memory() return {k: _reader.read_elem(v) for k, v in dict(elem).items()} elif isinstance(elem, ZarrArray): return zarr.read_dataset(elem) # TODO: Handle legacy # @_REGISTRY.register_read_partial(IOSpec("", "")) # def read_basic_partial(elem, *, items=None, indices=(slice(None), slice(None))): # if isinstance(elem, Mapping): # return _read_partial(elem, items=items, indices=indices) # elif indices != (slice(None), slice(None)): # return elem[indices] # else: # return elem[()] ########### # AnnData # ########### def read_indices(group): obs_group = group["obs"] obs_idx_elem = obs_group[_read_attr(obs_group.attrs, "_index")] obs_idx = read_elem(obs_idx_elem) var_group = group["var"] var_idx_elem = var_group[_read_attr(var_group.attrs, "_index")] var_idx = read_elem(var_idx_elem) return obs_idx, var_idx def read_partial( pth: PathLike[str] | str, *, obs_idx=slice(None), var_idx=slice(None), X=True, obs=None, var=None, obsm=None, varm=None, obsp=None, varp=None, layers=None, uns=None, ) -> ad.AnnData: result = {} with h5py.File(pth, "r") as f: obs_idx, var_idx = _normalize_indices((obs_idx, var_idx), *read_indices(f)) result["obs"] = read_elem_partial( f["obs"], items=obs, indices=(obs_idx, slice(None)) ) result["var"] = read_elem_partial( f["var"], items=var, indices=(var_idx, slice(None)) ) if X: result["X"] = read_elem_partial(f["X"], indices=(obs_idx, var_idx)) else: result["X"] = sparse.csr_matrix((len(result["obs"]), len(result["var"]))) if "obsm" in f: result["obsm"] = _read_partial( f["obsm"], items=obsm, indices=(obs_idx, slice(None)) ) if "varm" in f: result["varm"] = _read_partial( f["varm"], items=varm, indices=(var_idx, slice(None)) ) if "obsp" in f: result["obsp"] = _read_partial( f["obsp"], items=obsp, indices=(obs_idx, obs_idx) ) if "varp" in f: result["varp"] = _read_partial( f["varp"], items=varp, indices=(var_idx, var_idx) ) if 
"layers" in f: result["layers"] = _read_partial( f["layers"], items=layers, indices=(obs_idx, var_idx) ) if "uns" in f: result["uns"] = _read_partial(f["uns"], items=uns) return ad.AnnData(**result) def _read_partial(group, *, items=None, indices=(slice(None), slice(None))): if group is None: return None if items is None: keys = intersect_keys((group,)) else: keys = intersect_keys((group, items)) result = {} for k in keys: if isinstance(items, Mapping): next_items = items.get(k, None) else: next_items = None result[k] = read_elem_partial(group[k], items=next_items, indices=indices) return result @_REGISTRY.register_write(ZarrGroup, AnnData, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_write(H5Group, AnnData, IOSpec("anndata", "0.1.0")) def write_anndata( f: GroupStorageType, k: str, adata: AnnData, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) _writer.write_elem(g, "X", adata.X, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "obs", adata.obs, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "var", adata.var, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "layers", dict(adata.layers), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "raw", adata.raw, dataset_kwargs=dataset_kwargs) @_REGISTRY.register_read(H5Group, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(H5Group, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(H5File, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) def read_anndata(elem: GroupStorageType | H5File, *, _reader: Reader) -> AnnData: d = {} for k in [ "X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns", "raw", ]: if k in elem: d[k] = _reader.read_elem(elem[k]) return AnnData(**d) @_REGISTRY.register_write(H5Group, Raw, IOSpec("raw", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, Raw, IOSpec("raw", "0.1.0")) def write_raw( f: GroupStorageType, k: str, raw: Raw, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) _writer.write_elem(g, "X", raw.X, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "var", raw.var, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "varm", dict(raw.varm), dataset_kwargs=dataset_kwargs) ######## # Null # ######## @_REGISTRY.register_read(H5Array, IOSpec("null", "0.1.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("null", "0.1.0")) def read_null(_elem, _reader) -> None: return None @_REGISTRY.register_write(H5Group, type(None), IOSpec("null", "0.1.0")) def write_null_h5py(f, k, _v, _writer, dataset_kwargs=MappingProxyType({})): f.create_dataset(k, data=h5py.Empty("f"), **dataset_kwargs) @_REGISTRY.register_write(ZarrGroup, type(None), IOSpec("null", "0.1.0")) def write_null_zarr(f, k, _v, _writer, dataset_kwargs=MappingProxyType({})): # zarr has no first-class null dataset if is_zarr_v2(): import zarr # zarr has no first-class null dataset f.create_dataset(k, data=zarr.empty(()), **dataset_kwargs) else: # TODO: why is this not actually storing the 
empty info with a f.empty call? # It fails complaining that k doesn't exist when updating the attributes. f.create_array(k, shape=(), dtype="bool") ############ # Mappings # ############ @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, AxisStorable]: return {k: _reader.read_elem(v) for k, v in dict(elem).items()} @_REGISTRY.register_write(H5Group, dict, IOSpec("dict", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, dict, IOSpec("dict", "0.1.0")) def write_mapping( f: GroupStorageType, k: str, v: dict[str, AxisStorable], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) for sub_k, sub_v in v.items(): _writer.write_elem(g, sub_k, sub_v, dataset_kwargs=dataset_kwargs) ############## # np.ndarray # ############## @_REGISTRY.register_write(H5Group, list, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, list, IOSpec("array", "0.2.0")) def write_list( f: GroupStorageType, k: str, elem: list[AxisStorable], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): _writer.write_elem(f, k, np.array(elem), dataset_kwargs=dataset_kwargs) # TODO: Is this the right behavior for MaskedArrays? # It's in the `AnnData.concatenate` docstring, but should we keep it? @_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0")) @zero_dim_array_as_scalar def write_basic( f: GroupStorageType, k: str, elem: views.ArrayView | np.ndarray | h5py.Dataset | np.ma.MaskedArray | ZarrArray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): """Write methods which underlying library handles natively.""" dataset_kwargs = dataset_kwargs.copy() dtype = dataset_kwargs.pop("dtype", elem.dtype) if isinstance(f, H5Group) or is_zarr_v2(): f.create_dataset(k, data=elem, shape=elem.shape, dtype=dtype, **dataset_kwargs) else: dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) f.create_array(k, shape=elem.shape, dtype=dtype, **dataset_kwargs) # see https://github.com/zarr-developers/zarr-python/discussions/2712 if isinstance(elem, ZarrArray): f[k][...] = elem[...] else: f[k][...] = elem def _iter_chunks_for_copy( elem: ArrayStorageType, dest: ArrayStorageType ) -> Iterator[slice | tuple[list[slice]]]: """ Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`. * If `dest` has chunks, it will return the chunks of `dest`. * If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger. 
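
    The caller below, ``write_chunked_dense_array_to_group``, consumes the
    returned slices directly::

        for chunk in _iter_chunks_for_copy(elem, dest):
            dest[chunk] = elem[chunk]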
""" if dest.chunks and hasattr(dest, "iter_chunks"): return dest.iter_chunks() else: shape = elem.shape # Number of rows that works out to n_rows = max( ad.settings.min_rows_for_chunked_h5_copy, elem.chunks[0] if elem.chunks is not None else 1, ) return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows)) @_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0")) def write_chunked_dense_array_to_group( f: H5Group, k: str, elem: ArrayStorageType, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): """Write to a h5py.Dataset in chunks. `h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory before writing. Instead, we will write in chunks to avoid this. We don't need to do this for zarr since zarr handles this automatically. """ dtype = dataset_kwargs.get("dtype", elem.dtype) kwargs = {**dataset_kwargs, "dtype": dtype} dest = f.create_dataset(k, shape=elem.shape, **kwargs) for chunk in _iter_chunks_for_copy(elem, dest): dest[chunk] = elem[chunk] _REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))( _to_cpu_mem_wrapper(write_basic) ) _REGISTRY.register_write(ZarrGroup, CupyArray, IOSpec("array", "0.2.0"))( _to_cpu_mem_wrapper(write_basic) ) @_REGISTRY.register_write(ZarrGroup, DaskArray, IOSpec("array", "0.2.0")) def write_basic_dask_zarr( f: ZarrGroup, k: str, elem: DaskArray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da dataset_kwargs = dataset_kwargs.copy() dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) if is_zarr_v2(): g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) else: g = f.require_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) da.store(elem, g, lock=GLOBAL_LOCK) # Adding this separately because h5py isn't serializable # https://github.com/pydata/xarray/issues/4242 @_REGISTRY.register_write(H5Group, DaskArray, IOSpec("array", "0.2.0")) def write_basic_dask_h5( f: H5Group, k: str, elem: DaskArray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da import dask.config as dc if dc.get("scheduler", None) == "dask.distributed": msg = "Cannot write dask arrays to hdf5 when using distributed scheduler" raise ValueError(msg) g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) da.store(elem, g) @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) def read_array(elem: ArrayStorageType, *, _reader: Reader) -> npt.NDArray: return elem[()] @_REGISTRY.register_read_partial(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read_partial(ZarrArray, IOSpec("string-array", "0.2.0")) def read_array_partial(elem, *, items=None, indices=(slice(None, None))): return elem[indices] @_REGISTRY.register_read_partial(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): return elem.oindex[indices] # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) def read_string_array(d: H5Array, *, _reader: Reader): return read_array(d.asstr(), _reader=_reader) @_REGISTRY.register_read_partial(H5Array, IOSpec("string-array", "0.2.0")) def read_string_array_partial(d, items=None, indices=slice(None)): return 
read_array_partial(d.asstr(), items=items, indices=indices) @_REGISTRY.register_write( H5Group, (views.ArrayView, "U"), IOSpec("string-array", "0.2.0") ) @_REGISTRY.register_write( H5Group, (views.ArrayView, "O"), IOSpec("string-array", "0.2.0") ) @_REGISTRY.register_write(H5Group, (np.ndarray, "U"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(H5Group, (np.ndarray, "O"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(H5Group, (np.ndarray, "T"), IOSpec("string-array", "0.2.0")) @zero_dim_array_as_scalar def write_vlen_string_array( f: H5Group, k: str, elem: np.ndarray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): """Write methods which underlying library handles nativley.""" str_dtype = h5py.special_dtype(vlen=str) f.create_dataset(k, data=elem.astype(str_dtype), dtype=str_dtype, **dataset_kwargs) @_REGISTRY.register_write( ZarrGroup, (views.ArrayView, "U"), IOSpec("string-array", "0.2.0") ) @_REGISTRY.register_write( ZarrGroup, (views.ArrayView, "O"), IOSpec("string-array", "0.2.0") ) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "U"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "O"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "T"), IOSpec("string-array", "0.2.0")) @zero_dim_array_as_scalar def write_vlen_string_array_zarr( f: ZarrGroup, k: str, elem: np.ndarray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): if is_zarr_v2(): import numcodecs if Version(numcodecs.__version__) < Version("0.13"): msg = "Old numcodecs version detected. Please update for improved performance and stability." warnings.warn(msg) # Workaround for https://github.com/zarr-developers/numcodecs/issues/514 if hasattr(elem, "flags") and not elem.flags.writeable: elem = elem.copy() f.create_dataset( k, shape=elem.shape, dtype=object, object_codec=numcodecs.VLenUTF8(), **dataset_kwargs, ) f[k][:] = elem else: from numcodecs import VLenUTF8 dataset_kwargs = dataset_kwargs.copy() dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) match ( ad.settings.zarr_write_format, Version(np.__version__) >= Version("2.0.0"), ): case 2, _: filters, dtype = [VLenUTF8()], object case 3, True: filters, dtype = None, np.dtypes.StringDType() case 3, False: filters, dtype = None, np.dtypes.ObjectDType() f.create_array( k, shape=elem.shape, dtype=dtype, filters=filters, **dataset_kwargs, ) f[k][:] = elem ############### # np.recarray # ############### def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: """This corrects compound dtypes to work with hdf5 files.""" new_dtype = [] for dt_name, (dt_type, _) in value.dtype.fields.items(): if dt_type.kind in {"U", "O"}: new_dtype.append((dt_name, h5py.special_dtype(vlen=str))) else: new_dtype.append((dt_name, dt_type)) return value.astype(new_dtype) @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) def read_recarray(d: ArrayStorageType, *, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) if H5PY_V3: value = _decode_structured_array(value, dtype=dtype) return value @_REGISTRY.register_write(H5Group, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_write(H5Group, np.recarray, IOSpec("rec-array", "0.2.0")) def write_recarray( f: H5Group, k: str, elem: np.ndarray | np.recarray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = 
MappingProxyType({}), ): f.create_dataset(k, data=_to_hdf5_vlen_strings(elem), **dataset_kwargs) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, np.recarray, IOSpec("rec-array", "0.2.0")) def write_recarray_zarr( f: ZarrGroup, k: str, elem: np.ndarray | np.recarray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): from anndata.compat import _to_fixed_length_strings elem = _to_fixed_length_strings(elem) if isinstance(f, H5Group) or is_zarr_v2(): f.create_dataset(k, data=elem, shape=elem.shape, **dataset_kwargs) else: dataset_kwargs = dataset_kwargs.copy() dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) # TODO: zarr’s on-disk format v3 doesn’t support this dtype f.create_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) f[k][...] = elem ################# # Sparse arrays # ################# def write_sparse_compressed( f: GroupStorageType, key: str, value: CSMatrix | CSArray, *, _writer: Writer, fmt: Literal["csr", "csc"], dataset_kwargs=MappingProxyType({}), ): g = f.require_group(key) g.attrs["shape"] = value.shape dataset_kwargs = dict(dataset_kwargs) indptr_dtype = dataset_kwargs.pop("indptr_dtype", value.indptr.dtype) # Allow resizing for hdf5 if isinstance(f, H5Group): dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs) dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) for attr_name in ["data", "indices", "indptr"]: attr = getattr(value, attr_name) dtype = indptr_dtype if attr_name == "indptr" else attr.dtype if isinstance(f, H5Group) or is_zarr_v2(): g.create_dataset( attr_name, data=attr, shape=attr.shape, dtype=dtype, **dataset_kwargs ) else: arr = g.create_array( attr_name, shape=attr.shape, dtype=dtype, **dataset_kwargs ) # see https://github.com/zarr-developers/zarr-python/discussions/2712 arr[...] = attr[...] 
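# The group written by `write_sparse_compressed` holds three one-dimensional
# datasets -- `data`, `indices` and `indptr` -- plus a `shape` attribute.
# A hypothetical read-back sketch (the file name "example.h5" and key "X" are
# illustrative only; `h5py` and `sparse` are already imported at the top of
# this module):
#
#     with h5py.File("example.h5", "r") as f:
#         g = f["X"]
#         mtx = sparse.csr_matrix(
#             (g["data"][...], g["indices"][...], g["indptr"][...]),
#             shape=tuple(g.attrs["shape"]),
#         )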
write_csr = partial(write_sparse_compressed, fmt="csr") write_csc = partial(write_sparse_compressed, fmt="csc") for store_type, (cls, spec, func) in product( (H5Group, ZarrGroup), [ # spmatrix (sparse.csr_matrix, IOSpec("csr_matrix", "0.1.0"), write_csr), (views.SparseCSRMatrixView, IOSpec("csr_matrix", "0.1.0"), write_csr), (sparse.csc_matrix, IOSpec("csc_matrix", "0.1.0"), write_csc), (views.SparseCSCMatrixView, IOSpec("csc_matrix", "0.1.0"), write_csc), # sparray (sparse.csr_array, IOSpec("csr_matrix", "0.1.0"), write_csr), (views.SparseCSRArrayView, IOSpec("csr_matrix", "0.1.0"), write_csr), (sparse.csc_array, IOSpec("csc_matrix", "0.1.0"), write_csc), (views.SparseCSCArrayView, IOSpec("csc_matrix", "0.1.0"), write_csc), # cupy spmatrix (CupyCSRMatrix, IOSpec("csr_matrix", "0.1.0"), _to_cpu_mem_wrapper(write_csr)), ( views.CupySparseCSRView, IOSpec("csr_matrix", "0.1.0"), _to_cpu_mem_wrapper(write_csr), ), (CupyCSCMatrix, IOSpec("csc_matrix", "0.1.0"), _to_cpu_mem_wrapper(write_csc)), ( views.CupySparseCSCView, IOSpec("csc_matrix", "0.1.0"), _to_cpu_mem_wrapper(write_csc), ), ], ): _REGISTRY.register_write(store_type, cls, spec)(func) @_REGISTRY.register_write(H5Group, _CSRDataset, IOSpec("", "0.1.0")) @_REGISTRY.register_write(H5Group, _CSCDataset, IOSpec("", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, _CSRDataset, IOSpec("", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, _CSCDataset, IOSpec("", "0.1.0")) def write_sparse_dataset( f: GroupStorageType, k: str, elem: _CSCDataset | _CSRDataset, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): write_sparse_compressed( f, k, elem._to_backed(), _writer=_writer, fmt=elem.format, dataset_kwargs=dataset_kwargs, ) # TODO: Cleaner way to do this f[k].attrs["encoding-type"] = f"{elem.format}_matrix" f[k].attrs["encoding-version"] = "0.1.0" @_REGISTRY.register_write(H5Group, (DaskArray, CupyArray), IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, (DaskArray, CupyArray), IOSpec("array", "0.2.0")) @_REGISTRY.register_write( H5Group, (DaskArray, CupyCSRMatrix), IOSpec("csr_matrix", "0.1.0") ) @_REGISTRY.register_write( H5Group, (DaskArray, CupyCSCMatrix), IOSpec("csc_matrix", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, (DaskArray, CupyCSRMatrix), IOSpec("csr_matrix", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, (DaskArray, CupyCSCMatrix), IOSpec("csc_matrix", "0.1.0") ) def write_cupy_dask_sparse(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): _writer.write_elem( f, k, elem.map_blocks(lambda x: x.get(), dtype=elem.dtype, meta=elem._meta.get()), dataset_kwargs=dataset_kwargs, ) @_REGISTRY.register_write( H5Group, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0") ) @_REGISTRY.register_write( H5Group, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0") ) def write_dask_sparse( f: GroupStorageType, k: str, elem: DaskArray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): sparse_format = elem._meta.format def as_int64_indices(x): x.indptr = x.indptr.astype(np.int64, copy=False) x.indices = x.indices.astype(np.int64, copy=False) return x if sparse_format == "csr": axis = 0 elif sparse_format == "csc": axis = 1 else: msg = f"Cannot write dask sparse arrays with format {sparse_format}" raise NotImplementedError(msg) def 
chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: result = [slice(None), slice(None)] result[axis] = slice(start, stop) return tuple(result) axis_chunks = elem.chunks[axis] chunk_start = 0 chunk_stop = axis_chunks[0] _writer.write_elem( f, k, as_int64_indices(elem[chunk_slice(chunk_start, chunk_stop)].compute()), dataset_kwargs=dataset_kwargs, ) disk_mtx = sparse_dataset(f[k]) for chunk_size in axis_chunks[1:]: chunk_start = chunk_stop chunk_stop += chunk_size disk_mtx.append(elem[chunk_slice(chunk_start, chunk_stop)].compute()) @_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse(elem: GroupStorageType, *, _reader: Reader) -> CSMatrix | CSArray: return sparse_dataset(elem).to_memory() @_REGISTRY.register_read_partial(H5Group, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read_partial(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read_partial(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read_partial(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))): return sparse_dataset(elem)[indices] ################# # Awkward array # ################# @_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_write( H5Group, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") ) def write_awkward( f: GroupStorageType, k: str, v: views.AwkwardArrayView | AwkArray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): from anndata.compat import awkward as ak group = f.require_group(k) if isinstance(v, views.AwkwardArrayView): # copy to remove the view attributes v = copy(v) form, length, container = ak.to_buffers(ak.to_packed(v)) group.attrs["length"] = length group.attrs["form"] = form.to_json() for k, v in container.items(): _writer.write_elem(group, k, v, dataset_kwargs=dataset_kwargs) @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) def read_awkward(elem: GroupStorageType, *, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") length = _read_attr(elem.attrs, "length") container = {k: _reader.read_elem(elem[k]) for k in elem.keys()} return ak.from_buffers(form, int(length), container) ############## # DataFrames # ############## @_REGISTRY.register_write(H5Group, views.DataFrameView, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_write(H5Group, pd.DataFrame, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, views.DataFrameView, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, pd.DataFrame, IOSpec("dataframe", "0.2.0")) def write_dataframe( f: GroupStorageType, key: str, df: views.DataFrameView | pd.DataFrame, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): # Check arguments for reserved in ("_index",): if reserved in df.columns: msg = f"{reserved!r} is a reserved name for dataframe columns." 
raise ValueError(msg) group = _require_group_write_dataframe(f, key, df) if not df.columns.is_unique: duplicates = list(df.columns[df.columns.duplicated()]) msg = f"Found repeated column names: {duplicates}. Column names must be unique." raise ValueError(msg) col_names = [check_key(c) for c in df.columns] group.attrs["column-order"] = col_names if df.index.name is not None: if df.index.name in col_names and not pd.Series( df.index, index=df.index ).equals(df[df.index.name]): msg = ( f"DataFrame.index.name ({df.index.name!r}) is also used by a column " "whose values are different. This is not supported. Please make sure " "the values are the same, or use a different name." ) raise ValueError(msg) index_name = df.index.name else: index_name = "_index" group.attrs["_index"] = check_key(index_name) # ._values is "the best" array representation. It's the true array backing the # object, where `.values` is always a np.ndarray and .array is always a pandas # array. _writer.write_elem( group, index_name, df.index._values, dataset_kwargs=dataset_kwargs ) for colname, series in df.items(): # TODO: this should write the "true" representation of the series (i.e. the underlying array or ndarray depending) _writer.write_elem( group, colname, series._values, dataset_kwargs=dataset_kwargs ) @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) def read_dataframe(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( {k: _reader.read_elem(elem[k]) for k in columns}, index=_reader.read_elem(elem[idx_key]), columns=columns if len(columns) else None, ) if idx_key != "_index": df.index.name = idx_key return df # TODO: Figure out what indices is allowed to be at each element @_REGISTRY.register_read_partial(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read_partial(ZarrGroup, IOSpec("dataframe", "0.2.0")) def read_dataframe_partial( elem, *, items=None, indices=(slice(None, None), slice(None, None)) ): if items is not None: columns = [ col for col in _read_attr(elem.attrs, "column-order") if col in items ] else: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( {k: read_elem_partial(elem[k], indices=indices[0]) for k in columns}, index=read_elem_partial(elem[idx_key], indices=indices[0]), columns=columns if len(columns) else None, ) if idx_key != "_index": df.index.name = idx_key return df # Backwards compat dataframe reading @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) def read_dataframe_0_1_0(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( {k: read_series(elem[k]) for k in columns}, index=read_series(elem[idx_key]), columns=columns if len(columns) else None, ) if idx_key != "_index": df.index.name = idx_key return df def read_series(dataset: h5py.Dataset) -> np.ndarray | pd.Categorical: # For reading older dataframes if "categories" in dataset.attrs: if isinstance(dataset, ZarrArray): import zarr parent_name = dataset.name.rstrip(dataset.basename).strip("/") parent = zarr.open(dataset.store, mode="r")[parent_name] else: parent = dataset.parent categories_dset = parent[_read_attr(dataset.attrs, "categories")] categories = 
read_elem(categories_dset) ordered = bool(_read_attr(categories_dset.attrs, "ordered", default=False)) return pd.Categorical.from_codes( read_elem(dataset), categories, ordered=ordered ) else: return read_elem(dataset) @_REGISTRY.register_read_partial(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read_partial(ZarrGroup, IOSpec("dataframe", "0.1.0")) def read_partial_dataframe_0_1_0( elem, *, items=None, indices=(slice(None), slice(None)) ): if items is None: items = slice(None) else: items = list(items) return read_elem(elem)[items].iloc[indices[0]] ############### # Categorical # ############### @_REGISTRY.register_write(H5Group, pd.Categorical, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, pd.Categorical, IOSpec("categorical", "0.2.0")) def write_categorical( f: GroupStorageType, k: str, v: pd.Categorical, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) g.attrs["ordered"] = bool(v.ordered) _writer.write_elem(g, "codes", v.codes, dataset_kwargs=dataset_kwargs) _writer.write_elem( g, "categories", v.categories._values, dataset_kwargs=dataset_kwargs ) @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) def read_categorical(elem: GroupStorageType, *, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), ordered=bool(_read_attr(elem.attrs, "ordered")), ) @_REGISTRY.register_read_partial(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read_partial(ZarrGroup, IOSpec("categorical", "0.2.0")) def read_partial_categorical(elem, *, items=None, indices=(slice(None),)): return pd.Categorical.from_codes( codes=read_elem_partial(elem["codes"], indices=indices), categories=read_elem(elem["categories"]), ordered=bool(_read_attr(elem.attrs, "ordered")), ) #################### # Pandas nullables # #################### @_REGISTRY.register_write( H5Group, pd.arrays.IntegerArray, IOSpec("nullable-integer", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, pd.arrays.IntegerArray, IOSpec("nullable-integer", "0.1.0") ) @_REGISTRY.register_write( H5Group, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0") ) @_REGISTRY.register_write( H5Group, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0") ) @_REGISTRY.register_write( ZarrGroup, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0") ) def write_nullable( f: GroupStorageType, k: str, v: pd.arrays.IntegerArray | pd.arrays.BooleanArray | pd.arrays.StringArray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): if ( isinstance(v, pd.arrays.StringArray) and not settings.allow_write_nullable_strings ): msg = ( "`anndata.settings.allow_write_nullable_strings` is False, " "because writing of `pd.arrays.StringArray` is new " "and not supported in anndata < 0.11, still use by many people. " "Opt-in to writing these arrays by toggling the setting to True." 
) raise RuntimeError(msg) g = f.require_group(k) values = ( v.to_numpy(na_value="") if isinstance(v, pd.arrays.StringArray) else v.to_numpy(na_value=0, dtype=v.dtype.numpy_dtype) ) _writer.write_elem(g, "values", values, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "mask", v.isna(), dataset_kwargs=dataset_kwargs) def _read_nullable( elem: GroupStorageType, *, _reader: Reader, # BaseMaskedArray array_type: Callable[ [NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray ], ) -> pd.api.extensions.ExtensionArray: return array_type( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]), ) def _string_array( values: np.ndarray, mask: np.ndarray ) -> pd.api.extensions.ExtensionArray: """Construct a string array from values and mask.""" arr = pd.array(values, dtype=pd.StringDtype()) arr[mask] = pd.NA return arr _REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))( read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray) ) _REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))( read_nullable_integer ) _REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))( read_nullable_boolean := partial(_read_nullable, array_type=pd.arrays.BooleanArray) ) _REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))( read_nullable_boolean ) _REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))( read_nullable_string := partial(_read_nullable, array_type=_string_array) ) _REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))( read_nullable_string ) ########### # Scalars # ########### @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) def read_scalar(elem: ArrayStorageType, *, _reader: Reader) -> np.number: # TODO: `item` ensures the return is in fact a scalar (needed after zarr v3 which now returns a 1 elem array) # https://github.com/zarr-developers/zarr-python/issues/2713 return elem[()].item() def _remove_scalar_compression_args(dataset_kwargs: Mapping[str, Any]) -> dict: # Can’t compress scalars, error is thrown dataset_kwargs = dict(dataset_kwargs) for arg in ( "compression", "compression_opts", "chunks", "shuffle", "fletcher32", "scaleoffset", "compressor", ): dataset_kwargs.pop(arg, None) return dataset_kwargs def write_scalar_zarr( f: ZarrGroup, key: str, value, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): # these args are ignored in v2: https://zarr.readthedocs.io/en/v2.18.4/api/hierarchy.html#zarr.hierarchy.Group.create_dataset # and error out in v3 dataset_kwargs = _remove_scalar_compression_args(dataset_kwargs) if is_zarr_v2(): return f.create_dataset(key, data=np.array(value), shape=(), **dataset_kwargs) else: from numcodecs import VLenUTF8 match ad.settings.zarr_write_format, value: case 2, str(): filters, dtype = [VLenUTF8()], object case 3, str(): filters, dtype = None, np.dtypes.StringDType() case _, _: filters, dtype = None, np.array(value).dtype a = f.create_array( key, shape=(), dtype=dtype, filters=filters, **dataset_kwargs, ) a[...] 
= np.array(value) def write_hdf5_scalar( f: H5Group, key: str, value, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): # Can’t compress scalars, error is thrown dataset_kwargs = _remove_scalar_compression_args(dataset_kwargs) f.create_dataset(key, data=np.array(value), **dataset_kwargs) for numeric_scalar_type in [ *(bool, np.bool_), *(np.uint8, np.uint16, np.uint32, np.uint64), *(int, np.int8, np.int16, np.int32, np.int64), *(float, *np.floating.__subclasses__()), *np.complexfloating.__subclasses__(), ]: _REGISTRY.register_write( H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0") )(write_hdf5_scalar) _REGISTRY.register_write( ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0") )(write_scalar_zarr) _REGISTRY.register_write(ZarrGroup, str, IOSpec("string", "0.2.0"))(write_scalar_zarr) _REGISTRY.register_write(ZarrGroup, np.str_, IOSpec("string", "0.2.0"))( write_scalar_zarr ) @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) def read_hdf5_string(elem: H5Array, *, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) def read_zarr_string(elem: ZarrArray, *, _reader: Reader) -> str: return str(elem[()]) _REGISTRY.register_read(H5Array, IOSpec("bytes", "0.2.0"))(read_scalar) _REGISTRY.register_read(ZarrArray, IOSpec("bytes", "0.2.0"))(read_scalar) @_REGISTRY.register_write(H5Group, np.str_, IOSpec("string", "0.2.0")) @_REGISTRY.register_write(H5Group, str, IOSpec("string", "0.2.0")) def write_string( f: H5Group, k: str, v: np.str_ | str, *, _writer: Writer, dataset_kwargs: Mapping[str, Any], ): dataset_kwargs = dataset_kwargs.copy() dataset_kwargs.pop("compression", None) dataset_kwargs.pop("compression_opts", None) f.create_dataset( k, data=np.array(v, dtype=h5py.string_dtype(encoding="utf-8")), **dataset_kwargs ) # @_REGISTRY.register_write(np.bytes_, IOSpec("bytes", "0.2.0")) # @_REGISTRY.register_write(bytes, IOSpec("bytes", "0.2.0")) # def write_string(f, k, v, dataset_kwargs): # if "compression" in dataset_kwargs: # dataset_kwargs = dict(dataset_kwargs) # dataset_kwargs.pop("compression") # f.create_dataset(k, data=np.array(v), **dataset_kwargs) python-anndata-0.12.0~rc1/src/anndata/_io/specs/registry.py000066400000000000000000000411121500370632200236220ustar00rootroot00000000000000from __future__ import annotations import inspect import warnings from collections.abc import Mapping from dataclasses import dataclass from functools import partial, singledispatch, wraps from types import MappingProxyType from typing import TYPE_CHECKING, Generic, TypeVar from anndata._io.utils import report_read_key_on_error, report_write_key_on_error from anndata._types import Read, ReadLazy, _ReadInternal, _ReadLazyInternal from anndata.compat import DaskArray, ZarrGroup, _read_attr, is_zarr_v2 if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable from typing import Any from anndata._types import ( GroupStorageType, ReadCallback, StorageType, Write, WriteCallback, _WriteInternal, ) from anndata.experimental.backed._compat import Dataset2D from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray from anndata.typing import RWAble T = TypeVar("T") W = TypeVar("W", bound=_WriteInternal) LazyDataStructures = DaskArray | Dataset2D | CategoricalArray | MaskedArray # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" # TODO: Should filetype be included in the IOSpec if it changes the encoding? 
Or does the intent that these things be "the same" overrule that? @dataclass(frozen=True) class IOSpec: encoding_type: str encoding_version: str # TODO: Should this subclass from LookupError? class IORegistryError(Exception): @classmethod def _from_write_parts( cls, dest_type: type, typ: type | tuple[type, str], modifiers: frozenset[str] ) -> IORegistryError: msg = f"No method registered for writing {typ} into {dest_type}" if modifiers: msg += f" with {modifiers}" return cls(msg) @classmethod def _from_read_parts( cls, method: str, registry: Mapping, src_typ: type[StorageType], spec: IOSpec, ) -> IORegistryError: # TODO: Improve error message if type exists, but version does not msg = ( f"No {method} method registered for {spec} from {src_typ}. " "You may need to update your installation of anndata." ) return cls(msg) def write_spec(spec: IOSpec): def decorator(func: W) -> W: @wraps(func) def wrapper(g: GroupStorageType, k: str, *args, **kwargs): result = func(g, k, *args, **kwargs) g[k].attrs.setdefault("encoding-type", spec.encoding_type) g[k].attrs.setdefault("encoding-version", spec.encoding_version) return result return wrapper return decorator _R = TypeVar("_R", _ReadInternal, _ReadLazyInternal) R = TypeVar("R", Read, ReadLazy) class IORegistry(Generic[_R, R]): def __init__(self): self.read: dict[tuple[type, IOSpec, frozenset[str]], _R] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal ] = {} self.write_specs: dict[type | tuple[type, str] | tuple[type, type], IOSpec] = {} def register_write( self, dest_type: type, src_type: type | tuple[type, str], spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), ) -> Callable[[_WriteInternal[T]], _WriteInternal[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) # Record specification for src_type if src_type in self.write_specs and (spec != self.write_specs[src_type]): # First check for consistency current_spec = self.write_specs[src_type] msg = ( "Cannot overwrite IO specifications. 
Attempted to overwrite encoding " f"for {src_type} from {current_spec} to {spec}" ) raise TypeError(msg) else: self.write_specs[src_type] = spec def _register(func): self.write[(dest_type, src_type, modifiers)] = write_spec(spec)(func) return func return _register def get_write( self, dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str] = frozenset(), *, writer: Writer, ) -> Write: import h5py if dest_type is h5py.File: dest_type = h5py.Group if (dest_type, src_type, modifiers) not in self.write: raise IORegistryError._from_write_parts(dest_type, src_type, modifiers) internal = self.write[(dest_type, src_type, modifiers)] return partial(internal, _writer=writer) def has_write( self, dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str], ) -> bool: return (dest_type, src_type, modifiers) in self.write def register_read( self, src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), ) -> Callable[[_R], _R]: spec = proc_spec(spec) modifiers = frozenset(modifiers) def _register(func): self.read[(src_type, spec, modifiers)] = func return func return _register def get_read( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset(), *, reader: Reader, ) -> R: if (src_type, spec, modifiers) not in self.read: raise IORegistryError._from_read_parts("read", self.read, src_type, spec) # noqa: EM101 internal = self.read[(src_type, spec, modifiers)] return partial(internal, _reader=reader) def has_read( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() ) -> bool: return (src_type, spec, modifiers) in self.read def register_read_partial( self, src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), ): spec = proc_spec(spec) modifiers = frozenset(modifiers) def _register(func): self.read_partial[(src_type, spec, modifiers)] = func return func return _register def get_partial_read( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() ): if (src_type, spec, modifiers) in self.read_partial: return self.read_partial[(src_type, spec, modifiers)] name = "read_partial" raise IORegistryError._from_read_parts(name, self.read_partial, src_type, spec) def get_spec(self, elem: Any) -> IOSpec: if isinstance(elem, DaskArray): if (typ_meta := (DaskArray, type(elem._meta))) in self.write_specs: return self.write_specs[typ_meta] elif hasattr(elem, "dtype"): if (typ_kind := (type(elem), elem.dtype.kind)) in self.write_specs: return self.write_specs[typ_kind] return self.write_specs[type(elem)] _REGISTRY: IORegistry[_ReadInternal, Read] = IORegistry() _LAZY_REGISTRY: IORegistry[_ReadLazyInternal, ReadLazy] = IORegistry() @singledispatch def proc_spec(spec) -> IOSpec: msg = f"proc_spec not defined for type: {type(spec)}." 
raise NotImplementedError(msg) @proc_spec.register(IOSpec) def proc_spec_spec(spec: IOSpec) -> IOSpec: return spec @proc_spec.register(Mapping) def proc_spec_mapping(spec: Mapping[str, str]) -> IOSpec: return IOSpec(**{k.replace("-", "_"): v for k, v in spec.items()}) def get_spec( elem: StorageType, ) -> IOSpec: return proc_spec( { k: _read_attr(elem.attrs, k, "") for k in ["encoding-type", "encoding-version"] } ) def _iter_patterns( elem, ) -> Generator[tuple[type, type | str] | tuple[type, type, str], None, None]: """Iterates over possible patterns for an element in order of precedence.""" from anndata.compat import DaskArray t = type(elem) if isinstance(elem, DaskArray): yield (t, type(elem._meta), elem.dtype.kind) yield (t, type(elem._meta)) if hasattr(elem, "dtype"): yield (t, elem.dtype.kind) yield t class Reader: def __init__( self, registry: IORegistry, callback: ReadCallback | None = None ) -> None: self.registry = registry self.callback = callback @report_read_key_on_error def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), ) -> RWAble: """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) read_func: Read = self.registry.get_read( type(elem), iospec, modifiers, reader=self ) if self.callback is None: return read_func(elem) return self.callback(read_func, elem.name, elem, iospec=iospec) class LazyReader(Reader): @report_read_key_on_error def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, **kwargs, ) -> LazyDataStructures: """Read a dask element from a store. See exported function for more details.""" iospec = get_spec(elem) read_func: ReadLazy = self.registry.get_read( type(elem), iospec, modifiers, reader=self ) if self.callback is not None: msg = "Dask reading does not use a callback. Ignoring callback." warnings.warn(msg, stacklevel=2) read_params = inspect.signature(read_func).parameters for kwarg in kwargs: if kwarg not in read_params: msg = ( f"Keyword argument {kwarg} passed to read_elem_lazy are not supported by the " "registered read function." ) raise ValueError(msg) if "chunks" in read_params: kwargs["chunks"] = chunks return read_func(elem, **kwargs) class Writer: def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry self.callback = callback def find_write_func( self, dest_type: type, elem: Any, modifiers: frozenset[str] ) -> Write: for pattern in _iter_patterns(elem): if self.registry.has_write(dest_type, pattern, modifiers): return self.registry.get_write( dest_type, pattern, modifiers, writer=self ) # Raises IORegistryError return self.registry.get_write(dest_type, type(elem), modifiers, writer=self) @report_write_key_on_error def write_elem( self, store: GroupStorageType, k: str, elem: RWAble, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), modifiers: frozenset[str] = frozenset(), ): from pathlib import PurePosixPath import h5py # we allow stores to have a prefix like /uns which are then written to with keys like /uns/foo if "/" in k.split(store.name)[-1][1:]: msg = "Forward slashes are not allowed in keys." 
raise ValueError(msg) if isinstance(store, h5py.File): store = store["/"] dest_type = type(store) # Normalize k to absolute path if ( (isinstance(store, ZarrGroup) and is_zarr_v2()) or isinstance(store, h5py.Group) and not PurePosixPath(k).is_absolute() ): k = str(PurePosixPath(store.name) / k) if k == "/": if isinstance(store, ZarrGroup) and not is_zarr_v2(): from zarr.core.sync import sync sync(store.store.clear()) else: store.clear() elif k in store: del store[k] write_func = self.find_write_func(dest_type, elem, modifiers) if self.callback is None: return write_func(store, k, elem, dataset_kwargs=dataset_kwargs) return self.callback( write_func, store, k, elem, dataset_kwargs=dataset_kwargs, iospec=self.registry.get_spec(elem), ) def read_elem(elem: StorageType) -> RWAble: """ Read an element from a store. Assumes that the element is encoded using the anndata encoding. This function will determine the encoded type using the encoding metadata stored in elem's attributes. Params ------ elem The stored element. """ return Reader(_REGISTRY).read_elem(elem) def read_elem_lazy( elem: StorageType, chunks: tuple[int, ...] | None = None, **kwargs ) -> LazyDataStructures: """ Read an element from a store lazily. Assumes that the element is encoded using the anndata encoding. This function will determine the encoded type using the encoding metadata stored in elem's attributes. Parameters ---------- elem The stored element. chunks, optional length `n`, the same `n` as the size of the underlying array. Note that the minor axis dimension must match the shape for sparse. Defaults to `(1000, adata.shape[1])` for CSR sparse, `(adata.shape[0], 1000)` for CSC sparse, and the on-disk chunking otherwise for dense. Can use `-1` or `None` to indicate use of the size of the corresponding dimension. Returns ------- A "lazy" elem Examples -------- Setting up our example: >>> from scanpy.datasets import pbmc3k >>> import tempfile >>> import anndata as ad >>> import zarr >>> tmp_path = tempfile.gettempdir() >>> zarr_path = tmp_path + "/adata.zarr" >>> adata = pbmc3k() >>> adata.layers["dense"] = adata.X.toarray() >>> adata.write_zarr(zarr_path) Reading a sparse matrix from a zarr store lazily, with custom chunk size and default: >>> g = zarr.open(zarr_path) >>> adata.X = ad.experimental.read_elem_lazy(g["X"]) >>> adata.X dask.array >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, adata.shape[1])) >>> adata.X dask.array Reading a dense matrix from a zarr store lazily: >>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"]) >>> adata.layers["dense"] dask.array Making a new anndata object from on-disk, with custom chunks: >>> adata = ad.AnnData( ... obs=ad.io.read_elem(g["obs"]), ... var=ad.io.read_elem(g["var"]), ... uns=ad.io.read_elem(g["uns"]), ... obsm=ad.io.read_elem(g["obsm"]), ... varm=ad.io.read_elem(g["varm"]), ... 
) >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, adata.shape[1])) >>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"]) We also support using -1 and None as a chunk size to signify the reading the whole axis: >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, -1)) >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, None)) """ return LazyReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks, **kwargs) def write_elem( store: GroupStorageType, k: str, elem: RWAble, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: """ Write an element to a storage group using anndata encoding. Params ------ store The group to write to. k The key to write to in the group. Note that absolute paths will be written from the root. elem The element to write. Typically an in-memory object, e.g. an AnnData, pandas dataframe, scipy sparse matrix, etc. dataset_kwargs Keyword arguments to pass to the stores dataset creation function. E.g. for zarr this would be `chunks`, `compressor`. """ Writer(_REGISTRY).write_elem(store, k, elem, dataset_kwargs=dataset_kwargs) # TODO: If all items would be read, just call normal read method def read_elem_partial( elem, *, items=None, indices=(slice(None), slice(None)), modifiers: frozenset[str] = frozenset(), ): """Read part of an element from an on disk store.""" read_partial = _REGISTRY.get_partial_read( type(elem), get_spec(elem), frozenset(modifiers) ) return read_partial(elem, items=items, indices=indices) python-anndata-0.12.0~rc1/src/anndata/_io/utils.py000066400000000000000000000225671500370632200220120ustar00rootroot00000000000000from __future__ import annotations from functools import WRAPPER_ASSIGNMENTS, wraps from itertools import pairwise from typing import TYPE_CHECKING, cast from warnings import warn import h5py from packaging.version import Version from .._core.sparse_dataset import BaseCompressedSparseDataset if TYPE_CHECKING: from collections.abc import Callable, Mapping from typing import Any, Literal from .._types import ContravariantRWAble, StorageType, _WriteInternal from ..compat import H5Group, ZarrGroup from .specs.registry import Writer Storage = StorageType | BaseCompressedSparseDataset # For allowing h5py v3 # https://github.com/scverse/anndata/issues/442 H5PY_V3 = Version(h5py.__version__).major >= 3 # ------------------------------------------------------------------------------- # Type conversion # ------------------------------------------------------------------------------- # Could be numba’d if it returned tuples instead of slices def idx_chunks_along_axis(shape: tuple, axis: int, chunk_size: int): """\ Gives indexer tuples chunked along an axis. Params ------ shape Shape of array to be chunked axis Axis to chunk along chunk_size Size of chunk along axis Returns ------- An iterator of tuples for indexing into an array of passed shape. """ total = shape[axis] cur = 0 mutable_idx = [slice(None) for i in range(len(shape))] while cur + chunk_size < total: mutable_idx[axis] = slice(cur, cur + chunk_size) yield tuple(mutable_idx) cur += chunk_size mutable_idx[axis] = slice(cur, None) yield tuple(mutable_idx) def is_float(string): """\ Check whether string is float. 
See also -------- http://stackoverflow.com/questions/736043/checking-if-a-string-can-be-converted-to-float-in-python """ try: float(string) return True except ValueError: return False def is_int(string): """Check whether string is integer.""" try: int(string) return True except ValueError: return False def convert_bool(string): """Check whether string is boolean.""" if string == "True": return True, True elif string == "False": return True, False else: return False, False def convert_string(string): """Convert string to int, float or bool.""" if is_int(string): return int(string) elif is_float(string): return float(string) elif convert_bool(string)[0]: return convert_bool(string)[1] elif string == "None": return None else: return string def check_key(key): """Checks that passed value is a valid h5py key. Should convert it if there is an obvious conversion path, error otherwise. """ typ = type(key) if issubclass(typ, str): return str(key) # TODO: Should I try to decode bytes? It's what h5py would do, # but it will be read out as a str. # elif issubclass(typ, bytes): # return key else: msg = f"{key} of type {typ} is an invalid key. Should be str." raise TypeError(msg) # ------------------------------------------------------------------------------- # Generic functions # ------------------------------------------------------------------------------- def read_attribute(*args, **kwargs): from .specs import read_elem warn( "This internal function has been deprecated, please use read_elem instead", FutureWarning, ) return read_elem(*args, **kwargs) def write_attribute(*args, **kwargs): from .specs import write_elem warn( "This internal function has been deprecated, please use write_elem instead", FutureWarning, ) return write_elem(*args, **kwargs) # ------------------------------------------------------------------------------- # Errors handling # ------------------------------------------------------------------------------- # TODO: Is there a consistent way to do this which just modifies the previously # thrown error? Could do a warning? class AnnDataReadError(OSError): """Error caused while trying to read in AnnData.""" pass def _get_display_path(store: Storage) -> str: """Return an absolute path of an element (always starts with “/”).""" if isinstance(store, BaseCompressedSparseDataset): store = store.group path = store.name or "??" # can be None return f"/{path.removeprefix('/')}" def add_key_note( e: BaseException, store: Storage, path: str, key: str, op: Literal["read", "writ"] ) -> None: if any( f"Error raised while {op}ing key" in note for note in getattr(e, "__notes__", []) ): return dir = "to" if op == "writ" else "from" msg = f"Error raised while {op}ing key {key!r} of {type(store)} {dir} {path}" e.add_note(msg) def report_read_key_on_error(func): """\ A decorator for hdf5/zarr element reading which makes keys involved in errors get reported. Example ------- >>> import zarr >>> import numpy as np >>> @report_read_key_on_error ... def read_arr(group): ... raise NotImplementedError() >>> z = zarr.open("tmp.zarr", mode="w") >>> z["X"] = np.array([1, 2, 3]) >>> read_arr(z["X"]) # doctest: +SKIP """ @wraps(func) def func_wrapper(*args, **kwargs): from anndata._io.specs import Reader # Figure out signature (method vs function) by going through args for arg in args: if not isinstance(arg, Reader): store = cast("Storage", arg) break else: msg = "No element found in args." 
raise ValueError(msg) try: return func(*args, **kwargs) except Exception as e: path, key = _get_display_path(store).rsplit("/", 1) add_key_note(e, store, path or "/", key, "read") raise return func_wrapper def report_write_key_on_error(func): """\ A decorator for hdf5/zarr element writing which makes keys involved in errors get reported. Example ------- >>> import zarr >>> @report_write_key_on_error ... def write_arr(group, key, val): ... raise NotImplementedError() >>> z = zarr.open("tmp.zarr", mode="w") >>> X = [1, 2, 3] >>> write_arr(z, "X", X) # doctest: +SKIP """ @wraps(func) def func_wrapper(*args, **kwargs): from anndata._io.specs import Writer # Figure out signature (method vs function) by going through args for arg, key in pairwise(args): if not isinstance(arg, Writer): store = cast("Storage", arg) break else: msg = "No element found in args." raise ValueError(msg) try: return func(*args, **kwargs) except Exception as e: path = _get_display_path(store) add_key_note(e, store, path, key, "writ") raise return func_wrapper # ------------------------------------------------------------------------------- # Common h5ad/zarr stuff # ------------------------------------------------------------------------------- def _read_legacy_raw( f: ZarrGroup | H5Group, modern_raw, # TODO: type read_df: Callable, read_attr: Callable, *, attrs=("X", "var", "varm"), ) -> dict: """\ Backwards compat for reading legacy raw. Makes sure that no modern raw group coexists with legacy raw.* groups. """ if modern_raw: if any(k.startswith("raw.") for k in f): what = f"File {f.filename}" if hasattr(f, "filename") else "Store" msg = f"{what} has both legacy and current raw formats." raise ValueError(msg) return modern_raw raw = {} if "X" in attrs and "raw.X" in f: raw["X"] = read_attr(f["raw.X"]) if "var" in attrs and "raw.var" in f: raw["var"] = read_df(f["raw.var"]) # Backwards compat if "varm" in attrs and "raw.varm" in f: raw["varm"] = read_attr(f["raw.varm"]) return raw def zero_dim_array_as_scalar(func: _WriteInternal): """\ A decorator for write_elem implementations of arrays where zero-dimensional arrays need special handling. """ @wraps(func, assigned=WRAPPER_ASSIGNMENTS + ("__defaults__", "__kwdefaults__")) def func_wrapper( f: StorageType, k: str, elem: ContravariantRWAble, *, _writer: Writer, dataset_kwargs: Mapping[str, Any], ): if elem.shape == (): _writer.write_elem(f, k, elem[()], dataset_kwargs=dataset_kwargs) else: func(f, k, elem, _writer=_writer, dataset_kwargs=dataset_kwargs) return func_wrapper def no_write_dataset_2d(write): def raise_error_if_dataset_2d_present(store, adata, *args, **kwargs): from anndata.experimental.backed._compat import has_dataset_2d if has_dataset_2d(adata): msg = ( "Writing AnnData objects with a Dataset2D not supported yet. " "Please use `ds.to_memory` to bring the dataset into memory. " "Note that if you have generated this object by concatenating several `AnnData` objects" "the original types may be lost." 
) raise NotImplementedError(msg) return write(store, adata, *args, **kwargs) return raise_error_if_dataset_2d_present python-anndata-0.12.0~rc1/src/anndata/_io/write.py000066400000000000000000000114201500370632200217660ustar00rootroot00000000000000from __future__ import annotations import math import warnings from os import fspath from pathlib import Path from typing import TYPE_CHECKING import numpy as np import pandas as pd from scipy.sparse import issparse from anndata._io.utils import no_write_dataset_2d from .._warnings import WriteWarning from ..compat import old_positionals from ..logging import get_logger if TYPE_CHECKING: from os import PathLike from .. import AnnData logger = get_logger(__name__) @no_write_dataset_2d @old_positionals("skip_data", "sep") def write_csvs( dirname: PathLike[str] | str, adata: AnnData, *, skip_data: bool = True, sep: str = ",", ): """See :meth:`~anndata.AnnData.write_csvs`.""" dirname = Path(dirname) if dirname.suffix == ".csv": dirname = dirname.with_suffix("") logger.info(f"writing .csv files to {dirname}") if not dirname.is_dir(): dirname.mkdir(parents=True, exist_ok=True) dir_uns = dirname / "uns" if not dir_uns.is_dir(): dir_uns.mkdir(parents=True, exist_ok=True) d = dict( obs=adata._obs, var=adata._var, obsm=adata.obsm.to_df(), varm=adata.varm.to_df(), ) if not skip_data: d["X"] = pd.DataFrame(adata.X.toarray() if issparse(adata.X) else adata.X) d_write = {**d, **adata._uns} not_yet_raised_sparse_warning = True for key, value in d_write.items(): if issparse(value): if not_yet_raised_sparse_warning: warnings.warn("Omitting to write sparse annotation.", WriteWarning) not_yet_raised_sparse_warning = False continue filename = dirname if key not in {"X", "var", "obs", "obsm", "varm"}: filename = dir_uns filename /= f"{key}.csv" df = value if not isinstance(value, pd.DataFrame): value = np.array(value) if np.ndim(value) == 0: value = value[None] try: df = pd.DataFrame(value) except Exception as e: warnings.warn( f"Omitting to write {key!r} of type {type(e)}.", WriteWarning, ) continue df.to_csv( filename, sep=sep, header=key in {"obs", "var", "obsm", "varm"}, index=key in {"obs", "var"}, ) @no_write_dataset_2d @old_positionals("write_obsm_varm") def write_loom( filename: PathLike[str] | str, adata: AnnData, *, write_obsm_varm: bool = False ) -> None: """See :meth:`~anndata.AnnData.write_loom`.""" filename = Path(filename) row_attrs = {k: np.array(v) for k, v in adata.var.to_dict("list").items()} row_names = adata.var_names row_dim = row_names.name if row_names.name is not None else "var_names" row_attrs[row_dim] = row_names.values col_attrs = {k: np.array(v) for k, v in adata.obs.to_dict("list").items()} col_names = adata.obs_names col_dim = col_names.name if col_names.name is not None else "obs_names" col_attrs[col_dim] = col_names.values if adata.X is None: msg = "loompy does not accept empty matrices as data" raise ValueError(msg) if write_obsm_varm: for key in adata.obsm.keys(): col_attrs[key] = adata.obsm[key] for key in adata.varm.keys(): row_attrs[key] = adata.varm[key] elif len(adata.obsm.keys()) > 0 or len(adata.varm.keys()) > 0: logger.warning( f"The loom file will lack these fields:\n" f"{adata.obsm.keys() | adata.varm.keys()}\n" f"Use write_obsm_varm=True to export multi-dimensional annotations" ) layers = {"": adata.X.T} for key in adata.layers.keys(): layers[key] = adata.layers[key].T from loompy import create if filename.exists(): filename.unlink() create(fspath(filename), layers, row_attrs=row_attrs, col_attrs=col_attrs) def 
_get_chunk_indices(za): # TODO: does zarr provide code for this? """\ Return all the indices (coordinates) for the chunks in a zarr array, even empty ones. """ return [ (i, j) for i in range(int(math.ceil(float(za.shape[0]) / za.chunks[0]))) for j in range(int(math.ceil(float(za.shape[1]) / za.chunks[1]))) ] def _write_in_zarr_chunks(za, key, value): if key != "X": za[:] = value # don’t chunk metadata else: for ci in _get_chunk_indices(za): s0, e0 = za.chunks[0] * ci[0], za.chunks[0] * (ci[0] + 1) s1, e1 = za.chunks[1] * ci[1], za.chunks[1] * (ci[1] + 1) print(ci, s0, e1, s1, e1) if issparse(value): za[s0:e0, s1:e1] = value[s0:e0, s1:e1].todense() else: za[s0:e0, s1:e1] = value[s0:e0, s1:e1] python-anndata-0.12.0~rc1/src/anndata/_io/zarr.py000066400000000000000000000130331500370632200216140ustar00rootroot00000000000000from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, TypeVar from warnings import warn import numpy as np import pandas as pd import zarr from scipy import sparse from .._core.anndata import AnnData from .._settings import settings from .._warnings import OldFormatWarning from ..compat import _clean_uns, _from_fixed_length_strings, is_zarr_v2 from ..experimental import read_dispatched, write_dispatched from .specs import read_elem from .utils import _read_legacy_raw, no_write_dataset_2d, report_read_key_on_error if TYPE_CHECKING: from collections.abc import MutableMapping from os import PathLike from zarr.core.common import AccessModeLiteral from zarr.storage import StoreLike T = TypeVar("T") def _check_rec_array(adata): if settings.zarr_write_format == 3 and len( structured_dtype_keys := { k for k in adata.uns.keys() if isinstance(adata.uns[k], np.recarray) or (isinstance(adata.uns[k], np.ndarray) and adata.uns[k].dtype.kind == "V") } ): msg = f"zarr v3 does not support structured dtypes. Found keys {structured_dtype_keys}" raise NotImplementedError(msg) @no_write_dataset_2d def write_zarr( store: StoreLike, adata: AnnData, *, chunks: tuple[int, ...] | None = None, convert_strings_to_categoricals: bool = True, **ds_kwargs, ) -> None: """See :meth:`~anndata.AnnData.write_zarr`.""" _check_rec_array(adata) if isinstance(store, Path): store = str(store) if convert_strings_to_categoricals: adata.strings_to_categoricals() if adata.raw is not None: adata.strings_to_categoricals(adata.raw.var) # TODO: Use spec writing system for this f = open_write_group(store) f.attrs.setdefault("encoding-type", "anndata") f.attrs.setdefault("encoding-version", "0.1.0") def callback(func, s, k: str, elem, dataset_kwargs, iospec): if ( chunks is not None and not isinstance(elem, sparse.spmatrix) and k.lstrip("/") == "X" ): dataset_kwargs = dict(dataset_kwargs, chunks=chunks) func(s, k, elem, dataset_kwargs=dataset_kwargs) write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs) if is_zarr_v2(): zarr.convenience.consolidate_metadata(f.store) else: zarr.consolidate_metadata(f.store) def read_zarr(store: PathLike[str] | str | MutableMapping | zarr.Group) -> AnnData: """\ Read from a hierarchical Zarr array store. Parameters ---------- store The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class. 
""" if isinstance(store, Path): store = str(store) if isinstance(store, zarr.Group): f = store else: f = zarr.open(store, mode="r") # Read with handling for backwards compat def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ k: read_dispatched(v, callback) for k, v in dict(elem).items() if not k.startswith("raw.") } ) elif elem_name.startswith("/raw."): return None elif elem_name in {"/obs", "/var"}: return read_dataframe(elem) elif elem_name == "/raw": # Backwards compat return _read_legacy_raw(f, func(elem), read_dataframe, func) return func(elem) adata = read_dispatched(f, callback=callback) # Backwards compat (should figure out which version) if "raw.X" in f: raw = AnnData(**_read_legacy_raw(f, adata.raw, read_dataframe, read_elem)) raw.obs_names = adata.obs_names adata.raw = raw # Backwards compat for <0.7 if isinstance(f["obs"], zarr.Array): _clean_uns(adata) return adata @report_read_key_on_error def read_dataset(dataset: zarr.Array): """Legacy method for reading datasets without encoding_type.""" value = dataset[...] if not hasattr(value, "dtype"): return value elif isinstance(value.dtype, str): pass elif issubclass(value.dtype.type, np.bytes_): value = value.astype(str).astype(object) # bytestring -> unicode -> str elif len(value.dtype.descr) > 1: # Compound dtype # For backwards compat, now strings are written as variable length value = _from_fixed_length_strings(value) if value.shape == (): value = value[()] return value @report_read_key_on_error def read_dataframe_legacy(dataset: zarr.Array) -> pd.DataFrame: """Reads old format of dataframes""" # NOTE: Likely that categoricals need to be removed from uns warn( f"'{dataset.name}' was written with a very old version of AnnData. " "Consider rewriting it.", OldFormatWarning, ) df = pd.DataFrame(_from_fixed_length_strings(dataset[()])) df.set_index(df.columns[0], inplace=True) return df @report_read_key_on_error def read_dataframe(group: zarr.Group | zarr.Array) -> pd.DataFrame: # Fast paths if isinstance(group, zarr.Array): return read_dataframe_legacy(group) else: return read_elem(group) def open_write_group( store: StoreLike, *, mode: AccessModeLiteral = "w", **kwargs ) -> zarr.Group: if not is_zarr_v2() and "zarr_format" not in kwargs: kwargs["zarr_format"] = settings.zarr_write_format return zarr.open_group(store, mode=mode, **kwargs) python-anndata-0.12.0~rc1/src/anndata/_settings.py000066400000000000000000000357571500370632200221100ustar00rootroot00000000000000from __future__ import annotations import inspect import os import textwrap import warnings from collections.abc import Iterable from contextlib import contextmanager from dataclasses import dataclass, field, fields from enum import Enum from functools import partial from inspect import Parameter, signature from types import GenericAlias from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar, cast if TYPE_CHECKING: from collections.abc import Callable, Sequence from typing import Any, TypeGuard T = TypeVar("T") class DeprecatedOption(NamedTuple): option: str message: str | None removal_version: str | None def _is_plain_type(obj: object) -> TypeGuard[type]: return isinstance(obj, type) and not isinstance(obj, GenericAlias) def describe(self: RegisteredOption, *, as_rst: bool = False) -> str: type_str = self.type.__name__ if _is_plain_type(self.type) else str(self.type) if as_rst: default_str = repr(self.default_value).replace("\\", "\\\\") doc = f"""\ .. 
attribute:: settings.{self.option} :type: {type_str} :value: {default_str} {self.description} """ else: doc = f"""\ {self.option}: `{type_str}` {self.description} (default: `{self.default_value!r}`). """ return textwrap.dedent(doc) class RegisteredOption(NamedTuple, Generic[T]): option: str default_value: T description: str validate: Callable[[T], None] type: object describe = describe def check_and_get_environ_var( key: str, default_value: str, allowed_values: Sequence[str] | None = None, cast: Callable[[Any], T] | type[Enum] = lambda x: x, ) -> T: """Get the environment variable and return it is a (potentially) non-string, usable value. Parameters ---------- key The environment variable name. default_value The default value for `os.environ.get`. allowed_values Allowable string values., by default None cast Casting from the string to a (potentially different) python object, by default lambdax:x Returns ------- The casted value. """ environ_value_or_default_value = os.environ.get(key, default_value) if ( allowed_values is not None and environ_value_or_default_value not in allowed_values ): msg = ( f"Value {environ_value_or_default_value!r} is not in allowed {allowed_values} for environment variable {key}. " f"Default {default_value} will be used." ) warnings.warn(msg) environ_value_or_default_value = default_value return ( cast(environ_value_or_default_value) if not isinstance(cast, type(Enum)) else cast[environ_value_or_default_value] ) def check_and_get_bool(option, default_value): return check_and_get_environ_var( f"ANNDATA_{option.upper()}", str(int(default_value)), ["0", "1"], lambda x: bool(int(x)), ) def check_and_get_int(option, default_value): return check_and_get_environ_var( f"ANNDATA_{option.upper()}", str(int(default_value)), None, lambda x: int(x), ) _docstring = """ This manager allows users to customize settings for the anndata package. Settings here will generally be for advanced use-cases and should be used with caution. The following options are available: {options_description} For setting an option please use :func:`~anndata.settings.override` (local) or set the above attributes directly (global) i.e., `anndata.settings.my_setting = foo`. For assignment by environment variable, use the variable name in all caps with `ANNDATA_` as the prefix before import of :mod:`anndata`. For boolean environment variable setting, use 1 for `True` and 0 for `False`. """ @dataclass class SettingsManager: _registered_options: dict[str, RegisteredOption] = field(default_factory=dict) _deprecated_options: dict[str, DeprecatedOption] = field(default_factory=dict) _config: dict[str, object] = field(default_factory=dict) __doc_tmpl__: str = _docstring def describe( self, option: str | Iterable[str] | None = None, *, should_print_description: bool = True, as_rst: bool = False, ) -> str: """Print and/or return a (string) description of the option(s). Parameters ---------- option Option(s) to be described, by default None (i.e., do all option) should_print_description Whether or not to print the description in addition to returning it. Returns ------- The description. 
""" describe = partial( self.describe, should_print_description=should_print_description, as_rst=as_rst, ) if option is None: return describe(self._registered_options.keys()) if isinstance(option, Iterable) and not isinstance(option, str): return "\n".join([describe(k) for k in option]) registered_option = self._registered_options[option] doc = registered_option.describe(as_rst=as_rst).rstrip("\n") if option in self._deprecated_options: opt = self._deprecated_options[option] if opt.message is not None: doc += f" *{opt.message}" doc += f" {option} will be removed in {opt.removal_version}.*" if should_print_description: print(doc) return doc def deprecate( self, option: str, removal_version: str, message: str | None = None ) -> None: """Deprecate options with a message at a version. Parameters ---------- option Which option should be deprecated. removal_version The version targeted for removal. message A custom message. """ self._deprecated_options[option] = DeprecatedOption( option, message, removal_version ) def register( self, option: str, default_value: T, description: str, validate: Callable[[T], None], option_type: object | None = None, get_from_env: Callable[[str, T], T] = lambda x, y: y, ) -> None: """Register an option so it can be set/described etc. by end-users Parameters ---------- option Option to be set. default_value Default value with which to set the option. description Description to be used in the docstring. validate A function which raises a `ValueError` or `TypeError` if the value is invalid. option_type Optional override for the option type to be displayed. Otherwise `type(default_value)`. get_from_env An optional function which takes as arguments the name of the option and a default value and returns the value from the environment variable `ANNDATA_CAPS_OPTION` (or default if not present). Default behavior is to return `default_value` without checking the environment. """ try: validate(default_value) except (ValueError, TypeError) as e: e.add_note(f"for option {option!r}") raise e option_type = type(default_value) if option_type is None else option_type self._registered_options[option] = RegisteredOption( option, default_value, description, validate, option_type ) self._config[option] = get_from_env(option, default_value) self._update_override_function_for_new_option(option) def _update_override_function_for_new_option( self, option: str, ): """This function updates the keyword arguments, docstring, and annotations of the `SettingsManager.override` function as the `SettingsManager.register` method is called. Parameters ---------- option The option being registered for which the override function needs updating. """ option_type = self._registered_options[option].type # Update annotations for type checking. self.override.__annotations__[option] = option_type # __signature__ needs to be updated for tab autocompletion in IPython. # See https://github.com/ipython/ipython/issues/11624 for inspiration. self.override.__func__.__signature__ = signature(self.override).replace( parameters=[ Parameter(name="self", kind=Parameter.POSITIONAL_ONLY), *[ Parameter( name=k, annotation=option_type, kind=Parameter.KEYWORD_ONLY, ) for k in self._registered_options ], ] ) # Update docstring for `SettingsManager.override` as well. 
doc = cast("str", self.override.__doc__) insert_index = doc.find("\n Yields") option_docstring = "\t" + "\t".join( self.describe(option, should_print_description=False).splitlines( keepends=True ) ) self.override.__func__.__doc__ = ( f"{doc[:insert_index]}\n{option_docstring}{doc[insert_index:]}" ) def __setattr__(self, option: str, val: object) -> None: """ Set an option to a value. To see the allowed option to be set and their description, use describe_option. Parameters ---------- option Option to be set. val Value with which to set the option. Raises ------ AttributeError If the option has not been registered, this function will raise an error. """ if option in {f.name for f in fields(self)}: return super().__setattr__(option, val) elif option not in self._registered_options: msg = ( f"{option} is not an available option for anndata. " "Please open an issue if you believe this is a mistake." ) raise AttributeError(msg) registered_option = self._registered_options[option] registered_option.validate(val) self._config[option] = val def __getattr__(self, option: str) -> object: """ Gets the option's value. Parameters ---------- option Option to be got. Returns ------- Value of the option. """ if option in self._deprecated_options: deprecated = self._deprecated_options[option] msg = f"{option!r} will be removed in {deprecated.removal_version}. {deprecated.message}" warnings.warn(msg, FutureWarning) if option in self._config: return self._config[option] msg = f"{option} not found." raise AttributeError(msg) def __dir__(self) -> Iterable[str]: return sorted((*dir(super()), *self._config.keys())) def reset(self, option: Iterable[str] | str) -> None: """ Resets option(s) to its (their) default value(s). Parameters ---------- option The option(s) to be reset. """ if isinstance(option, Iterable) and not isinstance(option, str): for opt in option: self.reset(opt) else: self._config[option] = self._registered_options[option].default_value @contextmanager def override(self, **overrides): """ Provides local override via keyword arguments as a context manager. 
Parameters ---------- Yields ------ None """ restore = {a: getattr(self, a) for a in overrides} try: for attr, value in overrides.items(): setattr(self, attr, value) yield None finally: for attr, value in restore.items(): setattr(self, attr, value) def __repr__(self) -> str: params = "".join(f"\t{k}={v!r},\n" for k, v in self._config.items()) return f"{type(self).__name__}(\n{params}\n)" @property def __doc__(self): in_sphinx = any("/sphinx/" in frame.filename for frame in inspect.stack()) options_description = self.describe( should_print_description=False, as_rst=in_sphinx ) return self.__doc_tmpl__.format( options_description=options_description, ) settings = SettingsManager() ################################################################################## # PLACE REGISTERED SETTINGS HERE SO THEY CAN BE PICKED UP FOR DOCSTRING CREATION # ################################################################################## V = TypeVar("V") def gen_validator(_type: type[V]) -> Callable[[V], None]: def validate_type(val: V) -> None: if not isinstance(val, _type): msg = f"{val} not valid {_type}" raise TypeError(msg) return validate_type validate_bool = gen_validator(bool) validate_int = gen_validator(int) settings.register( "remove_unused_categories", default_value=True, description="Whether or not to remove unused categories with :class:`~pandas.Categorical`.", validate=validate_bool, get_from_env=check_and_get_bool, ) settings.register( "check_uniqueness", default_value=True, description=( "Whether or not to check uniqueness of the `obs` indices on `__init__` of :class:`~anndata.AnnData`." ), validate=validate_bool, get_from_env=check_and_get_bool, ) settings.register( "allow_write_nullable_strings", default_value=False, description="Whether or not to allow writing of `pd.arrays.StringArray`.", validate=validate_bool, get_from_env=check_and_get_bool, ) def validate_zarr_write_format(format: int): validate_int(format) if format not in {2, 3}: msg = "non-v2 zarr on-disk format not supported" raise ValueError(msg) settings.register( "zarr_write_format", default_value=2, description="Which version of zarr to write to.", validate=validate_zarr_write_format, get_from_env=lambda name, default: check_and_get_environ_var( f"ANNDATA_{name.upper()}", str(default), ["2", "3"], lambda x: int(x), ), ) def validate_sparse_settings(val: Any) -> None: validate_bool(val) settings.register( "use_sparse_array_on_read", default_value=False, description="Whether or not to use :class:`scipy.sparse.sparray` as the default class when reading in data", validate=validate_bool, get_from_env=check_and_get_bool, ) settings.register( "min_rows_for_chunked_h5_copy", default_value=1000, description="Minimum number of rows at a time to copy when writing out an H5 Dataset to a new location", validate=validate_int, get_from_env=check_and_get_int, ) ################################################################################## ################################################################################## python-anndata-0.12.0~rc1/src/anndata/_types.py000066400000000000000000000123301500370632200213720ustar00rootroot00000000000000""" Defines some useful types for this library. Should probably be cleaned up before thinking about exporting. 
""" from __future__ import annotations from typing import TYPE_CHECKING, Literal, Protocol, TypeVar from .compat import H5Array, H5Group, ZarrArray, ZarrGroup from .typing import RWAble if TYPE_CHECKING: from collections.abc import Mapping from typing import Any, TypeAlias from ._io.specs.registry import ( IOSpec, LazyDataStructures, LazyReader, Reader, Writer, ) __all__ = [ "ArrayStorageType", "GroupStorageType", "StorageType", "_ReadInternal", "_ReadLazyInternal", "_WriteInternal", ] ArrayStorageType: TypeAlias = ZarrArray | H5Array GroupStorageType: TypeAlias = ZarrGroup | H5Group StorageType: TypeAlias = ArrayStorageType | GroupStorageType # NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py! ContravariantRWAble = TypeVar("ContravariantRWAble", bound=RWAble, contravariant=True) CovariantRWAble = TypeVar("CovariantRWAble", bound=RWAble, covariant=True) InvariantRWAble = TypeVar("InvariantRWAble", bound=RWAble) SCo = TypeVar("SCo", covariant=True, bound=StorageType) SCon = TypeVar("SCon", contravariant=True, bound=StorageType) class _ReadInternal(Protocol[SCon, CovariantRWAble]): def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantRWAble: ... class _ReadLazyInternal(Protocol[SCon]): def __call__( self, elem: SCon, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None ) -> LazyDataStructures: ... class Read(Protocol[SCon, CovariantRWAble]): def __call__(self, elem: SCon) -> CovariantRWAble: """Low-level reading function for an element. Parameters ---------- elem The element to read from. Returns ------- The element read from the store. """ ... class ReadLazy(Protocol[SCon]): def __call__( self, elem: SCon, *, chunks: tuple[int, ...] | None = None ) -> LazyDataStructures: """Low-level reading function for a lazy element. Parameters ---------- elem The element to read from. chunks The chunk size to be used. Returns ------- The lazy element read from the store. """ ... class _WriteInternal(Protocol[ContravariantRWAble]): def __call__( self, f: StorageType, k: str, v: ContravariantRWAble, *, _writer: Writer, dataset_kwargs: Mapping[str, Any], ) -> None: ... class Write(Protocol[ContravariantRWAble]): def __call__( self, f: StorageType, k: str, v: ContravariantRWAble, *, dataset_kwargs: Mapping[str, Any], ) -> None: """Low-level writing function for an element. Parameters ---------- f The store to which `elem` should be written. k The key to read in from the group. v The element to write out. dataset_kwargs Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... class ReadCallback(Protocol[SCo, InvariantRWAble]): def __call__( self, /, read_func: Read[SCo, InvariantRWAble], elem_name: str, elem: StorageType, *, iospec: IOSpec, ) -> InvariantRWAble: """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. Params ------ read_func :func:`anndata.io.read_elem` function to call to read the current element given the ``iospec``. elem_name The key to read in from the group. elem The element to read from. iospec Internal AnnData encoding specification for the element. Returns ------- The element read from the store. """ ... 
class WriteCallback(Protocol[InvariantRWAble]): def __call__( self, /, write_func: Write[InvariantRWAble], store: StorageType, elem_name: str, elem: InvariantRWAble, *, iospec: IOSpec, dataset_kwargs: Mapping[str, Any], ) -> None: """ Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. Params ------ write_func :func:`anndata.io.write_elem` function to call to read the current element given the ``iospec``. store The store to which `elem` should be written. elem_name The key to read in from the group. elem The element to write out. iospec Internal AnnData encoding specification for the element. dataset_kwargs Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... AnnDataElem = Literal[ "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw", "uns", ] Join_T = Literal["inner", "outer"] python-anndata-0.12.0~rc1/src/anndata/_version.py000066400000000000000000000023611500370632200217160ustar00rootroot00000000000000"""Get version from VCS in a dev environment or from package metadata in production. See . """ from __future__ import annotations from pathlib import Path __all__ = ["__version__"] def _get_version_from_vcs() -> str: # pragma: no cover from hatchling.metadata.core import ProjectMetadata from hatchling.plugin.exceptions import UnknownPluginError from hatchling.plugin.manager import PluginManager from hatchling.utils.fs import locate_file if (pyproject_toml := locate_file(__file__, "pyproject.toml")) is None: msg = "pyproject.toml not found although hatchling is installed" raise LookupError(msg) root = Path(pyproject_toml).parent metadata = ProjectMetadata(root=str(root), plugin_manager=PluginManager()) try: # Version can be either statically set in pyproject.toml or computed dynamically: return metadata.core.version or metadata.hatch.version.cached except UnknownPluginError: msg = "Unable to import hatch plugin." raise ImportError(msg) try: __version__ = _get_version_from_vcs() except (ImportError, LookupError): import importlib.metadata __version__ = importlib.metadata.version("anndata") python-anndata-0.12.0~rc1/src/anndata/_warnings.py000066400000000000000000000013341500370632200220600ustar00rootroot00000000000000from __future__ import annotations class WriteWarning(UserWarning): pass class OldFormatWarning(PendingDeprecationWarning): """Raised when a file in an old file format is read.""" pass class ImplicitModificationWarning(UserWarning): """\ Raised whenever initializing an object or assigning a property changes the type of a part of a parameter or the value being assigned. Examples ======== >>> import pandas as pd >>> adata = AnnData(obs=pd.DataFrame(index=[0, 1, 2])) # doctest: +SKIP ImplicitModificationWarning: Transforming to str index. 
""" pass class ExperimentalFeatureWarning(Warning): """Raised when an unstable experimental feature is used.""" pass python-anndata-0.12.0~rc1/src/anndata/abc.py000066400000000000000000000031551500370632200206210ustar00rootroot00000000000000from __future__ import annotations from abc import ABC, abstractmethod from typing import TYPE_CHECKING if TYPE_CHECKING: from typing import ClassVar, Literal import numpy as np from .compat import CSArray, CSMatrix, Index __all__ = ["CSRDataset", "CSCDataset"] class _AbstractCSDataset(ABC): """Base for the public API for CSRDataset/CSCDataset.""" format: ClassVar[Literal["csr", "csc"]] """The format of the sparse matrix.""" shape: tuple[int, int] """Shape of the matrix.""" dtype: np.dtype """The :class:`numpy.dtype` of the `data` attribute of the sparse matrix.""" backend: Literal["zarr", "hdf5"] """Which file type is used on-disk.""" @abstractmethod def __getitem__(self, index: Index) -> float | CSMatrix | CSArray: """Load a slice or an element from the sparse dataset into memory. Parameters ---------- index Index to load. Returns ------- The desired data read off disk. """ @abstractmethod def to_memory(self) -> CSMatrix | CSArray: """Load the sparse dataset into memory. Returns ------- The in-memory representation of the sparse dataset. """ _sparse_dataset_doc = """\ On disk {format} sparse matrix. Analogous to :class:`h5py.Dataset` or :class:`zarr.Array`, but for sparse matrices. """ class CSRDataset(_AbstractCSDataset, ABC): __doc__ = _sparse_dataset_doc.format(format="CSR") format = "csr" class CSCDataset(_AbstractCSDataset, ABC): __doc__ = _sparse_dataset_doc.format(format="CSC") format = "csc" python-anndata-0.12.0~rc1/src/anndata/compat/000077500000000000000000000000001500370632200210015ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/compat/__init__.py000066400000000000000000000313511500370632200231150ustar00rootroot00000000000000from __future__ import annotations from codecs import decode from collections.abc import Mapping from functools import cache, partial, singledispatch, wraps from importlib.util import find_spec from inspect import Parameter, signature from types import EllipsisType from typing import TYPE_CHECKING, TypeVar from warnings import warn import h5py import numpy as np import pandas as pd import scipy from packaging.version import Version from zarr import Array as ZarrArray # noqa: F401 from zarr import Group as ZarrGroup if TYPE_CHECKING: from typing import Any ############################# # scipy sparse array comapt # ############################# CSMatrix = scipy.sparse.csr_matrix | scipy.sparse.csc_matrix CSArray = scipy.sparse.csr_array | scipy.sparse.csc_array class Empty: pass Index1D = slice | int | str | np.int64 | np.ndarray | pd.Series IndexRest = Index1D | EllipsisType Index = ( IndexRest | tuple[Index1D, IndexRest] | tuple[IndexRest, Index1D] | tuple[Index1D, Index1D, EllipsisType] | tuple[EllipsisType, Index1D, Index1D] | tuple[Index1D, EllipsisType, Index1D] | CSMatrix | CSArray ) H5Group = h5py.Group H5Array = h5py.Dataset H5File = h5py.File ############################# # Optional deps ############################# @cache def is_zarr_v2() -> bool: import zarr from packaging.version import Version return Version(zarr.__version__) < Version("3.0.0") if is_zarr_v2(): msg = "anndata will no longer support zarr v2 in the near future. Please prepare to upgrade to zarr>=3." 
warn(msg, DeprecationWarning) if find_spec("awkward") or TYPE_CHECKING: import awkward # noqa: F401 from awkward import Array as AwkArray else: class AwkArray: @staticmethod def __repr__(): return "mock awkward.highlevel.Array" if find_spec("zappy") or TYPE_CHECKING: from zappy.base import ZappyArray else: class ZappyArray: @staticmethod def __repr__(): return "mock zappy.base.ZappyArray" if TYPE_CHECKING: # type checkers are confused and can only see …core.Array from dask.array.core import Array as DaskArray elif find_spec("dask"): from dask.array import Array as DaskArray else: class DaskArray: @staticmethod def __repr__(): return "mock dask.array.core.Array" # https://github.com/scverse/anndata/issues/1749 def is_cupy_importable() -> bool: try: import cupy # noqa: F401 except ImportError: return False return True if is_cupy_importable() or TYPE_CHECKING: from cupy import ndarray as CupyArray from cupyx.scipy.sparse import csc_matrix as CupyCSCMatrix from cupyx.scipy.sparse import csr_matrix as CupyCSRMatrix from cupyx.scipy.sparse import spmatrix as CupySparseMatrix try: import dask.array as da except ImportError: pass else: da.register_chunk_type(CupyCSRMatrix) da.register_chunk_type(CupyCSCMatrix) else: class CupySparseMatrix: @staticmethod def __repr__(): return "mock cupyx.scipy.sparse.spmatrix" class CupyCSRMatrix: @staticmethod def __repr__(): return "mock cupyx.scipy.sparse.csr_matrix" class CupyCSCMatrix: @staticmethod def __repr__(): return "mock cupyx.scipy.sparse.csc_matrix" class CupyArray: @staticmethod def __repr__(): return "mock cupy.ndarray" if find_spec("legacy_api_wrap") or TYPE_CHECKING: from legacy_api_wrap import legacy_api # noqa: TID251 old_positionals = partial(legacy_api, category=FutureWarning) else: def old_positionals(*old_positionals): return lambda func: func ############################# # IO helpers ############################# @singledispatch def _read_attr(attrs: Mapping, name: str, default: Any | None = Empty): if default is Empty: return attrs[name] else: return attrs.get(name, default=default) @_read_attr.register(h5py.AttributeManager) def _read_attr_hdf5( attrs: h5py.AttributeManager, name: str, default: Any | None = Empty ): """ Read an HDF5 attribute and perform all necessary conversions. At the moment, this only implements conversions for string attributes, other types are passed through. String conversion is needed compatibility with other languages. For example Julia's HDF5.jl writes string attributes as fixed-size strings, which are read as bytes by h5py. """ if name not in attrs and default is not Empty: return default attr = attrs[name] attr_id = attrs.get_id(name) dtype = h5py.check_string_dtype(attr_id.dtype) if dtype is None: return attr else: if dtype.length is None: # variable-length string, no problem return attr elif len(attr_id.shape) == 0: # Python bytestring return attr.decode("utf-8") else: # NumPy array return [decode(s, "utf-8") for s in attr] def _from_fixed_length_strings(value): """\ Convert from fixed length strings to unicode. For backwards compatibility with older h5ad and zarr files. 
""" new_dtype = [] for dt in value.dtype.descr: dt_list = list(dt) dt_type = dt[1] # could probably match better is_annotated = isinstance(dt_type, tuple) if is_annotated: dt_type = dt_type[0] # Fixing issue introduced with h5py v2.10.0, see: # https://github.com/h5py/h5py/issues/1307 if issubclass(np.dtype(dt_type).type, np.bytes_): dt_list[1] = f"U{int(dt_type[2:])}" elif is_annotated or np.issubdtype(np.dtype(dt_type), np.str_): dt_list[1] = "O" # Assumption that it’s a vlen str new_dtype.append(tuple(dt_list)) return value.astype(new_dtype) def _decode_structured_array( arr: np.ndarray, *, dtype: np.dtype | None = None, copy: bool = False ) -> np.ndarray: """ h5py 3.0 now reads all strings as bytes. There is a helper method which can convert these to strings, but there isn't anything for fields of structured dtypes. Params ------ arr An array with structured dtype dtype dtype of the array. This is checked for h5py string data types. Passing this is allowed for cases where array may have been processed by another function before hand. """ if copy: arr = arr.copy() if dtype is None: dtype = arr.dtype # codecs.decode is 2x slower than this lambda, go figure decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1) for k, (dt, _) in dtype.fields.items(): check = h5py.check_string_dtype(dt) if check is not None and check.encoding == "utf-8": decode(arr[k], out=arr[k]) return arr def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray: """\ Convert variable length strings to fixed length. Currently a workaround for https://github.com/zarr-developers/zarr-python/pull/422 """ new_dtype = [] for dt_name, (dt_type, dt_offset) in value.dtype.fields.items(): if dt_type.kind == "O": # Assuming the objects are str size = max(len(x.encode()) for x in value.getfield("O", dt_offset)) new_dtype.append((dt_name, ("U", size))) else: new_dtype.append((dt_name, dt_type)) return value.astype(new_dtype) Group_T = TypeVar("Group_T", bound=ZarrGroup | h5py.Group) # TODO: This is a workaround for https://github.com/scverse/anndata/issues/874 # See https://github.com/h5py/h5py/pull/2311#issuecomment-1734102238 for why this is done this way. def _require_group_write_dataframe( f: Group_T, name: str, df: pd.DataFrame, *args, **kwargs ) -> Group_T: if len(df.columns) > 5_000 and isinstance(f, H5Group): # actually 64kb is the limit, but this should be a conservative estimate return f.create_group(name, track_order=True, *args, **kwargs) return f.require_group(name, *args, **kwargs) ############################# # Dealing with uns ############################# def _clean_uns(adata: AnnData): # noqa: F821 """ Compat function for when categorical keys were stored in uns. This used to be buggy because when storing categorical columns in obs and var with the same column name, only one `_categories` is retained. 
""" k_to_delete = set() for cats_name, cats in adata.uns.items(): if not cats_name.endswith("_categories"): continue name = cats_name.replace("_categories", "") # fix categories with a single category if isinstance(cats, str | int): cats = [cats] for ann in [adata.obs, adata.var]: if name not in ann: continue codes: np.ndarray = ann[name].values # hack to maybe find the axis the categories were for if not np.all(codes < len(cats)): continue ann[name] = pd.Categorical.from_codes(codes, cats) k_to_delete.add(cats_name) for cats_name in k_to_delete: del adata.uns[cats_name] def _move_adj_mtx(d): """ Read-time fix for moving adjacency matrices from uns to obsp """ n = d.get("uns", {}).get("neighbors", {}) obsp = d.setdefault("obsp", {}) for k in ("distances", "connectivities"): if ( (k in n) and isinstance(n[k], scipy.sparse.spmatrix | np.ndarray) and len(n[k].shape) == 2 ): warn( f"Moving element from .uns['neighbors']['{k}'] to .obsp['{k}'].\n\n" "This is where adjacency matrices should go now.", FutureWarning, ) obsp[k] = n.pop(k) def _find_sparse_matrices(d: Mapping, n: int, keys: tuple, paths: list): """Find paths to sparse matrices with shape (n, n).""" for k, v in d.items(): if isinstance(v, Mapping): _find_sparse_matrices(v, n, (*keys, k), paths) elif scipy.sparse.issparse(v) and v.shape == (n, n): paths.append((*keys, k)) return paths # This function was adapted from scikit-learn # github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py def _deprecate_positional_args(func=None, *, version: str = "1.0 (renaming of 0.25)"): """Decorator for methods that issues warnings for positional arguments. Using the keyword-only argument syntax in pep 3102, arguments after the * will issue a warning when passed as a positional argument. Parameters ---------- func Function to check arguments on. version The version when positional arguments will result in error. """ def _inner_deprecate_positional_args(f): sig = signature(f) kwonly_args = [] all_args = [] for name, param in sig.parameters.items(): if param.kind == Parameter.POSITIONAL_OR_KEYWORD: all_args.append(name) elif param.kind == Parameter.KEYWORD_ONLY: kwonly_args.append(name) @wraps(f) def inner_f(*args, **kwargs): extra_args = len(args) - len(all_args) if extra_args <= 0: return f(*args, **kwargs) # extra_args > 0 args_msg = [ f"{name}={arg}" for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:]) ] args_msg = ", ".join(args_msg) warn( f"Pass {args_msg} as keyword args. 
From version {version} passing " "these as positional arguments will result in an error", FutureWarning, ) kwargs.update(zip(sig.parameters, args)) return f(**kwargs) return inner_f if func is not None: return _inner_deprecate_positional_args(func) return _inner_deprecate_positional_args def _transpose_by_block(dask_array: DaskArray) -> DaskArray: import dask.array as da b = dask_array.blocks b_raveled = b.ravel() block_layout = np.zeros(b.shape, dtype=object) for i in range(block_layout.size): block_layout.flat[i] = b_raveled[i].map_blocks( lambda x: x.T, chunks=b_raveled[i].chunks[::-1] ) return da.block(block_layout.T.tolist()) def _safe_transpose(x): """Safely transpose x This is a workaround for: https://github.com/scipy/scipy/issues/19161 """ if isinstance(x, DaskArray) and scipy.sparse.issparse(x._meta): return _transpose_by_block(x) else: return x.T def _map_cat_to_str(cat: pd.Categorical) -> pd.Categorical: if Version(pd.__version__) >= Version("2.1"): # Argument added in pandas 2.1 return cat.map(str, na_action="ignore") else: return cat.map(str) python-anndata-0.12.0~rc1/src/anndata/experimental/000077500000000000000000000000001500370632200222135ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/experimental/__init__.py000066400000000000000000000026721500370632200243330ustar00rootroot00000000000000from __future__ import annotations from types import MappingProxyType from typing import TYPE_CHECKING from .._io.specs import IOSpec, read_elem_lazy from .._types import Read, ReadCallback, StorageType, Write, WriteCallback from ..utils import module_get_attr_redirect from ._dispatch_io import read_dispatched, write_dispatched from .backed import read_lazy from .merge import concat_on_disk from .multi_files import AnnCollection from .pytorch import AnnLoader if TYPE_CHECKING: from typing import Any # Map old name in `anndata.experimental` to new name in `anndata` _DEPRECATED = MappingProxyType( dict( (kv if isinstance(kv, tuple) else (kv, kv)) for kv in ( ("CSRDataset", "abc.CSRDataset"), ("CSCDataset", "abc.CSCDataset"), ("sparse_dataset", "io.sparse_dataset"), ("read_elem", "io.read_elem"), ("write_elem", "io.write_elem"), ("RWAble", "typing.AxisStorable"), ("InMemoryElem", "typing.RWAble"), ) ) ) def __getattr__(attr_name: str) -> Any: return module_get_attr_redirect( attr_name, deprecated_mapping=_DEPRECATED, old_module_path="experimental" ) __all__ = [ "AnnCollection", "AnnLoader", "read_elem_lazy", "read_dispatched", "write_dispatched", "IOSpec", "concat_on_disk", "Read", "read_lazy", "Write", "ReadCallback", "WriteCallback", "StorageType", ] python-anndata-0.12.0~rc1/src/anndata/experimental/_dispatch_io.py000066400000000000000000000035151500370632200252160ustar00rootroot00000000000000from __future__ import annotations from types import MappingProxyType from typing import TYPE_CHECKING if TYPE_CHECKING: from collections.abc import Mapping from typing import Any from anndata._types import ( GroupStorageType, ReadCallback, StorageType, WriteCallback, ) from anndata.typing import RWAble def read_dispatched( elem: StorageType, callback: ReadCallback, ) -> RWAble: """ Read elem, calling the callback at each sub-element. Params ------ elem Storage container (e.g. `h5py.Group`, `zarr.Group`). This must have anndata element specifications. callback Function to call at each anndata encoded element. 
See Also -------- :doc:`/tutorials/notebooks/{read,write}_dispatched` """ from anndata._io.specs import _REGISTRY, Reader reader = Reader(_REGISTRY, callback=callback) return reader.read_elem(elem) def write_dispatched( store: GroupStorageType, key: str, elem: RWAble, callback: WriteCallback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: """ Write elem to store, recursively calling callback at each sub-element. Params ------ store Storage container to be written to. key Key to write element to. To write to the root group, use "/". elem The element to write. Probably an AnnData. callback Function called when writing each element. dataset_kwargs Keyword arguments to pass to the dataset creation function. See Also -------- :doc:`/tutorials/notebooks/{read,write}_dispatched` """ from anndata._io.specs import _REGISTRY, Writer writer = Writer(_REGISTRY, callback=callback) writer.write_elem(store, key, elem, dataset_kwargs=dataset_kwargs) python-anndata-0.12.0~rc1/src/anndata/experimental/backed/000077500000000000000000000000001500370632200234245ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/experimental/backed/__init__.py000066400000000000000000000001301500370632200255270ustar00rootroot00000000000000from __future__ import annotations from ._io import read_lazy __all__ = ["read_lazy"] python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_compat.py000066400000000000000000000021031500370632200254140ustar00rootroot00000000000000from __future__ import annotations from importlib.util import find_spec from typing import TYPE_CHECKING if find_spec("xarray") or TYPE_CHECKING: import xarray from xarray import DataArray from xarray.backends import BackendArray from xarray.backends.zarr import ZarrArrayWrapper else: class DataArray: def __repr__(self) -> str: return "mock DataArray" xarray = None class ZarrArrayWrapper: def __repr__(self) -> str: return "mock ZarrArrayWrapper" class BackendArray: def __repr__(self) -> str: return "mock BackendArray" from ._xarray import Dataset, Dataset2D # noqa: F401 if TYPE_CHECKING: from anndata import AnnData def has_dataset_2d(adata: AnnData) -> bool: if any(isinstance(annot_df, Dataset2D) for annot_df in [adata.obs, adata.var]): return True for annot_m_key in ["varm", "obsm"]: annot_m = getattr(adata, annot_m_key) if any(isinstance(maybe_df, Dataset2D) for maybe_df in annot_m.values()): return True return False python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_io.py000066400000000000000000000135721500370632200245540ustar00rootroot00000000000000from __future__ import annotations import typing import warnings from os import PathLike from pathlib import Path from typing import TYPE_CHECKING import h5py from anndata._io.specs.registry import read_elem_lazy from anndata._types import AnnDataElem from testing.anndata._doctest import doctest_needs from ..._core.anndata import AnnData from ..._settings import settings from ...compat import ZarrGroup, is_zarr_v2 from .. import read_dispatched if TYPE_CHECKING: from collections.abc import MutableMapping from anndata._io.specs.registry import IOSpec from anndata._types import Read, StorageType @doctest_needs("xarray") def read_lazy( store: PathLike[str] | str | MutableMapping | ZarrGroup | h5py.Dataset, *, load_annotation_index: bool = True, ) -> AnnData: """ Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. No array data should need to be read into memory with the exception of :class:`ak.Array`, scalars, and some older-encoding arrays. 
Parameters ---------- store A store-like object to be read in. If :class:`zarr.Group`, it is best for it to be consolidated. load_annotation_index Whether or not to use a range index for the `{obs,var}` :class:`xarray.Dataset` so as not to load the index into memory. If `False`, the real `index` will be inserted as `{obs,var}_names` in the object but not be one of the `coords` thereby preventing read operations. Access to `adata.obs.index` will also only give the dummy index, and not the "real" index that is file-backed. Returns ------- A lazily read-in :class:`~anndata.AnnData` object. Examples -------- Preparing example objects >>> import anndata as ad >>> from urllib.request import urlretrieve >>> import scanpy as sc >>> base_url = "https://datasets.cellxgene.cziscience.com" >>> def get_cellxgene_data(id_: str): ... out_path = sc.settings.datasetdir / f"{id_}.h5ad" ... if out_path.exists(): ... return out_path ... file_url = f"{base_url}/{id_}.h5ad" ... sc.settings.datasetdir.mkdir(parents=True, exist_ok=True) ... urlretrieve(file_url, out_path) ... return out_path >>> path_b_cells = get_cellxgene_data("a93eab58-3d82-4b61-8a2f-d7666dcdb7c4") >>> path_fetal = get_cellxgene_data("d170ff04-6da0-4156-a719-f8e1bbefbf53") >>> b_cells_adata = ad.experimental.read_lazy(path_b_cells) >>> fetal_adata = ad.experimental.read_lazy(path_fetal) >>> print(b_cells_adata) AnnData object with n_obs × n_vars = 146 × 33452 obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', ... >>> print(fetal_adata) AnnData object with n_obs × n_vars = 344 × 15585 obs: 'nCount_Spatial', 'nFeature_Spatial', 'Cluster', 'adult_pred_type'... This functionality is compatible with :func:`anndata.concat` >>> ad.concat([b_cells_adata, fetal_adata], join="outer") AnnData object with n_obs × n_vars = 490 × 33452 obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id'... """ try: import xarray # noqa: F401 except ImportError: msg = ( "xarray is required to use the `read_lazy` function. Please install xarray." ) raise ImportError(msg) is_h5_store = isinstance(store, h5py.Dataset | h5py.File | h5py.Group) is_h5 = ( isinstance(store, PathLike | str) and Path(store).suffix == ".h5ad" ) or is_h5_store has_keys = True # true if consolidated or h5ad if not is_h5: import zarr if not isinstance(store, ZarrGroup): try: f = zarr.open_consolidated(store, mode="r") except ( KeyError if is_zarr_v2() else ValueError ): # v3 returns a ValueError for consolidated metadata not found msg = "Did not read zarr as consolidated. Consider consolidating your metadata." 
warnings.warn(msg) has_keys = False f = zarr.open_group(store, mode="r") else: f = store else: if is_h5_store: f = store else: f = h5py.File(store, mode="r") def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec): if iospec.encoding_type in {"anndata", "raw"} or elem_name.endswith("/"): iter_object = ( dict(elem).items() if has_keys else ( (k, v) for k, v in ( (k, elem.get(k, None)) for k in typing.get_args(AnnDataElem) ) if v is not None # need to do this instead of `k in elem` to prevent unnecessary metadata accesses ) ) return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object}) elif ( iospec.encoding_type in { "csr_matrix", "csc_matrix", "array", "string-array", "dataframe", "categorical", } or "nullable" in iospec.encoding_type ): if "dataframe" == iospec.encoding_type and elem_name in {"/obs", "/var"}: return read_elem_lazy(elem, use_range_index=not load_annotation_index) return read_elem_lazy(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) elif iospec.encoding_type == "dict": return { k: read_dispatched(v, callback=callback) for k, v in dict(elem).items() } return func(elem) with settings.override(check_uniqueness=load_annotation_index): adata = read_dispatched(f, callback=callback) return adata python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_lazy_arrays.py000066400000000000000000000137711500370632200265060ustar00rootroot00000000000000from __future__ import annotations from functools import cached_property from typing import TYPE_CHECKING, Generic, TypeVar import pandas as pd from anndata._core.index import _subset from anndata._core.views import as_view from anndata._io.specs.lazy_methods import get_chunksize from anndata.compat import H5Array, ZarrArray from ..._settings import settings from ._compat import BackendArray, DataArray, ZarrArrayWrapper from ._compat import xarray as xr if TYPE_CHECKING: from pathlib import Path from typing import Literal import numpy as np from anndata._core.index import Index from anndata.compat import ZarrGroup K = TypeVar("K", H5Array, ZarrArray) class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]): def __init__(self, array: K): self.chunks = array.chunks if isinstance(array, ZarrArray): return super().__init__(array) self._array = array self.shape = self._array.shape self.dtype = self._array.dtype def __getitem__(self, key: xr.core.indexing.ExplicitIndexer): if isinstance(self._array, ZarrArray): return super().__getitem__(key) return xr.core.indexing.explicit_indexing_adapter( key, self.shape, xr.core.indexing.IndexingSupport.OUTER_1VECTOR, lambda key: self._array[key], ) class CategoricalArray(BackendArray, Generic[K]): """ A wrapper class meant to enable working with lazy categorical data. We do not guarantee the stability of this API beyond that guaranteed by :class:`xarray.backends.BackendArray`. """ _codes: ZarrOrHDF5Wrapper[K] _categories: ZarrArray | H5Array shape: tuple[int, ...] 
base_path_or_zarr_group: Path | ZarrGroup elem_name: str def __init__( self, codes: K, categories: ZarrArray | H5Array, base_path_or_zarr_group: Path | ZarrGroup, elem_name: str, *args, ordered: bool, **kwargs, ): self._categories = categories self._ordered = ordered self._codes = ZarrOrHDF5Wrapper(codes) self.shape = self._codes.shape self.base_path_or_zarr_group = base_path_or_zarr_group self.file_format = "zarr" if isinstance(codes, ZarrArray) else "h5" self.elem_name = elem_name @cached_property def categories(self) -> np.ndarray: if isinstance(self._categories, ZarrArray): return self._categories[...] from ..._io.h5ad import read_dataset return read_dataset(self._categories) def __getitem__( self, key: xr.core.indexing.ExplicitIndexer ) -> xr.core.extension_array.PandasExtensionArray: codes = self._codes[key] categorical_array = pd.Categorical.from_codes( codes=codes, categories=self.categories, ordered=self._ordered ) if settings.remove_unused_categories: categorical_array = categorical_array.remove_unused_categories() return xr.core.extension_array.PandasExtensionArray(categorical_array) @cached_property def dtype(self): return pd.CategoricalDtype(categories=self.categories, ordered=self._ordered) class MaskedArray(BackendArray, Generic[K]): """ A wrapper class meant to enable working with lazy masked data. We do not guarantee the stability of this API beyond that guaranteed by :class:`xarray.backends.BackendArray`. """ _mask: ZarrOrHDF5Wrapper[K] _values: ZarrOrHDF5Wrapper[K] _dtype_str: Literal["nullable-integer", "nullable-boolean", "nullable-string-array"] shape: tuple[int, ...] base_path_or_zarr_group: Path | ZarrGroup elem_name: str def __init__( self, values: ZarrArray | H5Array, dtype_str: Literal[ "nullable-integer", "nullable-boolean", "nullable-string-array" ], mask: ZarrArray | H5Array, base_path_or_zarr_group: Path | ZarrGroup, elem_name: str, ): self._mask = ZarrOrHDF5Wrapper(mask) self._values = ZarrOrHDF5Wrapper(values) self._dtype_str = dtype_str self.shape = self._values.shape self.base_path_or_zarr_group = base_path_or_zarr_group self.file_format = "zarr" if isinstance(mask, ZarrArray) else "h5" self.elem_name = elem_name def __getitem__( self, key: xr.core.indexing.ExplicitIndexer ) -> xr.core.extension_array.PandasExtensionArray: values = self._values[key] mask = self._mask[key] if self._dtype_str == "nullable-integer": # numpy does not support nan ints extension_array = pd.arrays.IntegerArray(values, mask=mask) elif self._dtype_str == "nullable-boolean": extension_array = pd.arrays.BooleanArray(values, mask=mask) elif self._dtype_str == "nullable-string-array": values[mask] = pd.NA extension_array = pd.array(values, dtype=pd.StringDtype()) else: msg = f"Invalid dtype_str {self._dtype_str}" raise RuntimeError(msg) return xr.core.extension_array.PandasExtensionArray(extension_array) @cached_property def dtype(self): if self._dtype_str == "nullable-integer": return pd.array( [], dtype=str(pd.api.types.pandas_dtype(self._values.dtype)).capitalize(), ).dtype elif self._dtype_str == "nullable-boolean": return pd.BooleanDtype() elif self._dtype_str == "nullable-string-array": return pd.StringDtype() msg = f"Invalid dtype_str {self._dtype_str}" raise RuntimeError(msg) @_subset.register(DataArray) def _subset_masked(a: DataArray, subset_idx: Index): return a[subset_idx] @as_view.register(DataArray) def _view_pd_boolean_array(a: DataArray, view_args): return a @get_chunksize.register(MaskedArray) def _(a: MaskedArray): return get_chunksize(a._values) 
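# Descriptive note: ``get_chunksize`` (imported above from
# anndata._io.specs.lazy_methods) is used here as a single-dispatch helper, so
# chunk information for the lazy wrappers can be queried uniformly without
# touching the underlying store. The registration above reports a
# MaskedArray's chunks via its wrapped values array; the registration below
# does the same for a CategoricalArray via its codes array.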
@get_chunksize.register(CategoricalArray) def _(a: CategoricalArray): return get_chunksize(a._codes) python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_xarray.py000066400000000000000000000102351500370632200254440ustar00rootroot00000000000000from __future__ import annotations from typing import TYPE_CHECKING import pandas as pd from ..._core.anndata import AnnData, _gen_dataframe from ..._core.file_backing import to_memory from ..._core.index import _subset from ..._core.views import as_view try: from xarray import Dataset except ImportError: class Dataset: def __repr__(self) -> str: return "mock Dataset" if TYPE_CHECKING: from collections.abc import Hashable, Iterable from typing import Any, Literal from ..._core.index import Index from ._compat import xarray as xr def get_index_dim(ds: xr.DataArray) -> Hashable: if len(ds.sizes) != 1: msg = f"xarray Dataset should not have more than 1 dims, found {len(ds.sizes)} {ds.sizes}, {ds}" raise ValueError(msg) return list(ds.indexes.keys())[0] class Dataset2D(Dataset): """ A wrapper class meant to enable working with lazy dataframe data. We do not guarantee the stability of this API beyond that guaranteed by :class:`xarray.Dataset` and the `to_memory` function, a thin wrapper around :meth:`xarray.Dataset.to_dataframe` to ensure roundtrip compatibility here. """ __slots__ = () @property def index(self) -> pd.Index: """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.index` so this ensures usability Returns ------- The index of the of the dataframe as resolved from :attr:`~xarray.Dataset.coords`. """ coord = get_index_dim(self) return self.indexes[coord] @index.setter def index(self, val) -> None: coord = get_index_dim(self) self.coords[coord] = val @property def shape(self) -> tuple[int, int]: """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.shape` so this ensures usability Returns ------- The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`. """ return (self.sizes[get_index_dim(self)], len(self)) @property def iloc(self): """:attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.iloc` so this ensures usability Returns ------- Handler class for doing the iloc-style indexing using :meth:`~xarray.Dataset.isel`. """ class IlocGetter: def __init__(self, ds): self._ds = ds def __getitem__(self, idx): coord = get_index_dim(self._ds) return self._ds.isel(**{coord: idx}) return IlocGetter(self) def to_memory(self, *, copy=False) -> pd.DataFrame: df = self.to_dataframe() index_key = self.attrs.get("indexing_key", None) if df.index.name != index_key and index_key is not None: df = df.set_index(index_key) df.index.name = None # matches old AnnData object return df @property def columns(self) -> pd.Index: """ :class:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.columns` so this ensures usability Returns ------- :class:`pandas.Index` that represents the "columns." 
""" columns_list = list(self.keys()) return pd.Index(columns_list) @_subset.register(Dataset2D) def _(a: Dataset2D, subset_idx: Index): key = get_index_dim(a) # xarray seems to have some code looking for a second entry in tuples if isinstance(subset_idx, tuple) and len(subset_idx) == 1: subset_idx = subset_idx[0] return a.isel(**{key: subset_idx}) @as_view.register(Dataset2D) def _(a: Dataset2D, view_args): return a @_gen_dataframe.register(Dataset2D) def _gen_dataframe_xr( anno: Dataset2D, index_names: Iterable[str], *, source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, ): return anno @AnnData._remove_unused_categories.register(Dataset2D) def _remove_unused_categories_xr( df_full: Dataset2D, df_sub: Dataset2D, uns: dict[str, Any] ): pass # this is handled automatically by the categorical arrays themselves i.e., they dedup upon access. to_memory.register(Dataset2D, Dataset2D.to_memory) python-anndata-0.12.0~rc1/src/anndata/experimental/merge.py000066400000000000000000000523141500370632200236710ustar00rootroot00000000000000from __future__ import annotations import shutil from collections.abc import Mapping from functools import singledispatch from os import PathLike from pathlib import Path from typing import TYPE_CHECKING import numpy as np import pandas as pd from scipy.sparse import csc_matrix, csr_matrix from .._core.file_backing import to_memory from .._core.merge import ( MissingVal, _resolve_axis, concat_arrays, gen_inner_reindexers, gen_reindexer, intersect_keys, merge_dataframes, merge_indices, resolve_merge_strategy, unify_dtypes, ) from .._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from .._io.specs import read_elem, write_elem from ..compat import H5Array, H5Group, ZarrArray, ZarrGroup, _map_cat_to_str from . import read_dispatched if TYPE_CHECKING: from collections.abc import Callable, Collection, Iterable, Sequence from typing import Any, Literal from .._core.merge import Reindexer, StrategiesLiteral SPARSE_MATRIX = {"csc_matrix", "csr_matrix"} EAGER_TYPES = {"dataframe", "awkward-array"} ################### # Utilities ################### # Wrapper to reindexer that stores if there is a change # and won't do anything if there is class IdentityReindexer: def __init__(self): self.no_change = True def __call__(self, x, *args, **kwargs): return x # Checks if given indices are equal to each other in the whole list. def _indices_equal(indices: Iterable[pd.Index]) -> bool: init_elem = indices[0] return all(np.array_equal(init_elem, elem) for elem in indices[1:]) def _gen_slice_to_append( datasets: Sequence[BaseCompressedSparseDataset], reindexers, max_loaded_elems: int, axis=0, fill_value=None, ): for ds, ri in zip(datasets, reindexers): n_slices = ds.shape[axis] * ds.shape[1 - axis] // max_loaded_elems if n_slices < 2: yield (csr_matrix, csc_matrix)[axis]( ri(to_memory(ds), axis=1 - axis, fill_value=fill_value) ) else: slice_size = max_loaded_elems // ds.shape[1 - axis] if slice_size == 0: slice_size = 1 rem_slices = ds.shape[axis] idx = 0 while rem_slices > 0: ds_part = None if axis == 0: ds_part = ds[idx : idx + slice_size, :] elif axis == 1: ds_part = ds[:, idx : idx + slice_size] yield (csr_matrix, csc_matrix)[axis]( ri(ds_part, axis=1 - axis, fill_value=fill_value) ) rem_slices -= slice_size idx += slice_size ################### # File Management ################### @singledispatch def as_group(store, *, mode: str) -> ZarrGroup | H5Group: msg = "This is not yet implemented." 
raise NotImplementedError(msg) @as_group.register(PathLike) @as_group.register(str) def _(store: PathLike[str] | str, *, mode: str) -> ZarrGroup | H5Group: store = Path(store) if store.suffix == ".h5ad": import h5py return h5py.File(store, mode=mode) if mode == "r": # others all write: r+, a, w, w- import zarr return zarr.open_group(store, mode=mode) from anndata._io.zarr import open_write_group return open_write_group(store, mode=mode) @as_group.register(ZarrGroup) @as_group.register(H5Group) def _(store, *, mode: str) -> ZarrGroup | H5Group: del mode return store ################### # Reading ################### def read_as_backed(group: ZarrGroup | H5Group): """ Read the group until BaseCompressedSparseDataset, Array or EAGER_TYPES are encountered. """ def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type in SPARSE_MATRIX: return sparse_dataset(elem) elif iospec.encoding_type in EAGER_TYPES: return read_elem(elem) elif iospec.encoding_type == "array": return elem elif iospec.encoding_type == "dict": return {k: read_as_backed(v) for k, v in dict(elem).items()} else: return func(elem) return read_dispatched(group, callback=callback) def _df_index(df: ZarrGroup | H5Group) -> pd.Index: index_key = df.attrs["_index"] return pd.Index(read_elem(df[index_key])) ################### # Writing ################### def write_concat_dense( arrays: Sequence[ZarrArray | H5Array], output_group: ZarrGroup | H5Group, output_path: ZarrGroup | H5Group, axis: Literal[0, 1] = 0, reindexers: Reindexer = None, fill_value=None, ): """ Writes the concatenation of given dense arrays to disk using dask. """ import dask.array as da darrays = ( da.from_array(a, chunks="auto" if a.chunks is None else a.chunks) for a in arrays ) res = da.concatenate( [ ri(a, axis=1 - axis, fill_value=fill_value) for a, ri in zip(darrays, reindexers) ], axis=axis, ) write_elem(output_group, output_path, res) output_group[output_path].attrs.update( {"encoding-type": "array", "encoding-version": "0.2.0"} ) def write_concat_sparse( datasets: Sequence[BaseCompressedSparseDataset], output_group: ZarrGroup | H5Group, output_path: ZarrGroup | H5Group, max_loaded_elems: int, axis: Literal[0, 1] = 0, reindexers: Reindexer = None, fill_value=None, ): """ Writes and concatenates sparse datasets into a single output dataset. Args: datasets (Sequence[BaseCompressedSparseDataset]): A sequence of BaseCompressedSparseDataset objects to be concatenated. output_group (Union[ZarrGroup, H5Group]): The output group where the concatenated dataset will be written. output_path (Union[ZarrGroup, H5Group]): The output path where the concatenated dataset will be written. max_loaded_elems (int): The maximum number of sparse elements to load at once. axis (Literal[0, 1], optional): The axis along which the datasets should be concatenated. Defaults to 0. reindexers (Reindexer, optional): A reindexer object that defines the reindexing operation to be applied. Defaults to None. fill_value (Any, optional): The fill value to use for missing elements. Defaults to None. 
""" elems = None if all(ri.no_change for ri in reindexers): elems = iter(datasets) else: elems = _gen_slice_to_append( datasets, reindexers, max_loaded_elems, axis, fill_value ) number_non_zero = sum(d.group["indices"].shape[0] for d in datasets) init_elem = next(elems) indptr_dtype = "int64" if number_non_zero >= np.iinfo(np.int32).max else "int32" write_elem( output_group, output_path, init_elem, dataset_kwargs=dict(indptr_dtype=indptr_dtype), ) del init_elem out_dataset: BaseCompressedSparseDataset = read_as_backed(output_group[output_path]) for temp_elem in elems: out_dataset.append(temp_elem) del temp_elem def _write_concat_mappings( mappings, output_group: ZarrGroup | H5Group, keys, path, max_loaded_elems, axis=0, index=None, reindexers=None, fill_value=None, ): """ Write a list of mappings to a zarr/h5 group. """ mapping_group = output_group.create_group(path) mapping_group.attrs.update( { "encoding-type": "dict", "encoding-version": "0.1.0", } ) for k in keys: elems = [m[k] for m in mappings] _write_concat_sequence( elems, output_group=mapping_group, output_path=k, axis=axis, index=index, reindexers=reindexers, fill_value=fill_value, max_loaded_elems=max_loaded_elems, ) def _write_concat_arrays( arrays: Sequence[ZarrArray | H5Array | BaseCompressedSparseDataset], output_group, output_path, max_loaded_elems, axis=0, reindexers=None, fill_value=None, join="inner", ): init_elem = arrays[0] init_type = type(init_elem) if not all(isinstance(a, init_type) for a in arrays): msg = f"All elements must be the same type instead got types: {[type(a) for a in arrays]}" raise NotImplementedError(msg) if reindexers is None: if join == "inner": reindexers = gen_inner_reindexers(arrays, new_index=None, axis=axis) else: msg = "Cannot reindex arrays with outer join." raise NotImplementedError(msg) if isinstance(init_elem, BaseCompressedSparseDataset): expected_sparse_fmt = ["csr", "csc"][axis] if all(a.format == expected_sparse_fmt for a in arrays): write_concat_sparse( arrays, output_group, output_path, max_loaded_elems, axis, reindexers, fill_value, ) else: msg = f"Concat of following not supported: {[a.format for a in arrays]}" raise NotImplementedError(msg) else: write_concat_dense( arrays, output_group, output_path, axis, reindexers, fill_value ) def _write_concat_sequence( arrays: Sequence[pd.DataFrame | BaseCompressedSparseDataset | H5Array | ZarrArray], output_group, output_path, max_loaded_elems, axis=0, index=None, reindexers=None, fill_value=None, join="inner", ): """ array, dataframe, csc_matrix, csc_matrix """ if any(isinstance(a, pd.DataFrame) for a in arrays): if reindexers is None: if join == "inner": reindexers = gen_inner_reindexers(arrays, None, axis=axis) else: msg = "Cannot reindex dataframes with outer join." raise NotImplementedError(msg) if not all( isinstance(a, pd.DataFrame) or a is MissingVal or 0 in a.shape for a in arrays ): msg = "Cannot concatenate a dataframe with other array types." raise NotImplementedError(msg) df = concat_arrays( arrays=arrays, reindexers=reindexers, axis=axis, index=index, fill_value=fill_value, ) write_elem(output_group, output_path, df) elif all( isinstance(a, pd.DataFrame | BaseCompressedSparseDataset | H5Array | ZarrArray) for a in arrays ): _write_concat_arrays( arrays, output_group, output_path, max_loaded_elems, axis, reindexers, fill_value, join, ) else: msg = f"Concatenation of these types is not yet implemented: {[type(a) for a in arrays]} with axis={axis}." 
raise NotImplementedError(msg) def _write_alt_mapping(groups, output_group, alt_axis_name, alt_indices, merge): alt_mapping = merge([read_as_backed(g[alt_axis_name]) for g in groups]) # If its empty, we need to write an empty dataframe with the correct index if not alt_mapping: alt_df = pd.DataFrame(index=alt_indices) write_elem(output_group, alt_axis_name, alt_df) else: write_elem(output_group, alt_axis_name, alt_mapping) def _write_alt_annot(groups, output_group, alt_axis_name, alt_indices, merge): # Annotation for other axis alt_annot = merge_dataframes( [read_elem(g[alt_axis_name]) for g in groups], alt_indices, merge ) write_elem(output_group, alt_axis_name, alt_annot) def _write_axis_annot( groups, output_group, axis_name, concat_indices, label, label_col, join ): concat_annot = pd.concat( unify_dtypes(read_elem(g[axis_name]) for g in groups), join=join, ignore_index=True, ) concat_annot.index = concat_indices if label is not None: concat_annot[label] = label_col write_elem(output_group, axis_name, concat_annot) def concat_on_disk( in_files: Collection[PathLike[str] | str] | Mapping[str, PathLike[str] | str], out_file: PathLike[str] | str, *, max_loaded_elems: int = 100_000_000, axis: Literal["obs", 0, "var", 1] = 0, join: Literal["inner", "outer"] = "inner", merge: StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None = None, uns_merge: ( StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None ) = None, label: str | None = None, keys: Collection[str] | None = None, index_unique: str | None = None, fill_value: Any | None = None, pairwise: bool = False, ) -> None: """\ Concatenates multiple AnnData objects along a specified axis using their corresponding stores or paths, and writes the resulting AnnData object to a target location on disk. Unlike :func:`anndata.concat`, this method does not require loading the input AnnData objects into memory, making it a memory-efficient alternative for large datasets. The resulting object written to disk should be equivalent to the concatenation of the loaded AnnData objects using :func:`anndata.concat`. To adjust the maximum amount of data loaded in memory; for sparse arrays use the max_loaded_elems argument; for dense arrays see the Dask documentation, as the Dask concatenation function is used to concatenate dense arrays in this function Params ------ in_files The corresponding stores or paths of AnnData objects to be concatenated. If a Mapping is passed, keys are used for the `keys` argument and values are concatenated. out_file The target path or store to write the result in. max_loaded_elems The maximum number of elements to load in memory when concatenating sparse arrays. Note that this number also includes the empty entries. Set to 100m by default meaning roughly 400mb will be loaded to memory simultaneously. axis Which axis to concatenate along. join How to align values when concatenating. If `"outer"`, the union of the other axis is taken. If `"inner"`, the intersection. See :doc:`concatenation <../concatenation>` for more. merge How elements not aligned to the axis being concatenated along are selected. Currently implemented strategies include: * `None`: No elements are kept. * `"same"`: Elements that are the same in each of the objects. * `"unique"`: Elements for which there is only one possible value. * `"first"`: The first element seen at each from each position. * `"only"`: Elements that show up in only one of the objects. uns_merge How the elements of `.uns` are selected. 
Uses the same set of strategies as the `merge` argument, except applied recursively. label Column in axis annotation (i.e. `.obs` or `.var`) to place batch information in. If it's None, no column is added. keys Names for each object being added. These values are used for column values for `label` or appended to the index if `index_unique` is not `None`. Defaults to incrementing integer labels. index_unique Whether to make the index unique by using the keys. If provided, this is the delimiter between `"{orig_idx}{index_unique}{key}"`. When `None`, the original indices are kept. fill_value When `join="outer"`, this is the value that will be used to fill the introduced indices. By default, sparse arrays are padded with zeros, while dense arrays and DataFrames are padded with missing values. pairwise Whether pairwise elements along the concatenated dimension should be included. This is False by default, since the resulting arrays are often not meaningful. Notes ----- .. warning:: If you use `join='outer'` this fills 0s for sparse data when variables are absent in a batch. Use this with care. Dense data is filled with `NaN`. Examples -------- See :func:`anndata.concat` for the semantics. The following examples highlight the differences this function has. First, let’s get some “big” datasets with a compatible ``var`` axis: >>> import httpx >>> import scanpy as sc >>> base_url = "https://datasets.cellxgene.cziscience.com" >>> def get_cellxgene_data(id_: str): ... out_path = sc.settings.datasetdir / f'{id_}.h5ad' ... if out_path.exists(): ... return out_path ... file_url = f"{base_url}/{id_}.h5ad" ... sc.settings.datasetdir.mkdir(parents=True, exist_ok=True) ... out_path.write_bytes(httpx.get(file_url).content) ... return out_path >>> path_b_cells = get_cellxgene_data('a93eab58-3d82-4b61-8a2f-d7666dcdb7c4') >>> path_fetal = get_cellxgene_data('d170ff04-6da0-4156-a719-f8e1bbefbf53') Now we can concatenate them on-disk: >>> import anndata as ad >>> ad.experimental.concat_on_disk( ... dict(b_cells=path_b_cells, fetal=path_fetal), ... 'merged.h5ad', ... label='dataset', ... ) >>> adata = ad.read_h5ad('merged.h5ad', backed=True) >>> adata.X CSRDataset: backend hdf5, shape (490, 15585), data_dtype float32 >>> adata.obs['dataset'].value_counts() # doctest: +SKIP dataset fetal 344 b_cells 146 Name: count, dtype: int64 """ if len(in_files) == 0: msg = "No objects to concatenate." raise ValueError(msg) # Argument normalization if pairwise: msg = "pairwise concatenation not yet implemented" raise NotImplementedError(msg) merge = resolve_merge_strategy(merge) uns_merge = resolve_merge_strategy(uns_merge) out_file = Path(out_file) if not out_file.parent.exists(): msg = f"Parent directory of {out_file} does not exist." raise FileNotFoundError(msg) if isinstance(in_files, Mapping): if keys is not None: msg = ( "Cannot specify categories in both mapping keys and using `keys`. " "Only specify this once." 
) raise TypeError(msg) keys, in_files = list(in_files.keys()), list(in_files.values()) else: in_files = list(in_files) if len(in_files) == 1: shutil.copy2(in_files[0], out_file) return if keys is None: keys = np.arange(len(in_files)).astype(str) axis, axis_name = _resolve_axis(axis) _, alt_axis_name = _resolve_axis(1 - axis) output_group = as_group(out_file, mode="w") groups = [as_group(f, mode="r") for f in in_files] use_reindexing = False alt_idxs = [_df_index(g[alt_axis_name]) for g in groups] # All {axis_name}_names must be equal if reindexing not applied if not _indices_equal(alt_idxs): use_reindexing = True # All groups must be anndata if not all(g.attrs.get("encoding-type") == "anndata" for g in groups): msg = "All groups must be anndata" raise ValueError(msg) # Write metadata output_group.attrs.update({"encoding-type": "anndata", "encoding-version": "0.1.0"}) # Read the backed objects of Xs Xs = [read_as_backed(g["X"]) for g in groups] # Label column label_col = pd.Categorical.from_codes( np.repeat(np.arange(len(groups)), [x.shape[axis] for x in Xs]), categories=keys, ) # Combining indexes concat_indices = pd.concat( [pd.Series(_df_index(g[axis_name])) for g in groups], ignore_index=True ) if index_unique is not None: concat_indices = concat_indices.str.cat( _map_cat_to_str(label_col), sep=index_unique ) # Resulting indices for {axis_name} and {alt_axis_name} concat_indices = pd.Index(concat_indices) alt_index = merge_indices(alt_idxs, join=join) reindexers = None if use_reindexing: reindexers = [ gen_reindexer(alt_index, alt_old_index) for alt_old_index in alt_idxs ] else: reindexers = [IdentityReindexer()] * len(groups) # Write {axis_name} _write_axis_annot( groups, output_group, axis_name, concat_indices, label, label_col, join ) # Write {alt_axis_name} _write_alt_annot(groups, output_group, alt_axis_name, alt_index, merge) # Write {alt_axis_name}m _write_alt_mapping(groups, output_group, alt_axis_name, alt_index, merge) # Write X _write_concat_arrays( arrays=Xs, output_group=output_group, output_path="X", axis=axis, reindexers=reindexers, fill_value=fill_value, max_loaded_elems=max_loaded_elems, ) # Write Layers and {axis_name}m mapping_names = [ ( f"{axis_name}m", concat_indices, 0, None if use_reindexing else [IdentityReindexer()] * len(groups), ), ("layers", None, axis, reindexers), ] for m, m_index, m_axis, m_reindexers in mapping_names: maps = [read_as_backed(g[m]) for g in groups] _write_concat_mappings( maps, output_group, intersect_keys(maps), m, max_loaded_elems=max_loaded_elems, axis=m_axis, index=m_index, reindexers=m_reindexers, fill_value=fill_value, ) python-anndata-0.12.0~rc1/src/anndata/experimental/multi_files/000077500000000000000000000000001500370632200245275ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/experimental/multi_files/__init__.py000066400000000000000000000001531500370632200266370ustar00rootroot00000000000000from __future__ import annotations from ._anncollection import AnnCollection __all__ = ["AnnCollection"] python-anndata-0.12.0~rc1/src/anndata/experimental/multi_files/_anncollection.py000066400000000000000000001051761500370632200301020ustar00rootroot00000000000000from __future__ import annotations import warnings from collections.abc import Callable, Mapping from functools import reduce from typing import TYPE_CHECKING import numpy as np import pandas as pd from h5py import Dataset from ..._core.aligned_mapping import AxisArrays from ..._core.anndata import AnnData from ..._core.index import _normalize_index, 
_normalize_indices from ..._core.merge import concat_arrays, inner_concat_aligned_mapping from ..._core.sparse_dataset import BaseCompressedSparseDataset from ..._core.views import _resolve_idx from ...compat import _map_cat_to_str, old_positionals if TYPE_CHECKING: from collections.abc import Iterable, Sequence from typing import Literal from ..._core.index import Index ATTRS = ["obs", "obsm", "layers"] def _merge(arrs): rxers = [lambda x, fill_value, axis: x] * len(arrs) return concat_arrays(arrs, rxers) def _select_convert(key, convert, arr=None): key_convert = None if callable(convert): key_convert = convert elif isinstance(convert, dict) and key in convert: key_convert = convert[key] if arr is not None: return key_convert(arr) if key_convert is not None else arr else: return key_convert def _harmonize_types(attrs_keys, adatas): attrs_keys_types = {} def check_type(attr, key=None): arrs = [] for a in adatas: attr_arr = getattr(a, attr) if key is not None: attr_arr = attr_arr[key] arrs.append(attr_arr) # hacky but numpy find_common_type doesn't work with categoricals try: dtype = _merge([arr[:1] for arr in arrs]).dtype except ValueError: dtype = _merge([arr[:1, :1] for arr in arrs]).dtype return dtype for attr, keys in attrs_keys.items(): if len(keys) == 0: continue attrs_keys_types[attr] = {} for key in keys: attrs_keys_types[attr][key] = check_type(attr, key) attrs_keys_types["X"] = check_type("X") return attrs_keys_types class _ConcatViewMixin: def _resolve_idx(self, oidx, vidx): adatas_oidx = [] reverse = None old_oidx = getattr(self, "oidx", None) if old_oidx is not None: oidx = _resolve_idx(old_oidx, oidx, self.limits[-1]) if isinstance(oidx, slice): start, stop, step = oidx.indices(self.limits[-1]) oidx = np.arange(start, stop, step) else: oidx = np.array([oidx]) if isinstance(oidx, int) else oidx u_oidx = oidx if len(self.adatas) == 1: return [u_oidx], oidx, vidx, reverse iter_limits = list(zip([0] + self.limits, self.limits)) n_adatas_used = 0 for lower, upper in iter_limits: if np.any((u_oidx >= lower) & (u_oidx < upper)): n_adatas_used += 1 need_reverse = ( self.indices_strict and n_adatas_used > 1 and u_oidx.size > 1 and np.any(u_oidx[:-1] > u_oidx[1:]) ) if need_reverse: u_oidx, reverse = np.unique(u_oidx, return_inverse=True) for lower, upper in iter_limits: mask = (u_oidx >= lower) & (u_oidx < upper) adatas_oidx.append(u_oidx[mask] - lower if mask.any() else None) old_vidx = getattr(self, "vidx", None) if old_vidx is not None: vidx = _resolve_idx(old_vidx, vidx, self.adatas[0].n_vars) if isinstance(vidx, int): vidx = np.array([vidx]) return adatas_oidx, oidx, vidx, reverse class _IterateViewMixin: @old_positionals("axis", "shuffle", "drop_last") def iterate_axis( self, batch_size: int, *, axis: Literal[0, 1] = 0, shuffle: bool = False, drop_last: bool = False, ): """Iterate the lazy object over an axis. Parameters ---------- batch_size How many samples to put into a batch when iterating. axis The axis to iterate over. shuffle Set to `True` to have the indices reshuffled before iterating. drop_last Set to `True` to drop a batch with the length lower than `batch_size`. """ if axis not in {0, 1}: msg = "Axis should be either 0 or 1." 
raise ValueError(msg) n = self.shape[axis] if shuffle: indices = np.random.permutation(n).tolist() else: indices = list(range(n)) for i in range(0, n, batch_size): idx = indices[i : min(i + batch_size, n)] if axis == 1: batch = self[:, idx] else: batch = self[idx] # only happens if the last batch is smaller than batch_size if len(batch) < batch_size and drop_last: continue yield batch, idx class MapObsView: def __init__( self, attr, adatas, keys, adatas_oidx, adatas_vidx=None, convert=None, reverse=None, dtypes=None, obs_names=None, ): self.adatas = adatas self._keys = keys self.adatas_oidx = adatas_oidx self.adatas_vidx = adatas_vidx self.attr = attr self.convert = convert self.reverse = reverse self.dtypes = dtypes self.obs_names = obs_names def __getitem__(self, key: str, *, use_convert: bool = True): if self._keys is not None and key not in self._keys: msg = f"No {key} in {self.attr} view" raise KeyError(msg) arrs = [] for i, oidx in enumerate(self.adatas_oidx): if oidx is None: continue arr = getattr(self.adatas[i], self.attr)[key] if self.adatas_vidx is not None: vidx = self.adatas_vidx[i] else: vidx = None if vidx is not None: idx = oidx, vidx else: idx = oidx if isinstance(arr, pd.DataFrame): arrs.append(arr.iloc[idx]) else: if vidx is not None: idx = np.ix_(*idx) if not isinstance(idx[1], slice) else idx arrs.append(arr.iloc[idx] if isinstance(arr, pd.Series) else arr[idx]) if len(arrs) > 1: _arr = _merge(arrs) _arr = _arr if self.reverse is None else _arr[self.reverse] else: _arr = arrs[0] # what if it is a dataframe? if self.dtypes is not None: _arr = _arr.astype(self.dtypes[key], copy=False) if self.convert is not None and use_convert: _arr = _select_convert(key, self.convert, _arr) return _arr def keys(self): if self._keys is not None: return self._keys else: return list(getattr(self.adatas[0], self.attr).keys()) @old_positionals("use_convert") def to_dict(self, keys: Iterable[str] | None = None, *, use_convert=True): dct = {} keys = self.keys() if keys is None else keys for key in keys: dct[key] = self.__getitem__(key, use_convert=use_convert) return dct @property def df(self): if self.attr != "obs": return None return pd.DataFrame(self.to_dict(use_convert=False), index=self.obs_names) def __repr__(self): descr = f"View of {self.attr} with keys: {str(self.keys())[1:-1]}" return descr class AnnCollectionView(_ConcatViewMixin, _IterateViewMixin): """\ An object to access the observation attributes of `adatas` in AnnCollection. Created as a result of subsetting an :class:`~anndata.experimental.AnnCollection` object. An object of this class can have `.obs`, `.obsm`, `.layers`, `.X` depending on the results of joins in the reference AnnCollection object. Notes ----- Nothing is copied until keys of the attributes or `.X` are accessed. 
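    Examples
    --------
    An illustrative sketch that mirrors the example in the
    :class:`~anndata.experimental.AnnCollection` docstring; the object names and the
    reported shapes are assumptions, not checked output:

    >>> collection = AnnCollection([adata1, adata2], join_vars='inner')  # doctest: +SKIP
    >>> view = collection[100:200]  # doctest: +SKIP
    >>> view  # doctest: +SKIP
    AnnCollectionView object with n_obs × n_vars = 100 × 208
    >>> view.X.shape  # doctest: +SKIP
    (100, 208)
    >>> view.obs['louvain'][:3]  # lazily merged from the underlying `.obs`  # doctest: +SKIP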
""" def __init__(self, reference, convert, resolved_idx): self.reference = reference self.indices_strict = self.reference.indices_strict self.adatas = self.reference.adatas self.limits = self.reference.limits self.adatas_oidx, self.oidx, self.vidx, self.reverse = resolved_idx self.adatas_vidx = [] for i, vidx in enumerate(self.reference.adatas_vidx): if vidx is None: self.adatas_vidx.append(self.vidx) else: new_vidx = _resolve_idx(vidx, self.vidx, self.adatas[i].n_vars) self.adatas_vidx.append(new_vidx) self._view_attrs_keys = self.reference._view_attrs_keys self._attrs = self.reference._attrs self._dtypes = self.reference._dtypes self._layers_view, self._obsm_view, self._obs_view = None, None, None self._X = None self._convert = None self._convert_X = None self.convert = convert def _lazy_init_attr(self, attr: str, *, set_vidx: bool = False): if getattr(self, f"_{attr}_view") is not None: return keys = None attr_dtypes = None if attr in self._view_attrs_keys: reverse = self.reverse keys = self._view_attrs_keys[attr] if len(keys) == 0: return adatas = self.adatas adatas_oidx = self.adatas_oidx if self._dtypes is not None: attr_dtypes = self._dtypes[attr] else: reverse = None adatas = [self.reference] adatas_oidx = [self.oidx] adatas_vidx = self.adatas_vidx if set_vidx else None attr_convert = None if self.convert is not None: attr_convert = _select_convert(attr, self.convert) if attr == "obs": obs_names = self.obs_names else: obs_names = None setattr( self, f"_{attr}_view", MapObsView( attr, adatas, keys, adatas_oidx, adatas_vidx, attr_convert, reverse, attr_dtypes, obs_names, ), ) def _gather_X(self): if self._X is not None: return self._X Xs = [] for i, oidx in enumerate(self.adatas_oidx): if oidx is None: continue adata = self.adatas[i] X = adata.X vidx = self.adatas_vidx[i] if isinstance(X, Dataset): reverse = None if oidx.size > 1 and np.any(oidx[:-1] >= oidx[1:]): oidx, reverse = np.unique(oidx, return_inverse=True) if isinstance(vidx, slice): arr = X[oidx, vidx] else: # this is a very memory inefficient approach # todo: fix arr = X[oidx][:, vidx] Xs.append(arr if reverse is None else arr[reverse]) elif isinstance(X, BaseCompressedSparseDataset): # very slow indexing with two arrays if isinstance(vidx, slice) or len(vidx) <= 1000: Xs.append(X[oidx, vidx]) else: Xs.append(X[oidx][:, vidx]) else: # if vidx is present it is less memory efficient idx = oidx, vidx idx = np.ix_(*idx) if not isinstance(vidx, slice) else idx Xs.append(X[idx]) if len(Xs) > 1: _X = _merge(Xs) # todo: get rid of reverse for dense arrays _X = _X if self.reverse is None else _X[self.reverse] else: _X = Xs[0] if self._dtypes is not None: _X = _X.astype(self._dtypes["X"], copy=False) self._X = _X return _X @property def X(self): """Lazy subset of data matrix. The data matrix formed from the `.X` attributes of the underlying `adatas`, properly reindexed and lazily merged. Nothing is copied until `.X` is accessed, no real concatenation of the underlying `.X` attributes is done. """ # inconsistent behavior here, _X can be changed, # but the other attributes can't be changed. # maybe do return ... _X.copy() or _X.setflags(write=False) _X = self._gather_X() return self._convert_X(_X) if self._convert_X is not None else _X @property def layers(self): """Lazy subset of layers. The layers attribute formed from lazy inner join and subsetting of the `.layers` of the underlying `adatas`. No copy is made until you access a key from `.layers`, only the subset of the accessed key is copied. 
        To get `.layers` as a dictionary, use `.layers.to_dict()`. You can also specify keys
        to include in the dict `.layers.to_dict(keys=['key1', 'key2'])` and if you want
        converters to be turned off when copying to dict `.layers.to_dict(use_convert=False)`.
        """
        self._lazy_init_attr("layers", set_vidx=True)
        return self._layers_view

    @property
    def obsm(self):
        """Lazy subset of multi-dimensional annotation of observations.

        Points either to the `.obsm` attributes of the underlying adatas or to `.obsm`
        of the parent AnnCollection object, depending on the `join_obsm` option
        of the AnnCollection object.
        See the docs of :class:`~anndata.experimental.AnnCollection` for details.
        Copy rules are the same as for `.layers`, i.e. everything is lazy.

        To get `.obsm` as a dictionary, use `.obsm.to_dict()`. You can also specify keys
        to include in the dict `.obsm.to_dict(keys=['key1', 'key2'])` and if you want
        converters to be turned off when copying to dict `.obsm.to_dict(use_convert=False)`.
        """
        self._lazy_init_attr("obsm")
        return self._obsm_view

    @property
    def obs(self):
        """Lazy subset of one-dimensional annotation of observations.

        Points either to the `.obs` attributes of the underlying adatas or to `.obs`
        of the parent AnnCollection object, depending on the `join_obs` option
        of the AnnCollection object.
        See the docs of :class:`~anndata.experimental.AnnCollection` for details.
        Copy rules are the same as for `.layers`, i.e. everything is lazy.

        To get `.obs` as a DataFrame, use `.obs.df`.
        To get `.obs` as a dictionary, use `.obs.to_dict()`. You can also specify keys
        to include in the dict `.obs.to_dict(keys=['key1', 'key2'])` and if you want
        converters to be turned off when copying to dict `.obs.to_dict(use_convert=False)`.
        """
        self._lazy_init_attr("obs")
        return self._obs_view

    @property
    def obs_names(self):
        """Names of observations of this subset object."""
        return self.reference.obs_names[self.oidx]

    @property
    def var_names(self):
        """Names of variables of this subset object."""
        return self.reference.var_names[self.vidx]

    @property
    def shape(self):
        """Shape of the lazily concatenated subset of the data matrix."""
        return len(self.obs_names), len(self.var_names)

    @property
    def n_obs(self):
        """Number of observations."""
        return self.shape[0]

    @property
    def n_vars(self):
        """Number of variables/features."""
        return self.shape[1]

    @property
    def convert(self):
        """On the fly converters for keys of attributes and data matrix.

        A function or a Mapping of functions which will be applied
        to the values of attributes (`.X`) or to specific keys of these attributes
        (`.obs`, `.obsm`, `.layers`).
        The keys of the Mapping should correspond to the attributes or keys of the
        attributes (hierarchically) and the values should be functions used for conversion.
        Examples
        --------
        ::

            {
                # densify .X
                "X": lambda a: a.toarray() if issparse(a) else a,
                # change dtype for all keys of .obsm
                "obsm": lambda a: np.asarray(a, dtype="float32"),
                # change type only for one key of .obs
                "obs": dict(key1=lambda c: c.astype(str)),
            }
        """
        return self._convert

    @convert.setter
    def convert(self, value):
        self._convert = value
        self._convert_X = _select_convert("X", self._convert)
        for attr in ATTRS:
            setattr(self, f"_{attr}_view", None)

    def __len__(self):
        return len(self.obs_names)

    def __getitem__(self, index: Index):
        oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names)
        resolved_idx = self._resolve_idx(oidx, vidx)
        return AnnCollectionView(self.reference, self.convert, resolved_idx)

    @property
    def has_backed(self):
        """`True` if the current subset of `adatas` has backed objects, `False` otherwise."""
        for i, adata in enumerate(self.adatas):
            if adata.isbacked and self.adatas_oidx[i] is not None:
                return True
        return False

    def __repr__(self):
        n_obs, n_vars = self.shape
        descr = f"AnnCollectionView object with n_obs × n_vars = {n_obs} × {n_vars}"
        all_attrs_keys = self._view_attrs_keys.copy()
        for attr in self._attrs:
            all_attrs_keys[attr] = list(getattr(self.reference, attr).keys())
        for attr, keys in all_attrs_keys.items():
            if len(keys) > 0:
                descr += f"\n    {attr}: {str(keys)[1:-1]}"
        return descr

    @old_positionals("ignore_X", "ignore_layers")
    def to_adata(self, *, ignore_X: bool = False, ignore_layers: bool = False):
        """Convert this AnnCollectionView object to an AnnData object.

        Parameters
        ----------
        ignore_X
            If `True`, `.X` is not copied to the resulting AnnData object.
        ignore_layers
            If `True`, `.layers` are not copied to the resulting AnnData object.
        """
        if ignore_layers or self.layers is None:
            layers = None
        else:
            layers = self.layers.to_dict(use_convert=False)
        obsm = None if self.obsm is None else self.obsm.to_dict(use_convert=False)
        obs = (
            None
            if self.obs is None
            else pd.DataFrame(self.obs.to_dict(use_convert=False))
        )

        if ignore_X:
            X = None
            shape = self.shape
        else:
            X = self._gather_X()
            shape = None

        adata = AnnData(X, obs=obs, obsm=obsm, layers=layers, shape=shape)
        adata.obs_names = self.obs_names
        adata.var_names = self.var_names
        return adata

    @property
    def attrs_keys(self):
        """Dict of all accessible attributes and their keys."""
        return self.reference.attrs_keys


DictCallable = dict[str, Callable]
ConvertType = Callable | dict[str, Callable | DictCallable]


class AnnCollection(_ConcatViewMixin, _IterateViewMixin):
    """\
    Lazily concatenate AnnData objects along the `obs` axis.

    This class doesn't copy data from underlying AnnData objects,
    but lazily subsets using a joint index of observations and variables.
    It also allows on-the-fly application of prespecified converters
    to `.obs` attributes of the AnnData objects.

    Subsetting of this object returns an `AnnCollectionView`,
    which provides views of `.obs`, `.obsm`, `.layers`, `.X`
    from the underlying AnnData objects.

    Parameters
    ----------
    adatas
        The objects to be lazily concatenated.
        If a Mapping is passed, keys are used for the `keys` argument
        and values are concatenated.
    join_obs
        If "inner" specified all `.obs` attributes from `adatas` will be inner joined
        and copied to this object.
        If "outer" specified all `.obs` attributes from `adatas` will be outer joined
        and copied to this object.
        For "inner" and "outer" subset objects will access `.obs` of this object,
        not the original `.obs` attributes of `adatas`.
If `None`, nothing is copied to this object's `.obs`, a subset object will directly access `.obs` attributes of `adatas` (with proper reindexing and dtype conversions). For `None`the inner join rule is used to select columns of `.obs` of `adatas`. join_obsm If "inner" specified all `.obsm` attributes from `adatas` will be inner joined and copied to this object. Subset objects will access `.obsm` of this object, not the original `.obsm` attributes of `adatas`. If `None`, nothing is copied to this object's `.obsm`, a subset object will directly access `.obsm` attributes of `adatas` (with proper reindexing and dtype conversions). For both options the inner join rule for the underlying `.obsm` attributes is used. join_vars Specify how to join `adatas` along the var axis. If `None`, assumes all `adatas` have the same variables. If "inner", the intersection of all variables in `adatas` will be used. label Column in `.obs` to place batch information in. If it's None, no column is added. keys Names for each object being added. These values are used for column values for `label` or appended to the index if `index_unique` is not `None`. Defaults to incrementing integer labels. index_unique Whether to make the index unique by using the keys. If provided, this is the delimiter between "{orig_idx}{index_unique}{key}". When `None`, the original indices are kept. convert You can pass a function or a Mapping of functions which will be applied to the values of attributes (`.obs`, `.obsm`, `.layers`, `.X`) or to specific keys of these attributes in the subset object. Specify an attribute and a key (if needed) as keys of the passed Mapping and a function to be applied as a value. harmonize_dtypes If `True`, all retrieved arrays from subset objects will have the same dtype. indices_strict If `True`, arrays from the subset objects will always have the same order of indices as in selection used to subset. This parameter can be set to `False` if the order in the returned arrays is not important, for example, when using them for stochastic gradient descent. In this case the performance of subsetting can be a bit better. Examples ---------- >>> from scanpy.datasets import pbmc68k_reduced, pbmc3k_processed >>> adata1, adata2 = pbmc68k_reduced(), pbmc3k_processed() >>> adata1.shape (700, 765) >>> adata2.shape (2638, 1838) >>> dc = AnnCollection([adata1, adata2], join_vars='inner') >>> dc AnnCollection object with n_obs × n_vars = 3338 × 208 constructed from 2 AnnData objects view of obsm: 'X_pca', 'X_umap' obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain' >>> batch = dc[100:200] # AnnCollectionView >>> batch AnnCollectionView object with n_obs × n_vars = 100 × 208 obsm: 'X_pca', 'X_umap' obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain' >>> batch.X.shape (100, 208) >>> len(batch.obs['louvain']) 100 """ @old_positionals( "join_obs", "join_obsm", "join_vars", "label", "keys", "index_unique", "convert", "harmonize_dtypes", "indices_strict", ) def __init__( self, adatas: Sequence[AnnData] | dict[str, AnnData], *, join_obs: Literal["inner", "outer"] | None = "inner", join_obsm: Literal["inner"] | None = None, join_vars: Literal["inner"] | None = None, label: str | None = None, keys: Sequence[str] | None = None, index_unique: str | None = None, convert: ConvertType | None = None, harmonize_dtypes: bool = True, indices_strict: bool = True, ): if isinstance(adatas, Mapping): if keys is not None: msg = ( "Cannot specify categories in both mapping keys and using `keys`. " "Only specify this once." 
) raise TypeError(msg) keys, adatas = list(adatas.keys()), list(adatas.values()) else: adatas = list(adatas) # check if the variables are the same in all adatas self.adatas_vidx = [None for adata in adatas] vars_names_list = [adata.var_names for adata in adatas] vars_eq = all([adatas[0].var_names.equals(vrs) for vrs in vars_names_list[1:]]) if vars_eq: self.var_names = adatas[0].var_names elif join_vars == "inner": var_names = reduce(pd.Index.intersection, vars_names_list) self.adatas_vidx = [] for adata in adatas: if var_names.equals(adata.var_names): self.adatas_vidx.append(None) else: adata_vidx = _normalize_index(var_names, adata.var_names) self.adatas_vidx.append(adata_vidx) self.var_names = var_names else: msg = ( "Adatas have different variables. " "Please specify join_vars='inner' for intersection." ) raise ValueError(msg) concat_indices = pd.concat( [pd.Series(a.obs_names) for a in adatas], ignore_index=True ) if keys is None: keys = np.arange(len(adatas)).astype(str) label_col = pd.Categorical.from_codes( np.repeat(np.arange(len(adatas)), [a.shape[0] for a in adatas]), categories=keys, ) if index_unique is not None: concat_indices = concat_indices.str.cat( _map_cat_to_str(label_col), sep=index_unique ) self.obs_names = pd.Index(concat_indices) if not self.obs_names.is_unique: warnings.warn("Observation names are not unique.", UserWarning) view_attrs = ATTRS.copy() self._attrs = [] # process obs joins if join_obs is not None: view_attrs.remove("obs") self._attrs.append("obs") concat_annot = pd.concat( [a.obs for a in adatas], join=join_obs, ignore_index=True ) concat_annot.index = self.obs_names self._obs = concat_annot else: self._obs = pd.DataFrame(index=self.obs_names) if label is not None: self._obs[label] = label_col # process obsm inner join self._obsm = None if join_obsm == "inner": view_attrs.remove("obsm") self._attrs.append("obsm") self._obsm = inner_concat_aligned_mapping( [a.obsm for a in adatas], index=self.obs_names ) self._obsm = ( AxisArrays(self, axis=0, store={}) if self._obsm == {} else self._obsm ) # process inner join of views self._view_attrs_keys = {} for attr in view_attrs: self._view_attrs_keys[attr] = list(getattr(adatas[0], attr).keys()) for a in adatas[1:]: for attr, keys in self._view_attrs_keys.items(): ai_attr = getattr(a, attr) a0_attr = getattr(adatas[0], attr) new_keys = [] for key in keys: if key in ai_attr.keys(): a0_ashape = a0_attr[key].shape ai_ashape = ai_attr[key].shape if ( len(a0_ashape) < 2 or a0_ashape[1] == ai_ashape[1] or attr == "layers" ): new_keys.append(key) self._view_attrs_keys[attr] = new_keys self.adatas = adatas self.limits = [adatas[0].n_obs] for i in range(len(adatas) - 1): self.limits.append(self.limits[i] + adatas[i + 1].n_obs) # init converter self._convert = convert self._dtypes = None if len(adatas) > 1 and harmonize_dtypes: self._dtypes = _harmonize_types(self._view_attrs_keys, self.adatas) self.indices_strict = indices_strict def __getitem__(self, index: Index): oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names) resolved_idx = self._resolve_idx(oidx, vidx) return AnnCollectionView(self, self.convert, resolved_idx) @property def convert(self): """On the fly converters for keys of attributes and data matrix. A function or a Mapping of functions which will be applied to the values of attributes (`.X`) or to specific keys of these attributes (`.obs`, `.obsm`, `.layers`) of subset objects. 
The converters are not applied to `.obs` and `.obsm` (if present) of this object, only to the attributes of subset objects. The keys of the Mapping should correspond to the attributes or keys of the attributes (hierarchically) and the values should be functions used for conversion. Examples -------- :: { # densify .X "X": lambda a: a.toarray() if issparse(a) else a, # change dtype for all keys of .obsm "obsm": lambda a: np.asarray(a, dtype="float32"), # change type only for one key of .obs "obs": dict(key1=lambda c: c.astype(str)), } """ return self._convert @convert.setter def convert(self, value): self._convert = value @property def obs(self): """One-dimensional annotation of observations. If `join_obs` was set to "inner" and "outer", subset objects' `.obs` will point to this `.obs`; otherwise, to `.obs` of the underlying objects (`adatas`). """ return self._obs @property def obsm(self): """Multi-dimensional annotation of observations. If `join_obsm` was set to "inner", subset objects' `.obsm` will point to this `.obsm`; otherwise, to `.obsm` of the underlying objects (`adatas`). In the latter case, `.obsm` of this object will be `None`. """ return self._obsm @property def shape(self): """Shape of the lazily concatenated data matrix""" return self.limits[-1], len(self.var_names) @property def n_obs(self): """Number of observations.""" return self.shape[0] @property def n_vars(self): """Number of variables/features.""" return self.shape[1] def __len__(self): return self.limits[-1] def to_adata(self): """Convert this AnnCollection object to an AnnData object. The AnnData object won't have `.X`, only `.obs` and `.obsm`. """ if "obs" in self._view_attrs_keys or "obsm" in self._view_attrs_keys: concat_view = self[self.obs_names] if "obsm" in self._view_attrs_keys: obsm = ( concat_view.obsm.to_dict(use_convert=False) if concat_view.obsm is not None else None ) else: obsm = self.obsm.copy() obs = self.obs.copy() if "obs" in self._view_attrs_keys and concat_view.obs is not None: for key, value in concat_view.obs.to_dict(use_convert=False).items(): obs[key] = value adata = AnnData(X=None, obs=obs, obsm=obsm, shape=self.shape) adata.obs_names = self.obs_names adata.var_names = self.var_names return adata def lazy_attr(self, attr, key=None): """Get a subsettable key from an attribute (array-like) or an attribute. Returns a LazyAttrData object which provides subsetting over the specified attribute (`.obs` or `.obsm`) or over a key from this attribute. In the latter case, it acts as a lazy array. 
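        Example (an illustrative sketch; the `'X_pca'` key comes from the class-level
        example above, while the number of components is an assumption):

        >>> pca = dc.lazy_attr('obsm', 'X_pca')  # doctest: +SKIP
        >>> pca.shape  # doctest: +SKIP
        (3338, 50)
        >>> pca[100:200].shape  # doctest: +SKIP
        (100, 50)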
""" return LazyAttrData(self, attr, key) @property def has_backed(self): """`True` if `adatas` have backed AnnData objects, `False` otherwise.""" return any([adata.isbacked for adata in self.adatas]) @property def attrs_keys(self): """Dict of all accessible attributes and their keys.""" _attrs_keys = {} for attr in self._attrs: keys = list(getattr(self, attr).keys()) _attrs_keys[attr] = keys _attrs_keys.update(self._view_attrs_keys) return _attrs_keys def __repr__(self): n_obs, n_vars = self.shape descr = f"AnnCollection object with n_obs × n_vars = {n_obs} × {n_vars}" descr += f"\n constructed from {len(self.adatas)} AnnData objects" for attr, keys in self._view_attrs_keys.items(): if len(keys) > 0: descr += f"\n view of {attr}: {str(keys)[1:-1]}" for attr in self._attrs: keys = list(getattr(self, attr).keys()) if len(keys) > 0: descr += f"\n {attr}: {str(keys)[1:-1]}" if "obs" in self._view_attrs_keys: keys = list(self.obs.keys()) if len(keys) > 0: descr += f"\n own obs: {str(keys)[1:-1]}" return descr class LazyAttrData(_IterateViewMixin): def __init__(self, adset: AnnCollection, attr: str, key: str | None = None): self.adset = adset self.attr = attr self.key = key def __getitem__(self, index): oidx = None vidx = None if isinstance(index, tuple) and self.attr in {"obs", "obsm"}: oidx = index[0] if len(index) > 1: vidx = index[1] if oidx is None: view = self.adset[index] else: view = self.adset[oidx] attr_arr = getattr(view, self.attr) if self.key is not None: attr_arr = attr_arr[self.key] return attr_arr if vidx is None else attr_arr[:, vidx] @property def shape(self): shape = self.adset.shape if self.attr in {"X", "layers"}: return shape elif self.attr == "obs": return (shape[0],) elif self.attr == "obsm" and self.key is not None: return shape[0], self[:1].shape[1] else: return None @property def ndim(self): return len(self.shape) if self.shape is not None else 0 @property def dtype(self): _dtypes = self.adset._dtypes if _dtypes is not None and self.attr in _dtypes: return _dtypes[self.attr][self.key] attr = self[:1] if hasattr(attr, "dtype"): return attr.dtype else: return None python-anndata-0.12.0~rc1/src/anndata/experimental/pytorch/000077500000000000000000000000001500370632200237035ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/experimental/pytorch/__init__.py000066400000000000000000000001371500370632200260150ustar00rootroot00000000000000from __future__ import annotations from ._annloader import AnnLoader __all__ = ["AnnLoader"] python-anndata-0.12.0~rc1/src/anndata/experimental/pytorch/_annloader.py000066400000000000000000000177451500370632200263750ustar00rootroot00000000000000from __future__ import annotations from collections.abc import Mapping from copy import copy from functools import partial from importlib.util import find_spec from math import ceil from typing import TYPE_CHECKING import numpy as np from scipy.sparse import issparse from ..._core.anndata import AnnData from ...compat import old_positionals from ..multi_files._anncollection import AnnCollection, _ConcatViewMixin if find_spec("torch") or TYPE_CHECKING: import torch from torch.utils.data import BatchSampler, DataLoader, Sampler else: Sampler, BatchSampler, DataLoader = object, object, object if TYPE_CHECKING: from collections.abc import Callable, Generator, Sequence from typing import TypeAlias, Union from scipy.sparse import spmatrix # need to use Union because of autodoc_mock_imports Array: TypeAlias = Union[torch.Tensor, np.ndarray, spmatrix] # noqa: UP007 # Custom sampler to get proper 
batches instead of joined separate indices # maybe move to multi_files class BatchIndexSampler(Sampler): @old_positionals("batch_size", "shuffle", "drop_last") def __init__( self, n_obs: int, *, batch_size: int, shuffle: bool = False, drop_last: bool = False, ) -> None: self.n_obs = n_obs self.batch_size = batch_size if batch_size < n_obs else n_obs self.shuffle = shuffle self.drop_last = drop_last def __iter__(self) -> Generator[list[int], None, None]: indices: list[int] if self.shuffle: indices = np.random.permutation(self.n_obs).tolist() else: indices = list(range(self.n_obs)) for i in range(0, self.n_obs, self.batch_size): batch = indices[i : min(i + self.batch_size, self.n_obs)] # only happens if the last batch is smaller than batch_size if len(batch) < self.batch_size and self.drop_last: continue yield batch def __len__(self) -> int: if self.drop_last: length = self.n_obs // self.batch_size else: length = ceil(self.n_obs / self.batch_size) return length # maybe replace use_cuda with explicit device option def default_converter(arr: Array, *, use_cuda: bool, pin_memory: bool): if isinstance(arr, torch.Tensor): if use_cuda: arr = arr.cuda() elif pin_memory: arr = arr.pin_memory() elif arr.dtype.name != "category" and np.issubdtype(arr.dtype, np.number): if issparse(arr): arr = arr.toarray() if use_cuda: arr = torch.tensor(arr, device="cuda") else: arr = torch.tensor(arr) arr = arr.pin_memory() if pin_memory else arr return arr def _convert_on_top( convert: Callable[[Array], Array] | None | Mapping[str, Callable[[Array], Array]], top_convert: Callable[[Array], Array], attrs_keys: Sequence[str] | Mapping[str, Sequence[str]], ): if convert is None: new_convert = top_convert elif callable(convert): def compose_convert(arr): return top_convert(convert(arr)) new_convert = compose_convert else: new_convert = {} for attr in attrs_keys: if attr not in convert: new_convert[attr] = top_convert else: as_ks: Sequence[str] | None if not isinstance(attrs_keys, Mapping): as_ks = None else: as_ks = attrs_keys[attr] new_convert[attr] = _convert_on_top(convert[attr], top_convert, as_ks) return new_convert # AnnLoader has the same arguments as DataLoader, but uses BatchIndexSampler by default class AnnLoader(DataLoader): """\ PyTorch DataLoader for AnnData objects. Builds DataLoader from a sequence of AnnData objects, from an :class:`~anndata.experimental.AnnCollection` object or from an `AnnCollectionView` object. Takes care of the required conversions. Parameters ---------- adatas `AnnData` objects or an `AnnCollection` object from which to load the data. batch_size How many samples per batch to load. shuffle Set to `True` to have the data reshuffled at every epoch. use_default_converter Use the default converter to convert arrays to pytorch tensors, transfer to the default cuda device (if `use_cuda=True`), do memory pinning (if `pin_memory=True`). If you pass an AnnCollection object with prespecified converters, the default converter won't overwrite these converters but will be applied on top of them. use_cuda Transfer pytorch tensors to the default cuda device after conversion. Only works if `use_default_converter=True` **kwargs Arguments for PyTorch DataLoader. If `adatas` is not an `AnnCollection` object, then also arguments for `AnnCollection` initialization. 
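    Examples
    --------
    A minimal sketch (assumes PyTorch is installed and that `adata1` and `adata2`
    share their variables; the `.obs` column name used below is an assumption):

    >>> dataloader = AnnLoader([adata1, adata2], batch_size=128, shuffle=True)  # doctest: +SKIP
    >>> for batch in dataloader:  # doctest: +SKIP
    ...     x = batch.X  # converted to a torch.Tensor by the default converter
    ...     labels = batch.obs['cell_type']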
""" @old_positionals("batch_size", "shuffle", "use_default_converter", "use_cuda") def __init__( self, adatas: Sequence[AnnData] | dict[str, AnnData], *, batch_size: int = 1, shuffle: bool = False, use_default_converter: bool = True, use_cuda: bool = False, **kwargs, ): if isinstance(adatas, AnnData): adatas = [adatas] if ( isinstance(adatas, list) or isinstance(adatas, tuple) or isinstance(adatas, dict) ): join_obs = kwargs.pop("join_obs", "inner") join_obsm = kwargs.pop("join_obsm", None) label = kwargs.pop("label", None) keys = kwargs.pop("keys", None) index_unique = kwargs.pop("index_unique", None) convert = kwargs.pop("convert", None) harmonize_dtypes = kwargs.pop("harmonize_dtypes", True) indices_strict = kwargs.pop("indices_strict", True) dataset = AnnCollection( adatas, join_obs=join_obs, join_obsm=join_obsm, label=label, keys=keys, index_unique=index_unique, convert=convert, harmonize_dtypes=harmonize_dtypes, indices_strict=indices_strict, ) elif isinstance(adatas, _ConcatViewMixin): dataset = copy(adatas) else: msg = "adata should be of type AnnData or AnnCollection." raise ValueError(msg) if use_default_converter: pin_memory = kwargs.pop("pin_memory", False) _converter = partial( default_converter, use_cuda=use_cuda, pin_memory=pin_memory ) dataset.convert = _convert_on_top( dataset.convert, _converter, dict(dataset.attrs_keys, X=[]) ) has_sampler = "sampler" in kwargs has_batch_sampler = "batch_sampler" in kwargs has_worker_init_fn = ( "worker_init_fn" in kwargs and kwargs["worker_init_fn"] is not None ) has_workers = "num_workers" in kwargs and kwargs["num_workers"] > 0 use_parallel = has_worker_init_fn or has_workers if ( batch_size is not None and batch_size > 1 and not has_batch_sampler and not use_parallel ): drop_last = kwargs.pop("drop_last", False) if has_sampler: sampler = kwargs.pop("sampler") sampler = BatchSampler( sampler, batch_size=batch_size, drop_last=drop_last ) else: sampler = BatchIndexSampler( len(dataset), batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, ) super().__init__(dataset, batch_size=None, sampler=sampler, **kwargs) else: super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs) python-anndata-0.12.0~rc1/src/anndata/io.py000066400000000000000000000012721500370632200205010ustar00rootroot00000000000000from __future__ import annotations from ._core.sparse_dataset import sparse_dataset from ._io.h5ad import read_h5ad, write_h5ad from ._io.read import ( read_csv, read_excel, read_hdf, read_loom, read_mtx, read_text, read_umi_tools, ) from ._io.specs import read_elem, write_elem from ._io.write import write_csvs, write_loom from ._io.zarr import read_zarr, write_zarr __all__ = [ "read_csv", "read_excel", "read_h5ad", "read_hdf", "read_loom", "read_mtx", "read_text", "read_umi_tools", "read_zarr", "write_csvs", "write_h5ad", "write_loom", "write_zarr", "write_elem", "read_elem", "sparse_dataset", ] python-anndata-0.12.0~rc1/src/anndata/logging.py000066400000000000000000000031761500370632200215250ustar00rootroot00000000000000from __future__ import annotations import logging import os from .compat import old_positionals _previous_memory_usage = None anndata_logger = logging.getLogger("anndata") # Don’t pass log messages on to logging.root and its handler anndata_logger.propagate = False anndata_logger.addHandler(logging.StreamHandler()) # Logs go to stderr anndata_logger.handlers[-1].setFormatter(logging.Formatter("%(message)s")) anndata_logger.handlers[-1].setLevel("INFO") def get_logger(name: str) -> logging.Logger: """\ 
Creates a child logger that delegates to anndata_logger instead to logging.root """ return anndata_logger.manager.getLogger(name) def get_memory_usage() -> tuple[float, float]: import psutil process = psutil.Process(os.getpid()) try: meminfo = process.memory_info() except AttributeError: meminfo = process.get_memory_info() mem = meminfo[0] / 2**30 # output in GB mem_diff = mem global _previous_memory_usage # noqa: PLW0603 if _previous_memory_usage is not None: mem_diff = mem - _previous_memory_usage _previous_memory_usage = mem return mem, mem_diff @old_positionals("newline") def format_memory_usage( mem_usage: tuple[float, float], msg: str = "", *, newline: bool = False ): nl = "\n" if newline else "" more = " \n... " if msg != "" else "" mem, diff = mem_usage return ( f"{nl}{msg}{more}Memory usage: current {mem:.2f} GB, difference {diff:+.2f} GB" ) @old_positionals("newline") def print_memory_usage(msg: str = "", *, newline: bool = False): print(format_memory_usage(get_memory_usage(), msg, newline)) python-anndata-0.12.0~rc1/src/anndata/tests/000077500000000000000000000000001500370632200206605ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/tests/__init__.py000066400000000000000000000000001500370632200227570ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/anndata/tests/helpers.py000066400000000000000000001055211500370632200227000ustar00rootroot00000000000000from __future__ import annotations import itertools import random import re import warnings from collections import Counter, defaultdict from collections.abc import Mapping from contextlib import contextmanager from functools import partial, singledispatch, wraps from string import ascii_letters from typing import TYPE_CHECKING import h5py import numpy as np import pandas as pd import pytest from pandas.api.types import is_numeric_dtype from scipy import sparse import anndata from anndata import AnnData, ExperimentalFeatureWarning, Raw from anndata._core.aligned_mapping import AlignedMappingBase from anndata._core.sparse_dataset import BaseCompressedSparseDataset from anndata._core.views import ArrayView from anndata.compat import ( AwkArray, CSArray, CSMatrix, CupyArray, CupyCSCMatrix, CupyCSRMatrix, CupySparseMatrix, DaskArray, ZarrArray, is_zarr_v2, ) from anndata.utils import asarray if TYPE_CHECKING: from collections.abc import Callable, Collection, Iterable from typing import Literal, TypeGuard, TypeVar from zarr.abc.store import ByteRequest from zarr.core.buffer import BufferPrototype from .._types import ArrayStorageType DT = TypeVar("DT") try: from pandas.core.arrays.integer import IntegerDtype except ImportError: IntegerDtype = ( *(pd.Int8Dtype, pd.Int16Dtype, pd.Int32Dtype, pd.Int64Dtype), *(pd.UInt8Dtype, pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype), ) # Give this to gen_adata when dask array support is expected. 
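# It widens `obsm_types`, `varm_types`, and `layers_types` to include `DaskArray`
# in addition to the usual container types.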
GEN_ADATA_DASK_ARGS = dict( obsm_types=( sparse.csr_matrix, np.ndarray, pd.DataFrame, DaskArray, sparse.csr_array, ), varm_types=( sparse.csr_matrix, np.ndarray, pd.DataFrame, DaskArray, sparse.csr_array, ), layers_types=( sparse.csr_matrix, np.ndarray, pd.DataFrame, DaskArray, sparse.csr_array, ), ) DEFAULT_KEY_TYPES = ( sparse.csr_matrix, np.ndarray, pd.DataFrame, sparse.csr_array, ) DEFAULT_COL_TYPES = ( pd.CategoricalDtype(ordered=False), pd.CategoricalDtype(ordered=True), np.int64, np.float64, np.uint8, np.bool_, pd.BooleanDtype, pd.Int32Dtype, ) def gen_vstr_recarray(m, n, dtype=None): size = m * n lengths = np.random.randint(3, 5, size) letters = np.array(list(ascii_letters)) gen_word = lambda l: "".join(np.random.choice(letters, l)) arr = np.array([gen_word(l) for l in lengths]).reshape(m, n) return pd.DataFrame(arr, columns=[gen_word(5) for i in range(n)]).to_records( index=False, column_dtypes=dtype ) def issubdtype( a: np.dtype | pd.api.extensions.ExtensionDtype | type, b: type[DT] | tuple[type[DT], ...], ) -> TypeGuard[DT]: if isinstance(b, tuple): return any(issubdtype(a, t) for t in b) if isinstance(a, type) and issubclass(a, pd.api.extensions.ExtensionDtype): return issubclass(a, b) if isinstance(a, pd.api.extensions.ExtensionDtype): return isinstance(a, b) try: return np.issubdtype(a, b) except TypeError: # pragma: no cover pytest.fail(f"issubdtype can’t handle everything yet: {a} {b}") def gen_random_column( n: int, dtype: np.dtype | pd.api.extensions.ExtensionDtype ) -> tuple[str, np.ndarray | pd.api.extensions.ExtensionArray]: if issubdtype(dtype, pd.CategoricalDtype): # TODO: Think about allowing index to be passed for n letters = np.fromiter(iter(ascii_letters), "U1") if n > len(letters): letters = letters[: n // 2] # Make sure categories are repeated key = "cat" if dtype.ordered else "cat_unordered" return key, pd.Categorical(np.random.choice(letters, n), dtype=dtype) if issubdtype(dtype, pd.BooleanDtype): return ( "nullable-bool", pd.arrays.BooleanArray( np.random.randint(0, 2, size=n, dtype=bool), mask=np.random.randint(0, 2, size=n, dtype=bool), ), ) if issubdtype(dtype, IntegerDtype): return ( "nullable-int", pd.arrays.IntegerArray( np.random.randint(0, 1000, size=n, dtype=np.int32), mask=np.random.randint(0, 2, size=n, dtype=bool), ), ) if issubdtype(dtype, pd.StringDtype): letters = np.fromiter(iter(ascii_letters), "U1") array = pd.array(np.random.choice(letters, n), dtype=pd.StringDtype()) array[np.random.randint(0, 2, size=n, dtype=bool)] = pd.NA return "string", array # if issubdtype(dtype, pd.DatetimeTZDtype): # return "datetime", pd.to_datetime(np.random.randint(0, 1000, size=n)) if issubdtype(dtype, np.bool_): return "bool", np.random.randint(0, 2, size=n, dtype=dtype) if not issubdtype(dtype, np.number): # pragma: no cover pytest.fail(f"Unexpected dtype: {dtype}") n_bits = 8 * (dtype().itemsize if isinstance(dtype, type) else dtype.itemsize) if issubdtype(dtype, np.unsignedinteger): return f"uint{n_bits}", np.random.randint(0, 255, n, dtype=dtype) if issubdtype(dtype, np.signedinteger): return f"int{n_bits}", np.random.randint(-50, 50, n, dtype=dtype) if issubdtype(dtype, np.floating): return f"float{n_bits}", np.random.random(n).astype(dtype) pytest.fail(f"Unexpected numeric dtype: {dtype}") # pragma: no cover def gen_typed_df( n: int, index: pd.Index[str] | None = None, dtypes: Collection[np.dtype | pd.api.extensions.ExtensionDtype] = DEFAULT_COL_TYPES, ): columns = [gen_random_column(n, dtype) for dtype in dtypes] col_names = [n for n, _ in columns] 
assert len(col_names) == len(set(col_names)), "Duplicate column names generated!" return pd.DataFrame(dict(columns), index=index) def _gen_awkward_inner(shape, rng, dtype): # the maximum length a ragged dimension can take MAX_RAGGED_DIM_LEN = 20 if not len(shape): # abort condition -> no dimension left, return an actual value instead return dtype(rng.randrange(1000)) else: curr_dim_len = shape[0] lil = [] if curr_dim_len is None: # ragged dimension, set random length curr_dim_len = rng.randrange(MAX_RAGGED_DIM_LEN) for _ in range(curr_dim_len): lil.append(_gen_awkward_inner(shape[1:], rng, dtype)) return lil def gen_awkward(shape, dtype=np.int32): """Function to generate an awkward array with random values. Awkward array dimensions can either be fixed-length ("regular") or variable length ("ragged") (the first dimension is always fixed-length). Parameters ---------- shape shape of the array to be generated. Any dimension specified as `None` will be simulated as ragged. """ import awkward as ak if shape[0] is None: msg = "The first dimension must be fixed-length." raise ValueError(msg) rng = random.Random(123) shape = np.array(shape) if np.any(shape == 0): # use empty numpy array for fixed dimensions, then add empty singletons for ragged dimensions var_dims = [i for i, s in enumerate(shape) if s is None] shape = [s for s in shape if s is not None] arr = ak.Array(np.empty(shape, dtype=dtype)) for d in var_dims: arr = ak.singletons(arr, axis=d - 1) return arr else: lil = _gen_awkward_inner(shape, rng, dtype) arr = ak.values_astype(AwkArray(lil), dtype) # make fixed-length dimensions regular for i, d in enumerate(shape): if d is not None: arr = ak.to_regular(arr, i) return arr def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: s = 0 df = pd.DataFrame() new_vals = gen_typed_df(m) while s < (n / new_vals.shape[1]): new_vals = gen_typed_df(m, index=index) new_vals.columns = new_vals.columns + "_" + str(s) df[new_vals.columns] = new_vals s += 1 df = df.iloc[:m, :n].copy() if columns is not None: df.columns = columns return df def maybe_add_sparse_array( mapping: Mapping, types: Collection[type], format: Literal["csr", "csc"], random_state: np.random.Generator, shape: tuple[int, int], ): if sparse.csr_array in types or sparse.csr_matrix in types: mapping["sparse_array"] = sparse.csr_array( sparse.random(*shape, format=format, random_state=random_state) ) return mapping # TODO: Use hypothesis for this? def gen_adata( shape: tuple[int, int], X_type: Callable[[np.ndarray], object] = sparse.csr_matrix, *, X_dtype: np.dtype = np.float32, obs_dtypes: Collection[ np.dtype | pd.api.extensions.ExtensionDtype ] = DEFAULT_COL_TYPES, var_dtypes: Collection[ np.dtype | pd.api.extensions.ExtensionDtype ] = DEFAULT_COL_TYPES, obsm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,), varm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,), layers_types: Collection[type] = DEFAULT_KEY_TYPES, random_state: np.random.Generator | None = None, sparse_fmt: Literal["csr", "csc"] = "csr", ) -> AnnData: """\ Helper function to generate a random AnnData for testing purposes. Note: For `obsm_types`, `varm_types`, and `layers_types` these currently just filter already created objects. In future, these should choose which objects are created. Params ------ shape What shape you want the anndata to be. X_type What kind of container should `X` be? This will be called on a randomly generated 2d array. X_dtype What should the dtype of the `.X` container be? 
obsm_types What kinds of containers should be in `.obsm`? varm_types What kinds of containers should be in `.varm`? layers_types What kinds of containers should be in `.layers`? sparse_fmt What sparse format should be used for sparse matrices? (csr, csc) """ import dask.array as da if random_state is None: random_state = np.random.default_rng() M, N = shape obs_names = pd.Index(f"cell{i}" for i in range(shape[0])) var_names = pd.Index(f"gene{i}" for i in range(shape[1])) obs = gen_typed_df(M, obs_names, dtypes=obs_dtypes) var = gen_typed_df(N, var_names, dtypes=var_dtypes) # For #147 obs.rename(columns=dict(cat="obs_cat"), inplace=True) var.rename(columns=dict(cat="var_cat"), inplace=True) if X_type is None: X = None else: X = X_type(random_state.binomial(100, 0.005, (M, N)).astype(X_dtype)) obsm = dict( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format=sparse_fmt, random_state=random_state), df=gen_typed_df(M, obs_names, dtypes=obs_dtypes), awk_2d_ragged=gen_awkward((M, None)), da=da.random.random((M, 50)), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} obsm = maybe_add_sparse_array( mapping=obsm, types=obsm_types, format=sparse_fmt, random_state=random_state, shape=(M, 100), ) varm = dict( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format=sparse_fmt, random_state=random_state), df=gen_typed_df(N, var_names, dtypes=var_dtypes), awk_2d_ragged=gen_awkward((N, None)), da=da.random.random((N, 50)), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} varm = maybe_add_sparse_array( mapping=varm, types=varm_types, format=sparse_fmt, random_state=random_state, shape=(N, 100), ) layers = dict( array=np.random.random((M, N)), sparse=sparse.random(M, N, format=sparse_fmt, random_state=random_state), da=da.random.random((M, N)), ) layers = maybe_add_sparse_array( mapping=layers, types=layers_types, format=sparse_fmt, random_state=random_state, shape=(M, N), ) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict( array=np.random.random((M, M)), sparse=sparse.random(M, M, format=sparse_fmt, random_state=random_state), ) obsp["sparse_array"] = sparse.csr_array( sparse.random(M, M, format=sparse_fmt, random_state=random_state) ) varp = dict( array=np.random.random((N, N)), sparse=sparse.random(N, N, format=sparse_fmt, random_state=random_state), ) varp["sparse_array"] = sparse.csr_array( sparse.random(N, N, format=sparse_fmt, random_state=random_state) ) uns = dict( O_recarray=gen_vstr_recarray(N, 5), nested=dict( scalar_str="str", scalar_int=42, scalar_float=3.0, nested_further=dict(array=np.arange(5)), ), awkward_regular=gen_awkward((10, 5)), awkward_ragged=gen_awkward((12, None, None)), # U_recarray=gen_vstr_recarray(N, 5, "U4") ) # https://github.com/zarr-developers/zarr-python/issues/2134 # zarr v3 on-disk does not write structured dtypes if anndata.settings.zarr_write_format == 3: del uns["O_recarray"] with warnings.catch_warnings(): warnings.simplefilter("ignore", ExperimentalFeatureWarning) adata = AnnData( X=X, obs=obs, var=var, obsm=obsm, varm=varm, layers=layers, obsp=obsp, varp=varp, uns=uns, ) return adata def array_bool_subset(index, min_size=2): b = np.zeros(len(index), dtype=bool) selected = np.random.choice( range(len(index)), size=np.random.randint(min_size, len(index), ()), replace=False, ) b[selected] = True return b def list_bool_subset(index, min_size=2): return array_bool_subset(index, min_size=min_size).tolist() def matrix_bool_subset(index, min_size=2): with 
warnings.catch_warnings(): warnings.simplefilter("ignore", PendingDeprecationWarning) indexer = np.matrix( array_bool_subset(index, min_size=min_size).reshape(len(index), 1) ) return indexer def spmatrix_bool_subset(index, min_size=2): return sparse.csr_matrix( array_bool_subset(index, min_size=min_size).reshape(len(index), 1) ) def sparray_bool_subset(index, min_size=2): return sparse.csr_array( array_bool_subset(index, min_size=min_size).reshape(len(index), 1) ) def array_subset(index, min_size=2): if len(index) < min_size: msg = f"min_size (={min_size}) must be smaller than len(index) (={len(index)}" raise ValueError(msg) return np.random.choice( index, size=np.random.randint(min_size, len(index), ()), replace=False ) def array_int_subset(index, min_size=2): if len(index) < min_size: msg = f"min_size (={min_size}) must be smaller than len(index) (={len(index)}" raise ValueError(msg) return np.random.choice( np.arange(len(index)), size=np.random.randint(min_size, len(index), ()), replace=False, ) def list_int_subset(index, min_size=2): return array_int_subset(index, min_size=min_size).tolist() def slice_subset(index, min_size=2): while True: points = np.random.choice(np.arange(len(index) + 1), size=2, replace=False) s = slice(*sorted(points)) if len(range(*s.indices(len(index)))) >= min_size: break return s def single_subset(index): return index[np.random.randint(0, len(index))] @pytest.fixture( params=[ array_subset, slice_subset, single_subset, array_int_subset, list_int_subset, array_bool_subset, list_bool_subset, matrix_bool_subset, spmatrix_bool_subset, sparray_bool_subset, ] ) def subset_func(request): return request.param ################### # Checking equality ################### def format_msg(elem_name: str | None) -> str: if elem_name is not None: return f"Error raised from element {elem_name!r}." 
else: return "" # TODO: it would be better to modify the other exception def report_name(func): """Report name of element being tested if test fails.""" @wraps(func) def func_wrapper(*args, _elem_name: str | None = None, **kwargs): try: return func(*args, **kwargs) except Exception as e: if _elem_name is not None and not hasattr(e, "_name_attached"): msg = format_msg(_elem_name) args = list(e.args) if len(args) == 0: args = [msg] else: args[0] = f"{args[0]}\n\n{msg}" e.args = tuple(args) e._name_attached = True raise e return func_wrapper @report_name def _assert_equal(a, b): """Allows reporting elem name for simple assertion.""" assert a == b @singledispatch def assert_equal( a: object, b: object, *, exact: bool = False, elem_name: str | None = None ): _assert_equal(a, b, _elem_name=elem_name) @assert_equal.register(CupyArray) def assert_equal_cupy( a: CupyArray, b: object, *, exact: bool = False, elem_name: str | None = None ): assert_equal(b, a.get(), exact=exact, elem_name=elem_name) @assert_equal.register(np.ndarray) def assert_equal_ndarray( a: np.ndarray, b: object, *, exact: bool = False, elem_name: str | None = None ): b = asarray(b) if not exact and is_numeric_dtype(a) and is_numeric_dtype(b): assert a.shape == b.shape, format_msg(elem_name) np.testing.assert_allclose(a, b, equal_nan=True, err_msg=format_msg(elem_name)) elif ( # Structured dtype not exact and hasattr(a, "dtype") and hasattr(b, "dtype") and len(a.dtype) > 1 and len(b.dtype) > 0 ): # Reshaping to allow >2d arrays assert a.shape == b.shape, format_msg(elem_name) assert_equal( pd.DataFrame(a.reshape(-1)), pd.DataFrame(b.reshape(-1)), exact=exact, elem_name=elem_name, ) else: assert np.all(a == b), format_msg(elem_name) @assert_equal.register(ArrayView) def assert_equal_arrayview( a: ArrayView, b: object, *, exact: bool = False, elem_name: str | None = None ): assert_equal(asarray(a), asarray(b), exact=exact, elem_name=elem_name) @assert_equal.register(BaseCompressedSparseDataset) @assert_equal.register(sparse.spmatrix) def assert_equal_sparse( a: BaseCompressedSparseDataset | sparse.spmatrix, b: object, *, exact: bool = False, elem_name: str | None = None, ): a = asarray(a) assert_equal(b, a, exact=exact, elem_name=elem_name) @assert_equal.register(CSArray) def assert_equal_sparse_array( a: CSArray, b: object, *, exact: bool = False, elem_name: str | None = None ): return assert_equal_sparse(a, b, exact=exact, elem_name=elem_name) @assert_equal.register(CupySparseMatrix) def assert_equal_cupy_sparse( a: CupySparseMatrix, b: object, *, exact: bool = False, elem_name: str | None = None ): a = a.toarray() assert_equal(b, a, exact=exact, elem_name=elem_name) @assert_equal.register(h5py.Dataset) @assert_equal.register(ZarrArray) def assert_equal_h5py_dataset( a: ArrayStorageType, b: object, *, exact: bool = False, elem_name: str | None = None ): a = asarray(a) assert_equal(b, a, exact=exact, elem_name=elem_name) @assert_equal.register(DaskArray) def assert_equal_dask_array( a: DaskArray, b: object, *, exact: bool = False, elem_name: str | None = None ): assert_equal(b, a.compute(), exact=exact, elem_name=elem_name) @assert_equal.register(pd.DataFrame) def are_equal_dataframe( a: pd.DataFrame, b: object, *, exact: bool = False, elem_name: str | None = None ): if not isinstance(b, pd.DataFrame): assert_equal(b, a, exact=exact, elem_name=elem_name) # , a.values maybe? 
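    # Compare with pandas' own frame comparison; unless `exact` is set,
    # dtype, column-type, and index-type checks are relaxed.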
report_name(pd.testing.assert_frame_equal)( a, b, check_exact=exact, check_column_type=exact, check_index_type=exact, _elem_name=elem_name, check_frame_type=False, ) @assert_equal.register(AwkArray) def assert_equal_awkarray( a: AwkArray, b: object, *, exact: bool = False, elem_name: str | None = None ): import awkward as ak if exact: assert isinstance(b, AwkArray) assert a.type == b.type, f"{a.type} != {b.type}, {format_msg(elem_name)}" assert ak.to_list(a) == ak.to_list(b), format_msg(elem_name) @assert_equal.register(Mapping) def assert_equal_mapping( a: Mapping, b: object, *, exact: bool = False, elem_name: str | None = None ): assert isinstance(b, Mapping) assert set(a.keys()) == set(b.keys()), format_msg(elem_name) for k in a.keys(): if elem_name is None: elem_name = "" assert_equal(a[k], b[k], exact=exact, elem_name=f"{elem_name}/{k}") @assert_equal.register(AlignedMappingBase) def assert_equal_aligned_mapping( a: AlignedMappingBase, b: object, *, exact: bool = False, elem_name: str | None = None, ): assert isinstance(b, AlignedMappingBase) a_indices = (a.parent.obs_names, a.parent.var_names) b_indices = (b.parent.obs_names, b.parent.var_names) for axis_idx in a.axes: assert_equal( a_indices[axis_idx], b_indices[axis_idx], exact=exact, elem_name=axis_idx ) assert a.attrname == b.attrname, format_msg(elem_name) assert_equal_mapping(a, b, exact=exact, elem_name=elem_name) @assert_equal.register(pd.Index) def assert_equal_index( a: pd.Index, b: object, *, exact: bool = False, elem_name: str | None = None ): params = dict(check_categorical=False) if not exact else {} report_name(pd.testing.assert_index_equal)( a, b, check_names=False, **params, _elem_name=elem_name ) @assert_equal.register(pd.api.extensions.ExtensionArray) def assert_equal_extension_array( a: pd.api.extensions.ExtensionArray, b: object, *, exact: bool = False, elem_name: str | None = None, ): report_name(pd.testing.assert_extension_array_equal)( a, b, check_dtype=exact, check_exact=exact, _elem_name=elem_name, ) @assert_equal.register(Raw) def assert_equal_raw( a: Raw, b: object, *, exact: bool = False, elem_name: str | None = None ): def assert_is_not_none(x): # can't put an assert in a lambda assert x is not None report_name(assert_is_not_none)(b, _elem_name=elem_name) for attr in ["X", "var", "varm", "obs_names"]: assert_equal( getattr(a, attr), getattr(b, attr), exact=exact, elem_name=f"{elem_name}/{attr}", ) @assert_equal.register(AnnData) def assert_adata_equal( a: AnnData, b: object, *, exact: bool = False, elem_name: str | None = None ): """\ Check whether two AnnData objects are equivalent, raising an AssertionError if they aren’t. Params ------ a b exact Whether comparisons should be exact or not. This has a somewhat flexible meaning and should probably get refined in the future. 
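    Example (illustrative; `gen_adata` is defined in this module, and this function
    is usually reached through the `assert_equal` dispatcher):

    >>> a = gen_adata((10, 5))  # doctest: +SKIP
    >>> assert_equal(a, a.copy())  # doctest: +SKIP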
""" def fmt_name(x): if elem_name is None: return x else: return f"{elem_name}/{x}" assert isinstance(b, AnnData) # There may be issues comparing views, since np.allclose # can modify ArrayViews if they contain `nan`s assert_equal(a.obs_names, b.obs_names, exact=exact, elem_name=fmt_name("obs_names")) assert_equal(a.var_names, b.var_names, exact=exact, elem_name=fmt_name("var_names")) if not exact: # Reorder all elements if necessary idx = [slice(None), slice(None)] # Since it’s a pain to compare a list of pandas objects change_flag = False if not np.all(a.obs_names == b.obs_names): idx[0] = a.obs_names change_flag = True if not np.all(a.var_names == b.var_names): idx[1] = a.var_names change_flag = True if change_flag: b = b[tuple(idx)].copy() for attr in [ "X", "obs", "var", "obsm", "varm", "layers", "uns", "obsp", "varp", "raw", ]: assert_equal( getattr(a, attr), getattr(b, attr), exact=exact, elem_name=fmt_name(attr), ) def _half_chunk_size(a: tuple[int, ...]) -> tuple[int, ...]: def half_rounded_up(x): div, mod = divmod(x, 2) return div + (mod > 0) return tuple(half_rounded_up(x) for x in a) @singledispatch def as_dense_dask_array(a): import dask.array as da a = asarray(a) return da.asarray(a, chunks=_half_chunk_size(a.shape)) @as_dense_dask_array.register(CSMatrix) def _(a): return as_dense_dask_array(a.toarray()) @as_dense_dask_array.register(DaskArray) def _(a): return a.map_blocks(asarray, dtype=a.dtype, meta=np.ndarray) @singledispatch def as_sparse_dask_array(a) -> DaskArray: import dask.array as da return da.from_array(sparse.csr_matrix(a), chunks=_half_chunk_size(a.shape)) @as_sparse_dask_array.register(CSMatrix) def _(a): import dask.array as da return da.from_array(a, _half_chunk_size(a.shape)) @as_sparse_dask_array.register(CSArray) def _(a): import dask.array as da return da.from_array(sparse.csr_matrix(a), _half_chunk_size(a.shape)) @as_sparse_dask_array.register(DaskArray) def _(a): return a.map_blocks(sparse.csr_matrix) @singledispatch def as_dense_cupy_dask_array(a): import cupy as cp return as_dense_dask_array(a).map_blocks( cp.array, meta=cp.array((1.0), dtype=a.dtype), dtype=a.dtype ) @as_dense_cupy_dask_array.register(CupyArray) def _(a): import cupy as cp import dask.array as da return da.from_array( a, chunks=_half_chunk_size(a.shape), meta=cp.array((1.0), dtype=a.dtype), ) @as_dense_cupy_dask_array.register(DaskArray) def _(a): import cupy as cp if isinstance(a._meta, cp.ndarray): return a.copy() return a.map_blocks( partial(as_cupy, typ=CupyArray), dtype=a.dtype, meta=cp.array((1.0), dtype=a.dtype), ) try: import cupyx.scipy.sparse as cpsparse format_to_memory_class = {"csr": cpsparse.csr_matrix, "csc": cpsparse.csc_matrix} except ImportError: format_to_memory_class = {} # TODO: If there are chunks which divide along columns, then a coo_matrix is returned by compute # We should try and fix this upstream in dask/ cupy @singledispatch def as_cupy_sparse_dask_array(a, format="csr"): memory_class = format_to_memory_class[format] cpu_da = as_sparse_dask_array(a) return cpu_da.rechunk((cpu_da.chunks[0], -1)).map_blocks( memory_class, dtype=a.dtype, meta=memory_class(cpu_da._meta) ) @as_cupy_sparse_dask_array.register(CupyArray) @as_cupy_sparse_dask_array.register(CupySparseMatrix) def _(a, format="csr"): import dask.array as da memory_class = format_to_memory_class[format] return da.from_array(memory_class(a), chunks=(_half_chunk_size(a.shape)[0], -1)) @as_cupy_sparse_dask_array.register(DaskArray) def _(a, format="csr"): memory_class = format_to_memory_class[format] 
if isinstance(a._meta, memory_class): return a.copy() return a.rechunk((a.chunks[0], -1)).map_blocks( partial(as_cupy, typ=memory_class), dtype=a.dtype ) @contextmanager def pytest_8_raises(exc_cls, *, match: str | re.Pattern = None): """Error handling using pytest 8's support for __notes__. See: https://github.com/pytest-dev/pytest/pull/11227 Remove once pytest 8 is out! """ with pytest.raises(exc_cls) as exc_info: yield exc_info check_error_or_notes_match(exc_info, match) def check_error_or_notes_match(e: pytest.ExceptionInfo, pattern: str | re.Pattern): """ Checks whether the printed error message or the notes contains the given pattern. DOES NOT WORK IN IPYTHON - because of the way IPython handles exceptions """ import traceback message = "".join(traceback.format_exception_only(e.type, e.value)) assert re.search(pattern, message), ( f"Could not find pattern: '{pattern}' in error:\n\n{message}\n" ) def resolve_cupy_type(val): if not isinstance(val, type): input_typ = type(val) else: input_typ = val if issubclass(input_typ, np.ndarray): typ = CupyArray elif issubclass(input_typ, sparse.csr_matrix): typ = CupyCSRMatrix elif issubclass(input_typ, sparse.csc_matrix): typ = CupyCSCMatrix else: msg = f"No default target type for input type {input_typ}" raise NotImplementedError(msg) return typ @singledispatch def as_cupy(val, typ=None): """ Rough conversion function Will try to infer target type from input type if not specified. """ if typ is None: typ = resolve_cupy_type(val) if issubclass(typ, CupyArray): import cupy as cp if isinstance(val, CSMatrix): val = val.toarray() return cp.array(val) elif issubclass(typ, CupyCSRMatrix): import cupy as cp import cupyx.scipy.sparse as cpsparse if isinstance(val, np.ndarray): return cpsparse.csr_matrix(cp.array(val)) else: return cpsparse.csr_matrix(val) elif issubclass(typ, CupyCSCMatrix): import cupy as cp import cupyx.scipy.sparse as cpsparse if isinstance(val, np.ndarray): return cpsparse.csc_matrix(cp.array(val)) else: return cpsparse.csc_matrix(val) else: msg = f"Conversion from {type(val)} to {typ} not implemented" raise NotImplementedError(msg) # TODO: test @as_cupy.register(DaskArray) def as_cupy_dask(a, typ=None): if typ is None: typ = resolve_cupy_type(a._meta) return a.map_blocks(partial(as_cupy, typ=typ), dtype=a.dtype) @singledispatch def shares_memory(x, y) -> bool: return np.shares_memory(x, y) @shares_memory.register(CSMatrix) def shares_memory_sparse(x, y): return ( np.shares_memory(x.data, y.data) and np.shares_memory(x.indices, y.indices) and np.shares_memory(x.indptr, y.indptr) ) BASE_MATRIX_PARAMS = [ pytest.param(asarray, id="np_array"), pytest.param(sparse.csr_matrix, id="scipy_csr_matrix"), pytest.param(sparse.csc_matrix, id="scipy_csc_matrix"), pytest.param(sparse.csr_array, id="scipy_csr_array"), pytest.param(sparse.csc_array, id="scipy_csc_array"), ] DASK_MATRIX_PARAMS = [ pytest.param(as_dense_dask_array, id="dense_dask_array"), pytest.param(as_sparse_dask_array, id="sparse_dask_array"), ] CUPY_MATRIX_PARAMS = [ pytest.param( partial(as_cupy, typ=CupyArray), id="cupy_array", marks=pytest.mark.gpu ), pytest.param( partial(as_cupy, typ=CupyCSRMatrix), id="cupy_csr", marks=pytest.mark.gpu, ), pytest.param( partial(as_cupy, typ=CupyCSCMatrix), id="cupy_csc", marks=pytest.mark.gpu, ), ] DASK_CUPY_MATRIX_PARAMS = [ pytest.param( as_dense_cupy_dask_array, id="cupy_dense_dask_array", marks=pytest.mark.gpu, ), pytest.param( as_cupy_sparse_dask_array, id="cupy_csr_dask_array", marks=pytest.mark.gpu ), ] if is_zarr_v2(): from 
zarr.storage import DirectoryStore as LocalStore else: from zarr.storage import LocalStore class AccessTrackingStoreBase(LocalStore): _access_count: Counter[str] _accessed: defaultdict[str, set] _accessed_keys: defaultdict[str, list[str]] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._access_count = Counter() self._accessed = defaultdict(set) self._accessed_keys = defaultdict(list) def _check_and_track_key(self, key: str): for tracked in self._access_count: if tracked in key: self._access_count[tracked] += 1 self._accessed[tracked].add(key) self._accessed_keys[tracked] += [key] def get_access_count(self, key: str) -> int: # access defaultdict when value is not there causes key to be there, # which causes it to be tracked if key not in self._access_count: msg = f"{key} not found among access count" raise KeyError(msg) return self._access_count[key] def get_subkeys_accessed(self, key: str) -> set[str]: if key not in self._accessed: msg = f"{key} not found among accessed" raise KeyError(msg) return self._accessed[key] def get_accessed_keys(self, key: str) -> list[str]: if key not in self._accessed_keys: msg = f"{key} not found among accessed keys" raise KeyError(msg) return self._accessed_keys[key] def initialize_key_trackers(self, keys_to_track: Iterable[str]) -> None: for k in keys_to_track: self._access_count[k] = 0 self._accessed_keys[k] = [] self._accessed[k] = set() def reset_key_trackers(self) -> None: self.initialize_key_trackers(self._access_count.keys()) def assert_access_count(self, key: str, count: int): keys_accessed = self.get_subkeys_accessed(key) access_count = self.get_access_count(key) assert self.get_access_count(key) == count, ( f"Found {access_count} accesses at {keys_accessed}" ) if is_zarr_v2(): class AccessTrackingStore(AccessTrackingStoreBase): def __getitem__(self, key: str) -> bytes: self._check_and_track_key(key) return super().__getitem__(key) else: class AccessTrackingStore(AccessTrackingStoreBase): async def get( self, key: str, prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None, ) -> object: self._check_and_track_key(key) return await super().get(key, prototype=prototype, byte_range=byte_range) if is_zarr_v2(): class AccessTrackingStore(AccessTrackingStoreBase): def __getitem__(self, key: str) -> bytes: self._check_and_track_key(key) return super().__getitem__(key) else: class AccessTrackingStore(AccessTrackingStoreBase): async def get( self, key: str, prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None, ) -> object: self._check_and_track_key(key) return await super().get(key, prototype=prototype, byte_range=byte_range) def get_multiindex_columns_df(shape: tuple[int, int]) -> pd.DataFrame: return pd.DataFrame( np.random.rand(shape[0], shape[1]), columns=pd.MultiIndex.from_tuples( list(itertools.product(["a"], range(shape[1] - (shape[1] // 2)))) + list(itertools.product(["b"], range(shape[1] // 2))) ), ) python-anndata-0.12.0~rc1/src/anndata/types.py000066400000000000000000000012721500370632200212360ustar00rootroot00000000000000from __future__ import annotations from typing import TYPE_CHECKING, Protocol, runtime_checkable if TYPE_CHECKING: from ._core.anndata import AnnData @runtime_checkable class ExtensionNamespace(Protocol): """Protocol for extension namespaces. Enforces that the namespace initializer accepts a class with the proper `__init__` method. Protocol's can't enforce that the `__init__` accepts the correct types. See `_check_namespace_signature` for that. 
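    A minimal conforming namespace might look like the following (illustrative
    sketch, not part of the shipped API)::

        class MyNamespace:
            def __init__(self, adata: AnnData) -> None:
                self._adata = adata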
This is mainly useful for static type checking with mypy and IDEs. """ def __init__(self, adata: AnnData) -> None: """ Used to enforce the correct signature for extension namespaces. """ python-anndata-0.12.0~rc1/src/anndata/typing.py000066400000000000000000000027661500370632200214150ustar00rootroot00000000000000from __future__ import annotations from typing import TYPE_CHECKING import numpy as np import pandas as pd from numpy import ma from . import abc from ._core.anndata import AnnData from .compat import ( AwkArray, CSArray, CSMatrix, CupyArray, CupySparseMatrix, DaskArray, H5Array, ZappyArray, ZarrArray, ) from .compat import Index as _Index if TYPE_CHECKING: from typing import TypeAlias __all__ = ["Index", "RWAble", "AxisStorable"] Index = _Index """1D or 2D index an :class:`~anndata.AnnData` object can be sliced with.""" XDataType: TypeAlias = ( np.ndarray | ma.MaskedArray | CSMatrix | CSArray | H5Array | ZarrArray | ZappyArray | abc.CSRDataset | abc.CSCDataset | DaskArray | CupyArray | CupySparseMatrix ) ArrayDataStructureTypes: TypeAlias = XDataType | AwkArray InMemoryArrayOrScalarType: TypeAlias = ( pd.DataFrame | np.number | str | ArrayDataStructureTypes ) AxisStorable: TypeAlias = ( InMemoryArrayOrScalarType | dict[str, "AxisStorable"] | list["AxisStorable"] ) """A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" RWAble: TypeAlias = ( AxisStorable | AnnData | pd.Categorical | pd.api.extensions.ExtensionArray ) """A superset of :type:`anndata.typing.AxisStorable` (i.e., including :class:`anndata.AnnData`) which is everything can be read/written by :func:`anndata.io.read_elem` and :func:`anndata.io.write_elem`.""" python-anndata-0.12.0~rc1/src/anndata/utils.py000066400000000000000000000341411500370632200212330ustar00rootroot00000000000000from __future__ import annotations import re import warnings from functools import singledispatch, wraps from typing import TYPE_CHECKING import h5py import numpy as np import pandas as pd from scipy import sparse import anndata from ._core.sparse_dataset import BaseCompressedSparseDataset from .compat import CSArray, CupyArray, CupySparseMatrix, DaskArray from .logging import get_logger if TYPE_CHECKING: from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal logger = get_logger(__name__) def import_name(name: str) -> Any: from importlib import import_module parts = name.split(".") obj = import_module(parts[0]) for i, name in enumerate(parts[1:]): try: obj = import_module(f"{obj.__name__}.{name}") except ModuleNotFoundError: break for name in parts[i + 1 :]: try: obj = getattr(obj, name) except AttributeError: msg = f"{parts[:i]}, {parts[i + 1 :]}, {obj} {name}" raise RuntimeError(msg) return obj @singledispatch def asarray(x): """Convert x to a numpy array""" return np.asarray(x) @asarray.register(CSArray) @asarray.register(sparse.spmatrix) def asarray_sparse(x): return x.toarray() @asarray.register(BaseCompressedSparseDataset) def asarray_sparse_dataset(x): return asarray(x.to_memory()) @asarray.register(h5py.Dataset) def asarray_h5py_dataset(x): return x[...] 
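# Illustrative sketch (not part of the original module): a tiny, never-called
# demo of how the single-dispatch ``asarray`` helper above normalises different
# containers to a plain numpy array. The name ``_asarray_demo`` is made up for
# this example.
def _asarray_demo() -> None:
    dense = asarray(np.arange(6).reshape(2, 3))  # ndarray: returned as-is
    densified = asarray(sparse.eye(3, format="csr"))  # sparse: densified via toarray()
    assert isinstance(dense, np.ndarray)
    assert isinstance(densified, np.ndarray)
    assert densified.shape == (3, 3)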
@asarray.register(CupyArray) def asarray_cupy(x): return x.get() @asarray.register(CupySparseMatrix) def asarray_cupy_sparse(x): return x.toarray().get() @asarray.register(DaskArray) def asarray_dask(x): return asarray(x.compute()) @singledispatch def convert_to_dict(obj) -> dict: return dict(obj) @convert_to_dict.register(dict) def convert_to_dict_dict(obj: dict): return obj @convert_to_dict.register(np.ndarray) def convert_to_dict_ndarray(obj: np.ndarray): if obj.dtype.fields is None: msg = ( "Can only convert np.ndarray with compound dtypes to dict, " f"passed array had “{obj.dtype}”." ) raise TypeError(msg) return {k: obj[k] for k in obj.dtype.fields.keys()} @convert_to_dict.register(type(None)) def convert_to_dict_nonetype(obj: None): return dict() @singledispatch def axis_len(x, axis: Literal[0, 1]) -> int | None: """\ Return the size of an array in dimension `axis`. Returns None if `x` is an awkward array with variable length in the requested dimension. """ return x.shape[axis] try: from .compat import awkward as ak def _size_at_depth(layout, depth, lateral_context, **kwargs): """Callback function for dim_len_awkward, resolving the dim_len for a given level""" if layout.is_numpy: # if it's an embedded rectilinear array, we have to deal with its shape # which might not be 1-dimensional if layout.is_unknown: shape = (0,) else: shape = layout.shape numpy_axis = lateral_context["axis"] - depth + 1 if not (1 <= numpy_axis < len(shape)): msg = f"axis={lateral_context['axis']} is too deep" raise TypeError(msg) lateral_context["out"] = shape[numpy_axis] return ak.contents.EmptyArray() elif layout.is_list and depth == lateral_context["axis"]: if layout.parameter("__array__") in {"string", "bytestring"}: # Strings are implemented like an array of lists of uint8 (ListType(NumpyType(...))) # which results in an extra hierarchy-level that shouldn't show up in dim_len # See https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3736747 msg = f"axis={lateral_context['axis']} is too deep" raise TypeError(msg) if layout.is_regular: # if it's a regular list, you want the size lateral_context["out"] = layout.size else: # if it's an irregular list, you want a null token lateral_context["out"] = -1 return ak.contents.EmptyArray() elif layout.is_record and depth == lateral_context["axis"]: lateral_context["out"] = len(layout.fields) return ak.contents.EmptyArray() elif layout.is_record: # currently, we don't recurse into records # in theory we could, just not sure how to do it at the moment # Would need to consider cases like: scalars, unevenly sized values msg = f"Cannot recurse into record type found at axis={lateral_context['axis']}" raise TypeError(msg) elif layout.is_union: # if it's a union, you could get the result of each union branch # separately and see if they're all the same; if not, it's an error result = None for content in layout.contents: context = {"axis": lateral_context["axis"]} ak.transform( _size_at_depth, content, lateral_context=context, ) if result is None: result = context["out"] elif result != context["out"]: # Union branches have different lengths -> return null token lateral_context["out"] = -1 return ak.contents.EmptyArray() lateral_context["out"] = result return ak.contents.EmptyArray() @axis_len.register(ak.Array) def axis_len_awkward(array, axis: Literal[0, 1]) -> int | None: """Get the length of an awkward array in a given axis Returns None if the axis is of variable length. 
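        Illustrative example (assumes ``import awkward as ak``)::

            axis_len(ak.Array([[1, 2], [3]]), 0)  # -> 2
            axis_len(ak.Array([[1, 2], [3]]), 1)  # -> None (ragged axis)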
Code adapted from @jpivarski's solution in https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3521574 """ if axis < 0: # negative axis is another can of worms... maybe later msg = "Does not support negative axis" raise NotImplementedError(msg) elif axis == 0: return len(array) else: # communicate with the recursive function using a context (lateral) context = {"axis": axis} # "transform" but we don't care what kind of array it returns ak.transform( _size_at_depth, array, lateral_context=context, ) # Use `None` as null token. return None if context["out"] == -1 else context["out"] @asarray.register(ak.Array) def asarray_awkward(x): return x except ImportError: pass def make_index_unique(index: pd.Index, join: str = "-"): """ Makes the index unique by appending a number string to each duplicate index element: '1', '2', etc. If a tentative name created by the algorithm already exists in the index, it tries the next integer in the sequence. The first occurrence of a non-unique value is ignored. Parameters ---------- join The connecting string between name and integer. Examples -------- >>> from anndata import AnnData >>> adata = AnnData(np.ones((2, 3)), var=pd.DataFrame(index=["a", "a", "b"])) >>> adata.var_names Index(['a', 'a', 'b'], dtype='object') >>> adata.var_names_make_unique() >>> adata.var_names Index(['a', 'a-1', 'b'], dtype='object') """ if index.is_unique: return index from collections import Counter values = index.values.copy() indices_dup = index.duplicated(keep="first") values_dup = values[indices_dup] values_set = set(values) counter = Counter() issue_interpretation_warning = False example_colliding_values = [] for i, v in enumerate(values_dup): while True: counter[v] += 1 tentative_new_name = v + join + str(counter[v]) if tentative_new_name not in values_set: values_set.add(tentative_new_name) values_dup[i] = tentative_new_name break issue_interpretation_warning = True if len(example_colliding_values) < 5: example_colliding_values.append(tentative_new_name) if issue_interpretation_warning: warnings.warn( f"Suffix used ({join}[0-9]+) to deduplicate index values may make index " + "values difficult to interpret. There values with a similar suffixes in " + "the index. Consider using a different delimiter by passing " + "`join={delimiter}`" + "Example key collisions generated by the make_index_unique algorithm: " + str(example_colliding_values) ) values[indices_dup] = values_dup index = pd.Index(values, name=index.name) return index def join_english(words: Iterable[str], conjunction: str = "or") -> str: words = list(words) # no need to be efficient if len(words) == 0: return "" if len(words) == 1: return words[0] if len(words) == 2: return f"{words[0]} {conjunction} {words[1]}" return ", ".join(words[:-1]) + f", {conjunction} {words[-1]}" def warn_names_duplicates(attr: str): names = "Observation" if attr == "obs" else "Variable" warnings.warn( f"{names} names are not unique. 
" f"To make them unique, call `.{attr}_names_make_unique`.", UserWarning, stacklevel=2, ) def ensure_df_homogeneous( df: pd.DataFrame, name: str ) -> np.ndarray | sparse.csr_matrix: # TODO: rename this function, I would not expect this to return a non-dataframe if all(isinstance(dt, pd.SparseDtype) for dt in df.dtypes): arr = df.sparse.to_coo().tocsr() else: arr = df.to_numpy() if df.dtypes.nunique() != 1: warnings.warn(f"{name} converted to numpy array with dtype {arr.dtype}") return arr def convert_dictionary_to_structured_array(source: Mapping[str, Sequence[Any]]): names = list(source.keys()) try: # transform to byte-strings cols = [ np.asarray(col) if np.array(col[0]).dtype.char not in {"U", "S"} else np.asarray(col).astype("U") for col in source.values() ] except UnicodeEncodeError: msg = ( "Currently only support ascii strings. " "Don’t use “ö” etc. for sample annotation." ) raise ValueError(msg) # if old_index_key not in source: # names.append(new_index_key) # cols.append(np.arange(len(cols[0]) if cols else n_row).astype("U")) # else: # names[names.index(old_index_key)] = new_index_key # cols[names.index(old_index_key)] = cols[names.index(old_index_key)].astype("U") dtype_list = list( zip(names, [str(c.dtype) for c in cols], [(c.shape[1],) for c in cols]) ) # might be unnecessary dtype = np.dtype(dtype_list) arr = np.zeros((len(cols[0]),), dtype) # here, we do not want to call BoundStructArray.__getitem__ # but np.ndarray.__getitem__, therefore we avoid the following line # arr = np.ndarray.__new__(cls, (len(cols[0]),), dtype) for i, name in enumerate(dtype.names): arr[name] = np.array(cols[i], dtype=dtype_list[i][1]) return arr def warn_once(msg: str, category: type[Warning], stacklevel: int = 1): warnings.warn(msg, category, stacklevel=stacklevel) # Prevent from showing up every time an awkward array is used # You'd think `'once'` works, but it doesn't at the repl and in notebooks warnings.filterwarnings("ignore", category=category, message=re.escape(msg)) def deprecated( new_name: str, category: type[Warning] = FutureWarning, add_msg: str = "", *, hide: bool = True, ): """\ This is a decorator which can be used to mark functions as deprecated with a FutureWarning. It will result in a warning being emitted when the function is used. """ def decorator(func): name = func.__qualname__ msg = ( f"Use {new_name} instead of {name}, " f"{name} is deprecated and will be removed in the future." ) if add_msg: msg += f" {add_msg}" @wraps(func) def new_func(*args, **kwargs): warnings.warn(msg, category=category, stacklevel=2) return func(*args, **kwargs) setattr(new_func, "__deprecated", (category, msg, hide)) return new_func return decorator class DeprecationMixinMeta(type): """\ Use this as superclass so deprecated methods and properties do not appear in vars(MyClass)/dir(MyClass) """ def __dir__(cls): def is_hidden(attr) -> bool: if isinstance(attr, property): attr = attr.fget _, _, hide = getattr(attr, "__deprecated", (None, None, False)) return hide return [ item for item in type.__dir__(cls) if not is_hidden(getattr(cls, item, None)) ] def raise_value_error_if_multiindex_columns(df: pd.DataFrame, attr: str): if isinstance(df.columns, pd.MultiIndex): msg = ( "MultiIndex columns are not supported in AnnData. " f"Please use a single-level index for {attr}." ) raise ValueError(msg) def module_get_attr_redirect( attr_name: str, deprecated_mapping: Mapping[str, str], old_module_path: str | None = None, ) -> Any: full_old_module_path = ( f"anndata{'.' 
+ old_module_path if old_module_path is not None else ''}" ) if new_path := deprecated_mapping.get(attr_name): msg = ( f"Importing {attr_name} from `{full_old_module_path}` is deprecated. " f"Import anndata.{new_path} instead." ) warnings.warn(msg, FutureWarning) # hacky import_object_by_name, but we test all these mod = anndata while "." in new_path: mod_name, new_path = new_path.split(".", 1) mod = getattr(mod, mod_name) return getattr(mod, new_path) msg = f"module {full_old_module_path} has no attribute {attr_name!r}" raise AttributeError(msg) python-anndata-0.12.0~rc1/src/testing/000077500000000000000000000000001500370632200175655ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/testing/anndata/000077500000000000000000000000001500370632200211735ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/testing/anndata/__init__.py000066400000000000000000000000001500370632200232720ustar00rootroot00000000000000python-anndata-0.12.0~rc1/src/testing/anndata/_doctest.py000066400000000000000000000005301500370632200233470ustar00rootroot00000000000000from __future__ import annotations from collections.abc import Callable from typing import TypeVar F = TypeVar("F", bound=Callable) def doctest_needs(mod: str) -> Callable[[F], F]: """Mark function with doctest dependency.""" def decorator(func: F) -> F: func._doctest_needs = mod return func return decorator python-anndata-0.12.0~rc1/src/testing/anndata/_pytest.py000066400000000000000000000077561500370632200232530ustar00rootroot00000000000000"""Private anndata pytest plugin. This file exists 1. to allow ignoring warnings without test collection failing on CI 2. as a pytest plugin/config that applies to doctests as well It lives outside of the anndata package in order to avoid importing anndata too early. """ from __future__ import annotations import re import warnings from importlib.util import find_spec from typing import TYPE_CHECKING, cast import pytest if TYPE_CHECKING: from collections.abc import Generator, Iterable from pathlib import Path @pytest.fixture(autouse=True) def _anndata_test_env(request: pytest.FixtureRequest) -> None: import anndata if isinstance(request.node, pytest.DoctestItem): request.getfixturevalue("_doctest_env") anndata.settings.reset(anndata.settings._registered_options.keys()) @pytest.fixture def _doctest_env( request: pytest.FixtureRequest, cache: pytest.Cache, tmp_path: Path ) -> Generator[None, None, None]: with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"Importing read_.* from `anndata` is deprecated" ) from scanpy import settings from contextlib import chdir from anndata.utils import import_name assert isinstance(request.node.parent, pytest.Module) # request.node.parent is either a DoctestModule or a DoctestTextFile. # Only DoctestModule has a .obj attribute (the imported module). 
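    # For module doctests, resolve the function under test so that its
    # ``__deprecated`` warning filter (set by ``anndata.utils.deprecated``) and
    # its ``_doctest_needs`` requirement (set by the ``doctest_needs`` decorator
    # in ``testing.anndata._doctest``) can be honoured below.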
if request.node.parent.obj: func = import_name(request.node.name) warning_detail: tuple[type[Warning], str, bool] | None if warning_detail := getattr(func, "__deprecated", None): cat, msg, _ = warning_detail warnings.filterwarnings("ignore", category=cat, message=re.escape(msg)) if (mod := getattr(func, "_doctest_needs", None)) is not None and not find_spec( mod ): request.applymarker(pytest.skip(reason=f"doctest needs {mod} to run")) old_dd, settings.datasetdir = settings.datasetdir, cache.mkdir("scanpy-data") with chdir(tmp_path): yield settings.datasetdir = old_dd def pytest_itemcollected(item: pytest.Item) -> None: """Define behavior of pytest.mark.gpu.""" is_gpu = len([mark for mark in item.iter_markers(name="gpu")]) > 0 if is_gpu: item.add_marker( pytest.mark.skipif(not find_spec("cupy"), reason="Cupy not installed.") ) def pytest_addoption(parser: pytest.Parser) -> None: """Hook to register custom CLI options and config values""" parser.addoption( "--strict-warnings", action="store_true", default=False, help="Turn warnings into errors that are not overridden by `filterwarnings` or `filterwarnings_when_strict`.", ) parser.addini( "filterwarnings_when_strict", "Filters to apply after `-Werror` when --strict-warnings is active", type="linelist", default=[], ) def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: Iterable[pytest.Item] ): if not config.getoption("--strict-warnings"): return warning_filters = [ "error", *_config_get_strlist(config, "filterwarnings"), *_config_get_strlist(config, "filterwarnings_when_strict"), ] warning_marks = [pytest.mark.filterwarnings(f) for f in warning_filters] # Add warning filters defined in the config to all tests items. # Test items might already have @pytest.mark.filterwarnings applied, # so we prepend ours to ensure that an item’s explicit filters override these. # Reversing then individually prepending ensures that the order is preserved. 
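    # E.g. with warning_marks == [w1, w2] and an item that already carries [m1]:
    # prepending reversed([w1, w2]) one mark at a time yields [w1, w2, m1], i.e.
    # the configured filters keep their relative order and the item's own mark
    # stays last.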
for item in items: for mark in reversed(warning_marks): item.add_marker(mark, append=False) def _config_get_strlist(config: pytest.Config, name: str) -> list[str]: if strs := config.getini(name): assert isinstance(strs, list) assert all(isinstance(item, str) for item in strs) return cast("list[str]", strs) return [] python-anndata-0.12.0~rc1/src/testing/anndata/py.typed000066400000000000000000000000001500370632200226600ustar00rootroot00000000000000python-anndata-0.12.0~rc1/tests/000077500000000000000000000000001500370632200164635ustar00rootroot00000000000000python-anndata-0.12.0~rc1/tests/conftest.py000066400000000000000000000123041500370632200206620ustar00rootroot00000000000000from __future__ import annotations from functools import partial from typing import TYPE_CHECKING import dask import joblib import pytest from dask.base import normalize_token, tokenize from packaging.version import Version if Version(dask.__version__) < Version("2024.8.0"): from dask.base import normalize_seq else: from dask.tokenize import normalize_seq from filelock import FileLock from scipy import sparse import anndata as ad from anndata.tests.helpers import subset_func # noqa: F401 if TYPE_CHECKING: from collections.abc import Generator from types import EllipsisType @pytest.fixture def backing_h5ad(tmp_path): return tmp_path / "test.h5ad" @pytest.fixture( params=[("h5ad", None), ("zarr", 2), ("zarr", 3)], ids=["h5ad", "zarr2", "zarr3"] ) def diskfmt(request): if (fmt := request.param[0]) == "h5ad": yield fmt else: with ad.settings.override(zarr_write_format=request.param[1]): yield fmt @pytest.fixture def diskfmt2(diskfmt): if diskfmt == "h5ad": with ad.settings.override(zarr_write_format=2): yield "zarr" else: yield "h5ad" @pytest.fixture( params=[ pytest.param((..., (slice(None), slice(None))), id="ellipsis"), pytest.param(((...,), (slice(None), slice(None))), id="ellipsis_tuple"), pytest.param( ((..., slice(0, 10)), (slice(None), slice(0, 10))), id="obs-ellipsis" ), pytest.param( ((slice(0, 10), ...), (slice(0, 10), slice(None))), id="var-ellipsis" ), pytest.param( ((slice(0, 10), slice(0, 10), ...), (slice(0, 10), slice(0, 10))), id="obs-var-ellipsis", ), pytest.param( ((..., slice(0, 10), slice(0, 10)), (slice(0, 10), slice(0, 10))), id="ellipsis-obs-var", ), pytest.param( ((slice(0, 10), ..., slice(0, 10)), (slice(0, 10), slice(0, 10))), id="obs-ellipsis-var", ), ] ) def ellipsis_index_with_equivalent( request, ) -> tuple[tuple[EllipsisType | slice, ...] | EllipsisType, tuple[slice, slice]]: return request.param @pytest.fixture def ellipsis_index( ellipsis_index_with_equivalent: tuple[ tuple[EllipsisType | slice, ...] | EllipsisType, tuple[slice, slice] ], ) -> tuple[EllipsisType | slice, ...] | EllipsisType: return ellipsis_index_with_equivalent[0] @pytest.fixture def equivalent_ellipsis_index( ellipsis_index_with_equivalent: tuple[ tuple[EllipsisType | slice, ...] 
| EllipsisType, tuple[slice, slice] ], ) -> tuple[slice, slice]: return ellipsis_index_with_equivalent[1] @pytest.fixture(scope="session") def local_cluster_addr( tmp_path_factory: pytest.TempPathFactory, worker_id: str ) -> Generator[str, None, None]: # Adapted from https://pytest-xdist.readthedocs.io/en/latest/how-to.html#making-session-scoped-fixtures-execute-only-once import dask.distributed as dd def make_cluster() -> dd.LocalCluster: return dd.LocalCluster(n_workers=1, threads_per_worker=1) if worker_id == "master": with make_cluster() as cluster: yield cluster.scheduler_address return # get the temp directory shared by all workers root_tmp_dir = tmp_path_factory.getbasetemp().parent fn = root_tmp_dir / "dask_scheduler_address.txt" lock = FileLock(str(fn) + ".lock") lock.acquire() # can’t use context manager, because we need to release the lock before yielding address = fn.read_text() if fn.is_file() else None if address: lock.release() yield address return with make_cluster() as cluster: fn.write_text(cluster.scheduler_address) lock.release() yield cluster.scheduler_address ##################### # Dask tokenization # ##################### # TODO: Should we be exporting this? # sparray classes don't have tokenize defined yet, see: https://github.com/dask/dask/issues/10375 def normalize_sparse_matrix(x, attrs): return ( type(x).__name__, normalize_seq(normalize_token(getattr(x, key)) for key in attrs), ) for cls, attrs in [ (sparse.dia_array, ("data", "offsets", "shape")), (sparse.bsr_array, ("data", "indices", "indptr", "blocksize", "shape")), (sparse.coo_array, ("data", "row", "col", "shape")), (sparse.csr_array, ("data", "indices", "indptr", "shape")), (sparse.csc_array, ("data", "indices", "indptr", "shape")), (sparse.lil_array, ("data", "rows", "shape")), ]: normalize_token.register(cls, partial(normalize_sparse_matrix, attrs=attrs)) @normalize_token.register(sparse.dok_array) def normalize_dok_matrix(x): return type(x).__name__, normalize_token(sorted(x.items())) @normalize_token.register(ad.AnnData) def tokenize_anndata(adata: ad.AnnData): res = [] if adata.X is not None: res.append(tokenize(adata.X)) res.extend([tokenize(adata.obs), tokenize(adata.var)]) for attr in ["obsm", "varm", "obsp", "varp", "layers"]: elem = getattr(adata, attr) res.append(tokenize(list(dict(elem).items()))) res.append(joblib.hash(adata.uns)) if adata.raw is not None: res.append(tokenize(adata.raw.to_adata())) return tuple(res) python-anndata-0.12.0~rc1/tests/data/000077500000000000000000000000001500370632200173745ustar00rootroot00000000000000python-anndata-0.12.0~rc1/tests/data/adata-comments.tsv000066400000000000000000000001451500370632200230270ustar00rootroot00000000000000# A regular comment # The next comment is actually colnames # c1 c2 r1 1.0 0.0 r2 3.0 0.0 r3 5.0 6.0 python-anndata-0.12.0~rc1/tests/data/adata.csv000066400000000000000000000000501500370632200211560ustar00rootroot00000000000000,c1,c2 r1,1.0,0.0 r2,3.0,0.0 r3,5.0,6.0 python-anndata-0.12.0~rc1/tests/data/archives/000077500000000000000000000000001500370632200212005ustar00rootroot00000000000000python-anndata-0.12.0~rc1/tests/data/archives/readme.md000066400000000000000000000004771500370632200227670ustar00rootroot00000000000000# archives This directory contains an archive of anndata files written by older versions of the library. It's for testing backwards compat. This should really live somewhere else, but it's here for now. ## Directories Directories with version numbers contain files written by the corresponding version of `anndata`. 
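For example (illustrative sketch, not an actual test in this directory), a backwards-compatibility check might read one of these archives with the current release:

```python
import anndata as ad

adata = ad.read_h5ad("tests/data/archives/v0.7.0/adata.h5ad")
```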
python-anndata-0.12.0~rc1/tests/data/archives/v0.7.0/
python-anndata-0.12.0~rc1/tests/data/archives/v0.7.0/adata.h5ad  [binary HDF5 archive written by anndata 0.7.0; raw bytes omitted]
python-anndata-0.12.0~rc1/tests/data/archives/v0.7.0/adata.zarr.zip  [binary zipped zarr store written by anndata 0.7.0; raw bytes omitted]
bT}^-iʜ O&1$7uO"fN)fG1pL PXsљˊUS$^K^ø4ʛuQ&5g{B7}wL¾K'Eg itti܃d{} ֕yix4r4k3f5[9J^djhKFT<ػ8AժdKtlU} 9J71 _3S0҇J9ⅅ\kE.=e}+͞z)hORN6a׏<$8nҪymCC-:`wBj:zײJ8R/8䱂ooY9X5k, K\Ish ,cbq =_ѹ3 K%4E{z;=He#{*t:x/11'6#6 Oqtnk7ۆ\r5boSxhfXg],v^lOPhU'OQ.MNn1k8 `O⎦z+}{w*\:n{^u yV\Tj򀚨ǮzKhnz*S>z`JX[~Q]xֶ-L64)Kb{9+/UZKNlvڭ^%ΧSiXXi'0Ad aèɖ/m:)N3$ __9gpJ==@]m!_p]J z\w+|jqZ]cŧ=,q葉Zc(kGfpǦdiLl]f'/nk7t ^ U1:c,m@#:v 2)k |S>#WJMT8i"ٵv#,Wn:|q!KӪpUnm4Z{cccaᗔӞ颃ַO NZKwvFpm_B*N.Wsr^vl& KGy VqxFg;_j`LjBmRa ԓ=r{~ETa!sH,asx,=lKб _Y\b]P$J}X8f_"`:eb8" O0Cb)0$C$l5  Nf7V54t=D#)2_M U)8-LRqx"A^n’ۛ(#Nc1p*э\\%Qء50ix|ՏX\n 'Q8(l G&*CMQ( `| B-X3ֈ& BT*HE)3Ke9$ @C:(V(lme@` $["Z!ְԱᦁ9l#/a8D,z}E$:_kYc6jWfh2a8$PpBIcD"gM](" ͮPӰ8a$L`?m0?])%"=NaP0,t2KPW(x|)m@$`R+X";)rDM%a(mHHMFϵ #Qx" E& )_Oc "PGã K0;I̟畆484kScӨ " #S l@%ǸPK vSw obsm/.zgroupUT aaux { "zarr_format": 2 }PK vS obsm/sparse/UT aaux PK vSobsm/sparse/indices/UT aaux PKvS7obsm/sparse/indices/.zarrayUT aaux u0 EwyfϨ*DDT86K=%GWKups*YlRY V*qjR%XϐpS3 QN5^kCvK=s3{ hXM}rQ=-ՊK}uyςҖ:1혭PKvS`,Pobsm/sparse/indices/0UT aaux -A @@љ:CC 7BNUa۞%*x Iy+~\zxNa' NpSXk5OeR >\f @ pPzuq*JmZD'ATjK-oPKvS\><obsm/sparse/indptr/0UT aaux cb4fa```b fb4HyXXEX %X K,!aPK vSobsm/sparse/data/UT aaux PKvS% '9obsm/sparse/data/.zarrayUT aaux u 0E~Z}A)3J4&(FfPo꦳Jg.s$u]pM瘇3i{R̆5,x דr8V.Q ,ù m!CڢV⻯-q9%dMR[3tnigW<"3 jՏ$Vd뮂*vGn(f^b8 3DQX7FJt[إޣkЃl+Z3uĿ^0J*d-E}PK vS layers/array/0.0UT aaux !X51 : @XPT jyrUФA0P7`Xel Ѩ%$к`nR߄0H4q&#֫FdJ$Np.gJZ¾u|rp``O.`p8Ǩn`˴H'YV2@HH `tj fVܓ4Y䆜ΐ6R4srjH:RzlhiQ;'C^t P dr^xHt.{,8jALQn3\bd :ZTj7`ꑲY^ЇP/@Lynȹtv î(|wTn~`H7NdXǯO.C@0(`T 4^Yiz~ns@.( P<5TiQUp\L  0&UewVh0v]樗6#(X62_aT>2II0DjFr/bHDBԂТzLFGJ&mFkjIZ`/6ADkorb6+!VɎb7WTvu"I݋K ;UPPCgd33spw530IߌV}o5,]9Y;wѝ >JQŠO31egrzwJ!k@q=/7[a5 _mCEFuRe0}j-YFd7 =LQF0"˦VG &=G2W]W< 9y)oc096)#k~:*ǘTo͊ۨbZZ җGvmIfYGΊXgJ.& w [H#(Hm_R!k/K+4K=cJRWXa4Iٞx\`uJ {^5 4Ӈ :AimbO uZ6;m<41())xeQ"hvhks8#pPj?L~3[) AL법˗=xr=*T[E\}b=maD_(o@IFh"$I/Jئ.ym"ՙ6@&T0#vl4{n.r'k+xjw NbX NM,y#ad|0xEO+5^'58LpŁMI<8S.3٢֍ LL+nhQ7WR/Db"JI"߄guޢ-Z[rsIY#VnQ*ڬl, $b{W&ɗ\(= ,@Es*G@nd@ >T}'+="V/`UkF39>ˌ,jz_ejnھGJWa( |z4XrnSMAoaN̺_w7~|A)3c c2|DI{)XqWq|<%v'IX@JI5#;T7NB1l]{7Ԙ?E,魶. woDHP c}? 
Ӵ1A 2&pu5ˣh7~rd?&ZjxW]5s̾hY`@C6f-^T֮4Ĝy뷬s?]Eav4S: 5{ZHPb}Rd>Zjm6-+\BMPurZf]oӖ` N k ;Rly(zOB_~Ҩ !-5=6^ 01ƍb# E6!ԚcQJ$9t@d!/fg\W!BT,-F{)Mhk/ bIα:}Q2 'zXJ,:N& !fx [m'o;j}M(]N' '>SQ4Tm51F@{7UF L͎&X1 c,x.˙)AB<=ҋbqEIQ 7V%صѾ^EN}Zr45pBl37\d!Rb1(CDSCI܀ U>*/{ǯ؋,\y3`A71ҎDL ȹBLgՉxwBWz]9e#8`^# Lǰ$ȣQLCl/]t䲧[' +W_X}G*Dd3qVi ؆X!i_s?Y1ICm3W艸F@фa 5;_(EtqF#0f~;AQ`+fy?O ke˜t}:Y;s[obN\E?êq}С=QcRJڿ;I,Wql܅|p<%QԊ% csA@dX涐貕˥߻v޲و׬үl}DZf췧䐵ԗծƓĬټ᳻Ҳئ곿㛚Ɣ ?AP?????PK vSwlayers/.zgroupUT aaux { "zarr_format": 2 }PK vSlayers/sparse/UT aaux PK vSlayers/sparse/indices/UT aaux PKvS5layers/sparse/indices/.zarrayUT aaux uA0E̚tay cH-mh($Bmڍj__-5K|=M@ْ/e3+ŏKTB)ED:@281,#\[=Ęc@jCZCܠZpw1_YUR'FoPKvS (layers/sparse/indices/0UT aaux cb4f```a fb6 bf(_PKvS(Kslayers/sparse/.zattrsUT aaux RԼ̼tݒʂT%+Ē %4%eEřy Uzz0``.F`f,W-PK vSwlayers/sparse/.zgroupUT aaux { "zarr_format": 2 }PK vSlayers/sparse/indptr/UT aaux PKvS o 7layers/sparse/indptr/.zarrayUT aaux u0 wyfRucTJC"" A6A*woBOݜ~@֮k.=a>Ua۞%*x Iy+~\zxNa' NpSXk5OeR >\f @ pPzuq*JmZD'ATjK-oPKvS4alayers/sparse/indptr/0UT aaux cb4fa``܀H̄fCPK vSlayers/sparse/data/UT aaux PKvSs7layers/sparse/data/.zarrayUT aaux uA 0EBf-Ŗ]Q1Aq42c*޽IH*y̟ v 5}sx=('lIOp=)lըv"t K]-j~ao)wbn}5b1 ]A/Y ׀w+Q bp!Y>PK vS\@@layers/sparse/data/0UT aaux 300@+k)?{$)?0Ҹ?.2m?x5?,_w?PK vSw.zgroupUT aaux { "zarr_format": 2 }PK vSvar/UT aaux PK vSvar/cat_ordered/UT aaux PKvS&Ѥ7var/cat_ordered/.zarrayUT aaux u0E|5 qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "vk"cRtXsy&4b2t!.҆^CLX!C ~vg1oY)cGx-7PKvScYkr*..u"- km(PKvS]t#$ var/uint8/0UT aaux cb4fa``a *ؠ{??yِ+.PKvS Q} var/.zattrsUT aaux ]A @ E̺RJR& hR(-Ż;RĬC##};.E4Ls2 Iőrނء>IW8F~} Q퍁gNUqPK vSw var/.zgroupUT aaux { "zarr_format": 2 }PK vS var/float64/UT aaux PKvS1+\9var/float64/.zarrayUT aaux uA 0EBf-J tc"iLPwo꦳Jo>$u}pMEt)fCZ~ȖY9g&xSyX1Vk xK5sӗnI  |"FhRܿ\,JmӊdPK vS(ל var/float64/0UT aaux 3?:z?ɮ?ˍ?͛_U??%?;O b?Tx(8?G'w47?$?dd?9|?P.r8?@??;f`?'䕐?@N#?w?PK vSvar/__categories/UT aaux PK vSvar/__categories/cat_ordered/UT aaux PKvSq(Zk$var/__categories/cat_ordered/.zarrayUT aaux uM01Cjia?$ݶPΪuIj hmV5}g$SJHWPV5m"ZXİ'NgxS9hG)cU==\J?sr b945Xh?/v]z!+P{aBvD[)YPK vSI$var/__categories/cat_ordered/.zattrsUT aaux { "ordered": true }PKvS~8_var/__categories/cat_ordered/0UT aaux cb4fg``x bF v D"BAD$H "D䀈<Q"PK vSwvar/__categories/.zgroupUT aaux { "zarr_format": 2 }PK vSvar/__categories/var_cat/UT aaux PKvS)k var/__categories/var_cat/.zarrayUT aaux uM0$Cjia?$ݶPΪuIj hmV5}"g$SJHWPV586N -<ǰ'N'xS9hG)cU=8z/~~r$h"j&6Y0_`.BVo&"eɅ숶"YPK vS" var/__categories/var_cat/.zattrsUT aaux { "ordered": false }PKvS1Z7>xvar/__categories/var_cat/0UT aaux K @PM$ 7 +Yd{/ɓTQN4kѪMN]PK vS var/int64/UT aaux PKvS9 7var/int64/.zarrayUT aaux uA 0EBvaR1J4&$B$m6Ux&z7 .=4u|>dqFId[bO0b JEN| "2}SppOt5ǘ>') .\vHVb59 v2`.<-rVZSoPKvS?8=K var/int64/0UT aaux cb4X@,@_g—Ց~ N)G:g@y#!PK vS var/var_cat/UT aaux PKvS&Ѥ7var/var_cat/.zarrayUT aaux u0E|5 qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "vk"cRtXsy&4b2t!.҆^CLX!C ~vg1oY)cGx-7PKvSAXK",var/var_cat/.zattrsUT aaux RĒb%+x_,UPKvSW,"$ var/var_cat/0UT aaux cb4fa``a f`afgc`dPK vS var/_index/UT aaux PKvS)kvar/_index/.zarrayUT aaux uM0$Cjia?$ݶPΪuIj hmV5}"g$SJHWPV586N -<ǰ'N'xS9hG)cU=8z/~~r$h"j&6Y0_`.BVo&"eɅ숶"YPKvS!bH\ var/_index/0UT aaux 5;@@E;R2oTV#:e؊5XSL5HIoWNvlFJ8 /FCzD+k*Ԗy )T /PK vSobs/UT aaux PK vSobs/cat_ordered/UT aaux PKvSL\7obs/cat_ordered/.zarrayUT aaux u E~ cm?i EQ&B[6ཛau|ޫdq&a&om_ `*<(s81 2}96Bns1/$Bl.;$@+(b:Jkz_Q+5zPoPKvScBiHdBQfwr?BnI\[ѰÄȢ%`UOݪ=.W S8ʁEI$k(R)P[9/LP/\rY#@%K y˯69{oPKvS&c-. obs/uint8/0UT aaux cb4fc``a= ~i3O;?M-a,oܮѭr#PKvS} obs/.zattrsUT aaux ]A @ E=dmK*"C2&2M"HiV?. 
/̽C'Fk9{<"2&q Vq$!߷5W (ac#>LR]PK vS obs/obs_cat/UT aaux PKvSL\7obs/obs_cat/.zarrayUT aaux u E~ cm?i EQ&B[6ཛau|ޫdq&a&om_ `*<(s81 2}96Bns1/$Bl.;$@+(b:Jkz_Q+5zPoPKvS8e",obs/obs_cat/.zattrsUT aaux RĒb%+x_??UPKvS’,. obs/obs_cat/0UT aaux cb4fc``a= bffdaeefgPK vSw obs/.zgroupUT aaux { "zarr_format": 2 }PK vS obs/float64/UT aaux PKvS% '9obs/float64/.zarrayUT aaux u 0E~Z}A)3J4&(FfPo꦳Jg.s$u]pM瘇3i{R̆5,x דr8V.Q ].("8ޝV˲NVh1|o&eKɐD^b2;l Y;'K?ޭ?????P?????PK vSobs/__categories/UT aaux PK vSobs/__categories/cat_ordered/UT aaux PKvS_Ω}k$obs/__categories/cat_ordered/.zarrayUT aaux uA0E%Cjiqdڒrw,UwI xeۗKzw_ylV#"[9G tÖ5^8!uyueD4{)ͻ c>YF,z6Z6(ڝ5 KwڤWT bQ!58YL_PK vSI$obs/__categories/cat_ordered/.zattrsUT aaux { "ordered": true }PKvS娦!Cobs/__categories/cat_ordered/0UT aaux  D@]EhD>H΋ MUn1S*OVf1ըUz 5i֢Uvԥ[?_PK vSobs/__categories/obs_cat/UT aaux PKvS_Ω}k obs/__categories/obs_cat/.zarrayUT aaux uA0E%Cjiqdڒrw,UwI xeۗKzw_ylV#"[9G tÖ5^8!uyueD4{)ͻ c>YF,z6Z6(ڝ5 KwڤWT bQ!58YL_PK vS" obs/__categories/obs_cat/.zattrsUT aaux { "ordered": false }PKvSP"}Cobs/__categories/obs_cat/0UT aaux 1D@]rB!DqJ/Sd1G'$1)TRF:}kШIjӮS_PK vSwobs/__categories/.zgroupUT aaux { "zarr_format": 2 }PK vS obs/int64/UT aaux PKvS^/7obs/int64/.zarrayUT aaux u 0E~څ}A)3J4&!Tߛ iMg\Ex¥?͡G =LFXM֯OԼjՏsO9@"r`<ǶR"-"-{M1!脡^GLPVP }vmb 3 m_loPKvSJ_ obs/int64/0UT aaux cb4@,@_c%O\|7ynܗ{G@5 |(cfPK vS obs/_index/UT aaux PKvS8Vkobs/_index/.zarrayUT aaux uM 0BfmAJ4&(Fz&j\tV7/3eLRW@+5 #}p|fBdJ T㢂 ڨz`)2s ;z!pץn1V}>g9MC6j=lu;I/d|p?l R\Ȗh';$PKvSh~TLf obs/_index/0UT aaux }; 0በul`+VFr'aM'=EvxI +K -XAO1L.FD+j%H Yi&%wn"yPK vSvarp/UT aaux PK vS varp/array/UT aaux PKvSjᩪQvarp/array/.zarrayUT aaux  0 {9D"v-VIց{wRGsJ>'ɚ@v-̾Hދ6Ó!X*x=wr8qD5+tQ ^ ]㡋bbn K3!OQCg6,+xR-bπͭbD64Yl{PK vSL0 0 varp/array/0.0UT aaux ! 0 pTЊȀ|%~H)@Bfp pٶo@T%pPwh6^>Ds"YBzs\F&p2 Dzt' vP6P$O4%jNo CJx|N@9F d0QD{V5~|#0z(>?zhpІ `؋.{6H>0$( T0^(lRQߎv`'jxW ~F>Lp4X?nHx0rt\3HTDrt B.$"$#nЙ;ưٌ .⁘ߠ;oH!0NMdafw cg7 mJhI{G!w.pȗ2+ojGJXtM+N48OYI +v}- #i4Wzۃ3 o[JvDt cODݴ‹);lj,d23rsu>arFx B^4js5EN!@F :b\(~<:ϗs ʁMyxEb+n h˲avRgu+09߸ȕQF J;&ғlyP\2f uAev{Rm-O*+0na 6u^Yq5aYx0u5|eR.BQZTEndN*%LR7k=Q=E%'FvKIA@w*Zpew@Q޼j鞊@lYs!i{dk1;/R c6JGœozL2'J1K=FVSuw#$*ǜ`SؗdE#_"4iBM~r@qPC$P䝦rؖ7sL߷\Ŋz]UMq3*^{",F24Vk+\65+Eƚ^J[2U?S,|}nb.Cȓ%H-qW_Aڮ&k-Px`y<靐Hq,o(XzpCk KRTR#;6E <5w!\Ǹhs +)k$]Ol>6OBD*X,ʖLaCÛta߻X*jyU)a~olv6_w+0'tL5<;(icBFTc:' ~C\ڍ|%Uα9#cjAVK0qf,.͐f@%w(!ld3>LKκPO# do}O^SfH Y54 C:WѐC:4~kB_W bKtJͅ"|閪ҏ 4_c:ijʒZ,[/MZa0o @$ Ԗ ņm(L "Ӳ8 xyi$_Izw~[7HВR̴-oYm5ry4$aٶz8?xWTirb7hZ:՞΍Fϰ-,`l:oM=e`F}I] S=~sN 8UH;e97+li|! /mMDo7X8M{$ߚT&dnQ@3a)gβ([H',V EN.WƮݏqo/v4-ƈT?-ׄ<45;QhUڭ*NޮQbŪ'5Ԙ1xZ aBmxOcԳ5nMlTL{u !xZvz?}G2 *Im8ƄexjxCc\F|LQ &Ԕ,s>Xi01UjBmIzN¥L"b Ġɨ'T#LYO3?bgࡉ.0лtKɷM=#Y,:@ZL\ W-},Wޮk!l P.I/It:砝-.- UW7ƍpo,8f[|o 78"GJo$(.IO̞+S{x/(nuNv3Bh_PDzpѣȾߊƨֳtʵհ߳| ?xP?????PK vSw varp/.zgroupUT aaux { "zarr_format": 2 }PK vS varp/sparse/UT aaux PK vSvarp/sparse/indices/UT aaux PKvSSP5varp/sparse/indices/.zarrayUT aaux uA 0EBf-vS1J4&Lݛ fY%7kup˟S׫ +lYT%Iacp)!fLk zs7G hXM}rQ=-ՊKuy/Җ:1mPKvS‡s varp/sparse/indices/0UT aaux cb4f```a bV fdPKvSrLIsvarp/sparse/.zattrsUT aaux RԼ̼tݒʂT%+Ē %4%eEřy Uzz0``. XZPK vSwvarp/sparse/.zgroupUT aaux { "zarr_format": 2 }PK vSvarp/sparse/indptr/UT aaux PKvS.7varp/sparse/indptr/.zarrayUT aaux u0D|3%z1 cH-mhX(R! 
ڋ{},Yd%Ǜ$u}pMg 9iIB#[nfpm\:x^t^ Ecj|!/ @9bEP| !_YP'FgPKvSʖq/0varp/sparse/data/0UT aaux cb4P```a 9 T.Ͱw(oPK vSvarm/UT aaux PK vSvarm/df/UT aaux PK vS varm/df/cat/UT aaux PKvS&Ѥ7varm/df/cat/.zarrayUT aaux u0E|5 qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "vk"cRtXsy&4b2t!.҆^CLX!C ~vg1oY)cGx-7PKvS?(varm/df/cat/.zattrsUT aaux RĒb%+x_TPKvSE"$ varm/df/cat/0UT aaux cb4fa``a fbcafcdecbPK vSvarm/df/cat_ordered/UT aaux PKvS&Ѥ7varm/df/cat_ordered/.zarrayUT aaux u0E|5 qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "vk"cRtXsy&4b2t!.҆^CLX!C ~vg1oY)cGx-7PKvScYkr*..u"- km(PKvS_#$varm/df/uint8/0UT aaux cb4fa``a ^xnك }{^}PKvS#vvarm/df/.zattrsUT aaux R̼ %+8K"S_ZBdK`xd̼3dD4R" %0?jmj^r~Jf^nIeA*U)%iEJ*R3@ jPK vSwvarm/df/.zgroupUT aaux { "zarr_format": 2 }PK vSvarm/df/float64/UT aaux PKvS1+\9varm/df/float64/.zarrayUT aaux uA 0EBf-J tc"iLPwo꦳Jo>$u}pMEt)fCZ~ȖY9g&xSyX1Vk xK5sӗnI  |"FhRܿ\,JmӊdPK vS,varm/df/float64/0UT aaux 3,?&?pX5?u'8? q>?!T]?hr8d?!O?) r?n?` C ?вk?R@?L7#?0G?dqFId[bO0b JEN| "2}SppOt5ǘ>') .\vHVb59 v2`.<-rVZSoPKvSr 9Kvarm/df/int64/0UT aaux cb4X@,@_/cxk(sN$ CBPK vSvarm/df/_index/UT aaux PKvS)kvarm/df/_index/.zarrayUT aaux uM0$Cjia?$ݶPΪuIj hmV5}"g$SJHWPV586N -<ǰ'N'xS9hG)cU=8z/~~r$h"j&6Y0_`.BVo&"eɅ숶"YPKvS!bH\varm/df/_index/0UT aaux 5;@@E;R2oTV#:e؊5XSL5HIoWNvlFJ8 /FCzD+k*Ԗy )T /PK vS varm/array/UT aaux PKvSrmYQvarm/array/.zarrayUT aaux  0 >,Ʉ1vc (F+kKz\N$ْ֌=-}UgG_澭Väֱ*x=u8rG'\/qF1FJt{إߓkЃ녡vW1@΀M-beZRR$PKvSvarm/array/0.0UT aaux g< +BCJ Q!$2BBB&+[RI*[Vru0s>qkY@ PK}+UZUf5*]&u&Qm.T/4_6xZ=WuU곝7pZ9ԃ r@@gv n1Vo[0mTt#^k'P *=4?:)aMdGԦ`HKnm֤]25D2HißoiF>5gꧩ[mt(ި77|* P#L{\Kx)0ݢf2g#X",—5 ?̩,+Y:z]\2.+Y81*Mp8P_wڏMr_ri8f WXN_304K|VQ=!]1;舘d"aϯ . >.Hq0*Du9Ӟ&77SKKJM|$䞝>||iSeUF8 ”Ia)6]#.ԫ Gۚ4֌eU)fU͠q\Ab^ɁU /(;t%}Colsͪ>h;[xu V}r|Q@1DyMueF۪}IAdKIX2#yK_?F9H T'w;6L%5KF±Upb~[s1YN?Z5z)yA}{y>ݨ0YԑªX&kMy;Ow_:vdZ3A 2x~FoX+=а )2&M0^bqj3nKʎNn韛)?<{$17^2C5y:h<2|M껞[ќov0g AR/y35_cps:ˡnt,j,~m\B-d- 0Z 6`G)3CJy3PTm'-v豕uWτ4KdoBtGcK3Ʀ|ȶuDJ:~zG]ZU)@eC5su?%]q-{6+ib8 QMSi,ýu [D~ VYT*]Q~I]Q\Eo.##X,w1rmՍY3@/gDG~ 1r,bӪ=̬EC{]h':1+]=ˢ/M8O0hr4!tmJB>(xuq,%x[XF[ƑGedgoLN~we<<[uku@ %{;y@%ͫyySX! 'O]vX`e!Y2]^XF)^'6p; υncĞڒ?e_6(a&kZ?Bo.q WN,z,s=FHΤ͏XiyMJr<9wEK?ocS"/֨-Y.2 Ӊ~~(P u~~ftff9xtNۄ4I Q-T&ul&ej?&~}f뻫{\Oa۷f=k* c<}ѽІKV$iTӗ<(CUp H]mӬ!Uu_6*qW^} ZWƲ9xTM1RQUPy,6{/`@4דfL. hT8ɦ\o-ɞٰ=;N:e^L"N%Zh*'WKTY0iQecĶPR:k%C _NvI{0j4sC_g87vfyy>S%gXN;yinۺ#_o+|敂DCf> K¨%5T[d!u 7?(-h&[|LŸ#_wϛkL 7t4ˢ)b Xm'`Cm?="k~ZjV;|G;|6əťY/Vx~ujxfœ?[uun *rʯYDհ#y)聼I6-M.y ߜb>P7z tihRRC]'yeVcvzvC7;H8HxɏQeC֭JW*Fڼk7npk&>iwdӕєYlܯɯMHѽeHyB<6lo,*1&s/ɾA%.8Cwr9ˑm /nG k7s\Η>a)=U]rFM Tv3ad|RXgkIi<|eA,$. Us<׿곉6]]{0#.ΫVQY{O>.-CAԪtʵ+V+'/Df㋼gg[ Bf""66ª<$6>.<-K҅ՖeI|#ɸx~Ou솭P+gQ[\˄ p@dE֬1>Z_+p=o;aeNz3Qǫ[Sfz~U\%r]өP?!r5~ju<wrӑ&GҗycʦԖ'uzNj!.mh/ NЭzkHJ%(Oijpyxkszfeu,&U}.7[,:hn/mJP}}tIߞ[l =gtYUS@oa^LkHYˍ24ɂ|V7[e3Vqh&Ժ9o߼й5.p`{z/ 5ZlPA{SN#k^pK)6Rl/OQ=*QZZ 39#^;Q=T{A!^|qʸfˑ]sی?\&k{7-.cb2: @O_UN|5Wt_ԏLWBls}ck̳iᭋybn)j+ l{+ܺso7_ENEi?-oڤST|2_{rGOũ_`mJ;(P@H.YIzJl?xflq֒Wkz\HYjj/HOy<7։G7/o-%>E(dckK]հlBi;&\.'i5Ҡgy=vmò nLBu%# ~u4 `R~*KǦjnoÏFzz&;s5pN[V4. qִsaޙ MKWHo@"u 2f=B5=Gģb$o3!?|/,Qng:e; r-_9/tZwn6M\OGENGwks:(vkN>^iW8{a(!#'Y{aWȣgGm\5.A@*oYA^(@y8wm*c&;g{~%vϹm7h'c0\p}phHgWrwFci^M9ZCPa>2w5#, ʻ]# J/V5ۖ4!|S~=L›Z$ W;B K&~D5Z*V24?6.'9藙{ M) 鵧8y47θU1/LpCX6%;n%\ fy&',q~; Ty ]R}P[;! NR<^y? :n hU}hVnSvpY~-BJ~JLK /~.@9bǺeַ-ʹ>+,ExOX;OZ{Ht_F +*٬~\B3%~*PVq7{ҫ-$/}? wΝ3Jڏ_)<:kb\`1>^_X}ڕxA35M=DYhE3lO{:dАvπm#<Ϗk;u` BJ'}wmj#Xv9ާtr;aQ5wnEBSרj۾vo Cs撞>ķt%I`5IuهAjy /߬x&ǶdJv17E]sJG~I΁+JqH)f-re{{~+; /??n:I;TiU–` :ం. 
w>S5{{ jIw[;"bIZS*J'k+qWflGfmYKR!0nG('BƬ{af$Z#BQ3o'f;MFˮ%VxK=r|qr׉l`]UƟNᇙQ/goIͿ::@b h'O_YW(-B3O(7?N5qٻ5m2jS{*r Ӌ2)ȿ' <(zO&rKvf:ʄx2iZDQW@4ADa9A:gvv&A-1cdÉop]] fHaBX$$DQ7 E汁Il$!T")A %bWa6lgNgٌU j P"OA_}A5, һK,1D0.wo`8F&zMn09EARa"Rh,5YK{p,>(iE쥒P3X<: &Á\?Ɔ)E*{ 9QI( lJZ"lj&d1D108 BmgPhS=OցH ayH_An0IS V(U %m0A D,y Ģi$";!!p:GYaLpL I &÷OpF8l2NӨ4q~uKɌ <@.&Рi0dќƎ! zlEb(U 5ƢІt6yϜ@d$zAǖ>1x EEщ$m~J"O#Fid sj5a{ad, mdzV No с,:nơϬ@86 d3p,t!ã(!l tSp2J{ lf h* D b JWPfBQU CQzSTE6eD43Mg0p V!$`<̱c ?1hPK vSw varm/.zgroupUT aaux { "zarr_format": 2 }PK vS varm/sparse/UT aaux PK vSvarm/sparse/indices/UT aaux PKvS17varm/sparse/indices/.zarrayUT aaux uA 0EBvaݔzR$ G#3Fݛ fY%7kCG?⟧*YGD~-:,~\5xI {,gp0 QRL7xL1#g >9ᤑ{ `%إVƼXVePKvSfz*::`varm/sparse/indices/0UT aaux -ʱ @`eyvUDNnNMFI🃋PKvSƿLtvarm/sparse/.zattrsUT aaux RԼ̼tݒʂT%+Ē %4%eEřy Uzz0``.ن`v,W-PK vSwvarm/sparse/.zgroupUT aaux { "zarr_format": 2 }PK vSvarm/sparse/indptr/UT aaux PKvS.7varm/sparse/indptr/.zarrayUT aaux u0D|3%z1 cH-mhX(R! ڋ{},Yd%Ǜ$u}pMEt)fCZ~ȖY9g&xSyX1Vk xK5sӗnI  |"FhRܿ\,JmӊdPK vS1ְvarm/sparse/data/0UT aaux 3\%5t=?k##?,0?x'x3v??f$?95?,~<x?ڬ/?|cB?>^d? }?8 ?Dݣ? VW(B?\.?Ю.?FL-??n?1ި7?PK vSuns/UT aaux PK vSw uns/.zgroupUT aaux { "zarr_format": 2 }PK vSuns/O_recarray/UT aaux PKvSۓuns/O_recarray/.zarrayUT aaux O @}سDKDE!$6wMqteW {9xox0Le!/ȓHgU[ר,N9qz5*tE",ĦrhAAy&8VB:<$ Ub0Ӱ^ضmKg˳_i(#!ǑT8Hapoo~p+0 2%9@ 'T|d"K޲Ĝ|cI5PKvS%EIuns/O_recarray/0UT aaux 5ˮP&4M:hIۜϨrQdonn@.3q ",ZJVe_٣W[V*{O;nND:f$qҸR]x~ŊZ#z& bN(^̝׍DW3v?qLn`tAXúr$h侥FNE#'?;.>LHB9\)jo_$RNgS1WR3_ǩq18dd*-ZfUVp#;cBוJ%hل γp6d7)"p cB3k)=yCp@+m++HKSDS?'R,??8Oմ=sncHs5y],?WPK vSraw/UT aaux PKvSR8Dl raw/.zattrsUT aaux RԼ̼tݒʂT%+r%4Ԣ<LAqF"XS4 :pU PK vSw raw/.zgroupUT aaux { "zarr_format": 2 }PK vSraw/var/UT aaux PK vSraw/var/cat_ordered/UT aaux PKvSz AI7raw/var/cat_ordered/.zarrayUT aaux u0E|5 0qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "BƵG1N) MGE,9Ƽt|P[iC!&h\?ԳƬ픱#[v*7PKvScIW8F~} Q퍁gNUqPK vSwraw/var/.zgroupUT aaux { "zarr_format": 2 }PK vSraw/var/float64/UT aaux PKvS ՟9raw/var/float64/.zarrayUT aaux uA 0EBf- tc"iLPwo꦳Jo>$u}pMEt)fCZ~ȖY9g&vdqFi`m_=U\W?.@.S]8N"<@;I%k{_9b XoPKvS=r~raw/var/int64/0UT aaux cb4p`d`: NZ_|{B7q÷wyϽK2/}!Ci. 
!AbR  ,   PK vSraw/var/var_cat/UT aaux PKvSz AI7raw/var/var_cat/.zarrayUT aaux u0E|5 0qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "BƵG1N) MGE,9Ƽt|P[iC!&h\?ԳƬ픱#[v*7PKvSAXK",raw/var/var_cat/.zattrsUT aaux RĒb%+x_,UPKvS(68raw/var/var_cat/0UT aaux cb4f```a ace``df`eb`fdgPK vSraw/var/_index/UT aaux PKvSYzkraw/var/_index/.zarrayUT aaux uM04Cjia?$ݶPΪuIj hmV5}"g$SJHWPV586N -<ǰ'ND =\J?sr b945Xh?/v]z!+P{aBvD[)YPKvS6kraw/var/_index/0UT aaux ;@@3D#1cT6CtbU:˰ pw'JԮ@IwZ.RKFJtC!@.D6 SG 5dzEg?ou6>PK vS raw/varm/UT aaux PK vS raw/varm/df/UT aaux PK vSraw/varm/df/cat/UT aaux PKvSz AI7raw/varm/df/cat/.zarrayUT aaux u0E|5 0qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "BƵG1N) MGE,9Ƽt|P[iC!&h\?ԳƬ픱#[v*7PKvS?(raw/varm/df/cat/.zattrsUT aaux RĒb%+x_TPKvS$68raw/varm/df/cat/0UT aaux cb4f```a ffeaedgcfdgbebgPK vSraw/varm/df/cat_ordered/UT aaux PKvSz AI7raw/varm/df/cat_ordered/.zarrayUT aaux u0E|5 0qgCjaCIwۦj7Ϊ=9s3w/J?{7 a:>U28[Id4| I9UNl "BƵG1N) MGE,9Ƽt|P[iC!&h\?ԳƬ픱#[v*7PKvSc$u}pMEt)fCZ~ȖY9g&vLcm֣#`4/`.BVo&"eɅ숶dqFi`m_=U\W?.@.S]8N"<@;I%k{_9b XoPKvS"jWsraw/varm/df/int64/0UT aaux cb4p`d`b o{?Iי?ԻAG{,Z|[)}^f D 2,h03PK vSraw/varm/df/_index/UT aaux PKvSYzkraw/varm/df/_index/.zarrayUT aaux uM04Cjia?$ݶPΪuIj hmV5}"g$SJHWPV586N -<ǰ'ND =\J?sr b945Xh?/v]z!+P{aBvD[)YPKvS6kraw/varm/df/_index/0UT aaux ;@@3D#1cT6CtbU:˰ pw'JԮ@IwZ.RKFJtC!@.D6 SG 5dzEg?ou6>PK vSraw/varm/array/UT aaux PKvSQraw/varm/array/.zarrayUT aaux  0 >, 1vc (F+kKz\N$ْ֌=-}UgG_澭Väֱ*x=u8rG'\ x8hǘZ#%:=5AvENB+Gҍpg릖Ŀ^2kfk$PKvSƮu66raw/varm/array/0.0UT aaux %w8 EvNB%Dve&DIY2222 IB9q~u}?_qXc $@jc%Xoz2D M:bFn.%Fu2$E~Ɔ9EE;^M5ϑspx-dnMlyo(dvFz邭U\7;1ߥl8i +QMV=󷔍f#ɞ48)ƌ TB5rl?4Kmb 엑h>ipِ̾b`pgDs줓o\=_7دqh | !@7AI PќU;÷eN`:09q2荽Ƥ=lɦ5--w[:T0#:NLeUn* z˩fzxZlsu 2;|.ˢl}$K;SrOrQɨi'Og1Yq}OI<r'p}̕*cH 7NMq fo`Ә͛ BS3/cT{R2^L$4bhWվ im1-V*3FgǑOA?YeN{2sB@z -fl,{nkx=;QŞ h_W-7 3c:NKxKkj#Jc|% ]kf-ߩjY /@Nt1COu>+ӂǺmdq[_]2N!Zr- 2b0.NÒ&e8Sڙ*ř8׋ 6KL}NeQ1uJ.1'r=[z`9똭|!/Jdl7?_"^D(O&X9T$ ?ё'],j#_zd]p̤G5೘|_Uv݃˖RC5>5JN6!'o@Zw/PrA׽ƈ[LL;@7aK3MkVQ|V0_\HaYղR>n?0YpC%{0F9+1ve.k''9wa[rld8RCX7 IqPV{$pZML^P̨XURYT].KqdH^˜u!ȍN 6Q*Aƛ<2,-X 1lØ3z$ᖩqH(Qv,szhf OGׇ}Y.NPncQ`O q5O7焘V~ $JFN ;W',.eBÉ$O'!SsawµyCo#ztٸ́FYjhjMBd!>JF 6*s+MnaGQ NV@%ݲG9tÝYkjdO9!͹təM7'2sR#W[1s[2@HwpIp2tJ-*BYֳ/$sޚbRlXoȀֹ_YYgxg!RB}k2+nͱ[%UM  balf.df~Ä0sHxSෙ s.@124QN?% P CIja>H_ʈ 0`<Q|ȝ-N_%=zC1辔!b{ Cj.?{Hz1z珣SŰ[A^ӿO C\}NKv*TvC}\ǿbcS:~\4ij"CV{zCӴnjDw~TuE~rSy+KEGMXuEw0'U ɖғ*8N2KC?Ҫ  $b<5!ҝDғw eϫ3@)ΙW𛸽v'N[MU\lSn۩y+#}5h. ~L`wpySH-9+;m-|Jxa{ӛ 7 xD+Ž_aT^sM.%FQ|7@`p /^c*&"lrg>gub>|X#9:2 O{2t={3qwD5]jw57=:`xK%W#dхMr͵ێ6fp&cZ|ﭰnM>{n;8A|3%}Y fBմ%3? fK|/RP7]CE?7 {=wqW"̂O_%+xmD*3 fsǜ J&XQ'*sv(1Ş7cDe[{;WMq|m2҃z lT]7Z#_%efcD${_]>Jr43hbʉ̩n\}+/{EGCsZ>?S;mIh~}-۫:~jIBSx6źóY^n-{F*Mѳ. dzl"g}ZN7XqS U(SS,Z#u)BE _ (OPlZD(,̑)8u:Es! [VAHO?}JqbȪ!ʽʒ)Ʒ>erav̅1.4?<4O >y_\JP^\Kd庺Tx'/#$;A#yxz[t's!rqqΉ2:ߞtpo+z;|WVu):jK0mqm;7&6fMӾ;%CB^N!8ByHt!E@4FV|yi=ajCܠttnS֩breggk;jQMg.:N<0[f )Im")^^&˫`wGkvM? q!-S7V;ȴ,xv#˪YZB'vMnV8L(#Gی%bt҈K߃Ǣ\-We)Ac\#"D%> /":\,xp?ɥ M*M0R\j-GW킖c  w"Jˁv <ד83=bANfgMS戕[9t|H*s ґZo=(jU 4vIB|yllzC2{^*8]ѱ4 v+Tw?!~"Qު'ţ|pdL:uө=.%#(7}lcSR[b޴m|Ol**2/ju`x0.'}M$şY/WrYcjvR#-03q_N2ܩm x\xA:&Zk78#tm `_:pyȭsUB^ӺKDju4q/uG9 NFv]7~j!91Eq҅  z^C |ϱ"cO}6A uA@}&G[Uj|{F.z.eo'N% JX_,&+y! 
3ַS#"F ;YdSeKH?`>]xj(|Ҏ#щDޑ= ˊzPO?d]tzG%LmW#YqA~9O+Þ+L:E'Kmk{XDzr<5g<; -i2Z2@M\^eb~_B9qd S%;HHWV[i_4cB4k!eѻpՍ%5h>j Hrz-O[&\w5~h\ ;D6yji זU,:w(+,~#vZ) g`miq2bpznJs&]n .'m+*w]-*i=r!r.o=tuwo2˚կb- P\wR!@۰ׁ\v;jPA"_%(V1&R]&"Op~XԽf t~r~xlaUغA.n6#Ⱥ,J?l L6+td|S4،hYyz^ Oj5-4~1[Jnal0؈^PxTvfHo>=W㞚u!k$YoNbV ʝ8fl>yj[Jm6'' 4"&**ΈxQ\c[Z_!E"b|Vmqa@ʙM@K4/~ HͱؗTQOԽ6q:,ѦP.KZ];jBh~!FmD8IB"_YUHYHu(S[j/%T5|Wn`=&_~ j3kX:#IվhMl؏)hWV{E`[x9(jzӥcrHݰ;fk &>GoY>_UZ`yʿL1x>ٶQ Px7XWUaYټff]wrV[̣I^Rio,ߩLcoTegn닌H?~n튽 rۅ^}xPQʺhMia{vۄ;쨌% R)r"Տ?WjaU`n{3#4KRLUJ߈~"S''zCO_l[>c1˳:u+dtG?E T2YkߺP>;ĵ"uZ¶Ⴡ}ö r]\]gj BFbod䁇3`ȅĥ>}=#96GO{ A"tÀeߕ{V C7/}=rigS4#`%8sCG7cB"JjS|iVkKgIqXFLmZ~$oʱg W{4yǰxxS诓8u>pju^ 6YOwŦd׫+&--ׇN%. rS{}DCE!/Y&no;'vEwPW:[q]G@Rޣ|vd@cGLCeZ1C^c޸njK/i`yBIt~*-*r ) |ʐp{0~37 'c*]bR/*k[P]ޣWoۼE]ySrþۣ>L3;EAyKN}_sz9 ]qO|LZwo3L2z](Ƞ3GDb&it$.'}oMBǻ x0&|;%NyX]R@פG5S m[)ym>EhvmUV=̏;6z+͑x"s ڹ|e9Y*l QƔգFӤ”߉ nZMG~Lq* O}}+*` ]fHnݥrlE&:1g(ú;%v[z)هl2OLSCkNŹ|,^l&}p_~r~B\FTnC{'_y* "s]D㽤Aliy^>|y=~+4EDHPlC{(qvtcT/ A {۞3 *X2,io f(<3ћ0<=G(/ȢO+LF,/VU!:o"ay̼ݼلW v{&//?Sf )ٔ}~Ǧh4Kn7Dapˉ?v~l׹j+AߩBs6p2=~瘮d>d y׳g&cU Z, ^%>rt5 nU[ Q@fU?+@〛ꚑM{G4nkqMuzx5a va{Mˤ,%Fk3ؼ(o1hw$ 1eNě⋖!e3E![J'ؿe&tҽ R찵;^r`KD u'wN0a%:Bm Hs.4YHַm.kQ%18eإX5\!PCqC9$k|8Um6 d{ 3~ͩ|R=ިi2WR(6+2:U5~X0a.w`=~P.H* abVG?$ L+Ӱ,0-R#.bxunx|YTc#c+:ܔ&&0 b@!uқIg #.׍W鮊|&G#F^"xd$nUpBܭݔ=mgn|G8m?Op=ST2cU0(A[]q<|j>G-ɨEc|rjp*2#5(?X ^ -5t|v #.U8hk)sޛ 3FIKRYV4w6ʬUXu[WT+$r1b}lzNn>_Ia~n XbvWZz56>)XbiDuXbMV]痋>5F{r9ʀ|3He>+fҪWX|Oq!} !6W wη)a7N.ԟ-!sA~wLOnFEI4k&͆iGr-I K~ڥ)PW~/C>srDC%? ZHЮ?>7ؠ^`Ȩ+3-lkMbfAyWkZU7-HKWIOҎC_3Zs}*%+7~7B|pq!5O5ȸ%Վ7nǟ٨?|;O=FUb8l/-v%3S4oD+{;+Yk(&ff.t-׊GzA.M8F|;OwkUJ g T!KJ{P帏\- }9}D;_ik;Ɵl*mGuG;LvWr֖ }ӟvBR?phb/@ɢw9R匶2چ}m>V}.E'}/9F,2* 5yjͲٞ-_(4+D?\­ݯ}#ڪ_;Ӯk/y͋ŗJ }_>۳f 5&k׽瘓5 ] [1,9nTSL{.И-3b#.Eojb~ruSuTT<1J~WSߣuk$zvbi"nmR P]07.ϯ}봓Swi'V= ՜bjY,[7} *~w^Kgb W/M[$'g2M.'B?b,n( b'TU ,YW?̨+X7KÅD{$3W9 xR,]]auhKs)-[?{֒j!o8G8Pk`.U:5ߝJ:ڟ72B_/H_xܩB؃-{߻."\C~f}TALD~ ƭ5?z+9ՂUL>(FJ8Ġܴ+b-zs$,)޸~: c qʶABҏt0A] ܆;2q\zQp6!¿F" =k/*c>;Lp7:ybm2~aj|`qՍí1Vj'<.g?,|ZzM35~M{ghYiM5^)v5yhlfJ$!{F?' %Ϳ_nʾӬF.p|KǭB_)a8pgZ=b` .gx/Rz]qzpZKP`ڄㅘU%Tx\Tڻ[IasiΑ.xxׇZOcTr8D}&뙿8_6/MVUK01JiIhNiB't= E.SP/iD)E|"!X$A%DH!-Sj$%BPd <CQ$DB#w+xT;="@ D"* BR*9^tHE<] \0d*l lt#;hE@x6z8EMcڐS$lRhe28 b R##id YwJFRuy7CRVы F%-Ԡhu(Au 8.) -'W),Z1VL9(RKQE>N5/,DҊEE>0X PKvSpuLtraw/varm/sparse/.zattrsUT aaux RԼ̼tݒʂT%+Ē %4%eEřy Uzz0``.ن`v,W-PK vSwraw/varm/sparse/.zgroupUT aaux { "zarr_format": 2 }PK vSraw/varm/sparse/indptr/UT aaux PKvSr.7raw/varm/sparse/indptr/.zarrayUT aaux u0D|35x1 cH-mhX(R! ڋ{},Yd%Ǜ$u}pMEt)fCZ~ȖY9g&vUa۞%*x Iy+~\zxNa' NpSXk5OeR >\f @ pPzuq*JmZD'ATjK-oPKvS$Uraw/X/indptr/0UT aaux %ñ @@DjZn0mp0 󉗼Tq/;2gɆ-G\x͇2iq;OPK vS raw/X/data/UT aaux PKvSy);raw/X/data/.zarrayUT aaux uA 0EBf- tc4&(FfPŻ7 ZtVgn@Vk+d4dULۓb6lisjT;zpC򈹲Z_8ç1ŀQ |"P| xn\$ m.PKvS6iK raw/X/data/0UT aaux m.CA336J*iZEHĢ;XH, [qHM;3g8T93 pRx)4q&t -~#Qw0&Ɛ q'b2ZEʕ^f:CAEx͇lkh,~Dᚰl옆x{!5}DmQۈ۔5;ֹQ];6FR1VUiJRB Vp9j{bݔc?Мjjш0iSu%({9} emqf3\ьύ+2Ia>E;A~Xae,e%]|`x3epXf{ PK vSX/UT aaux PK vS X/indices/UT aaux PKvS]}9X/indices/.zarrayUT aaux uA 0EBvJ tc"iL081BL6*y_ۚa"bj1 z|1fa,@5I! G\tXk.!.52o?K82[?PKvS) X/indices/0UT aaux }Q! 
Dsp'X}نLf?·d7"EE˓I6h(됆'Z!˥ևoBTotri< g8K<,#rHX`m"t;|b0az`d_C_W!8o,Q@$Swb{PKvS(Ks X/.zattrsUT aaux RԼ̼tݒʂT%+Ē %4%eEřy Uzz0``.F`f,W-PK vSw X/.zgroupUT aaux { "zarr_format": 2 }PK vS X/indptr/UT aaux PKvS o 7X/indptr/.zarrayUT aaux u0 wyfRucTJC"" A6A*woBOݜ~@֮k.=a>Ua۞%*x Iy+~\zxNa' NpSXk5OeR >\f @ pPzuq*JmZD'ATjK-oPKvSjs)U X/indptr/0UT aaux %áPf:3ZD@3Hbbc#d L%y !v/dԫ7>}w/PK vSX/data/UT aaux PKvSu@;X/data/.zarrayUT aaux uA 0EBf-J tc4&(FfPŻ7 ZtVgn@Vk+d-dULۓb6lisjT;zpץ.seFppYo)Obz]#@9;jE\Rܶ!IP+ePKvS1 2?X/data/0UT aaux eJQl&.l mH;HJ|<@o =0gι*[ @X |<苰B)s29:zE.8*2콙?gWşZ r6'@DEt hֈZKq㻜-s7,w"._=6R W-.?~PK vSAobsp/UTaux PK vS A?obsp/array/UTaux PKvS'ٹQobsp/array/.zarrayUTaux PKvSLkzobsp/array/0.0UTaux PK vSw obsp/.zgroupUTaux PK vS Aobsp/sparse/UTaux PK vSABobsp/sparse/indices/UTaux PKvS)a5obsp/sparse/indices/.zarrayUTaux PKvS"4obsp/sparse/indices/0UTaux PKvSCZlIsobsp/sparse/.zattrsUTaux PK vSwobsp/sparse/.zgroupUTaux PK vSAobsp/sparse/indptr/UTaux PKvS o 7Bobsp/sparse/indptr/.zarrayUTaux PKvS,&:obsp/sparse/indptr/0UTaux PK vSAobsp/sparse/data/UTaux PKvSXj 7obsp/sparse/data/.zarrayUTaux PK vSXX obsp/sparse/data/0UTaux PK vSA!obsm/UTaux PK vSA!obsm/df/UTaux PK vS A"obsm/df/cat/UTaux PKvSL\7\"obsm/df/cat/.zarrayUTaux PKvS?(M#obsm/df/cat/.zattrsUTaux PKvS}w,. #obsm/df/cat/0UTaux PK vSA,$obsm/df/cat_ordered/UTaux PKvSL\7z$obsm/df/cat_ordered/.zarrayUTaux PKvSc<dobsm/sparse/indptr/0UTaux PK vSANeobsm/sparse/data/UTaux PKvS% '9eobsm/sparse/data/.zarrayUTaux PK vS fobsm/sparse/data/0UTaux PK vSAglayers/UTaux PK vS Ahlayers/array/UTaux PKvSY+Q`hlayers/array/.zarrayUTaux PK vS Zilayers/array/0.0UTaux PK vSwMzlayers/.zgroupUTaux PK vSAzlayers/sparse/UTaux PK vSAzlayers/sparse/indices/UTaux PKvS5E{layers/sparse/indices/.zarrayUTaux PKvS (@|layers/sparse/indices/0UTaux PKvS(Ks|layers/sparse/.zattrsUTaux PK vSwE}layers/sparse/.zgroupUTaux PK vSA}layers/sparse/indptr/UTaux PKvS o 7}layers/sparse/indptr/.zarrayUTaux PKvS4a~layers/sparse/indptr/0UTaux PK vSAalayers/sparse/data/UTaux PKvSs7layers/sparse/data/.zarrayUTaux PK vS\@@layers/sparse/data/0UTaux PK vSw6.zgroupUTaux PK vSAvar/UTaux PK vSÁvar/cat_ordered/UTaux PKvS&Ѥ7var/cat_ordered/.zarrayUTaux PKvScxҍvar/__categories/var_cat/0UTaux PK vS Advar/int64/UTaux PKvS9 7var/int64/.zarrayUTaux PKvS?8=K var/int64/0UTaux PK vS Avar/var_cat/UTaux PKvS&Ѥ7`var/var_cat/.zarrayUTaux PKvSAXK",Qvar/var_cat/.zattrsUTaux PKvSW,"$ var/var_cat/0UTaux PK vS A)var/_index/UTaux PKvS)knvar/_index/.zarrayUTaux PKvS!bH\ nvar/_index/0UTaux PK vSAobs/UTaux PK vSANobs/cat_ordered/UTaux PKvSL\7obs/cat_ordered/.zarrayUTaux PKvScq?r@sAtBvCxDyERFSGUHWIZJaKbLMMPNQOFPGQDRCSbTcUeVfWgXiYkZp[q\r]s^v_Q`RaUbVcZdaeLfNgPhIiKjHkAl csr_matrixm0.1.0n dataframeo0.1.0pfloat64quint8rint64s cat_orderedtcatu_indexvgene20wgene21xgene22ygene23zgene24{gene25|gene26}gene27~gene28gene29gene30gene31gene32gene33gene34gene35gene36gene37gene38gene39gene10gene11gene12gene13gene14gene15gene16gene17gene18gene19gene5gene6gene7gene8gene9gene3gene4gene2gene1gene0YcefhiklopHh H shape@   (}lzfXSNOD+P4TREE#? @  O# @ { @@@# @@# ?+ [@ @` ` s ' 7@O@ @@+ 3@S_@ @@@@ @_C@#O'#''?@         @# @   @@G @?  @C@#@k@K# @g @k@S@O @S@/@3@+#@@s@O@ws@@Kw@?@g@@g@@[;@@@3{@7@S@@w;@'@@;@7@@@_@@;@@3@'@_@@#C[@+AG@@'#@O@@S@@s@@@@k@@C@_@@@c@_@@@?[@@s?;_@G@?@;@7A'@s@3AS@w'A+@@/@@;@7g#s@@#@C@@w@'{@s@G@#@c@@7s@@S@@@C@@? g@@@s@3k@ %/7?ENV\eqz?@`@ @3 ? @[ @@ @@7@7+/@7 @3@3S@@7 @+ 3+ 7 @ # @@@@ @/ @W @@ @3@C@ +@ @@@ @ @ @W@@+@@ @@@@@@ @@ @@ o3 S@C s@3@+ @++ @7G@oS @@@C@ @3@@/@ @ #@?        ! " $ % & ' @     @; @?    ;@7   @{ @?  
OG# K@@O S?@@;;@A  K @@_@# c@#@@@@@/ @W@7@+@O@@3 #AAAO@/@@cS@g@AA@;@AS@C@{@s@kA@k?@+@s@/@wA@@7@kB@;@s@oA@?@@o@A@ A{G@?@AOw@?@?@@@o@@O@S@O@AO@KGAAA@7ABAo A@3@wA;@s@oAk@AA@C@A@s@S@K@O@A@{@3@/@@w@+@w@+@{@@@{@@s@?@s@?@oBsA3G@?@C'@;7@3@kA@[@@AA@[A@W@@[AO@@_AW@@@Gk@@@O@K@@G@@@CAW@C@@@C@@@;@A_@C@A+@@C@@CA_@s@@A@w@o3@w@/@{@?@C_@@CA?@?A@?@@?@{@@@@@KAG@O@GA;@SB'A?@;@@;@A@;@@{@s@73A?Ao@A3Ao@@sA{@@GA AGA@{@@wB@GB3B@O@G&            (}lzf ,`TREE#  (}lzf|`5`TREE|%P@0TREE@DHEAPX ?Xvarvarm8=? Hencoding-type Pencoding-version H shape@(E0TREEGHEAPX Cdataindicesindptr8SNODAACkkn(n(HAC Hencoding-type  Pencoding-version H shape@(   (}lzfIXSNODF@QZTREE&  (}lzfPR`TREE-'  (}lzf|[`TREE|c1ALUcr'9FUgu   6  5 423-./01  # $%&'()*+, !"   R  Q OPLMNEFGHIJK789:;<=>?@ABCD   k  j hiefg_`abcdSTUVWXYZ[\]^     %'π ڀg) ?׀ޠ+܀Р*\E?be?d4v?h@? t1?,_:k?^x7?}Uڝ?H? ?.F\?܇?&o?x? Eq?b>'?ƿ?(;peL? ن??$m?\~۩C?ioz?p^?T6$?|}I?u ?VL ?6e ?\(S$"?Uz?D&?j?:aRO?̎N?w\?b?0HQA?B?UXY?p??R?; ?@r阑?ɬt?g?}5?i ?> @ / I V  @ *   ]   8 # N      @73 :  T  @WD 6  @7 " @G' C O             " $' @(        vwxyz{|}~vnTREEXq HEAPPH(Hh0kn Hencoding-type  Pencoding-version  column-order     @_index(( (}lzfr(XSNODzz}Hp(@8H8 TREEР?fOK?іt?pI?h|??OKT@??i|?}վQ5?FM?%Ô?x}W?Lha?7޸?[$2f?`DRlT?QOu~?J?RVFb?򜘇?<2;?86?GCx5?y?/:? ?l?n(w?X?47?^1??Óְ ?a ?[F?y?.?X͋U?0$; `? iV?N\G?? յO?tC1?\ ?9+Q?.Ms?!$?nl}?k?uB?'Ǽ?珰i?PgC\?h飘? ٘ǝ?8<`f?GI!?4̸S?<L}?0 ?ip?G?ž#?dO£2Y? .$?mZ?u#4#?q5w%f?I?Xf?/?d C?Zj?v p?.`-?k ?Nq??]??w?%MJk?t ?&8?B"?K ?K0?\ь>?P|??i.?D`;?$>5=?^[:?B?[=ex?BAN??,5? U5?tUn޲5?t?.8? #?h#\њ??M+(?.-r?Daw?lD?RHu4?(}oR%?dMG?j V?Wv?2|?7?Qa 7?%/?aXw?^H0??HP?2R?PX?oZ,?]?l#j3?⽖Y?}?l,? r􆜹?4`?̬ضL?m}:?0s2ޛR??@~(? 8d?S7ni+?"@ix?FUm$?8>?8 Е?㷍 ?0c?&όq?)?9? #? )N?5hO?R̎m?\8ǝ?Gٝv?(Y ’?D͓!?d; ?8?JW@i?3?o?w?~!X7?@VB?L5F???k>?EB?fD/? ?4kA?Sb?[?pz?p ajһ?O[?vbٓ?o,X?9}?0J?u&?Dl?wU?f^? P?1AjQG?x?y릍?Fc?@^?HHj? *?ȗ@a6?6p%+?W}!w?E?|jQ)?ذOs?/ڲx?zJ@?0+n,?+?"{?cp>?p@? :?F?EAd5?ݬ?h3?zj?@]j?H?6ma#?& 9[?Hjw?N1?Ƃ_? ΰ?xY?T?S?6"J ? 絒Դ?s3?sYN?($?X"?L$٭,?[`SQ?5p?cSb?l&_?Aã?`NO?`w?e=c?1|?U?J.?X"a???r)W\?oJ?D?V/?N?g[{=?p~ ?KQ?(Ӧ;?Xs??>|?9!cTs?L&+?i_\?c%3#?rX*s{R?ß|C?B['?Ãs?r7?x ?xwY_?|]?{fʍ?0Ô@\?{N?m]bΠ?T~-f?j gs?y?` ?#7?'m7?ǻ)?9?<o?*O?^^>?x+Sa?@f#|_?u;?PGIGδ?>j?ؓ_@?{y?P->T?.3 ?ҫ?Hfjro?(?0 =?'Y?e-k?;gH i?bߟ?ʈ%v?B%?=!?c ???Tb?!q?xY?/k?5dؽ? ;`2?X>?݁?u q?V;L?2 ?dtS?@?rWt-=?l Qq4?^?H]m}4?Wr:Mq?hm[?<Z? ?=ѐ?%zh?UH̝?rS ??4 ~?lbˆ?0-"?1Y>?8]Ξ?̄?Og)?/F?4m?^?JGA?&LC?R15uQ?g?v$d?7]\?f_[?zCˑ#?H_r!?21?H2?4lf?2)7?]©b?<ǐW+?ěF?Bc?-SeΉ?wm*?Yn?rHA?$ ?,FCB?za,u?u7?3K?T!?.?7?GR?W.GS?xn L?ͺ~D?Rw?nՍc?Њ??bO)G?H=?$#?6׷?WG%?'??!tz?.>?|^??Ĭ?l?HQz?wS׎?iyiJ?z=?fL4?-j?$Tj?yݤZ?4N?րž?lTt?ĴGew?> ?P`#?X_8=?1`? l$?w6l?|M?t|?|/+?V$?U.?|(?R]r^?T&?V5ޯS/?PZG\7!?Q?!d!?0E??a޷?1%l?SSn?tp?{.?b?A!? Eq@?.Պ@?A}J?n ?w?RFD?c|?LZۖ?ѶY?qQ?Tt-?y?ƃ/d?9Ȓ?Pgߙ?xt?D '?hzp=g?J?} K??}5?Ns+?I? ??W|H?,j?g#? Ӂ?FH HV?{a-?Z??W/? (?N'?R,?ɍ?#Jo?2Hm?PW? _?iB٭?c },?D? JH?bݩn?oe^?mPX#?Fv=?I?(߼p?`!4iJ ?-X{:?zh0 ?|繷U?*?&ɳ?xEm,?|OoC&?6̠Jk9?r'I?[&*?`J#?o ?-=?ހ@}%S?fgoi?fC?tDY2?-?YC?3w?`U'?xIb?-A?E ?di/N?%eJ=?u \W??3n5;?BetpC?]?[[nT?VXƬ?U^Ž?VVy?@ ?3=>?JW?*P҄?9X3?w#??f\=?TlL{?ujv4? 3?<ҷU7?3T?.X?Cz?q> [D?7L?n׉I?߱!?H7K?,c@1 ?,H1.?gI3?wn:?8,"A?̉ ?ɃH? r?e?WE s?Ge[?<ʤ?c@_?hq[?i ?ݵ3U?yse#?iLI?׋ ?ӌqE?֫?DTL?˧-?*"?}.}s?5^Rk?G,ލ?]?^?W/?4-?Tt?eD?l?3;?$J?s)kKk?DJxU?1@W?dؐ'?8n)?ٱ!?丩?Ru2W_?XǾΤ ?^HF?`dN? +?p?䜂]`S?P=ĥ?4z?asTR?@ L?h&&??}?f:<~?ЌKo?(%)m?`3?ȍH?@,?F~?^Ԉj9?Tanu?+ ?#xY?>?!?ZC?0;7?fS? \?e ?WZ?!.b:?^GB?X (?Yx35 ?ј?٠D?",?z?82}?|3J?d0 ?aIC?2 ? A?ݖd?q._1S?LK?>$?&?4?Qp!?MNHz?_D?~Hƶ?|`Vc?w?5,Q?XR?xA}#?xې?DӺ?.{!d? ?Q?,?lB%?,Eq?,+:e?QՑL? ? ?N1?*?6|*?H&B?$Y?ض%m? df?pJI?=?~?Ry?^f48?g ݔ?xҊD?ߏya?Ҹ}%?R<9?u?P@? 
M?U?Аbo?BY?bӰ?N]A?AN?,M_d?୴NH?*??e?X/?jd:T? :?xN/Ef?9?x^B?`s#?ے>??(hh?&}?:z?O)?B Q?.D^?PU8?vk?4e"?){?AI?0$?fE6?I_?b?BV3?d ZA?%Z ?|z83?ki_q?Ċކ?NW?ۏ!4^?*>MW?P-Kp?(M?XYPI?d?Q<7?w?j ? x?(./?b38?2[s?p?wtI?Uv?[?h]? @"H?pKr?H(?4UDr?\\q?Lj?3?(m?$`3q?? :?(&?pS?j]s?,n?~bRV?_(?3XO?"<?9]|/?;Tr?j=?u߈z?8?U?12?82?p_/살?lծ?\?zҶ?% ?_LE?Tl?DBe?]v?^"?Fo쌦? '?<\sJ?I=Q ?D=&? ۺHm?SC]!?FB?򏐳?p~?` ??Xon?ԍ?8f-?CO?? /?yr ?|3$DL?Y?@*i?#֙'?] a?` ?E ,?hЈeL?C1t.?@l78?ڢ {$?,뒲? m`?캾̛??g~?ś?~u޷???qE?v13&+?hHkTY?F/?(R? ?Ĉ?ЈSo'?k?^-5?1,?#?^?:jV?d=)+?ZAV?k zD?8NC?b-U??E?,^[[?Ժ??;?e~D?Ǵy?`g"?Dz)?-|?臧ve?*?oS#?]֖f?ECb6?ԹZ‰P?{0V??jRܥ?̽)?p D?gb?}2??p|1A?Nkaɝ?|J ?eU??n?ЩĜ?U-+?G|\?:L9?yEo?Pɗ?RC$??X ?u?H6?ϳ?rS F?`{Ҷ?.)?B]>?*l?w!?o= ?T_?jp$^?dz1?/?##ϰ_? $[?*Mo?D0-?mB!?h?[K?"DvQ? 8ϗ? R-?A?|ڧ?r?ȂP?_h?jzz}?ʱv?2?Ss?V狑?@t?/?%l?06{J?u?((?+?Jlkl?Id?m{?zGo?m-?\ӸB?(QN?G~5?Ci?;e?֗? u?x;r?2vϗ?Q;7?lH?\<#?\X%?=56?̷w??a3?B4?CAS?z4?M9?ńf?T? #?(j?n?-5B??(#k?"P?a?*:?ޒ?&0Ln?0i8?X_ֱ?P/Wv??,?T/?@=?jP?,lW?"F*?$p?j?ɕSB?B\z?9n4?N?T?(}?"LR?7e5?>?Cy>?k(I?R{oj?2,{?rk?;^Ͳ?tO--?E4E?Šb?g? #e?1?BnP?/(rp?/D*?:qܫ? d??_Y,?hh=?34?=~>?u2h?~ܲ?h+1?`9SQ?C?<~?,o?w`N?db*?0X.?@gm?VC?}&H-?z?L?X?A ;?I,?b*Ib?D ?8-?? p?B?]\V?Pk'?г0d4?fnA?Ր?Jذ?U?~G@?ւcO?x C?@n(37?I?j?qU0w?{ zI.?Z?fXx?oi?\?lf۪?ݥ`?If?kĴ?Hx ?x0?8|nY?`~hH? y?i }?:?֐ue ?>.?}?!?Ot?$\?MT,wCY?18|?.vޗ?]j;?[ڪ?wN:-?"1?FHsD?h$)?!?d+?-jX?A>d ?)j?US)&?ȡL1?@Yk?'O?2?6)N?*ܔ?|g?-?}?v'?ZST-?rC? n?"󮼝?F9?.1?X, e? #?ؑHq?dՂ{?!? a1?lm4??s?d/q?^s}0?O˱??GKoy??\7*S?u;'?籥s?Tk_?_,!??cۜ?A ?"w? Qȴj?xWq?P?\J T?T:F?̶"j ?@Vk?9d? ,O?_6Lq?عWv:?Xp?zv ?!0o?(j{z?*ݧ?̦]?RH{ v?~~z?1?xEC2?*rW?7Y?X?t?1?׃K?TREEHEAPX dataindicesindptr8 Hencoding-type l Pencoding-versionm H shape@(d( ?@4 4 (}lzf@(XSNOD `&TREE@h((  (}lzf0(`TREEi()  (}lzfp')`TREEjdj)`2TREE85HHEAPP(/1 Hencoding-type n Pencoding-versiono  column-ordert srpq @_indexu(( (}lzf6(XSNOD>>@(4 c(v@؉8HTREEj(>@TREEBHEAPX Acatcat_ordered8  (}lzfC8mp@SNODpAmTREE[GCOLqrsyKNPR S V W G HIDEBAYZabdehkltuwx z!O"P#R$S%V&W'X(I)J*L+M,E-H.C/B0 dataframe10.1.02float643uint84int645 cat_ordered6obs_cat7_index8cell159cell16:cell17;cell18<cell19=cell20>cell21?cell22@cell23Acell24Bcell25Ccell26Dcell27Ecell28Fcell29Gcell8Hcell9Icell10Jcell11Kcell12Lcell13Mcell14Ncell4Ocell5Pcell6Qcell7Rcell2Scell3Tcell1Ucell0VdWfXhYjZm[q\u]w^x_z`RaSbUcZdbeFfJgMhEiBjAkdlemfngokpmqorpsrtuuvvzwOxRyTzZ{a|c}K~LMJFC dataframe0.1.0float64uint8int64 cat_orderedvar_cat_indexgene10gene11gene12gene13gene14gene15gene16gene17gene18gene19gene5gene6gene7gene8gene9gene3gene4gene2gene1gene0cdhkmprxySK  @      `8 @o@@      K / @ .,-()*+!"#$%&'      ΀!'(,.݀& Qs0h?U˜?`A4?Hԅ*?DW? +? ]?de>?VŐ?In?fyE&?] w?q"/?Ac^?m?,?WJ?6M?wZ?\?<˖T?? ,}?K]?cz?~?^?Cv?R~K@?`t?N ?fb??99?%wG?"??U-T»?> >H?*]㽥?҈?NN?lf?G:$6?^*ۖZCw:7tXIdʥMKK U  T RSNOPQGH  I JKLM89:;<=>?@ABCDEFK j @ ihefg`abcdVWXYZ[\]^_K  @ }~wxyz{|klmnopqrstuv    ֠Ԡ%Π1Р_) $(f3?/Jp?8? x?Z?@*$?N?ք? Q``?ޖ?>.ܿ?W9Z?PBjLѫ?l} ?uv? 
H?e|5I?vt?_@0E?M?lg?U ?h?qаk?5GF?hg?]xP?Ƣo?pT*|?ok?I;юM]?IFݻf;YӍMsK                {\$}#&         *(( (}lzf(e( 0 categoriespA(TREE(\( H&orderedFALSETRUE  (}lzfn(p@TREE\(( (}lzf(w( 0 categoriesm(TREE(c]( H&orderedFALSETRUE((@ (}lzf@(`TREE]((( ?@4 4 (}lzf@(XTREE@A^((( (}lzf(ؔ(`_index__categoriescatcat_orderedint64float64uint8`TREE(_(TREExHHEAPP@(( Hencoding-type K0 Pencoding-versionK1  column-orderK6 K5K4K2K3 @_indexK7 (}lzfXSNOD02h(@8 H0TREE_TREEHEAPX 0obs_catcat_ordered8  (}lzfPPp@SNODTREExY` (}lzf  0 categories(TREE* H&orderedFALSETRUE  (}lzf@p@TREE` (}lzf 0 categories(TREEXa H&orderedFALSETRUE@ (}lzf`TREEva ?@4 4 (}lzfXTREEa (}lzf`_index__categoriesobs_catcat_orderedint64float64uint8`TREEbTREE HEAPPXV8_Xa؟0 @ Hencoding-type K Pencoding-versionK  column-orderK KKKK @_indexK (}lzf@XSNODp((9@L8BHHU (&TREE~cTREEHEAPX Hvar_catcat_ordered8  (}lzf0h/p@SNOD/TREE}(GCOLTVXaQROE B a c g hmotzQSUYJKGFA csr_matrix0.1.0 dataframe0.1.0float64 uint8!int64" cat_ordered#cat$_index%cell15&cell16'cell17(cell18)cell19*cell20+cell21,cell22-cell23.cell24/cell250cell261cell272cell283cell294cell85cell96cell107cell118cell129cell13:cell14;cell4<cell5=cell6>cell7?cell2@cell3Acell1Bcell0CcDeEgFhGkHmInJpKrLsMyNzOOPQQSRYSZTbUEVKWMXDYCZA[h\i]k^n_p`qarbscudwexfzgQhSiVjdkflgmLnNoPpHqKrDsCt csr_matrixu0.1.0v dataframew0.1.0xfloat64yuint8zint64{ cat_ordered|cat}_index~gene10gene11gene12gene13gene14gene15gene16gene17gene18gene19gene5gene6gene7gene8gene9gene3gene4gene2gene1gene0cefghmpruWYabUVJHCPWadi(    K@5 @ O@ @@u/O(        "',W(`{a?"@?sX¾?z~?'?\L?@VC'?0?y? $C?#? ,y?k?8k6?r7?¸N1׈?Ǭ;?Sg?vhk?K?+]? `?h/?d:K:?ֿW?GK6j5?|R?"!?Ĵ?!6hc?n@b?ٲL??2u ?eSf?|N'iM?RU?xZx]?OZb?;?#ޗ? fF?4G?M ?d4?^-f?N=?DFP?5?i—?W  0 Z  \ B _ ,  :  U ]     >   @7% < + D N @7`              @( B  A ?@;<=>45  6 789:%&'()*+,-./0123( Z  Y XUVWOPQRSTCDEFGHIJKLMN( s  r pqmnoghijkl[\]^_`abcdef    '*-/ , րԠ'὾U?z?ބR?*p?(zNMk?mB?ԍ?wr#?#d~hl?d-3p?EbE?~f$?֬???gV6bd]-'Qeotx,|愲.T?\Nֶ?38??ζd?DNؔ?jz?3~Y?O ?Z?;pF?*ai?2Q?*\.?8v?S.?΁^?-Lao? '?@Iސ?PM?XZmU?$n?(\*o?|M^?>AHV?8;?]_?y۳?@椕?28\?+;cJ?[(˘?pW?J?,0;S?’?^~N? )?@Ǩ?P#.z1?&X?uQ63?hmޏŸ?nܧwȡ?1m?8Y| m?W韗?@e??0:A?hflwG?%?0:d?^0o?7?k5 ??[?ų*?_?p"Ԅզ?g?] ?+Jcb?@3?@(??De?]ީ?2Ç?;b?$W[|?PI\rL#?B?R͝? C?S?-;Y?sl5?x\?<q ?J?r$͠?P `?} y?@ll?Е &?]c?xw?XWɖ?Nhf?di? 8P?P~͠?ܒRr?8YȒ? ?tf#Z? /+?֓$?S?/~?Ȇ*P?0ruU?\4F2?ns[K?.L{?N.ĺ-?97?~q,?㢥|?hT{k?ȭ ?LU?r\G?jm?!Y?8'?|?NZS?`+9YcJ?7n~? z#?8_~?v?teE%?w?(ӆCa?y??4 1;? GJ;uv?2ۿ?:֓?й,-?i^?TU?a ^M?/?j{?qs? ?̒ϻ?%?>x5/?)!??@?%ro&^?V? 6(?C^?hq?{v?P$̸?mԧxf?$2"0?_? T>U?&?g.?s)_4??*?F9aS?Y?ސ?t)v ?Yk?h齽?d¹U_?\(j`?9U?fZQ?JK? a{+?`O?ܽ?y?Z1*֊?(o?m<'?) &r?g=?"(Ѵ?߃?BdK?V (?ky? -?ǛԽ?`U?]?Y?5S?`Q?Joa?V5J?o_M&c?[+*??bʈ&?2oUH?| b]?.s? @S?xN?v?ʥ~l?"?0:?PtR??ԢD)?@y?9?K'6?@}m?w"1?P^?X>"h]??lK"?3? Q$?Ls?<[?Z1?L?p?D;:~B?#|w ?Pmμ?\u?8l?|mX?j?EoO?`;l?t?TB?0?(-;?7?|Mܘ?|?X? ۶?Pg4I?@op;? ?J? s? _??C?$?Ijh?XE-?Sj7J?e?Zck?PqcZ?0m?%4?l T?2v?<R?e?ֽ#p? S?5;i?۶'&?N ?*m|?.;q ?s?NƇ?i3 ?1?)_pY?f]k̡?VQ ???d.[?%,?=(9?u(0? Z? pp?I?XV 0y?Y P?Ռ?`]8?hHI?FY@?AoA?+CK?t?ҩ?p ] ?$Dl?pr?㲊6T?m KMy?(E[?@L P? i?Ge5?X6?O?̟O?@d+?+‚?Ӎ?X?lv?;D?xp&?VO?h6_?zs?vmz?X0?l񭫔?8?nЩ?Dh:)?f@&?Ų7"L?_| ?v"{i/?.Dz?<W?J&s?`? ;?*Ҿ?t7-M?@ L>?\o&4?Jѱ?z?ˆ^'?KX.?kFb?l`m?@%|,?Hg(`?:~?մ?c`3b>?(OC? %ef3m?4-S?7"!?" ?l3Jfl?0y8?$_?Em ??k?t^?*f|? %V?76,?~ HΆ? B?h{hh_?HQ#?WHו?1? B?LH?NZ[c?`c?b[?;?Rbm?_U? q?6]?8?PQ?9n?(X? ?;ҥj?I? k????{%d?`ٍ?M?_o?Sl?<G@?{v?kW?a3-?pec?DYX?23?kΔ ?dN?"*?vYu?:Z)? 0?5uݜ?>;?`*RE?PdA?LsMV?ԡ:ޫW?0cJ{?|](??0D+9?Dsi+?L>D?+?+҉?L>o?? ??u1R`? }?鹰?Z"bAR?;ҥ?lpN?ֵf?M̈?pO?)~? 
Z?o ?/_z?u: ?dz!?W5Dϰ?x#o?JzN?T@?}?F*L?6?%*8?VW?<]U$?X?6z?&kv?}"pJ?xK?Ti?[Q`?lme?~/V?u8?d܊?i_و??yL7'?a? ɲ?Ŕ?U菒?p*=?ˢh? ?MW?ب)?D62?@V2?cde?9$? w?AU)?Iw?g?c3Y?f]?C23ʀ?X/?r6@W?OC-?FN??sqO8?>Fb??^F?@ej?8S7t$?Njp?f?ň?K=??LIer?C??+-.!?Rr0N?BES;?s-X?ވ*!n?V!? '?œa?e?kdu?PK?W?PD?`PG?#$?IR?ZvN?ө?r⠯? p?$5?8y?@:ʨ)ڀ?Ctx?B9?xņ?lsN?~~?'42?d?E9?S5u?.f~]?I+"ʰ? J?dc?6 v?ۥug?5?zUcW? ~?L?Znސ?VӪUm$?yPPV'??Ay?/?> :?i&?1e?Jm?Z $?;|?,pr?0@?tɲW?D?爈? 0?}ƛm?`a?K?gT?|&>?J?uX?@9ַ?\c?w?wћ?4j?x@?UD˳0?"j6?/Ew0?Ӎ>?ؒ:0?a?}Vt~?r]? ?.?LUƑ?]&?{Ч?] J?JZi?0I#2?t{X?@+?cPr ?5Xl?Br?v.?g?6|ƌt?N ? V ?pG?՜:?ޝG?MҞ?+M?̮1!? tjB^?a\?e"1 X?C0_*?qz?.!?'+da?X*=?nC?]U،?Pޝ} h?BFzww?ul&') ?qp?x?6?x?ک$?ܰW~e?y~0'u?M?,+?ɥed?ܯYc?MT?hOI? 8?IؑN? ?Բn?90\?]Ŕ?rSC?^5(f?^ (?Ժޡ?A?=RS2?)ɭ?ȯb_?|P'=?ˠb ?J3p ?Ϝmh?mE?TI?Y-?U O?n渮?HT? zE?ɵb. ?yYߔ?D7?h4?w_9?ȣ]?p2?a.%?Lt3d?E?z#?h7?T=?"M?nZqG?C{ ?HwbI?Dfʰt??P٪?ʔ=?i<4?%R:?  &}?(kN?p{X?ya?z?@|?@gj6?j/?8isl?_?QэD?zm.?`mW!A?XEc?؉-?Oa;?ȷq???'?^'Hj?Q4 ?"piM?T C?05+?Opx? K0|?dq?7?^?$?%/?@m/:?&l? z?@͐?%&ע?xn ??bj?>p?h*? p?C?1F?lMt?-ZGt?۩?'T+,l?{▦? ɮJA? ϕ`?? E?WO?n`ֲ?44RH?[b?yb÷?Ra$?6?|?ToK?ؚe?+?.nއ?mg?걌?8!i(d?P?H?!ӏ??~??wYM?Z a^?遘?dȟ?X?ݞ?)GS?@?`qZy?b=w)?7l/R?0ow_9O?>A&?B?;p? c$m?-k?w4<#??$w?,hQ?T*{?Veqok?f %\?@D?{K?KUR?0 ?$پ?8h?7M?AR?*ܙ&?IgM?,@"q?n%?"Z? DY?/??E?#.?/~I?8?~:?N?C ?q)c/?Un⚫?"5 ?&#?^'n_? s?E͆? ӕ?d󢞂?H憹? ?TREEHEAPX dataindicesindptr8@` Hencoding-type ( Pencoding-version( H shape@d ?@4 4 (}lzf`XSNODдTREE   (}lzfx`TREEu   (}lzf|`TREE[s!TREEHHEAPP xxD08X Hencoding-type ( Pencoding-version(  column-order(# ("(!(( @_index($ (}lzfXSNOD Hh h(X@H8H TREE!HhTREEHEAPX catcat_ordered8  (}lzf8p@SNODTREE~" (}lzfx 0 categories(TREEc H&orderedFALSETRUE  (}lzf(p@TREE # (}lzfh 0 categories(TREE# H&orderedFALSETRUE@ (}lzf`TREE# ?@4 4 (}lzfXXTREEB$ (}lzfH `_index__categoriescatcat_orderedint64float64uint8`TREE2%TREE HEAPX arraysparsedf8(22 ?@4 4 (}lzf@h2HSNODjjlADTREE@"2T?;OCt?CM!?J??p3?%h٬?Ƽ2?D] BN?`*?J?h6Nz?b,VD?ĽaY?Hz:?^l9?u,<[?g[?^\#?#;hZe?>?P+}?ذ sr?\?sZ? 9|W?4wue?$0_ ?Vr? oJ2?vBak?-% ;?$´:?ɮ'?ꦽ?jǓbx?WN?H~a?K'OF?iB?TLy?9,?p~6Ť?-?++;s?e@6i?8r?lk?uo?kaFjH?h8bi?pb5?$br g?XIt?Cg?e/F?mhk?vA-?j6԰?Tk?!d?m$?Mw3q|?$s? vF??2N)US?)?f,R!?ZuI?~?ۋ?Ԫ|}h?r 6d??ݟ;9?3?bp;?^?%4??da?;?,/tr??9-p?,>t\?"}?: uPF?i_PK?( d?8ۍH?NJzB?5??]!B?VFfd?X!ۮ?Fe?X&?;wT??R ?mb?\ ?ԽC?_]?5p? ˰o?T'?k" ?TH.h?6}?'b?f ?h0k_?Jg?r8s?*u?n2?Ǝud?YH?U?n?n?٤?Dnu? T?9/l_?Cu!??S';f??R?`on.?E&~!E?x]?cR)?˜? )[?.<=? *PW? 4ד?Iⳇl?8Y*2?FB|~U٣?LO'? =b??? ,#?5??_΁8?p]b?!?U\a[?M0?Ĕmf?:?@Ʌܺ?n?F0?Pw5h?i/Ǡ?u 7?im?0J-F?* (?K۽#?_h ?8-X?M?K-:?X?>zB??hfBT?3+s?N.?g3?G[n?3^ '?3f?4"?rd_?V{u?lx*6?gr-?ܭ˽?eZz?M2y?4 ?U?.5-?} a?O??/m?1QD-?ya?.6? ?`a$OO?x,* ?<@'?Jo?H8?D?Xb(?E0`s?%Y_?_ a?ߐ*7l?fm3?RIx?̡+oN?h]s!?}e]?< >?6?l#q?*&?,NҰ.Y?"?T()?Tc?ǵ?.j?Ln??ż4}?^ڈCP?`𤤭?XgG?-ph?yG?A?'sO?!n;&Y?^0-?*)Bo?@\b? ;?Y.?3;7?Gޏ?Gp?c6?$6`?𞈔x?LhԻ?S]?o}?R2?KR?#RV?#7?\VY ?}?n?І~yc? ?4?4C&?z? ?x?"?W *?+r?` $? Rn??+to?($c?'/?P?ƺI?|BF?7? ?(Q֚?L&-?Y1@?x&?@l*?"O8$?lgp?]5?Y௓?xK #?D]??/{?lg4EBh?Èf?aU?{{ñ?{??!?hI?XкM?1O~??z7Wfm?80O?uݚ?2?ބ?`eJ?"(?L2i=?md?0iJ?CgvkJ?X /-? H?Myw?Z ?rwU?,3s?eh9?&? " ?WCuѷ?Ci e?th?D«Ԛ?]p]?kj?>?£b?#B^n?v8/?[($w?Ʌ_0?eKI?[8?0 ?c*Rpp$?.?`-ɪ?1LE(?/0X.?Ds? ?i?|6?))A?x{?OQ?H4 ? [~?:? ɲ?_ ? 6g T?xD?M,|?g?Xw1?9r?vak?/!m?YP?^Y?p8?;߶?@Mۉ?ઐ?K=~? 9?t*H? i7?MU) ?1 ?J:Q3?l%_غ? :S, ?t?Z#? o([?;@q?W?|EH*?SF;?B*?,vL?l?e^?K?W#?<(?Cöh|?mF#R?&?'2(?^+׿?9Ǡܾ?Si?Ui?ٿ?4?3b?1?8( ?xMI?cpA+?L ??"O C? -?lB?m?ʳ1 ?,e(?@V? )Q?pbZ?9? c?اi+?f?ɏ]j?r:F?qw6))?)O?)S?=?Z4I"?>=ฒ?T]?b?|[cX?k\ ?Q Z~?,k?Cޅ^R?S}l?pX?Qq%?R?S?K?Lj*?ֶ{Ox?v)?j&?cE?r7]g?%?|0?.3;U?l?Q? 6I? 2߶?F?wMH8E?6D?]pq'?.m? 
]?'۶?Zu|?S_?m?`<ĭȶ?0Oe??MO?o?8p?"6]?6P?^QOql?q[?$+?%?%?q[<?o:?t%g*?؋s ]?5?9~gf?K?v~KP?wZ?5ZN?-Cs?E}V?gdO?piK?Cr?}'fU?Y_?3+z?|ؘ˴?@"j ?xQ]2?TlUo?۪/?"! Id?[K?su?(zP?I晩?ZO?=Ϥƾ?39?HDd?^Ǧ? ɠ?*?E?ZP[-?P)?-@tIx?H݁?C? #jk?kJ?sp,?;&%?_ñx?(X?}ђ-h??e?(S㉞?:?ȭ>g?:?+3 Y?m[A?Nj?]W*?nUu?l&?톘?͏-?fY?كU*?9e2?£{ ?pk ? ?#?@nX?P<{???ڻW?*s? P?T?QI??`}`*?c#Į?@@9?E15r?:hj?0/:?`f eu?7hx]?-?tC?Xn$? Uj?ua?2ā=?λob?^L@?|X?kn?\8[?$7R#?l?ȴ8?TU?`seROl? c?❊jp?a"?0W B?RK?8PXk?}1? b?Č$,?1g3?f/%??hҀ?r㿛|?Vyq:r?c ?B.?fMa?D($?s#?dΙ?BX?f>?xF?L|p$?nB?lڢ"?Hjl?A2?shb ? ּ?TREEFHEAPX Ddataindicesindptr8AD Hencoding-type (t Pencoding-version(u H shape@d ?@4 4 (}lzfHXSNODE0PpaTREEP%  (}lzfP@Q`TREEOpYG )   : \  " > A G J    @S ]  @          (      ~ (         (  @o@K@@eՀ', /#@Y?xd?s?^=y>?+ޤ^?įbC?We{?Eʣ?Z_]?A ?? źlT?xI(T?yT?S'?3?P[1d?ӵZ?z? KJt?c&?(?V}.%?9"N,?%f7?)Qj?M? N?m?      @         ]m?RuI?޸?,bM\?   ?*Do??x'd?Mt?9=<; O: O987 ?65 O4 /3 21 /0/ ?.- ?,+*)( o' _& %$ /# /"! /  ? /   / /   / / @`  (}lzfTb`TREE@YpmTREEHpHHEAPP880jl Hencoding-type (v Pencoding-version(w  column-order(| ({(z(x(y @_index(} (}lzf@qXSNODyy|8o (@8H(TREE~Yy|TREE}HEAPX (|catcat_ordered8  (}lzf ~Hp@SNOD|TREEm}Z (}lzf 0 categories|(TREEc H&orderedFALSETRUE  (}lzfȑ8p@TREEoZGCOLlpwHIJMF G D C csr_matrix 0.1.0 csr_matrix0.1.0 csr_matrix0.1.0ettOSTzAgZLBEJOuzpOpGqHKPGsyOSBEORjwNfizOvwtVvMKMK YMyh!uyF"VQu#gMYY$ETS%QQx&AOcm'bhI(ssaz)SXfk*Cxrp+dWTi,WUim-swN.PjM/tAyY0AOzP1pWP2ieK3mFbp4TqX5ceKj6XyCJ7otv8TPC9cuN:ybPc;RuD<cMm=dNf>ega?iuTB@WJVAuffbBJhfbCPzXDXolEheYFgOyEGrZaHZDyIsdWJlceKkqWPLrYcSMcWpvNqCgvObxnEPNPDQADIpRzwXtSJaVvTjOSBULkIVeaRWOmuHXXBoYblPFZhWUV[NgqH\UoRJ]oXM^KLWZ_mDu`PLriaezFybibzBcHlsTduCFeRhFufgYNgIWvKhgbnHiXZLjcRakiQzlkrIgmLDrMnOtSAoXWmrpPPtsqNyLrQVqsjTutDxjQubdcvstr (}lzf 0 categories(TREE% H&orderedFALSETRUE@ (}lzf`TREE[Y[ ?@4 4 (}lzfXTREE[ (}lzf`_index__categoriescatcat_orderedint64float64uint8`TREE&TREEHEAPXXarraysparse@( ?@4 4 (}lzf HSNOD@m`TREE @&S&?Y{g*?No'?}5?rr?"?Fbj?pR?Yniҕ?|h%?;?:e֑?i?1/?.d;Y?f|J?*p#$?mR? d?WU?m(鮓?1] 9?#?Z¢3Z?t?{OT?\]?c[a?ew??)ivD?鲆:?0]"?}l?1@_??,9?~QoVRL?^?@DO|?xSL?E?J=c3?pGQ=?O?`6}s?_?(?H;? H?Pϭ?s`?mͮ?${FK?6*?&?>0?s?lp?Ol?}?P9`x?`P_y?{c ?P?pm4?tF(?wż?`W?DnȒǮ?H%?-a?Xq?= ?g?Pkl?nKQ??YʉaY?^5r?0΄=?Z52?ު ?9`hQ?ű0^? ?I8,Y{?ҘMG?p#g?0rl;?6?[?XW'?, ?iVi?jᅝ?0{zv?p?Ј(?vbd?Oq?O*d?k䅶?XK"/?|͊ L?`OA?樃?@z8?@Å ?+5c?c;;x?M*?x2??;"?;&/?9f_,=?=>y?J_?הlPrK?7DZo?q?-?[6‰?DQY{?7p?2?T?!?)?S ?%V?|?n?'LŞ?ɍ,?۹Ty?gE?Z?je?9W?53?lLW?=cL ?qUm?L4?z@|f?x6(e?@6%; ?k ]?Pc?.͵?_\x?c_?`٢?FGw?;lFؒ?"?p4~Y?<=?{h?}WCG? J彝?ܿf?'V?ſWA?UiA?tg#==?sA?r ?{ge?jƒ? }Rc+?@B[M?jz0?],? /T.? KF?(}?P&?.?N-E? ??sH?Ttg?^ P?"u?ߍ(Q?C?1,?G"2q?3?=?`C?$~?a8?^#,?Jz? ?'G?4sU7?6O/?(W?;S?֩0A?.)?1rN?/-?Wf?X"?1?轀a?Ye\R ?p0S?@8?T5?"z?Sކ?Q??Qt?xF?7?˻X?܄K? Y̺?ߋ~?b?3#8?fq?K8i?|/w?B?EI?k?'<{?Uu?J?j?} Gl?ɹ+U?E?iVJ?QM?9`?囈?%??|_3t?Ь* ?s/΢?"Md?Taƻ?_/?Dr?䤕FQ?gX/?ZLH?d?ś4?}}?@s* ?2]jD?$O&?Jw|?h=1\?m?ښK?v_h}?z"ă?^ѝ?@//?` ?]z(?vť??OŰ?Sf?vP?εϹ?H?sٟ?Hjck?sSR?Yz?@?%dQ?N?0@ ?v?tm?kD;?jH?CUO?߶?*d4? ]G?OkW??;?\c ??,w??ș?>>"?M߇_?@p?dmke{ ?L?c!~??X8L?>5i?>~?-?[ܩ?5s?3ȟ?8{ ?u5?\?خ <.a? gh ?3H?} r?\.?Zra?>@?i?["u?bD}p?h, 1?sT-?S?Zt?yH.?gԨ?g?pG?}zѕ?{?7~?6?0u3}2>?"?SS?>Ǚ(w?(s\UW?Ǣ;׮?P@K??&Q?b^?aG*-?ioa?=ܯ{?m&?SZ?sdg^?HDT? l\?Q?jAU^?KC2?wUN?b{G5?:¹?Ĩ=?`Ƌj?ֆ>?ahE~^?7?HQ4?*O?N~x%?2a?H(`?zk?R/p?T HJ̕?\aa\?al?|w+?N+/?^t}?⢖H?h:]?5ߞ?ug?ws.*J?^?3T7?=(?y1̐^?ŭ?̃]?`V?zy1e?Ug?8!D?IGq?;?+O??m?ꎪ;?. s?v?t?C?5\?_?_.fkC?s?y b?T";6\?K87??i?&at??gFe??'c?ʆO?؏GmƸ?e]?4? +??N?*?Pz??-kV?Xpf?Xmɪ?+|]?ТY?/C=?r??!x?2Ô?foa?wV?V<؊?ÌxE?XSZ(?刴r?~`?s~?>[?(v?`4Iɢ?&I?& #K?Hκ?o??h]?iL@ۿ?6.?-|Q?^MBA?m? Om?Wq?=_?!5?.yp?/ ?>aU?pPհ?sF???@* ?,R?rй?'&O?l o? 
}`?Yb咾?qI?&?M?n_ ?CX]z{?dw?ҏ?@?n+?O#:N?^Т??W;S? }?Q*L*?|Ә?hpe(? J?l?LvՑN?Fh΅?fB?uZ?lӨ?HC?[G3?%?9݊?d:4J?H޶?_\?q?č\?CEP?X>?Ȫv?^?Jzз?nzn?p͏?w? ۹? DX*?fyt?V?`?l{Xc??Q ? {?2Τ?bj'?>!~i?Y?kq?C ~?8\? f ?|Dmj?BH{j?ijo?i" ?bM.?2??ě?{?׎?Ψ(?21޾?Д?B^L?&˭nW?hJ銻?ҫ?8N[#6?B?xSBh?WZ?>璂?p |?nE?􅶠ԭ?l.?|L\?U?Ӯ ?Sx?T,?8RY?Z(@?J}?JyZc?}>?0q3k?;Y3?hMn?PrQ.?Br?^? "?cR?P?ʲ4?dM_?Uuuj?>h_~?g`i?ұ-?H?Ȗe?У?C"/?8?#?Ko4A6?Z+ ?$`?%6l?a6?*b4?Ψd?Rj'T?B?.f9aV?TQt&;\?2Ƙ?x?I?u~C$?BI? 0w;?~A?u]?*14݇?FM?F?߰?#?>o L?1ʣ?S?w7F?JH?>a`c?6/?CE?0{?}?P.s?{V*?lrE?څz-?o?c?xD3۟?U>ri?4*`?.-?f;j?bw[^?Hi˫?Iu?c}?C?v1?;T ?&|?Q??(0 ??Tial~?:?{FR?9f=<,?샟3F?Ү@?xp ?}^B?y>?L?!p?"Ā?"|? |3?,?Mj3O?v}^a?eLF;?d=H&?dtN?(ܥ?`N/ ?p R?,6?!PS ?TREE8HEAPX dataindicesindptr8` Hencoding-type  Pencoding-version H shape@  ?@4 4 (}lzfH XSNOD( TREEHT\    (}lzf$  `TREE#\   (}lzf|`TREE2\XxTREE HEAPXarraysparse@( ?@4 4 (}lzf H!HSNOD8(8H:TREE +ז?j ?k3?B?.T ? Zw?3^^BP?/y?FT}?u,?R^?"|s?XW5gW?p@rfm?B?X>J?^AU[?|V:?}7?/S"?^?wO`?FƼ?xPH?H~ c?6]z? Q0?"?8̨?p].+?*F?#?`?¡м?%,{?|H?X/Wm?ƻMD?0;w?,gJ?=*_?x?lK)Ԫm?e0/?W5.h?B?nv._?r?3K ?JS0?poͧ?B>? \q?OE?A ?~?gX>?X?Z<$ ?u>?ල~?DnaK?;G?X?P殯?m^N? >(s?" u/?h?ׁ ?L5?[?Z?/~?zw?1q?PQZR?l(?/?YWz?(So$?r?|Y?;?Ro10?U`?l8&? xC?;?pBJ1?~;c?v?(s'? ?C"?zS?ubE?jƞF?Q?Mz0dcb?>?[qr?.[9?lh?hG??KG?|?}lh ?f?0?QvK`?h?0E?[R-Q?wƑB?뤆*?`?\ QO?&kU?WSAi?Xj??m?0+\?ƠwYB?z.E?N2?7?4"]?\?i?.w?@{?k$?K?3}@i?noƐ?!?Zs?od)?YEo?IkN-?̿~ ?p띧?Ӌó?\UWi? w?]h?FeB?:+X?к ?Z _n?}rO?nH)Lv?9a r?dx$ ?OD?d3Ug?G"g[7?%?K?J1@j?z;hz??ad??Pv??tvdqM?ʌ?;_?Qx? ?OzX?J1(?Lp?6@F?Vzx0?!C:?Z F?BugC?A72-?|l4?sk?R >?Ž䭞?"\9?(wK?er~|?o?, #܊??:yZ??Unh?<ع?HǨ?=R?p?\ssj0?e ^?7ZL4?6\?zӼ ?@h?R??iJ?Lд?6e5? Z?Z?M4??".w"?48"?~{L?DG#?7`?Re?Tl ./?W?RN|v?jخM?$ y?G?•0A?s?:L ?\)? 6\?84#?ޣ' ?2O9??&.?F!T?f;?*-D?̰F?gV"%?hYƽ?l7;'?hqM? @>+?dB_?5 ?VjFk?Kԯ?nSG??N{?ȳfx?Gn&?{?yp?;D;R?m_??U? \~}?'k?P`AGMy?Z4??༏?HU? ?v_*?*wJCϧ?#?5'?8WT?Ơ#5?!?MAo?mA?(7չ?1ZRb?M^W3?5?/l??ݥm~?@a ?JZ?̪@ !?K脀r?WQ?pDrg?&{?`?i/+2?' U?딈h?J]:?8H_w? J?u{?K,vH?80??%&JN?75'? ?[?m+? ?00?.u?!%d?e|?〝?;rO?M-Q?p@ӳ?.9?l;?Z?LO7O۱?)S?@?Y?Rv'?bסVD?UMmt?@c?ٷ4?;b? ,d&?" f4?Ҁ?T??:m?8a?D4?"à?0a? D?tA]N?E?7?ف?s? 7v1?4Ŧy0? %p{?2En?Fkc\5F?ZlS ?Bxb?:0TREE=HEAPX h:dataindicesindptr8(8H: Hencoding-type  Pencoding-version H shape@ ?@4 4 (}lzf H>XSNOD;xFOTREE \  (}lzfG`TREE&  (}lzfTP`TREE] Y@[TREE^HEAPX`[arraysparse@XrawobsvarobsmvarmobspvarplayersunsXSNOD==?PН H @0xx@0Xx( ?@4 4 (}lzf`HSNOD]} TREE@j1kgD@?V L@?me?C?:*6L? Kei?S0?#?zt?c|_7?B\ p?'?Fu?S=?Pl9?8P?)l?]Au?iǵ?|߉v`,?ȸVy$? ?^5?EH!?d/U?Rf?t0j?B? o;?o ?yUU$^?(7bVO?nZ?>B'?f9Kl?n?dLE?\ʲsn?HR?8J?oѺ?#?]똃?n?Jv?R8y8cQ?t?X 4?BI?j!;?J6?*#??Xw? Ol?\?O~-?1 ?Z ?БNne?L~Cߔ?fJt,?:ni? ?|4 ?N6 ?t5J?L)&q?#?QE0?2"g%?]6?D2?sQs?z?}uW?*V͡h?q" c?.u,??+=?)P?0Y?z@#?a?^?-wpI? ?d\uo?LL?(n?@w?jѾ?t0~?d[o?;޴G(?|N?ׂ7g?w?K=b?qݤC?Yڣ9=?? 1j?QcQ?QP'?E?дY?0w?^2`y?6J~?ۂ?~?Y (;??p?g3?PhO??l ?ק-?Pj?Pɦ?>YZ?lMއ?j{E?닢?+?e)VqYn?ZnU?Ȉ$/?p_)?x5S{?(qJ!x?{`?X!v(+?vKK?5Pv?gbI?|ss?d Gƕ? 6 ?W{К?8ِ?1?쭶L??$v1?-J?8} ?b@ ?bpE?z?s-C?\=a0?צ+?U}?Ё?Xs1?Fc?Ǵ#?ּ?Nޡ7\??s)z?ʂM>??-?vsE?ȔUr?Qg?#]?(u?(!q?_M%v?,ĮD?:i4?Y#\?Бjn?>ZfI?UN0?&m4?V?N ?hsظ?3/]b~[?2`)0?r %?GȎ?V9l?1 rv?\~ uo?"ݷ?.$?4|S,?™;8?]}?EGT ?%}a_?2R?=??&X)?@h u?S? _?Iw?BR $?*uj?pM?9LY?4?}?k?]7?7?$Sߡb?NXB?6?Ӡv'?H䰁>?0"խ?gr?ӏ? ɕI?J5? e~?01v?91? 
I7<?:x8F?S&k?F9g?Ma?zKؒ?U"W?H&Yz?1w>f?L@{&o?|i7?Va{P?>u [h@,l qCd@^ hLFH[dHXȼJpfl 2>ß8 B2Bh8Bx>n|8J0;hPdUƭK$a*R.Txu *Ⱥ>PlIP֜x4bI&T:x}HԎD%䫔`hlN\X``[Q 'df8Oo[)複ܐ,ved`p,!&Nni1pm1tcei}@Q@S=G`_c;g~dA%U3mc`&R*ބ@.լ"}&g+'/sGĩ6ݒlOP{Pmtn%-XgPnʭ9Иr?W,ibOkXOz+cܤ;fËq[7!?ɹgj95l=z6P\cGl4W'g{j} (-ǩT u"$#J6(ƕWѽeTS˄ bfK|B'U}Q_sM스g}@]J՚_zsjsԙ@e 0: RӁsu{kjC \̪mմ~3הg3Z>",s?LH2,'),u*gԇ tCq׉}u>sPbamSd jw:Ɔa4&hljm !./*r'YnOW Q|pv9:_Czn î{Q!Cڎi^&NWpLZJqMP^u>g̖CoZ%fQ?~B0Ժ؅>1waѦk0{4\/OlaocD.HֱvQ(i{9p֜CX7Ŧf@#G~.F*X KPB0u-mZ~"M}L(NPS{obp:/d#W(]#{9oDcQ69Wa kZ`08Mlۅ{pOK52;_Jl6QpS%'cq@%䴐wF~WUrRB/K& ܹtݖsO0.1-0"tY߉#q8/Ekؠ +EM3/Tމě}jw=mKh"OcSdN {􄢧/Lm9V>DUdߊ>M@kLcX5-8 }r@b ?kEO(QLm\oz f!:➝ƆxZ>8sGz{ߐ3?\;QGgAKUٲNjhXL<ޢ]]{T>a˜/Ha+:sTyŕ5fs;Kag'G IhePsvv^zwDyXb~k\f|H"b2?1LJ[ZᒶU惸RyYB cMhH 66dj9t2C~1F#oSJ`6>.rtanʣOg3˺*fwic;af 7ᇲ1$jŕ_j32`B#0/eh0Oĩ{rިOxy! j^/ 6S}"ihe|$ 3Ov]l_V^OL̢;_t\J^C (6FpG{wL|&UtKy)cF*sp҉Fŷa5h^ ,' VЈq ;M>PoY?LTe3L4|6 _"JܬA#A gj]T}.g(=C8,?/ArŏX\F7ܲ8w<Vۻ%*aFXL4sDO|1}^ vSvYӖGK(i!t]O;cȚȝpe!8[{U`[˯|0lF=3@^'ri)+sڮVY#߯[ y 41a@)+m>>p|=dR &g'5rjR;Jm \[wD؇RE=}&077΋RM <8oJ@2D Z`0 wN?h>`0l`_cqlKY52QY#;𼠦zd*"ک8 c;*?"&,yr{2nyELU(;xف{C忷i=ހc[z,F፼H,23~z'U;)N1aRS85zQXݤI{uG`? d/*&/ŰfP?RzQ/s,@/)<~Rm;O߬Gk ,>{~L5 \. ra}T(ظ nۘV,*U3m^| ^$5<Cg762S(W@-oD\^2NG>~{\L 1Ɂ%pwȢg*^7(V1!GO?s\k8ecO+3K!,sk ^c ]Vp>m񜣔EZr[(Qԣyձo z:SLɄJΘC߮К҆?{'iY mڬM{n6BEl\x,ى–3QRزj_-e":>A+lTB;;AFߡ ʰH/s* O\z֟r;}T ~R<<@(@&u/d7 +^53 "1QElv)M{8I&|~>SwK^a{&T5^UΊ6svO`kƕ\+/}HJ^egDm;󩷸Cb\7t]Ҳ=lFBʩci(bV]Y=?aV(߻vɪKAOq_pUFRO`M{w+N*(lNBZJ\P׹*V` q~jjo .(Wh|.@}c>k3n.r4~i/8#6 `4V\I]MLFcLP?????PK vSw obsp/.zgroupUT aaux { "zarr_format": 2 }PK vS obsp/sparse/UT aaux PK vSobsp/sparse/indices/UT aaux PKvS)a5obsp/sparse/indices/.zarrayUT aaux uA0E̚\hy cH-mh($Bmڍj__-5K|=M@ْ/e3+ŏKTB)ED:@281,#\[=Ęc@jCZCܠZpw1_YUR'FoPKvSa*"4obsp/sparse/indices/0UT aaux cb4fQa``a bF bi fb6([PKvSCZlIsobsp/sparse/.zattrsUT aaux RԼ̼tݒʂT%+Ē %4%eEřy Uzz0``. XZPK vSwobsp/sparse/.zgroupUT aaux { "zarr_format": 2 }PK vSobsp/sparse/indptr/UT aaux PKvS o 7obsp/sparse/indptr/.zarrayUT aaux u0 wyfRucTJC"" A6A*woBOݜ~@֮k.=a>Ua۞%*x Iy+~\zxNa' NpSXk5OeR >\f @ pPzuq*JmZD'ATjK-oPKvS8&obsp/sparse/indptr/0UT aaux cb4fa```b&Y`V fÂٱ` DPK vSobsp/sparse/data/UT aaux PKvSXj 7obsp/sparse/data/.zarrayUT aaux uA 0EBf-Ŗ]Q1Aq42c*޽IH*y̟ v 5}Kx=('lIOp=)lըv"t K]-j~ao)wbn}5b1 ]A/Y ׀w+Q bp!Y>PK vSPXXobsp/sparse/data/0UT aaux 3HHXc&?(?V}.%?9"N,?%f7?)Qj?M? N?m?PK vSobsm/UT aaux PK vSobsm/df/UT aaux PK vS obsm/df/cat/UT aaux PKvSL\7obsm/df/cat/.zarrayUT aaux u E~ cm?i EQ&B[6ཛau|ޫdq&a&om_ `*<(s81 2}96Bns1/$Bl.;$@+(b:Jkz_Q+5zPoPKvS?(obsm/df/cat/.zattrsUT aaux RĒb%+x_TPKvSx8,. 
[binary test-data payloads omitted: zipped zarr store entries (compressed chunks plus .zarray/.zattrs metadata for the X, layers, obs, var, obsm, varm, obsp, varp, uns and raw groups of the bundled AnnData fixture) followed by the beginning of the zipped xlsx workbook parts (xl/workbook.xml, xl/styles.xml, xl/theme/theme1.xml, ...)]
sw5%l$dQV32%9R(Bo=@ $'#$lJZv G~ztҽzG ’_P=ؘ$Ӗk8(4|OHe n ,K۟~rmDlI9*f8&H#ޘ+R#^bP{}2!# J{O1B (W%òBR!a1;{(~h%/V&DYCn2L`|Xsj Z{_\Zҧh4:na PաWU_]נT E A)>\Çfgנ_[K^PkPDIr.jwd A)Q RSLX"7Z2>R$I O(9%o&`T) JU>#02]`XRxbL+7 /={=_*Kn%SSՏ__7'Ŀ˗:/}}O!c&a?0BĒ@v^[ uXsXa3W"`J+U`ek)r+emgoqx(ߤDJ]8TzM5)0IYgz|]p+~o`_=|j QkekZAj|&O3!ŻBw}ь0Q'j"5,ܔ#-q&?'2ڏ ZCeLTx3&cu+ЭNxNg x)\CJZ=ޭ~TwY(aLfQuQ_B^g^ٙXtXPꗡZFq 0mxEAAfc ΙFz3Pb/3 tSٺqyjuiE-#t00,;͖Yƺ2Obr3kE"'&&S;nj*#4kx#[SvInwaD:\N1{-_- 4m+W>Z@+qt;x2#iQNSp$½:7XX/+r1w`h׼9#:Pvd5O+Oٚ.<O7sig*t; CԲ*nN-rk.yJ}0-2MYNÊQ۴3, O6muF8='?ȝZu@,JܼfwTz}vLm'U16!H#HEw &rcv"Ҵi% (r|R%СQ1)nCVhBȚjʽZ 4Օ9N`ה7w-(8LC M$TT#*ybWSthgL-ZxKgJFHgקztWjΤPst{ڦlt&׷%W+mHr^o4 F3dxyL~nr,],]l.N<'$QMW"&f>'u64} s'ē>⒃G4?*&5&WWX+j.П 6s]bI|qr(_#}q5Mr%02>2iIq[ԼKn`1#M; {;&NdS<\+KZ]*Aa BIt1!)ޤl؛an3K:B7( ݄|s7tx7%rBww?Y/v{(d[gc-Zֻ%vBkQC̱B=LLBPK!a5ս(xl/sharedStrings.xmltj0 {a`t_PF]Js56r)c{e1$d%w8UrЬ (/'P,+Ode;"Nk":"-KJ%A8a9"Jɺ5fKH3-uO{[GjV.M}lМͯSPK!ύL{docProps/core.xml (]K0C}~8?Bہʮ n(ޅlD{v)x>9%|'Z(bb )zQ`UZA`Ѽ*~@e'7aiJ,{rċI&#w)PK!^YdocProps/app.xml (Mo0  9mQ bHWbvgNc$Ovڍ/^>+zLdrQ J<.?| .xIOjB*2ǕdZs i4}0ozWey+k/PL״fࣗ1f`ίO֤@ - :%29hޒ.jk: 8B%? aXl"z^h8쯼+Q=$ 3 1v8!RȤdL1k籽Qs`09βCl ?sap4s7>9O{wy^TN>cdrɺ]wc8vQ^_g5%?ZPK-!bh^[Content_Types].xmlPK-!U0#L _rels/.relsPK-!>xl/_rels/workbook.xml.relsPK-! Uaxl/workbook.xmlPK-!  } xl/styles.xmlPK-!N yxl/theme/theme1.xmlPK-!HSόt\xl/worksheets/sheet1.xmlPK-!a5ս(xl/sharedStrings.xmlPK-!ύL{docProps/core.xmlPK-!^YdocProps/app.xmlPK python-anndata-0.12.0~rc1/tests/data/umi_tools.tsv.gz000066400000000000000000000003471500370632200225670ustar00rootroot000000000000001acount_single_cells_gene_tag.tsvm10 Ep(߉d*ԍޠX*XԴFӳ|4/t\- Q\w0-8})Lm"v;E=$mWkY[Kl-f\YN*J Literal["zarr", "h5ad"]: return request.param @pytest.fixture( params=[True, False], scope="session", ids=["load-annotation-index", "dont-load-annotation-index"], ) def load_annotation_index(request): return request.param @pytest.fixture(params=["outer", "inner"], scope="session") def join(request): return request.param @pytest.fixture( params=[ pytest.param(lambda x: x, id="full"), pytest.param(lambda x: x[0:10, :], id="subset"), ], scope="session", ) def simple_subset_func(request): return request.param @pytest.fixture(scope="session") def adata_remote_orig_with_path( tmp_path_factory, diskfmt: str, mtx_format, worker_id: str = "serial", ) -> tuple[Path, AnnData]: """Create remote fixtures, one without a range index and the other with""" file_name = f"orig_{worker_id}.{diskfmt}" if diskfmt == "h5ad": orig_path = tmp_path_factory.mktemp("h5ad_file_dir") / file_name else: orig_path = tmp_path_factory.mktemp(file_name) orig = gen_adata( (100, 110), mtx_format, obs_dtypes=(*DEFAULT_COL_TYPES, pd.StringDtype), var_dtypes=(*DEFAULT_COL_TYPES, pd.StringDtype), ) orig.raw = orig.copy() with ad.settings.override(allow_write_nullable_strings=True): getattr(ad.io, f"write_{diskfmt}")( orig_path, orig, convert_strings_to_categoricals=False ) return orig_path, orig @pytest.fixture def adata_remote( adata_remote_orig_with_path: tuple[Path, AnnData], *, load_annotation_index: bool ) -> AnnData: orig_path, _ = adata_remote_orig_with_path return read_lazy(orig_path, load_annotation_index=load_annotation_index) @pytest.fixture def adata_orig(adata_remote_orig_with_path: tuple[Path, AnnData]) -> AnnData: _, orig = adata_remote_orig_with_path return orig @pytest.fixture(scope="session") def adata_remote_with_store_tall_skinny_path( tmp_path_factory, mtx_format, worker_id: str = "serial", ) -> Path: orig_path = tmp_path_factory.mktemp(f"orig_{worker_id}.zarr") M = 
100_000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access N = 5 obs_names = pd.Index(f"cell{i}" for i in range(M)) var_names = pd.Index(f"gene{i}" for i in range(N)) obs = gen_typed_df(M, obs_names) var = gen_typed_df(N, var_names) orig = AnnData( obs=obs, var=var, X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), ) orig.raw = orig.copy() orig.write_zarr(orig_path) return orig_path @pytest.fixture(scope="session") def adatas_paths_var_indices_for_concatenation( tmp_path_factory, *, are_vars_different: bool, worker_id: str = "serial" ) -> tuple[list[AnnData], list[Path], list[pd.Index]]: adatas = [] var_indices = [] paths = [] M = 1000 N = 50 n_datasets = 3 for dataset_index in range(n_datasets): orig_path = tmp_path_factory.mktemp(f"orig_{worker_id}_{dataset_index}.zarr") paths.append(orig_path) obs_names = pd.Index(f"cell_{dataset_index}_{i}" for i in range(M)) var_names = pd.Index( f"gene_{i}{f'_{dataset_index}_ds' if are_vars_different and (i % 2) else ''}" for i in range(N) ) var_indices.append(var_names) obs = gen_typed_df(M, obs_names) var = gen_typed_df(N, var_names) orig = AnnData( obs=obs, var=var, X=np.random.binomial(100, 0.005, (M, N)).astype(np.float32), ) orig.write_zarr(orig_path) adatas.append(orig) return adatas, paths, var_indices @pytest.fixture def var_indices_for_concat( adatas_paths_var_indices_for_concatenation, ) -> list[pd.Index]: _, _, var_indices = adatas_paths_var_indices_for_concatenation return var_indices @pytest.fixture def adatas_for_concat( adatas_paths_var_indices_for_concatenation, ) -> list[AnnData]: adatas, _, _ = adatas_paths_var_indices_for_concatenation return adatas @pytest.fixture def stores_for_concat( adatas_paths_var_indices_for_concatenation, ) -> list[AccessTrackingStore]: _, paths, _ = adatas_paths_var_indices_for_concatenation return [AccessTrackingStore(path) for path in paths] @pytest.fixture def lazy_adatas_for_concat( stores_for_concat, ) -> list[AnnData]: return [read_lazy(store) for store in stores_for_concat] @pytest.fixture def adata_remote_with_store_tall_skinny( adata_remote_with_store_tall_skinny_path: Path, ) -> tuple[AnnData, AccessTrackingStore]: store = AccessTrackingStore(adata_remote_with_store_tall_skinny_path) remote = read_lazy(store) return remote, store @pytest.fixture def remote_store_tall_skinny( adata_remote_with_store_tall_skinny_path: Path, ) -> AccessTrackingStore: return AccessTrackingStore(adata_remote_with_store_tall_skinny_path) @pytest.fixture def adata_remote_tall_skinny( remote_store_tall_skinny: AccessTrackingStore, ) -> AnnData: remote = read_lazy(remote_store_tall_skinny) return remote def get_key_trackers_for_columns_on_axis( adata: AnnData, axis: Literal["obs", "var"] ) -> Generator[str, None, None]: """Generate keys for tracking, using `codes` from categorical columns instead of the column name Parameters ---------- adata Object to get keys from axis Axis to get keys from Yields ------ Keys for tracking """ for col in getattr(adata, axis).columns: yield f"{axis}/{col}" if "cat" not in col else f"{axis}/{col}/codes" ANNDATA_ELEMS = typing.get_args(AnnDataElem) python-anndata-0.12.0~rc1/tests/lazy/test_concat.py000066400000000000000000000300431500370632200223220ustar00rootroot00000000000000from __future__ import annotations from functools import reduce from importlib.util import find_spec from typing import TYPE_CHECKING import numpy as np import pandas as pd import pytest import anndata as ad from 
anndata._core.file_backing import to_memory from anndata.experimental import read_lazy from anndata.tests.helpers import assert_equal, gen_adata from .conftest import ANNDATA_ELEMS, get_key_trackers_for_columns_on_axis pytestmark = pytest.mark.skipif(not find_spec("xarray"), reason="xarray not installed") if TYPE_CHECKING: from collections.abc import Callable from pathlib import Path from typing import Literal from numpy.typing import NDArray from anndata import AnnData from anndata._types import AnnDataElem, Join_T from anndata.tests.helpers import AccessTrackingStore def unify_extension_dtypes( remote: pd.DataFrame, memory: pd.DataFrame ) -> tuple[pd.DataFrame, pd.DataFrame]: """ For concatenated lazy datasets, we send the extension arrays through dask But this means we lose the pandas dtype, so this function corrects that. Parameters ---------- remote The dataset that comes from the concatenated lazy operation memory The in-memory, "correct" version Returns ------- The two dataframes unified """ for col in memory.columns: dtype = memory[col].dtype if pd.api.types.is_extension_array_dtype(dtype): remote[col] = remote[col].astype(dtype) return remote, memory @pytest.mark.parametrize("join", ["outer", "inner"]) @pytest.mark.parametrize( ("elem_key", "sub_key"), [ ("obs", "cat"), ("obs", "int64"), *((elem_name, None) for elem_name in ANNDATA_ELEMS), ], ) def test_concat_access_count( adatas_for_concat: list[AnnData], stores_for_concat: list[AccessTrackingStore], lazy_adatas_for_concat: list[AnnData], join: Join_T, elem_key: AnnDataElem, sub_key: str, simple_subset_func: Callable[[AnnData], AnnData], ): # track all elems except categories from categoricals because they must be read in for concatenation # due to the dtype check on the elements (which causes `categories` to be read in) non_categorical_columns = ( f"{elem}/{col}" if "cat" not in col else f"{elem}/{col}/codes" for elem in ["obs", "var"] for col in adatas_for_concat[0].obs.columns ) category_columns = ( f"{elem}/{col}/categories" for elem in ["obs", "var"] for col in adatas_for_concat[0].obs.columns if "cat" in col ) non_obs_var_keys = filter(lambda e: e not in {"obs", "var"}, ANNDATA_ELEMS) zero_access_count_keys = [*non_categorical_columns, *non_obs_var_keys] keys_to_track = [*zero_access_count_keys, *category_columns] for store in stores_for_concat: store.initialize_key_trackers(keys_to_track) concated_remote = ad.concat(lazy_adatas_for_concat, join=join) # a series of methods that should __not__ read in any data elem = getattr(simple_subset_func(concated_remote), elem_key) if sub_key is not None: getattr(elem, sub_key) for store in stores_for_concat: for elem in zero_access_count_keys: store.assert_access_count(elem, 0) for elem in category_columns: # once for .zarray, once for the actual data store.assert_access_count(elem, 2) def test_concat_to_memory_obs_access_count( adatas_for_concat: list[AnnData], stores_for_concat: list[AccessTrackingStore], lazy_adatas_for_concat: list[AnnData], join: Join_T, simple_subset_func: Callable[[AnnData], AnnData], ): """This test ensures that only the necessary chunks are accessed in `to_memory` call after a subsetting operation""" concated_remote = simple_subset_func(ad.concat(lazy_adatas_for_concat, join=join)) concated_remote_subset = simple_subset_func(concated_remote) n_datasets = len(adatas_for_concat) obs_keys_to_track = get_key_trackers_for_columns_on_axis( adatas_for_concat[0], "obs" ) for store in stores_for_concat: store.initialize_key_trackers(obs_keys_to_track) 
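    # `obs_keys_to_track` comes from `get_key_trackers_for_columns_on_axis` (conftest.py
    # above), which yields one on-disk zarr key per obs column -- e.g. "obs/int64", or
    # "obs/cat/codes" for categorical columns -- so the access counts asserted below are
    # tracked per stored key rather than per pandas column name.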
concated_remote_subset.to_memory() # check access count for the stores - only the first should be accessed when reading into memory for col in obs_keys_to_track: stores_for_concat[0].assert_access_count(col, 1) for i in range(1, n_datasets): # if the shapes are the same, data was read in to bring the object into memory; otherwise, not stores_for_concat[i].assert_access_count( col, concated_remote_subset.shape[0] == concated_remote.shape[0] ) def test_concat_to_memory_obs( adatas_for_concat: list[AnnData], lazy_adatas_for_concat: list[AnnData], join: Join_T, simple_subset_func: Callable[[AnnData], AnnData], ): concatenated_memory = simple_subset_func(ad.concat(adatas_for_concat, join=join)) concated_remote = simple_subset_func(ad.concat(lazy_adatas_for_concat, join=join)) assert_equal( *unify_extension_dtypes(to_memory(concated_remote.obs), concatenated_memory.obs) ) def test_concat_to_memory_obs_dtypes( lazy_adatas_for_concat: list[AnnData], join: Join_T, ): concated_remote = ad.concat(lazy_adatas_for_concat, join=join) # check preservation of non-categorical dtypes on the concat axis assert concated_remote.obs["int64"].dtype == "int64" assert concated_remote.obs["uint8"].dtype == "uint8" assert concated_remote.obs["nullable-int"].dtype == "int32" assert concated_remote.obs["float64"].dtype == "float64" assert concated_remote.obs["bool"].dtype == "bool" assert concated_remote.obs["nullable-bool"].dtype == "bool" def test_concat_to_memory_var( var_indices_for_concat: list[pd.Index], adatas_for_concat: list[AnnData], stores_for_concat: list[AccessTrackingStore], lazy_adatas_for_concat: list[AnnData], join: Join_T, simple_subset_func: Callable[[AnnData], AnnData], *, are_vars_different: bool, ): """\ The goal of this test to ensure that the various `join` operations work as expected under various scenarios. We test two things here: first, we take all the overlapping indices for var. Then if the underlying vars are different and this is an outer join (i.e., there are non-overlapping indices), we take the unique indices from one of the dataframes. We then check if the var dataframe subsetted from lazily-concatenated object and put into memory is the same as the underlying anndata object that created it, up to some corrections. We also test for key access counts to ensure that data was not taken from the var df of other on-disk anndata objects that might be different i.e., in the case of an outer join. """ concated_remote = simple_subset_func(ad.concat(lazy_adatas_for_concat, join=join)) var_keys_to_track = get_key_trackers_for_columns_on_axis( adatas_for_concat[0], "var" ) for store in stores_for_concat: store.initialize_key_trackers(var_keys_to_track) # check non-different variables, taken from first annotation. 
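    # `reduce` folds `pd.Index.intersection` over the per-dataset var indices, yielding
    # the var_names shared by every dataset -- the "overlapping indices" the docstring
    # above refers to, which both join modes keep.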
pd_index_overlapping = reduce(pd.Index.intersection, var_indices_for_concat) var_df_overlapping = adatas_for_concat[0][:, pd_index_overlapping].var.copy() test_cases = [(pd_index_overlapping, var_df_overlapping, 0)] if are_vars_different and join == "outer": # check a set of unique variables from the first object since we only take from there if different pd_index_only_ds_0 = pd.Index( filter(lambda x: "0_ds" in x, var_indices_for_concat[1]) ) var_df_only_ds_0 = adatas_for_concat[0][:, pd_index_only_ds_0].var.copy() test_cases.append((pd_index_only_ds_0, var_df_only_ds_0, 0)) for pd_index, var_df, store_idx in test_cases: remote_df = to_memory(concated_remote[:, pd_index].var) remote_df_corrected, _ = unify_extension_dtypes(remote_df, var_df) # NOTE: xr.merge always upcasts to float due to NA and you can't downcast? for col in remote_df_corrected.columns: dtype = remote_df_corrected[col].dtype if dtype in [np.float64, np.float32]: var_df[col] = var_df[col].astype(dtype) assert_equal(remote_df_corrected, var_df) for key in var_keys_to_track: stores_for_concat[store_idx].assert_access_count(key, 1) for store in stores_for_concat: if store != stores_for_concat[store_idx]: store.assert_access_count(key, 0) stores_for_concat[store_idx].reset_key_trackers() @pytest.mark.xdist_group("dask") def test_concat_data_with_cluster_to_memory( adata_remote: AnnData, join: Join_T, local_cluster_addr: str ) -> None: import dask.distributed as dd with dd.Client(local_cluster_addr): ad.concat([adata_remote, adata_remote], join=join).to_memory() @pytest.mark.parametrize( "index", [ pytest.param( slice(50, 150), id="slice", ), pytest.param( np.arange(95, 105), id="consecutive integer array", ), pytest.param( np.random.randint(80, 110, 5), id="random integer array", ), pytest.param( np.random.choice([True, False], 200), id="boolean array", ), pytest.param(slice(None), id="full slice"), pytest.param("a", id="categorical_subset"), pytest.param(None, id="No index"), ], ) def test_concat_data_subsetting( adata_remote: AnnData, adata_orig: AnnData, join: Join_T, index: slice | NDArray | Literal["a"] | None, ): from anndata.experimental.backed._compat import Dataset2D remote_concatenated = ad.concat([adata_remote, adata_remote], join=join) if index is not None: if np.isscalar(index) and index == "a": index = remote_concatenated.obs["obs_cat"] == "a" remote_concatenated = remote_concatenated[index] orig_concatenated = ad.concat([adata_orig, adata_orig], join=join) if index is not None: orig_concatenated = orig_concatenated[index] in_memory_remote_concatenated = remote_concatenated.to_memory() corrected_remote_obs, corrected_memory_obs = unify_extension_dtypes( in_memory_remote_concatenated.obs, orig_concatenated.obs ) assert isinstance(remote_concatenated.obs, Dataset2D) assert_equal(corrected_remote_obs, corrected_memory_obs) assert_equal(in_memory_remote_concatenated.X, orig_concatenated.X) assert ( in_memory_remote_concatenated.var_names.tolist() == orig_concatenated.var_names.tolist() ) @pytest.mark.parametrize( ("attr", "key"), ( pytest.param(param[0], param[1], id="-".join(map(str, param))) for param in [("obs", None), ("var", None), ("obsm", "df"), ("varm", "df")] ), ) def test_concat_df_ds_mixed_types( adata_remote: AnnData, adata_orig: AnnData, join: Join_T, attr: str, key: str | None, *, load_annotation_index: bool, ): def with_elem_in_memory(adata: AnnData, attr: str, key: str | None) -> AnnData: parent_elem = getattr(adata, attr) if key is not None: getattr(adata, attr)[key] = to_memory(parent_elem[key]) 
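            # only the requested obsm/varm entry has been materialized at this point; the
            # rest of `adata` stays lazy, so the concatenation below mixes in-memory and
            # lazy (Dataset2D-backed) elements.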
return adata setattr(adata, attr, to_memory(parent_elem)) return adata if not load_annotation_index: pytest.skip( "Testing for mixed types is independent of the axis since the indices always have to match." ) remote = with_elem_in_memory(adata_remote, attr, key) in_memory_concatenated = ad.concat([adata_orig, adata_orig], join=join) mixed_concatenated = ad.concat([remote, adata_orig], join=join) assert_equal(mixed_concatenated, in_memory_concatenated) def test_concat_bad_mixed_types(tmp_path: Path): orig = gen_adata((100, 200), np.array) orig.write_zarr(tmp_path) remote = read_lazy(tmp_path) orig.obsm["df"] = orig.obsm["array"] with pytest.raises(ValueError, match=r"Cannot concatenate a Dataset2D*"): ad.concat([remote, orig], join="outer") python-anndata-0.12.0~rc1/tests/lazy/test_read.py000066400000000000000000000133431500370632200217720ustar00rootroot00000000000000from __future__ import annotations from importlib.util import find_spec from typing import TYPE_CHECKING import pytest from anndata.compat import DaskArray from anndata.experimental import read_lazy from anndata.tests.helpers import AccessTrackingStore, assert_equal, gen_adata from .conftest import ANNDATA_ELEMS if TYPE_CHECKING: from collections.abc import Callable from pathlib import Path from anndata import AnnData from anndata._types import AnnDataElem pytestmark = pytest.mark.skipif(not find_spec("xarray"), reason="xarray not installed") @pytest.mark.parametrize( ("elem_key", "sub_key"), [ ("raw", "X"), ("obs", "cat"), ("obs", "int64"), *((elem_name, None) for elem_name in ANNDATA_ELEMS), ], ) def test_access_count_elem_access( remote_store_tall_skinny: AccessTrackingStore, adata_remote_tall_skinny: AnnData, elem_key: AnnDataElem, sub_key: str, simple_subset_func: Callable[[AnnData], AnnData], ): full_path = f"{elem_key}/{sub_key}" if sub_key is not None else elem_key remote_store_tall_skinny.initialize_key_trackers({full_path, "X"}) # a series of methods that should __not__ read in any data elem = getattr(simple_subset_func(adata_remote_tall_skinny), elem_key) if sub_key is not None: getattr(elem, sub_key) remote_store_tall_skinny.assert_access_count(full_path, 0) remote_store_tall_skinny.assert_access_count("X", 0) def test_access_count_subset( remote_store_tall_skinny: AccessTrackingStore, adata_remote_tall_skinny: AnnData, ): non_obs_elem_names = filter(lambda e: e != "obs", ANNDATA_ELEMS) remote_store_tall_skinny.initialize_key_trackers( ["obs/cat/codes", *non_obs_elem_names] ) adata_remote_tall_skinny[adata_remote_tall_skinny.obs["cat"] == "a", :] # all codes read in for subset (from 1 chunk) remote_store_tall_skinny.assert_access_count("obs/cat/codes", 1) for elem_name in non_obs_elem_names: remote_store_tall_skinny.assert_access_count(elem_name, 0) def test_access_count_subset_column_compute( remote_store_tall_skinny: AccessTrackingStore, adata_remote_tall_skinny: AnnData, ): remote_store_tall_skinny.initialize_key_trackers(["obs/int64"]) adata_remote_tall_skinny[adata_remote_tall_skinny.shape[0] // 2, :].obs[ "int64" ].compute() # two chunks needed for 0:10 subset remote_store_tall_skinny.assert_access_count("obs/int64", 1) def test_access_count_index( remote_store_tall_skinny: AccessTrackingStore, ): remote_store_tall_skinny.initialize_key_trackers(["obs/_index"]) read_lazy(remote_store_tall_skinny, load_annotation_index=False) remote_store_tall_skinny.assert_access_count("obs/_index", 0) read_lazy(remote_store_tall_skinny) # 4 is number of chunks remote_store_tall_skinny.assert_access_count("obs/_index", 4) def 
test_access_count_dtype( remote_store_tall_skinny: AccessTrackingStore, adata_remote_tall_skinny: AnnData, ): remote_store_tall_skinny.initialize_key_trackers(["obs/cat/categories"]) remote_store_tall_skinny.assert_access_count("obs/cat/categories", 0) # This should only cause categories to be read in once adata_remote_tall_skinny.obs["cat"].dtype adata_remote_tall_skinny.obs["cat"].dtype adata_remote_tall_skinny.obs["cat"].dtype remote_store_tall_skinny.assert_access_count("obs/cat/categories", 1) def test_uns_uses_dask(adata_remote: AnnData): assert isinstance(adata_remote.uns["nested"]["nested_further"]["array"], DaskArray) def test_to_memory(adata_remote: AnnData, adata_orig: AnnData): remote_to_memory = adata_remote.to_memory() assert_equal(remote_to_memory, adata_orig) def test_view_to_memory(adata_remote: AnnData, adata_orig: AnnData): obs_cats = adata_orig.obs["obs_cat"].cat.categories subset_obs = adata_orig.obs["obs_cat"] == obs_cats[0] assert_equal(adata_orig[subset_obs, :], adata_remote[subset_obs, :].to_memory()) var_cats = adata_orig.var["var_cat"].cat.categories subset_var = adata_orig.var["var_cat"] == var_cats[0] assert_equal(adata_orig[:, subset_var], adata_remote[:, subset_var].to_memory()) def test_view_of_view_to_memory(adata_remote: AnnData, adata_orig: AnnData): cats_obs = adata_orig.obs["obs_cat"].cat.categories subset_obs = (adata_orig.obs["obs_cat"] == cats_obs[0]) | ( adata_orig.obs["obs_cat"] == cats_obs[1] ) subsetted_adata = adata_orig[subset_obs, :] subset_subset_obs = subsetted_adata.obs["obs_cat"] == cats_obs[1] subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :] assert_equal( subsetted_subsetted_adata, adata_remote[subset_obs, :][subset_subset_obs, :].to_memory(), ) cats_var = adata_orig.var["var_cat"].cat.categories subset_var = (adata_orig.var["var_cat"] == cats_var[0]) | ( adata_orig.var["var_cat"] == cats_var[1] ) subsetted_adata = adata_orig[:, subset_var] subset_subset_var = subsetted_adata.var["var_cat"] == cats_var[1] subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var] assert_equal( subsetted_subsetted_adata, adata_remote[:, subset_var][:, subset_subset_var].to_memory(), ) def test_unconsolidated(tmp_path: Path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) orig_pth = tmp_path / "orig.zarr" adata.write_zarr(orig_pth) (orig_pth / ".zmetadata").unlink() store = AccessTrackingStore(orig_pth) store.initialize_key_trackers(["obs/.zgroup", ".zgroup"]) with pytest.warns(UserWarning, match=r"Did not read zarr as consolidated"): remote = read_lazy(store) remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, adata) store.assert_access_count("obs/.zgroup", 1) python-anndata-0.12.0~rc1/tests/lazy/test_write.py000066400000000000000000000026511500370632200222110ustar00rootroot00000000000000from __future__ import annotations from importlib.util import find_spec from typing import TYPE_CHECKING import numpy as np import pytest from anndata import AnnData from anndata.experimental.backed._io import read_lazy if TYPE_CHECKING: from pathlib import Path from typing import Literal pytestmark = pytest.mark.skipif(not find_spec("xarray"), reason="xarray not installed") @pytest.mark.parametrize("fmt", ["zarr", "h5ad", "loom", "csvs"]) @pytest.mark.parametrize("key", ["obs", "var", "obsm", "varm"]) def test_write_error( tmp_path: Path, fmt: Literal["zarr", "h5ad", "loom", "csvs"], key: Literal["obs", "var", "obsm", "varm"], ): path = tmp_path / "adata.h5ad" X = np.random.random((4, 4)) adata = AnnData(X=X) if 
key.endswith("m"): elem = {"df": getattr(adata, key[:-1])} setattr(adata, key, elem) adata.write_h5ad(path) adata_lazy = read_lazy(path) if key.endswith("m"): adata_lazy.obs = adata_lazy.obs.to_memory() adata_lazy.obs = adata_lazy.var.to_memory() noop_path = tmp_path / f"adata_noop.{fmt}" with pytest.raises( NotImplementedError, match=r"Writing AnnData objects with a Dataset2D not supported yet. Please use `ds.to_memory`", ): getattr(adata_lazy, f"write_{fmt}")(noop_path) assert not noop_path.exists(), ( "Found a directory at the path at which no data should have been written" ) python-anndata-0.12.0~rc1/tests/test_anncollection.py000066400000000000000000000053141500370632200227270ustar00rootroot00000000000000from __future__ import annotations import numpy as np import pytest from scipy.sparse import csr_matrix, issparse from sklearn.preprocessing import LabelEncoder import anndata as ad from anndata.experimental.multi_files import AnnCollection _dense = lambda a: a.toarray() if issparse(a) else a @pytest.fixture def adatas(request): adata1 = ad.AnnData(X=request.param([[1, 2, 0], [4, 5, 0], [7, 8, 0]])) adata1.obs["a_test"] = ["a", "a", "b"] adata1.obsm["o_test"] = np.ones((adata1.n_obs, 2)) adata2 = ad.AnnData(X=request.param([[1, 3, 0], [9, 8, 0]])) adata2.obs["a_test"] = ["c", "c"] adata2.obsm["o_test"] = np.zeros((adata2.n_obs, 2)) return adata1, adata2 @pytest.mark.parametrize("adatas", [np.array, csr_matrix], indirect=True) def test_full_selection(adatas): dat = AnnCollection(adatas, index_unique="_") adt_concat = ad.concat(adatas, index_unique="_") # sorted selection from one adata dat_view = dat[:2, :2] for adata in (adatas[0], adt_concat): adt_view = adata[:2, :2] np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X)) np.testing.assert_allclose(dat_view.obsm["o_test"], adt_view.obsm["o_test"]) np.testing.assert_array_equal(dat_view.obs["a_test"], adt_view.obs["a_test"]) # sorted and unsorted selection from 2 adatas rand_idxs = np.random.choice(dat.shape[0], 4, replace=False) for select in (slice(2, 5), [4, 2, 3], rand_idxs): dat_view = dat[select, :2] adt_view = adt_concat[select, :2] np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X)) np.testing.assert_allclose(dat_view.obsm["o_test"], adt_view.obsm["o_test"]) np.testing.assert_array_equal(dat_view.obs["a_test"], adt_view.obs["a_test"]) # test duplicate selection idxs = [1, 2, 4, 4] dat_view = dat[idxs, :2] np.testing.assert_allclose( _dense(dat_view.X), np.array([[4, 5], [7, 8], [9, 8], [9, 8]]) ) @pytest.mark.parametrize("adatas", [np.array, csr_matrix], indirect=True) def test_creation(adatas): adatas_inner = [adatas[0], adatas[1][:, :2].copy()] dat = AnnCollection(adatas_inner, join_vars="inner", index_unique="_") adt_concat = ad.concat(adatas_inner, index_unique="_") np.testing.assert_array_equal(dat.var_names, adt_concat.var_names) @pytest.mark.parametrize("adatas", [np.array], indirect=True) def test_convert(adatas): dat = AnnCollection(adatas, index_unique="_") le = LabelEncoder() le.fit(dat[:].obs["a_test"]) obs_no_convert = dat[:].obs["a_test"] convert = dict(obs={"a_test": lambda a: le.transform(a)}) dat.convert = convert np.testing.assert_array_equal(dat[:].obs["a_test"], le.transform(obs_no_convert)) python-anndata-0.12.0~rc1/tests/test_annot.py000066400000000000000000000046101500370632200212140ustar00rootroot00000000000000"""Test handling of values in `obs`/ `var`""" from __future__ import annotations import numpy as np import pandas as pd import pytest from natsort import natsorted 
import anndata as ad from anndata.tests.helpers import get_multiindex_columns_df @pytest.mark.parametrize("dtype", [object, "string"]) def test_str_to_categorical(dtype): obs = pd.DataFrame( {"str": ["a", "a", None, "b", "b"]}, index=[f"cell-{i}" for i in range(5)] ) obs["str"] = obs["str"].astype(dtype) a = ad.AnnData(obs=obs.copy()) a.strings_to_categoricals() expected = obs["str"].astype("category") pd.testing.assert_series_equal(expected, a.obs["str"]) @pytest.mark.parametrize("dtype", [object, "string"]) def test_to_categorical_ordering(dtype): obs = pd.DataFrame( {"str": ["10", "11", "3", "9", "10", "10"]}, index=[f"cell-{i}" for i in range(6)], ) obs["str"] = obs["str"].astype(dtype) a = ad.AnnData(obs=obs.copy()) a.strings_to_categoricals() expected = obs["str"].astype( pd.CategoricalDtype(categories=natsorted(obs["str"].unique())) ) pd.testing.assert_series_equal(expected, a.obs["str"]) def test_non_str_to_not_categorical(): # Test case based on https://github.com/scverse/anndata/issues/141#issuecomment-802105259 obs = pd.DataFrame(index=[f"cell-{i}" for i in range(5)]).assign( str_with_nan=["foo", "bar", None, np.nan, "foo"], boolean_with_nan_and_none=[True, False, np.nan, None, True], boolean_with_nan=[True, False, np.nan, np.nan, True], boolean_with_none=[True, False, None, None, True], ) adata = ad.AnnData(obs=obs.copy()) orig_dtypes = {k: v.name for k, v in obs.dtypes.items()} expected_dtypes = orig_dtypes.copy() expected_dtypes["str_with_nan"] = "category" adata.strings_to_categoricals() result_dtypes = {k: v.name for k, v in adata.obs.dtypes.items()} assert expected_dtypes == result_dtypes expected_non_transformed = obs.drop(columns=["str_with_nan"]) result_non_transformed = adata.obs.drop(columns=["str_with_nan"]) pd.testing.assert_frame_equal(expected_non_transformed, result_non_transformed) def test_error_multiindex(): adata = ad.AnnData(np.random.rand(100, 10)) df = get_multiindex_columns_df((adata.shape[0], 20)) with pytest.raises(ValueError, match=r"MultiIndex columns are not supported"): adata.obs = df python-anndata-0.12.0~rc1/tests/test_awkward.py000066400000000000000000000353101500370632200215360ustar00rootroot00000000000000"""Tests related to awkward arrays""" from __future__ import annotations import warnings import numpy as np import numpy.testing as npt import pandas as pd import pytest import anndata from anndata import ( AnnData, ImplicitModificationWarning, read_h5ad, ) from anndata.compat import AwkArray from anndata.compat import awkward as ak from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward from anndata.utils import axis_len @pytest.mark.parametrize( ("array", "shape"), [ # numpy array (ak.Array(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))), (2, 3, 4, 5)), # record (ak.Array([{"a": 1, "b": 2}, {"a": 1, "b": 3}]), (2, 2)), # ListType, variable length (ak.Array([[1], [2, 3], [4, 5, 6]]), (3, None)), # ListType, happens to have the same length, but is not regular (ak.Array([[2], [3], [4]]), (3, None)), # RegularType + nested ListType (ak.to_regular(ak.Array([[[1, 2], [3]], [[2], [3, 4, 5]]]), 1), (2, 2, None)), # nested record ( ak.to_regular(ak.Array([[{"a": 0}, {"b": 1}], [{"c": 2}, {"d": 3}]]), 1), (2, 2, 4), ), # mixed types (variable length) (ak.Array([[1, 2], ["a"]]), (2, None)), # mixed types (but regular) (ak.to_regular(ak.Array([[1, 2], ["a", "b"]]), 1), (2, 2)), # zero-size edge cases (ak.Array(np.ones((0, 7))), (0, 7)), (ak.Array(np.ones((7, 0))), (7, 0)), # UnionType of two regular types with different dimensions ( 
ak.concatenate([ak.Array(np.ones((2, 2))), ak.Array(np.ones((2, 3)))]), (4, None), ), # UnionType of two regular types with same dimension ( ak.concatenate( [ ak.Array(np.ones((2, 2))), ak.Array(np.array([["a", "a"], ["a", "a"]])), ] ), (4, 2), ), # Array of string types (ak.Array(["a", "b", "c"]), (3,)), (ak.Array([["a", "b"], ["c", "d"], ["e", "f"]]), (3, None)), (ak.to_regular(ak.Array([["a", "b"], ["c", "d"], ["e", "f"]]), 1), (3, 2)), ], ) def test_axis_len(array, shape): """Test that axis_len returns the right value for awkward arrays.""" for axis, size in enumerate(shape): assert size == axis_len(array, axis) # Requesting the size for an axis higher than the array has dimensions should raise a TypeError with pytest.raises(TypeError): axis_len(array, len(shape)) @pytest.mark.parametrize( ("field", "value", "valid"), [ ("obsm", gen_awkward((10, 5)), True), ("obsm", gen_awkward((10, None)), True), ("obsm", gen_awkward((10, None, None)), True), ("obsm", gen_awkward((10, 5, None)), True), ("obsm", gen_awkward((8, 10)), False), ("obsm", gen_awkward((8, None)), False), ("varm", gen_awkward((20, 5)), True), ("varm", gen_awkward((20, None)), True), ("varm", gen_awkward((20, None, None)), True), ("varm", gen_awkward((20, 5, None)), True), ("varm", gen_awkward((8, 20)), False), ("varm", gen_awkward((8, None)), False), ("uns", gen_awkward((7,)), True), ("uns", gen_awkward((7, None)), True), ("uns", gen_awkward((7, None, None)), True), ], ) def test_set_awkward(field, value, valid): """Check if we can set obsm, .varm and .uns with different types of awkward arrays and if error messages are properly raised when the dimensions do not align. """ adata = gen_adata((10, 20), varm_types=(), obsm_types=(), layers_types=()) def _assign(): getattr(adata, field)["test"] = value if not valid: with pytest.raises(ValueError, match="incorrect shape"): _assign() else: _assign() @pytest.mark.parametrize("key", ["obsm", "varm", "uns"]) def test_copy(key): """Check that modifying a copy does not modify the original""" adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) getattr(adata, key)["awk"] = ak.Array([{"a": [1], "b": [2], "c": [3]}] * 3) adata_copy = adata.copy() getattr(adata_copy, key)["awk"]["c"] = np.full((3, 1), 4) getattr(adata_copy, key)["awk"]["d"] = np.full((3, 1), 5) # values in copy were correctly set npt.assert_equal(getattr(adata_copy, key)["awk"]["c"], np.full((3, 1), 4)) npt.assert_equal(getattr(adata_copy, key)["awk"]["d"], np.full((3, 1), 5)) # values in original were not updated npt.assert_equal(getattr(adata, key)["awk"]["c"], np.full((3, 1), 3)) with pytest.raises(IndexError): getattr(adata, key)["awk"]["d"] @pytest.mark.parametrize("key", ["obsm", "varm"]) def test_view(key): """Check that modifying a view does not modify the original""" adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) getattr(adata, key)["awk"] = ak.Array([{"a": [1], "b": [2], "c": [3]}] * 3) adata_view = adata[:2, :2] with pytest.warns( ImplicitModificationWarning, match=r"initializing view as actual" ): getattr(adata_view, key)["awk"]["c"] = np.full((2, 1), 4) getattr(adata_view, key)["awk"]["d"] = np.full((2, 1), 5) # values in view were correctly set npt.assert_equal(getattr(adata_view, key)["awk"]["c"], np.full((2, 1), 4)) npt.assert_equal(getattr(adata_view, key)["awk"]["d"], np.full((2, 1), 5)) # values in original were not updated npt.assert_equal(getattr(adata, key)["awk"]["c"], np.full((3, 1), 3)) with pytest.raises(IndexError): getattr(adata, key)["awk"]["d"] def 
test_view_of_awkward_array_with_custom_behavior(): """Currently can't create view of arrays with custom __name__ (in this case "string") See https://github.com/scverse/anndata/pull/647#discussion_r963494798_""" from uuid import uuid4 BEHAVIOUR_ID = str(uuid4()) class ReversibleArray(ak.Array): def reversed(self): return self[..., ::-1] ak.behavior[BEHAVIOUR_ID] = ReversibleArray adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) adata.obsm["awk_string"] = ak.with_parameter( ak.Array(["AAA", "BBB", "CCC"]), "__list__", BEHAVIOUR_ID ) adata_view = adata[:2] with pytest.raises(NotImplementedError): adata_view.obsm["awk_string"] @pytest.mark.parametrize( "array", [ # numpy array ak.Array(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))), # record ak.Array([{"a": 1, "b": 2}, {"a": 1, "b": 3}]), # ListType, variable length ak.Array([[1], [2, 3], [4, 5, 6]]), # RegularType + nested ListType ak.to_regular(ak.Array([[[1, 2], [3]], [[2], [3, 4, 5]]]), 1), # nested record ak.to_regular(ak.Array([[{"a": 0}, {"b": 1}], [{"c": 2}, {"d": 3}]]), 1), # mixed types (variable length) ak.Array([[1, 2], ["a"]]), # zero-size edge cases ak.Array(np.ones((0, 7))), ak.Array(np.ones((7, 0))), # UnionType of two regular types with different dimensions ak.concatenate([ak.Array(np.ones((2, 2))), ak.Array(np.ones((2, 3)))]), # UnionType of two regular types with same dimension ak.concatenate( [ ak.Array(np.ones((2, 2))), ak.Array(np.array([["a", "a"], ["a", "a"]])), ] ), # categorical array ak.str.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), ak.str.to_categorical(ak.Array([[1, 1, 2], [3, 3]])), # typical record type with AIRR data consisting of different dtypes ak.Array( [ [ { "v_call": "TRV1", "junction_aa": "ADDEEKK", "productive": True, "locus": None, "consensus_count": 3, }, { "v_call": "TRV2", "productive": False, "locus": "TRA", "consensus_count": 4, }, ], [ { "v_call": None, "junction_aa": "ADDEKK", "productive": None, "locus": "IGK", "consensus_count": 3, } ], ] ), ], ) def test_awkward_io(tmp_path, array): adata = AnnData() adata.uns["awk"] = array adata_path = tmp_path / "adata.h5ad" adata.write_h5ad(adata_path) adata2 = read_h5ad(adata_path) assert_equal(adata.uns["awk"], adata2.uns["awk"], exact=True) def test_awkward_io_view(tmp_path): """Check that views are converted to actual arrays on save, i.e.
the _view_args and __list__ parameters are removed""" adata = gen_adata((3, 3), varm_types=(), obsm_types=(AwkArray,), layers_types=()) v = adata[1:] adata_path = tmp_path / "adata.h5ad" v.write_h5ad(adata_path) adata2 = read_h5ad(adata_path) # parameters are not fully removed, but set to None assert ak.parameters(adata2.obsm["awk_2d_ragged"]) == { "__list__": None, "_view_args": None, } # @pytest.mark.parametrize("join", ["outer", "inner"]) @pytest.mark.parametrize( ("arrays", "join", "expected"), [ pytest.param( [ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), None], "inner", None, id="awk:recordoflists_null-inner", ), pytest.param( [ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), None], "outer", ak.Array( [{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, None, None, None] ), # maybe should return: ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, {}, {}, {}]), id="awk:recordoflists_null-outer", ), pytest.param( [ak.Array([[{"a": 1}, {"a": 2}], []]), None], "outer", ak.Array([[{"a": 1}, {"a": 2}], [], None, None, None]), # maybe should return: ak.Array([[{"a": 1}, {"a": 2}], [], [], []]), id="awk:listofrecords_null-outer", ), pytest.param( [None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}])], "inner", None, id="null_awk-inner", ), pytest.param( [None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}])], "outer", ak.Array( [None, None, None, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}] ), # maybe should return: ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), id="null_awk:recordoflists-outer", ), pytest.param( [ak.Array([{"a": 1}, {"a": 2}]), ak.Array([{"a": 3}, {"a": 4}])], "inner", ak.Array([{"a": i} for i in range(1, 5)]), id="awk-simple-record", ), pytest.param( [ ak.Array([{"a": 1, "b": 1}, {"a": 2, "b": 2}]), ak.Array([{"a": 3}, {"a": 4}]), ], "inner", ak.Array([{"a": i} for i in range(1, 5)]), id="awk-simple-record-inner", ), # TODO: # pytest.param( # [ # ak.Array([{"a": 1, "b": 1}, {"a": 2, "b": 2}]), # ak.Array([{"a": 3}, {"a": 4}]), # ], # "outer", # ak.Array([{"a": 1, "b": 1}, {"a": 2, "b": 2}, {"a": 3}, {"a": 4},]), # id="awk-simple-record-outer", # ), pytest.param( [ None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame(), ], "outer", NotImplementedError, # TODO: ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), id="null_awk_empty-pd", ), pytest.param( [ ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame(), ], "outer", NotImplementedError, # TODO: ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), id="awk_empty-pd", ), pytest.param( [ ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame().assign(a=[3, 4], b=[5, 6]), ], "outer", # TODO: Should try inner too if implemented NotImplementedError, ), pytest.param( [ ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), np.ones((3, 2)), ], "outer", NotImplementedError, ), ], ) @pytest.mark.parametrize("key", ["obsm", "varm"]) def test_concat_mixed_types(key, arrays, expected, join): """Test that concatenation of AwkwardArrays with arbitrary types, but zero length dimension or missing values works.""" axis = 0 if key == "obsm" else 1 to_concat = [] cell_id, gene_id = 0, 0 for a in arrays: shape = np.array([3, 3]) # default shape (in case of missing array) if a is not None: length = axis_len(a, 0) shape[axis] = length tmp_adata = gen_adata( tuple(shape), varm_types=(), obsm_types=(), layers_types=() ) prev_cell_id, prev_gene_id 
= cell_id, gene_id cell_id, gene_id = cell_id + shape[0], gene_id + shape[1] tmp_adata.obs_names = pd.RangeIndex(prev_cell_id, cell_id).astype(str) tmp_adata.var_names = pd.RangeIndex(prev_gene_id, gene_id).astype(str) if a is not None: if isinstance(a, pd.DataFrame): a.set_index( tmp_adata.obs_names if key == "obsm" else tmp_adata.var_names, inplace=True, ) getattr(tmp_adata, key)["test"] = a to_concat.append(tmp_adata) if isinstance(expected, type) and issubclass(expected, Exception): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", FutureWarning, ) with pytest.raises(expected): anndata.concat(to_concat, axis=axis, join=join) else: result_adata = anndata.concat(to_concat, axis=axis, join=join) result = getattr(result_adata, key).get("test", None) assert_equal(expected, result, exact=True) python-anndata-0.12.0~rc1/tests/test_backed_dense.py000066400000000000000000000046441500370632200224730ustar00rootroot00000000000000"""Tests for backing by just sticking zarr/h5py objects into AnnData.""" from __future__ import annotations from typing import TYPE_CHECKING import h5py import numpy as np import pytest import zarr from anndata import AnnData from anndata._io.zarr import open_write_group from anndata.io import write_elem from anndata.tests.helpers import assert_equal if TYPE_CHECKING: from pathlib import Path from typing import Literal @pytest.fixture def file(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]) -> h5py.File | zarr.Group: path = tmp_path / f"test.{diskfmt}" if diskfmt == "zarr": return open_write_group(path, mode="a") if diskfmt == "h5ad": return h5py.File(path, "a") pytest.fail(f"Unknown diskfmt: {diskfmt}") @pytest.mark.parametrize("assign", ["init", "assign"]) @pytest.mark.parametrize("attr", ["X", "obsm", "varm", "layers"]) def test_create_delete( diskfmt: Literal["h5ad", "zarr"], file: h5py.File | zarr.Group, assign: Literal["init", "assign"], attr: Literal["X", "obsm", "varm", "layers"], ): x = np.random.randn(10, 10) write_elem(file, "a", x) # initialize (and if applicable, assign) if assign == "init": kw = ( dict(X=file["a"]) if attr == "X" else {attr: dict(a=file["a"]), "shape": x.shape} ) adata = AnnData(**kw) elif assign == "assign": adata = AnnData(shape=x.shape) if attr == "X": adata.X = file["a"] else: getattr(adata, attr)["a"] = file["a"] else: pytest.fail(f"Unexpected assign: {assign}") # check equality if attr == "X": # TODO: should that be inverted, e.g. when the Dataset’s path matches the backed mode path? 
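        # `.isbacked` is driven by `AnnData.filename` (the backed-mode machinery),
        # not by the type stored in `.X`: assigning an on-disk h5py.Dataset or
        # zarr.Array directly leaves the object "unbacked" even though its data
        # live on disk.  Illustrative only, mirroring the setup of this test:
        #   >>> a = AnnData(X=file["a"])
        #   >>> a.isbacked  # False, since no filename was ever set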
assert not adata.isbacked backed_array = adata.X else: backed_array = getattr(adata, attr)["a"] assert isinstance(backed_array, zarr.Array if diskfmt == "zarr" else h5py.Dataset) assert_equal(backed_array, x) # check that there’s no error deleting it either if attr == "X": del adata.X else: del getattr(adata, attr)["a"] def test_assign_x_subset(file: h5py.File | zarr.Group): x = np.ones((10, 10)) write_elem(file, "a", x) adata = AnnData(file["a"]) view = adata[3:7, 6:8] view.X = np.zeros((4, 2)) expected = x.copy() expected[3:7, 6:8] = np.zeros((4, 2)) assert_equal(adata.X, expected) python-anndata-0.12.0~rc1/tests/test_backed_hdf5.py000066400000000000000000000257541500370632200222300ustar00rootroot00000000000000"""Tests for backing using the `.file` and `.isbacked` attributes.""" from __future__ import annotations from pathlib import Path import joblib import numpy as np import pytest from scipy import sparse import anndata as ad from anndata.compat import CSArray, CSMatrix from anndata.tests.helpers import ( GEN_ADATA_DASK_ARGS, as_dense_dask_array, assert_equal, gen_adata, subset_func, ) from anndata.utils import asarray subset_func2 = subset_func # ------------------------------------------------------------------------------- # Some test data # ------------------------------------------------------------------------------- @pytest.fixture def adata(): X_list = [ [1, 2, 3], [4, 5, 6], [7, 8, 9], ] # data matrix of shape n_obs x n_vars X = np.array(X_list) obs_dict = dict( # annotation of observations / rows row_names=["name1", "name2", "name3"], # row annotation oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation ) var_dict = dict(vanno1=[3.1, 3.2, 3.3]) # annotation of variables / columns uns_dict = dict( # unstructured annotation oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"] ) return ad.AnnData( X, obs=obs_dict, var=var_dict, uns=uns_dict, obsm=dict(o1=np.zeros((X.shape[0], 10))), varm=dict(v1=np.ones((X.shape[1], 20))), layers=dict(float=X.astype(float), sparse=sparse.csr_matrix(X)), ) @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array], ids=["scipy-csr", "scipy-csc", "np-array", "dask_array"], ) def mtx_format(request): return request.param @pytest.fixture(params=[sparse.csr_matrix, sparse.csc_matrix]) def sparse_format(request): return request.param @pytest.fixture(params=["r+", "r", False]) def backed_mode(request): return request.param @pytest.fixture(params=(("X",), ())) def as_dense(request): return request.param # ------------------------------------------------------------------------------- # The test functions # ------------------------------------------------------------------------------- # h5py internally calls `product` on min-versions @pytest.mark.filterwarnings("ignore:`product` is deprecated as of NumPy 1.25.0") # TODO: Check to make sure obs, obsm, layers, ... 
are written and read correctly as well @pytest.mark.filterwarnings("error") def test_read_write_X(tmp_path, mtx_format, backed_mode, as_dense): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.h5ad" backed_pth = base_pth / "backed.h5ad" orig = ad.AnnData(mtx_format(asarray(sparse.random(10, 10, format="csr")))) orig.write(orig_pth) backed = ad.read_h5ad(orig_pth, backed=backed_mode) backed.write(backed_pth, as_dense=as_dense) backed.file.close() from_backed = ad.read_h5ad(backed_pth) assert np.all(asarray(orig.X) == asarray(from_backed.X)) # this is very similar to the views test @pytest.mark.filterwarnings("ignore::anndata.ImplicitModificationWarning") def test_backing(adata, tmp_path, backing_h5ad): assert not adata.isbacked adata.filename = backing_h5ad adata.write() assert not adata.file.is_open assert adata.isbacked assert adata[:, 0].is_view assert adata[:, 0].X.tolist() == np.reshape([1, 4, 7], (3, 1)).tolist() # this might give us a trouble as the user might not # know that the file is open again.... assert adata.file.is_open adata[:2, 0].X = [0, 0] assert adata[:, 0].X.tolist() == np.reshape([0, 0, 7], (3, 1)).tolist() adata_subset = adata[:2, [0, 1]] assert adata_subset.is_view subset_hash = joblib.hash(adata_subset) # cannot set view in backing mode... with pytest.raises(ValueError, match=r"pass a filename.*to_memory"): adata_subset.obs["foo"] = range(2) with pytest.raises(ValueError, match=r"pass a filename.*to_memory"): adata_subset.var["bar"] = -12 with pytest.raises(ValueError, match=r"pass a filename.*to_memory"): adata_subset.obsm["o2"] = np.ones((2, 2)) with pytest.raises(ValueError, match=r"pass a filename.*to_memory"): adata_subset.varm["v2"] = np.zeros((2, 2)) with pytest.raises(ValueError, match=r"pass a filename.*to_memory"): adata_subset.layers["float2"] = adata_subset.layers["float"].copy() # Things should stay the same after failed operations assert subset_hash == joblib.hash(adata_subset) assert adata_subset.is_view # need to copy first adata_subset = adata_subset.copy(tmp_path / "test.subset.h5ad") # now transition to actual object assert not adata_subset.is_view adata_subset.obs["foo"] = range(2) assert not adata_subset.is_view assert adata_subset.isbacked assert adata_subset.obs["foo"].tolist() == list(range(2)) # save adata_subset.write() def test_backing_copy(adata, tmp_path, backing_h5ad): adata.filename = backing_h5ad adata.write() copypath = tmp_path / "test.copy.h5ad" copy = adata.copy(copypath) assert adata.filename == backing_h5ad assert copy.filename == copypath assert adata.isbacked assert copy.isbacked # TODO: Also test updating the backing file inplace def test_backed_raw(tmp_path): backed_pth = tmp_path / "backed.h5ad" final_pth = tmp_path / "final.h5ad" mem_adata = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) mem_adata.raw = mem_adata mem_adata.write(backed_pth) backed_adata = ad.read_h5ad(backed_pth, backed="r") assert_equal(backed_adata, mem_adata) backed_adata.write_h5ad(final_pth) final_adata = ad.read_h5ad(final_pth) assert_equal(final_adata, mem_adata) @pytest.mark.parametrize( "array_type", [ pytest.param(asarray, id="dense_array"), pytest.param(sparse.csr_matrix, id="csr_matrix"), pytest.param(sparse.csr_array, id="csr_array"), ], ) def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): backed_pth = tmp_path / "backed.h5ad" final_pth = tmp_path / "final.h5ad" mem_adata = gen_adata((10, 10), X_type=array_type) mem_adata.raw = mem_adata obs_idx = subset_func(mem_adata.obs_names) var_idx = 
subset_func2(mem_adata.var_names) if ( array_type is asarray and isinstance(obs_idx, list | np.ndarray | CSMatrix | CSArray) and isinstance(var_idx, list | np.ndarray | CSMatrix | CSArray) ): pytest.xfail( "Fancy indexing does not work with multiple arrays on a h5py.Dataset" ) mem_adata.write(backed_pth) ### Backed view has same values as in memory view ### backed_adata = ad.read_h5ad(backed_pth, backed="r") backed_v = backed_adata[obs_idx, var_idx] assert backed_v.is_view mem_v = mem_adata[obs_idx, var_idx] # Value equivalent assert_equal(mem_v, backed_v) # Type and value equivalent assert_equal(mem_v.copy(), backed_v.to_memory(copy=True), exact=True) assert backed_v.is_view assert backed_v.isbacked ### Write from backed view ### backed_v.write_h5ad(final_pth) final_adata = ad.read_h5ad(final_pth) assert_equal(mem_v, final_adata) assert_equal(final_adata, backed_v.to_memory()) # assert loading into memory @pytest.mark.parametrize( "array_type", [ pytest.param(asarray, id="dense_array"), pytest.param(sparse.csr_matrix, id="csr_matrix"), pytest.param(as_dense_dask_array, id="dask_array"), ], ) def test_to_memory_full(tmp_path, array_type): backed_pth = tmp_path / "backed.h5ad" mem_adata = gen_adata((15, 10), X_type=array_type, **GEN_ADATA_DASK_ARGS) mem_adata.raw = gen_adata((15, 12), X_type=array_type, **GEN_ADATA_DASK_ARGS) mem_adata.write_h5ad(backed_pth, compression="lzf") backed_adata = ad.read_h5ad(backed_pth, backed="r") assert_equal(mem_adata, backed_adata.to_memory()) # Test that raw can be removed del backed_adata.raw del mem_adata.raw assert_equal(mem_adata, backed_adata.to_memory()) def test_double_index(adata, backing_h5ad): adata.filename = backing_h5ad with pytest.raises(ValueError, match=r"cannot make a view of a view"): # no view of view of backed object currently adata[:2][:, 0] # close backing file adata.write() def test_return_to_memory_mode(adata, backing_h5ad): bdata = adata.copy() adata.filename = backing_h5ad assert adata.isbacked adata.filename = None assert not adata.isbacked assert adata.X is not None # make sure the previous file had been properly closed # when setting `adata.filename = None` # if it hadn’t the following line would throw an error bdata.filename = backing_h5ad # close the file bdata.filename = None def test_backed_modification(adata, backing_h5ad): adata.X[:, 1] = 0 # Make it a little sparse adata.X = sparse.csr_matrix(adata.X) assert not adata.isbacked # While this currently makes the file backed, it doesn’t write it as sparse adata.filename = backing_h5ad adata.write() assert not adata.file.is_open assert adata.isbacked adata.X[0, [0, 2]] = 10 adata.X[1, [0, 2]] = [11, 12] adata.X[2, 1] = 13 # If it were written as sparse, this should fail assert adata.isbacked assert np.all(adata.X[0, :] == np.array([10, 0, 10])) assert np.all(adata.X[1, :] == np.array([11, 0, 12])) assert np.all(adata.X[2, :] == np.array([7, 13, 9])) def test_backed_modification_sparse(adata, backing_h5ad, sparse_format): adata.X[:, 1] = 0 # Make it a little sparse adata.X = sparse_format(adata.X) assert not adata.isbacked adata.write(backing_h5ad) adata = ad.read_h5ad(backing_h5ad, backed="r+") assert adata.filename == backing_h5ad assert adata.isbacked with pytest.warns( FutureWarning, match=r"__setitem__ for backed sparse will be removed" ): adata.X[0, [0, 2]] = 10 adata.X[1, [0, 2]] = [11, 12] with pytest.raises(ValueError, match=r"cannot change the sparsity structure"): adata.X[2, 1] = 13 assert adata.isbacked assert np.all(adata.X[0, :] == np.array([10, 0, 10])) assert 
np.all(adata.X[1, :] == np.array([11, 0, 12])) assert np.all(adata.X[2, :] == np.array([7, 0, 9])) # TODO: Work around h5py not supporting this # def test_backed_view_modification(adata, backing_h5ad): # adata.write(backing_h5ad) # backed_adata = ad.read_h5ad(backing_h5ad, backed=True) # backed_view = backed_adata[[1, 2], :] # backed_view.X = 0 # assert np.all(backed_adata.X[:3, :] == 0) # TODO: Implement # def test_backed_view_modification_sparse(adata, backing_h5ad, sparse_format): # adata[:, 1] = 0 # Make it a little sparse # adata.X = sparse_format(adata.X) # adata.write(backing_h5ad) # backed_adata = ad.read_h5ad(backing_h5ad, backed=True) # backed_view = backed_adata[[1,2], :] # backed_view.X = 0 # assert np.all(backed_adata.X[[1,2], :] == 0) python-anndata-0.12.0~rc1/tests/test_backed_sparse.py000066400000000000000000000550711500370632200226720ustar00rootroot00000000000000from __future__ import annotations from functools import partial from itertools import product from typing import TYPE_CHECKING, Literal, get_args import h5py import numpy as np import pytest import zarr from scipy import sparse import anndata as ad from anndata._core.anndata import AnnData from anndata._core.sparse_dataset import sparse_dataset from anndata._io.specs.registry import read_elem_lazy from anndata._io.zarr import open_write_group from anndata.compat import ( CSArray, CSMatrix, DaskArray, ZarrGroup, is_zarr_v2, ) from anndata.experimental import read_dispatched from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func if TYPE_CHECKING: from collections.abc import Callable, Generator, Sequence from pathlib import Path from types import EllipsisType from _pytest.mark import ParameterSet from numpy.typing import ArrayLike, NDArray from pytest_mock import MockerFixture from anndata.abc import CSCDataset, CSRDataset Idx = slice | int | NDArray[np.integer] | NDArray[np.bool_] subset_func2 = subset_func M = 50 N = 50 @pytest.fixture def zarr_metadata_key(): return ".zarray" if ad.settings.zarr_write_format == 2 else "zarr.json" @pytest.fixture def zarr_separator(): return "" if ad.settings.zarr_write_format == 2 else "/c" @pytest.fixture def ondisk_equivalent_adata( tmp_path: Path, diskfmt: Literal["h5ad", "zarr"] ) -> tuple[AnnData, AnnData, AnnData, AnnData]: csr_path = tmp_path / f"csr.{diskfmt}" csc_path = tmp_path / f"csc.{diskfmt}" dense_path = tmp_path / f"dense.{diskfmt}" write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs) csr_mem = ad.AnnData(X=sparse.random(M, N, format="csr", density=0.1)) csc_mem = ad.AnnData(X=csr_mem.X.tocsc()) dense_mem = ad.AnnData(X=csr_mem.X.toarray()) write(csr_mem, csr_path) write(csc_mem, csc_path) # write(csr_mem, dense_path, as_dense="X") write(dense_mem, dense_path) if diskfmt == "h5ad": csr_disk = ad.read_h5ad(csr_path, backed="r") csc_disk = ad.read_h5ad(csc_path, backed="r") dense_disk = ad.read_h5ad(dense_path, backed="r") else: def read_zarr_backed(path): path = str(path) f = zarr.open(path, mode="r") # Read with handling for backwards compat def callback(func, elem_name, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ k: read_dispatched(v, callback) for k, v in dict(elem).items() } ) if iospec.encoding_type in {"csc_matrix", "csr_matrix"}: return sparse_dataset(elem) return func(elem) adata = read_dispatched(f, callback=callback) return adata csr_disk = read_zarr_backed(csr_path) csc_disk = read_zarr_backed(csc_path) dense_disk = read_zarr_backed(dense_path) 
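        # The `read_zarr_backed` helper above keeps X on disk: its
        # `read_dispatched` callback intercepts groups encoded as
        # "csr_matrix"/"csc_matrix" and wraps them with `sparse_dataset(...)`,
        # while every other element falls through to the default reader and is
        # loaded into memory.  As a result `csr_disk.X`/`csc_disk.X` are backed
        # CSRDataset/CSCDataset objects, whereas `obs`, `var` etc. are ordinary
        # in-memory objects.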
return csr_mem, csr_disk, csc_disk, dense_disk @pytest.mark.parametrize( "empty_mask", [[], np.zeros(M, dtype=bool)], ids=["empty_list", "empty_bool_mask"] ) def test_empty_backed_indexing( ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], empty_mask, ): csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata assert_equal(csr_mem.X[empty_mask], csr_disk.X[empty_mask]) assert_equal(csr_mem.X[:, empty_mask], csc_disk.X[:, empty_mask]) # The following do not work because of https://github.com/scipy/scipy/issues/19919 # Our implementation returns a (0,0) sized matrix but scipy does (1,0). # assert_equal(csr_mem.X[empty_mask, empty_mask], csr_disk.X[empty_mask, empty_mask]) # assert_equal(csr_mem.X[empty_mask, empty_mask], csc_disk.X[empty_mask, empty_mask]) def test_backed_indexing( ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], subset_func, subset_func2, ): csr_mem, csr_disk, csc_disk, dense_disk = ondisk_equivalent_adata obs_idx = subset_func(csr_mem.obs_names) var_idx = subset_func2(csr_mem.var_names) assert_equal(csr_mem[obs_idx, var_idx].X, csr_disk[obs_idx, var_idx].X) assert_equal(csr_mem[obs_idx, var_idx].X, csc_disk[obs_idx, var_idx].X) assert_equal(csr_mem.X[...], csc_disk.X[...]) assert_equal(csr_mem[obs_idx, :].X, dense_disk[obs_idx, :].X) assert_equal(csr_mem[obs_idx].X, csr_disk[obs_idx].X) assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X) def test_backed_ellipsis_indexing( ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], ellipsis_index: tuple[EllipsisType | slice, ...] | EllipsisType, equivalent_ellipsis_index: tuple[slice, slice], ): csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata assert_equal(csr_mem.X[equivalent_ellipsis_index], csr_disk.X[ellipsis_index]) assert_equal(csr_mem.X[equivalent_ellipsis_index], csc_disk.X[ellipsis_index]) def make_randomized_mask(size: int) -> np.ndarray: randomized_mask = np.zeros(size, dtype=bool) inds = np.random.choice(size, 20, replace=False) inds.sort() for i in range(0, len(inds) - 1, 2): randomized_mask[inds[i] : inds[i + 1]] = True return randomized_mask def make_alternating_mask(size: int, step: int) -> np.ndarray: mask_alternating = np.ones(size, dtype=bool) for i in range(0, size, step): # 5 is too low to trigger new behavior mask_alternating[i] = False return mask_alternating # non-random indices, with alternating one false and n true make_alternating_mask_5 = partial(make_alternating_mask, step=5) make_alternating_mask_15 = partial(make_alternating_mask, step=15) def make_one_group_mask(size: int) -> np.ndarray: one_group_mask = np.zeros(size, dtype=bool) one_group_mask[1 : size // 2] = True return one_group_mask def make_one_elem_mask(size: int) -> np.ndarray: one_elem_mask = np.zeros(size, dtype=bool) one_elem_mask[size // 4] = True return one_elem_mask # test behavior from https://github.com/scverse/anndata/pull/1233 @pytest.mark.parametrize( ("make_bool_mask", "should_trigger_optimization"), [ (make_randomized_mask, None), (make_alternating_mask_15, True), (make_alternating_mask_5, False), (make_one_group_mask, True), (make_one_elem_mask, False), ], ids=["randomized", "alternating_15", "alternating_5", "one_group", "one_elem"], ) def test_consecutive_bool( mocker: MockerFixture, ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], make_bool_mask: Callable[[int], np.ndarray], should_trigger_optimization: bool | None, ): """Tests for optimization from https://github.com/scverse/anndata/pull/1233 Parameters ---------- mocker Mocker object 
ondisk_equivalent_adata AnnData objects with sparse X for testing make_bool_mask Function for creating a boolean mask. should_trigger_optimization Whether or not a given mask should trigger the optimized behavior. """ _, csr_disk, csc_disk, _ = ondisk_equivalent_adata mask = make_bool_mask(csr_disk.shape[0]) # indexing needs to be on `X` directly to trigger the optimization. # `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where` from anndata._core import sparse_dataset spy = mocker.spy(sparse_dataset, "get_compressed_vectors_for_slices") assert_equal(csr_disk.X[mask, :], csr_disk.X[np.where(mask)]) if should_trigger_optimization is not None: assert ( spy.call_count == 1 if should_trigger_optimization else not spy.call_count ) assert_equal(csc_disk.X[:, mask], csc_disk.X[:, np.where(mask)[0]]) if should_trigger_optimization is not None: assert ( spy.call_count == 2 if should_trigger_optimization else not spy.call_count ) assert_equal(csr_disk[mask, :], csr_disk[np.where(mask)]) if should_trigger_optimization is not None: assert ( spy.call_count == 3 if should_trigger_optimization else not spy.call_count ) subset = csc_disk[:, mask] assert_equal(subset, csc_disk[:, np.where(mask)[0]]) if should_trigger_optimization is not None: assert ( spy.call_count == 4 if should_trigger_optimization else not spy.call_count ) if should_trigger_optimization is not None and not csc_disk.isbacked: size = subset.shape[1] if should_trigger_optimization: subset_subset_mask = np.ones(size).astype("bool") subset_subset_mask[size // 2] = False else: subset_subset_mask = make_one_elem_mask(size) assert_equal( subset[:, subset_subset_mask], subset[:, np.where(subset_subset_mask)[0]] ) assert ( spy.call_count == 5 if should_trigger_optimization else not spy.call_count ), f"Actual count: {spy.call_count}" @pytest.mark.parametrize( ("sparse_format", "append_method"), [ pytest.param(sparse.csr_matrix, sparse.vstack), pytest.param(sparse.csc_matrix, sparse.hstack), pytest.param(sparse.csr_array, sparse.vstack), pytest.param(sparse.csc_array, sparse.hstack), ], ) def test_dataset_append_memory( tmp_path: Path, sparse_format: Callable[[ArrayLike], CSMatrix], append_method: Callable[[list[CSMatrix]], CSMatrix], diskfmt: Literal["h5ad", "zarr"], ): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" a = sparse_format(sparse.random(100, 100)) b = sparse_format(sparse.random(100, 100)) if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "mtx", a) diskmtx = sparse_dataset(f["mtx"]) diskmtx.append(b) fromdisk = diskmtx.to_memory() frommem = append_method([a, b]) assert_equal(fromdisk, frommem) def test_append_array_cache_bust(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" a = sparse.random(100, 100, format="csr") if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "mtx", a) ad.io.write_elem(f, "mtx_2", a) diskmtx = sparse_dataset(f["mtx"]) old_array_shapes = {} array_names = ["indptr", "indices", "data"] for name in array_names: old_array_shapes[name] = getattr(diskmtx, f"_{name}").shape diskmtx.append(sparse_dataset(f["mtx_2"])) for name in array_names: assert old_array_shapes[name] != getattr(diskmtx, f"_{name}").shape @pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix]) @pytest.mark.parametrize( ("subset_func", "subset_func2"), product( [ ad.tests.helpers.array_subset, 
ad.tests.helpers.slice_subset, ad.tests.helpers.array_int_subset, ad.tests.helpers.array_bool_subset, ], repeat=2, ), ) def test_read_array( tmp_path: Path, sparse_format: Callable[[ArrayLike], CSMatrix], diskfmt: Literal["h5ad", "zarr"], subset_func, subset_func2, ): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" a = sparse_format(sparse.random(100, 100)) obs_idx = subset_func(np.arange(100)) var_idx = subset_func2(np.arange(100)) if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "mtx", a) diskmtx = sparse_dataset(f["mtx"]) ad.settings.use_sparse_array_on_read = True assert issubclass(type(diskmtx[obs_idx, var_idx]), CSArray) ad.settings.use_sparse_array_on_read = False assert issubclass(type(diskmtx[obs_idx, var_idx]), CSMatrix) @pytest.mark.parametrize( ("sparse_format", "append_method"), [ pytest.param(sparse.csr_matrix, sparse.vstack), pytest.param(sparse.csc_matrix, sparse.hstack), ], ) def test_dataset_append_disk( tmp_path: Path, sparse_format: Callable[[ArrayLike], CSMatrix], append_method: Callable[[list[CSMatrix]], CSMatrix], diskfmt: Literal["h5ad", "zarr"], ): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" a = sparse_format(sparse.random(10, 10)) b = sparse_format(sparse.random(10, 10)) if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "a", a) ad.io.write_elem(f, "b", b) a_disk = sparse_dataset(f["a"]) b_disk = sparse_dataset(f["b"]) a_disk.append(b_disk) fromdisk = a_disk.to_memory() frommem = append_method([a, b]) assert_equal(fromdisk, frommem) @pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix]) def test_lazy_array_cache( tmp_path: Path, sparse_format: Callable[[ArrayLike], CSMatrix], zarr_metadata_key ): elems = {"indptr", "indices", "data"} path = tmp_path / "test.zarr" a = sparse_format(sparse.random(10, 10)) f = open_write_group(path, mode="a") ad.io.write_elem(f, "X", a) store = AccessTrackingStore(path) for elem in elems: store.initialize_key_trackers([f"X/{elem}"]) f = open_write_group(store, mode="a") a_disk = sparse_dataset(f["X"]) a_disk[:1] a_disk[3:5] a_disk[6:7] a_disk[8:9] # one each for .zarray and actual access # see https://github.com/zarr-developers/zarr-python/discussions/2760 for why 4 assert store.get_access_count("X/indptr") == 2 if is_zarr_v2() else 4 for elem_not_indptr in elems - {"indptr"}: assert ( sum( zarr_metadata_key in key_accessed for key_accessed in store.get_accessed_keys(f"X/{elem_not_indptr}") ) == 1 ) Kind = Literal["slice", "int", "array", "mask"] def mk_idx_kind(idx: Sequence[int], *, kind: Kind, l: int) -> Idx | None: """Convert sequence of consecutive integers (e.g. 
range with step=1) into different kinds of indexing.""" if kind == "slice": start = idx[0] if idx[0] > 0 else None if len(idx) == 1: return slice(start, idx[0] + 1) if all(np.diff(idx) == 1): stop = idx[-1] + 1 if idx[-1] < l - 1 else None return slice(start, stop) if kind == "int": if len(idx) == 1: return idx[0] if kind == "array": return np.asarray(idx) if kind == "mask": return np.isin(np.arange(l), idx) return None def idify(x: object) -> str: if isinstance(x, slice): start, stop = ("" if s is None else str(s) for s in (x.start, x.stop)) return f"{start}:{stop}" + (f":{x.step}" if x.step not in (1, None) else "") return str(x) def width_idx_kinds( *idxs: tuple[Sequence[int], Idx, Sequence[str]], l: int ) -> Generator[ParameterSet, None, None]: """Convert major (first) index into various identical kinds of indexing.""" for (idx_maj_raw, idx_min, exp), maj_kind in product(idxs, get_args(Kind)): if (idx_maj := mk_idx_kind(idx_maj_raw, kind=maj_kind, l=l)) is None: continue id_ = "-".join(map(idify, [idx_maj_raw, idx_min, maj_kind])) yield pytest.param(idx_maj, idx_min, exp, id=id_) @pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix]) @pytest.mark.parametrize( ("idx_maj", "idx_min", "exp"), width_idx_kinds( ( [0], slice(None, None), ["X/data/{zarr_metadata_key}", "X/data{zarr_separator}/0"], ), ( [0], slice(None, 3), ["X/data/{zarr_metadata_key}", "X/data{zarr_separator}/0"], ), ( [3, 4, 5], slice(None, None), [ "X/data/{zarr_metadata_key}", "X/data{zarr_separator}/3", "X/data{zarr_separator}/4", "X/data{zarr_separator}/5", ], ), l=10, ), ) @pytest.mark.parametrize( "open_func", [ sparse_dataset, lambda x: read_elem_lazy( x, chunks=(1, -1) if x.attrs["encoding-type"] == "csr_matrix" else (-1, 1) ), ], ids=["sparse_dataset", "read_elem_lazy"], ) def test_data_access( tmp_path: Path, sparse_format: Callable[[ArrayLike], CSMatrix], idx_maj: Idx, idx_min: Idx, exp: list[str], open_func: Callable[[ZarrGroup], CSRDataset | CSCDataset | DaskArray], zarr_metadata_key, zarr_separator, ): exp = [ e.format(zarr_metadata_key=zarr_metadata_key, zarr_separator=zarr_separator) for e in exp ] path = tmp_path / "test.zarr" a = sparse_format(np.eye(10, 10)) f = open_write_group(path, mode="a") ad.io.write_elem(f, "X", a) data = f["X/data"][...] 
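    # The expected key lists above are templates: `zarr_metadata_key` resolves
    # to ".zarray" (zarr v2) or "zarr.json" (zarr v3) and `zarr_separator` to ""
    # or "/c", matching how each format names metadata and chunk keys.  The data
    # array is rewritten below with `chunks=(1,)` so that every element lives in
    # its own chunk and the AccessTrackingStore counts one key access per
    # element actually read.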
del f["X/data"] # chunk one at a time to count properly zarr.array( data, store=path / "X" / "data", chunks=(1,), zarr_format=ad.settings.zarr_write_format, ) store = AccessTrackingStore(path) store.initialize_key_trackers(["X/data"]) f = zarr.open_group(store) a_disk = AnnData(X=open_func(f["X"])) if a.format == "csr": subset = a_disk[idx_maj, idx_min] else: subset = a_disk[idx_min, idx_maj] if isinstance(subset.X, DaskArray): subset.X.compute(scheduler="single-threaded") # zarr v2 fetches all and not just metadata for that node in 3.X.X python package # TODO: https://github.com/zarr-developers/zarr-python/discussions/2760 if ad.settings.zarr_write_format == 2 and not is_zarr_v2(): exp = exp + ["X/data/.zgroup", "X/data/.zattrs"] assert store.get_access_count("X/data") == len(exp), store.get_accessed_keys( "X/data" ) # dask access order is not guaranteed so need to sort assert sorted(store.get_accessed_keys("X/data")) == sorted(exp) @pytest.mark.parametrize( ("sparse_format", "a_shape", "b_shape"), [ pytest.param("csr", (100, 100), (100, 200)), pytest.param("csc", (100, 100), (200, 100)), ], ) def test_wrong_shape( tmp_path: Path, sparse_format: Literal["csr", "csc"], a_shape: tuple[int, int], b_shape: tuple[int, int], diskfmt: Literal["h5ad", "zarr"], ): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" a_mem = sparse.random(*a_shape, format=sparse_format) b_mem = sparse.random(*b_shape, format=sparse_format) if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "a", a_mem) ad.io.write_elem(f, "b", b_mem) a_disk = sparse_dataset(f["a"]) b_disk = sparse_dataset(f["b"]) with pytest.raises(AssertionError): a_disk.append(b_disk) def test_reset_group(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): path = tmp_path / "test.zarr" base = sparse.random(100, 100, format="csr") if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "base", base) disk_mtx = sparse_dataset(f["base"]) with pytest.raises(AttributeError): disk_mtx.group = f def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" base = sparse.random(100, 100, format="csr") if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "base", base) disk_mtx = sparse_dataset(f["base"]) pre_checks = disk_mtx.to_memory() with pytest.raises(ValueError, match="must have same format"): disk_mtx.append(sparse.random(100, 100, format="csc")) with pytest.raises(ValueError, match="must have same format"): disk_mtx.append(sparse.random(100, 100, format="coo")) with pytest.raises(NotImplementedError): disk_mtx.append(np.random.random((100, 100))) if isinstance(f, ZarrGroup) and not is_zarr_v2(): data = np.random.random((100, 100)) disk_dense = f.create_array("dense", shape=(100, 100), dtype=data.dtype) disk_dense[...] 
= data else: disk_dense = f.create_dataset( "dense", data=np.random.random((100, 100)), shape=(100, 100) ) with pytest.raises(NotImplementedError): disk_mtx.append(disk_dense) post_checks = disk_mtx.to_memory() # Check nothing changed assert not np.any((pre_checks != post_checks).toarray()) def test_anndata_sparse_compat(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" base = sparse.random(100, 100, format="csr") if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "/", base) adata = ad.AnnData(sparse_dataset(f["/"])) assert_equal(adata.X, base) def test_backed_sizeof( ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], ): csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata assert csr_mem.__sizeof__() == csr_disk.__sizeof__(with_disk=True) assert csr_mem.__sizeof__() == csc_disk.__sizeof__(with_disk=True) assert csr_disk.__sizeof__(with_disk=True) == csc_disk.__sizeof__(with_disk=True) assert csr_mem.__sizeof__() > csr_disk.__sizeof__() assert csr_mem.__sizeof__() > csc_disk.__sizeof__() @pytest.mark.parametrize( "group_fn", [ pytest.param(lambda _: zarr.group(), id="zarr"), pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"), ], ) @pytest.mark.parametrize( "sparse_class", [ sparse.csr_matrix, pytest.param( sparse.csr_array, marks=[pytest.mark.skip(reason="scipy bug causes view to be allocated")], ), ], ) def test_append_overflow_check(group_fn, sparse_class, tmp_path): group = group_fn(tmp_path) typemax_int32 = np.iinfo(np.int32).max orig_mtx = sparse_class(np.ones((1, 1), dtype=bool)) # Minimally allocating new matrix new_mtx = sparse_class( ( np.broadcast_to(True, typemax_int32 - 1), # noqa: FBT003 np.broadcast_to(np.int32(1), typemax_int32 - 1), [0, typemax_int32 - 1], ), shape=(1, 2), ) ad.io.write_elem(group, "mtx", orig_mtx) backed = sparse_dataset(group["mtx"]) # Checking for correct caching behaviour backed._indptr with pytest.raises( OverflowError, match=r"This array was written with a 32 bit intptr, but is now large.*", ): backed.append(new_mtx) # Check for any modification assert_equal(backed, orig_mtx) python-anndata-0.12.0~rc1/tests/test_base.py000066400000000000000000000632751500370632200210230ustar00rootroot00000000000000from __future__ import annotations import re import warnings from itertools import product from typing import TYPE_CHECKING import numpy as np import pandas as pd import pytest from numpy import ma from scipy import sparse as sp from scipy.sparse import csr_matrix, issparse import anndata as ad from anndata import AnnData, ImplicitModificationWarning from anndata._settings import settings from anndata.tests.helpers import assert_equal, gen_adata, get_multiindex_columns_df if TYPE_CHECKING: from pathlib import Path from typing import Literal # some test objects that we use below adata_dense = AnnData(np.array([[1, 2], [3, 4]])) adata_dense.layers["test"] = adata_dense.X adata_sparse = AnnData( csr_matrix([[0, 2, 3], [0, 5, 6]]), dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"]), ) def test_creation(): AnnData(np.array([[1, 2], [3, 4]])) AnnData(np.array([[1, 2], [3, 4]]), {}, {}) AnnData(ma.array([[1, 2], [3, 4]]), uns=dict(mask=[0, 1, 1, 0])) AnnData(sp.eye(2, format="csr")) AnnData(sp.csr_array([[1, 0], [0, 1]])) X = np.array([[1, 2, 3], [4, 5, 6]]) adata = AnnData( X=X, obs=dict(Obs=["A", "B"]), var=dict(Feat=["a", "b", "c"]), obsm=dict(X_pca=np.array([[1, 2], [3, 4]])), 
raw=dict(X=X, var=dict(var_names=["a", "b", "c"])), ) assert adata.raw.X.tolist() == X.tolist() assert adata.raw.var_names.tolist() == ["a", "b", "c"] # init with empty data matrix shape = (3, 5) adata = AnnData(None, uns=dict(test=np.array((3, 3))), shape=shape) assert adata.X is None assert adata.shape == shape assert "test" in adata.uns @pytest.mark.parametrize( ("src", "src_arg", "dim_msg"), [ pytest.param( "X", adata_dense.X, "`{dim}` must have as many rows as `X` has {mat_dim}s", id="x", ), pytest.param( "shape", (2, 2), "`shape` is inconsistent with `{dim}`", id="shape" ), ], ) @pytest.mark.parametrize("dim", ["obs", "var"]) @pytest.mark.parametrize( ("dim_arg", "msg"), [ pytest.param( lambda _: dict(TooLong=[1, 2, 3, 4]), "Length of values (4) does not match length of index (2)", id="too_long_col", ), pytest.param( lambda dim: {f"{dim}_names": ["a", "b", "c"]}, None, id="too_many_names" ), pytest.param( lambda _: pd.DataFrame(index=["a", "b", "c"]), None, id="too_long_df" ), ], ) def test_creation_error(src, src_arg, dim_msg, dim, dim_arg, msg: str | None): if msg is None: mat_dim = "row" if dim == "obs" else "column" msg = dim_msg.format(dim=dim, mat_dim=mat_dim) with pytest.raises(ValueError, match=re.escape(msg)): AnnData(**{src: src_arg, dim: dim_arg(dim)}) def test_invalid_X(): with pytest.raises( ValueError, match=r"X needs to be of one of .*not \.", ): AnnData("string is not a valid X") def test_create_with_dfs(): X = np.ones((6, 3)) obs = pd.DataFrame(dict(cat_anno=pd.Categorical(["a", "a", "a", "a", "b", "a"]))) obs_copy = obs.copy() adata = AnnData(X=X, obs=obs) assert obs.index.equals(obs_copy.index) assert obs.index.astype(str).equals(adata.obs.index) def test_create_from_df(): df = pd.DataFrame(np.ones((3, 2)), index=["a", "b", "c"], columns=["A", "B"]) ad = AnnData(df) assert df.values.tolist() == ad.X.tolist() assert df.columns.tolist() == ad.var_names.tolist() assert df.index.tolist() == ad.obs_names.tolist() @pytest.mark.parametrize("attr", ["X", "obs", "obsm"]) def test_error_create_from_multiindex_df(attr): df = get_multiindex_columns_df((100, 20)) val = df if attr != "obsm" else {"df": df} with pytest.raises(ValueError, match=r"MultiIndex columns are not supported"): AnnData(**{attr: val}, shape=(100, 10)) def test_create_from_sparse_df(): s = sp.random(20, 30, density=0.2, format="csr") obs_names = [f"obs{i}" for i in range(20)] var_names = [f"var{i}" for i in range(30)] df = pd.DataFrame.sparse.from_spmatrix(s, index=obs_names, columns=var_names) a = AnnData(df) b = AnnData(s, obs=pd.DataFrame(index=obs_names), var=pd.DataFrame(index=var_names)) assert_equal(a, b) assert issparse(a.X) def test_create_from_df_with_obs_and_var(): df = pd.DataFrame(np.ones((3, 2)), index=["a", "b", "c"], columns=["A", "B"]) obs = pd.DataFrame(np.ones((3, 1)), index=df.index, columns=["C"]) var = pd.DataFrame(np.ones((2, 1)), index=df.columns, columns=["D"]) ad = AnnData(df, obs=obs, var=var) assert df.values.tolist() == ad.X.tolist() assert df.columns.tolist() == ad.var_names.tolist() assert df.index.tolist() == ad.obs_names.tolist() assert obs.equals(ad.obs) assert var.equals(ad.var) with pytest.raises(ValueError, match=r"Index of obs must match index of X."): AnnData(df, obs=obs.reset_index()) with pytest.raises(ValueError, match=r"Index of var must match columns of X."): AnnData(df, var=var.reset_index()) def test_matching_int_index(): adata = AnnData( pd.DataFrame(dict(a=[0.0, 0.5]), index=[0, 1]), obs=pd.DataFrame(index=[0, 1]) ) 
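    # AnnData normalises integer indices to strings on construction, so the
    # matching int indices of `X` and `obs` both end up as the string index
    # asserted below.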
pd.testing.assert_index_equal(adata.obs_names, pd.Index(["0", "1"])) def test_from_df_and_dict(): df = pd.DataFrame(dict(a=[0.1, 0.2, 0.3], b=[1.1, 1.2, 1.3])) adata = AnnData(df, dict(species=pd.Categorical(["a", "b", "a"]))) assert adata.obs["species"].values.tolist() == ["a", "b", "a"] def test_df_warnings(): df = pd.DataFrame(dict(A=[1, 2, 3], B=[1.0, 2.0, 3.0]), index=["a", "b", "c"]) with pytest.warns(UserWarning, match=r"X.*dtype float64"): adata = AnnData(df) with pytest.warns(UserWarning, match=r"X.*dtype float64"): adata.X = df @pytest.mark.parametrize("attr", ["X", "layers", "obsm", "varm", "obsp", "varp"]) @pytest.mark.parametrize("when", ["init", "assign"]) def test_convert_matrix(attr, when): """Test that initializing or assigning aligned arrays to a np.matrix converts it.""" with warnings.catch_warnings(): warnings.filterwarnings( "ignore", r"the matrix.*not.*recommended", PendingDeprecationWarning ) mat = np.matrix([[1, 2], [3, 0]]) direct = attr in {"X"} with pytest.warns(ImplicitModificationWarning, match=r"np\.ndarray"): if when == "init": adata = ( AnnData(**{attr: mat}) if direct else AnnData(shape=(2, 2), **{attr: {"a": mat}}) ) elif when == "assign": adata = AnnData(shape=(2, 2)) if direct: setattr(adata, attr, mat) else: getattr(adata, attr)["a"] = mat else: raise ValueError(when) arr = getattr(adata, attr) if direct else getattr(adata, attr)["a"] assert isinstance(arr, np.ndarray), f"{arr} is not an array" assert not isinstance(arr, np.matrix), f"{arr} is still a matrix" def test_attr_deletion(): full = gen_adata((30, 30)) # Empty has just X, obs_names, var_names empty = AnnData(None, obs=full.obs[[]], var=full.var[[]]) for attr in ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"]: delattr(full, attr) assert_equal(getattr(full, attr), getattr(empty, attr)) assert_equal(full, empty, exact=True) def test_names(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(obs_names=["A", "B"]), dict(var_names=["a", "b", "c"]), ) assert adata.obs_names.tolist() == "A B".split() assert adata.var_names.tolist() == "a b c".split() adata = AnnData(np.array([[1, 2], [3, 4], [5, 6]]), var=dict(var_names=["a", "b"])) assert adata.var_names.tolist() == ["a", "b"] @pytest.mark.parametrize( ("names", "after"), [ pytest.param(["a", "b"], None, id="list"), pytest.param( pd.Series(["AAD", "CCA"], name="barcodes"), "barcodes", id="Series-str" ), pytest.param(pd.Series(["x", "y"], name=0), None, id="Series-int"), ], ) @pytest.mark.parametrize("attr", ["obs_names", "var_names"]) def test_setting_index_names(names, after, attr): adata = adata_dense.copy() assert getattr(adata, attr).name is None setattr(adata, attr, names) assert getattr(adata, attr).name == after if hasattr(names, "name"): assert names.name is not None # Testing for views new = adata[:, :] assert new.is_view setattr(new, attr, names) assert_equal(new, adata, exact=True) assert not new.is_view @pytest.mark.parametrize("attr", ["obs_names", "var_names"]) def test_setting_index_names_error(attr): orig = adata_sparse[:2, :2] adata = adata_sparse[:2, :2] assert getattr(adata, attr).name is None with pytest.raises(ValueError, match=rf"AnnData expects \.{attr[:3]}\.index\.name"): setattr(adata, attr, pd.Index(["x", "y"], name=0)) assert adata.is_view assert getattr(adata, attr).tolist() != ["x", "y"] assert getattr(adata, attr).tolist() == getattr(orig, attr).tolist() assert_equal(orig, adata, exact=True) @pytest.mark.parametrize("dim", ["obs", "var"]) def test_setting_dim_index(dim): index_attr = 
f"{dim}_names" mapping_attr = f"{dim}m" orig = gen_adata((5, 5)) orig.raw = orig.copy() curr = orig.copy() view = orig[:, :] new_idx = pd.Index(list("abcde"), name="letters") setattr(curr, index_attr, new_idx) pd.testing.assert_index_equal(getattr(curr, index_attr), new_idx) pd.testing.assert_index_equal(getattr(curr, mapping_attr)["df"].index, new_idx) pd.testing.assert_index_equal(getattr(curr, mapping_attr).dim_names, new_idx) pd.testing.assert_index_equal(curr.obs_names, curr.raw.obs_names) # Testing view behaviour setattr(view, index_attr, new_idx) assert not view.is_view pd.testing.assert_index_equal(getattr(view, index_attr), new_idx) pd.testing.assert_index_equal(getattr(view, mapping_attr)["df"].index, new_idx) pd.testing.assert_index_equal(getattr(view, mapping_attr).dim_names, new_idx) with pytest.raises(AssertionError): pd.testing.assert_index_equal( getattr(view, index_attr), getattr(orig, index_attr) ) assert_equal(view, curr, exact=True) # test case in #459 fake_m = pd.DataFrame(curr.X.T, index=getattr(curr, index_attr)) getattr(curr, mapping_attr)["df2"] = fake_m def test_indices_dtypes(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(obs_names=["A", "B"]), dict(var_names=["a", "b", "c"]), ) adata.obs_names = ["ö", "a"] assert adata.obs_names.tolist() == ["ö", "a"] def test_slicing(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) # assert adata[:, 0].X.tolist() == adata.X[:, 0].tolist() # No longer the case assert adata[0, 0].X.tolist() == np.reshape(1, (1, 1)).tolist() assert adata[0, :].X.tolist() == np.reshape([1, 2, 3], (1, 3)).tolist() assert adata[:, 0].X.tolist() == np.reshape([1, 4], (2, 1)).tolist() assert adata[:, [0, 1]].X.tolist() == [[1, 2], [4, 5]] assert adata[:, np.array([0, 2])].X.tolist() == [[1, 3], [4, 6]] assert adata[:, np.array([False, True, True])].X.tolist() == [ [2, 3], [5, 6], ] assert adata[:, 1:3].X.tolist() == [[2, 3], [5, 6]] assert adata[0:2, :][:, 0:2].X.tolist() == [[1, 2], [4, 5]] assert adata[0:1, :][:, 0:2].X.tolist() == np.reshape([1, 2], (1, 2)).tolist() assert adata[0, :][:, 0].X.tolist() == np.reshape(1, (1, 1)).tolist() assert adata[:, 0:2][0:2, :].X.tolist() == [[1, 2], [4, 5]] assert adata[:, 0:2][0:1, :].X.tolist() == np.reshape([1, 2], (1, 2)).tolist() assert adata[:, 0][0, :].X.tolist() == np.reshape(1, (1, 1)).tolist() def test_boolean_slicing(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) obs_selector = np.array([True, False], dtype=bool) vars_selector = np.array([True, False, False], dtype=bool) assert adata[obs_selector, :][:, vars_selector].X.tolist() == [[1]] assert adata[:, vars_selector][obs_selector, :].X.tolist() == [[1]] assert adata[obs_selector, :][:, 0].X.tolist() == [[1]] assert adata[:, 0][obs_selector, :].X.tolist() == [[1]] assert adata[0, :][:, vars_selector].X.tolist() == [[1]] assert adata[:, vars_selector][0, :].X.tolist() == [[1]] obs_selector = np.array([True, False], dtype=bool) vars_selector = np.array([True, True, False], dtype=bool) assert adata[obs_selector, :][:, vars_selector].X.tolist() == [[1, 2]] assert adata[:, vars_selector][obs_selector, :].X.tolist() == [[1, 2]] assert adata[obs_selector, :][:, 0:2].X.tolist() == [[1, 2]] assert adata[:, 0:2][obs_selector, :].X.tolist() == [[1, 2]] assert adata[0, :][:, vars_selector].X.tolist() == [[1, 2]] assert adata[:, vars_selector][0, :].X.tolist() == [[1, 2]] obs_selector = np.array([True, True], dtype=bool) vars_selector = np.array([True, True, False], dtype=bool) assert adata[obs_selector, :][:, vars_selector].X.tolist() == [ 
[1, 2], [4, 5], ] assert adata[:, vars_selector][obs_selector, :].X.tolist() == [ [1, 2], [4, 5], ] assert adata[obs_selector, :][:, 0:2].X.tolist() == [[1, 2], [4, 5]] assert adata[:, 0:2][obs_selector, :].X.tolist() == [[1, 2], [4, 5]] assert adata[0:2, :][:, vars_selector].X.tolist() == [[1, 2], [4, 5]] assert adata[:, vars_selector][0:2, :].X.tolist() == [[1, 2], [4, 5]] def test_oob_boolean_slicing(): len1, len2 = np.random.choice(100, 2, replace=False) with pytest.raises(IndexError) as e: AnnData(np.empty((len1, 100)))[np.random.randint(0, 2, len2, dtype=bool), :] assert str(len1) in str(e.value) assert str(len2) in str(e.value) len1, len2 = np.random.choice(100, 2, replace=False) with pytest.raises(IndexError) as e: AnnData(np.empty((100, len1)))[:, np.random.randint(0, 2, len2, dtype=bool)] assert str(len1) in str(e.value) assert str(len2) in str(e.value) def test_slicing_strings(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(obs_names=["A", "B"]), dict(var_names=["a", "b", "c"]), ) assert adata["A", "a"].X.tolist() == [[1]] assert adata["A", :].X.tolist() == [[1, 2, 3]] assert adata[:, "a"].X.tolist() == [[1], [4]] assert adata[:, ["a", "b"]].X.tolist() == [[1, 2], [4, 5]] assert adata[:, np.array(["a", "c"])].X.tolist() == [[1, 3], [4, 6]] assert adata[:, "b":"c"].X.tolist() == [[2, 3], [5, 6]] with pytest.raises(KeyError): _ = adata[:, "X"] with pytest.raises(KeyError): _ = adata["X", :] with pytest.raises(KeyError): _ = adata["A":"X", :] with pytest.raises(KeyError): _ = adata[:, "a":"X"] # Test if errors are helpful with pytest.raises(KeyError, match=r"not_in_var"): adata[:, ["A", "B", "not_in_var"]] with pytest.raises(KeyError, match=r"not_in_obs"): adata[["A", "B", "not_in_obs"], :] def test_slicing_series(): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6]]), dict(obs_names=["A", "B", "C"]), dict(var_names=["a", "b"]), ) df = pd.DataFrame(dict(a=["1", "2", "2"])) df1 = pd.DataFrame(dict(b=["1", "2"])) assert adata[df["a"].values == "2"].X.tolist() == adata[df["a"] == "2"].X.tolist() assert ( adata[:, df1["b"].values == "2"].X.tolist() == adata[:, df1["b"] == "2"].X.tolist() ) def test_strings_to_categoricals(): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"]) ) adata.strings_to_categoricals() assert adata.obs["k"].cat.categories.tolist() == ["a", "b"] def test_slicing_remove_unused_categories(): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"]) ) adata._sanitize() assert adata[2:4].obs["k"].cat.categories.tolist() == ["b"] def test_slicing_dont_remove_unused_categories(): with settings.override(remove_unused_categories=False): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"]) ) adata._sanitize() assert adata[2:4].obs["k"].cat.categories.tolist() == ["a", "b"] def test_no_uniqueness_check_gives_repeat_indices(): with settings.override(check_uniqueness=False): obs_names = ["0", "0", "1", "1"] with warnings.catch_warnings(): warnings.simplefilter("error") adata = AnnData( np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), obs=pd.DataFrame(index=obs_names), ) assert adata.obs_names.values.tolist() == obs_names def test_get_subset_annotation(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(S=["A", "B"]), dict(F=["a", "b", "c"]), ) assert adata[0, 0].obs["S"].tolist() == ["A"] assert adata[0, 0].var["F"].tolist() == ["a"] def test_append_col(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) adata.obs["new"] = [1, 2] # this worked in the 
initial AnnData, but not with a dataframe # adata.obs[['new2', 'new3']] = [['A', 'B'], ['c', 'd']] with pytest.raises( ValueError, match="Length of values.*does not match length of index" ): adata.obs["new4"] = "far too long".split() def test_delete_col(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2], o2=[3, 4])) assert ["o1", "o2"] == adata.obs_keys() del adata.obs["o1"] assert ["o2"] == adata.obs_keys() assert [3, 4] == adata.obs["o2"].tolist() def test_set_obs(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) adata.obs = pd.DataFrame(dict(a=[3, 4])) assert adata.obs_names.tolist() == [0, 1] with pytest.raises(ValueError, match="but this AnnData has shape"): adata.obs = pd.DataFrame(dict(a=[3, 4, 5])) with pytest.raises(ValueError, match="Can only assign pd.DataFrame"): adata.obs = dict(a=[1, 2]) def test_multicol(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) # 'c' keeps the columns as should be adata.obsm["c"] = np.array([[0.0, 1.0], [2, 3]]) assert adata.obsm_keys() == ["c"] assert adata.obsm["c"].tolist() == [[0.0, 1.0], [2, 3]] def test_n_obs(): adata = AnnData(np.array([[1, 2], [3, 4], [5, 6]])) assert adata.n_obs == 3 adata1 = adata[:2] assert adata1.n_obs == 2 def test_equality_comparisons(): adata1 = AnnData(np.array([[1, 2], [3, 4], [5, 6]])) adata2 = AnnData(np.array([[1, 2], [3, 4], [5, 6]])) with pytest.raises(NotImplementedError): adata1 == adata1 with pytest.raises(NotImplementedError): adata1 == adata2 with pytest.raises(NotImplementedError): adata1 != adata2 with pytest.raises(NotImplementedError): adata1 == 1 with pytest.raises(NotImplementedError): adata1 != 1 def test_rename_categories(): X = np.ones((6, 3)) obs = pd.DataFrame(dict(cat_anno=pd.Categorical(["a", "a", "a", "a", "b", "a"]))) adata = AnnData(X=X, obs=obs) adata.uns["tool"] = {} adata.uns["tool"]["cat_array"] = np.rec.fromarrays( [np.ones(2) for cat in adata.obs["cat_anno"].cat.categories], dtype=[(cat, "float32") for cat in adata.obs["cat_anno"].cat.categories], ) adata.uns["tool"]["params"] = dict(groupby="cat_anno") new_categories = ["c", "d"] with warnings.catch_warnings(): warnings.simplefilter("error") adata.rename_categories("cat_anno", new_categories) assert list(adata.obs["cat_anno"].cat.categories) == new_categories assert list(adata.uns["tool"]["cat_array"].dtype.names) == new_categories def test_pickle(): import pickle adata = AnnData() adata2 = pickle.loads(pickle.dumps(adata)) assert adata2.obsm.parent is adata2 def test_to_df_dense(): X_df = adata_dense.to_df() layer_df = adata_dense.to_df(layer="test") np.testing.assert_array_equal(adata_dense.layers["test"], layer_df.values) np.testing.assert_array_equal(adata_dense.X, X_df.values) pd.testing.assert_index_equal(X_df.columns, layer_df.columns) pd.testing.assert_index_equal(X_df.index, layer_df.index) def test_convenience(): adata = adata_sparse.copy() adata.layers["x2"] = adata.X * 2 adata.var["anno2"] = ["p1", "p2", "p3"] adata.raw = adata.copy() adata.X = adata.X / 2 adata_dense = adata.copy() adata_dense.X = adata_dense.X.toarray() def assert_same_op_result(a1, a2, op): r1 = op(a1) r2 = op(a2) assert np.all(r1 == r2) assert type(r1) is type(r2) assert np.allclose(adata.obs_vector("b"), np.array([1.0, 2.5])) assert np.allclose(adata.raw.obs_vector("c"), np.array([3, 6])) assert np.all(adata.obs_vector("anno1") == np.array(["c1", "c2"])) assert np.allclose(adata.var_vector("s1"), np.array([0, 1.0, 1.5])) assert np.allclose(adata.raw.var_vector("s2"), np.array([0, 5, 6])) for obs_k, layer in product(["a", "b", 
"c", "anno1"], [None, "x2"]): assert_same_op_result( adata, adata_dense, lambda x: x.obs_vector(obs_k, layer=layer) ) for obs_k in ["a", "b", "c"]: assert_same_op_result(adata, adata_dense, lambda x: x.raw.obs_vector(obs_k)) for var_k, layer in product(["s1", "s2", "anno2"], [None, "x2"]): assert_same_op_result( adata, adata_dense, lambda x: x.var_vector(var_k, layer=layer) ) for var_k in ["s1", "s2", "anno2"]: assert_same_op_result(adata, adata_dense, lambda x: x.raw.var_vector(var_k)) def test_1d_slice_dtypes(): N, M = 10, 20 obs_df = pd.DataFrame( dict( cat=pd.Categorical(np.arange(N, dtype=int)), int=np.arange(N, dtype=int), float=np.arange(N, dtype=float), obj=[str(i) for i in np.arange(N, dtype=int)], ), index=[f"cell{i}" for i in np.arange(N, dtype=int)], ) var_df = pd.DataFrame( dict( cat=pd.Categorical(np.arange(M, dtype=int)), int=np.arange(M, dtype=int), float=np.arange(M, dtype=float), obj=[str(i) for i in np.arange(M, dtype=int)], ), index=[f"gene{i}" for i in np.arange(M, dtype=int)], ) adata = AnnData(X=np.random.random((N, M)), obs=obs_df, var=var_df) new_obs_df = pd.DataFrame(index=adata.obs_names) for k in obs_df.columns: new_obs_df[k] = adata.obs_vector(k) assert new_obs_df[k].dtype == obs_df[k].dtype assert np.all(new_obs_df == obs_df) new_var_df = pd.DataFrame(index=adata.var_names) for k in var_df.columns: new_var_df[k] = adata.var_vector(k) assert new_var_df[k].dtype == var_df[k].dtype assert np.all(new_var_df == var_df) def test_to_df_sparse(): X = adata_sparse.X.toarray() df = adata_sparse.to_df() assert df.values.tolist() == X.tolist() def test_to_df_no_X(): adata = AnnData( obs=pd.DataFrame(index=[f"cell-{i:02}" for i in range(20)]), var=pd.DataFrame(index=[f"gene-{i:02}" for i in range(30)]), layers={"present": np.ones((20, 30))}, ) v = adata[:10] with pytest.raises(ValueError, match=r"X is None"): _ = adata.to_df() with pytest.raises(ValueError, match=r"X is None"): _ = v.to_df() expected = pd.DataFrame( np.ones(adata.shape), index=adata.obs_names, columns=adata.var_names ) actual = adata.to_df(layer="present") pd.testing.assert_frame_equal(actual, expected) view_expected = pd.DataFrame( np.ones(v.shape), index=v.obs_names, columns=v.var_names ) view_actual = v.to_df(layer="present") pd.testing.assert_frame_equal(view_actual, view_expected) def test_copy(): adata_copy = adata_sparse.copy() def assert_eq_not_id(a, b): assert a is not b assert issparse(a) == issparse(b) if issparse(a): assert np.all(a.data == b.data) assert np.all(a.indices == b.indices) assert np.all(a.indptr == b.indptr) else: assert np.all(a == b) assert adata_sparse is not adata_copy assert_eq_not_id(adata_sparse.X, adata_copy.X) for attr in "layers var obs obsm varm".split(): map_sprs = getattr(adata_sparse, attr) map_copy = getattr(adata_copy, attr) assert map_sprs is not map_copy if attr not in {"obs", "var"}: # check that we don’t create too many references assert getattr(adata_copy, f"_{attr}") is map_copy._data assert_eq_not_id(map_sprs.keys(), map_copy.keys()) for key in map_sprs.keys(): assert_eq_not_id(map_sprs[key], map_copy[key]) def test_to_memory_no_copy(): adata = gen_adata((3, 5)) mem = adata.to_memory() assert mem.X is adata.X # Currently does not hold for `obs`, `var`, but should in future for key in adata.layers: assert mem.layers[key] is adata.layers[key] for key in adata.obsm: assert mem.obsm[key] is adata.obsm[key] for key in adata.varm: assert mem.varm[key] is adata.varm[key] for key in adata.obsp: assert mem.obsp[key] is adata.obsp[key] for key in adata.varp: assert 
mem.varp[key] is adata.varp[key] @pytest.mark.parametrize("axis", ["obs", "var"]) @pytest.mark.parametrize("elem_type", ["p", "m"]) def test_create_adata_from_single_axis_elem( axis: Literal["obs", "var"], elem_type: Literal["m", "p"], tmp_path: Path ): d = dict( a=np.zeros((10, 10)), ) in_memory = AnnData(**{f"{axis}{elem_type}": d}) assert in_memory.shape == (10, 0) if axis == "obs" else (0, 10) in_memory.write_h5ad(tmp_path / "adata.h5ad") from_disk = ad.read_h5ad(tmp_path / "adata.h5ad") ad.tests.helpers.assert_equal(from_disk, in_memory) python-anndata-0.12.0~rc1/tests/test_concatenate.py000066400000000000000000001521031500370632200223620ustar00rootroot00000000000000from __future__ import annotations import warnings from collections.abc import Hashable from contextlib import nullcontext from copy import deepcopy from functools import partial, singledispatch from itertools import chain, permutations, product from operator import attrgetter from typing import TYPE_CHECKING import numpy as np import pandas as pd import pytest import scipy from boltons.iterutils import default_exit, remap, research from numpy import ma from packaging.version import Version from scipy import sparse from anndata import AnnData, Raw, concat from anndata._core import merge from anndata._core.index import _subset from anndata.compat import AwkArray, CSArray, CSMatrix, CupySparseMatrix, DaskArray from anndata.tests import helpers from anndata.tests.helpers import ( BASE_MATRIX_PARAMS, CUPY_MATRIX_PARAMS, DASK_MATRIX_PARAMS, DEFAULT_COL_TYPES, GEN_ADATA_DASK_ARGS, as_dense_dask_array, assert_equal, gen_adata, gen_vstr_recarray, ) from anndata.utils import asarray if TYPE_CHECKING: from collections.abc import Callable from typing import Any, Literal mark_legacy_concatenate = pytest.mark.filterwarnings( r"ignore:.*AnnData\.concatenate is deprecated:FutureWarning" ) @singledispatch def filled_like(a, fill_value=None): raise NotImplementedError() @filled_like.register(np.ndarray) def _filled_array_np(a, fill_value=None): if fill_value is None: fill_value = np.nan return np.broadcast_to(fill_value, a.shape) @filled_like.register(DaskArray) def _filled_array(a, fill_value=None): return as_dense_dask_array(_filled_array_np(a, fill_value)) @filled_like.register(CSMatrix) def _filled_sparse(a, fill_value=None): if fill_value is None: return sparse.csr_matrix(a.shape) else: return sparse.csr_matrix(np.broadcast_to(fill_value, a.shape)) @filled_like.register(CSArray) def _filled_sparse_array(a, fill_value=None): return sparse.csr_array(filled_like(sparse.csr_matrix(a))) @filled_like.register(pd.DataFrame) def _filled_df(a, fill_value=np.nan): # dtype from pd.concat can be unintuitive, this returns something close enough return a.loc[[], :].reindex(index=a.index, fill_value=fill_value) def check_filled_like(x, fill_value=None, elem_name=None): if fill_value is None: assert_equal(x, filled_like(x), elem_name=elem_name) else: assert_equal(x, filled_like(x, fill_value=fill_value), elem_name=elem_name) def make_idx_tuple(idx, axis): tup = [slice(None), slice(None)] tup[axis] = idx return tuple(tup) # Will call func(sparse_matrix) so these types should be sparse compatible # See array_type if only dense arrays are expected as input. 
@pytest.fixture(params=BASE_MATRIX_PARAMS + DASK_MATRIX_PARAMS + CUPY_MATRIX_PARAMS) def array_type(request): return request.param @pytest.fixture(params=BASE_MATRIX_PARAMS + DASK_MATRIX_PARAMS) def cpu_array_type(request): return request.param @pytest.fixture(params=["inner", "outer"]) def join_type(request): return request.param @pytest.fixture(params=[0, np.nan, np.pi]) def fill_val(request): return request.param @pytest.fixture(params=["obs", "var"]) def axis_name(request) -> Literal["obs", "var"]: return request.param @pytest.fixture(params=list(merge.MERGE_STRATEGIES.keys())) def merge_strategy(request): return request.param def fix_known_differences( orig: AnnData, result: AnnData, *, backwards_compat: bool = True ): """ Helper function for reducing anndata's to only the elements we expect to be equivalent after concatenation. Only for the case where orig is the ground truth result of what concatenation should be. If backwards_compat, checks against what `AnnData.concatenate` could do. Otherwise checks for `concat`. """ orig = orig.copy() result = result.copy() result.strings_to_categoricals() # Should this be implicit in concatenation? # TODO # * merge varm, varp similar to uns # * merge obsp, but some information should be lost del orig.obsp # TODO if backwards_compat: del orig.varm del orig.varp result.obs.drop(columns=["batch"], inplace=True) # Possibly need to fix this, ordered categoricals lose orderedness for get_df in [lambda k: getattr(k, "obs"), lambda k: getattr(k, "obsm")["df"]]: str_to_df_converted = get_df(result) for k, dtype in get_df(orig).dtypes.items(): if isinstance(dtype, pd.CategoricalDtype) and dtype.ordered: str_to_df_converted[k] = str_to_df_converted[k].astype(dtype) return orig, result def test_concat_interface_errors(): adatas = [gen_adata((5, 10)), gen_adata((5, 10))] with pytest.raises(ValueError, match="`axis` must be.*0, 1, 'obs', or 'var'"): concat(adatas, axis=3) with pytest.raises(ValueError, match="'inner' or 'outer'"): concat(adatas, join="not implemented") with pytest.raises(ValueError, match="No objects to concatenate"): concat([]) @mark_legacy_concatenate @pytest.mark.parametrize( ("concat_func", "backwards_compat"), [ (partial(concat, merge="unique"), False), (lambda x, **kwargs: x[0].concatenate(x[1:], **kwargs), True), ], ) def test_concatenate_roundtrip(join_type, array_type, concat_func, backwards_compat): adata = gen_adata((100, 10), X_type=array_type, **GEN_ADATA_DASK_ARGS) remaining = adata.obs_names subsets = [] while len(remaining) > 0: n = min(len(remaining), np.random.choice(50)) subset_idx = np.random.choice(remaining, n, replace=False) subsets.append(adata[subset_idx]) remaining = remaining.difference(subset_idx) result = concat_func(subsets, join=join_type, uns_merge="same", index_unique=None) # Correcting for known differences orig, result = fix_known_differences( adata, result, backwards_compat=backwards_compat ) assert_equal(result[orig.obs_names].copy(), orig) base_type = type(orig.X) if sparse.issparse(orig.X): base_type = CSArray if isinstance(orig.X, CSArray) else CSMatrix if isinstance(orig.X, CupySparseMatrix): base_type = CupySparseMatrix assert isinstance(result.X, base_type) @mark_legacy_concatenate def test_concatenate_dense(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2, X_3=X3), 
layers=dict(Xs=X1), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2, X_3=X3), layers={"Xs": X2}, ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2), layers=dict(Xs=X3), ) # inner join adata = adata1.concatenate(adata2, adata3) X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]] assert adata.X.astype(int).tolist() == X_combined assert adata.layers["Xs"].astype(int).tolist() == X_combined assert adata.obs_keys() == ["anno1", "anno2", "batch"] assert adata.var_keys() == ["annoA-0", "annoA-1", "annoB-2"] assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]] assert adata.obsm_keys() == ["X_1", "X_2"] assert adata.obsm["X_1"].tolist() == np.concatenate([X1, X1, X1]).tolist() # with batch_key and batch_categories adata = adata1.concatenate(adata2, adata3, batch_key="batch1") assert adata.obs_keys() == ["anno1", "anno2", "batch1"] adata = adata1.concatenate(adata2, adata3, batch_categories=["a1", "a2", "a3"]) assert adata.obs["batch"].cat.categories.tolist() == ["a1", "a2", "a3"] assert adata.var_names.tolist() == ["b", "c"] # outer join adata = adata1.concatenate(adata2, adata3, join="outer") X_ref = np.array( [ [1.0, 2.0, 3.0, np.nan], [4.0, 5.0, 6.0, np.nan], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0], ] ) np.testing.assert_equal(adata.X, X_ref) var_ma = ma.masked_invalid(adata.var.values.tolist()) var_ma_ref = ma.masked_invalid( np.array( [ [0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [np.nan, 0.0, 0.0], ] ) ) assert np.array_equal(var_ma.mask, var_ma_ref.mask) assert np.allclose(var_ma.compressed(), var_ma_ref.compressed()) @mark_legacy_concatenate def test_concatenate_layers(array_type, join_type): adatas = [] for _ in range(5): a = array_type(sparse.random(100, 200, format="csr")) adatas.append(AnnData(X=a, layers={"a": a})) merged = adatas[0].concatenate(adatas[1:], join=join_type) assert_equal(merged.X, merged.layers["a"]) @pytest.fixture def obsm_adatas(): def gen_index(n): return [f"cell{i}" for i in range(n)] return [ AnnData( X=sparse.csr_matrix((3, 5)), obs=pd.DataFrame(index=gen_index(3)), obsm={ "dense": np.arange(6).reshape(3, 2), "sparse": sparse.csr_matrix(np.arange(6).reshape(3, 2)), "df": pd.DataFrame( { "a": np.arange(3), "b": list("abc"), "c": pd.Categorical(list("aab")), }, index=gen_index(3), ), }, ), AnnData( X=sparse.csr_matrix((4, 10)), obs=pd.DataFrame(index=gen_index(4)), obsm=dict( dense=np.arange(12).reshape(4, 3), df=pd.DataFrame(dict(a=np.arange(3, 7)), index=gen_index(4)), ), ), AnnData( X=sparse.csr_matrix((2, 100)), obs=pd.DataFrame(index=gen_index(2)), obsm={ "sparse": np.arange(8).reshape(2, 4), "dense": np.arange(4, 8).reshape(2, 2), "df": pd.DataFrame( { "a": np.arange(7, 9), "b": list("cd"), "c": pd.Categorical(list("ab")), }, index=gen_index(2), ), }, ), ] @mark_legacy_concatenate def test_concatenate_obsm_inner(obsm_adatas): adata = obsm_adatas[0].concatenate(obsm_adatas[1:], join="inner") assert set(adata.obsm.keys()) == {"dense", "df"} assert adata.obsm["dense"].shape == (9, 2) assert adata.obsm["dense"].tolist() == [ [0, 1], [2, 3], [4, 5], [0, 1], [3, 4], [6, 7], [9, 10], [4, 5], [6, 7], ] assert adata.obsm["df"].columns == ["a"] assert adata.obsm["df"]["a"].tolist() == list(range(9)) # fmt: off true_df = ( pd.concat([a.obsm["df"] for a in obsm_adatas], join="inner") 
.reset_index(drop=True) ) # fmt: on cur_df = adata.obsm["df"].reset_index(drop=True) pd.testing.assert_frame_equal(true_df, cur_df) @mark_legacy_concatenate def test_concatenate_obsm_outer(obsm_adatas, fill_val): outer = obsm_adatas[0].concatenate( obsm_adatas[1:], join="outer", fill_value=fill_val ) inner = obsm_adatas[0].concatenate(obsm_adatas[1:], join="inner") for k, inner_v in inner.obsm.items(): assert np.array_equal( _subset(outer.obsm[k], (slice(None), slice(None, inner_v.shape[1]))), inner_v, ) assert set(outer.obsm.keys()) == {"dense", "df", "sparse"} assert isinstance(outer.obsm["dense"], np.ndarray) np.testing.assert_equal( outer.obsm["dense"], np.array( [ [0, 1, fill_val], [2, 3, fill_val], [4, 5, fill_val], [0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [4, 5, fill_val], [6, 7, fill_val], ] ), ) assert isinstance(outer.obsm["sparse"], CSMatrix) np.testing.assert_equal( outer.obsm["sparse"].toarray(), np.array( [ [0, 1, fill_val, fill_val], [2, 3, fill_val, fill_val], [4, 5, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [0, 1, 2, 3], [4, 5, 6, 7], ] ), ) # fmt: off true_df = ( pd.concat([a.obsm["df"] for a in obsm_adatas], join="outer") .reset_index(drop=True) ) # fmt: on cur_df = outer.obsm["df"].reset_index(drop=True) pd.testing.assert_frame_equal(true_df, cur_df) @pytest.mark.parametrize( ("axis", "axis_name"), [("obs", 0), ("var", 1)], ) def test_concat_axis_param(axis, axis_name): a, b = gen_adata((10, 10)), gen_adata((10, 10)) assert_equal(concat([a, b], axis=axis), concat([a, b], axis=axis_name)) def test_concat_annot_join(obsm_adatas, join_type): adatas = [ AnnData(sparse.csr_matrix(a.shape), obs=a.obsm["df"], var=a.var) for a in obsm_adatas ] pd.testing.assert_frame_equal( concat(adatas, join=join_type).obs, pd.concat([a.obs for a in adatas], join=join_type), ) @mark_legacy_concatenate def test_concatenate_layers_misaligned(array_type, join_type): adatas = [] for _ in range(5): a = array_type(sparse.random(100, 200, format="csr")) adata = AnnData(X=a, layers={"a": a}) adatas.append( adata[:, np.random.choice(adata.var_names, 150, replace=False)].copy() ) merged = adatas[0].concatenate(adatas[1:], join=join_type) assert_equal(merged.X, merged.layers["a"]) @mark_legacy_concatenate def test_concatenate_layers_outer(array_type, fill_val): # Testing that issue #368 is fixed a = AnnData( X=np.ones((10, 20)), layers={"a": array_type(sparse.random(10, 20, format="csr"))}, ) b = AnnData(X=np.ones((10, 20))) c = a.concatenate(b, join="outer", fill_value=fill_val, batch_categories=["a", "b"]) np.testing.assert_array_equal( asarray(c[c.obs["batch"] == "b"].layers["a"]), fill_val ) @mark_legacy_concatenate def test_concatenate_fill_value(fill_val): def get_obs_els(adata): return { "X": adata.X, **{f"layer_{k}": adata.layers[k] for k in adata.layers}, **{f"obsm_{k}": adata.obsm[k] for k in adata.obsm}, } adata1 = gen_adata((10, 10)) adata1.obsm = { k: v for k, v in adata1.obsm.items() if not isinstance(v, pd.DataFrame | AwkArray) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() if not isinstance(v, pd.DataFrame | AwkArray) } adata3 = gen_adata((7, 3)) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() if not isinstance(v, pd.DataFrame | AwkArray) } # remove AwkArrays from adata.var, as outer joins are not yet implemented for them for tmp_ad in [adata1, adata2, 
adata3]: for k in [k for k, v in tmp_ad.varm.items() if isinstance(v, AwkArray)]: del tmp_ad.varm[k] joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) ptr = 0 for orig in [adata1, adata2, adata3]: cur = joined[ptr : ptr + orig.n_obs] cur_els = get_obs_els(cur) orig_els = get_obs_els(orig) for k, cur_v in cur_els.items(): orig_v = orig_els.get(k, sparse.csr_matrix((orig.n_obs, 0))) assert_equal(cur_v[:, : orig_v.shape[1]], orig_v) np.testing.assert_equal(asarray(cur_v[:, orig_v.shape[1] :]), fill_val) ptr += orig.n_obs @mark_legacy_concatenate def test_concatenate_dense_duplicates(): X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) # inner join duplicates adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict( var_names=["a", "b", "c"], annoA=[0, 1, 2], annoB=[1.1, 1.0, 2.0], annoC=[1.1, 1.0, 2.0], annoD=[2.1, 2.0, 3.0], ), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict( var_names=["a", "b", "c"], annoA=[0, 1, 2], annoB=[1.1, 1.0, 2.0], annoC=[1.1, 1.0, 2.0], annoD=[2.1, 2.0, 3.0], ), ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), dict( var_names=["a", "b", "c"], annoA=[0, 1, 2], annoB=[1.1, 1.0, 2.0], annoD=[2.1, 2.0, 3.1], ), ) adata = adata1.concatenate(adata2, adata3) assert adata.var_keys() == [ "annoA", "annoB", "annoC-0", "annoD-0", "annoC-1", "annoD-1", "annoD-2", ] @mark_legacy_concatenate def test_concatenate_sparse(): # sparse data from scipy.sparse import csr_matrix X1 = csr_matrix([[0, 2, 3], [0, 5, 6]]) X2 = csr_matrix([[0, 2, 3], [0, 5, 6]]) X3 = csr_matrix([[1, 2, 0], [0, 5, 6]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"]), layers=dict(Xs=X1), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"]), layers=dict(Xs=X2), ) adata3 = AnnData( X3, dict(obs_names=["s5", "s6"], anno2=["d3", "d4"]), dict(var_names=["d", "c", "b"]), layers=dict(Xs=X3), ) # inner join adata = adata1.concatenate(adata2, adata3) X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]] assert adata.X.toarray().astype(int).tolist() == X_combined assert adata.layers["Xs"].toarray().astype(int).tolist() == X_combined # outer join adata = adata1.concatenate(adata2, adata3, join="outer") assert adata.X.toarray().tolist() == [ [0.0, 2.0, 3.0, 0.0], [0.0, 5.0, 6.0, 0.0], [0.0, 3.0, 2.0, 0.0], [0.0, 6.0, 5.0, 0.0], [0.0, 0.0, 2.0, 1.0], [0.0, 6.0, 5.0, 0.0], ] @mark_legacy_concatenate def test_concatenate_mixed(): X1 = sparse.csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]])) X2 = sparse.csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])) X3 = sparse.csr_matrix(np.array([[1, 0, 3], [0, 0, 6], [0, 8, 0]])) X4 = np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2", "s3"], anno1=["c1", "c2", "c3"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), layers=dict(counts=X1), ) adata2 = AnnData( X2, dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), layers=dict(counts=X4), # sic ) adata3 = AnnData( X3, dict(obs_names=["s7", "s8", "s9"], anno2=["d3", "d4", "d5"]), dict(var_names=["d", "c", "b"], annoA=[0, 2, 3], annoB=[0, 1, 2]), layers=dict(counts=X3), ) adata4 = AnnData( X4, dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), layers=dict(counts=X2), # sic ) 
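# Note: the "# sic" layers above are intentional: adata2 pairs a sparse X with a
# dense "counts" layer, and adata4 pairs a dense X with a sparse "counts" layer.
# The concatenation below therefore has to reconcile mixed sparse/dense inputs,
# and the assertions that follow check that both X and the layer come back sparse.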
adata_all = AnnData.concatenate(adata1, adata2, adata3, adata4) assert isinstance(adata_all.X, sparse.csr_matrix) assert isinstance(adata_all.layers["counts"], sparse.csr_matrix) @mark_legacy_concatenate def test_concatenate_with_raw(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) X4 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), layers=dict(Xs=X1), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), layers=dict(Xs=X2), ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]), layers=dict(Xs=X3), ) adata4 = AnnData( X4, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c", "z"], annoA=[0, 1, 2, 3]), layers=dict(Xs=X4), ) adata1.raw = adata1.copy() adata2.raw = adata2.copy() adata3.raw = adata3.copy() adata_all = AnnData.concatenate(adata1, adata2, adata3) assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == {"b", "c"} assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(adata_all.raw.X, adata_all.X) adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcd") assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X)) adata3.raw = adata4.copy() adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcdz") assert set(adata_all.var_names) == set("abcd") assert not np.array_equal( np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X) ) del adata3.raw with pytest.warns( UserWarning, match=( "Only some AnnData objects have `.raw` attribute, " "not concatenating `.raw` attributes." 
), ): adata_all = AnnData.concatenate(adata1, adata2, adata3) assert adata_all.raw is None del adata1.raw del adata2.raw assert all(_adata.raw is None for _adata in (adata1, adata2, adata3)) adata_all = AnnData.concatenate(adata1, adata2, adata3) assert adata_all.raw is None def test_concatenate_awkward(join_type): import awkward as ak a = ak.Array([[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]]) b = ak.Array( [ [{"a": 4}, {"a": 5}], [{"a": 6}], [{"a": 7}], ] ) adata_a = AnnData(np.zeros((2, 0), dtype=float), obsm={"awk": a}) adata_b = AnnData(np.zeros((3, 0), dtype=float), obsm={"awk": b}) if join_type == "inner": expected = ak.Array( [ [{"a": 1}], [{"a": 2}, {"a": 3}], [{"a": 4}, {"a": 5}], [{"a": 6}], [{"a": 7}], ] ) elif join_type == "outer": # TODO: This is what we would like to return, but waiting on: # * https://github.com/scikit-hep/awkward/issues/2182 and awkward 2.1.0 # * https://github.com/scikit-hep/awkward/issues/2173 # expected = ak.Array( # [ # [{"a": 1, "b": "foo"}], # [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], # [{"a": 4, "b": None}, {"a": 5, "b": None}], # [{"a": 6, "b": None}], # [{"a": 7, "b": None}], # ] # ) expected = ak.concatenate( [ # I don't think I can construct a UnionArray directly ak.Array( [ [{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], ] ), ak.Array( [ [{"a": 4}, {"a": 5}], [{"a": 6}], [{"a": 7}], ] ), ] ) result = concat([adata_a, adata_b], join=join_type).obsm["awk"] assert_equal(expected, result) @pytest.mark.parametrize( "other", [ pd.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]}, index=list("cde")), np.ones((3, 2)), sparse.random(3, 100, format="csr"), ], ) def test_awkward_does_not_mix(join_type, other): import awkward as ak awk = ak.Array( [[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]] ) adata_a = AnnData( np.zeros((2, 3), dtype=float), obs=pd.DataFrame(index=list("ab")), obsm={"val": awk}, ) adata_b = AnnData( np.zeros((3, 3), dtype=float), obs=pd.DataFrame(index=list("cde")), obsm={"val": other}, ) with pytest.raises( NotImplementedError, match=r"Cannot concatenate an AwkwardArray with other array types", ): concat([adata_a, adata_b], join=join_type) def test_pairwise_concat(axis_name, array_type): axis, axis_name = merge._resolve_axis(axis_name) _, alt_axis_name = merge._resolve_axis(1 - axis) axis_sizes = [[100, 200, 50], [50, 50, 50]] if axis_name == "var": axis_sizes.reverse() Ms, Ns = axis_sizes axis_attr = f"{axis_name}p" alt_attr = f"{alt_axis_name}p" def gen_axis_array(m): return array_type(sparse.random(m, m, format="csr", density=0.1)) adatas = { k: AnnData( X=sparse.csr_matrix((m, n)), obsp={"arr": gen_axis_array(m)}, varp={"arr": gen_axis_array(n)}, ) for k, m, n in zip("abc", Ms, Ns) } w_pairwise = concat(adatas, axis=axis, label="orig", pairwise=True) wo_pairwise = concat(adatas, axis=axis, label="orig", pairwise=False) # Check that argument controls whether elements are included assert getattr(wo_pairwise, axis_attr) == {} assert getattr(w_pairwise, axis_attr) != {} # Check values of included elements full_inds = np.arange(w_pairwise.shape[axis]) obs_var: pd.DataFrame = getattr(w_pairwise, axis_name) groups = obs_var.groupby("orig", observed=True).indices for k, inds in groups.items(): orig_arr = getattr(adatas[k], axis_attr)["arr"] full_arr = getattr(w_pairwise, axis_attr)["arr"] if isinstance(full_arr, DaskArray): full_arr = full_arr.compute() # Check original values are intact assert_equal(orig_arr, _subset(full_arr, (inds, inds))) # Check that 
entries are filled with zeroes assert_equal( sparse.csr_matrix((len(inds), len(full_inds) - len(inds))), _subset(full_arr, (inds, np.setdiff1d(full_inds, inds))), ) assert_equal( sparse.csr_matrix((len(full_inds) - len(inds), len(inds))), _subset(full_arr, (np.setdiff1d(full_inds, inds), inds)), ) # Check that argument does not affect alternative axis assert "arr" in getattr( concat(adatas, axis=axis, pairwise=False, merge="first"), alt_attr ) def test_nan_merge(axis_name, join_type, array_type): axis, _ = merge._resolve_axis(axis_name) alt_axis, alt_axis_name = merge._resolve_axis(1 - axis) mapping_attr = f"{alt_axis_name}m" adata_shape = (20, 10) arr = array_type( sparse.random(adata_shape[alt_axis], 10, density=0.1, format="csr") ) arr_nan = arr.copy() with warnings.catch_warnings(): warnings.simplefilter("ignore", category=sparse.SparseEfficiencyWarning) for _ in range(10): arr_nan[np.random.choice(arr.shape[0]), np.random.choice(arr.shape[1])] = ( np.nan ) _data = {"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr_nan}} orig1 = AnnData(**_data) orig2 = AnnData(**_data) result = concat([orig1, orig2], axis=axis, join=join_type, merge="same") assert_equal(getattr(orig1, mapping_attr), getattr(result, mapping_attr)) orig_nonan = AnnData( **{"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr}} ) result_nonan = concat([orig1, orig_nonan], axis=axis, merge="same") assert len(getattr(result_nonan, mapping_attr)) == 0 def test_merge_unique(): from anndata._core.merge import merge_unique # Simple cases assert merge_unique([{"a": "b"}, {"a": "b"}]) == {"a": "b"} assert merge_unique([{"a": {"b": "c"}}, {"a": {"b": "c"}}]) == {"a": {"b": "c"}} assert merge_unique([{"a": {"b": "c"}}, {"a": {"b": "d"}}]) == {} assert merge_unique([{"a": {"b": "c", "d": "e"}}, {"a": {"b": "c", "d": "f"}}]) == { "a": {"b": "c"} } assert merge_unique( [{"a": {"b": {"c": {"d": "e"}}}}, {"a": {"b": {"c": {"d": "e"}}}}] ) == {"a": {"b": {"c": {"d": "e"}}}} assert ( merge_unique( [ {"a": {"b": {"c": {"d": "e"}}}}, {"a": {"b": {"c": {"d": "f"}}}}, {"a": {"b": {"c": {"d": "e"}}}}, ] ) == {} ) assert merge_unique([{"a": 1}, {"b": 2}]) == {"a": 1, "b": 2} assert merge_unique([{"a": 1}, {"b": 2}, {"a": 1, "b": {"c": 2, "d": 3}}]) == { "a": 1 } # Test equivalency between arrays and lists assert list( merge_unique([{"a": np.ones(5)}, {"a": list(np.ones(5))}])["a"] ) == list(np.ones(5)) assert merge_unique([{"a": np.ones(5)}, {"a": list(np.ones(4))}]) == {} def test_merge_same(): from anndata._core.merge import merge_same # Same as unique for a number of cases: assert merge_same([{"a": "b"}, {"a": "b"}]) == {"a": "b"} assert merge_same([{"a": {"b": "c"}}, {"a": {"b": "c"}}]) == {"a": {"b": "c"}} assert merge_same([{"a": {"b": "c"}}, {"a": {"b": "d"}}]) == {} assert merge_same([{"a": {"b": "c", "d": "e"}}, {"a": {"b": "c", "d": "f"}}]) == { "a": {"b": "c"} } assert merge_same([{"a": {"b": "c"}, "d": "e"}, {"a": {"b": "c"}, "d": 2}]) == { "a": {"b": "c"} } assert merge_same( [{"a": {"b": {"c": {"d": "e"}}}}, {"a": {"b": {"c": {"d": "e"}}}}] ) == {"a": {"b": {"c": {"d": "e"}}}} assert merge_same([{"a": 1}, {"b": 2}]) == {} assert merge_same([{"a": 1}, {"b": 2}, {"a": 1, "b": {"c": 2, "d": 3}}]) == {} # Test equivalency between arrays and lists assert list(merge_same([{"a": np.ones(5)}, {"a": list(np.ones(5))}])["a"]) == list( np.ones(5) ) def test_merge_first(): from anndata._core.merge import merge_first assert merge_first([{"a": "b"}, {"a": "b"}]) == {"a": "b"} assert merge_first([{"a": {"b": "c"}}, {"a": 
{"b": "c"}}]) == {"a": {"b": "c"}} assert merge_first([{"a": 1}, {"a": 2}]) == {"a": 1} assert merge_first([{"a": 1}, {"a": {"b": {"c": {"d": "e"}}}}]) == {"a": 1} assert merge_first([{"a": {"b": {"c": {"d": "e"}}}}, {"a": 1}]) == { "a": {"b": {"c": {"d": "e"}}} } # Helpers for test_concatenate_uns def uns_ad(uns): return AnnData(np.zeros((10, 10)), uns=uns) def map_values(mapping, path, key, old_parent, new_parent, new_items): ret = default_exit(path, key, old_parent, new_parent, new_items) for k, v in ret.items(): if isinstance(v, Hashable) and v in mapping: ret[k] = mapping[v] return ret def permute_nested_values(dicts: list[dict], gen_val: Callable[[int], Any]): """ This function permutes the values of a nested mapping, for testing that out merge method work regardless of the values types. Assumes the initial dictionary had integers for values. """ dicts = deepcopy(dicts) initial_values = [ x[1] for x in research(dicts, query=lambda p, k, v: isinstance(v, int)) ] mapping = {k: gen_val(k) for k in initial_values} return [remap(d, exit=partial(map_values, mapping)) for d in dicts] def gen_df(n): return helpers.gen_typed_df(n) def gen_array(n): return np.random.randn(n) def gen_list(n): return list(gen_array(n)) def gen_sparse(n): return sparse.random( np.random.randint(1, 100), np.random.randint(1, 100), format="csr" ) def gen_something(n): options = [gen_df, gen_array, gen_list, gen_sparse] return np.random.choice(options)(n) def gen_3d_numeric_array(n): return np.random.randn(n, n, n) def gen_3d_recarray(_): # Ignoring n as it can get quite slow return gen_vstr_recarray(8, 3).reshape(2, 2, 2) def gen_concat_params(unss, compat2result): value_generators = [ lambda x: x, gen_df, gen_array, gen_list, gen_sparse, gen_something, gen_3d_numeric_array, gen_3d_recarray, ] for gen, (mode, result) in product(value_generators, compat2result.items()): yield pytest.param(unss, mode, result, gen) @pytest.mark.parametrize( ("unss", "merge_strategy", "result", "value_gen"), chain( gen_concat_params( [{"a": 1}, {"a": 2}], {None: {}, "first": {"a": 1}, "unique": {}, "same": {}, "only": {}}, ), gen_concat_params( [{"a": 1}, {"b": 2}], { None: {}, "first": {"a": 1, "b": 2}, "unique": {"a": 1, "b": 2}, "same": {}, "only": {"a": 1, "b": 2}, }, ), gen_concat_params( [ {"a": {"b": 1, "c": {"d": 3}}}, {"a": {"b": 1, "c": {"e": 4}}}, ], { None: {}, "first": {"a": {"b": 1, "c": {"d": 3, "e": 4}}}, "unique": {"a": {"b": 1, "c": {"d": 3, "e": 4}}}, "same": {"a": {"b": 1}}, "only": {"a": {"c": {"d": 3, "e": 4}}}, }, ), gen_concat_params( [ {"a": 1}, {"a": 1, "b": 2}, {"a": 1, "b": {"b.a": 1}, "c": 3}, {"d": 4}, ], { None: {}, "first": {"a": 1, "b": 2, "c": 3, "d": 4}, "unique": {"a": 1, "c": 3, "d": 4}, "same": {}, "only": {"c": 3, "d": 4}, }, ), gen_concat_params( [{"a": i} for i in range(15)], {None: {}, "first": {"a": 0}, "unique": {}, "same": {}, "only": {}}, ), gen_concat_params( [{"a": 1} for i in range(10)] + [{"a": 2}], {None: {}, "first": {"a": 1}, "unique": {}, "same": {}, "only": {}}, ), ), ) def test_concatenate_uns(unss, merge_strategy, result, value_gen): """ Test that concatenation works out for different strategies and sets of values. Params ------ unss Set of patterns for values in uns. compat Strategy to use for merging uns. result Pattern we expect to see for the given input and strategy. value_gen Maps values in unss and results to another set of values. This is for checking that we're comparing values correctly. 
For example `[{"a": 1}, {"a": 1}]` may get mapped to `[{"a": [1, 2, 3]}, {"a": [1, 2, 3]}]`. """ # So we can see what the initial pattern was meant to be print(merge_strategy, "\n", unss, "\n", result) result, *unss = permute_nested_values([result] + unss, value_gen) adatas = [uns_ad(uns) for uns in unss] with pytest.warns(FutureWarning, match=r"concatenate is deprecated"): merged = AnnData.concatenate(*adatas, uns_merge=merge_strategy).uns assert_equal(merged, result, elem_name="uns") def test_transposed_concat(array_type, axis_name, join_type, merge_strategy): axis, axis_name = merge._resolve_axis(axis_name) alt_axis = 1 - axis lhs = gen_adata((10, 10), X_type=array_type, **GEN_ADATA_DASK_ARGS) rhs = gen_adata((10, 12), X_type=array_type, **GEN_ADATA_DASK_ARGS) a = concat([lhs, rhs], axis=axis, join=join_type, merge=merge_strategy) b = concat([lhs.T, rhs.T], axis=alt_axis, join=join_type, merge=merge_strategy).T assert_equal(a, b) def test_batch_key(axis_name): """Test that concat only adds a label if the key is provided""" get_annot = attrgetter(axis_name) lhs = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) rhs = gen_adata((10, 12), **GEN_ADATA_DASK_ARGS) # There is probably a prettier way to do this annot = get_annot(concat([lhs, rhs], axis=axis_name)) assert ( list( annot.columns.difference( get_annot(lhs).columns.union(get_annot(rhs).columns) ) ) == [] ) batch_annot = get_annot(concat([lhs, rhs], axis=axis_name, label="batch")) assert list( batch_annot.columns.difference( get_annot(lhs).columns.union(get_annot(rhs).columns) ) ) == ["batch"] def test_concat_categories_from_mapping(): mapping = { "a": gen_adata((10, 10)), "b": gen_adata((10, 10)), } keys = list(mapping.keys()) adatas = list(mapping.values()) mapping_call = partial(concat, mapping) iter_call = partial(concat, adatas, keys=keys) assert_equal(mapping_call(), iter_call()) assert_equal(mapping_call(label="batch"), iter_call(label="batch")) assert_equal(mapping_call(index_unique="-"), iter_call(index_unique="-")) assert_equal( mapping_call(label="group", index_unique="+"), iter_call(label="group", index_unique="+"), ) def test_concat_categories_maintain_dtype(): a = AnnData( X=np.ones((5, 1)), obs=pd.DataFrame( { "cat": pd.Categorical(list("aabcc")), "cat_ordered": pd.Categorical(list("aabcc"), ordered=True), }, index=[f"cell{i:02}" for i in range(5)], ), ) b = AnnData( X=np.ones((5, 1)), obs=pd.DataFrame( { "cat": pd.Categorical(list("bccdd")), "cat_ordered": pd.Categorical(list("bccdd"), ordered=True), }, index=[f"cell{i:02}" for i in range(5, 10)], ), ) c = AnnData( X=np.ones((5, 1)), obs=pd.DataFrame( { "cat_ordered": pd.Categorical(list("bccdd"), ordered=True), }, index=[f"cell{i:02}" for i in range(5, 10)], ), ) result = concat({"a": a, "b": b, "c": c}, join="outer") assert isinstance(result.obs["cat"].dtype, pd.CategoricalDtype), ( f"Was {result.obs['cat'].dtype}" ) assert pd.api.types.is_string_dtype(result.obs["cat_ordered"]) def test_concat_ordered_categoricals_retained(): a = AnnData( X=np.ones((5, 1)), obs=pd.DataFrame( { "cat_ordered": pd.Categorical(list("aabcd"), ordered=True), }, index=[f"cell{i:02}" for i in range(5)], ), ) b = AnnData( X=np.ones((5, 1)), obs=pd.DataFrame( { "cat_ordered": pd.Categorical(list("abcdd"), ordered=True), }, index=[f"cell{i:02}" for i in range(5, 10)], ), ) c = concat([a, b]) assert isinstance(c.obs["cat_ordered"].dtype, pd.CategoricalDtype) assert c.obs["cat_ordered"].cat.ordered def test_concat_categorical_dtype_promotion(): """https://github.com/scverse/anndata/issues/1170 When 
concatenating categorical with other dtype, defer to pandas. """ a = AnnData( np.ones((3, 3)), obs=pd.DataFrame( {"col": pd.Categorical(["a", "a", "b"])}, index=[f"cell_{i:02d}" for i in range(3)], ), ) b = AnnData( np.ones((3, 3)), obs=pd.DataFrame( {"col": ["c", "c", "c"]}, index=[f"cell_{i:02d}" for i in range(3, 6)], ), ) result = concat([a, b]) expected = pd.concat([a.obs, b.obs]) assert_equal(result.obs, expected) def test_bool_promotion(): np_bool = AnnData( np.ones((5, 1)), obs=pd.DataFrame({"bool": [True] * 5}, index=[f"cell{i:02}" for i in range(5)]), ) missing = AnnData( np.ones((5, 1)), obs=pd.DataFrame(index=[f"cell{i:02}" for i in range(5, 10)]), ) result = concat({"np_bool": np_bool, "b": missing}, join="outer", label="batch") assert pd.api.types.is_bool_dtype(result.obs["bool"]) assert pd.isnull(result.obs.loc[result.obs["batch"] == "missing", "bool"]).all() # Check that promotion doesn't occur if it doesn't need to: np_bool_2 = AnnData( np.ones((5, 1)), obs=pd.DataFrame( {"bool": [True] * 5}, index=[f"cell{i:02}" for i in range(5, 10)] ), ) result = concat( {"np_bool": np_bool, "np_bool_2": np_bool_2}, join="outer", label="batch" ) assert result.obs["bool"].dtype == np.dtype(bool) def test_concat_names(axis_name): get_annot = attrgetter(axis_name) lhs = gen_adata((10, 10)) rhs = gen_adata((10, 10)) assert not get_annot(concat([lhs, rhs], axis=axis_name)).index.is_unique assert get_annot( concat([lhs, rhs], axis=axis_name, index_unique="-") ).index.is_unique def axis_labels(adata: AnnData, axis: Literal[0, 1]) -> pd.Index: return (adata.obs_names, adata.var_names)[axis] def expected_shape( a: AnnData, b: AnnData, axis: Literal[0, 1], join: Literal["inner", "outer"] ) -> tuple[int, int]: alt_axis = 1 - axis labels = partial(axis_labels, axis=alt_axis) shape = [None, None] shape[axis] = a.shape[axis] + b.shape[axis] if join == "inner": shape[alt_axis] = len(labels(a).intersection(labels(b))) elif join == "outer": shape[alt_axis] = len(labels(a).union(labels(b))) else: raise ValueError() return tuple(shape) @pytest.mark.parametrize( "shape", [pytest.param((8, 0), id="no_var"), pytest.param((0, 10), id="no_obs")] ) def test_concat_size_0_axis(axis_name, join_type, merge_strategy, shape): """Regression test for https://github.com/scverse/anndata/issues/526""" axis, axis_name = merge._resolve_axis(axis_name) alt_axis = 1 - axis col_dtypes = (*DEFAULT_COL_TYPES, pd.StringDtype) a = gen_adata((5, 7), obs_dtypes=col_dtypes, var_dtypes=col_dtypes) b = gen_adata(shape, obs_dtypes=col_dtypes, var_dtypes=col_dtypes) expected_size = expected_shape(a, b, axis=axis, join=join_type) ctx_concat_empty = ( pytest.warns( FutureWarning, match=r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", ) if shape[axis] == 0 and Version(pd.__version__) >= Version("2.1") else nullcontext() ) with ctx_concat_empty: result = concat( {"a": a, "b": b}, axis=axis, join=join_type, merge=merge_strategy, pairwise=True, index_unique="-", ) assert result.shape == expected_size if join_type == "outer": # Check new entries along axis of concatenation axis_new_inds = axis_labels(result, axis).str.endswith("-b") altaxis_new_inds = ~axis_labels(result, alt_axis).isin(axis_labels(a, alt_axis)) axis_idx = make_idx_tuple(axis_new_inds, axis) altaxis_idx = make_idx_tuple(altaxis_new_inds, 1 - axis) check_filled_like(result.X[axis_idx], elem_name="X") check_filled_like(result.X[altaxis_idx], elem_name="X") for k, elem in getattr(result, "layers").items(): check_filled_like(elem[axis_idx], 
elem_name=f"layers/{k}") check_filled_like(elem[altaxis_idx], elem_name=f"layers/{k}") if shape[axis] > 0: b_result = result[axis_idx].copy() mapping_elem = f"{axis_name}m" setattr(b_result, f"{axis_name}_names", getattr(b, f"{axis_name}_names")) for k, result_elem in getattr(b_result, mapping_elem).items(): elem_name = f"{mapping_elem}/{k}" # pd.concat can have unintuitive return types. is similar to numpy promotion if isinstance(result_elem, pd.DataFrame): assert_equal( getattr(b, mapping_elem)[k].astype(object), result_elem.astype(object), elem_name=elem_name, ) else: assert_equal( getattr(b, mapping_elem)[k], result_elem, elem_name=elem_name, ) @pytest.mark.parametrize("elem", ["sparse", "array", "df", "da"]) @pytest.mark.parametrize("axis", ["obs", "var"]) def test_concat_outer_aligned_mapping(elem, axis): a = gen_adata((5, 5), **GEN_ADATA_DASK_ARGS) b = gen_adata((3, 5), **GEN_ADATA_DASK_ARGS) del getattr(b, f"{axis}m")[elem] concated = concat({"a": a, "b": b}, join="outer", label="group", axis=axis) mask = getattr(concated, axis)["group"] == "b" result = getattr( concated[(mask, slice(None)) if axis == "obs" else (slice(None), mask)], f"{axis}m", )[elem] check_filled_like(result, elem_name=f"{axis}m/{elem}") @mark_legacy_concatenate def test_concatenate_size_0_axis(): # https://github.com/scverse/anndata/issues/526 a = gen_adata((5, 10)) b = gen_adata((5, 0)) # Mostly testing that this doesn't error a.concatenate([b]).shape == (10, 0) b.concatenate([a]).shape == (10, 0) def test_concat_null_X(): adatas_orig = {k: gen_adata((20, 10)) for k in list("abc")} adatas_no_X = {} for k, v in adatas_orig.items(): v = v.copy() del v.X adatas_no_X[k] = v orig = concat(adatas_orig, index_unique="-") no_X = concat(adatas_no_X, index_unique="-") del orig.X assert_equal(no_X, orig) # https://github.com/scverse/ehrapy/issues/151#issuecomment-1016753744 @pytest.mark.parametrize("sparse_indexer_type", [np.int64, np.int32]) def test_concat_X_dtype(cpu_array_type, sparse_indexer_type): adatas_orig = { k: AnnData(cpu_array_type(np.ones((20, 10), dtype=np.int8))) for k in list("abc") } for adata in adatas_orig.values(): adata.raw = AnnData(cpu_array_type(np.ones((20, 30), dtype=np.float64))) if sparse.issparse(adata.X): adata.X.indptr = adata.X.indptr.astype(sparse_indexer_type) adata.X.indices = adata.X.indices.astype(sparse_indexer_type) result = concat(adatas_orig, index_unique="-") assert result.X.dtype == np.int8 assert result.raw.X.dtype == np.float64 if sparse.issparse(result.X): # https://github.com/scipy/scipy/issues/20389 was merged in 1.15 but is still an issue with matrix if sparse_indexer_type == np.int64 and ( ( (issubclass(cpu_array_type, CSArray) or adata.X.format == "csc") and Version(scipy.__version__) < Version("1.15.0") ) or issubclass(cpu_array_type, CSMatrix) ): pytest.xfail( "Data type int64 is not maintained for sparse matrices or csc array" ) assert result.X.indptr.dtype == sparse_indexer_type, result.X assert result.X.indices.dtype == sparse_indexer_type # Leaving out for now. See definition of these values for explanation # def test_concatenate_uns_types(): # from anndata._core.merge import UNS_STRATEGIES, UNS_STRATEGIES_TYPE # assert set(UNS_STRATEGIES.keys()) == set(UNS_STRATEGIES_TYPE.__args__) # Tests how dask plays with other types on concatenation. 
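# The test below builds the same varm element both as a plain scipy sparse matrix
# and via the parametrized array_type, concatenates each with a dask-backed AnnData
# (in both orders), and asserts that the resulting objects are equal.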
def test_concat_different_types_dask(merge_strategy, array_type): import dask.array as da from scipy import sparse import anndata as ad varm_array = sparse.random(5, 20, density=0.5, format="csr") ad1 = ad.AnnData(X=np.ones((5, 5)), varm={"a": varm_array}) ad1_other = ad.AnnData(X=np.ones((5, 5)), varm={"a": array_type(varm_array)}) ad2 = ad.AnnData(X=np.zeros((5, 5)), varm={"a": da.ones(5, 20)}) result1 = ad.concat([ad1, ad2], merge=merge_strategy) target1 = ad.concat([ad1_other, ad2], merge=merge_strategy) result2 = ad.concat([ad2, ad1], merge=merge_strategy) target2 = ad.concat([ad2, ad1_other], merge=merge_strategy) assert_equal(result1, target1) assert_equal(result2, target2) def test_concat_missing_elem_dask_join(join_type): import dask.array as da import anndata as ad ad1 = ad.AnnData(X=np.ones((5, 10))) ad2 = ad.AnnData(X=np.zeros((5, 5)), layers={"a": da.ones((5, 5))}) ad_in_memory_with_layers = ad2.to_memory() result1 = ad.concat([ad1, ad2], join=join_type) result2 = ad.concat([ad1, ad_in_memory_with_layers], join=join_type) assert_equal(result1, result2) def test_impute_dask(axis_name): import dask.array as da from anndata._core.merge import _resolve_axis, missing_element axis, _ = _resolve_axis(axis_name) els = [da.ones((5, 5))] missing = missing_element(6, els, axis=axis, off_axis_size=17) assert isinstance(missing, DaskArray) in_memory = missing.compute() assert np.all(np.isnan(in_memory)) assert in_memory.shape[axis] == 6 assert in_memory.shape[axis - 1] == 17 def test_outer_concat_with_missing_value_for_df(): # https://github.com/scverse/anndata/issues/901 # TODO: Extend this test to cover all cases of missing values # TODO: Check values a_idx = ["a", "b", "c", "d", "e"] b_idx = ["f", "g", "h", "i", "j", "k", "l", "m"] a = AnnData( np.ones((5, 5)), obs=pd.DataFrame(index=a_idx), ) b = AnnData( np.zeros((8, 9)), obs=pd.DataFrame(index=b_idx), obsm={"df": pd.DataFrame({"col": np.arange(8)}, index=b_idx)}, ) concat([a, b], join="outer") def test_outer_concat_outputs_nullable_bool_writable(tmp_path): a = gen_adata((5, 5), obsm_types=(pd.DataFrame,)) b = gen_adata((3, 5), obsm_types=(pd.DataFrame,)) del b.obsm["df"] adatas = concat({"a": a, "b": b}, join="outer", label="group") adatas.write(tmp_path / "test.h5ad") def test_concat_duplicated_columns(join_type): # https://github.com/scverse/anndata/issues/483 a = AnnData( obs=pd.DataFrame( np.ones((5, 2)), columns=["a", "a"], index=[str(x) for x in range(5)] ) ) b = AnnData( obs=pd.DataFrame( np.ones((5, 1)), columns=["a"], index=[str(x) for x in range(5, 10)] ) ) with pytest.raises(pd.errors.InvalidIndexError, match=r"'a'"): concat([a, b], join=join_type) @pytest.mark.gpu def test_error_on_mixed_device(): """https://github.com/scverse/anndata/issues/1083""" import cupy import cupyx.scipy.sparse as cupy_sparse cp_adata = AnnData( cupy.random.randn(10, 10), obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(10)]), ) cp_sparse_adata = AnnData( cupy_sparse.random(10, 10, format="csr", density=0.2), obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(10, 20)]), ) np_adata = AnnData( np.random.randn(10, 10), obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(20, 30)]), ) sparse_adata = AnnData( sparse.random(10, 10, format="csr", density=0.2), obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(30, 40)]), ) adatas = { "cupy": cp_adata, "cupy_sparse": cp_sparse_adata, "numpy": np_adata, "sparse": sparse_adata, } for p in map(dict, permutations(adatas.items())): print(list(p.keys())) with pytest.raises( NotImplementedError, 
match=r"Cannot concatenate a cupy array with other" ): concat(p) for p in permutations([cp_adata, cp_sparse_adata]): concat(p) def test_concat_on_var_outer_join(array_type): # https://github.com/scverse/anndata/issues/1286 a = AnnData( obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(10)]), var=pd.DataFrame(index=[f"gene_{i:02d}" for i in range(10)]), layers={ "X": array_type(np.ones((10, 10))), }, ) b = AnnData( obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(10)]), var=pd.DataFrame(index=[f"gene_{i:02d}" for i in range(10, 20)]), ) # This shouldn't error # TODO: specify expected result while accounting for null value _ = concat([a, b], join="outer", axis=1) def test_concat_dask_sparse_matches_memory(join_type, merge_strategy): import dask.array as da X = sparse.random(50, 20, density=0.5, format="csr") X_dask = da.from_array(X, chunks=(5, 20)) var_names_1 = [f"gene_{i}" for i in range(20)] var_names_2 = [f"gene_{i}{'_foo' if (i % 2) else ''}" for i in range(20, 40)] ad1 = AnnData(X=X, var=pd.DataFrame(index=var_names_1)) ad2 = AnnData(X=X, var=pd.DataFrame(index=var_names_2)) ad1_dask = AnnData(X=X_dask, var=pd.DataFrame(index=var_names_1)) ad2_dask = AnnData(X=X_dask, var=pd.DataFrame(index=var_names_2)) res_in_memory = concat([ad1, ad2], join=join_type, merge=merge_strategy) res_dask = concat([ad1_dask, ad2_dask], join=join_type, merge=merge_strategy) assert_equal(res_in_memory, res_dask) python-anndata-0.12.0~rc1/tests/test_concatenate_disk.py000066400000000000000000000167131500370632200234020ustar00rootroot00000000000000from __future__ import annotations from collections.abc import Mapping from typing import TYPE_CHECKING import numpy as np import pandas as pd import pytest from scipy import sparse from anndata import AnnData, concat from anndata._core.merge import _resolve_axis from anndata.experimental.merge import as_group, concat_on_disk from anndata.io import read_elem, write_elem from anndata.tests.helpers import assert_equal, gen_adata from anndata.utils import asarray if TYPE_CHECKING: from pathlib import Path from typing import Literal GEN_ADATA_OOC_CONCAT_ARGS = dict( obsm_types=( sparse.csr_matrix, np.ndarray, pd.DataFrame, ), varm_types=(sparse.csr_matrix, np.ndarray, pd.DataFrame), layers_types=(sparse.csr_matrix, np.ndarray, pd.DataFrame), ) @pytest.fixture(params=[0, 1]) def axis(request) -> Literal[0, 1]: return request.param @pytest.fixture(params=["array", "sparse", "sparse_array"]) def array_type(request) -> Literal["array", "sparse", "sparse_array"]: return request.param @pytest.fixture(params=["inner", "outer"]) def join_type(request) -> Literal["inner", "outer"]: return request.param @pytest.fixture(params=["zarr", "h5ad"]) def file_format(request) -> Literal["zarr", "h5ad"]: return request.param # 1000 is enough to guarantee that the feature is being used @pytest.fixture(params=[1_000, 100_000_000]) def max_loaded_elems(request) -> int: return request.param def _adatas_to_paths(adatas, tmp_path, file_format): """ Gets list of adatas, writes them and returns their paths as zarr """ paths = None if isinstance(adatas, Mapping): paths = {} for k, v in adatas.items(): p = tmp_path / (f"{k}." + file_format) write_elem(as_group(p, mode="a"), "", v) paths[k] = p else: paths = [] for i, a in enumerate(adatas): p = tmp_path / (f"{i}." 
+ file_format) write_elem(as_group(p, mode="a"), "", a) paths += [p] return paths def assert_eq_concat_on_disk( adatas, tmp_path: Path, file_format: Literal["zarr", "h5ad"], max_loaded_elems: int | None = None, *args, **kwargs, ): # create one from the concat function res1 = concat(adatas, *args, **kwargs) # create one from the on disk concat function paths = _adatas_to_paths(adatas, tmp_path, file_format) out_name = tmp_path / f"out.{file_format}" if max_loaded_elems is not None: kwargs["max_loaded_elems"] = max_loaded_elems concat_on_disk(paths, out_name, *args, **kwargs) res2 = read_elem(as_group(out_name, mode="r")) assert_equal(res1, res2, exact=False) def get_array_type(array_type, axis): if array_type == "sparse": return sparse.csr_matrix if axis == 0 else sparse.csc_matrix if array_type == "sparse_array": return sparse.csr_array if axis == 0 else sparse.csc_array if array_type == "array": return asarray msg = f"array_type {array_type} not implemented" raise NotImplementedError(msg) @pytest.mark.parametrize("reindex", [True, False], ids=["reindex", "no_reindex"]) def test_anndatas( *, axis: Literal[0, 1], array_type: Literal["array", "sparse", "sparse_array"], join_type: Literal["inner", "outer"], tmp_path: Path, max_loaded_elems: int, file_format: Literal["zarr", "h5ad"], reindex: bool, ): _, off_axis_name = _resolve_axis(1 - axis) random_axes = {0, 1} if reindex else {axis} sparse_fmt = "csr" if axis == 0 else "csc" kw = ( GEN_ADATA_OOC_CONCAT_ARGS if not reindex else dict( obsm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), ) ) adatas = [] for i in range(5): M, N = (np.random.randint(1, 100) if a in random_axes else 50 for a in (0, 1)) a = gen_adata( (M, N), X_type=get_array_type(array_type, axis), sparse_fmt=sparse_fmt, **kw ) # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested off_names = getattr(a, f"{off_axis_name}_names").array off_names[1::2] = f"{i}-" + off_names[1::2] setattr(a, f"{off_axis_name}_names", off_names) adatas.append(a) assert_eq_concat_on_disk( adatas, tmp_path, file_format, max_loaded_elems, axis=axis, join=join_type, ) def test_concat_ordered_categoricals_retained(tmp_path, file_format): a = AnnData( X=np.ones((5, 1)), obs=pd.DataFrame( { "cat_ordered": pd.Categorical(list("aabcd"), ordered=True), }, index=[f"cell{i:02}" for i in range(5)], ), ) b = AnnData( X=np.ones((5, 1)), obs=pd.DataFrame( { "cat_ordered": pd.Categorical(list("abcdd"), ordered=True), }, index=[f"cell{i:02}" for i in range(5, 10)], ), ) adatas = [a, b] assert_eq_concat_on_disk(adatas, tmp_path, file_format) @pytest.fixture def xxxm_adatas(): def gen_index(n): return [f"cell{i}" for i in range(n)] return [ AnnData( X=sparse.csr_matrix((3, 5)), obs=pd.DataFrame(index=gen_index(3)), obsm={ "dense": np.arange(6).reshape(3, 2), "sparse": sparse.csr_matrix(np.arange(6).reshape(3, 2)), "df": pd.DataFrame( { "a": np.arange(3), "b": list("abc"), "c": pd.Categorical(list("aab")), }, index=gen_index(3), ), }, ), AnnData( X=sparse.csr_matrix((4, 10)), obs=pd.DataFrame(index=gen_index(4)), obsm=dict( dense=np.arange(12).reshape(4, 3), df=pd.DataFrame(dict(a=np.arange(3, 7)), index=gen_index(4)), ), ), AnnData( X=sparse.csr_matrix((2, 100)), obs=pd.DataFrame(index=gen_index(2)), obsm={ "sparse": np.arange(8).reshape(2, 4), "dense": np.arange(4, 8).reshape(2, 2), "df": pd.DataFrame( { 
"a": np.arange(7, 9), "b": list("cd"), "c": pd.Categorical(list("ab")), }, index=gen_index(2), ), }, ), ] def test_concatenate_xxxm(xxxm_adatas, tmp_path, file_format, join_type): if join_type == "outer": for i in range(len(xxxm_adatas)): xxxm_adatas[i] = xxxm_adatas[i].T xxxm_adatas[i].X = sparse.csr_matrix(xxxm_adatas[i].X) assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format, join=join_type) def test_output_dir_exists(tmp_path): in_pth = tmp_path / "in.h5ad" out_pth = tmp_path / "does_not_exist" / "out.h5ad" AnnData(X=np.ones((5, 1))).write_h5ad(in_pth) with pytest.raises(FileNotFoundError, match=f"{out_pth}"): concat_on_disk([in_pth], out_pth) def test_failure_w_no_args(tmp_path): with pytest.raises(ValueError, match=r"No objects to concatenate"): concat_on_disk([], tmp_path / "out.h5ad") python-anndata-0.12.0~rc1/tests/test_dask.py000066400000000000000000000261751500370632200210310ustar00rootroot00000000000000""" For tests using dask """ from __future__ import annotations from typing import TYPE_CHECKING import numpy as np import pandas as pd import pytest from scipy import sparse import anndata as ad from anndata._core.anndata import AnnData from anndata.compat import CupyArray, DaskArray from anndata.experimental.merge import as_group from anndata.tests.helpers import ( GEN_ADATA_DASK_ARGS, as_dense_cupy_dask_array, as_dense_dask_array, as_sparse_dask_array, assert_equal, gen_adata, ) if TYPE_CHECKING: from pathlib import Path from typing import Literal pytest.importorskip("dask.array") @pytest.fixture( params=[ [(2000, 1000), (100, 100)], [(200, 100), (100, 100)], [(200, 100), (100, 100)], [(20, 10), (1, 1)], [(20, 10), (1, 1)], ] ) def sizes(request): return request.param @pytest.fixture def adata(sizes): import dask.array as da import numpy as np (M, N), chunks = sizes X = da.random.random((M, N), chunks=chunks) obs = pd.DataFrame( {"batch": np.random.choice(["a", "b"], M)}, index=[f"cell{i:03d}" for i in range(M)], ) var = pd.DataFrame(index=[f"gene{i:03d}" for i in range(N)]) return AnnData(X, obs=obs, var=var) def test_dask_X_view(): import dask.array as da M, N = 50, 30 adata = ad.AnnData( obs=pd.DataFrame(index=[f"cell{i:02}" for i in range(M)]), var=pd.DataFrame(index=[f"gene{i:02}" for i in range(N)]), ) adata.X = da.ones((M, N)) view = adata[:30] view.copy() def test_dask_write(adata, tmp_path, diskfmt): import dask.array as da import numpy as np pth = tmp_path / f"test_write.{diskfmt}" write = lambda x, y: getattr(x, f"write_{diskfmt}")(y) read = lambda x: getattr(ad, f"read_{diskfmt}")(x) M, N = adata.X.shape adata.obsm["a"] = da.random.random((M, 10)) adata.obsm["b"] = da.random.random((M, 10)) adata.varm["a"] = da.random.random((N, 10)) orig = adata write(orig, pth) curr = read(pth) with pytest.raises(AssertionError): assert_equal(curr.obsm["a"], curr.obsm["b"]) assert_equal(curr.varm["a"], orig.varm["a"]) assert_equal(curr.obsm["a"], orig.obsm["a"]) assert isinstance(curr.X, np.ndarray) assert isinstance(curr.obsm["a"], np.ndarray) assert isinstance(curr.varm["a"], np.ndarray) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["a"], DaskArray) assert isinstance(orig.varm["a"], DaskArray) @pytest.mark.xdist_group("dask") def test_dask_distributed_write( adata: AnnData, tmp_path: Path, diskfmt: Literal["h5ad", "zarr"], local_cluster_addr: str, ) -> None: import dask.array as da import dask.distributed as dd import numpy as np pth = tmp_path / f"test_write.{diskfmt}" g = as_group(pth, mode="w") with dd.Client(local_cluster_addr): M, N = 
adata.X.shape adata.obsm["a"] = da.random.random((M, 10)) adata.obsm["b"] = da.random.random((M, 10)) adata.varm["a"] = da.random.random((N, 10)) orig = adata if diskfmt == "h5ad": with pytest.raises(ValueError, match=r"Cannot write dask arrays to hdf5"): ad.io.write_elem(g, "", orig) return ad.io.write_elem(g, "", orig) # TODO: See https://github.com/zarr-developers/zarr-python/issues/2716 g = as_group(pth, mode="r") curr = ad.io.read_elem(g) with pytest.raises(AssertionError): assert_equal(curr.obsm["a"], curr.obsm["b"]) assert_equal(curr.varm["a"], orig.varm["a"]) assert_equal(curr.obsm["a"], orig.obsm["a"]) assert isinstance(curr.X, np.ndarray) assert isinstance(curr.obsm["a"], np.ndarray) assert isinstance(curr.varm["a"], np.ndarray) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["a"], DaskArray) assert isinstance(orig.varm["a"], DaskArray) def test_dask_to_memory_check_array_types(adata, tmp_path, diskfmt): import dask.array as da import numpy as np pth = tmp_path / f"test_write.{diskfmt}" write = lambda x, y: getattr(x, f"write_{diskfmt}")(y) read = lambda x: getattr(ad, f"read_{diskfmt}")(x) M, N = adata.X.shape adata.obsm["a"] = da.random.random((M, 10)) adata.obsm["b"] = da.random.random((M, 10)) adata.varm["a"] = da.random.random((N, 10)) orig = adata write(orig, pth) curr = read(pth) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["a"], DaskArray) assert isinstance(orig.varm["a"], DaskArray) mem = orig.to_memory() with pytest.raises(AssertionError): assert_equal(curr.obsm["a"], curr.obsm["b"]) assert_equal(curr.varm["a"], orig.varm["a"]) assert_equal(curr.obsm["a"], orig.obsm["a"]) assert_equal(mem.obsm["a"], orig.obsm["a"]) assert_equal(mem.varm["a"], orig.varm["a"]) assert isinstance(curr.X, np.ndarray) assert isinstance(curr.obsm["a"], np.ndarray) assert isinstance(curr.varm["a"], np.ndarray) assert isinstance(mem.X, np.ndarray) assert isinstance(mem.obsm["a"], np.ndarray) assert isinstance(mem.varm["a"], np.ndarray) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["a"], DaskArray) assert isinstance(orig.varm["a"], DaskArray) def test_dask_to_memory_copy_check_array_types(adata, tmp_path, diskfmt): import dask.array as da import numpy as np pth = tmp_path / f"test_write.{diskfmt}" write = lambda x, y: getattr(x, f"write_{diskfmt}")(y) read = lambda x: getattr(ad, f"read_{diskfmt}")(x) M, N = adata.X.shape adata.obsm["a"] = da.random.random((M, 10)) adata.obsm["b"] = da.random.random((M, 10)) adata.varm["a"] = da.random.random((N, 10)) orig = adata write(orig, pth) curr = read(pth) mem = orig.to_memory(copy=True) with pytest.raises(AssertionError): assert_equal(curr.obsm["a"], curr.obsm["b"]) assert_equal(curr.varm["a"], orig.varm["a"]) assert_equal(curr.obsm["a"], orig.obsm["a"]) assert_equal(mem.obsm["a"], orig.obsm["a"]) assert_equal(mem.varm["a"], orig.varm["a"]) assert isinstance(curr.X, np.ndarray) assert isinstance(curr.obsm["a"], np.ndarray) assert isinstance(curr.varm["a"], np.ndarray) assert isinstance(mem.X, np.ndarray) assert isinstance(mem.obsm["a"], np.ndarray) assert isinstance(mem.varm["a"], np.ndarray) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["a"], DaskArray) assert isinstance(orig.varm["a"], DaskArray) def test_dask_copy_check_array_types(adata): import dask.array as da M, N = adata.X.shape adata.obsm["a"] = da.random.random((M, 10)) adata.obsm["b"] = da.random.random((M, 10)) adata.varm["a"] = da.random.random((N, 10)) orig = adata curr = adata.copy() with 
pytest.raises(AssertionError): assert_equal(curr.obsm["a"], curr.obsm["b"]) assert_equal(curr.varm["a"], orig.varm["a"]) assert_equal(curr.obsm["a"], orig.obsm["a"]) assert isinstance(curr.X, DaskArray) assert isinstance(curr.obsm["a"], DaskArray) assert isinstance(curr.varm["a"], DaskArray) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["a"], DaskArray) assert isinstance(orig.varm["a"], DaskArray) def test_assign_X(adata): """Check if assignment works""" import dask.array as da import numpy as np from anndata.compat import DaskArray adata.X = da.ones(adata.X.shape) prev_type = type(adata.X) adata_copy = adata.copy() adata.X = -1 * da.ones(adata.X.shape) assert prev_type is DaskArray assert type(adata_copy.X) is DaskArray assert_equal(adata.X, -1 * np.ones(adata.X.shape)) assert_equal(adata_copy.X, np.ones(adata.X.shape)) # Test if dask arrays turn into numpy arrays after to_memory is called @pytest.mark.parametrize( ("array_func", "mem_type"), [ pytest.param(as_dense_dask_array, np.ndarray, id="dense_dask_array"), pytest.param(as_sparse_dask_array, sparse.csr_matrix, id="sparse_dask_array"), pytest.param( as_dense_cupy_dask_array, CupyArray, id="cupy_dense_dask_array", marks=pytest.mark.gpu, ), ], ) def test_dask_to_memory_unbacked(array_func, mem_type): orig = gen_adata((15, 10), X_type=array_func, **GEN_ADATA_DASK_ARGS) orig.uns = {"da": {"da": array_func(np.ones((4, 12)))}} assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["da"], DaskArray) assert isinstance(orig.layers["da"], DaskArray) assert isinstance(orig.varm["da"], DaskArray) assert isinstance(orig.uns["da"]["da"], DaskArray) curr = orig.to_memory() assert_equal(orig, curr) assert isinstance(curr.X, mem_type) assert isinstance(curr.obsm["da"], np.ndarray) assert isinstance(curr.varm["da"], np.ndarray) assert isinstance(curr.layers["da"], np.ndarray) assert isinstance(curr.uns["da"]["da"], mem_type) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["da"], DaskArray) assert isinstance(orig.layers["da"], DaskArray) assert isinstance(orig.varm["da"], DaskArray) assert isinstance(orig.uns["da"]["da"], DaskArray) # Test if dask arrays turn into numpy arrays after to_memory is called def test_dask_to_memory_copy_unbacked(): import numpy as np orig = gen_adata((15, 10), X_type=as_dense_dask_array, **GEN_ADATA_DASK_ARGS) orig.uns = {"da": {"da": as_dense_dask_array(np.ones(12))}} curr = orig.to_memory(copy=True) assert_equal(orig, curr) assert isinstance(curr.X, np.ndarray) assert isinstance(curr.obsm["da"], np.ndarray) assert isinstance(curr.varm["da"], np.ndarray) assert isinstance(curr.layers["da"], np.ndarray) assert isinstance(curr.uns["da"]["da"], np.ndarray) assert isinstance(orig.X, DaskArray) assert isinstance(orig.obsm["da"], DaskArray) assert isinstance(orig.layers["da"], DaskArray) assert isinstance(orig.varm["da"], DaskArray) assert isinstance(orig.uns["da"]["da"], DaskArray) def test_to_memory_raw(): import dask.array as da import numpy as np orig = gen_adata((20, 10), **GEN_ADATA_DASK_ARGS) orig.X = da.ones((20, 10)) with_raw = orig[:, ::2].copy() with_raw.raw = orig.copy() assert isinstance(with_raw.raw.X, DaskArray) assert isinstance(with_raw.raw.varm["da"], DaskArray) curr = with_raw.to_memory() assert isinstance(with_raw.raw.X, DaskArray) assert isinstance(with_raw.raw.varm["da"], DaskArray) assert isinstance(curr.raw.X, np.ndarray) assert isinstance(curr.raw.varm["da"], np.ndarray) def test_to_memory_copy_raw(): import dask.array as da import numpy as np orig = 
gen_adata((20, 10), **GEN_ADATA_DASK_ARGS) orig.X = da.ones((20, 10)) with_raw = orig[:, ::2].copy() with_raw.raw = orig.copy() assert isinstance(with_raw.raw.X, DaskArray) assert isinstance(with_raw.raw.varm["da"], DaskArray) curr = with_raw.to_memory(copy=True) assert isinstance(with_raw.raw.X, DaskArray) assert isinstance(with_raw.raw.varm["da"], DaskArray) assert isinstance(curr.raw.X, np.ndarray) assert isinstance(curr.raw.varm["da"], np.ndarray) python-anndata-0.12.0~rc1/tests/test_dask_view_mem.py000066400000000000000000000113761500370632200227160ustar00rootroot00000000000000from __future__ import annotations from typing import TYPE_CHECKING import pytest import anndata as ad if TYPE_CHECKING: import pandas as pd pytest.importorskip("pytest_memray") # ------------------------------------------------------------------------------ # Some test data # ------------------------------------------------------------------------------ @pytest.fixture(params=["layers", "obsm", "varm"]) def mapping_name(request): return request.param @pytest.fixture(params=["obs", "var"]) def attr_name(request): return request.param @pytest.fixture(params=[True, False]) def give_chunks(request): return request.param # ------------------------------------------------------------------------------ # The test functions # ------------------------------------------------------------------------------ # Does some stuff so that dask can cache the # subclasscheck before the run. @pytest.fixture def _alloc_cache(): import dask.array as da N = 2**6 size = ((N, N), (N, N)) adata = ad.AnnData( da.random.random(*size), **{ "layers": dict(m=da.random.random(*size)), "obsm": dict(m=da.random.random(*size)), "obs": dict(m=da.random.random(N)), "var": dict(m=da.random.random(N)), "varm": dict(m=da.random.random(*size)), }, ) subset = adata[:10, :][:, :10] for mn in ["varm", "obsm", "layers"]: m = getattr(subset, mn)["m"] m[0, 0] = 100 _ = adata.to_memory(copy=False) # Theoretically this is expected to allocate: # N*N*4 bytes per matrix (we have 2). # N*4 bytes per index (we have 1). 
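# (Worked example of the estimate above — added for clarity and based on an
#  assumption, not on the original comment: taking N = 2**8 as in the tests
#  below and 8-byte float64 data, one dense N×N block is N*N*8 = 2**19 bytes
#  ≈ 512 KiB, and one index of N entries at 4 bytes each is N*4 = 2**10 bytes
#  = 1 KiB — which is where the per-term totals on the next lines come from.)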
# N*N*(2**3) + N*(2**2) bytes # N*N*(2**3) + N*(2**2) bytes # 2**19 + 2**10 # if we put a 2 factor on 2**19 # the results seems more accurate with the experimental results # For example from dask.random we allocate 1mb @pytest.mark.usefixtures("_alloc_cache") @pytest.mark.limit_memory("1.5 MB") def test_size_of_view(mapping_name, give_chunks): import dask.array as da N = 2**8 size = ((N, N), (N, N)) if give_chunks else ((N, N), "auto") adata = ad.AnnData( da.random.random(*size), **{mapping_name: dict(m=da.random.random(*size))}, ) _ = adata.to_memory(copy=False) # Normally should expect something around 90 kbs # Pandas does some indexing stuff that requires more sometimes # since the array we allocated would be 4mb for both arrays + 2mb # Thus, if we allocated it all it should at least have 6mb # experimentally we should at least have 10mb # for index this should be ok @pytest.mark.usefixtures("_alloc_cache") @pytest.mark.limit_memory("1.5 MB") def test_modify_view_mapping_component_memory(mapping_name, give_chunks): import dask.array as da N = 2**8 M = 2**9 size = ((M, M), (M, M)) if give_chunks else ((M, M), "auto") adata = ad.AnnData( da.random.random(*size), **{mapping_name: dict(m=da.random.random(*size))}, ) subset = adata[:N, :N] assert subset.is_view m = getattr(subset, mapping_name)["m"] m[0, 0] = 100 # Normally should expect something around 90 kbs # Pandas does some indexing stuff that requires more sometimes # since the array we allocated would be 4mb for both arrays + 2mb # Thus, if we allocated it all it should at least have 6mb # experimentally we should at least have 10mb # for index this should be ok @pytest.mark.usefixtures("_alloc_cache") @pytest.mark.limit_memory("1.5 MB") def test_modify_view_X_memory(mapping_name, give_chunks): import dask.array as da N = 2**8 M = 2**9 size = ((M, M), (M, M)) if give_chunks else ((M, M), "auto") adata = ad.AnnData( da.random.random(*size), **{mapping_name: dict(m=da.random.random(*size))}, ) subset = adata[:N, :N] assert subset.is_view m = subset.X with pytest.warns( ad.ImplicitModificationWarning, match=r"Trying to modify attribute `.X` of view, initializing view as actual.", ): m[0, 0] = 100 # Normally should expect something around 90 kbs # Pandas does some indexing stuff that requires more sometimes # since the array we allocated would be 4mb for both arrays + 2mb # Thus, if we allocated it all it should at least have 6mb # experimentally we should at least have 10mb # for index this should be ok @pytest.mark.usefixtures("_alloc_cache") @pytest.mark.limit_memory("1.5 MB") def test_modify_view_mapping_obs_var_memory(attr_name, give_chunks): import dask.array as da N = 2**8 M = 2**9 size = ((M, M), (M, M)) if give_chunks else ((M, M), "auto") adata = ad.AnnData( da.random.random(*size), **{attr_name: dict(m=da.random.random(M))}, ) subset = adata[:N, :N] assert subset.is_view m: pd.Series = getattr(subset, attr_name)["m"] m.iloc[0] = 100 python-anndata-0.12.0~rc1/tests/test_deprecations.py000066400000000000000000000077351500370632200225700ustar00rootroot00000000000000"""\ This file contains tests for deprecated functions. This includes correct behaviour as well as throwing warnings. 
""" from __future__ import annotations import warnings import h5py import numpy as np import pytest from scipy import sparse import anndata.experimental from anndata import AnnData from anndata.tests.helpers import assert_equal @pytest.fixture def adata(): adata = AnnData( X=sparse.csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), obs=dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), var=dict(var_names=["a", "b", "c"]), ) adata.raw = adata.copy() adata.layers["x2"] = adata.X * 2 adata.var["anno2"] = ["p1", "p2", "p3"] adata.X = adata.X / 2 return adata def test_get_obsvar_array_warn(adata): with pytest.warns(FutureWarning): adata._get_obs_array("a") with pytest.warns(FutureWarning): adata._get_var_array("s1") @pytest.mark.filterwarnings("ignore::FutureWarning") def test_get_obsvar_array(adata): assert np.allclose(adata._get_obs_array("a"), adata.obs_vector("a")) assert np.allclose( adata._get_obs_array("a", layer="x2"), adata.obs_vector("a", layer="x2"), ) assert np.allclose( adata._get_obs_array("a", use_raw=True), adata.raw.obs_vector("a") ) assert np.allclose(adata._get_var_array("s1"), adata.var_vector("s1")) assert np.allclose( adata._get_var_array("s1", layer="x2"), adata.var_vector("s1", layer="x2"), ) assert np.allclose( adata._get_var_array("s1", use_raw=True), adata.raw.var_vector("s1") ) def test_obsvar_vector_Xlayer(adata): with pytest.warns(FutureWarning): adata.var_vector("s1", layer="X") with pytest.warns(FutureWarning): adata.obs_vector("a", layer="X") adata = adata.copy() adata.layers["X"] = adata.X * 3 with warnings.catch_warnings(): warnings.simplefilter("error") adata.var_vector("s1", layer="X") adata.obs_vector("a", layer="X") # This should break in 0.9 def test_dtype_warning(): # Tests a warning is thrown with pytest.warns(FutureWarning): a = AnnData(np.ones((3, 3)), dtype=np.float32) assert a.X.dtype == np.float32 # This shouldn't warn, shouldn't copy with warnings.catch_warnings(record=True) as record: b_X = np.ones((3, 3), dtype=np.float64) b = AnnData(b_X) assert not record assert b_X is b.X assert b.X.dtype == np.float64 # Should warn, should copy with pytest.warns(FutureWarning): c_X = np.ones((3, 3), dtype=np.float32) c = AnnData(c_X, dtype=np.float64) assert not record assert c_X is not c.X assert c.X.dtype == np.float64 def test_deprecated_write_attribute(tmp_path): pth = tmp_path / "file.h5" A = np.random.randn(20, 10) from anndata._io.utils import read_attribute, write_attribute from anndata.io import read_elem with h5py.File(pth, "w") as f: with pytest.warns(FutureWarning, match=r"write_elem"): write_attribute(f, "written_attribute", A) with h5py.File(pth, "r") as f: elem_A = read_elem(f["written_attribute"]) with pytest.warns(FutureWarning, match=r"read_elem"): attribute_A = read_attribute(f["written_attribute"]) assert_equal(elem_A, attribute_A) assert_equal(A, attribute_A) @pytest.mark.parametrize( ("old_name", "new_name", "module"), ( (old_name, new_name, module) for module in [anndata, anndata.experimental] for (old_name, new_name) in module._DEPRECATED.items() ), ) def test_warn_on_import_with_redirect(old_name: str, new_name: str, module): with pytest.warns(FutureWarning, match=rf"Importing {old_name}.*is deprecated"): getattr(module, old_name) def test_warn_on_deprecated__io_module(): with pytest.warns( FutureWarning, match=r"Importing read_h5ad from `anndata._io` is deprecated" ): from anndata._io import read_h5ad # noqa 
python-anndata-0.12.0~rc1/tests/test_extensions.py000066400000000000000000000164441500370632200223040ustar00rootroot00000000000000from __future__ import annotations from typing import TYPE_CHECKING import numpy as np import pytest import anndata as ad from anndata._core import extensions if TYPE_CHECKING: from collections.abc import Generator @pytest.fixture(autouse=True) def _cleanup_dummy() -> Generator[None, None, None]: """Automatically cleanup dummy namespace after each test.""" original = getattr(ad.AnnData, "dummy", None) yield if original is not None: setattr(ad.AnnData, "dummy", original) else: if hasattr(ad.AnnData, "dummy"): delattr(ad.AnnData, "dummy") @pytest.fixture def dummy_namespace() -> type: """Create a basic dummy namespace class.""" ad.AnnData._accessors = set() @ad.register_anndata_namespace("dummy") class DummyNamespace: def __init__(self, adata: ad.AnnData) -> None: self._adata = adata def greet(self) -> str: return "hello" return DummyNamespace @pytest.fixture def adata() -> ad.AnnData: """Create a basic AnnData object for testing.""" rng = np.random.default_rng(42) return ad.AnnData(X=rng.poisson(1, size=(10, 10))) def test_find_stacklevel() -> None: """Test that find_stacklevel returns a positive integer. This function helps determine the correct stacklevel for warnings, so we just need to verify it returns a sensible value. """ level = extensions.find_stacklevel() assert isinstance(level, int) # It should be at least 1, otherwise something is wrong. assert level > 0 def test_accessor_namespace() -> None: """Test the behavior of the AccessorNameSpace descriptor. This test verifies that: - When accessed at the class level (i.e., without an instance), the descriptor returns the namespace type. - When accessed via an instance, the descriptor instantiates the namespace, passing the instance to its constructor. - The instantiated namespace is then cached on the instance such that subsequent accesses of the same attribute return the cached namespace instance. """ # Define a dummy namespace class to be used via the descriptor. class DummyNamespace: def __init__(self, adata: ad.AnnData) -> None: self._adata = adata def foo(self) -> str: return "foo" class Dummy: pass descriptor = extensions.AccessorNameSpace("dummy", DummyNamespace) # When accessed on the class, it should return the namespace type. ns_class = descriptor.__get__(None, Dummy) assert ns_class is DummyNamespace # When accessed via an instance, it should instantiate DummyNamespace. dummy_obj = Dummy() ns_instance = descriptor.__get__(dummy_obj, Dummy) assert isinstance(ns_instance, DummyNamespace) assert ns_instance._adata is dummy_obj # __get__ should cache the namespace instance on the object. # Subsequent access should return the same cached instance. 
assert dummy_obj.dummy is ns_instance def test_descriptor_instance_caching(dummy_namespace: type, adata: ad.AnnData) -> None: """Test that namespace instances are cached on individual AnnData objects.""" # First access creates the instance ns_instance = adata.dummy # Subsequent accesses should return the same instance assert adata.dummy is ns_instance def test_register_namespace_basic(dummy_namespace: type, adata: ad.AnnData) -> None: """Test basic namespace registration and access.""" assert adata.dummy.greet() == "hello" def test_register_namespace_override(dummy_namespace: type) -> None: """Test namespace registration and override behavior.""" assert "dummy" in ad.AnnData._accessors # Override should warn and update the namespace with pytest.warns( UserWarning, match="Overriding existing custom namespace 'dummy'" ): @ad.register_anndata_namespace("dummy") class DummyNamespaceOverride: def __init__(self, adata: ad.AnnData) -> None: self._adata = adata def greet(self) -> str: return "world" # Verify the override worked adata = ad.AnnData(X=np.random.poisson(1, size=(10, 10))) assert adata.dummy.greet() == "world" @pytest.mark.parametrize( "attr", ["X", "obs", "var", "uns", "obsm", "varm", "layers", "copy", "write"], ) def test_register_existing_attributes(attr: str) -> None: """ Test that registering an accessor with a name that is a reserved attribute of AnnData raises an attribute error. We only test a representative sample of important attributes rather than all of them. """ # Test a representative sample of key AnnData attributes with pytest.raises( AttributeError, match=f"cannot override reserved attribute {attr!r}", ): @ad.register_anndata_namespace(attr) class DummyNamespace: def __init__(self, adata: ad.AnnData) -> None: self._adata = adata def test_valid_signature() -> None: """Test that a namespace with valid signature is accepted.""" @ad.register_anndata_namespace("valid") class ValidNamespace: def __init__(self, adata: ad.AnnData) -> None: self.adata = adata def test_missing_param() -> None: """Test that a namespace missing the second parameter is rejected.""" with pytest.raises( TypeError, match="Namespace initializer must accept an AnnData instance as the second parameter.", ): @ad.register_anndata_namespace("missing_param") class MissingParamNamespace: def __init__(self) -> None: pass def test_wrong_name() -> None: """Test that a namespace with wrong parameter name is rejected.""" with pytest.raises( TypeError, match="Namespace initializer's second parameter must be named 'adata', got 'notadata'.", ): @ad.register_anndata_namespace("wrong_name") class WrongNameNamespace: def __init__(self, notadata: ad.AnnData) -> None: self.notadata = notadata def test_wrong_annotation() -> None: """Test that a namespace with wrong parameter annotation is rejected.""" with pytest.raises( TypeError, match="Namespace initializer's second parameter must be annotated as the 'AnnData' class, got 'int'.", ): @ad.register_anndata_namespace("wrong_annotation") class WrongAnnotationNamespace: def __init__(self, adata: int) -> None: self.adata = adata def test_missing_annotation() -> None: """Test that a namespace with missing parameter annotation is rejected.""" with pytest.raises(AttributeError): @ad.register_anndata_namespace("missing_annotation") class MissingAnnotationNamespace: def __init__(self, adata) -> None: self.adata = adata def test_both_wrong() -> None: """Test that a namespace with both wrong name and annotation is rejected.""" with pytest.raises( TypeError, match=( r"Namespace 
initializer's second parameter must be named 'adata', got 'info'\. " r"And must be annotated as 'AnnData', got 'str'\." ), ): @ad.register_anndata_namespace("both_wrong") class BothWrongNamespace: def __init__(self, info: str) -> None: self.info = info python-anndata-0.12.0~rc1/tests/test_get_vector.py000066400000000000000000000044701500370632200222420ustar00rootroot00000000000000from __future__ import annotations import numpy as np import pandas as pd import pytest from scipy import sparse import anndata as ad def test_amgibuous_keys(): """Tests that an error is raised if obs_vector or var_vector is ambiguous.""" var_keys = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"] obs_keys = [ "Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit", ] adata = ad.AnnData( X=sparse.random(len(obs_keys), len(var_keys), format="csr"), layers={"layer": sparse.random(len(obs_keys), len(var_keys), format="csr")}, obs=pd.DataFrame( np.random.randn(len(obs_keys), len(obs_keys) + len(var_keys)), index=obs_keys, columns=obs_keys + var_keys, ), var=pd.DataFrame( np.random.randn(len(var_keys), len(obs_keys) + len(var_keys)), index=var_keys, columns=var_keys + obs_keys, ), ) adata.raw = adata.copy() for k in var_keys: # These are mostly to check that the test is working assert k in adata.var_names assert k in adata.obs.columns # Now the actual checks: with pytest.raises(ValueError, match=r".*var_names.*obs\.columns.*"): adata.obs_vector(k) with pytest.raises(ValueError, match=r".*var_names.*obs\.columns.*"): adata.obs_vector(k, layer="layer") # Should uniquely select column from in adata.var assert list(adata.var[k]) == list(adata.var_vector(k)) assert list(adata.var[k]) == list(adata.var_vector(k, layer="layer")) assert list(adata.raw.var[k]) == list(adata.raw.var_vector(k)) for k in obs_keys: assert k in adata.obs_names assert k in adata.var.columns with pytest.raises(ValueError, match=r".*obs_names.*var\.columns"): adata.var_vector(k) with pytest.raises(ValueError, match=r".*obs_names.*var\.columns"): adata.var_vector(k, layer="layer") assert list(adata.obs[k]) == list(adata.obs_vector(k)) assert list(adata.obs[k]) == list(adata.obs_vector(k, layer="layer")) with pytest.raises(ValueError, match=r".*obs_names.*var\.columns*"): adata.raw.var_vector(k) python-anndata-0.12.0~rc1/tests/test_gpu.py000066400000000000000000000015731500370632200206750ustar00rootroot00000000000000from __future__ import annotations import pytest from scipy import sparse from anndata import AnnData, Raw @pytest.mark.gpu def test_gpu(): """ For testing that the gpu mark works """ import cupy # This test shouldn't run if cupy isn't installed cupy.ones(1) @pytest.mark.gpu def test_adata_raw_gpu(): import cupy as cp from cupyx.scipy import sparse as cupy_sparse adata = AnnData( X=cupy_sparse.random(500, 50, density=0.01, format="csr", dtype=cp.float32) ) adata.raw = adata.copy() assert isinstance(adata.raw.X, sparse.csr_matrix) @pytest.mark.gpu def test_raw_gpu(): import cupy as cp from cupyx.scipy import sparse as cupy_sparse adata = AnnData( X=cupy_sparse.random(500, 50, density=0.01, format="csr", dtype=cp.float32) ) araw = Raw(adata) assert isinstance(araw.X, sparse.csr_matrix) python-anndata-0.12.0~rc1/tests/test_helpers.py000066400000000000000000000241541500370632200215440ustar00rootroot00000000000000from __future__ import annotations from string import ascii_letters import numpy as np import pandas as pd import pytest from scipy import sparse import anndata as ad from anndata.compat import 
CupyArray, CupyCSRMatrix, DaskArray from anndata.tests.helpers import ( BASE_MATRIX_PARAMS, CUPY_MATRIX_PARAMS, DASK_MATRIX_PARAMS, DEFAULT_COL_TYPES, as_cupy, as_cupy_sparse_dask_array, as_dense_cupy_dask_array, as_dense_dask_array, asarray, assert_equal, gen_adata, gen_awkward, gen_random_column, issubdtype, report_name, ) from anndata.utils import axis_len # Testing to see if all error types can have the key name appended. # Currently fails for 22/118 since they have required arguments. Not sure what to do about that. # # @singledispatch # def iswarning(x): # return iswarning(type(x)) # @iswarning.register(type) # def _notwarning(x): # return False # @iswarning.register(Warning) # def _iswarning(x): # return True # @pytest.mark.parametrize("exception", list(filter(lambda t: not iswarning(t), Exception.__subclasses__()))) # def test_report_name_types(exception): # def throw(e): # raise e() # tag = "".join(np.random.permutation(list(ascii_letters))) # with pytest.raises(exception) as err: # report_name(throw)(exception, _elem_name=tag) # assert tag in str(err.value) @pytest.fixture def reusable_adata(): """Reusable anndata for when tests shouldn’t mutate it""" return gen_adata((10, 10)) @pytest.mark.parametrize( ("shape", "datashape"), [ ((4, 2), "4 * 2 * int32"), ((100, 200, None), "100 * 200 * var * int32"), ((4, None), "4 * var * int32"), ((0, 4), "0 * 4 * int32"), ((4, 0), "4 * 0 * int32"), ((8, None, None), "8 * var * var * int32"), ((8, None, None, None), "8 * var * var * var * int32"), ((4, None, 8), "4 * var * 8 * int32"), ((100, 200, 4), "100 * 200 * 4 * int32"), ((4, 0, 0), "4 * 0 * 0 * int32"), ((0, 0, 0), "0 * 0 * 0 * int32"), ((0, None), "0 * var * int32"), ], ) def test_gen_awkward(shape, datashape): import awkward as ak arr = gen_awkward(shape) for i, s in enumerate(shape): assert axis_len(arr, i) == s arr_type = ak.types.from_datashape(datashape) assert arr.type == arr_type @pytest.mark.parametrize("dtype", [*DEFAULT_COL_TYPES, pd.StringDtype]) def test_gen_random_column(dtype): _, col = gen_random_column(10, dtype) assert len(col) == 10 # CategoricalDtypes are the only one specified as instances currently if isinstance(dtype, pd.CategoricalDtype): assert issubdtype(col.dtype, pd.CategoricalDtype) assert col.dtype.ordered == dtype.ordered else: assert issubdtype(col.dtype, dtype) # Does this work for every warning? def test_report_name(): def raise_error(): msg = "an error occurred!" raise Exception(msg) letters = np.array(list(ascii_letters)) tag = "".join(np.random.permutation(letters)) with pytest.raises(Exception, match=r"an error occurred!") as e1: raise_error() with pytest.raises(Exception, match=r"an error occurred!") as e2: report_name(raise_error)(_elem_name=tag) assert str(e2.value).startswith(str(e1.value)) assert tag in str(e2.value) def test_assert_equal(): # ndarrays assert_equal(np.ones((10, 10)), np.ones((10, 10))) assert_equal( # Should this require an exact test? np.ones((10, 10), dtype="i8"), np.ones((10, 10), dtype="f8") ) assert_equal( np.array(list(ascii_letters)), np.array(list(ascii_letters)), exact=True ) with pytest.raises(AssertionError): assert_equal(np.array(list(ascii_letters)), np.array(list(ascii_letters))[::-1]) adata = gen_adata((10, 10)) adata.raw = adata.copy() assert_equal(adata, adata.copy(), exact=True) # TODO: I’m not sure this is good behaviour, I’ve disabled in for now. 
# assert_equal( # adata, # adata[ # np.random.permutation(adata.obs_names), # np.random.permutation(adata.var_names), # ].copy(), # exact=False, # ) adata2 = adata.copy() to_modify = list(adata2.layers.keys())[0] del adata2.layers[to_modify] with pytest.raises(AssertionError) as missing_layer_error: assert_equal(adata, adata2) assert "layers" in str(missing_layer_error.value) # `to_modify` will be in pytest info adata2 = adata.copy() adata2.layers[to_modify][0, 0] = adata2.layers[to_modify][0, 0] + 1 with pytest.raises(AssertionError) as changed_layer_error: assert_equal(adata, adata2) assert "layers" in str(changed_layer_error.value) assert to_modify in str(changed_layer_error.value) assert_equal(adata.obs, adata.obs.copy(), exact=True) csr = sparse.random(100, 100, format="csr") csc = csr.tocsc() dense = csr.toarray() assert_equal(csr, csc) assert_equal(csc, dense) assert_equal(dense, csc) unordered_cat = pd.Categorical(list("aabdcc"), ordered=False) ordered_cat = pd.Categorical(list("aabdcc"), ordered=True) assert_equal(unordered_cat, unordered_cat.copy()) assert_equal(ordered_cat, ordered_cat.copy()) assert_equal(ordered_cat, unordered_cat, exact=False) with pytest.raises(AssertionError): assert_equal(ordered_cat, unordered_cat, exact=True) def test_assert_equal_raw(): base = gen_adata((10, 10)) orig = base.copy() orig.raw = base.copy() mod = base.copy() mod.X[0, 0] = mod.X[0, 0] + 1 to_compare = base.copy() to_compare.raw = mod.copy() with pytest.raises(AssertionError): assert_equal(orig, to_compare) mod = base.copy() mod.var["new_val"] = 1 to_compare = base.copy() to_compare.raw = mod.copy() with pytest.raises(AssertionError): assert_equal(orig, to_compare) def test_assert_equal_raw_presence(): # This was causing some testing issues during # https://github.com/scverse/anndata/pull/542 a = gen_adata((10, 20)) b = a.copy() a.raw = a.copy() assert b.raw is None with pytest.raises(AssertionError): assert_equal(a, b) with pytest.raises(AssertionError): assert_equal(b, a) # TODO: Should views be equal to actual? # Should they not be if an exact comparison is made? 
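# --- Hypothetical sketch for the TODO above (added for illustration only) ----
# ``_probe_view_equality`` is not part of the original suite; it only shows how
# one could check, under both exactness settings, whether a view compares equal
# to its materialized copy, without asserting a particular outcome. It relies
# on ``gen_adata``/``assert_equal`` already imported at the top of this module.
def _probe_view_equality() -> None:
    adata = gen_adata((10, 10))
    view = adata[:5]
    for exact in (False, True):
        try:
            assert_equal(view, view.copy(), exact=exact)
            outcome = "equal"
        except AssertionError:
            outcome = "not equal"
        print(f"exact={exact}: view vs. copy -> {outcome}")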
def test_assert_equal_aligned_mapping(): adata1 = gen_adata((10, 10)) adata2 = adata1.copy() for attr in ["obsm", "varm", "layers", "obsp", "varp"]: assert_equal(getattr(adata1, attr), getattr(adata2, attr)) # Checking that subsetting other axis only changes some attrs obs_subset = adata2[:5, :] for attr in ["obsm", "layers", "obsp"]: with pytest.raises(AssertionError): assert_equal(getattr(adata1, attr), getattr(obs_subset, attr)) for attr in ["varm", "varp"]: assert_equal(getattr(adata1, attr), getattr(obs_subset, attr)) var_subset = adata2[:, 5:] for attr in ["varm", "layers", "varp"]: with pytest.raises(AssertionError): assert_equal(getattr(adata1, attr), getattr(var_subset, attr)) for attr in ["obsm", "obsp"]: assert_equal(getattr(adata1, attr), getattr(var_subset, attr)) def test_assert_equal_aligned_mapping_empty(): chars = np.array(list(ascii_letters)) adata = ad.AnnData( X=np.zeros((10, 10)), obs=pd.DataFrame([], index=np.random.choice(chars[:20], 10, replace=False)), var=pd.DataFrame([], index=np.random.choice(chars[:20], 10, replace=False)), ) diff_idx = ad.AnnData( X=np.zeros((10, 10)), obs=pd.DataFrame([], index=np.random.choice(chars[20:], 10, replace=False)), var=pd.DataFrame([], index=np.random.choice(chars[20:], 10, replace=False)), ) same_idx = ad.AnnData(adata.X, obs=adata.obs.copy(), var=adata.var.copy()) for attr in ["obsm", "varm", "layers", "obsp", "varp"]: with pytest.raises(AssertionError): assert_equal(getattr(adata, attr), getattr(diff_idx, attr)) assert_equal(getattr(adata, attr), getattr(same_idx, attr)) def test_assert_equal_dask_arrays(): import dask.array as da a = da.from_array([[1, 2, 3], [4, 5, 6]]) b = da.from_array([[1, 2, 3], [4, 5, 6]]) assert_equal(a, b) c = da.ones(10, dtype="int32") d = da.ones(10, dtype="int64") assert_equal(c, d) def test_assert_equal_dask_sparse_arrays(): import dask.array as da from scipy import sparse x = sparse.random(10, 10, format="csr", density=0.1) y = da.from_array(asarray(x)) assert_equal(x, y) assert_equal(y, x) @pytest.mark.parametrize( "input_type", BASE_MATRIX_PARAMS + DASK_MATRIX_PARAMS + CUPY_MATRIX_PARAMS ) @pytest.mark.parametrize( ( "as_dask_type", "mem_type", ), [ pytest.param( as_dense_cupy_dask_array, CupyArray, id="cupy_dense", marks=pytest.mark.gpu ), pytest.param(as_dense_dask_array, np.ndarray, id="numpy_dense"), pytest.param( as_cupy_sparse_dask_array, CupyCSRMatrix, id="cupy_csr", marks=pytest.mark.gpu, ), ], ) def test_as_dask_functions(input_type, as_dask_type, mem_type): SHAPE = (1000, 100) rng = np.random.default_rng(42) X_source = rng.poisson(size=SHAPE).astype(np.float32) X_input = input_type(X_source) X_output = as_dask_type(X_input) X_computed = X_output.compute() assert isinstance(X_output, DaskArray) assert X_output.shape == SHAPE assert X_output.dtype == X_input.dtype assert isinstance(X_computed, mem_type) assert_equal(asarray(X_computed), X_source) @pytest.mark.parametrize( "dask_matrix_type", DASK_MATRIX_PARAMS, ) @pytest.mark.gpu def test_as_cupy_dask(dask_matrix_type): SHAPE = (100, 10) rng = np.random.default_rng(42) X_cpu = dask_matrix_type(rng.normal(size=SHAPE)) X_gpu_roundtripped = as_cupy(X_cpu).map_blocks(lambda x: x.get(), meta=X_cpu._meta) assert isinstance(X_gpu_roundtripped._meta, type(X_cpu._meta)) assert isinstance(X_gpu_roundtripped.compute(), type(X_cpu.compute())) assert_equal(X_gpu_roundtripped.compute(), X_cpu.compute()) python-anndata-0.12.0~rc1/tests/test_inplace_subset.py000066400000000000000000000055761500370632200231110ustar00rootroot00000000000000from __future__ 
import annotations import numpy as np import pytest from scipy import sparse from anndata.tests.helpers import ( as_dense_dask_array, assert_equal, gen_adata, ) from anndata.utils import asarray @pytest.fixture( params=[ np.array, sparse.csr_matrix, sparse.csc_matrix, sparse.csr_array, sparse.csc_array, as_dense_dask_array, ], ids=[ "np_array", "scipy_csr", "scipy_csc", "scipy_csr_array", "scipy_csc_array", "dask_array", ], ) def matrix_type(request): return request.param def subset_dim(adata, *, obs=slice(None), var=slice(None)): # Should probably get used for test_inplace_subset_var and test_inplace_subset_obs from anndata._core.index import _subset return _subset(adata, (obs, var)) # TODO: Test values of .uns def test_inplace_subset_var(matrix_type, subset_func): orig = gen_adata((30, 30), X_type=matrix_type) subset_idx = subset_func(orig.var_names) modified = orig.copy() from_view = orig[:, subset_idx].copy() modified._inplace_subset_var(subset_idx) assert_equal(asarray(from_view.X), asarray(modified.X), exact=True) assert_equal(from_view.obs, modified.obs, exact=True) assert_equal(from_view.var, modified.var, exact=True) for k in from_view.obsm: assert_equal(from_view.obsm[k], modified.obsm[k], exact=True) assert_equal(orig.obsm[k], modified.obsm[k], exact=True) for k in from_view.varm: assert_equal(from_view.varm[k], modified.varm[k], exact=True) for k in from_view.layers: assert_equal(from_view.layers[k], modified.layers[k], exact=True) def test_inplace_subset_obs(matrix_type, subset_func): orig = gen_adata((30, 30), X_type=matrix_type) subset_idx = subset_func(orig.obs_names) modified = orig.copy() from_view = orig[subset_idx, :].copy() modified._inplace_subset_obs(subset_idx) assert_equal(asarray(from_view.X), asarray(modified.X), exact=True) assert_equal(from_view.obs, modified.obs, exact=True) assert_equal(from_view.var, modified.var, exact=True) for k in from_view.obsm: assert_equal(from_view.obsm[k], modified.obsm[k], exact=True) for k in from_view.varm: assert_equal(from_view.varm[k], modified.varm[k], exact=True) assert_equal(orig.varm[k], modified.varm[k], exact=True) for k in from_view.layers: assert_equal(from_view.layers[k], modified.layers[k], exact=True) @pytest.mark.parametrize("dim", ["obs", "var"]) def test_inplace_subset_no_X(subset_func, dim): orig = gen_adata((30, 30)) del orig.X subset_idx = subset_func(getattr(orig, f"{dim}_names")) modified = orig.copy() from_view = subset_dim(orig, **{dim: subset_idx}).copy() getattr(modified, f"_inplace_subset_{dim}")(subset_idx) assert_equal(modified, from_view, exact=True) python-anndata-0.12.0~rc1/tests/test_io_backwards_compat.py000066400000000000000000000030021500370632200240620ustar00rootroot00000000000000from __future__ import annotations from pathlib import Path import pandas as pd import pytest import zarr import zarr.storage from scipy import sparse import anndata as ad from anndata.compat import is_zarr_v2 from anndata.tests.helpers import assert_equal ARCHIVE_PTH = Path(__file__).parent / "data/archives" @pytest.fixture(params=list(ARCHIVE_PTH.glob("v*")), ids=lambda x: x.name) def archive_dir(request): return request.param def test_backwards_compat_files(archive_dir): with pytest.warns(ad.OldFormatWarning): from_h5ad = ad.read_h5ad(archive_dir / "adata.h5ad") with pytest.warns(ad.OldFormatWarning): path = archive_dir / "adata.zarr.zip" if is_zarr_v2(): store = path else: store = zarr.storage.ZipStore(path) from_zarr = ad.read_zarr(store) assert_equal(from_h5ad, from_zarr, exact=True) def 
test_clean_uns_backwards_compat(tmp_path, diskfmt): pth = tmp_path / f"test_write.{diskfmt}" write = lambda x, y: getattr(x, f"write_{diskfmt}")(y) read = lambda x: getattr(ad, f"read_{diskfmt}")(x) orig = ad.AnnData( sparse.csr_matrix((3, 5), dtype="float32"), obs=pd.DataFrame( {"a": pd.Categorical(list("aab")), "b": [1, 2, 3]}, index=[f"cell_{i}" for i in range(3)], ), uns={ "a_categories": "some string", "b_categories": "another string", }, ) write(orig, pth) from_disk = read(pth) assert_equal(orig, from_disk) python-anndata-0.12.0~rc1/tests/test_io_conversion.py000066400000000000000000000076141500370632200227600ustar00rootroot00000000000000"""\ This file contains tests for conversion made during io. """ from __future__ import annotations import h5py import numpy as np import pytest from scipy import sparse import anndata as ad from anndata.compat import CSMatrix from anndata.tests.helpers import assert_equal, gen_adata @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array], ids=["scipy-csr", "scipy-csc", "np-array"], ) def mtx_format(request): return request.param @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix], ids=["scipy-csr", "scipy-csc"], ) def spmtx_format(request): return request.param @pytest.fixture(params=[("raw/X",), ("X",), ("X", "raw/X")]) def to_convert(request): return request.param def test_sparse_to_dense_disk(tmp_path, mtx_format, to_convert): mem_pth = tmp_path / "orig.h5ad" dense_from_mem_pth = tmp_path / "dense_mem.h5ad" dense_from_disk_pth = tmp_path / "dense_disk.h5ad" mem = gen_adata((50, 50), mtx_format) mem.raw = mem.copy() mem.write_h5ad(mem_pth) disk = ad.read_h5ad(mem_pth, backed="r") mem.write_h5ad(dense_from_mem_pth, as_dense=to_convert) disk.write_h5ad(dense_from_disk_pth, as_dense=to_convert) with h5py.File(dense_from_mem_pth, "r") as f: for k in to_convert: assert isinstance(f[k], h5py.Dataset) with h5py.File(dense_from_disk_pth, "r") as f: for k in to_convert: assert isinstance(f[k], h5py.Dataset) for backed in [None, "r"]: from_mem = ad.read_h5ad(dense_from_mem_pth, backed=backed) from_disk = ad.read_h5ad(dense_from_disk_pth, backed=backed) assert_equal(mem, from_mem) assert_equal(mem, from_disk) assert_equal(disk, from_mem) assert_equal(disk, from_disk) def test_sparse_to_dense_inplace(tmp_path, spmtx_format): pth = tmp_path / "adata.h5ad" orig = gen_adata((50, 50), spmtx_format) orig.raw = orig.copy() orig.write(pth) backed = ad.read_h5ad(pth, backed="r+") backed.write(as_dense=("X", "raw/X")) new = ad.read_h5ad(pth) assert_equal(orig, new) assert_equal(backed, new) assert isinstance(new.X, np.ndarray) assert isinstance(new.raw.X, np.ndarray) assert isinstance(orig.X, spmtx_format) assert isinstance(orig.raw.X, spmtx_format) assert isinstance(backed.X, h5py.Dataset) assert isinstance(backed.raw.X, h5py.Dataset) def test_sparse_to_dense_errors(tmp_path): adata = ad.AnnData(X=sparse.random(50, 50, format="csr")) adata.layers["like_X"] = adata.X.copy() with pytest.raises(ValueError, match=r"Cannot specify writing"): adata.write_h5ad(tmp_path / "failure.h5ad", as_dense=("raw/X",)) with pytest.raises(NotImplementedError): adata.write_h5ad(tmp_path / "failure.h5ad", as_dense=("raw", "X")) with pytest.raises(NotImplementedError): adata.write_h5ad(tmp_path / "failure.h5ad", as_dense=("layers/like_X",)) def test_dense_to_sparse_memory(tmp_path, spmtx_format, to_convert): dense_path = tmp_path / "dense.h5ad" orig = gen_adata((50, 50), np.array) orig.raw = orig.copy() orig.write_h5ad(dense_path) assert not isinstance(orig.X, 
CSMatrix) assert not isinstance(orig.raw.X, CSMatrix) curr = ad.read_h5ad(dense_path, as_sparse=to_convert, as_sparse_fmt=spmtx_format) if "X" in to_convert: assert isinstance(curr.X, spmtx_format) if "raw/X" in to_convert: assert isinstance(curr.raw.X, spmtx_format) assert_equal(orig, curr) def test_dense_to_sparse_errors(tmp_path): dense_pth = tmp_path / "dense.h5ad" adata = ad.AnnData(X=np.ones((50, 50))) adata.layers["like_X"] = adata.X.copy() adata.write(dense_pth) with pytest.raises(NotImplementedError): ad.read_h5ad(dense_pth, as_sparse=("X",), as_sparse_fmt=sparse.coo_matrix) with pytest.raises(NotImplementedError): ad.read_h5ad(dense_pth, as_sparse=("layers/like_X",)) python-anndata-0.12.0~rc1/tests/test_io_dispatched.py000066400000000000000000000154371500370632200227050ustar00rootroot00000000000000from __future__ import annotations import re from typing import TYPE_CHECKING import h5py import zarr import anndata as ad from anndata._io.zarr import open_write_group from anndata.compat import CSArray, CSMatrix, ZarrGroup, is_zarr_v2 from anndata.experimental import read_dispatched, write_dispatched from anndata.tests.helpers import assert_equal, gen_adata if TYPE_CHECKING: from collections.abc import Callable from pathlib import Path def test_read_dispatched_w_regex(tmp_path: Path): def read_only_axis_dfs(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata": return func(elem) elif re.match(r"^/((obs)|(var))?(/.*)?$", elem_name): return func(elem) else: return None adata = gen_adata((1000, 100)) z = open_write_group(tmp_path) ad.io.write_elem(z, "/", adata) # TODO: see https://github.com/zarr-developers/zarr-python/issues/2716 if not is_zarr_v2() and isinstance(z, ZarrGroup): z = zarr.open(z.store) expected = ad.AnnData(obs=adata.obs, var=adata.var) actual = read_dispatched(z, read_only_axis_dfs) assert_equal(expected, actual) def test_read_dispatched_dask(tmp_path: Path): import dask.array as da def read_as_dask_array(func, elem_name: str, elem, iospec): if iospec.encoding_type in { "dataframe", "csr_matrix", "csc_matrix", "awkward-array", }: # Preventing recursing inside of these types return ad.io.read_elem(elem) elif iospec.encoding_type == "array": return da.from_zarr(elem) else: return func(elem) adata = gen_adata((1000, 100)) z = open_write_group(tmp_path) ad.io.write_elem(z, "/", adata) # TODO: see https://github.com/zarr-developers/zarr-python/issues/2716 if not is_zarr_v2() and isinstance(z, ZarrGroup): z = zarr.open(z.store) dask_adata = read_dispatched(z, read_as_dask_array) assert isinstance(dask_adata.layers["array"], da.Array) assert isinstance(dask_adata.obsm["array"], da.Array) assert isinstance(dask_adata.uns["nested"]["nested_further"]["array"], da.Array) expected = ad.io.read_elem(z) actual = dask_adata.to_memory(copy=False) assert_equal(expected, actual) def test_read_dispatched_null_case(tmp_path: Path): adata = gen_adata((100, 100)) z = open_write_group(tmp_path) ad.io.write_elem(z, "/", adata) # TODO: see https://github.com/zarr-developers/zarr-python/issues/2716 if not is_zarr_v2() and isinstance(z, ZarrGroup): z = zarr.open(z.store) expected = ad.io.read_elem(z) actual = read_dispatched(z, lambda _, __, x, **___: ad.io.read_elem(x)) assert_equal(expected, actual) def test_write_dispatched_chunks(tmp_path: Path): from itertools import chain, repeat def determine_chunks(elem_shape, specified_chunks): chunk_iterator = chain(specified_chunks, repeat(None)) return tuple(e if c is None else c for e, c in zip(elem_shape, chunk_iterator)) adata 
= gen_adata((1000, 100)) def write_chunked(func, store, k, elem, dataset_kwargs, iospec): M, N = 13, 42 def set_copy(d, **kwargs): d = dict(d) d.update(kwargs) return d # TODO: Should the passed path be absolute? path = "/" + store.path + "/" + k if hasattr(elem, "shape") and not isinstance( elem, CSMatrix | CSArray | ad.AnnData ): if re.match(r"^/((X)|(layers)).*", path): chunks = (M, N) elif path.startswith("/obsp"): chunks = (M, M) elif path.startswith("/obs"): chunks = (M,) elif path.startswith("/varp"): chunks = (N, N) elif path.startswith("/var"): chunks = (N,) else: chunks = dataset_kwargs.get("chunks", ()) func( store, k, elem, dataset_kwargs=set_copy( dataset_kwargs, chunks=determine_chunks(elem.shape, chunks) ), ) else: func(store, k, elem, dataset_kwargs=dataset_kwargs) z = open_write_group(tmp_path) write_dispatched(z, "/", adata, callback=write_chunked) def check_chunking(k: str, v: ZarrGroup | zarr.Array): if ( not isinstance(v, zarr.Array) or v.shape == () or any(k.endswith(x) for x in ("data", "indices", "indptr")) ): return if re.match(r"obs[mp]?/\w+", k): assert v.chunks[0] == 13 elif re.match(r"var[mp]?/\w+", k): assert v.chunks[0] == 42 if is_zarr_v2(): z.visititems(check_chunking) else: def visititems( z: ZarrGroup, visitor: Callable[[str, ZarrGroup | zarr.Array], None] ) -> None: for key in z: maybe_group = z[key] if isinstance(maybe_group, ZarrGroup): visititems(maybe_group, visitor) else: visitor(key, maybe_group) visititems(z, check_chunking) def test_io_dispatched_keys(tmp_path: Path): h5ad_write_keys = [] zarr_write_keys = [] h5ad_read_keys = [] zarr_read_keys = [] h5ad_path = tmp_path / "test.h5ad" zarr_path = tmp_path / "test.zarr" def h5ad_writer(func, store, k, elem, dataset_kwargs, iospec): h5ad_write_keys.append(k if is_zarr_v2() else k.strip("/")) func(store, k, elem, dataset_kwargs=dataset_kwargs) def zarr_writer(func, store, k, elem, dataset_kwargs, iospec): zarr_write_keys.append( k if is_zarr_v2() else f"{store.name.strip('/')}/{k.strip('/')}".strip("/") ) func(store, k, elem, dataset_kwargs=dataset_kwargs) def h5ad_reader(func, elem_name: str, elem, iospec): h5ad_read_keys.append(elem_name if is_zarr_v2() else elem_name.strip("/")) return func(elem) def zarr_reader(func, elem_name: str, elem, iospec): zarr_read_keys.append(elem_name if is_zarr_v2() else elem_name.strip("/")) return func(elem) adata = gen_adata((50, 100)) with h5py.File(h5ad_path, "w") as f: write_dispatched(f, "/", adata, callback=h5ad_writer) _ = read_dispatched(f, h5ad_reader) f = open_write_group(zarr_path) write_dispatched(f, "/", adata, callback=zarr_writer) _ = read_dispatched(f, zarr_reader) assert sorted(h5ad_read_keys) == sorted(zarr_read_keys) assert sorted(h5ad_write_keys) == sorted(zarr_write_keys) for sub_sparse_key in ["data", "indices", "indptr"]: assert f"/X/{sub_sparse_key}" not in h5ad_read_keys assert f"/X/{sub_sparse_key}" not in h5ad_write_keys python-anndata-0.12.0~rc1/tests/test_io_elementwise.py000066400000000000000000000517731500370632200231210ustar00rootroot00000000000000""" Tests that each element in an anndata is written correctly """ from __future__ import annotations import re from pathlib import Path from typing import TYPE_CHECKING import h5py import numpy as np import pandas as pd import pytest import zarr from packaging.version import Version from scipy import sparse import anndata as ad from anndata._io.specs import _REGISTRY, IOSpec, get_spec from anndata._io.specs.registry import IORegistryError from anndata._io.zarr import open_write_group from 
anndata.compat import CSArray, CSMatrix, ZarrGroup, _read_attr, is_zarr_v2 from anndata.experimental import read_elem_lazy from anndata.io import read_elem, write_elem from anndata.tests.helpers import ( as_cupy, as_cupy_sparse_dask_array, as_dense_cupy_dask_array, assert_equal, gen_adata, ) if TYPE_CHECKING: from pathlib import Path from typing import Literal, TypeVar from anndata.compat import H5Group G = TypeVar("G", H5Group, ZarrGroup) @pytest.fixture def store(diskfmt, tmp_path) -> H5Group | ZarrGroup: if diskfmt == "h5ad": file = h5py.File(tmp_path / "test.h5ad", "w") store = file["/"] elif diskfmt == "zarr": store = open_write_group(tmp_path / "test.zarr") else: pytest.fail(f"Unknown store type: {diskfmt}") try: yield store finally: if diskfmt == "h5ad": file.close() sparse_formats = ["csr", "csc"] SIZE = 2500 DEFAULT_SHAPE = (SIZE, SIZE * 2) @pytest.fixture(params=sparse_formats) def sparse_format(request: pytest.FixtureRequest) -> Literal["csr", "csc"]: return request.param def create_dense_store( store: str, *, shape: tuple[int, ...] = DEFAULT_SHAPE ) -> H5Group | ZarrGroup: X = np.random.randn(*shape) write_elem(store, "X", X) return store def create_sparse_store( sparse_format: Literal["csc", "csr"], store: G, shape=DEFAULT_SHAPE ) -> G: """Returns a store Parameters ---------- sparse_format store Returns ------- A store with a key, `X` that is simply a sparse matrix, and `X_dask` where that same array is wrapped by dask """ import dask.array as da X = sparse.random( shape[0], shape[1], format=sparse_format, density=0.01, random_state=np.random.default_rng(), ) X_dask = da.from_array( X, chunks=(100 if format == "csr" else SIZE, SIZE * 2 if format == "csr" else 100), ) write_elem(store, "X", X) write_elem(store, "X_dask", X_dask) return store @pytest.mark.parametrize( ("value", "encoding_type"), [ pytest.param(None, "null", id="none"), pytest.param("hello world", "string", id="py_str"), pytest.param(np.str_("hello world"), "string", id="np_str"), pytest.param(np.array([1, 2, 3]), "array", id="np_arr_int"), pytest.param( np.array(["hello", "world"], dtype=object), "string-array", id="np_arr_str" ), pytest.param(1, "numeric-scalar", id="py_int"), pytest.param(True, "numeric-scalar", id="py_bool"), pytest.param(1.0, "numeric-scalar", id="py_float"), pytest.param({"a": 1}, "dict", id="py_dict"), pytest.param(gen_adata((3, 2)), "anndata", id="anndata"), pytest.param( sparse.random(5, 3, format="csr", density=0.5), "csr_matrix", id="sp_mat_csr", ), pytest.param( sparse.random(5, 3, format="csc", density=0.5), "csc_matrix", id="sp_mat_csc", ), pytest.param(pd.DataFrame({"a": [1, 2, 3]}), "dataframe", id="pd_df"), pytest.param( pd.Categorical(list("aabccedd") + [pd.NA]), "categorical", id="pd_cat_np_str", ), pytest.param( pd.Categorical(list("aabccedd"), ordered=True), "categorical", id="pd_cat_np_str_ord", ), pytest.param( pd.array(list("aabccedd") + [pd.NA], dtype="string").astype("category"), "categorical", id="pd_cat_pd_str", ), pytest.param( pd.Categorical([1, 2, 1, 3], ordered=True), "categorical", id="pd_cat_num" ), pytest.param( pd.array(["hello", "world"], dtype="string"), "nullable-string-array", id="pd_arr_str", ), pytest.param( pd.array(["hello", "world", pd.NA], dtype="string"), "nullable-string-array", id="pd_arr_str_mask", ), pytest.param( pd.arrays.IntegerArray( np.ones(5, dtype=int), mask=np.array([True, False, True, False, True]) ), "nullable-integer", id="pd_arr_int_mask", ), pytest.param(pd.array([1, 2, 3]), "nullable-integer", id="pd_arr_int"), pytest.param( 
pd.arrays.BooleanArray( np.random.randint(0, 2, size=5, dtype=bool), mask=np.random.randint(0, 2, size=5, dtype=bool), ), "nullable-boolean", id="pd_arr_bool_mask", ), pytest.param( pd.array([True, False, True, True]), "nullable-boolean", id="pd_arr_bool" ), pytest.param( zarr.ones((100, 100), chunks=(10, 10)), "array", id="zarr_dense_array", ), pytest.param( create_dense_store( h5py.File("test1.h5", mode="w", driver="core", backing_store=False) )["X"], "array", id="h5_dense_array", ), # pytest.param(bytes, b"some bytes", "bytes", id="py_bytes"), # Does not work for zarr # TODO consider how specific encodings should be. Should we be fully describing the written type? # Currently the info we add is: "what you wouldn't be able to figure out yourself" # but that's not really a solid rule. # pytest.param(bool, True, "bool", id="py_bool"), # pytest.param(bool, np.bool_(False), "bool", id="np_bool"), ], ) def test_io_spec(store, value, encoding_type): # zarr v3 can't write recarray # https://github.com/zarr-developers/zarr-python/issues/2134 if ( ad.settings.zarr_write_format == 3 and encoding_type == "anndata" and "O_recarray" in value.uns ): del value.uns["O_recarray"] with ad.settings.override(allow_write_nullable_strings=True): key = f"key_for_{encoding_type}" write_elem(store, key, value, dataset_kwargs={}) assert encoding_type == _read_attr(store[key].attrs, "encoding-type") from_disk = read_elem(store[key]) assert_equal(value, from_disk) assert get_spec(store[key]) == _REGISTRY.get_spec(value) @pytest.mark.parametrize( ("value", "encoding_type"), [ pytest.param(np.asarray(1), "numeric-scalar", id="scalar_int"), pytest.param(np.asarray(1.0), "numeric-scalar", id="scalar_float"), pytest.param(np.asarray(True), "numeric-scalar", id="scalar_bool"), # noqa: FBT003 pytest.param(np.asarray("test"), "string", id="scalar_string"), ], ) def test_io_spec_compressed_scalars(store: G, value: np.ndarray, encoding_type: str): key = f"key_for_{encoding_type}" write_elem( store, key, value, dataset_kwargs={"compression": "gzip", "compression_opts": 5} ) assert encoding_type == _read_attr(store[key].attrs, "encoding-type") from_disk = read_elem(store[key]) assert_equal(value, from_disk) # Can't instantiate cupy types at the top level, so converting them within the test @pytest.mark.gpu @pytest.mark.parametrize( ("value", "encoding_type"), [ (np.array([1, 2, 3]), "array"), (np.arange(12).reshape(4, 3), "array"), (sparse.random(5, 3, format="csr", density=0.5), "csr_matrix"), (sparse.random(5, 3, format="csc", density=0.5), "csc_matrix"), ], ) @pytest.mark.parametrize("as_dask", [False, True]) def test_io_spec_cupy(store, value, encoding_type, as_dask): if as_dask: if isinstance(value, CSMatrix): value = as_cupy_sparse_dask_array(value, format=encoding_type[:3]) else: value = as_dense_cupy_dask_array(value) else: value = as_cupy(value) key = f"key_for_{encoding_type}" write_elem(store, key, value, dataset_kwargs={}) assert encoding_type == _read_attr(store[key].attrs, "encoding-type") from_disk = as_cupy(read_elem(store[key])) assert_equal(value, from_disk) assert get_spec(store[key]) == _REGISTRY.get_spec(value) def test_dask_write_sparse(sparse_format, store): x_sparse_store = create_sparse_store(sparse_format, store) X_from_disk = read_elem(x_sparse_store["X"]) X_dask_from_disk = read_elem(x_sparse_store["X_dask"]) assert_equal(X_from_disk, X_dask_from_disk) assert_equal(dict(x_sparse_store["X"].attrs), dict(x_sparse_store["X_dask"].attrs)) assert x_sparse_store["X_dask/indptr"].dtype == np.int64 assert 
x_sparse_store["X_dask/indices"].dtype == np.int64 def test_read_lazy_2d_dask(sparse_format, store): arr_store = create_sparse_store(sparse_format, store) X_dask_from_disk = read_elem_lazy(arr_store["X"]) X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,)) random_int_indices.sort() index_slice = slice(0, SIZE // 10) for index in [random_int_indices, index_slice]: assert_equal(X_from_disk[index, :], X_dask_from_disk[index, :]) assert_equal(X_from_disk[:, index], X_dask_from_disk[:, index]) random_bool_mask = np.random.randn(SIZE) > 0 assert_equal( X_from_disk[random_bool_mask, :], X_dask_from_disk[random_bool_mask, :] ) random_bool_mask = np.random.randn(SIZE * 2) > 0 assert_equal( X_from_disk[:, random_bool_mask], X_dask_from_disk[:, random_bool_mask] ) assert arr_store["X_dask/indptr"].dtype == np.int64 assert arr_store["X_dask/indices"].dtype == np.int64 @pytest.mark.parametrize( ("n_dims", "chunks"), [ (1, (100,)), (1, (400,)), (2, (100, 100)), (2, (400, 400)), (2, (200, 400)), (1, None), (2, None), (2, (400, -1)), (2, (400, None)), ], ) def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): arr_store = create_dense_store(store, shape=DEFAULT_SHAPE[:n_dims]) X_dask_from_disk = read_elem_lazy(arr_store["X"], chunks=chunks) X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,)) random_int_indices.sort() random_bool_mask = np.random.randn(SIZE) > 0 index_slice = slice(0, SIZE // 10) for index in [random_int_indices, index_slice, random_bool_mask]: assert_equal(X_from_disk[index], X_dask_from_disk[index]) @pytest.mark.xdist_group("dask") def test_read_lazy_h5_cluster( sparse_format: Literal["csr", "csc"], tmp_path: Path, local_cluster_addr: str ) -> None: import dask.distributed as dd with h5py.File(tmp_path / "test.h5", "w") as file: store = file["/"] arr_store = create_sparse_store(sparse_format, store) X_dask_from_disk = read_elem_lazy(arr_store["X"]) X_from_disk = read_elem(arr_store["X"]) with dd.Client(local_cluster_addr): assert_equal(X_from_disk, X_dask_from_disk) def test_undersized_shape_to_default(store: H5Group | ZarrGroup): shape = (3000, 50) arr_store = create_dense_store(store, shape=shape) X_dask_from_disk = read_elem_lazy(arr_store["X"]) assert (c < s for c, s in zip(X_dask_from_disk.chunksize, shape)) assert X_dask_from_disk.shape == shape @pytest.mark.parametrize( ("arr_type", "chunks", "expected_chunksize"), [ ("dense", (100, 100), (100, 100)), ("csc", (SIZE, 10), (SIZE, 10)), ("csr", (10, SIZE * 2), (10, SIZE * 2)), ("csc", None, (SIZE, 1000)), ("csr", None, (1000, SIZE * 2)), ("csr", (10, -1), (10, SIZE * 2)), ("csc", (-1, 10), (SIZE, 10)), ("csr", (10, None), (10, SIZE * 2)), ("csc", (None, 10), (SIZE, 10)), ("csc", (None, None), DEFAULT_SHAPE), ("csr", (None, None), DEFAULT_SHAPE), ("csr", (-1, -1), DEFAULT_SHAPE), ("csc", (-1, -1), DEFAULT_SHAPE), ], ) def test_read_lazy_2d_chunk_kwargs( store: H5Group | ZarrGroup, arr_type: Literal["csr", "csc", "dense"], chunks: None | tuple[int | None, int | None], expected_chunksize: tuple[int, int], ): if arr_type == "dense": arr_store = create_dense_store(store) X_dask_from_disk = read_elem_lazy(arr_store["X"], chunks=chunks) else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_lazy(arr_store["X"], chunks=chunks) assert X_dask_from_disk.chunksize == expected_chunksize X_from_disk = read_elem(arr_store["X"]) 
assert_equal(X_from_disk, X_dask_from_disk) def test_read_lazy_bad_chunk_kwargs(tmp_path): arr_type = "csr" with h5py.File(tmp_path / "test.h5", "w") as file: store = file["/"] arr_store = create_sparse_store(arr_type, store) with pytest.raises( ValueError, match=r"`chunks` must be a tuple of two integers" ): read_elem_lazy(arr_store["X"], chunks=(SIZE,)) with pytest.raises(ValueError, match=r"Only the major axis can be chunked"): read_elem_lazy(arr_store["X"], chunks=(SIZE, 10)) @pytest.mark.parametrize("sparse_format", ["csr", "csc"]) def test_write_indptr_dtype_override(store, sparse_format): X = sparse.random( 100, 100, format=sparse_format, density=0.1, random_state=np.random.default_rng(), ) write_elem(store, "X", X, dataset_kwargs=dict(indptr_dtype="int64")) assert store["X/indptr"].dtype == np.int64 assert X.indptr.dtype == np.int32 np.testing.assert_array_equal(store["X/indptr"][...], X.indptr) def test_io_spec_raw(store): adata = gen_adata((3, 2)) adata.raw = adata.copy() write_elem(store, "adata", adata) assert "raw" == _read_attr(store["adata/raw"].attrs, "encoding-type") from_disk = read_elem(store["adata"]) assert_equal(from_disk.raw, adata.raw) def test_write_anndata_to_root(store): adata = gen_adata((3, 2)) write_elem(store, "/", adata) # TODO: see https://github.com/zarr-developers/zarr-python/issues/2716 if not is_zarr_v2() and isinstance(store, ZarrGroup): store = zarr.open(store.store) from_disk = read_elem(store) assert "anndata" == _read_attr(store.attrs, "encoding-type") assert_equal(from_disk, adata) @pytest.mark.parametrize( ("attribute", "value"), [ ("encoding-type", "floob"), ("encoding-version", "10000.0"), ], ) def test_read_iospec_not_found(store, attribute, value): adata = gen_adata((3, 2)) write_elem(store, "/", adata) store["obs"].attrs.update({attribute: value}) with pytest.raises(IORegistryError) as exc_info: read_elem(store) msg = str(exc_info.value) assert "No read method registered for IOSpec" in msg assert f"{attribute.replace('-', '_')}='{value}'" in msg @pytest.mark.parametrize( "obj", [(b"x",)], ) def test_write_io_error(store, obj): full_pattern = re.compile( rf"No method registered for writing {type(obj)} into .*Group" ) with pytest.raises(IORegistryError, match=r"while writing key '/el'") as exc_info: write_elem(store, "/el", obj) msg = str(exc_info.value) assert re.search(full_pattern, msg) def test_write_nullable_string_error(store): with pytest.raises(RuntimeError, match=r"allow_write_nullable_strings.*is False"): write_elem(store, "/el", pd.array([""], dtype="string")) def test_categorical_order_type(store): # https://github.com/scverse/anndata/issues/853 cat = pd.Categorical([0, 1], ordered=True) write_elem(store, "ordered", cat) write_elem(store, "unordered", cat.set_ordered(False)) assert isinstance(read_elem(store["ordered"]).ordered, bool) assert read_elem(store["ordered"]).ordered is True assert isinstance(read_elem(store["unordered"]).ordered, bool) assert read_elem(store["unordered"]).ordered is False def test_override_specification(): """ Test that trying to overwrite an existing encoding raises an error. 
""" from copy import deepcopy registry = deepcopy(_REGISTRY) with pytest.raises(TypeError): @registry.register_write( ZarrGroup, ad.AnnData, IOSpec("some new type", "0.1.0") ) def _(store, key, adata): pass @pytest.mark.parametrize( "value", [ pytest.param({"a": 1}, id="dict"), pytest.param(gen_adata((3, 2)), id="anndata"), pytest.param(sparse.random(5, 3, format="csr", density=0.5), id="csr_matrix"), pytest.param(sparse.random(5, 3, format="csc", density=0.5), id="csc_matrix"), pytest.param(pd.DataFrame({"a": [1, 2, 3]}), id="dataframe"), pytest.param(pd.Categorical(list("aabccedd")), id="categorical"), pytest.param( pd.Categorical(list("aabccedd"), ordered=True), id="categorical-ordered" ), pytest.param( pd.Categorical([1, 2, 1, 3], ordered=True), id="categorical-numeric" ), pytest.param( pd.arrays.IntegerArray( np.ones(5, dtype=int), mask=np.array([True, False, True, False, True]) ), id="nullable-integer", ), pytest.param(pd.array([1, 2, 3]), id="nullable-integer-no-nulls"), pytest.param( pd.arrays.BooleanArray( np.random.randint(0, 2, size=5, dtype=bool), mask=np.random.randint(0, 2, size=5, dtype=bool), ), id="nullable-boolean", ), pytest.param( pd.array([True, False, True, True]), id="nullable-boolean-no-nulls" ), ], ) def test_write_to_root(store, value): """ Test that elements which are written as groups can we written to the root group. """ # zarr v3 can't write recarray # https://github.com/zarr-developers/zarr-python/issues/2134 if ad.settings.zarr_write_format == 3 and isinstance(value, ad.AnnData): del value.uns["O_recarray"] write_elem(store, "/", value) # See: https://github.com/zarr-developers/zarr-python/issues/2716 if isinstance(store, ZarrGroup) and not is_zarr_v2(): store = zarr.open(store.store) result = read_elem(store) assert_equal(result, value) @pytest.mark.parametrize("consolidated", [True, False]) def test_read_zarr_from_group(tmp_path, consolidated): # https://github.com/scverse/anndata/issues/1056 pth = tmp_path / "test.zarr" adata = gen_adata((3, 2)) z = open_write_group(pth) write_elem(z, "table/table", adata) if consolidated: zarr.consolidate_metadata(z.store) if consolidated: read_func = zarr.open_consolidated else: read_func = zarr.open z = read_func(pth) expected = ad.read_zarr(z["table/table"]) assert_equal(adata, expected) def test_dataframe_column_uniqueness(store): repeated_cols = pd.DataFrame(np.ones((3, 2)), columns=["a", "a"]) with pytest.raises( ValueError, match=r"Found repeated column names: \['a'\]\. 
Column names must be unique\.", ): write_elem(store, "repeated_cols", repeated_cols) index_shares_col_name = pd.DataFrame( {"col_name": [1, 2, 3]}, index=pd.Index([1, 3, 2], name="col_name") ) with pytest.raises( ValueError, match=r"DataFrame\.index\.name \('col_name'\) is also used by a column whose values are different\.", ): write_elem(store, "index_shares_col_name", index_shares_col_name) index_shared_okay = pd.DataFrame( {"col_name": [1, 2, 3]}, index=pd.Index([1, 2, 3], name="col_name") ) write_elem(store, "index_shared_okay", index_shared_okay) result = read_elem(store["index_shared_okay"]) assert_equal(result, index_shared_okay) @pytest.mark.parametrize("copy_on_write", [True, False]) def test_io_pd_cow(store, copy_on_write): if Version(pd.__version__) < Version("2"): pytest.xfail("copy_on_write option is not available in pandas < 2") # https://github.com/zarr-developers/numcodecs/issues/514 with pd.option_context("mode.copy_on_write", copy_on_write): orig = gen_adata((3, 2)) write_elem(store, "adata", orig) from_store = read_elem(store["adata"]) assert_equal(orig, from_store) def test_read_sparse_array( tmp_path: Path, sparse_format: Literal["csr", "csc"], diskfmt: Literal["h5ad", "zarr"], ): path = tmp_path / f"test.{diskfmt.replace('ad', '')}" a = sparse.random(100, 100, format=sparse_format) if diskfmt == "zarr": f = open_write_group(path, mode="a") else: f = h5py.File(path, "a") ad.io.write_elem(f, "mtx", a) ad.settings.use_sparse_array_on_read = True mtx = ad.io.read_elem(f["mtx"]) assert issubclass(type(mtx), CSArray) python-anndata-0.12.0~rc1/tests/test_io_partial.py000066400000000000000000000065121500370632200222230ustar00rootroot00000000000000from __future__ import annotations import warnings from importlib.util import find_spec from pathlib import Path import h5py import numpy as np import pytest import zarr from scipy.sparse import csr_matrix import anndata from anndata import AnnData from anndata._io.specs.registry import read_elem_partial from anndata.io import read_elem, write_h5ad, write_zarr X = np.array([[1.0, 0.0, 3.0], [4.0, 0.0, 6.0], [0.0, 8.0, 0.0]], dtype="float32") X_check = np.array([[4.0, 0.0], [0.0, 8.0]], dtype="float32") WRITER = dict(h5ad=write_h5ad, zarr=write_zarr) READER = dict(h5ad=h5py.File, zarr=zarr.open) @pytest.mark.parametrize("typ", [np.asarray, csr_matrix]) def test_read_partial_X(tmp_path, typ, diskfmt): adata = AnnData(X=typ(X)) path = Path(tmp_path) / ("test_tp_X." + diskfmt) WRITER[diskfmt](path, adata) store = READER[diskfmt](path, mode="r") if diskfmt == "zarr": X_part = read_elem_partial(store["X"], indices=([1, 2], [0, 1])) else: # h5py doesn't allow fancy indexing across multiple dimensions X_part = read_elem_partial(store["X"], indices=([1, 2],)) X_part = X_part[:, [0, 1]] store.close() assert np.all(X_check == X_part) @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") def test_read_partial_adata(tmp_path, diskfmt): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"Importing read_.* from `anndata` is deprecated" ) import scanpy as sc adata = sc.datasets.pbmc68k_reduced() # zarr v3 can't write recarray # https://github.com/zarr-developers/zarr-python/issues/2134 if anndata.settings.zarr_write_format == 3 and isinstance(adata, AnnData): del adata.uns["rank_genes_groups"]["scores"] del adata.uns["rank_genes_groups"]["names"] path = Path(tmp_path) / ("test_rp." 
+ diskfmt) WRITER[diskfmt](path, adata) storage = READER[diskfmt](path, mode="r") obs_idx = [1, 2] var_idx = [0, 3] adata_sbs = adata[obs_idx, var_idx] if diskfmt == "zarr": part = read_elem_partial(storage["X"], indices=(obs_idx, var_idx)) else: # h5py doesn't allow fancy indexing across multiple dimensions part = read_elem_partial(storage["X"], indices=(obs_idx,)) part = part[:, var_idx] assert np.all(part == adata_sbs.X) part = read_elem_partial(storage["obs"], indices=(obs_idx,)) assert np.all(part.keys() == adata_sbs.obs.keys()) assert np.all(part.index == adata_sbs.obs.index) part = read_elem_partial(storage["var"], indices=(var_idx,)) assert np.all(part.keys() == adata_sbs.var.keys()) assert np.all(part.index == adata_sbs.var.index) for key in storage["obsm"].keys(): part = read_elem_partial(storage["obsm"][key], indices=(obs_idx,)) assert np.all(part == adata_sbs.obsm[key]) for key in storage["varm"].keys(): part = read_elem_partial(storage["varm"][key], indices=(var_idx,)) np.testing.assert_equal(part, adata_sbs.varm[key]) for key in storage["obsp"].keys(): part = read_elem_partial(storage["obsp"][key], indices=(obs_idx, obs_idx)) part = part.toarray() assert np.all(part == adata_sbs.obsp[key]) # check uns just in case np.testing.assert_equal(read_elem(storage["uns"]).keys(), adata.uns.keys()) python-anndata-0.12.0~rc1/tests/test_io_utils.py000066400000000000000000000067331500370632200217340ustar00rootroot00000000000000from __future__ import annotations from contextlib import AbstractContextManager, suppress from typing import TYPE_CHECKING import h5py import numpy as np import pandas as pd import pytest import zarr import anndata as ad from anndata._io.specs.registry import IORegistryError from anndata._io.utils import report_read_key_on_error from anndata.compat import _clean_uns if TYPE_CHECKING: from collections.abc import Callable from pathlib import Path @pytest.mark.parametrize( "group_fn", [ pytest.param(lambda _: zarr.group(), id="zarr"), pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"), ], ) @pytest.mark.parametrize("nested", [True, False], ids=["nested", "root"]) def test_key_error( *, tmp_path, group_fn: Callable[[Path], zarr.Group | h5py.Group], nested: bool ): @report_read_key_on_error def read_attr(_): raise NotImplementedError() group = group_fn(tmp_path) with group if isinstance(group, AbstractContextManager) else suppress(): if nested: group = group.create_group("nested") path = "/nested" else: path = "/" group["X"] = np.array([1, 2, 3]) group.create_group("group") with pytest.raises( NotImplementedError, match=rf"reading key 'X'.*from {path}$" ): read_attr(group["X"]) with pytest.raises( NotImplementedError, match=rf"reading key 'group'.*from {path}$" ): read_attr(group["group"]) def test_write_error_info(diskfmt, tmp_path): pth = tmp_path / f"failed_write.{diskfmt}" write = lambda x: getattr(x, f"write_{diskfmt}")(pth) # Assuming we don't define a writer for tuples a = ad.AnnData(uns={"a": {"b": {"c": (1, 2, 3)}}}) with pytest.raises( IORegistryError, match=r"Error raised while writing key 'c'.*to /uns/a/b" ): write(a) def test_clean_uns(): adata = ad.AnnData( uns=dict(species_categories=["a", "b"]), obs=pd.DataFrame({"species": [0, 1, 0]}, index=["a", "b", "c"]), var=pd.DataFrame({"species": [0, 1, 0, 2]}, index=["a", "b", "c", "d"]), ) _clean_uns(adata) assert "species_categories" not in adata.uns assert isinstance(adata.obs["species"].dtype, pd.CategoricalDtype) assert adata.obs["species"].tolist() == ["a", "b", "a"] # var’s categories 
were overwritten by obs’s, # which we can detect here because var has too high codes assert pd.api.types.is_integer_dtype(adata.var["species"]) @pytest.mark.parametrize( "group_fn", [ pytest.param(lambda _: zarr.group(), id="zarr"), pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"), ], ) def test_only_child_key_reported_on_failure(tmp_path, group_fn): class Foo: pass group = group_fn(tmp_path) # This regex checks that the pattern inside the (?!...) group does not exist in the string # (?!...) is a negative lookahead # (?s) enables the dot to match newlines # https://stackoverflow.com/a/406408/130164 <- copilot suggested lol pattern = r"(?s)^((?!Error raised while writing key '/?a').)*$" with pytest.raises(IORegistryError, match=pattern): ad.io.write_elem(group, "/", {"a": {"b": Foo()}}) ad.io.write_elem(group, "/", {"a": {"b": [1, 2, 3]}}) group["a/b"].attrs["encoding-type"] = "not a real encoding type" with pytest.raises(IORegistryError, match=pattern): ad.io.read_elem(group) python-anndata-0.12.0~rc1/tests/test_io_warnings.py000066400000000000000000000043721500370632200224210ustar00rootroot00000000000000from __future__ import annotations import re import warnings from importlib.util import find_spec from pathlib import Path import h5py import pytest from packaging.version import Version import anndata as ad from anndata.tests.helpers import gen_adata @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") def test_old_format_warning_thrown(): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"Importing read_.* from `anndata` is deprecated" ) import scanpy as sc pth = Path(sc.datasets.__file__).parent / "10x_pbmc68k_reduced.h5ad" # TODO: with Pytest 8, all this can be a # `with pytest.warns(...), pytest.warns(...):` with warnings.catch_warnings(record=True) as record: warnings.simplefilter("always", ad.OldFormatWarning) warnings.simplefilter("always", FutureWarning) ad.read_h5ad(pth) assert any(issubclass(w.category, ad.OldFormatWarning) for w in record), [ w.message for w in record if not issubclass(w.category, FutureWarning) ] assert any( issubclass(w.category, FutureWarning) and re.match( r"Moving element from \.uns\['neighbors']\['distances'] to \.obsp\['distances']\.", str(w.message), ) for w in record ), [w.message for w in record if not issubclass(w.category, ad.OldFormatWarning)] def test_old_format_warning_not_thrown(tmp_path): pth = tmp_path / "current.h5ad" adata = gen_adata((20, 10)) adata.write_h5ad(pth) with warnings.catch_warnings(record=True) as record: warnings.simplefilter("always", ad.OldFormatWarning) if Version(h5py.__version__) < Version("3.2"): # https://github.com/h5py/h5py/issues/1808 warnings.filterwarnings( "ignore", r"Passing None into shape arguments as an alias for \(\) is deprecated\.", category=DeprecationWarning, ) ad.read_h5ad(pth) if len(record) != 0: msg_content = "\n".join( [f"\t{w.category.__name__}('{w.message}')" for w in record] ) pytest.fail( f"Warnings were thrown when they shouldn't be. 
Got:\n\n{msg_content}" ) python-anndata-0.12.0~rc1/tests/test_layers.py000066400000000000000000000100131500370632200213660ustar00rootroot00000000000000from __future__ import annotations import warnings from importlib.util import find_spec import numpy as np import pandas as pd import pytest from numba.core.errors import NumbaDeprecationWarning from anndata import AnnData, ImplicitModificationWarning, read_h5ad from anndata.io import read_loom from anndata.tests.helpers import gen_typed_df_t2_size X_ = np.arange(12).reshape((3, 4)) L = np.arange(12).reshape((3, 4)) + 12 @pytest.fixture(params=[X_, None]) def X(request): return request.param def test_creation(X: np.ndarray | None): adata = AnnData(X=X, layers=dict(L=L.copy())) assert list(adata.layers.keys()) == ["L"] assert "L" in adata.layers assert "X" not in adata.layers assert "some_other_thing" not in adata.layers assert (adata.layers["L"] == L).all() assert adata.shape == L.shape def test_views(): adata = AnnData(X=X_, layers=dict(L=L.copy())) adata_view = adata[1:, 1:] assert adata_view.layers.is_view assert adata_view.layers.parent_mapping == adata.layers assert adata_view.layers.keys() == adata.layers.keys() assert (adata_view.layers["L"] == adata.layers["L"][1:, 1:]).all() adata.layers["S"] = X_ assert adata_view.layers.keys() == adata.layers.keys() assert (adata_view.layers["S"] == adata.layers["S"][1:, 1:]).all() with pytest.warns(ImplicitModificationWarning): adata_view.layers["T"] = X_[1:, 1:] assert not adata_view.layers.is_view assert not adata_view.is_view @pytest.mark.parametrize( ("df", "homogenous", "dtype"), [ (lambda: gen_typed_df_t2_size(*X_.shape), True, np.object_), (lambda: pd.DataFrame(X_**2), False, np.int_), ], ) def test_set_dataframe(homogenous, df, dtype): adata = AnnData(X_) if homogenous: with pytest.warns(UserWarning, match=r"Layer 'df'.*dtype object"): adata.layers["df"] = df() else: with warnings.catch_warnings(): warnings.simplefilter("error") adata.layers["df"] = df() assert isinstance(adata.layers["df"], np.ndarray) assert np.issubdtype(adata.layers["df"].dtype, dtype) def test_readwrite(X: np.ndarray | None, backing_h5ad): adata = AnnData(X=X, layers=dict(L=L.copy())) adata.write(backing_h5ad) adata_read = read_h5ad(backing_h5ad) assert adata.layers.keys() == adata_read.layers.keys() assert (adata.layers["L"] == adata_read.layers["L"]).all() @pytest.mark.skipif(find_spec("loompy") is None, reason="loompy not installed") def test_readwrite_loom(tmp_path): loom_path = tmp_path / "test.loom" adata = AnnData(X=X_, layers=dict(L=L.copy())) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=NumbaDeprecationWarning) # loompy uses “is” for ints warnings.filterwarnings("ignore", category=SyntaxWarning) warnings.filterwarnings( "ignore", message=r"datetime.datetime.utcnow\(\) is deprecated", category=DeprecationWarning, ) adata.write_loom(loom_path) adata_read = read_loom(loom_path, X_name="") assert adata.layers.keys() == adata_read.layers.keys() assert (adata.layers["L"] == adata_read.layers["L"]).all() def test_backed(): # backed mode for layers isn’t implemented, layers stay in memory pass def test_copy(): adata = AnnData(X=X_, layers=dict(L=L.copy())) bdata = adata.copy() # check that we don’t create too many references assert bdata._layers is bdata.layers._data # check that we have a copy adata.layers["L"] += 10 assert np.all(adata.layers["L"] != bdata.layers["L"]) # 201 def test_shape_error(): adata = AnnData(X=X_) with pytest.raises( ValueError, match=( r"Value passed for key 
'L' is of incorrect shape\. " r"Values of layers must match dimensions \('obs', 'var'\) of parent\. " r"Value had shape \(4, 4\) while it should have had \(3, 4\)\." ), ): adata.layers["L"] = np.zeros((X_.shape[0] + 1, X_.shape[1])) python-anndata-0.12.0~rc1/tests/test_obsmvarm.py000066400000000000000000000114651500370632200217310ustar00rootroot00000000000000from __future__ import annotations import joblib import numpy as np import pandas as pd import pytest from scipy import sparse from anndata import AnnData from anndata.tests.helpers import get_multiindex_columns_df M, N = (100, 100) @pytest.fixture def adata(): X = np.zeros((M, N)) obs = pd.DataFrame( dict(batch=np.array(["a", "b"])[np.random.randint(0, 2, M)]), index=[f"cell{i:03d}" for i in range(N)], ) var = pd.DataFrame(index=[f"gene{i:03d}" for i in range(N)]) return AnnData(X, obs=obs, var=var) def test_assignment_dict(adata: AnnData): d_obsm = dict( a=pd.DataFrame( dict(a1=np.ones(M), a2=[f"a{i}" for i in range(M)]), index=adata.obs_names, ), b=np.zeros((M, 2)), ) d_varm = dict( a=pd.DataFrame( dict(a1=np.ones(N), a2=[f"a{i}" for i in range(N)]), index=adata.var_names, ), b=np.zeros((N, 2)), ) adata.obsm = d_obsm for k, v in d_obsm.items(): assert np.all(adata.obsm[k] == v) adata.varm = d_varm for k, v in d_varm.items(): assert np.all(adata.varm[k] == v) def test_setting_ndarray(adata: AnnData): adata.obsm["a"] = np.ones((M, 10)) adata.varm["a"] = np.ones((N, 10)) assert np.all(adata.obsm["a"] == np.ones((M, 10))) assert np.all(adata.varm["a"] == np.ones((N, 10))) h = joblib.hash(adata) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsm["b"] = np.ones((int(M / 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsm["b"] = np.ones((int(M * 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varm["b"] = np.ones((int(N / 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varm["b"] = np.ones((int(N * 2), 10)) assert h == joblib.hash(adata) def test_setting_dataframe(adata: AnnData): obsm_df = pd.DataFrame(dict(b_1=np.ones(M), b_2=["a"] * M), index=adata.obs_names) varm_df = pd.DataFrame(dict(b_1=np.ones(N), b_2=["a"] * N), index=adata.var_names) adata.obsm["b"] = obsm_df assert np.all(adata.obsm["b"] == obsm_df) adata.varm["b"] = varm_df assert np.all(adata.varm["b"] == varm_df) bad_obsm_df = obsm_df.copy() bad_obsm_df.reset_index(inplace=True) with pytest.raises(ValueError, match=r"index does not match.*obs names"): adata.obsm["c"] = bad_obsm_df bad_varm_df = varm_df.copy() bad_varm_df.reset_index(inplace=True) with pytest.raises(ValueError, match=r"index does not match.*var names"): adata.varm["c"] = bad_varm_df def test_setting_sparse(adata: AnnData): obsm_sparse = sparse.random(M, 100, format="csr") adata.obsm["a"] = obsm_sparse assert not np.any((adata.obsm["a"] != obsm_sparse).data) varm_sparse = sparse.random(N, 100, format="csr") adata.varm["a"] = varm_sparse assert not np.any((adata.varm["a"] != varm_sparse).data) h = joblib.hash(adata) bad_obsm_sparse = sparse.random(M * 2, M, format="csr") with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsm["b"] = bad_obsm_sparse bad_varm_sparse = sparse.random(N * 2, N, format="csr") with pytest.raises(ValueError, match=r"incorrect shape"): adata.varm["b"] = bad_varm_sparse assert h == joblib.hash(adata) def test_setting_daskarray(adata: AnnData): import dask.array as da adata.obsm["a"] = da.ones((M, 10)) adata.varm["a"] = da.ones((N, 10)) assert da.all(adata.obsm["a"] == da.ones((M, 10))) 
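    # da.all builds a lazy dask scalar; the assert forces its computation via __bool__.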
assert da.all(adata.varm["a"] == da.ones((N, 10))) assert isinstance(adata.obsm["a"], da.Array) assert isinstance(adata.varm["a"], da.Array) h = joblib.hash(adata) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsm["b"] = da.ones((int(M / 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsm["b"] = da.ones((int(M * 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varm["b"] = da.ones((int(N / 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varm["b"] = da.ones((int(N * 2), 10)) assert h == joblib.hash(adata) def test_shape_error(adata: AnnData): with pytest.raises( ValueError, match=( r"Value passed for key 'b' is of incorrect shape\. " r"Values of obsm must match dimensions \('obs',\) of parent\. " r"Value had shape \(101,\) while it should have had \(100,\)\." ), ): adata.obsm["b"] = np.zeros((adata.shape[0] + 1, adata.shape[0])) def test_error_set_multiindex_df(adata: AnnData): df = get_multiindex_columns_df((adata.shape[0], 20)) with pytest.raises(ValueError, match=r"MultiIndex columns are not supported"): adata.obsm["df"] = df python-anndata-0.12.0~rc1/tests/test_obspvarp.py000066400000000000000000000115151500370632200217330ustar00rootroot00000000000000# TODO: These tests should share code with test_layers, and test_obsmvarm from __future__ import annotations import warnings import joblib import numpy as np import pandas as pd import pytest from scipy import sparse from anndata import AnnData from anndata.tests.helpers import gen_typed_df_t2_size from anndata.utils import asarray M, N = (200, 100) @pytest.fixture def adata(): X = np.zeros((M, N)) obs = pd.DataFrame( dict(batch=np.array(["a", "b"])[np.random.randint(0, 2, M)]), index=[f"cell{i:03d}" for i in range(M)], ) var = pd.DataFrame(index=[f"gene{i:03d}" for i in range(N)]) return AnnData(X, obs=obs, var=var) def test_assigmnent_dict(adata: AnnData): d_obsp = dict( a=pd.DataFrame(np.ones((M, M)), columns=adata.obs_names, index=adata.obs_names), b=np.zeros((M, M)), c=sparse.random(M, M, format="csr"), ) d_varp = dict( a=pd.DataFrame(np.ones((N, N)), columns=adata.var_names, index=adata.var_names), b=np.zeros((N, N)), c=sparse.random(N, N, format="csr"), ) adata.obsp = d_obsp for k, v in d_obsp.items(): assert np.all(asarray(adata.obsp[k]) == asarray(v)) adata.varp = d_varp for k, v in d_varp.items(): assert np.all(asarray(adata.varp[k]) == asarray(v)) def test_setting_ndarray(adata: AnnData): adata.obsp["a"] = np.ones((M, M)) adata.varp["a"] = np.ones((N, N)) assert np.all(adata.obsp["a"] == np.ones((M, M))) assert np.all(adata.varp["a"] == np.ones((N, N))) h = joblib.hash(adata) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsp["b"] = np.ones((int(M / 2), M)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsp["b"] = np.ones((M, int(M * 2))) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varp["b"] = np.ones((int(N / 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varp["b"] = np.ones((N, int(N * 2))) assert h == joblib.hash(adata) def test_setting_sparse(adata: AnnData): obsp_sparse = sparse.random(M, M, format="csr") adata.obsp["a"] = obsp_sparse assert not np.any((adata.obsp["a"] != obsp_sparse).data) varp_sparse = sparse.random(N, N, format="csr") adata.varp["a"] = varp_sparse assert not np.any((adata.varp["a"] != varp_sparse).data) h = joblib.hash(adata) bad_obsp_sparse = sparse.random(M * 2, M, format="csr") with pytest.raises(ValueError, match=r"incorrect 
shape"): adata.obsp["b"] = bad_obsp_sparse bad_varp_sparse = sparse.random(N * 2, N, format="csr") with pytest.raises(ValueError, match=r"incorrect shape"): adata.varp["b"] = bad_varp_sparse assert h == joblib.hash(adata) @pytest.mark.parametrize(("field", "dim"), [("obsp", M), ("varp", N)]) @pytest.mark.parametrize( ("df", "homogenous", "dtype"), [ (lambda dim: gen_typed_df_t2_size(dim, dim), True, np.object_), (lambda dim: pd.DataFrame(np.random.randn(dim, dim)), False, np.floating), ], ids=["heterogeneous", "homogeneous"], ) def test_setting_dataframe(adata: AnnData, field, dim, homogenous, df, dtype): if homogenous: with pytest.warns(UserWarning, match=rf"{field.title()} 'df'.*dtype object"): getattr(adata, field)["df"] = df(dim) else: with warnings.catch_warnings(): warnings.simplefilter("error") getattr(adata, field)["df"] = df(dim) assert isinstance(getattr(adata, field)["df"], np.ndarray) assert np.issubdtype(getattr(adata, field)["df"].dtype, dtype) def test_setting_daskarray(adata: AnnData): import dask.array as da adata.obsp["a"] = da.ones((M, M)) adata.varp["a"] = da.ones((N, N)) assert da.all(adata.obsp["a"] == da.ones((M, M))) assert da.all(adata.varp["a"] == da.ones((N, N))) assert isinstance(adata.obsp["a"], da.Array) assert isinstance(adata.varp["a"], da.Array) h = joblib.hash(adata) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsp["b"] = da.ones((int(M / 2), M)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsp["b"] = da.ones((M, int(M * 2))) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varp["b"] = da.ones((int(N / 2), 10)) with pytest.raises(ValueError, match=r"incorrect shape"): adata.varp["b"] = da.ones((N, int(N * 2))) assert h == joblib.hash(adata) def test_shape_error(adata: AnnData): with pytest.raises( ValueError, match=( r"Value passed for key 'a' is of incorrect shape\. " r"Values of obsp must match dimensions \('obs', 'obs'\) of parent\. " r"Value had shape \(201, 200\) while it should have had \(200, 200\)\." 
), ): adata.obsp["a"] = np.zeros((adata.shape[0] + 1, adata.shape[0])) python-anndata-0.12.0~rc1/tests/test_raw.py000066400000000000000000000122521500370632200206670ustar00rootroot00000000000000from __future__ import annotations import numpy as np import pytest import anndata as ad from anndata import ImplicitModificationWarning from anndata.tests.helpers import GEN_ADATA_DASK_ARGS, assert_equal, gen_adata # ------------------------------------------------------------------------------- # Some test data # ------------------------------------------------------------------------------- data = [ [1, 2, 3], [4, 5, 6], [7, 8, 9], ] # data matrix of shape n_obs × n_vars obs_dict = dict( # annotation of observations / rows row_names=["name1", "name2", "name3"], # row annotation oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation ) var_dict = dict( # annotation of variables / columns col_names=["var1", "var2", "var3"], vanno1=[3.1, 3.2, 3.3] ) uns_dict = dict( # unstructured annotation oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"] ) @pytest.fixture def adata_raw() -> ad.AnnData: adata = ad.AnnData( np.array(data, dtype="int32"), obs=obs_dict, var=var_dict, uns=uns_dict ) adata.raw = adata.copy() # Make them different shapes adata = adata[:, [0, 1]].copy() return adata # ------------------------------------------------------------------------------- # The test functions # ------------------------------------------------------------------------------- def test_raw_init(adata_raw: ad.AnnData): assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] assert adata_raw.raw[:, 0].X.tolist() == [[1], [4], [7]] def test_raw_del(adata_raw: ad.AnnData): del adata_raw.raw assert adata_raw.raw is None def test_raw_set_as_none(adata_raw: ad.AnnData): # Test for scverse/anndata#445 a = adata_raw b = adata_raw.copy() del a.raw b.raw = None assert_equal(a, b) def test_raw_of_view(adata_raw: ad.AnnData): adata_view = adata_raw[adata_raw.obs["oanno1"] == "cat2"] assert adata_view.raw.X.tolist() == [ [4, 5, 6], [7, 8, 9], ] def test_raw_rw(adata_raw: ad.AnnData, backing_h5ad): adata_raw.write(backing_h5ad) adata_read = ad.read_h5ad(backing_h5ad) assert_equal(adata_read, adata_raw, exact=True) assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] assert adata_raw.raw[:, 0].X.tolist() == [[1], [4], [7]] def test_raw_view_rw(adata_raw: ad.AnnData, backing_h5ad): # Make sure it still writes correctly if the object is a view adata_raw_view = adata_raw[:, adata_raw.var_names] assert_equal(adata_raw_view, adata_raw) with pytest.warns( ImplicitModificationWarning, match=r"initializing view as actual" ): adata_raw_view.write(backing_h5ad) adata_read = ad.read_h5ad(backing_h5ad) assert_equal(adata_read, adata_raw_view, exact=True) assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] assert adata_raw.raw[:, 0].X.tolist() == [[1], [4], [7]] def test_raw_backed(adata_raw: ad.AnnData, backing_h5ad): adata_raw.filename = backing_h5ad assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] if adata_raw.raw[:, 0].X.shape[1] != 1: pytest.xfail("Raw is broken for backed slices") assert adata_raw.raw[:, 0].X[:].tolist() == [[1], [4], [7]] def 
test_raw_view_backed(adata_raw: ad.AnnData, backing_h5ad): adata_raw.filename = backing_h5ad assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] if adata_raw.raw[:, 0].X.shape[1] != 1: pytest.xfail("Raw is broken for backed slices") assert adata_raw.raw[:, 0].X[:].tolist() == [[1], [4], [7]] def test_raw_as_parent_view(): # https://github.com/scverse/anndata/issues/288 a = ad.AnnData(np.ones((4, 3))) a.varm["PCs"] = np.ones((3, 3)) a.raw = a.copy() # create a Raw containing views. This used to trigger #288. b = a.raw[:, "0"] # actualize b.varm["PCs"] = np.array([[1, 2, 3]]) def test_to_adata(): # https://github.com/scverse/anndata/pull/404 adata = gen_adata((20, 10), **GEN_ADATA_DASK_ARGS) with_raw = adata[:, ::2].copy() with_raw.raw = adata.copy() # Raw doesn't do layers or varp currently # Deleting after creation so we know to rewrite the test if they are supported del adata.layers, adata.varp assert_equal(adata, with_raw.raw.to_adata()) def test_to_adata_populates_obs(): adata = gen_adata((20, 10), **GEN_ADATA_DASK_ARGS) del adata.layers, adata.uns, adata.varp adata_w_raw = adata.copy() raw = adata.copy() del raw.obs, raw.obsm, raw.obsp, raw.uns adata_w_raw.raw = raw from_raw = adata_w_raw.raw.to_adata() assert_equal(adata, from_raw) def test_no_copy(): adata = gen_adata((20, 10), X_type=np.asarray) adata.raw = adata # no .copy() herer np.log1p(adata.X, out=adata.X) assert adata.X is adata.raw.X python-anndata-0.12.0~rc1/tests/test_readwrite.py000066400000000000000000001010541500370632200220630ustar00rootroot00000000000000from __future__ import annotations import re import warnings from contextlib import contextmanager from functools import partial from importlib.util import find_spec from pathlib import Path from string import ascii_letters from typing import TYPE_CHECKING import h5py import numpy as np import pandas as pd import pytest import zarr from numba.core.errors import NumbaDeprecationWarning from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix import anndata as ad from anndata._io.specs.registry import IORegistryError from anndata._io.zarr import open_write_group from anndata.compat import ( CSArray, CSMatrix, DaskArray, ZarrArray, ZarrGroup, _read_attr, is_zarr_v2, ) from anndata.tests.helpers import as_dense_dask_array, assert_equal, gen_adata if TYPE_CHECKING: from typing import Literal HERE = Path(__file__).parent # ------------------------------------------------------------------------------ # Some test data # ------------------------------------------------------------------------------ X_sp = csr_matrix([[1, 0, 0], [3, 0, 0], [5, 6, 0], [0, 0, 0], [0, 0, 0]]) X_list = [[1, 0], [3, 0], [5, 6]] # data matrix of shape n_obs x n_vars obs_dict = dict( # annotation of observations / rows row_names=["name1", "name2", "name3"], # row annotation oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno1b=["cat1", "cat1", "cat1"], # categorical annotation with one category oanno1c=["cat1", "cat1", np.nan], # categorical annotation with a missing value oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation oanno4=[3.3, 1.1, 2.2], # float annotation ) var_dict = dict( # annotation of variables / columns vanno1=[3.1, 3.2], vanno2=["cat1", "cat1"], # categorical annotation vanno3=[2.1, 2.2], # float annotation vanno4=[3.3, 1.1], # float annotation ) uns_dict = dict( # unstructured annotation oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"], 
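    # uns3 (a bare scalar) and uns4 (a nested dict of arrays and scalars) feed the
    # nested round-trip checks in test_readwrite_kitchensink below.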
uns3="another annotation", uns4=dict( a=1, b=[2, 3], c="4", d=["some", "strings"], e=np.ones(5), f=np.int32(7), g=[1, np.float32(2.5)], ), ) @pytest.fixture(params=[{}, dict(compression="gzip")]) def dataset_kwargs(request): return request.param @pytest.fixture def rw(backing_h5ad): M, N = 100, 101 orig = gen_adata((M, N)) orig.write(backing_h5ad) curr = ad.read_h5ad(backing_h5ad) return curr, orig @pytest.fixture(params=[np.uint8, np.int32, np.int64, np.float32, np.float64]) def dtype(request): return request.param # ------------------------------------------------------------------------------ # The test functions # ------------------------------------------------------------------------------ @pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_roundtrip(typ, tmp_path, diskfmt, diskfmt2): pth1 = tmp_path / f"first.{diskfmt}" write1 = lambda x: getattr(x, f"write_{diskfmt}")(pth1) read1 = lambda: getattr(ad, f"read_{diskfmt}")(pth1) pth2 = tmp_path / f"second.{diskfmt2}" write2 = lambda x: getattr(x, f"write_{diskfmt2}")(pth2) read2 = lambda: getattr(ad, f"read_{diskfmt2}")(pth2) adata1 = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict) write1(adata1) adata2 = read1() write2(adata2) adata3 = read2() assert_equal(adata2, adata1) assert_equal(adata3, adata1) assert_equal(adata2, adata1) def test_readwrite_roundtrip_async(tmp_path): import asyncio async def _do_test(): zarr_path = tmp_path / "first.zarr" adata1 = ad.AnnData( csr_matrix(X_list), obs=obs_dict, var=var_dict, uns=uns_dict ) adata1.write_zarr(zarr_path) adata2 = ad.read_zarr(zarr_path) assert_equal(adata2, adata1) # This test ensures our file i/o never calls `asyncio.run` internally asyncio.run(_do_test()) @pytest.mark.parametrize("storage", ["h5ad", "zarr"]) @pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwargs): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) assert not isinstance(adata_src.obs["oanno1"].dtype, pd.CategoricalDtype) adata_src.raw = adata_src.copy() if storage == "h5ad": adata_src.write(backing_h5ad, **dataset_kwargs) adata_mid = ad.read_h5ad(backing_h5ad) adata_mid.write(tmp_path / "mid.h5ad", **dataset_kwargs) adata = ad.read_h5ad(tmp_path / "mid.h5ad") else: adata_src.write_zarr(tmp_path / "test_zarr_dir") adata = ad.read_zarr(tmp_path / "test_zarr_dir") assert isinstance(adata.obs["oanno1"].dtype, pd.CategoricalDtype) assert not isinstance(adata.obs["oanno2"].dtype, pd.CategoricalDtype) assert adata.obs.index.tolist() == ["name1", "name2", "name3"] assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"] assert adata.obs["oanno1c"].cat.categories.tolist() == ["cat1"] assert isinstance(adata.raw.var["vanno2"].dtype, pd.CategoricalDtype) pd.testing.assert_frame_equal(adata.obs, adata_src.obs) pd.testing.assert_frame_equal(adata.var, adata_src.var) assert_equal(adata.var.index, adata_src.var.index) assert adata.var.index.dtype == adata_src.var.index.dtype # Dev. 
Note: # either load as same type or load the convert DaskArray to array # since we tested if assigned types and loaded types are DaskArray # this would also work if they work if isinstance(adata_src.raw.X, CSArray): assert isinstance(adata.raw.X, CSMatrix) else: assert isinstance(adata_src.raw.X, type(adata.raw.X) | DaskArray) assert isinstance( adata_src.uns["uns4"]["c"], type(adata.uns["uns4"]["c"]) | DaskArray ) assert isinstance(adata_src.varm, type(adata.varm) | DaskArray) assert_equal(adata.raw.X, adata_src.raw.X) pd.testing.assert_frame_equal(adata.raw.var, adata_src.raw.var) assert isinstance(adata.uns["uns4"]["a"], int | np.integer) assert isinstance(adata_src.uns["uns4"]["a"], int | np.integer) assert_equal(adata, adata_src) @pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_maintain_X_dtype(typ, backing_h5ad): X = typ(X_list).astype("int8") adata_src = ad.AnnData(X) adata_src.write(backing_h5ad) adata = ad.read_h5ad(backing_h5ad) assert adata.X.dtype == adata_src.X.dtype def test_read_write_maintain_obsmvarm_dtypes(rw): curr, orig = rw assert type(orig.obsm["array"]) is type(curr.obsm["array"]) assert np.all(orig.obsm["array"] == curr.obsm["array"]) assert np.all(orig.varm["array"] == curr.varm["array"]) assert type(orig.obsm["sparse"]) is type(curr.obsm["sparse"]) assert not np.any((orig.obsm["sparse"] != curr.obsm["sparse"]).toarray()) assert not np.any((orig.varm["sparse"] != curr.varm["sparse"]).toarray()) assert type(orig.obsm["df"]) is type(curr.obsm["df"]) assert np.all(orig.obsm["df"] == curr.obsm["df"]) assert np.all(orig.varm["df"] == curr.varm["df"]) def test_maintain_layers(rw): curr, orig = rw assert type(orig.layers["array"]) is type(curr.layers["array"]) assert np.all(orig.layers["array"] == curr.layers["array"]) assert type(orig.layers["sparse"]) is type(curr.layers["sparse"]) assert not np.any((orig.layers["sparse"] != curr.layers["sparse"]).toarray()) @pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_h5ad_one_dimension(typ, backing_h5ad): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_one = adata_src[:, 0].copy() adata_one.write(backing_h5ad) adata = ad.read_h5ad(backing_h5ad) assert adata.shape == (3, 1) assert_equal(adata, adata_one) @pytest.mark.parametrize("typ", [np.array, csr_matrix, csr_array, as_dense_dask_array]) def test_readwrite_backed(typ, backing_h5ad): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_src.filename = backing_h5ad # change to backed mode adata_src.write() adata = ad.read_h5ad(backing_h5ad) assert isinstance(adata.obs["oanno1"].dtype, pd.CategoricalDtype) assert not isinstance(adata.obs["oanno2"].dtype, pd.CategoricalDtype) assert adata.obs.index.tolist() == ["name1", "name2", "name3"] assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"] assert_equal(adata, adata_src) @pytest.mark.parametrize( "typ", [np.array, csr_matrix, csc_matrix, csr_array, csc_array] ) def test_readwrite_equivalent_h5ad_zarr(tmp_path, typ): h5ad_pth = tmp_path / "adata.h5ad" zarr_pth = tmp_path / "adata.zarr" M, N = 100, 101 adata = gen_adata((M, N), X_type=typ) adata.raw = adata.copy() adata.write_h5ad(h5ad_pth) adata.write_zarr(zarr_pth) from_h5ad = ad.read_h5ad(h5ad_pth) from_zarr = ad.read_zarr(zarr_pth) assert_equal(from_h5ad, from_zarr, exact=True) @contextmanager def store_context(path: Path): if path.suffix == ".zarr": store = 
open_write_group(path, mode="r+") else: file = h5py.File(path, "r+") store = file["/"] yield store if "file" in locals(): file.close() @pytest.mark.parametrize( ("name", "read", "write"), [ ("adata.h5ad", ad.read_h5ad, ad.AnnData.write_h5ad), ("adata.zarr", ad.read_zarr, ad.AnnData.write_zarr), ], ) def test_read_full_io_error(tmp_path, name, read, write): adata = gen_adata((4, 3)) path = tmp_path / name write(adata, path) with store_context(path) as store: if not is_zarr_v2() and isinstance(store, ZarrGroup): # see https://github.com/zarr-developers/zarr-python/issues/2716 for the issue # with re-opening without syncing attributes explicitly # TODO: Having to fully specify attributes to not override fixed in zarr v3.0.5 # See https://github.com/zarr-developers/zarr-python/pull/2870 store["obs"].update_attributes( {**dict(store["obs"].attrs), "encoding-type": "invalid"} ) zarr.consolidate_metadata(store.store) else: store["obs"].attrs["encoding-type"] = "invalid" with pytest.raises( IORegistryError, match=r"raised while reading key 'obs'.*from /$", ) as exc_info: read(path) assert re.search( r"No read method registered for IOSpec\(encoding_type='invalid', encoding_version='0.2.0'\)", str(exc_info.value), ) @pytest.mark.parametrize( ("compression", "compression_opts"), [ (None, None), ("lzf", None), ("gzip", None), ("gzip", 8), ], ) def test_hdf5_compression_opts(tmp_path, compression, compression_opts): # https://github.com/scverse/anndata/issues/497 pth = Path(tmp_path) / "adata.h5ad" adata = gen_adata((10, 8)) kwargs = {} if compression is not None: kwargs["compression"] = compression if compression_opts is not None: kwargs["compression_opts"] = compression_opts not_compressed = [] adata.write_h5ad(pth, **kwargs) def check_compressed(key, value): if isinstance(value, h5py.Dataset) and value.shape != (): if compression is not None and value.compression != compression: not_compressed.append(key) elif ( compression_opts is not None and value.compression_opts != compression_opts ): not_compressed.append(key) with h5py.File(pth) as f: f.visititems(check_compressed) if not_compressed: sep = "\n\t" msg = ( f"These elements were not compressed correctly:{sep}" f"{sep.join(not_compressed)}" ) raise AssertionError(msg) expected = ad.read_h5ad(pth) assert_equal(adata, expected) @pytest.mark.parametrize("zarr_write_format", [2, 3]) def test_zarr_compression(tmp_path, zarr_write_format): ad.settings.zarr_write_format = zarr_write_format pth = str(Path(tmp_path) / "adata.zarr") adata = gen_adata((10, 8)) if zarr_write_format == 2 or is_zarr_v2(): from numcodecs import Blosc compressor = Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE) else: from zarr.codecs import BloscCodec # Typesize is forced to be 1 so that the codecs always match on the roundtrip. # Otherwise this value would vary depending on the datatype. # See github.com/zarr-developers/numcodecs/pull/713 for a related issue/explanation. # In practice, you would never want to set this parameter. 
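        # Note: this v3 branch uses zarr.codecs.BloscCodec, while the v2 branch above
        # uses numcodecs.Blosc; check_compressed below compares the two accordingly.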
compressor = BloscCodec( cname="zstd", clevel=3, shuffle="bitshuffle", typesize=1 ) not_compressed = [] ad.io.write_zarr(pth, adata, compressor=compressor) def check_compressed(value, key): if not isinstance(value, ZarrArray) or value.shape == (): return None (read_compressor,) = value.compressors if zarr_write_format == 2: if read_compressor != compressor: not_compressed.append(key) return None if read_compressor.to_dict() != compressor.to_dict(): not_compressed.append(key) if is_zarr_v2(): with zarr.open(str(pth), "r") as f: f.visititems(check_compressed) else: f = zarr.open(str(pth), mode="r") for key, value in f.members(max_depth=None): check_compressed(value, key) if not_compressed: sep = "\n\t" msg = ( f"These elements were not compressed correctly:{sep}" f"{sep.join(not_compressed)}" ) raise AssertionError(msg) expected = ad.read_zarr(pth) assert_equal(adata, expected) def test_changed_obs_var_names(tmp_path, diskfmt): filepth = tmp_path / f"test.{diskfmt}" orig = gen_adata((10, 10)) orig.obs_names.name = "obs" orig.var_names.name = "var" modified = orig.copy() modified.obs_names.name = "cells" modified.var_names.name = "genes" getattr(orig, f"write_{diskfmt}")(filepth) read = getattr(ad, f"read_{diskfmt}")(filepth) assert_equal(orig, read, exact=True) assert orig.var.index.name == "var" assert read.obs.index.name == "obs" with pytest.raises(AssertionError): assert_equal(orig, modified, exact=True) with pytest.raises(AssertionError): assert_equal(read, modified, exact=True) @pytest.mark.skipif(not find_spec("loompy"), reason="Loompy is not installed") @pytest.mark.parametrize("typ", [np.array, csr_matrix]) @pytest.mark.parametrize("obsm_mapping", [{}, dict(X_composed=["oanno3", "oanno4"])]) @pytest.mark.parametrize("varm_mapping", [{}, dict(X_composed2=["vanno3", "vanno4"])]) def test_readwrite_loom(typ, obsm_mapping, varm_mapping, tmp_path): X = typ(X_list) obs_dim = "meaningful_obs_dim_name" var_dim = "meaningful_var_dim_name" adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_src.obs_names.name = obs_dim adata_src.var_names.name = var_dim adata_src.obsm["X_a"] = np.zeros((adata_src.n_obs, 2)) adata_src.varm["X_b"] = np.zeros((adata_src.n_vars, 3)) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=NumbaDeprecationWarning) # loompy uses “is” for ints warnings.filterwarnings("ignore", category=SyntaxWarning) warnings.filterwarnings( "ignore", message=r"datetime.datetime.utcnow\(\) is deprecated", category=DeprecationWarning, ) adata_src.write_loom(tmp_path / "test.loom", write_obsm_varm=True) adata = ad.io.read_loom( tmp_path / "test.loom", sparse=typ is csr_matrix, obsm_mapping=obsm_mapping, obs_names=obs_dim, varm_mapping=varm_mapping, var_names=var_dim, cleanup=True, ) if isinstance(X, np.ndarray): assert np.allclose(adata.X, X) else: # TODO: this should not be necessary assert np.allclose(adata.X.toarray(), X.toarray()) assert "X_a" in adata.obsm_keys() assert adata.obsm["X_a"].shape[1] == 2 assert "X_b" in adata.varm_keys() assert adata.varm["X_b"].shape[1] == 3 # as we called with `cleanup=True` assert "oanno1b" in adata.uns["loom-obs"] assert "vanno2" in adata.uns["loom-var"] for k, v in obsm_mapping.items(): assert k in adata.obsm_keys() assert adata.obsm[k].shape[1] == len(v) for k, v in varm_mapping.items(): assert k in adata.varm_keys() assert adata.varm[k].shape[1] == len(v) assert adata.obs_names.name == obs_dim assert adata.var_names.name == var_dim @pytest.mark.skipif(not find_spec("loompy"), reason="Loompy is not 
installed") def test_readloom_deprecations(tmp_path): loom_pth = tmp_path / "test.loom" adata_src = gen_adata((5, 10), obsm_types=[np.ndarray], varm_types=[np.ndarray]) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=NumbaDeprecationWarning) warnings.filterwarnings( "ignore", message=r"datetime.datetime.utcnow\(\) is deprecated", category=DeprecationWarning, ) adata_src.write_loom(loom_pth, write_obsm_varm=True) # obsm_names -> obsm_mapping obsm_mapping = {"df": adata_src.obs.columns} with pytest.warns(FutureWarning): depr_result = ad.io.read_loom(loom_pth, obsm_names=obsm_mapping) actual_result = ad.io.read_loom(loom_pth, obsm_mapping=obsm_mapping) assert_equal(actual_result, depr_result) with pytest.raises(ValueError, match=r"ambiguous"), pytest.warns(FutureWarning): ad.io.read_loom(loom_pth, obsm_mapping=obsm_mapping, obsm_names=obsm_mapping) # varm_names -> varm_mapping varm_mapping = {"df": adata_src.var.columns} with pytest.warns(FutureWarning): depr_result = ad.io.read_loom(loom_pth, varm_names=varm_mapping) actual_result = ad.io.read_loom(loom_pth, varm_mapping=varm_mapping) assert_equal(actual_result, depr_result) with pytest.raises(ValueError, match=r"ambiguous"), pytest.warns(FutureWarning): ad.io.read_loom(loom_pth, varm_mapping=varm_mapping, varm_names=varm_mapping) # positional -> keyword with pytest.warns(FutureWarning, match=r"sparse"): depr_result = ad.io.read_loom(loom_pth, True) # noqa: FBT003 actual_result = ad.io.read_loom(loom_pth, sparse=True) assert type(depr_result.X) == type(actual_result.X) def test_read_csv(): adata = ad.io.read_csv(HERE / "data" / "adata.csv") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list def test_read_tsv_strpath(): adata = ad.io.read_text(str(HERE / "data" / "adata-comments.tsv"), "\t") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list def test_read_tsv_iter(): with (HERE / "data" / "adata-comments.tsv").open() as f: adata = ad.io.read_text(f, "\t") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_write_csv(typ, tmp_path): X = typ(X_list) adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata.write_csvs(tmp_path / "test_csv_dir", skip_data=False) @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_write_csv_view(typ, tmp_path): # https://github.com/scverse/anndata/issues/401 import hashlib def md5_path(pth: Path) -> bytes: checksum = hashlib.md5() with pth.open("rb") as f: while True: buf = f.read(checksum.block_size * 100) if not buf: break checksum.update(buf) return checksum.digest() def hash_dir_contents(dir: Path) -> dict[str, bytes]: root_pth = str(dir) return { str(k)[len(root_pth) :]: md5_path(k) for k in dir.rglob("*") if k.is_file() } adata = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict) # Test writing a view view_pth = tmp_path / "test_view_csv_dir" copy_pth = tmp_path / "test_copy_csv_dir" adata[::2].write_csvs(view_pth, skip_data=False) adata[::2].copy().write_csvs(copy_pth, skip_data=False) assert hash_dir_contents(view_pth) == hash_dir_contents(copy_pth) @pytest.mark.parametrize( ("read", "write", "name"), [ pytest.param(ad.read_h5ad, ad.io.write_h5ad, "test_empty.h5ad"), pytest.param( ad.io.read_loom, ad.io.write_loom, 
"test_empty.loom", marks=pytest.mark.xfail(reason="Loom can’t handle 0×0 matrices"), ), pytest.param(ad.read_zarr, ad.io.write_zarr, "test_empty.zarr"), ], ) def test_readwrite_empty(read, write, name, tmp_path): adata = ad.AnnData(uns=dict(empty=np.array([], dtype=float))) write(tmp_path / name, adata) ad_read = read(tmp_path / name) assert ad_read.uns["empty"].shape == (0,) def test_read_excel(): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"datetime.datetime.utcnow\(\) is deprecated", category=DeprecationWarning, ) adata = ad.io.read_excel(HERE / "data/excel.xlsx", "Sheet1", dtype=int) assert adata.X.tolist() == X_list def test_read_umi_tools(): adata = ad.io.read_umi_tools(HERE / "data/umi_tools.tsv.gz") assert adata.obs_names.name == "cell" assert adata.var_names.name == "gene" assert adata.shape == (2, 13) assert "ENSG00000070404.9" in adata.var_names assert set(adata.obs_names) == {"ACAAGG", "TTCACG"} @pytest.mark.parametrize("s2c", [True, False], ids=["str2cat", "preserve"]) def test_write_categorical( *, tmp_path: Path, diskfmt: Literal["h5ad", "zarr"], s2c: bool ) -> None: with ad.settings.override(allow_write_nullable_strings=True): adata_pth = tmp_path / f"adata.{diskfmt}" obs = dict( str=pd.array(["a", "a", "b", pd.NA, pd.NA], dtype="string"), cat=pd.Categorical(["a", "a", "b", np.nan, np.nan]), **(dict(obj=["a", "a", "b", np.nan, np.nan]) if s2c else {}), ) orig = ad.AnnData(obs=pd.DataFrame(obs)) getattr(orig, f"write_{diskfmt}")( adata_pth, convert_strings_to_categoricals=s2c ) curr: ad.AnnData = getattr(ad, f"read_{diskfmt}")(adata_pth) assert np.all(orig.obs.notna() == curr.obs.notna()) assert np.all(orig.obs.stack().dropna() == curr.obs.stack().dropna()) assert curr.obs["str"].dtype == ("category" if s2c else "string") assert curr.obs["cat"].dtype == "category" def test_write_categorical_index(tmp_path, diskfmt): adata_pth = tmp_path / f"adata.{diskfmt}" orig = ad.AnnData( uns={"df": pd.DataFrame({}, index=pd.Categorical(list("aabcd")))}, ) getattr(orig, f"write_{diskfmt}")(adata_pth) curr = getattr(ad, f"read_{diskfmt}")(adata_pth) # Also covered by next assertion, but checking this value specifically pd.testing.assert_index_equal( orig.uns["df"].index, curr.uns["df"].index, exact=True ) assert_equal(orig, curr, exact=True) @pytest.mark.parametrize("colname", ["_index"]) @pytest.mark.parametrize("attr", ["obs", "varm_df"]) def test_dataframe_reserved_columns(tmp_path, diskfmt, colname, attr): adata_pth = tmp_path / f"adata.{diskfmt}" orig = ad.AnnData( obs=pd.DataFrame(index=np.arange(5)), var=pd.DataFrame(index=np.arange(5)) ) to_write = orig.copy() if attr == "obs": to_write.obs[colname] = np.ones(5) elif attr == "varm_df": to_write.varm["df"] = pd.DataFrame( {colname: list("aabcd")}, index=to_write.var_names ) else: pytest.fail(f"Unexpected attr: {attr}") with pytest.raises(ValueError, match=rf"{colname}.*reserved name"): getattr(to_write, f"write_{diskfmt}")(adata_pth) def test_write_large_categorical(tmp_path, diskfmt): M = 30_000 N = 1000 ls = np.array(list(ascii_letters)) def random_cats(n): cats = { "".join(np.random.choice(ls, np.random.choice(range(5, 30)))) for _ in range(n) } while len(cats) < n: # For the rare case that there’s duplicates cats |= random_cats(n - len(cats)) return cats cats = np.array(sorted(random_cats(10_000))) adata_pth = tmp_path / f"adata.{diskfmt}" n_cats = len(np.unique(cats)) orig = ad.AnnData( csr_matrix(([1], ([0], [0])), shape=(M, N)), obs=dict( cat1=cats[np.random.choice(n_cats, M)], 
cat2=pd.Categorical.from_codes(np.random.choice(n_cats, M), cats), ), ) getattr(orig, f"write_{diskfmt}")(adata_pth) curr = getattr(ad, f"read_{diskfmt}")(adata_pth) assert_equal(orig, curr) def test_write_string_type_error(tmp_path, diskfmt): adata = ad.AnnData(obs=dict(obs_names=list("abc"))) adata.obs[b"c"] = np.zeros(3) # This should error, and tell you which key is at fault with pytest.raises(TypeError, match=r"writing key 'obs'") as exc_info: getattr(adata, f"write_{diskfmt}")(tmp_path / f"adata.{diskfmt}") assert "b'c'" in str(exc_info.value) @pytest.mark.parametrize( "teststring", ["teststring", np.asarray(["test1", "test2", "test3"], dtype="object")], ) @pytest.mark.parametrize("encoding", ["ascii", "utf-8"]) @pytest.mark.parametrize("length", [None, 15]) def test_hdf5_attribute_conversion(tmp_path, teststring, encoding, length): with h5py.File(tmp_path / "attributes.h5", "w") as file: dset = file.create_dataset("dset", data=np.arange(10)) attrs = dset.attrs attrs.create( "string", teststring, dtype=h5py.h5t.string_dtype(encoding=encoding, length=length), ) assert_equal(teststring, _read_attr(attrs, "string")) def test_zarr_chunk_X(tmp_path): import zarr zarr_pth = Path(tmp_path) / "test.zarr" adata = gen_adata((100, 100), X_type=np.array) adata.write_zarr(zarr_pth, chunks=(10, 10)) z = zarr.open(str(zarr_pth)) # As of v2.3.2 zarr won’t take a Path assert z["X"].chunks == (10, 10) from_zarr = ad.read_zarr(zarr_pth) assert_equal(from_zarr, adata) ################################ # Round-tripping scanpy datasets ################################ def _do_roundtrip( adata: ad.AnnData, pth: Path, diskfmt: Literal["h5ad", "zarr"] ) -> ad.AnnData: getattr(adata, f"write_{diskfmt}")(pth) return getattr(ad, f"read_{diskfmt}")(pth) @pytest.fixture def roundtrip(diskfmt): return partial(_do_roundtrip, diskfmt=diskfmt) def test_write_string_types(tmp_path, diskfmt, roundtrip): # https://github.com/scverse/anndata/issues/456 adata_pth = tmp_path / f"adata.{diskfmt}" adata = ad.AnnData( obs=pd.DataFrame( np.ones((3, 2)), columns=["a", np.str_("b")], index=["a", "b", "c"], ), ) from_disk = roundtrip(adata, adata_pth) assert_equal(adata, from_disk) @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") def test_scanpy_pbmc68k(tmp_path, diskfmt, roundtrip, diskfmt2): roundtrip2 = partial(_do_roundtrip, diskfmt=diskfmt2) filepth1 = tmp_path / f"test1.{diskfmt}" filepth2 = tmp_path / f"test2.{diskfmt2}" with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"Importing read_.* from `anndata` is deprecated" ) import scanpy as sc with warnings.catch_warnings(): warnings.simplefilter("ignore", ad.OldFormatWarning) pbmc = sc.datasets.pbmc68k_reduced() # zarr v3 can't write recarray # https://github.com/zarr-developers/zarr-python/issues/2134 if ad.settings.zarr_write_format == 3: del pbmc.uns["rank_genes_groups"]["names"] del pbmc.uns["rank_genes_groups"]["scores"] from_disk1 = roundtrip(pbmc, filepth1) # Do we read okay from_disk2 = roundtrip2(from_disk1, filepth2) # Can we round trip assert_equal(pbmc, from_disk1) # Not expected to be exact due to `nan`s assert_equal(pbmc, from_disk2) @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") def test_scanpy_krumsiek11(tmp_path, diskfmt, roundtrip): filepth = tmp_path / f"test.{diskfmt}" with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"Importing read_.* from `anndata` is deprecated" ) import scanpy as sc # TODO: this should be fixed in scanpy instead with 
pytest.warns(UserWarning, match=r"Observation names are not unique"): orig = sc.datasets.krumsiek11() del orig.uns["highlights"] # Can’t write int keys # Can’t write "string" dtype: https://github.com/scverse/anndata/issues/679 orig.obs["cell_type"] = orig.obs["cell_type"].astype(str) with pytest.warns(UserWarning, match=r"Observation names are not unique"): curr = roundtrip(orig, filepth) assert_equal(orig, curr, exact=True) # Checking if we can read legacy zarr files # TODO: Check how I should add this file to the repo @pytest.mark.filterwarnings("ignore::anndata.OldFormatWarning") @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") @pytest.mark.skipif( not Path(HERE / "data/pbmc68k_reduced_legacy.zarr.zip").is_file(), reason="File not present.", ) def test_backwards_compat_zarr(): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"Importing read_.* from `anndata` is deprecated" ) import scanpy as sc import zarr pbmc_orig = sc.datasets.pbmc68k_reduced() # Old zarr writer couldn’t do sparse arrays pbmc_orig.raw._X = pbmc_orig.raw.X.toarray() del pbmc_orig.uns["neighbors"] # Since these have moved, see PR #337 del pbmc_orig.obsp["distances"] del pbmc_orig.obsp["connectivities"] # This was written out with anndata=0.6.22.post1 zarrpth = HERE / "data/pbmc68k_reduced_legacy.zarr.zip" with zarr.ZipStore(zarrpth, mode="r") as z: pbmc_zarr = ad.read_zarr(z) assert_equal(pbmc_zarr, pbmc_orig) def test_adata_in_uns(tmp_path, diskfmt, roundtrip): pth = tmp_path / f"adatas_in_uns.{diskfmt}" orig = gen_adata((4, 5)) orig.uns["adatas"] = { "a": gen_adata((1, 2)), "b": gen_adata((12, 8)), } another_one = gen_adata((2, 5)) another_one.raw = gen_adata((2, 7)) orig.uns["adatas"]["b"].uns["another_one"] = another_one curr = roundtrip(orig, pth) assert_equal(orig, curr) @pytest.mark.parametrize( "uns_val", [ pytest.param(dict(base=None), id="dict_val"), pytest.param( pd.DataFrame(dict(col_0=["string", None])).convert_dtypes(), id="df" ), ], ) def test_none_dict_value_in_uns(diskfmt, tmp_path, roundtrip, uns_val): pth = tmp_path / f"adata_dtype.{diskfmt}" orig = ad.AnnData(np.ones((3, 4)), uns=dict(val=uns_val)) with ad.settings.override(allow_write_nullable_strings=True): curr = roundtrip(orig, pth) if isinstance(orig.uns["val"], pd.DataFrame): pd.testing.assert_frame_equal(curr.uns["val"], orig.uns["val"]) else: assert curr.uns["val"] == orig.uns["val"] def test_io_dtype(tmp_path, diskfmt, dtype, roundtrip): pth = tmp_path / f"adata_dtype.{diskfmt}" orig = ad.AnnData(np.ones((5, 8), dtype=dtype)) curr = roundtrip(orig, pth) assert curr.X.dtype == dtype def test_h5py_attr_limit(tmp_path): N = 10_000 a = ad.AnnData(np.ones((5, 10))) a.obsm["df"] = pd.DataFrame( np.ones((5, N)), index=a.obs_names, columns=[str(i) for i in range(N)] ) a.write(tmp_path / "tmp.h5ad") @pytest.mark.parametrize( "elem_key", ["obs", "var", "obsm", "varm", "layers", "obsp", "varp", "uns"] ) def test_forward_slash_key(elem_key, tmp_path): a = ad.AnnData(np.ones((10, 10))) getattr(a, elem_key)["bad/key"] = np.ones( (10,) if elem_key in ["obs", "var"] else (10, 10) ) with pytest.raises(ValueError, match="Forward slashes"): a.write_h5ad(tmp_path / "does_not_matter_the_path.h5ad") @pytest.mark.skipif( find_spec("xarray"), reason="Xarray is installed so `read_lazy` will not error" ) def test_read_lazy_import_error(): with pytest.raises(ImportError, match="xarray"): ad.experimental.read_lazy("test.zarr") 
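# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of the upstream suite):
# a minimal, self-contained version of the write/read round-trip pattern used
# throughout this file, relying only on the public anndata API
# (AnnData.write_h5ad / anndata.read_h5ad). The function name
# ``_example_roundtrip_sketch`` is hypothetical.
def _example_roundtrip_sketch(tmp_path):
    import numpy as np

    import anndata as ad

    orig = ad.AnnData(np.ones((3, 2)))      # tiny in-memory AnnData
    pth = tmp_path / "example.h5ad"
    orig.write_h5ad(pth)                    # serialize to disk
    from_disk = ad.read_h5ad(pth)           # read it back
    assert from_disk.shape == orig.shape    # the round trip preserves shape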
python-anndata-0.12.0~rc1/tests/test_repr.py000066400000000000000000000032521500370632200210460ustar00rootroot00000000000000from __future__ import annotations import re from string import ascii_letters import numpy as np import pandas as pd import pytest import anndata as ad ADATA_ATTRS = ("obs", "var", "varm", "obsm", "layers", "obsp", "varp", "uns") @pytest.fixture def adata(): return ad.AnnData( np.zeros((20, 10)), obs=pd.DataFrame( dict(obs_key=list(ascii_letters[:20])), index=[f"cell{i}" for i in range(20)], ), var=pd.DataFrame( dict(var_key=np.arange(10)), index=[f"gene{i}" for i in range(10)] ), varm=dict(varm_key=np.zeros((10, 20))), obsm=dict(obsm_key=np.zeros((20, 20))), layers=dict(layers_key=np.zeros((20, 10))), obsp=dict(obsp_key=np.zeros((20, 20))), varp=dict(varp_key=np.zeros((10, 10))), uns=dict(uns_key=dict(zip("abc", range(3)))), ) @pytest.fixture(params=ADATA_ATTRS) def adata_attr(request): return request.param def test_anndata_repr(adata): assert f"{adata.n_obs} × {adata.n_vars}" in repr(adata) for idxr in [ (slice(10, 20), 9), (12, 9), (["cell1", "cell2"], slice(10, 15)), ]: v = adata[idxr] v_repr = repr(v) assert f"{v.n_obs} × {v.n_vars}" in v_repr assert "View of" in v_repr for attr in ADATA_ATTRS: assert re.search( rf"^\s+{attr}:[^$]*{attr}_key.*$", v_repr, flags=re.MULTILINE ) def test_removal(adata, adata_attr): attr = adata_attr assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) delattr(adata, attr) assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) is None python-anndata-0.12.0~rc1/tests/test_settings.py000066400000000000000000000172241500370632200217420ustar00rootroot00000000000000from __future__ import annotations import os import re from enum import Enum import pytest from anndata._settings import ( SettingsManager, check_and_get_bool, check_and_get_environ_var, validate_bool, ) option = "test_var" default_val = False description = "My doc string!" option_2 = "test_var_2" default_val_2 = False description_2 = "My doc string 2!" option_3 = "test_var_3" default_val_3 = [1, 2] description_3 = "My doc string 3!" type_3 = list[int] def validate_int_list(val) -> bool: if not isinstance(val, list) or not [isinstance(type(e), int) for e in val]: msg = f"{val!r} is not a valid int list" raise TypeError(msg) return True @pytest.fixture def settings() -> SettingsManager: settings = SettingsManager() settings.register(option, default_val, description, validate_bool) settings.register(option_2, default_val_2, description_2, validate_bool) settings.register(option_3, default_val_3, description_3, validate_int_list, type_3) return settings def test_register_option_default(settings: SettingsManager): assert getattr(settings, option) == default_val assert description in settings.describe(option) def test_register_with_env(settings: SettingsManager, monkeypatch: pytest.MonkeyPatch): option_env = "test_var_env" default_val_env = False description_env = "My doc string env!" option_env_var = "ANNDATA_" + option_env.upper() monkeypatch.setenv(option_env_var, "1") settings.register( option_env, default_val_env, description_env, validate_bool, get_from_env=check_and_get_bool, ) assert settings.test_var_env def test_register_with_env_enum( settings: SettingsManager, monkeypatch: pytest.MonkeyPatch ): option_env = "test_var_env" default_val_env = False description_env = "My doc string env!" 
option_env_var = "ANNDATA_" + option_env.upper() monkeypatch.setenv(option_env_var, "b") class TestEnum(Enum): a = False b = True def check_and_get_bool_enum(option, default_value): return check_and_get_environ_var( "ANNDATA_" + option.upper(), "a", cast=TestEnum ).value settings.register( option_env, default_val_env, description_env, validate_bool, get_from_env=check_and_get_bool_enum, ) assert settings.test_var_env def test_register_bad_option(settings: SettingsManager): with pytest.raises(TypeError, match=r"'foo' is not a valid int list"): settings.register( "test_var_4", "foo", # should be a list of ints description_3, validate_int_list, type_3, ) def test_set_option(settings: SettingsManager): setattr(settings, option, not default_val) assert getattr(settings, option) == (not default_val) settings.reset(option) assert getattr(settings, option) == default_val def test_dir(settings: SettingsManager): assert {option, option_2, option_3} <= set(dir(settings)) assert dir(settings) == sorted(dir(settings)) def test_reset_multiple(settings: SettingsManager): setattr(settings, option, not default_val) setattr(settings, option_2, not default_val_2) settings.reset([option, option_2]) assert getattr(settings, option) == default_val assert getattr(settings, option_2) == default_val_2 def test_get_unregistered_option(settings: SettingsManager): with pytest.raises(AttributeError): setattr(settings, option + "_different", default_val) def test_override(settings: SettingsManager): with settings.override(**{option: not default_val}): assert getattr(settings, option) == (not default_val) assert getattr(settings, option) == default_val def test_override_multiple(settings: SettingsManager): with settings.override(**{option: not default_val, option_2: not default_val_2}): assert getattr(settings, option) == (not default_val) assert getattr(settings, option_2) == (not default_val_2) assert getattr(settings, option) == default_val assert getattr(settings, option_2) == default_val_2 def test_deprecation(settings: SettingsManager): warning = "This is a deprecation warning!" version = "0.1.0" settings.deprecate(option, version, warning) described_option = settings.describe(option, should_print_description=False) # first line is message, second two from deprecation default_deprecation_message = f"{option} will be removed in {version}.*" assert described_option.endswith(default_deprecation_message) described_option = ( described_option.rstrip().removesuffix(default_deprecation_message).rstrip() ) assert described_option.endswith(warning) with pytest.warns( FutureWarning, match=r"'test_var' will be removed in 0\.1\.0\. 
This is a deprecation warning!", ): assert getattr(settings, option) == default_val def test_deprecation_no_message(settings: SettingsManager): version = "0.1.0" settings.deprecate(option, version) described_option = settings.describe(option, should_print_description=False) # first line is message, second from deprecation version assert described_option.endswith(f"{option} will be removed in {version}.*") def test_option_typing(settings: SettingsManager): assert settings._registered_options[option_3].type == type_3 assert str(type_3) in settings.describe(option_3, should_print_description=False) def test_check_and_get_environ_var(monkeypatch: pytest.MonkeyPatch): option_env_var = "ANNDATA_OPTION" assert hash("foo") == check_and_get_environ_var( option_env_var, "foo", ["foo", "bar"], lambda x: hash(x) ) monkeypatch.setenv(option_env_var, "bar") assert hash("bar") == check_and_get_environ_var( option_env_var, "foo", ["foo", "bar"], lambda x: hash(x) ) monkeypatch.setenv(option_env_var, "Not foo or bar") with pytest.warns( match=f"Value '{re.escape(os.environ[option_env_var])}' is not in allowed" ): check_and_get_environ_var( option_env_var, "foo", ["foo", "bar"], lambda x: hash(x) ) assert hash("Not foo or bar") == check_and_get_environ_var( option_env_var, "foo", cast=lambda x: hash(x) ) def test_check_and_get_bool(monkeypatch: pytest.MonkeyPatch): option_env_var = f"ANNDATA_{option.upper()}" assert not check_and_get_bool(option, default_val) monkeypatch.setenv(option_env_var, "1") assert check_and_get_bool(option, default_val) monkeypatch.setenv(option_env_var, "Not 0 or 1") with pytest.warns( match=f"Value '{re.escape(os.environ[option_env_var])}' is not in allowed" ): check_and_get_bool(option, default_val) def test_check_and_get_bool_enum(monkeypatch: pytest.MonkeyPatch): option_env_var = f"ANNDATA_{option.upper()}" monkeypatch.setenv(option_env_var, "b") class TestEnum(Enum): a = False b = True assert check_and_get_environ_var(option_env_var, "a", cast=TestEnum).value @pytest.mark.parametrize( ("as_rst", "expected"), [ pytest.param( True, ( ".. attribute:: settings.test_var_3\n" " :type: list[int]\n" " :value: [1, 2]\n" "\n" " My doc string 3!" ), id="rst", ), pytest.param( False, "test_var_3: `list[int]`\n My doc string 3! 
(default: `[1, 2]`).", id="plain", ), ], ) def test_describe(*, as_rst: bool, expected: str, settings: SettingsManager): assert settings.describe("test_var_3", as_rst=as_rst) == expected python-anndata-0.12.0~rc1/tests/test_structured_arrays.py000066400000000000000000000040571500370632200236670ustar00rootroot00000000000000from __future__ import annotations from contextlib import nullcontext from itertools import combinations, product from typing import TYPE_CHECKING import numpy as np import pytest import anndata as ad from anndata import AnnData from anndata.tests.helpers import gen_vstr_recarray if TYPE_CHECKING: from typing import Literal def assert_str_contents_equal(A, B): lA = [ [str(el) if not isinstance(el, bytes) else el.decode("utf-8") for el in a] for a in A ] lB = [ [str(el) if not isinstance(el, bytes) else el.decode("utf-8") for el in b] for b in B ] assert lA == lB def test_io( tmp_path, diskfmt: Literal["zarr", "h5ad"], diskfmt2: Literal["zarr", "h5ad"] ): read1 = lambda pth: getattr(ad, f"read_{diskfmt}")(pth) write1 = lambda adata, pth: getattr(adata, f"write_{diskfmt}")(pth) read2 = lambda pth: getattr(ad, f"read_{diskfmt2}")(pth) write2 = lambda adata, pth: getattr(adata, f"write_{diskfmt2}")(pth) filepth1 = tmp_path / f"test1.{diskfmt}" filepth2 = tmp_path / f"test2.{diskfmt2}" str_recarray = gen_vstr_recarray(3, 5) u_recarray = str_recarray.astype([(n, "U10") for n in str_recarray.dtype.fields]) s_recarray = str_recarray.astype([(n, "S10") for n in str_recarray.dtype.fields]) initial = AnnData(np.zeros((3, 3))) initial.uns = dict(str_rec=str_recarray, u_rec=u_recarray, s_rec=s_recarray) with ( pytest.raises( NotImplementedError, match=r"zarr v3 does not support structured dtypes" ) if diskfmt == "zarr" and ad.settings.zarr_write_format == 3 else nullcontext() ): write1(initial, filepth1) disk_once = read1(filepth1) write2(disk_once, filepth2) disk_twice = read2(filepth2) adatas = [initial, disk_once, disk_twice] keys = [ "str_rec", "u_rec", # "s_rec" ] for (ad1, key1), (ad2, key2) in combinations(product(adatas, keys), 2): assert_str_contents_equal(ad1.uns[key1], ad2.uns[key2]) python-anndata-0.12.0~rc1/tests/test_transpose.py000066400000000000000000000056111500370632200221150ustar00rootroot00000000000000from __future__ import annotations import numpy as np import pytest from scipy import sparse import anndata as ad from anndata.tests.helpers import assert_equal, gen_adata, shares_memory def test_transpose_orig(): """ Original test for transpose, should be covered by more thorough tests below, but keeping around just in case. 
""" adata = gen_adata((5, 3)) adata.varp = {f"varp_{k}": v for k, v in adata.varp.items()} adata1 = adata.T adata1.uns["test123"] = 1 assert "test123" in adata.uns assert_equal(adata1.X.shape, (3, 5)) assert_equal(adata1.obsp.keys(), adata.varp.keys()) def _add_raw(adata, *, var_subset=slice(None)): new = adata[:, var_subset].copy() new.raw = adata.copy() return new # TODO: Cases to add: # * Views # * X is None should have the xfail marker removed # * Backed @pytest.fixture( params=[ pytest.param(gen_adata((50, 20)), id="csr_X"), pytest.param(gen_adata((50, 20), sparse.csc_matrix), id="csc_X"), pytest.param(_add_raw(gen_adata((50, 20))), id="with_raw"), pytest.param(gen_adata((20, 10), X_type=None), id="None_X"), ] ) def adata(request): return request.param def test_transpose_doesnt_copy(): adata = ad.AnnData( sparse.random(50, 20, format="csr"), layers={ "sparse": sparse.random(50, 20, format="csc"), "dense": np.random.rand(50, 20), }, obsm={ "sparse": sparse.random(50, 10, format="csc"), "dense": np.random.rand(50, 10), }, obsp={ "sparse": sparse.random(50, 50, format="csc"), "dense": np.random.rand(50, 50), }, ) t = adata.T assert shares_memory(adata.X, t.X) for k in adata.obsm: assert shares_memory(adata.obsm[k], t.varm[k]) for k in adata.obsp: assert shares_memory(adata.obsp[k], t.varp[k]) for k in adata.layers: assert shares_memory(adata.layers[k], t.layers[k]) def test_transpose_removes_raw(adata): """ Since Raw must have the same `obs_names` as AnnData, but does not have the same `var_names`, transpose doesn't really make sense for Raw. So it should just get deleted. """ assert adata.T.raw is None def test_transposed_contents(adata): t = adata.T if adata.X is not None: assert_equal(adata.X.T, t.X) else: assert adata.X is t.X is None assert_equal( {k: v.T for k, v in adata.layers.items()}, {k: v for k, v in t.layers.items()} ) assert_equal(adata.obs, t.var) assert_equal(adata.var, t.obs) assert_equal(dict(adata.obsm), dict(t.varm)) assert_equal(dict(adata.varm), dict(t.obsm)) assert_equal(dict(adata.obsp), dict(t.varp)) assert_equal(dict(adata.varp), dict(t.obsp)) assert_equal(adata.uns, t.uns) def test_transpose_roundtrip(adata): del adata.raw assert_equal(adata, adata.T.T) python-anndata-0.12.0~rc1/tests/test_uns.py000066400000000000000000000031051500370632200207000ustar00rootroot00000000000000from __future__ import annotations import numpy as np import pandas as pd import pytest from anndata import AnnData from anndata.tests.helpers import assert_equal def test_uns_color_subset(): # Tests for https://github.com/scverse/anndata/issues/257 obs = pd.DataFrame( { "cat1": pd.Categorical(list("aabcd")), "cat2": pd.Categorical(list("aabbb")), }, index=[f"cell{i}" for i in range(5)], ) # If number of categories does not match number of colors, they should be reset wrong_color_length_adata = AnnData( np.ones((5, 5)), obs=obs, uns={ "cat1_colors": ["red", "green", "blue"], "cat2_colors": ["red", "green", "blue"], }, ) v = wrong_color_length_adata[:, [0, 1]] assert "cat1_colors" not in v.uns assert "cat2_colors" not in v.uns # Otherwise the colors should still match after resetting cat1_colors = np.array(["red", "green", "blue", "yellow"], dtype=object) adata = AnnData(np.ones((5, 5)), obs=obs, uns={"cat1_colors": cat1_colors.copy()}) for color, idx in [("red", [0, 1]), ("green", [2]), ("blue", [3]), ("yellow", [4])]: v = adata[idx, :] assert len(v.uns["cat1_colors"]) == 1 assert v.uns["cat1_colors"][0] == color c = v.copy() assert_equal(v.uns, c.uns, elem_name="uns") with 
pytest.raises(AssertionError): assert_equal(adata.uns, c.uns, elem_name="uns") # But original object should not change assert list(adata.uns["cat1_colors"]) == list(cat1_colors) python-anndata-0.12.0~rc1/tests/test_utils.py000066400000000000000000000033361500370632200212410ustar00rootroot00000000000000from __future__ import annotations from itertools import repeat import pandas as pd import pytest from scipy import sparse import anndata as ad from anndata.tests.helpers import gen_typed_df from anndata.utils import make_index_unique def test_make_index_unique(): index = pd.Index(["val", "val", "val-1", "val-1"]) with pytest.warns(UserWarning): result = make_index_unique(index) expected = pd.Index(["val", "val-2", "val-1", "val-1-1"]) assert list(expected) == list(result) assert result.is_unique def test_adata_unique_indices(): m, n = (10, 20) obs_index = pd.Index(repeat("a", m), name="obs") var_index = pd.Index(repeat("b", n), name="var") adata = ad.AnnData( X=sparse.random(m, n, format="csr"), obs=gen_typed_df(m, index=obs_index), var=gen_typed_df(n, index=var_index), obsm={"df": gen_typed_df(m, index=obs_index)}, varm={"df": gen_typed_df(n, index=var_index)}, ) pd.testing.assert_index_equal(adata.obsm["df"].index, adata.obs_names) pd.testing.assert_index_equal(adata.varm["df"].index, adata.var_names) adata.var_names_make_unique() adata.obs_names_make_unique() assert adata.obs_names.name == "obs" assert adata.var_names.name == "var" assert len(pd.unique(adata.obs_names)) == m assert len(pd.unique(adata.var_names)) == n pd.testing.assert_index_equal(adata.obsm["df"].index, adata.obs_names) pd.testing.assert_index_equal(adata.varm["df"].index, adata.var_names) v = adata[:5, :5] assert v.obs_names.name == "obs" assert v.var_names.name == "var" pd.testing.assert_index_equal(v.obsm["df"].index, v.obs_names) pd.testing.assert_index_equal(v.varm["df"].index, v.var_names) python-anndata-0.12.0~rc1/tests/test_views.py000066400000000000000000000672461500370632200212500ustar00rootroot00000000000000from __future__ import annotations from contextlib import ExitStack from copy import deepcopy from operator import mul from typing import TYPE_CHECKING import joblib import numpy as np import pandas as pd import pytest from dask.base import tokenize from packaging.version import Version from scipy import sparse import anndata as ad from anndata._core.index import _normalize_index from anndata._core.views import ( ArrayView, SparseCSCArrayView, SparseCSCMatrixView, SparseCSRArrayView, SparseCSRMatrixView, ) from anndata.compat import CupyCSCMatrix, DaskArray from anndata.tests.helpers import ( BASE_MATRIX_PARAMS, CUPY_MATRIX_PARAMS, DASK_MATRIX_PARAMS, GEN_ADATA_DASK_ARGS, assert_equal, gen_adata, single_subset, slice_subset, subset_func, ) from anndata.utils import asarray if TYPE_CHECKING: from types import EllipsisType IGNORE_SPARSE_EFFICIENCY_WARNING = pytest.mark.filterwarnings( "ignore:Changing the sparsity structure:scipy.sparse.SparseEfficiencyWarning" ) # ------------------------------------------------------------------------------ # Some test data # ------------------------------------------------------------------------------ # data matrix of shape n_obs x n_vars X_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] # annotation of observations / rows obs_dict = dict( row_names=["name1", "name2", "name3"], # row annotation oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation ) # annotation of variables / columns 
var_dict = dict(vanno1=[3.1, 3.2, 3.3]) # unstructured annotation uns_dict = dict(oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"]) subset_func2 = subset_func class NDArraySubclass(np.ndarray): def view(self, dtype=None, typ=None): return self @pytest.fixture def adata(): adata = ad.AnnData(np.zeros((100, 100))) adata.obsm["o"] = np.zeros((100, 50)) adata.varm["o"] = np.zeros((100, 50)) return adata @pytest.fixture( params=BASE_MATRIX_PARAMS + DASK_MATRIX_PARAMS + CUPY_MATRIX_PARAMS, ) def matrix_type(request): return request.param @pytest.fixture(params=BASE_MATRIX_PARAMS + DASK_MATRIX_PARAMS) def matrix_type_no_gpu(request): return request.param @pytest.fixture(params=BASE_MATRIX_PARAMS) def matrix_type_base(request): return request.param @pytest.fixture(params=["layers", "obsm", "varm"]) def mapping_name(request): return request.param # ------------------------------------------------------------------------------ # The test functions # ------------------------------------------------------------------------------ def test_views(): X = np.array(X_list, dtype="int32") adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) assert adata[:, 0].is_view assert adata[:, 0].X.tolist() == np.reshape([1, 4, 7], (3, 1)).tolist() adata[:2, 0].X = [0, 0] assert adata[:, 0].X.tolist() == np.reshape([0, 0, 7], (3, 1)).tolist() adata_subset = adata[:2, [0, 1]] assert adata_subset.is_view # now transition to actual object with pytest.warns(ad.ImplicitModificationWarning, match=r".*\.obs.*"): adata_subset.obs["foo"] = range(2) assert not adata_subset.is_view assert adata_subset.obs["foo"].tolist() == list(range(2)) def test_convert_error(): adata = ad.AnnData(np.array([[1, 2], [3, 0]])) no_array = [[1], []] if Version(np.__version__) >= Version("1.24"): stack = pytest.raises(ValueError, match=r"Failed to convert") else: stack = ExitStack() stack.enter_context( pytest.warns( np.VisibleDeprecationWarning, match=r"ndarray from ragged.*is deprecated", ) ) stack.enter_context( pytest.raises(ValueError, match=r"setting an array element with a sequence") ) with stack: adata[:, 0].X = no_array def test_view_subset_shapes(): adata = gen_adata((20, 10), **GEN_ADATA_DASK_ARGS) view = adata[:, ::2] assert view.var.shape == (5, 8) assert {k: v.shape[0] for k, v in view.varm.items()} == {k: 5 for k in view.varm} def test_modify_view_component(matrix_type, mapping_name, request): adata = ad.AnnData( np.zeros((10, 10)), **{mapping_name: dict(m=matrix_type(asarray(sparse.random(10, 10))))}, ) # Fix if and when dask supports tokenizing GPU arrays # https://github.com/dask/dask/issues/6718 if isinstance(matrix_type(np.zeros((1, 1))), DaskArray): hash_func = tokenize else: hash_func = joblib.hash init_hash = hash_func(adata) subset = adata[:5, :][:, :5] assert subset.is_view m = getattr(subset, mapping_name)["m"] with pytest.warns(ad.ImplicitModificationWarning, match=rf".*\.{mapping_name}.*"): m[0, 0] = 100 assert not subset.is_view assert getattr(subset, mapping_name)["m"][0, 0] == 100 assert init_hash == hash_func(adata) if "sparse_array_dask_array" in request.node.callspec.id: msg = "sparse arrays in dask are generally expected to fail but in this case they do not" pytest.fail(msg) @pytest.mark.parametrize("attr", ["obsm", "varm"]) def test_set_obsm_key(adata, attr): init_hash = joblib.hash(adata) orig_val = getattr(adata, attr)["o"].copy() subset = adata[:50] if attr == "obsm" else adata[:, :50] assert subset.is_view with pytest.warns(ad.ImplicitModificationWarning, 
match=rf".*\.{attr}\['o'\].*"): getattr(subset, attr)["o"] = new_val = np.ones((50, 20)) assert not subset.is_view assert np.all(getattr(adata, attr)["o"] == orig_val) assert np.any(getattr(subset, attr)["o"] == new_val) assert init_hash == joblib.hash(adata) def test_set_obs(adata, subset_func): init_hash = joblib.hash(adata) subset = adata[subset_func(adata.obs_names), :] new_obs = pd.DataFrame( dict(a=np.ones(subset.n_obs), b=np.ones(subset.n_obs)), index=subset.obs_names, ) assert subset.is_view subset.obs = new_obs assert not subset.is_view assert np.all(subset.obs == new_obs) assert joblib.hash(adata) == init_hash def test_set_var(adata, subset_func): init_hash = joblib.hash(adata) subset = adata[:, subset_func(adata.var_names)] new_var = pd.DataFrame( dict(a=np.ones(subset.n_vars), b=np.ones(subset.n_vars)), index=subset.var_names, ) assert subset.is_view subset.var = new_var assert not subset.is_view assert np.all(subset.var == new_var) assert joblib.hash(adata) == init_hash def test_drop_obs_column(): adata = ad.AnnData(np.array(X_list, dtype="int32"), obs=obs_dict) subset = adata[:2] assert subset.is_view # returns a copy of obs assert subset.obs.drop(columns=["oanno1"]).columns.tolist() == ["oanno2", "oanno3"] assert subset.is_view # would modify obs, so it should actualize subset and not modify adata subset.obs.drop(columns=["oanno1"], inplace=True) assert not subset.is_view assert subset.obs.columns.tolist() == ["oanno2", "oanno3"] assert adata.obs.columns.tolist() == ["oanno1", "oanno2", "oanno3"] def test_set_obsm(adata): init_hash = joblib.hash(adata) dim0_size = np.random.randint(2, adata.shape[0] - 1) dim1_size = np.random.randint(1, 99) orig_obsm_val = adata.obsm["o"].copy() subset_idx = np.random.choice(adata.obs_names, dim0_size, replace=False) subset = adata[subset_idx, :] assert subset.is_view subset.obsm = dict(o=np.ones((dim0_size, dim1_size))) assert not subset.is_view assert np.all(orig_obsm_val == adata.obsm["o"]) # Checking for mutation assert np.all(subset.obsm["o"] == np.ones((dim0_size, dim1_size))) subset = adata[subset_idx, :] subset_hash = joblib.hash(subset) with pytest.raises(ValueError, match=r"incorrect shape"): subset.obsm = dict(o=np.ones((dim0_size + 1, dim1_size))) with pytest.raises(ValueError, match=r"incorrect shape"): subset.varm = dict(o=np.ones((dim0_size - 1, dim1_size))) assert subset_hash == joblib.hash(subset) # Only modification have been made to a view assert init_hash == joblib.hash(adata) def test_set_varm(adata): init_hash = joblib.hash(adata) dim0_size = np.random.randint(2, adata.shape[1] - 1) dim1_size = np.random.randint(1, 99) orig_varm_val = adata.varm["o"].copy() subset_idx = np.random.choice(adata.var_names, dim0_size, replace=False) subset = adata[:, subset_idx] assert subset.is_view subset.varm = dict(o=np.ones((dim0_size, dim1_size))) assert not subset.is_view assert np.all(orig_varm_val == adata.varm["o"]) # Checking for mutation assert np.all(subset.varm["o"] == np.ones((dim0_size, dim1_size))) subset = adata[:, subset_idx] subset_hash = joblib.hash(subset) with pytest.raises(ValueError, match=r"incorrect shape"): subset.varm = dict(o=np.ones((dim0_size + 1, dim1_size))) with pytest.raises(ValueError, match=r"incorrect shape"): subset.varm = dict(o=np.ones((dim0_size - 1, dim1_size))) # subset should not be changed by failed setting assert subset_hash == joblib.hash(subset) assert init_hash == joblib.hash(adata) # TODO: Determine if this is the intended behavior, # or just the behaviour we’ve had for a while 
@IGNORE_SPARSE_EFFICIENCY_WARNING def test_not_set_subset_X(matrix_type_base, subset_func): adata = ad.AnnData(matrix_type_base(asarray(sparse.random(20, 20)))) init_hash = joblib.hash(adata) orig_X_val = adata.X.copy() while True: subset_idx = slice_subset(adata.obs_names) if len(adata[subset_idx, :]) > 2: break subset = adata[subset_idx, :] subset = adata[:, subset_idx] internal_idx = _normalize_index( subset_func(np.arange(subset.X.shape[1])), subset.var_names ) assert subset.is_view with pytest.warns(ad.ImplicitModificationWarning, match=r".*X.*"): subset.X[:, internal_idx] = 1 assert not subset.is_view assert not np.any(asarray(adata.X != orig_X_val)) assert init_hash == joblib.hash(adata) assert isinstance(subset.X, type(adata.X)) # TODO: Determine if this is the intended behavior, # or just the behaviour we’ve had for a while @IGNORE_SPARSE_EFFICIENCY_WARNING def test_not_set_subset_X_dask(matrix_type_no_gpu, subset_func): adata = ad.AnnData(matrix_type_no_gpu(asarray(sparse.random(20, 20)))) init_hash = tokenize(adata) orig_X_val = adata.X.copy() while True: subset_idx = slice_subset(adata.obs_names) if len(adata[subset_idx, :]) > 2: break subset = adata[subset_idx, :] subset = adata[:, subset_idx] internal_idx = _normalize_index( subset_func(np.arange(subset.X.shape[1])), subset.var_names ) assert subset.is_view with pytest.warns(ad.ImplicitModificationWarning, match=r".*X.*"): subset.X[:, internal_idx] = 1 assert not subset.is_view assert not np.any(asarray(adata.X != orig_X_val)) assert init_hash == tokenize(adata) assert isinstance(subset.X, type(adata.X)) @IGNORE_SPARSE_EFFICIENCY_WARNING def test_set_scalar_subset_X(matrix_type, subset_func): adata = ad.AnnData(matrix_type(np.zeros((10, 10)))) orig_X_val = adata.X.copy() subset_idx = subset_func(adata.obs_names) adata_subset = adata[subset_idx, :] adata_subset.X = 1 assert adata_subset.is_view assert np.all(asarray(adata[subset_idx, :].X) == 1) if isinstance(adata.X, CupyCSCMatrix): # Comparison broken for CSC matrices # https://github.com/cupy/cupy/issues/7757 assert asarray(orig_X_val.tocsr() != adata.X.tocsr()).sum() == mul( *adata_subset.shape ) else: assert asarray(orig_X_val != adata.X).sum() == mul(*adata_subset.shape) # TODO: Use different kind of subsetting for adata and view def test_set_subset_obsm(adata, subset_func): init_hash = joblib.hash(adata) orig_obsm_val = adata.obsm["o"].copy() while True: subset_idx = slice_subset(adata.obs_names) if len(adata[subset_idx, :]) > 2: break subset = adata[subset_idx, :] internal_idx = _normalize_index( subset_func(np.arange(subset.obsm["o"].shape[0])), subset.obs_names ) assert subset.is_view with pytest.warns(ad.ImplicitModificationWarning, match=r".*obsm.*"): subset.obsm["o"][internal_idx] = 1 assert not subset.is_view assert np.all(adata.obsm["o"] == orig_obsm_val) assert init_hash == joblib.hash(adata) def test_set_subset_varm(adata, subset_func): init_hash = joblib.hash(adata) orig_varm_val = adata.varm["o"].copy() while True: subset_idx = slice_subset(adata.var_names) if (adata[:, subset_idx]).shape[1] > 2: break subset = adata[:, subset_idx] internal_idx = _normalize_index( subset_func(np.arange(subset.varm["o"].shape[0])), subset.var_names ) assert subset.is_view with pytest.warns(ad.ImplicitModificationWarning, match=r".*varm.*"): subset.varm["o"][internal_idx] = 1 assert not subset.is_view assert np.all(adata.varm["o"] == orig_varm_val) assert init_hash == joblib.hash(adata) @pytest.mark.parametrize("attr", ["obsm", "varm", "obsp", "varp", "layers"]) def 
test_view_failed_delitem(attr): adata = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) view = adata[5:7, :][:, :5] adata_hash = joblib.hash(adata) view_hash = joblib.hash(view) with pytest.raises(KeyError): getattr(view, attr).__delitem__("not a key") assert view.is_view assert adata_hash == joblib.hash(adata) assert view_hash == joblib.hash(view) @pytest.mark.parametrize("attr", ["obsm", "varm", "obsp", "varp", "layers"]) def test_view_delitem(attr): adata = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) getattr(adata, attr)["to_delete"] = np.ones((10, 10)) # Shouldn’t be a subclass, should be an ndarray assert type(getattr(adata, attr)["to_delete"]) is np.ndarray view = adata[5:7, :][:, :5] adata_hash = joblib.hash(adata) view_hash = joblib.hash(view) with pytest.warns( ad.ImplicitModificationWarning, match=rf".*\.{attr}\['to_delete'\].*" ): getattr(view, attr).__delitem__("to_delete") assert not view.is_view assert "to_delete" not in getattr(view, attr) assert "to_delete" in getattr(adata, attr) assert adata_hash == joblib.hash(adata) assert view_hash != joblib.hash(view) @pytest.mark.parametrize( "attr", ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"] ) def test_view_delattr(attr, subset_func): base = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) orig_hash = tokenize(base) subset = base[subset_func(base.obs_names), subset_func(base.var_names)] empty = ad.AnnData(obs=subset.obs[[]], var=subset.var[[]]) delattr(subset, attr) assert not subset.is_view # Should now have same value as default assert_equal(getattr(subset, attr), getattr(empty, attr)) assert orig_hash == tokenize(base) # Original should not be modified @pytest.mark.parametrize( "attr", ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"] ) def test_view_setattr_machinery(attr, subset_func, subset_func2): # Tests that setting attributes on a view doesn't mess anything up too bad adata = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) view = adata[subset_func(adata.obs_names), subset_func2(adata.var_names)] actual = view.copy() setattr(view, attr, getattr(actual, attr)) assert_equal(actual, view, exact=True) def test_layers_view(): X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) L = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) real_adata = ad.AnnData(X) real_adata.layers["L"] = L view_adata = real_adata[1:, 1:] real_hash = joblib.hash(real_adata) view_hash = joblib.hash(view_adata) assert view_adata.is_view with pytest.raises(ValueError, match=r"incorrect shape"): view_adata.layers["L2"] = L + 2 assert view_adata.is_view # Failing to set layer item makes adata not view assert real_hash == joblib.hash(real_adata) assert view_hash == joblib.hash(view_adata) with pytest.warns(ad.ImplicitModificationWarning, match=r".*layers.*"): view_adata.layers["L2"] = L[1:, 1:] + 2 assert not view_adata.is_view assert real_hash == joblib.hash(real_adata) assert view_hash != joblib.hash(view_adata) # TODO: This can be flaky. 
Make that stop def test_view_of_view(matrix_type, subset_func, subset_func2): adata = gen_adata((30, 15), X_type=matrix_type) adata.raw = adata.copy() if subset_func is single_subset: pytest.xfail("Other subset generating functions have trouble with this") var_s1 = subset_func(adata.var_names, min_size=4) var_view1 = adata[:, var_s1] adata[:, var_s1].X var_s2 = subset_func2(var_view1.var_names) var_view2 = var_view1[:, var_s2] assert var_view2._adata_ref is adata assert isinstance(var_view2.X, type(adata.X)) obs_s1 = subset_func(adata.obs_names, min_size=4) obs_view1 = adata[obs_s1, :] obs_s2 = subset_func2(obs_view1.obs_names) assert adata[obs_s1, :][:, var_s1][obs_s2, :]._adata_ref is adata assert isinstance(obs_view1.X, type(adata.X)) view_of_actual_copy = adata[:, var_s1].copy()[obs_s1, :].copy()[:, var_s2].copy() view_of_view_copy = adata[:, var_s1][obs_s1, :][:, var_s2].copy() assert_equal(view_of_actual_copy, view_of_view_copy, exact=True) assert isinstance(view_of_actual_copy.X, type(adata.X)) assert isinstance(view_of_view_copy.X, type(adata.X)) def test_view_of_view_modification(): adata = ad.AnnData(np.zeros((10, 10))) adata[0, :][:, 5:].X = np.ones(5) assert np.all(adata.X[0, 5:] == np.ones(5)) adata[[1, 2], :][:, [1, 2]].X = np.ones((2, 2)) assert np.all(adata.X[1:3, 1:3] == np.ones((2, 2))) adata.X = sparse.csr_matrix(adata.X) adata[0, :][:, 5:].X = np.ones(5) * 2 assert np.all(asarray(adata.X)[0, 5:] == np.ones(5) * 2) adata[[1, 2], :][:, [1, 2]].X = np.ones((2, 2)) * 2 assert np.all(asarray(adata.X)[1:3, 1:3] == np.ones((2, 2)) * 2) def test_double_index(subset_func, subset_func2): adata = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) obs_subset = subset_func(adata.obs_names) var_subset = subset_func2(adata.var_names) v1 = adata[obs_subset, var_subset] v2 = adata[obs_subset, :][:, var_subset] assert np.all(asarray(v1.X) == asarray(v2.X)) assert np.all(v1.obs == v2.obs) assert np.all(v1.var == v2.var) def test_view_different_type_indices(matrix_type): orig = gen_adata((30, 30), X_type=matrix_type) boolean_array_mask = np.random.randint(0, 2, 30).astype("bool") boolean_list_mask = boolean_array_mask.tolist() integer_array_mask = np.where(boolean_array_mask)[0] integer_list_mask = integer_array_mask.tolist() assert_equal(orig[integer_array_mask, :], orig[boolean_array_mask, :]) assert_equal(orig[integer_list_mask, :], orig[boolean_list_mask, :]) assert_equal(orig[integer_list_mask, :], orig[integer_array_mask, :]) assert_equal(orig[:, integer_array_mask], orig[:, boolean_array_mask]) assert_equal(orig[:, integer_list_mask], orig[:, boolean_list_mask]) assert_equal(orig[:, integer_list_mask], orig[:, integer_array_mask]) # check that X element is same independent of access assert_equal(orig[:, integer_list_mask].X, orig.X[:, integer_list_mask]) assert_equal(orig[:, boolean_list_mask].X, orig.X[:, boolean_list_mask]) assert_equal(orig[:, integer_array_mask].X, orig.X[:, integer_array_mask]) assert_equal(orig[:, integer_list_mask].X, orig.X[:, integer_list_mask]) assert_equal(orig[integer_list_mask, :].X, orig.X[integer_list_mask, :]) assert_equal(orig[boolean_list_mask, :].X, orig.X[boolean_list_mask, :]) assert_equal(orig[integer_array_mask, :].X, orig.X[integer_array_mask, :]) assert_equal(orig[integer_list_mask, :].X, orig.X[integer_list_mask, :]) def test_view_retains_ndarray_subclass(): adata = ad.AnnData(np.zeros((10, 10))) adata.obsm["foo"] = np.zeros((10, 5)).view(NDArraySubclass) view = adata[:5, :] assert isinstance(view.obsm["foo"], NDArraySubclass) assert 
view.obsm["foo"].shape == (5, 5) def test_modify_uns_in_copy(): # https://github.com/scverse/anndata/issues/571 adata = ad.AnnData(np.ones((5, 5)), uns={"parent": {"key": "value"}}) adata_copy = adata[:3].copy() adata_copy.uns["parent"]["key"] = "new_value" assert adata.uns["parent"]["key"] != adata_copy.uns["parent"]["key"] @pytest.mark.parametrize("index", [-101, 100, (slice(None), -101), (slice(None), 100)]) def test_invalid_scalar_index(adata, index): # https://github.com/scverse/anndata/issues/619 with pytest.raises(IndexError, match=r".*index.* out of range\."): _ = adata[index] @pytest.mark.parametrize("obs", [False, True]) @pytest.mark.parametrize("index", [-100, -50, -1]) def test_negative_scalar_index(*, adata, index: int, obs: bool): pos_index = index + (adata.n_obs if obs else adata.n_vars) if obs: adata_pos_subset = adata[pos_index] adata_neg_subset = adata[index] else: adata_pos_subset = adata[:, pos_index] adata_neg_subset = adata[:, index] np.testing.assert_array_equal( adata_pos_subset.obs_names, adata_neg_subset.obs_names ) np.testing.assert_array_equal( adata_pos_subset.var_names, adata_neg_subset.var_names ) def test_viewness_propagation_nan(): """Regression test for https://github.com/scverse/anndata/issues/239""" adata = ad.AnnData(np.random.random((10, 10))) adata = adata[:, [0, 2, 4]] v = adata.X.var(axis=0) assert not isinstance(v, ArrayView), type(v).mro() # this used to break v[np.isnan(v)] = 0 def test_viewness_propagation_allclose(adata): """Regression test for https://github.com/scverse/anndata/issues/191""" adata.varm["o"][4:10] = np.tile(np.nan, (10 - 4, adata.varm["o"].shape[1])) a = adata[:50].copy() b = adata[:50] # .copy() turns view to ndarray, so this was fine: assert np.allclose(a.varm["o"], b.varm["o"].copy(), equal_nan=True) # Next line triggered the mutation: assert np.allclose(a.varm["o"], b.varm["o"], equal_nan=True) # Showing that the mutation didn’t happen: assert np.allclose(a.varm["o"], b.varm["o"].copy(), equal_nan=True) spmat = [sparse.csr_matrix, sparse.csc_matrix, sparse.csr_array, sparse.csc_array] @pytest.mark.parametrize("spmat", spmat) def test_deepcopy_subset(adata, spmat: type): adata.obsp["arr"] = np.zeros((adata.n_obs, adata.n_obs)) adata.obsp["spmat"] = spmat((adata.n_obs, adata.n_obs)) adata = deepcopy(adata[:10].copy()) assert isinstance(adata.obsp["arr"], np.ndarray) assert not isinstance(adata.obsp["arr"], ArrayView) np.testing.assert_array_equal(adata.obsp["arr"].shape, (10, 10)) assert isinstance(adata.obsp["spmat"], spmat) view_type = ( SparseCSRMatrixView if spmat is sparse.csr_matrix else SparseCSCMatrixView ) view_type = SparseCSRArrayView if spmat is sparse.csr_array else SparseCSCArrayView assert not isinstance( adata.obsp["spmat"], view_type, ) np.testing.assert_array_equal(adata.obsp["spmat"].shape, (10, 10)) array_type = [ asarray, sparse.csr_matrix, sparse.csc_matrix, sparse.csr_array, sparse.csc_array, ] # https://github.com/scverse/anndata/issues/680 @pytest.mark.parametrize("array_type", array_type) @pytest.mark.parametrize("attr", ["X", "layers", "obsm", "varm", "obsp", "varp"]) def test_view_mixin_copies_data(adata, array_type: type, attr): N = 100 adata = ad.AnnData( obs=pd.DataFrame(index=np.arange(N).astype(str)), var=pd.DataFrame(index=np.arange(N).astype(str)), ) X = array_type(sparse.eye(N, N).multiply(np.arange(1, N + 1))) if attr == "X": adata.X = X else: getattr(adata, attr)["arr"] = X view = adata[:50] if attr == "X": arr_view = view.X else: arr_view = getattr(view, attr)["arr"] arr_view_copy = 
arr_view.copy() if sparse.issparse(X): assert not np.shares_memory(arr_view.indices, arr_view_copy.indices) assert not np.shares_memory(arr_view.indptr, arr_view_copy.indptr) assert not np.shares_memory(arr_view.data, arr_view_copy.data) arr_view_copy.data[0] = -5 assert not np.array_equal(arr_view_copy.data, arr_view.data) else: assert not np.shares_memory(arr_view, arr_view_copy) arr_view_copy[0, 0] = -5 assert not np.array_equal(arr_view_copy, arr_view) def test_copy_X_dtype(): adata = ad.AnnData(sparse.eye(50, dtype=np.float64, format="csr")) adata_c = adata[::2].copy() assert adata_c.X.dtype == adata.X.dtype def test_x_none(): orig = ad.AnnData(obs=pd.DataFrame(index=np.arange(50))) assert orig.shape == (50, 0) view = orig[2:4] assert view.shape == (2, 0) assert view.obs_names.tolist() == ["2", "3"] new = view.copy() assert new.shape == (2, 0) assert new.obs_names.tolist() == ["2", "3"] def test_empty_list_subset(): orig = gen_adata((10, 10)) subset = orig[:, []] assert subset.X.shape == (10, 0) assert subset.obsm["sparse"].shape == (10, 100) assert subset.varm["sparse"].shape == (0, 100) def test_dataframe_view_index_setting(): a1 = ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"obs_names": ["aa", "bb"], "property": [True, True]}, var={"var_names": ["c", "d", "e"]}, ) a2 = a1[:, ["c", "d"]] with pytest.warns( ad.ImplicitModificationWarning, match=r"Trying to modify index.*" ): a2.obs.index = a2.obs.index.map(lambda x: x[-1]) assert not isinstance(a2.obs, ad._core.views.DataFrameView) assert isinstance(a2.obs, pd.DataFrame) assert a1.obs.index.values.tolist() == ["aa", "bb"] assert a2.obs.index.values.tolist() == ["a", "b"] def test_ellipsis_index( ellipsis_index: tuple[EllipsisType | slice, ...] | EllipsisType, equivalent_ellipsis_index: tuple[slice, slice], matrix_type, ): adata = gen_adata((10, 10), X_type=matrix_type, **GEN_ADATA_DASK_ARGS) subset_ellipsis = adata[ellipsis_index] subset = adata[equivalent_ellipsis_index] assert_equal(subset_ellipsis, subset) @pytest.mark.parametrize( ("index", "expected_error"), [ ((..., 0, ...), r"only have a single ellipsis"), ((0, 0, 0), r"Received a length 3 index"), ], ids=["ellipsis-int-ellipsis", "int-int-int"], ) def test_index_3d_errors(index: tuple[int | EllipsisType, ...], expected_error: str): with pytest.raises(IndexError, match=expected_error): gen_adata((10, 10))[index] @pytest.mark.parametrize( "index", [ pytest.param(sparse.csr_matrix(np.random.random((1, 10))), id="sparse"), pytest.param([1.2, 3.4], id="list"), *( pytest.param(np.array([1.2, 2.3], dtype=dtype), id=f"ndarray-{dtype}") for dtype in [np.float32, np.float64] ), ], ) def test_index_float_sequence_raises_error(index): with pytest.raises(IndexError, match=r"has floating point values"): gen_adata((10, 10))[index] # @pytest.mark.parametrize("dim", ["obs", "var"]) # @pytest.mark.parametrize( # ("idx", "pat"), # [ # pytest.param( # [1, "cell_c"], r"Mixed type list indexers not supported", id="mixed" # ), # pytest.param( # [[1, 2], [2]], r"setting an array element with a sequence", id="nested" # ), # ], # ) # def test_subset_errors(dim, idx, pat): # orig = gen_adata((10, 10)) # with pytest.raises(ValueError, match=pat): # if dim == "obs": # orig[idx, :].X # elif dim == "var": # orig[:, idx].X python-anndata-0.12.0~rc1/tests/test_x.py000066400000000000000000000131711500370632200203460ustar00rootroot00000000000000"""Tests for the attribute .X""" from __future__ import annotations import numpy as np import pandas as pd import pytest from scipy import sparse import 
anndata as ad from anndata import AnnData from anndata._warnings import ImplicitModificationWarning from anndata.tests.helpers import assert_equal, gen_adata from anndata.utils import asarray UNLABELLED_ARRAY_TYPES = [ pytest.param(sparse.csr_matrix, id="csr"), pytest.param(sparse.csc_matrix, id="csc"), pytest.param(sparse.csr_array, id="csr_array"), pytest.param(sparse.csc_array, id="csc_array"), pytest.param(asarray, id="ndarray"), ] SINGULAR_SHAPES = [ pytest.param(shape, id=str(shape)) for shape in [(1, 10), (10, 1), (1, 1)] ] @pytest.mark.parametrize("shape", SINGULAR_SHAPES) @pytest.mark.parametrize("orig_array_type", UNLABELLED_ARRAY_TYPES) @pytest.mark.parametrize("new_array_type", UNLABELLED_ARRAY_TYPES) def test_setter_singular_dim(shape, orig_array_type, new_array_type): # https://github.com/scverse/anndata/issues/500 adata = gen_adata(shape, X_type=orig_array_type) to_assign = new_array_type(np.ones(shape)) adata.X = to_assign np.testing.assert_equal(asarray(adata.X), 1) assert isinstance(adata.X, type(to_assign)) def test_repeat_indices_view(): adata = gen_adata((10, 10), X_type=np.asarray) subset = adata[[0, 0, 1, 1], :] mat = np.array([np.ones(adata.shape[1]) * i for i in range(4)]) with pytest.warns( FutureWarning, match=r"You are attempting to set `X` to a matrix on a view which has non-unique indices", ): subset.X = mat @pytest.mark.parametrize("orig_array_type", UNLABELLED_ARRAY_TYPES) @pytest.mark.parametrize("new_array_type", UNLABELLED_ARRAY_TYPES) def test_setter_view(orig_array_type, new_array_type): adata = gen_adata((10, 10), X_type=orig_array_type) orig_X = adata.X to_assign = new_array_type(np.ones((9, 9))) if isinstance(orig_X, np.ndarray) and sparse.issparse(to_assign): # https://github.com/scverse/anndata/issues/500 pytest.xfail("Cannot set a dense array with a sparse array") view = adata[:9, :9] view.X = to_assign np.testing.assert_equal(asarray(view.X), np.ones((9, 9))) assert isinstance(view.X, type(orig_X)) ############################### # Tests for `adata.X is None` # ############################### def test_set_x_is_none(): # test setter and getter adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2], o2=[3, 4])) adata.X = None assert adata.X is None def test_del_set_equiv_X(): """Tests that `del adata.X` is equivalent to `adata.X = None`""" # test setter and deleter orig = gen_adata((10, 10)) copy = orig.copy() del orig.X copy.X = None assert orig.X is None assert_equal(orig, copy) # Check that deleting again is still fine del orig.X assert orig.X is None @pytest.mark.parametrize( ("obs", "var", "shape_expected"), [ pytest.param(dict(obs_names=["1", "2"]), None, (2, 0), id="obs"), pytest.param(None, dict(var_names=["a", "b"]), (0, 2), id="var"), pytest.param( dict(obs_names=["1", "2", "3"]), dict(var_names=["a", "b"]), (3, 2), id="both", ), ], ) def test_init_x_as_none_shape_from_obs_var(obs, var, shape_expected): adata = AnnData(None, obs, var) assert adata.X is None assert adata.shape == shape_expected def test_init_x_as_none_explicit_shape(): shape = (3, 5) adata = AnnData(None, uns=dict(test=np.array((3, 3))), shape=shape) assert adata.X is None assert adata.shape == shape @pytest.mark.parametrize("shape", SINGULAR_SHAPES + [pytest.param((5, 3), id="(5, 3)")]) def test_transpose_with_X_as_none(shape): adata = gen_adata(shape, X_type=lambda x: None) adataT = adata.transpose() assert_equal(adataT.shape, shape[::-1]) assert_equal(adataT.obsp.keys(), adata.varp.keys()) assert_equal(adataT.T, adata) def test_copy(): adata = AnnData( None, 
obs=pd.DataFrame(index=[f"cell{i:03}" for i in range(100)]), var=pd.DataFrame(index=[f"gene{i:03}" for i in range(200)]), ) assert_equal(adata.copy(), adata) def test_copy_view(): adata = AnnData( None, obs=pd.DataFrame(index=[f"cell{i:03}" for i in range(100)]), var=pd.DataFrame(index=[f"gene{i:03}" for i in range(200)]), ) v = adata[::-2, ::-2] assert_equal(v.copy(), v) ############ # IO tests # ############ def test_io_missing_X(tmp_path, diskfmt): file_pth = tmp_path / f"x_none_adata.{diskfmt}" write = lambda obj, pth: getattr(obj, f"write_{diskfmt}")(pth) read = lambda pth: getattr(ad, f"read_{diskfmt}")(pth) adata = gen_adata((20, 30)) del adata.X write(adata, file_pth) from_disk = read(file_pth) assert_equal(from_disk, adata) def test_set_dense_x_view_from_sparse(): x = np.zeros((100, 30)) x1 = np.ones((100, 30)) orig = ad.AnnData(x) view = orig[:30] with ( pytest.warns( UserWarning, match=r"Trying to set a dense array with a sparse array on a view", ), pytest.warns( ImplicitModificationWarning, match=r"Modifying `X` on a view results" ), ): view.X = sparse.csr_matrix(x1[:30]) assert_equal(view.X, x1[:30]) assert_equal(orig.X[:30], x1[:30]) # change propagates through assert_equal(orig.X[30:], x[30:]) # change propagates through def test_fail_on_non_csr_csc_matrix(): X = sparse.eye(100, format="coo") with pytest.raises( ValueError, match=r"Only CSR and CSC.*", ): ad.AnnData(X=X)