python-anndata-0.12.0~rc1/.cirun.yml
runners:
- name: aws-gpu-runner
cloud: aws
instance_type: g4dn.xlarge
machine_image: ami-067a4ba2816407ee9
region: eu-north-1
preemptible:
- true
- false
labels:
- cirun-aws-gpu
python-anndata-0.12.0~rc1/.codecov.yml
# Based on pydata/xarray
codecov:
require_ci_to_pass: false
coverage:
status:
project:
default:
# Require 80% coverage
target: 80
changes: false
comment:
layout: "diff, flags, files"
behavior: once
require_base: false
python-anndata-0.12.0~rc1/.editorconfig
root = true
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
max_line_length = 88
indent_size = 4
indent_style = space
[*.{yml,yaml}]
indent_size = 2
python-anndata-0.12.0~rc1/.github/ISSUE_TEMPLATE/bug-report.yml
name: Bug report
description: anndata doesn’t do what it should? Please help us fix it!
#title: ...
labels:
- Bug 🐛
- Triage 🩺
#assignees: []
body:
- type: checkboxes
id: terms
attributes:
label: Please make sure these conditions are met
# description: ...
options:
- label: I have checked that this issue has not already been reported.
required: true
- label: I have confirmed this bug exists on the latest version of anndata.
required: true
- label: (optional) I have confirmed this bug exists on the master branch of anndata.
required: false
- type: markdown
attributes:
value: |
**Note**: Please read [this guide][] detailing how to provide the necessary information for us to reproduce your bug.
[this guide]: https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports
- type: textarea
id: Report
attributes:
label: Report
description: |
Describe the bug you encountered, and what you were trying to do. Please use [github markdown][] features for readability.
[github markdown]: https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax
value: |
Code:
```python
```
Traceback:
```pytb
```
validations:
required: true
- type: textarea
id: versions
attributes:
label: Versions
description: |
Which version of anndata and other related software you used.
Please install `session-info2`, run the following command in a notebook,
click the “Copy as Markdown” button,
then paste the results into the text box below.
```python
In[1]: import anndata, session_info2; session_info2.session_info(dependencies=True)
```
Alternatively, run this in a console:
```python
>>> import session_info2; print(session_info2.session_info(dependencies=True)._repr_mimebundle_()["text/markdown"])
```
render: python
validations:
required: true
python-anndata-0.12.0~rc1/.github/ISSUE_TEMPLATE/config.yml
blank_issues_enabled: true
contact_links:
- name: Scverse Community Forum
url: https://discourse.scverse.org/
about: If you have questions about “How to do X”, please ask them here.
- name: Blank issue
url: https://github.com/scverse/anndata/issues/new
about: For things that don't quite fit elsewhere. Please note that other templates should be used in most cases – this is mainly for use by the developers.
python-anndata-0.12.0~rc1/.github/ISSUE_TEMPLATE/enhancement-request.yml
name: Enhancement request
description: Anything you’d like to see in anndata?
#title: ...
labels:
- enhancement
- Triage 🩺
#assignees: []
body:
- type: textarea
id: description
attributes:
label: |
Please describe your wishes and possible alternatives to achieve the desired result.
validations:
required: true
python-anndata-0.12.0~rc1/.github/PULL_REQUEST_TEMPLATE.md
- [ ] Closes #
- [ ] Tests added
- [ ] Release note added (or unnecessary)
python-anndata-0.12.0~rc1/.github/workflows/benchmark.yml
name: Benchmark
on:
push:
branches: [main, "[0-9]+.[0-9]+.x"]
pull_request:
branches: [main]
env:
FORCE_COLOR: "1"
defaults:
run:
shell: bash -el {0}
jobs:
benchmark:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python: ["3.12"]
os: [ubuntu-latest]
env:
OS: ${{ matrix.os }}
PYTHON: ${{ matrix.python }}
ASV_DIR: "./benchmarks"
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- run: git fetch origin main:main
if: ${{ github.ref_name != 'main' }}
# Errors on main branch
- uses: mamba-org/setup-micromamba@v2
with:
environment-name: asv
cache-environment: true
# Deps documented in https://asv.readthedocs.io/en/latest/installing.html
# libmambapy upper bound: https://github.com/airspeed-velocity/asv/issues/1438
create-args: >-
python=${{ matrix.python }}
asv
libmambapy<2
conda-build
- name: Cache datasets
uses: actions/cache@v4
with:
path: |
~/.cache
key: benchmark-state-${{ hashFiles('benchmarks/**') }}
- name: Quick benchmark run
working-directory: ${{ env.ASV_DIR }}
run: |
asv machine --yes
asv run --quick --show-stderr --verbose
python-anndata-0.12.0~rc1/.github/workflows/check-pr-milestoned.yml
name: Pull Request Validation
on:
pull_request:
branches:
- main
- master
types:
# milestone changes
- milestoned
- demilestoned
# label changes for “no milestone”
- labeled
- unlabeled
# initial check
- opened
- edited
- reopened
# code change (e.g. this workflow)
- synchronize
env:
LABELS: ${{ join(github.event.pull_request.labels.*.name, '|') }}
jobs:
check-milestone:
name: "Triage: Check Milestone"
runs-on: ubuntu-latest
steps:
- name: Check if merging isn’t blocked
uses: flying-sheep/check@v1
with:
success: ${{ ! contains(env.LABELS, 'DON’T MERGE') }}
- name: Check if a milestone is necessary and exists
uses: flying-sheep/check@v1
with:
success: ${{ github.event.pull_request.milestone != null || contains(env.LABELS, 'no milestone') }}
python-anndata-0.12.0~rc1/.github/workflows/close-stale.yml
name: "Close stale issues"
on:
schedule:
- cron: "0 2 * * *"
workflow_dispatch:
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v5
with:
days-before-issue-stale: -1 # We don't want to mark issues as stale in this action
days-before-issue-close: 14
days-before-pr-close: -1 # don't close PRs
days-before-pr-stale: -1 # don't mark PRs as stale
stale-issue-label: stale
any-of-labels: "needs info"
debug-only: true # enable dry-run, remove when we know from the logs it's working.
python-anndata-0.12.0~rc1/.github/workflows/codespell.yml
---
name: Codespell
on:
push:
branches: [main]
pull_request:
branches: [main]
permissions:
contents: read
jobs:
codespell:
name: Check for spelling errors
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
filter: blob:none
- uses: codespell-project/actions-codespell@v2
python-anndata-0.12.0~rc1/.github/workflows/label-stale.yml
name: "Label stale issues"
on:
schedule:
- cron: "30 1 * * 1,2,3,4,5"
workflow_dispatch:
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v5
with:
days-before-issue-stale: 60
days-before-pr-stale: -1 # We don't want to mark PRs as stale
days-before-close: -1 # We don't want to close issues in this action
stale-issue-label: stale
exempt-issue-labels: pinned,enhancement
stale-issue-message: |
This issue has been automatically marked as stale because it has not had recent activity.
Please add a comment if you want to keep the issue open. Thank you for your contributions!
debug-only: false # set to `true` to enable dry-run
python-anndata-0.12.0~rc1/.github/workflows/publish.yml
name: Publish Python Package
on:
release:
types: [published]
jobs:
publish:
runs-on: ubuntu-latest
environment: pypi
permissions:
id-token: write # to authenticate as Trusted Publisher to pypi.org
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
filter: blob:none
- uses: actions/setup-python@v5
with:
python-version: "3.x"
cache: "pip"
- run: pip install build
- run: python -m build
- uses: pypa/gh-action-pypi-publish@release/v1
python-anndata-0.12.0~rc1/.github/workflows/test-cpu.yml
name: CI
on:
push:
branches:
- main
- "[0-9]+.[0-9]+.x"
pull_request:
env:
PYTEST_ADDOPTS: "-v --color=yes"
FORCE_COLOR: "1"
defaults:
run:
shell: bash -el {0}
# Cancel the job if new commits are pushed
# https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
pytest:
runs-on: ubuntu-latest
strategy:
matrix:
include:
- python-version: '3.13'
test-type: coverage
- python-version: '3.11'
test-type: standard
- python-version: '3.13'
dependencies-version: pre-release
test-type: strict-warning
- python-version: '3.11'
dependencies-version: minimum
test-type: coverage
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
filter: blob:none
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install UV
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
cache-dependency-glob: pyproject.toml
- name: Install dependencies
if: matrix.dependencies-version == null
run: uv pip install --system --compile "anndata[dev,test-full] @ ." -c ci/constraints.txt
- name: Install minimum dependencies
if: matrix.dependencies-version == 'minimum'
run: |
uv pip install --system --compile tomli packaging
deps=$(python3 ci/scripts/min-deps.py pyproject.toml --extra dev test)
uv pip install --system --compile $deps "anndata @ ."
- name: Install dependencies release candidates
if: matrix.dependencies-version == 'pre-release'
run: uv pip install -v --system --compile --pre "anndata[dev,test-full] @ ." -c ci/constraints.txt
- name: Display installed versions
run: uv pip list
- name: Run Pytest
if: matrix.test-type == 'standard'
run: pytest -n auto
- name: Run Pytest (coverage)
if: matrix.test-type == 'coverage'
run: coverage run -m pytest -n auto --cov --cov-report=xml
- name: Run Pytest (treat warnings as errors)
if: matrix.test-type == 'strict-warning'
run: pytest --strict-warnings -n auto
- uses: codecov/codecov-action@v4
if: matrix.test-type == 'coverage'
with:
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true
files: test-data/coverage.xml
check-build:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
filter: blob:none
- name: Set up Python 3.13
uses: actions/setup-python@v5
with:
python-version: '3.13'
- name: Install build tools and requirements
run: |
python -m pip install --upgrade pip
pip install build twine
- name: Display installed versions
run: pip list
- name: Build & Twine check
run: |
python -m build --sdist --wheel .
twine check dist/*
- name: Check runtime version
run: |
pip install dist/*.whl
python -c 'import anndata; print(anndata.__version__)'
python-anndata-0.12.0~rc1/.github/workflows/test-gpu.yml
name: AWS GPU
on:
push:
branches: [main, "[0-9]+.[0-9]+.x"]
pull_request:
types:
- labeled
- opened
- synchronize
env:
PYTEST_ADDOPTS: "-v --color=yes"
FORCE_COLOR: "1"
defaults:
run:
shell: bash -el {0}
# Cancel the job if new commits are pushed
# https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
# There are two jobs:
# 1. `check` determines if the second job (`test`) will be run (through a job dependency).
# 2. `test` runs on an AWS runner and executes the GPU tests.
jobs:
# If the `skip-gpu-ci` label is set, this job is skipped, and consequently the `test` job too.
# If the `run-gpu-ci` label is set or we reacted to a `push` event, this job succeeds (and `test` is run).
# If neither is set, this job fails, `test` is skipped, and the whole workflow fails.
check:
name: "Triage: Check if GPU tests are allowed to run"
if: (!contains(github.event.pull_request.labels.*.name, 'skip-gpu-ci'))
runs-on: ubuntu-latest
steps:
- uses: flying-sheep/check@v1
with:
success: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'run-gpu-ci') }}
# If `check` wasn’t skipped or failed, start an AWS runner and run the GPU tests on it.
test:
name: GPU Tests
needs: check
runs-on: "cirun-aws-gpu--${{ github.run_id }}"
# Setting a timeout of 30 minutes, as the AWS costs money
# At time of writing, a typical run takes about 5 minutes
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
filter: blob:none
- name: Nvidia SMI sanity check
run: nvidia-smi
- name: Install yq
run: |
sudo snap install yq
- name: Extract max Python version from classifiers
run: |
classifiers=$(yq .project.classifiers pyproject.toml -oy | grep --only-matching --perl-regexp '(?<=Python :: )(\d\.\d+)')
max_version=$(echo "$classifiers" | sort -V | tail -1)
echo "max_python_version=$max_version" >> $GITHUB_ENV
- name: Install Python
uses: actions/setup-python@v5
with:
# https://github.com/cupy/cupy/issues/8651 cupy does not support python3.13 yet
python-version: "3.12"
- name: Install UV
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
cache-dependency-glob: pyproject.toml
- name: Install AnnData
run: uv pip install --system -e ".[dev,test,cu12]" -c ci/constraints.txt
- name: Env list
run: pip list
- name: Run test
run: coverage run -m pytest -m gpu -n auto --cov --cov-report=xml
- uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true
files: test-data/coverage.xml
- name: Remove 'run-gpu-ci' Label
if: always()
uses: actions-ecosystem/action-remove-labels@v1
with:
labels: "run-gpu-ci"
github_token: ${{ secrets.GITHUB_TOKEN }}
python-anndata-0.12.0~rc1/.gitignore
# Temp files
.DS_Store
*~
# Caches for compiled and downloaded files
__pycache__/
/*cache/
/node_modules/
/data/
/venv/
# Distribution / packaging
/dist/
/ci/min-deps.txt
/requirements*.lock
/.python-version
# Test results (nunit/junit) and coverage
/test-data/
/*coverage*
# jupyter
.ipynb_checkpoints
# docs
/docs/generated/
/docs/_build/
# IDEs
/.idea/
# Benchmark
.asv
benchmark/benchmarks/data
benchmarks/benchmarks/data
benchmarks/pkgs
python-anndata-0.12.0~rc1/.gitmodules
[submodule "docs/tutorials/notebooks"]
path = docs/tutorials/notebooks
url = https://github.com/scverse/anndata-tutorials
python-anndata-0.12.0~rc1/.pre-commit-config.yaml
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.4
hooks:
- id: ruff
args: ["--fix"]
- id: ruff-format
- repo: https://github.com/biomejs/pre-commit
rev: v1.9.4
hooks:
- id: biome-format
- repo: https://github.com/ComPWA/taplo-pre-commit
rev: v0.9.3
hooks:
- id: taplo-format
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-added-large-files
- id: check-case-conflict
- id: check-toml
- id: check-yaml
- id: check-merge-conflict
- id: detect-private-key
- id: no-commit-to-branch
args: ["--branch=main"]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
additional_dependencies:
- tomli
python-anndata-0.12.0~rc1/.prettierignore
*.md
python-anndata-0.12.0~rc1/.prettierrc.yaml
overrides:
# JSON with comments and trailing commas
- files: benchmarks/asv.conf.json
options:
parser: json5
quoteProps: preserve
singleQuote: false
python-anndata-0.12.0~rc1/.readthedocs.yml
version: 2
build:
os: ubuntu-20.04
tools:
python: "3.13"
jobs:
post_checkout:
# unshallow so version can be derived from tag
- git fetch --unshallow || true
pre_build:
# run towncrier to preview the next version’s release notes
- ( find docs/release-notes -regex '[^.]+[.][^.]+.md' | grep -q . ) && towncrier build --keep || true
sphinx:
configuration: docs/conf.py
fail_on_warning: true # do not change or you will be fired
python:
install:
- method: pip
path: .
extra_requirements:
- doc
submodules:
include:
- "docs/tutorials/notebooks"
recursive: true
python-anndata-0.12.0~rc1/.taplo.toml
[formatting]
array_auto_collapse = false
column_width = 120
compact_arrays = false
indent_string = ' '
python-anndata-0.12.0~rc1/.vscode/launch.json
{
"version": "0.2.0",
"configurations": [
{
"name": "Python: Build Docs",
"type": "debugpy",
"request": "launch",
"module": "sphinx",
"args": ["-M", "html", ".", "_build"],
"cwd": "${workspaceFolder}/docs",
"console": "internalConsole",
"justMyCode": false,
},
{
"name": "Python: Debug Test",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"purpose": ["debug-test"],
"console": "internalConsole",
"justMyCode": false,
"env": { "PYTEST_ADDOPTS": "--color=yes" },
"presentation": { "hidden": true },
},
],
}
python-anndata-0.12.0~rc1/.vscode/settings.json
{
"[python][toml][json][jsonc]": {
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.fixAll": "explicit",
},
},
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff",
},
"[toml]": {
"editor.defaultFormatter": "tamasfe.even-better-toml",
},
"[json][jsonc]": {
"editor.defaultFormatter": "biomejs.biome",
},
"python.analysis.typeCheckingMode": "basic",
"python.testing.pytestEnabled": true,
"python.testing.pytestArgs": [
"--color=yes",
"-vv",
"--strict-warnings",
//"-nauto",
],
"python.terminal.activateEnvironment": true,
}
python-anndata-0.12.0~rc1/LICENSE
BSD 3-Clause License
Copyright (c) 2025, scverse®
Copyright (c) 2017-2018, P. Angerer, F. Alexander Wolf, Theis Lab
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
python-anndata-0.12.0~rc1/README.md
[](https://github.com/scverse/anndata/actions)
[](https://anaconda.org/conda-forge/anndata)
[](https://codecov.io/gh/scverse/anndata)
[](https://anndata.readthedocs.io)
[](https://pypi.org/project/anndata)
[](https://pepy.tech/project/anndata)
[](https://pepy.tech/project/anndata)
[](https://github.com/scverse/anndata/stargazers)
[](http://numfocus.org)
# anndata - Annotated data
anndata is a Python package for handling annotated data matrices in memory and on disk, positioned between pandas and xarray. anndata offers a broad range of computationally efficient features including, among others, sparse data support, lazy operations, and a PyTorch interface.
- Discuss development on [GitHub](https://github.com/scverse/anndata).
- Read the [documentation](https://anndata.readthedocs.io).
- Ask questions on the [scverse Discourse](https://discourse.scverse.org).
- Install via `pip install anndata` or `conda install anndata -c conda-forge`.
- See [Scanpy's documentation](https://scanpy.readthedocs.io/) for usage related to single cell data. anndata was initially built for Scanpy.
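After installing, a minimal sketch of creating an annotated data matrix looks like this (the toy data below is made up for illustration and is not part of the original README):

```python
import anndata as ad
import numpy as np
import pandas as pd

# Toy data: 3 observations (cells) × 2 variables (genes).
X = np.random.default_rng(0).poisson(1.0, size=(3, 2)).astype(np.float32)
adata = ad.AnnData(
    X,
    obs=pd.DataFrame(index=[f"cell_{i}" for i in range(3)]),
    var=pd.DataFrame(index=[f"gene_{j}" for j in range(2)]),
)
print(adata)  # AnnData object with n_obs × n_vars = 3 × 2
```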
[//]: # (numfocus-fiscal-sponsor-attribution)
anndata is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
## Public API
Our public API is documented in the [API section][] of these docs.
We cannot guarantee the stability of our internal APIs, whether it's the location of a function, its arguments, or something else.
In other words, we do not officially support (or encourage users to do) something like `from anndata._core import AnnData` as `_core` is both not documented and contains a [leading underscore][].
However, we are aware that [many users do use these internal APIs][] and thus encourage them to [open an issue][] or migrate to the public API.
That is, if something is missing from our public API as documented, for example a feature you wish to be exported publicly, please open an issue.
[api section]: https://anndata.readthedocs.io/en/stable/api.html
[leading underscore]: https://peps.python.org/pep-0008/#public-and-internal-interfaces
[many users do use these internal APIs]: https://github.com/search?q=%22anndata._io%22&type=code
[open an issue]: https://github.com/scverse/anndata/issues/new/choose
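As an illustrative sketch of this distinction, prefer the documented entry points over anything underscored:

```python
import anndata as ad

adata = ad.AnnData()  # public, documented entry point

# Discouraged: relies on an undocumented internal module that may move or change.
# from anndata._core import AnnData
```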
## Citation
If you use `anndata` in your work, please cite the `anndata` publication as follows:
> **anndata: Annotated data**
>
> Isaac Virshup, Sergei Rybakov, Fabian J. Theis, Philipp Angerer, F. Alexander Wolf
>
> _JOSS_ 2024 Sep 16. doi: [10.21105/joss.04371](https://doi.org/10.21105/joss.04371).
You can cite the scverse publication as follows:
> **The scverse project provides a computational ecosystem for single-cell omics data analysis**
>
> Isaac Virshup, Danila Bredikhin, Lukas Heumos, Giovanni Palla, Gregor Sturm, Adam Gayoso, Ilia Kats, Mikaela Koutrouli, Scverse Community, Bonnie Berger, Dana Pe’er, Aviv Regev, Sarah A. Teichmann, Francesca Finotello, F. Alexander Wolf, Nir Yosef, Oliver Stegle & Fabian J. Theis
>
> _Nat Biotechnol._ 2023 Apr 10. doi: [10.1038/s41587-023-01733-8](https://doi.org/10.1038/s41587-023-01733-8).
python-anndata-0.12.0~rc1/benchmarks/README.md
# AnnData Benchmarks
This repo contains some work-in-progress benchmarks for [AnnData](https://github.com/theislab/anndata) using [asv](https://asv.readthedocs.io).
## Setup
I definitely recommend reading through the asv docs. Currently, this assumes the benchmark suite can reach the `anndata` repo via the path `../anndata`. Beyond that, all you'll need to do is create a [machine file](https://asv.readthedocs.io/en/stable/commands.html#asv-machine) for your system and make sure `anndata`'s dependencies are installable via `conda`.
### Data
Data needs to be retrieved for these benchmarks. It can be downloaded using the `fetch_datasets.py` script.
Note that the `h5ad` format has changed since its inception. While the `anndata` package maintains backwards compatibility, older versions of `anndata` will not be able to read files written by more recent versions. To get around this for the benchmarks, datasets must be readable by every benchmarked version, which can require a setup function that creates the AnnData object.
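A hypothetical sketch of such a setup (the shapes and benchmark names here are made up; only the pattern of building the object inside `setup` matters):

```python
import numpy as np
from scipy import sparse
from anndata import AnnData


class CopySuite:
    """Example asv suite whose data is generated rather than read from disk."""

    def setup(self):
        # Building the AnnData here means every benchmarked anndata version
        # constructs its own object, so no on-disk format compatibility is needed.
        X = sparse.random(1_000, 500, density=0.01, format="csr", dtype=np.float32)
        self.adata = AnnData(X)

    def time_copy(self):
        self.adata.copy()
```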
## Usage
### Running the benchmarks:
To run benchmarks for a particular commit: `asv run {commit} --steps 1 -b`
To run benchmarks for a range of commits: `asv run {commit1}..{commit2}`
You can filter which benchmarks are run with the `-b {pattern}` flag.
### Accessing the benchmarks
You can see which benchmarks you've already run using `asv show`. If you don't specify a commit, it lists the commits that have results; if you specify a commit, it shows the results for that commit. For example:
```bash
$ asv show -b "views"
Commits with results:
Machine : mimir.mobility.unimelb.net.au
Environment: conda-py3.7-h5py-memory_profiler-natsort-numpy-pandas-scipy
61eb5bb7
e9ccfc33
22f12994
0ebe187e
```
```bash
$ asv show -b "views" 0ebe187e
Commit: 0ebe187e
views.SubsetMemorySuite.track_repeated_subset_memratio [mimir.mobility.unimelb.net.au/conda-py3.7-h5py-memory_profiler-natsort-numpy-pandas-scipy]
ok
======= ======= ========== ============ ===================== ====================== ======================
-- index_kind
--------------------------------------- -------------------------------------------------------------------
n_obs n_var attr_set subset_dim intarray boolarray slice
======= ======= ========== ============ ===================== ====================== ======================
100 100 X-csr obs 2.84 1.7916666666666667 0.5
100 100 X-csr var 2.5357142857142856 1.8695652173913044 0.5652173913043478
100 100 X-dense obs 3.1739130434782608 1.6538461538461537 0.6
...
```
You can compare two commits with `asv compare`
```bash
$ asv compare e9ccfc 0ebe187e
All benchmarks:
before after ratio
[e9ccfc33] [0ebe187e]
- 2.16 1.7916666666666667 0.83 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'boolarray')
+ 2.533333333333333 2.84 1.12 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'intarray')
- 1.1923076923076923 0.5 0.42 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'obs', 'slice')
1.9615384615384615 1.8695652173913044 0.95 views.SubsetMemorySuite.track_repeated_subset_memratio(100, 100, 'X-csr', 'var', 'boolarray')
```
### View in the browser:
You can view the benchmarks in the browser with `asv publish` followed by `asv preview`. If you want to include benchmarks of a local branch, I think you'll have to add that branch to the `"branches"` list in `asv.conf.json`.
python-anndata-0.12.0~rc1/benchmarks/asv.conf.json
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,
// The name of the project being benchmarked
"project": "anndata",
// The project's homepage
"project_url": "https://anndata.readthedocs.io/",
// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "../../anndata",
// The Python project's subdirectory in your repo. If missing or
// the empty string, the project is assumed to be located at the root
// of the repository.
// "repo_subdir": "",
// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
//
// "install_command": ["python -mpip install {wheel_file}"],
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
"build_command": [
"python -m pip install build",
"python -m build --wheel -o {build_cache_dir} {build_dir}",
],
// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["main"], // for git
// "branches": ["default"], // for mercurial
// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
"dvcs": "git",
// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "mamba",
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
//"install_timeout": 600,
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/theislab/anndata/commit/",
// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["2.7", "3.6"],
// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge", "defaults"],
// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
// list or empty string indicates to just test against the default
// (latest) version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPi, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed via
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
//
"matrix": {
"numpy": [""],
// "scipy": ["1.2", ""],
"scipy": [""],
"h5py": [""],
"natsort": [""],
"pandas": [""],
"memory_profiler": [""],
"zarr": [""],
"pytoml": [""],
"pytest": [""],
"pooch": [""],
// "scanpy": [""],
// "psutil": [""]
},
// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "six": null}, // don't run without six on conda
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "numpy": "1.8"},
// // additional env if run on windows+conda
// {"platform": "win32", "environment_type": "mamba", "python": "2.7", "libpython": ""},
// ],
// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
// "benchmark_dir": "benchmarks",
// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",
// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",
// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html",
// The number of characters to retain in the commit hashes.
// "hash_length": 8,
// `asv` will cache results of the recent builds in each
// environment, making them faster to install next time. This is
// the number of builds to keep, per environment.
// "build_cache_size": 2,
// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// },
// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// },
}
python-anndata-0.12.0~rc1/benchmarks/benchmarks/__init__.py
python-anndata-0.12.0~rc1/benchmarks/benchmarks/anndata.py
from __future__ import annotations
import tracemalloc
import numpy as np
from .utils import gen_adata
class GarbargeCollectionSuite:
runs = 10
# custom because `memory_profiler` is a line-by-line profiler (also: https://github.com/pythonprofilers/memory_profiler/issues/402)
def track_peakmem_garbage_collection(self, *_):
def display_top(snapshot, key_type="lineno"):
snapshot = snapshot.filter_traces(
(
tracemalloc.Filter(
inclusive=False,
filename_pattern="",
),
tracemalloc.Filter(
inclusive=False,
filename_pattern="",
),
)
)
top_stats = snapshot.statistics(key_type)
total = sum(stat.size for stat in top_stats)
return total
total = np.zeros(self.runs)
tracemalloc.start()
for i in range(self.runs):
data = gen_adata(10000, 10000, "X-csc") # noqa: F841
snapshot = tracemalloc.take_snapshot()
total[i] = display_top(snapshot)
tracemalloc.stop()
return max(total)
python-anndata-0.12.0~rc1/benchmarks/benchmarks/readwrite.py
"""
This module will benchmark io of AnnData objects
Things to test:
* Read time, write time
* Peak memory during io
* File sizes
Parameterized by:
* What method is being used
* What data is being included
* Size of data being used
Also interesting:
* io for views
* io for backed objects
* Reading dense as sparse, writing sparse as dense
"""
from __future__ import annotations
import sys
import tempfile
from pathlib import Path
import numpy as np
import pooch
from memory_profiler import memory_usage
# from . import datasets
import anndata
from .utils import get_actualsize, get_peak_mem, sedate
PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad"
# PBMC_3K_PATH = Path(__file__).parent / "data/pbmc3k_raw.h5ad"
# PBMC_REDUCED_PATH = Path(__file__).parent / "10x_pbmc68k_reduced.h5ad"
# BM_43K_CSR_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells.h5ad"
# BM_43K_CSC_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells_CSC.h5ad"
# class ZarrReadSuite:
# params = []
# param_names = ["input_url"]
# def setup(self, input_url):
# self.filepath = pooch.retrieve(url=input_url, known_hash=None)
# def time_read_full(self, input_url):
# anndata.read_zarr(self.filepath)
# def peakmem_read_full(self, input_url):
# anndata.read_zarr(self.filepath)
# def mem_readfull_object(self, input_url):
# return anndata.read_zarr(self.filepath)
# def track_read_full_memratio(self, input_url):
# mem_recording = memory_usage(
# (sedate(anndata.read_zarr, 0.005), (self.filepath,)), interval=0.001
# )
# adata = anndata.read_zarr(self.filepath)
# base_size = mem_recording[-1] - mem_recording[0]
# print(np.max(mem_recording) - np.min(mem_recording))
# print(base_size)
# return (np.max(mem_recording) - np.min(mem_recording)) / base_size
# def peakmem_read_backed(self, input_url):
# anndata.read_zarr(self.filepath, backed="r")
# def mem_read_backed_object(self, input_url):
# return anndata.read_zarr(self.filepath, backed="r")
class H5ADInMemorySizeSuite:
_urls = dict(pbmc3k=PBMC_3K_URL)
params = _urls.keys()
param_names = ["input_data"]
def setup(self, input_data: str):
self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
def track_in_memory_size(self, *_):
adata = anndata.read_h5ad(self.filepath)
adata_size = sys.getsizeof(adata)
return adata_size
def track_actual_in_memory_size(self, *_):
adata = anndata.read_h5ad(self.filepath)
adata_size = get_actualsize(adata)
return adata_size
class H5ADReadSuite:
_urls = dict(pbmc3k=PBMC_3K_URL)
params = _urls.keys()
param_names = ["input_data"]
def setup(self, input_data: str):
self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
def time_read_full(self, *_):
anndata.read_h5ad(self.filepath)
def peakmem_read_full(self, *_):
anndata.read_h5ad(self.filepath)
def mem_readfull_object(self, *_):
return anndata.read_h5ad(self.filepath)
def track_read_full_memratio(self, *_):
mem_recording = memory_usage(
(sedate(anndata.read_h5ad, 0.005), (self.filepath,)), interval=0.001
)
# adata = anndata.read_h5ad(self.filepath)
base_size = mem_recording[-1] - mem_recording[0]
print(np.max(mem_recording) - np.min(mem_recording))
print(base_size)
return (np.max(mem_recording) - np.min(mem_recording)) / base_size
def peakmem_read_backed(self, *_):
anndata.read_h5ad(self.filepath, backed="r")
# causes benchmarking to break from: https://github.com/pympler/pympler/issues/151
# def mem_read_backed_object(self, *_):
# return anndata.read_h5ad(self.filepath, backed="r")
class H5ADWriteSuite:
_urls = dict(pbmc3k=PBMC_3K_URL)
params = _urls.keys()
param_names = ["input_data"]
def setup(self, input_data: str):
mem_recording, adata = memory_usage(
(
sedate(anndata.read_h5ad, 0.005),
(pooch.retrieve(self._urls[input_data], known_hash=None),),
),
retval=True,
interval=0.001,
)
self.adata = adata
self.base_size = mem_recording[-1] - mem_recording[0]
self.tmpdir = tempfile.TemporaryDirectory()
self.writepth = Path(self.tmpdir.name) / "out.h5ad"
def teardown(self, *_):
self.tmpdir.cleanup()
def time_write_full(self, *_):
self.adata.write_h5ad(self.writepth, compression=None)
def peakmem_write_full(self, *_):
self.adata.write_h5ad(self.writepth)
def track_peakmem_write_full(self, *_):
return get_peak_mem((sedate(self.adata.write_h5ad), (self.writepth,)))
def time_write_compressed(self, *_):
self.adata.write_h5ad(self.writepth, compression="gzip")
def peakmem_write_compressed(self, *_):
self.adata.write_h5ad(self.writepth, compression="gzip")
def track_peakmem_write_compressed(self, *_):
return get_peak_mem(
(sedate(self.adata.write_h5ad), (self.writepth,), {"compression": "gzip"})
)
class H5ADBackedWriteSuite(H5ADWriteSuite):
_urls = dict(pbmc3k=PBMC_3K_URL)
params = _urls.keys()
param_names = ["input_data"]
def setup(self, input_data):
mem_recording, adata = memory_usage(
(
sedate(anndata.read_h5ad, 0.005),
(pooch.retrieve(self._urls[input_data], known_hash=None),),
{"backed": "r"},
),
retval=True,
interval=0.001,
)
self.adata = adata
self.base_size = mem_recording[-1] - mem_recording[0]
self.tmpdir = tempfile.TemporaryDirectory()
self.writepth = Path(self.tmpdir.name) / "out.h5ad"
python-anndata-0.12.0~rc1/benchmarks/benchmarks/sparse_dataset.py
from __future__ import annotations
import numpy as np
import zarr
from scipy import sparse
from anndata import AnnData
from anndata._core.sparse_dataset import sparse_dataset
from anndata._io.specs import write_elem
def make_alternating_mask(n):
mask_alternating = np.ones(10_000, dtype=bool)
for i in range(0, 10_000, n):
mask_alternating[i] = False
return mask_alternating
class SparseCSRContiguousSlice:
_slices = {
"0:1000": slice(0, 1000),
"0:9000": slice(0, 9000),
":9000:-1": slice(None, 9000, -1),
"::-2": slice(None, None, 2),
"array": np.array([0, 5000, 9999]),
"arange": np.arange(0, 1000),
"first": 0,
"alternating": make_alternating_mask(10),
}
params = (
[
(10_000, 10_000),
# (10_000, 500)
],
_slices.keys(),
)
param_names = ["shape", "slice"]
def setup(self, shape: tuple[int, int], slice: str):
X = sparse.random(
*shape, density=0.01, format="csr", random_state=np.random.default_rng(42)
)
self.slice = self._slices[slice]
g = zarr.group()
write_elem(g, "X", X)
self.x = sparse_dataset(g["X"])
self.adata = AnnData(self.x)
def time_getitem(self, *_):
self.x[self.slice]
def peakmem_getitem(self, *_):
self.x[self.slice]
def time_getitem_adata(self, *_):
self.adata[self.slice]
def peakmem_getitem_adata(self, *_):
self.adata[self.slice]
python-anndata-0.12.0~rc1/benchmarks/benchmarks/utils.py
from __future__ import annotations
import gc
import sys
from string import ascii_lowercase
from time import sleep
import numpy as np
import pandas as pd
from memory_profiler import memory_usage
from scipy import sparse
from anndata import AnnData
def get_actualsize(input_obj):
"""Using Python Garbage Collector to calculate the size of all elements attached to an object"""
memory_size = 0
ids = set()
objects = [input_obj]
while objects:
new = []
for obj in objects:
if id(obj) not in ids:
ids.add(id(obj))
memory_size += sys.getsizeof(obj)
new.append(obj)
objects = gc.get_referents(*new)
return memory_size
def get_anndata_memsize(adata):
recording = memory_usage(
(sedate(adata.copy, naplength=0.005), (adata,)), interval=0.001
)
diff = recording[-1] - recording[0]
return diff
def get_peak_mem(op, interval=0.001):
recording = memory_usage(op, interval=interval)
return np.max(recording) - np.min(recording)
def sedate(func, naplength=0.05):
"""Make a function sleepy, so we can sample the start and end state."""
def wrapped_function(*args, **kwargs):
sleep(naplength)
val = func(*args, **kwargs)
sleep(naplength)
return val
return wrapped_function
# TODO: Factor out the time it takes to generate these
def gen_indexer(adata, dim, index_kind, ratio):
dimnames = ("obs", "var")
index_kinds = {"slice", "intarray", "boolarray", "strarray"}
if index_kind not in index_kinds:
msg = f"Argument 'index_kind' must be one of {index_kinds}. Was {index_kind}."
raise ValueError(msg)
axis = dimnames.index(dim)
subset = [slice(None), slice(None)]
axis_size = adata.shape[axis]
if index_kind == "slice":
subset[axis] = slice(0, int(np.round(axis_size * ratio)))
elif index_kind == "intarray":
subset[axis] = np.random.choice(
np.arange(axis_size), int(np.round(axis_size * ratio)), replace=False
)
subset[axis].sort()
elif index_kind == "boolarray":
pos = np.random.choice(
np.arange(axis_size), int(np.round(axis_size * ratio)), replace=False
)
a = np.zeros(axis_size, dtype=bool)
a[pos] = True
subset[axis] = a
elif index_kind == "strarray":
subset[axis] = np.random.choice(
getattr(adata, dim).index, int(np.round(axis_size * ratio)), replace=False
)
else:
raise ValueError()
return tuple(subset)
def take_view(adata, *, dim, index_kind, ratio=0.5, nviews=100):
subset = gen_indexer(adata, dim, index_kind, ratio)
views = []
for i in range(nviews):
views.append(adata[subset])
def take_repeated_view(adata, *, dim, index_kind, ratio=0.9, nviews=10):
v = adata
views = []
for i in range(nviews):
subset = gen_indexer(v, dim, index_kind, ratio)
v = v[subset]
views.append(v)
def gen_adata(n_obs, n_var, attr_set):
if "X-csr" in attr_set:
X = sparse.random(n_obs, n_var, density=0.1, format="csr")
elif "X-dense" in attr_set:
X = sparse.random(n_obs, n_var, density=0.1, format="csr")
X = X.toarray()
else:
# TODO: There's probably a better way to do this
X = sparse.random(n_obs, n_var, density=0, format="csr")
adata = AnnData(X)
if "obs,var" in attr_set:
adata.obs = pd.DataFrame(
{k: np.random.randint(0, 100, n_obs) for k in ascii_lowercase},
index=[f"cell{i}" for i in range(n_obs)],
)
adata.var = pd.DataFrame(
{k: np.random.randint(0, 100, n_var) for k in ascii_lowercase},
index=[f"gene{i}" for i in range(n_var)],
)
return adata
python-anndata-0.12.0~rc1/biome.jsonc
{
"$schema": "https://biomejs.dev/schemas/1.9.4/schema.json",
"formatter": { "useEditorconfig": true },
"overrides": [
{
"include": ["./.vscode/*.json", "**/*.jsonc", "**/asv.conf.json"],
"json": {
"formatter": {
"trailingCommas": "all",
},
"parser": {
"allowComments": true,
"allowTrailingCommas": true,
},
},
},
],
}
python-anndata-0.12.0~rc1/ci/constraints.txt
numba>=0.56
python-anndata-0.12.0~rc1/ci/scripts/min-deps.py
#!/usr/bin/env python3
# /// script
# dependencies = [
# "tomli; python_version < '3.11'",
# "packaging",
# ]
# ///
from __future__ import annotations
import argparse
import sys
import tomllib
from collections import deque
from contextlib import ExitStack
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING
from packaging.requirements import Requirement
from packaging.version import Version
if TYPE_CHECKING:
from collections.abc import Generator, Iterable, Sequence
from collections.abc import Set as AbstractSet
from typing import Any, Self
def min_dep(req: Requirement) -> Requirement:
"""
Given a requirement, return the minimum version specifier.
Example
-------
>>> min_dep(Requirement("numpy>=1.0"))
<Requirement('numpy==1.0.*')>
>>> min_dep(Requirement("numpy<3.0"))
<Requirement('numpy<3.0')>
"""
req_name = req.name
if req.extras:
req_name = f"{req_name}[{','.join(req.extras)}]"
filter_specs = [
spec for spec in req.specifier if spec.operator in {"==", "~=", ">=", ">"}
]
if not filter_specs:
# TODO: handle markers
return Requirement(f"{req_name}{req.specifier}")
min_version = Version("0.0.0.a1")
for spec in filter_specs:
if spec.operator in {">", ">=", "~="}:
min_version = max(min_version, Version(spec.version))
elif spec.operator == "==":
min_version = Version(spec.version)
return Requirement(f"{req_name}=={min_version}.*")
def extract_min_deps(
dependencies: Iterable[Requirement], *, pyproject
) -> Generator[Requirement, None, None]:
dependencies = deque(dependencies) # We'll be mutating this
project_name = pyproject["project"]["name"]
while len(dependencies) > 0:
req = dependencies.pop()
# If we are referring to other optional dependency lists, resolve them
if req.name == project_name:
assert req.extras, (
f"Project included itself as dependency, without specifying extras: {req}"
)
for extra in req.extras:
extra_deps = pyproject["project"]["optional-dependencies"][extra]
dependencies += map(Requirement, extra_deps)
else:
yield min_dep(req)
class Args(argparse.Namespace):
"""\
Parse a pyproject.toml file and output a list of minimum dependencies.
Output is optimized for `[uv] pip install` (see `-o`/`--output` for details).
"""
_path: Path
output: Path | None
_extras: list[str]
_all_extras: bool
@classmethod
def parse(cls, argv: Sequence[str] | None = None) -> Self:
return cls.parser().parse_args(argv, cls())
@classmethod
def parser(cls) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
prog="min-deps",
description=cls.__doc__,
usage="pip install `python min-deps.py pyproject.toml`",
)
parser.add_argument(
"_path",
metavar="pyproject.toml",
type=Path,
help="Path to pyproject.toml to parse minimum dependencies from",
)
parser.add_argument(
"--extras",
dest="_extras",
metavar="EXTRA",
type=str,
nargs="*",
default=(),
help="extras to install",
)
parser.add_argument(
"--all-extras",
dest="_all_extras",
action="store_true",
help="get all extras",
)
parser.add_argument(
*("--output", "-o"),
metavar="FILE",
type=Path,
default=None,
help=(
"output file (default: stdout). "
"Without this option, output is space-separated for direct passing to `pip install`. "
"With this option, output written to a file newline-separated file usable as `requirements.txt` or `constraints.txt`."
),
)
return parser
@cached_property
def pyproject(self) -> dict[str, Any]:
return tomllib.loads(self._path.read_text())
@cached_property
def extras(self) -> AbstractSet[str]:
if self._extras:
if self._all_extras:
sys.exit("Cannot specify both --extras and --all-extras")
return dict.fromkeys(self._extras).keys()
if not self._all_extras:
return set()
return self.pyproject["project"]["optional-dependencies"].keys()
def main(argv: Sequence[str] | None = None) -> None:
args = Args.parse(argv)
project_name = args.pyproject["project"]["name"]
deps = [
*map(Requirement, args.pyproject["project"]["dependencies"]),
*(Requirement(f"{project_name}[{extra}]") for extra in args.extras),
]
min_deps = extract_min_deps(deps, pyproject=args.pyproject)
sep = "\n" if args.output else " "
with ExitStack() as stack:
f = stack.enter_context(args.output.open("w")) if args.output else sys.stdout
print(sep.join(map(str, min_deps)), file=f)
if __name__ == "__main__":
main()
python-anndata-0.12.0~rc1/ci/scripts/towncrier_automation.py
#!/usr/bin/env python3
# /// script
# dependencies = [ "towncrier", "packaging" ]
# ///
from __future__ import annotations
import argparse
import re
import subprocess
from functools import cache
from typing import TYPE_CHECKING
from packaging.version import Version
if TYPE_CHECKING:
from collections.abc import Sequence
class BumpVersion(Version):
def __init__(self, version: str) -> None:
super().__init__(version)
if len(self.release) != 3:
msg = f"{version} must contain major, minor, and patch version."
raise argparse.ArgumentTypeError(msg)
base_branch = get_base_branch()
patch_branch_pattern = re.compile(r"\d+\.\d+\.x")
if self.micro != 0 and not patch_branch_pattern.fullmatch(base_branch):
msg = (
f"{version} is a patch release, but "
f"you are trying to release from a non-patch release branch: {base_branch}."
)
raise argparse.ArgumentTypeError(msg)
if self.micro == 0 and base_branch != "main":
msg = (
f"{version} is a minor or major release, "
f"but you are trying to release not from main: {base_branch}."
)
raise argparse.ArgumentTypeError(msg)
class Args(argparse.Namespace):
version: BumpVersion
dry_run: bool
def parse_args(argv: Sequence[str] | None = None) -> Args:
parser = argparse.ArgumentParser(
prog="towncrier-automation",
description=(
"This script runs towncrier for a given version, "
"creates a branch off of the current one, "
"and then creates a PR into the original branch with the changes. "
"The PR will be backported to main if the current branch is not main."
),
)
parser.add_argument(
"version",
type=BumpVersion,
help=(
"The new version for the release must have at least three parts, like `major.minor.patch` and no `major.minor`. "
"It can have a suffix like `major.minor.patch.dev0` or `major.minor.0rc1`."
),
)
parser.add_argument(
"--dry-run",
help="Whether or not to dry-run the actual creation of the pull request",
action="store_true",
)
args = parser.parse_args(argv, Args())
return args
def main(argv: Sequence[str] | None = None) -> None:
args = parse_args(argv)
# Run towncrier
subprocess.run(
["towncrier", "build", f"--version={args.version}", "--yes"], check=True
)
# Check if we are on the main branch to know if we need to backport
base_branch = get_base_branch()
pr_description = "" if base_branch == "main" else "@meeseeksdev backport to main"
branch_name = f"release_notes_{args.version}"
# Create a new branch + commit
subprocess.run(["git", "switch", "-c", branch_name], check=True)
subprocess.run(["git", "add", "docs/release-notes"], check=True)
pr_title = f"(chore): generate {args.version} release notes"
subprocess.run(["git", "commit", "-m", pr_title], check=True)
# push
if not args.dry_run:
subprocess.run(
["git", "push", "--set-upstream", "origin", branch_name], check=True
)
else:
print("Dry run, not pushing")
# Create a PR
subprocess.run(
[
"gh",
"pr",
"create",
f"--base={base_branch}",
f"--title={pr_title}",
f"--body={pr_description}",
"--label=skip-gpu-ci",
*(["--label=no milestone"] if base_branch == "main" else []),
*(["--dry-run"] if args.dry_run else []),
],
check=True,
)
# Enable auto-merge
if not args.dry_run:
subprocess.run(
["gh", "pr", "merge", branch_name, "--auto", "--squash"], check=True
)
else:
print("Dry run, not merging")
@cache
def get_base_branch():
return subprocess.run(
["git", "rev-parse", "--abbrev-ref", "HEAD"],
capture_output=True,
text=True,
check=True,
).stdout.strip()
if __name__ == "__main__":
main()
python-anndata-0.12.0~rc1/docs/Makefile
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python3 -msphinx
SPHINXPROJ = Scanpy
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
clean:
rm -r "$(BUILDDIR)"
rm -r "generated"
find . -name anndata.*.rst -delete
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
python-anndata-0.12.0~rc1/docs/_key_contributors.rst
.. sidebar:: Key Contributors
* `Isaac Virshup`_: anndata >= 0.7, diverse contributions
* Sergei Rybakov: diverse contributions
* `Alex Wolf`_: initial conception/development
* Philipp Angerer: initial conception/development, software quality
.. _contributions graph: https://github.com/scverse/anndata/graphs/contributors
.. _Isaac Virshup: https://twitter.com/ivirshup
.. _Alex Wolf: https://twitter.com/falexwolf
python-anndata-0.12.0~rc1/docs/_static/ 0000775 0000000 0000000 00000000000 15003706322 0017677 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/docs/_static/img/ 0000775 0000000 0000000 00000000000 15003706322 0020453 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/docs/_static/img/anndata_schema.svg 0000664 0000000 0000000 00000212114 15003706322 0024123 0 ustar 00root root 0000000 0000000
python-anndata-0.12.0~rc1/docs/_templates/ 0000775 0000000 0000000 00000000000 15003706322 0020406 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/docs/_templates/autosummary/ 0000775 0000000 0000000 00000000000 15003706322 0022774 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/docs/_templates/autosummary/class.rst 0000664 0000000 0000000 00000001213 15003706322 0024630 0 ustar 00root root 0000000 0000000 {{ fullname | escape | underline}}
.. currentmodule:: {{ module }}
.. add toctree option to make autodoc generate the pages
.. autoclass:: {{ objname }}
{% block attributes %}
{% if attributes %}
.. rubric:: Attributes
.. autosummary::
:toctree: .
{% for item in attributes %}
~{{ name }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
{% block methods %}
{% if methods %}
.. rubric:: Methods
.. autosummary::
:toctree: .
{% for item in methods %}
{%- if item != '__init__' %}
~{{ name }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}
{% endblock %}
python-anndata-0.12.0~rc1/docs/api.md 0000664 0000000 0000000 00000007645 15003706322 0017360 0 ustar 00root root 0000000 0000000 # API
```{eval-rst}
.. module:: anndata
```
The central class:
```{eval-rst}
.. autosummary::
:toctree: generated/
AnnData
```
(combining-api)=
## Combining
Combining {class}`AnnData` objects.
See also the section on concatenation.
```{eval-rst}
.. autosummary::
:toctree: generated/
concat
```
(reading-api)=
## Reading
Reading anndata’s native formats `.h5ad` and `zarr`.
```{eval-rst}
.. autosummary::
:toctree: generated/
io.read_h5ad
io.read_zarr
```
Reading individual portions ({attr}`~AnnData.obs`, {attr}`~AnnData.varm` etc.) of the {class}`AnnData` object.
```{eval-rst}
.. autosummary::
:toctree: generated/
io.read_elem
io.sparse_dataset
```
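For example, these functions can be used to pull a single element out of a store, or to access a sparse matrix on disk without loading it fully into memory (a minimal sketch; the file path and element names are illustrative):
```python
import h5py
from anndata.io import read_elem, sparse_dataset

with h5py.File("example.h5ad", "r") as f:
    obs = read_elem(f["obs"])     # load the obs dataframe into memory
    X = sparse_dataset(f["X"])    # lazily wrap the on-disk sparse matrix
    first_rows = X[:100]          # only this slice is read from disk
```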
Reading file formats that cannot represent all aspects of {class}`AnnData` objects.
```{tip}
You might have more success by assembling the {class}`AnnData` object yourself from the individual parts.
```
```{eval-rst}
.. autosummary::
:toctree: generated/
io.read_csv
io.read_excel
io.read_hdf
io.read_loom
io.read_mtx
io.read_text
io.read_umi_tools
```
(writing-api)=
## Writing
Writing a complete {class}`AnnData` object to disk in anndata’s native formats `.h5ad` and `zarr`.
(These functions are also exported as {func}`io.write_h5ad` and {func}`io.write_zarr`.)
```{eval-rst}
.. autosummary::
:toctree: generated/
AnnData.write_h5ad
AnnData.write_zarr
..
.. autosummary::
:toctree: generated/
io.write_h5ad
io.write_zarr
.. toctree::
:hidden:
generated/anndata.io.write_h5ad
generated/anndata.io.write_zarr
```
Writing individual portions ({attr}`~AnnData.obs`, {attr}`~AnnData.varm` etc.) of the {class}`AnnData` object.
```{eval-rst}
.. autosummary::
:toctree: generated/
io.write_elem
```
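For example, {func}`~anndata.io.write_elem` makes it possible to update a single element of an existing file without rewriting the whole object (a minimal sketch; the file path and the added column are illustrative):
```python
import h5py
from anndata.io import read_elem, write_elem

with h5py.File("example.h5ad", "r+") as f:
    obs = read_elem(f["obs"])
    obs["flagged"] = False     # hypothetical new column
    write_elem(f, "obs", obs)  # replaces the obs element on disk
```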
Writing formats that cannot represent all aspects of {class}`AnnData` objects.
```{eval-rst}
.. autosummary::
:toctree: generated/
AnnData.write_csvs
AnnData.write_loom
```
(experimental-api)=
## Experimental API
```{warning}
APIs in the experimental module are currently in development and subject to change at any time.
```
Two classes for working with batched access to collections of many {class}`AnnData` objects or `.h5ad` files.
In particular, for pytorch-based models.
```{eval-rst}
.. autosummary::
:toctree: generated/
experimental.AnnCollection
experimental.AnnLoader
```
Out of core concatenation
```{eval-rst}
.. autosummary::
:toctree: generated/
experimental.concat_on_disk
```
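For example (a minimal sketch; the file names are illustrative), several `.h5ad` files can be concatenated into one on-disk object without loading them fully into memory:
```python
from anndata.experimental import concat_on_disk

# Concatenate the inputs along obs and write the result directly to disk
concat_on_disk(["batch1.h5ad", "batch2.h5ad"], "combined.h5ad")
```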
Low level methods for reading and writing elements of an {class}`AnnData` object to a store:
```{eval-rst}
.. autosummary::
:toctree: generated/
experimental.read_elem_lazy
experimental.read_lazy
```
Utilities for customizing the IO process:
```{eval-rst}
.. autosummary::
:toctree: generated/
experimental.read_dispatched
experimental.write_dispatched
```
Types used by the former:
```{eval-rst}
.. autosummary::
:toctree: generated/
experimental.IOSpec
experimental.Read
experimental.Write
experimental.ReadCallback
experimental.WriteCallback
experimental.StorageType
experimental.backed._lazy_arrays.MaskedArray
experimental.backed._lazy_arrays.CategoricalArray
experimental.backed._xarray.Dataset2D
```
(extensions-api)=
## Extensions
```{eval-rst}
.. autosummary::
:toctree: generated/
register_anndata_namespace
```
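A rough sketch of registering an extension namespace (assuming the decorator follows the pandas-style accessor pattern, with the accessor class receiving the `AnnData` instance; the namespace name and method here are illustrative):
```python
import anndata as ad

@ad.register_anndata_namespace("greet")  # illustrative namespace name
class GreetAccessor:
    def __init__(self, adata: ad.AnnData) -> None:
        self._adata = adata

    def hello(self) -> str:
        return f"AnnData with {self._adata.n_obs} observations"

# After registration, `adata.greet.hello()` is available on AnnData objects.
```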
Types used by the former:
```{eval-rst}
.. autosummary::
:toctree: generated/
types.ExtensionNamespace
```
(errors-api)=
## Errors and warnings
```{eval-rst}
.. autosummary::
:toctree: generated/
ImplicitModificationWarning
```
(settings-api)=
## Settings
```{eval-rst}
.. autosummary::
:toctree: generated/
settings
settings.override
```
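For example, a setting can be changed temporarily via the `override` context manager (a minimal sketch; the option name is illustrative – see `anndata.settings` for the available options):
```python
import anndata as ad

# Temporarily change a setting inside a block; it is restored afterwards.
# (the option name below is illustrative)
with ad.settings.override(remove_unused_categories=False):
    ...
```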
(types-api)=
## Custom Types/Classes for Readable/Writeable Elements
```{eval-rst}
.. autosummary::
:toctree: generated/
abc.CSRDataset
abc.CSCDataset
typing.Index
typing.AxisStorable
typing.RWAble
```
python-anndata-0.12.0~rc1/docs/benchmark-read-write.ipynb 0000664 0000000 0000000 00000007057 15003706322 0023320 0 ustar 00root root 0000000 0000000 {
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simple benchmarks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we perform simple benchmarks to demonstrate basic performance."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import scanpy as sc\n",
"\n",
"import anndata as ad"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"adata = sc.datasets.pbmc3k()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 2700 × 32738\n",
" var: 'gene_ids'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reading & writing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let us start by writing & reading anndata's native HDF5 file format: `.h5ad`:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 93.9 ms, sys: 17.4 ms, total: 111 ms\n",
"Wall time: 118 ms\n"
]
}
],
"source": [
"%%time\n",
"adata.write(\"test.h5ad\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 51.2 ms, sys: 13.3 ms, total: 64.5 ms\n",
"Wall time: 64.1 ms\n"
]
}
],
"source": [
"%%time\n",
"adata = ad.read_h5ad(\"test.h5ad\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see that reading and writing is much faster than for loom files. The efficiency gain here is due to explicit storage of the sparse matrix structure."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.82 s, sys: 457 ms, total: 3.27 s\n",
"Wall time: 3.31 s\n"
]
}
],
"source": [
"%%time\n",
"adata.write_loom(\"test.loom\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.05 s, sys: 221 ms, total: 1.28 s\n",
"Wall time: 1.28 s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/alexwolf/repos/anndata/anndata/_core/anndata.py:120: ImplicitModificationWarning: Transforming to str index.\n",
" warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
]
}
],
"source": [
"%%time\n",
"adata = ad.io.read_loom(\"test.loom\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
python-anndata-0.12.0~rc1/docs/benchmarks.md 0000664 0000000 0000000 00000000375 15003706322 0020715 0 ustar 00root root 0000000 0000000 # Benchmarks
Computational operations in anndata are consistently benchmarked [here](https://github.com/ivirshup/anndata-benchmarks).
Below follows a simple benchmark showing read-write efficiency.
```{toctree}
:maxdepth: 1
benchmark-read-write
```
python-anndata-0.12.0~rc1/docs/concatenation.rst 0000664 0000000 0000000 00000031103 15003706322 0021626 0 ustar 00root root 0000000 0000000 Concatenation
=============
With :func:`~anndata.concat`, :class:`~anndata.AnnData` objects can be combined via a composition of two operations: concatenation and merging.
* Concatenation is when we keep all sub elements of each object, and stack these elements in an ordered way.
* Merging is combining a set of collections into one resulting collection which contains elements from the objects.
.. note::
This function borrows from similar functions in pandas_ and xarray_. Arguments used to control concatenation are modeled after :func:`pandas.concat`, while strategies for merging are inspired by :func:`xarray.merge`'s `compat` argument.
.. _pandas: https://pandas.pydata.org
.. _xarray: http://xarray.pydata.org
Concatenation
-------------
Let's start off with an example:
>>> import scanpy as sc, anndata as ad, numpy as np, pandas as pd
>>> from scipy import sparse
>>> from anndata import AnnData
>>> pbmc = sc.datasets.pbmc68k_reduced()
>>> pbmc
AnnData object with n_obs × n_vars = 700 × 765
obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain'
var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
If we split this object up by clusters of observations, then stack those subsets, we'll obtain the same values – just ordered differently.
>>> groups = pbmc.obs.groupby("louvain", observed=True).indices
>>> pbmc_concat = ad.concat([pbmc[inds] for inds in groups.values()], merge="same")
>>> assert np.array_equal(pbmc.X, pbmc_concat[pbmc.obs_names].X)
>>> pbmc_concat
AnnData object with n_obs × n_vars = 700 × 765
obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain'
var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
Note that we concatenated along the observations by default, and that most elements aligned to the observations were concatenated as well.
A notable exception is :attr:`~anndata.AnnData.obsp`, which can be re-enabled with the `pairwise` keyword argument.
This is because it's not obvious that combining graphs or distance matrices padded with 0s is particularly useful, and the result may be unintuitive.
Inner and outer joins
~~~~~~~~~~~~~~~~~~~~~
When the variables present in the objects to be concatenated aren't exactly the same, you can choose to take either the intersection or union of these variables.
This is otherwise called taking the `"inner"` (intersection) or `"outer"` (union) join.
For example, given two anndata objects with differing variables:
>>> a = AnnData(sparse.eye(3, format="csr"), var=pd.DataFrame(index=list("abc")))
>>> b = AnnData(sparse.eye(2, format="csr"), var=pd.DataFrame(index=list("ba")))
>>> ad.concat([a, b], join="inner").X.toarray()
array([[1., 0.],
[0., 1.],
[0., 0.],
[0., 1.],
[1., 0.]])
>>> ad.concat([a, b], join="outer").X.toarray()
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.],
[0., 1., 0.],
[1., 0., 0.]])
The join argument is used for any element which has both (1) an axis being concatenated and (2) an axis not being concatenated.
When concatenating along the `obs` dimension, this means elements of `.X`, `obs`, `.layers`, and `.obsm` will be affected by the choice of `join`.
To demonstrate this, let's say we're trying to combine a droplet based experiment with a spatial one.
When building a joint anndata object, we would still like to store the coordinates for the spatial samples.
>>> coords = np.hstack([np.repeat(np.arange(10), 10), np.tile(np.arange(10), 10)]).T
>>> spatial = AnnData(
... sparse.random(5000, 10000, format="csr"),
... obsm={"coords": np.random.randn(5000, 2)}
... )
>>> droplet = AnnData(sparse.random(5000, 10000, format="csr"))
>>> combined = ad.concat([spatial, droplet], join="outer")
>>> sc.pl.embedding(combined, "coords") # doctest: +SKIP
.. TODO: Get the above plot to show up
Annotating data source (`label`, `keys`, and `index_unique`)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Often, you'd like to be able to tell which values came from which object.
This can be accomplished with the `label`, `keys`, and `index_unique` keyword arguments.
For an example, we'll show how you can keep track of the original dataset by passing a `Mapping` of dataset names to `AnnData` objects to `concat`:
>>> adatas = {
... "a": ad.AnnData(
... sparse.random(3, 50, format="csr", density=0.1),
... obs=pd.DataFrame(index=[f"a-{i}" for i in range(3)])
... ),
... "b": ad.AnnData(
... sparse.random(5, 50, format="csr", density=0.1),
... obs=pd.DataFrame(index=[f"b-{i}" for i in range(5)])
... ),
... }
>>> ad.concat(adatas, label="dataset").obs
dataset
a-0 a
a-1 a
a-2 a
b-0 b
b-1 b
b-2 b
b-3 b
b-4 b
Here, a categorical column (with the name specified by `label`) was added to the result.
As an alternative to passing a `Mapping`, you can also specify dataset names with the `keys` argument.
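For example, the call above could be written equivalently with a list of objects plus `keys` (skipped here, since it mirrors the mapping-based call):
>>> ad.concat(list(adatas.values()), label="dataset", keys=list(adatas)).obs  # doctest: +SKIP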
In some cases, your objects may share names along the axes being concatenated.
These values can be made unique by appending the relevant key using the `index_unique` argument:
.. TODO: skipping example since doctest does not capture stderr, but it's relevant to show the unique message
>>> adatas = {
... "a": ad.AnnData(
... sparse.random(3, 10, format="csr", density=0.1),
... obs=pd.DataFrame(index=[f"cell-{i}" for i in range(3)])
... ),
... "b": ad.AnnData(
... sparse.random(5, 10, format="csr", density=0.1),
... obs=pd.DataFrame(index=[f"cell-{i}" for i in range(5)])
... ),
... }
>>> ad.concat(adatas).obs # doctest: +SKIP
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Empty DataFrame
Columns: []
Index: [cell-0, cell-1, cell-2, cell-0, cell-1, cell-2, cell-3, cell-4]
>>> ad.concat(adatas, index_unique="_").obs
Empty DataFrame
Columns: []
Index: [cell-0_a, cell-1_a, cell-2_a, cell-0_b, cell-1_b, cell-2_b, cell-3_b, cell-4_b]
Merging
-------
Combining elements not aligned to the axis of concatenation is controlled through the `merge` arguments.
We provide a few strategies for merging elements aligned to the alternative axes:
* `None`: No elements aligned to alternative axes are present in the result object.
* `"same"`: Elements that are the same in each of the objects.
* `"unique"`: Elements for which there is only one possible value.
* `"first"`: The first element seen in each from each position.
* `"only"`: Elements that show up in only one of the objects.
We'll show how this works with elements aligned to the alternative axis, and then how merging works with `.uns`.
First, our example case:
>>> import scanpy as sc
>>> blobs = sc.datasets.blobs(n_variables=30, n_centers=5)
>>> sc.pp.pca(blobs)
>>> blobs
AnnData object with n_obs × n_vars = 640 × 30
obs: 'blobs'
uns: 'pca'
obsm: 'X_pca'
varm: 'PCs'
Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies.
>>> adatas = []
>>> for group, idx in blobs.obs.groupby("blobs").indices.items():
... sub_adata = blobs[idx].copy()
... sub_adata.obsm["qc"], sub_adata.varm[f"{group}_qc"] = sc.pp.calculate_qc_metrics(
... sub_adata, percent_top=(), inplace=False, log1p=False
... )
... adatas.append(sub_adata)
>>> adatas[0]
AnnData object with n_obs × n_vars = 128 × 30
obs: 'blobs'
uns: 'pca'
obsm: 'X_pca', 'qc'
varm: 'PCs', '0_qc'
`adatas` is now a list of datasets with disjoint sets of observations and a common set of variables.
Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset.
Taking a look at how this affects concatenation:
>>> ad.concat(adatas)
AnnData object with n_obs × n_vars = 640 × 30
obs: 'blobs'
obsm: 'X_pca', 'qc'
>>> ad.concat(adatas, merge="same")
AnnData object with n_obs × n_vars = 640 × 30
obs: 'blobs'
obsm: 'X_pca', 'qc'
varm: 'PCs'
>>> ad.concat(adatas, merge="unique")
AnnData object with n_obs × n_vars = 640 × 30
obs: 'blobs'
obsm: 'X_pca', 'qc'
varm: 'PCs', '0_qc', '1_qc', '2_qc', '3_qc', '4_qc'
Note that comparisons are made after indices are aligned.
That is, if the objects only share a subset of indices on the alternative axis, it's only required that values for those indices match when using a strategy like `"same"`.
>>> a = AnnData(
... sparse.eye(3, format="csr"),
... var=pd.DataFrame({"nums": [1, 2, 3]}, index=list("abc"))
... )
>>> b = AnnData(
... sparse.eye(2, format="csr"),
... var=pd.DataFrame({"nums": [2, 1]}, index=list("ba"))
... )
>>> ad.concat([a, b], merge="same").var
nums
a 1
b 2
Merging `.uns`
~~~~~~~~~~~~~~
We use the same set of strategies for merging `uns` as we do for entries aligned to an axis, but these strategies are applied recursively.
This is a little abstract, so we'll look at some examples of this. Here's our setup:
>>> from anndata import AnnData
>>> import numpy as np
>>> a = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4}})
>>> b = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 3, "c": {"c.b": 4}})
>>> c = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 4, "c": {"c.a": 3, "c.b": 4, "c.c": 5}})
For quick reference, these are the results from each of the merge strategies.
These are discussed in more depth below:
=========== =======================================================
`uns_merge` Result
=========== =======================================================
`None` `{}`
`"same"` `{"a": 1, "c": {"c.b": 4}}`
`"unique"` `{"a": 1, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}`
`"only"` `{"c": {"c.c": 5}}`
`"first"` `{"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}`
=========== =======================================================
The default returns a fairly obvious result:
>>> ad.concat([a, b, c]).uns == {}
True
But let's take a look at the others in a bit more depth. Here, we'll wrap the output in a `dict` to make the printed values easier to read.
>>> dict(ad.concat([a, b, c], uns_merge="same").uns)
{'a': 1, 'c': {'c.b': 4}}
Here only the values for `uns["a"]` and `uns["c"]["c.b"]` were exactly the same, so only those were kept.
`uns["b"]` takes a different value in each object, and neither `uns["c"]["c.a"]` nor `uns["c"]["c.c"]` appears in every `uns`.
A key feature to note is that comparisons are aware of the nested structure of `uns` and will be applied at any depth.
This is why `uns["c"]["c.b"]` was kept.
Merging `uns` in this way can be useful when there is some shared data between the objects being concatenated.
For example, if each was put through the same pipeline with the same parameters, those parameters used would still be present in the resulting object.
Now let's look at the behaviour of `unique`:
>>> dict(ad.concat([a, b, c], uns_merge="unique").uns)
{'a': 1, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
The results here are a super-set of those from `"same"`. Note that there was only one possible value at each position in the resulting mapping.
That is, there were no alternative values present for `uns["c"]["c.c"]` even though it appeared only once.
This can be useful when the objects were run through the same pipeline but each contains some object-specific metadata.
An example of this would be a spatial dataset, where the images are stored in `uns`.
>>> dict(ad.concat([a, b, c], uns_merge="only").uns)
{'c': {'c.c': 5}}
`uns["c"]["c.c"]` is the only value that is kept, since it is the only one which was specified in only one `uns`.
>>> dict(ad.concat([a, b, c], uns_merge="first").uns)
{'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
In this case, the result has the union of the keys from all the starting dictionaries.
The value is taken from the first object to have a value at this key.
python-anndata-0.12.0~rc1/docs/conf.py 0000664 0000000 0000000 00000015634 15003706322 0017561 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import sys
from datetime import datetime
from functools import partial
from importlib import metadata
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING
from docutils import nodes
if TYPE_CHECKING:
from sphinx.application import Sphinx
HERE = Path(__file__).parent
_extension_dir = HERE / "extensions"
sys.path[:0] = [str(_extension_dir)]
# -- General configuration ------------------------------------------------
# General information
project = "anndata"
author = f"{project} developers"
copyright = f"{datetime.now():%Y}, scverse"
release = version = metadata.version("anndata")
# default settings
templates_path = ["_templates"]
html_static_path = ["_static"]
source_suffix = {".rst": "restructuredtext", ".md": "myst-nb"}
master_doc = "index"
default_role = "literal"
exclude_patterns = [
"_build",
"Thumbs.db",
".DS_Store",
"**.ipynb_checkpoints",
"tutorials/notebooks/*.rst",
# exclude all 0.x.y.md files, but not index.md
"release-notes/[!i]*.md",
"news.md", # is `include`d into index.md
]
pygments_style = "sphinx"
extensions = [
"myst_nb",
"sphinx_copybutton",
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.doctest",
"sphinx.ext.coverage",
"sphinx.ext.mathjax",
"sphinx.ext.napoleon",
"sphinx.ext.autosummary",
"sphinx_autodoc_typehints", # needs to be after napoleon
"sphinx_issues",
"sphinx_design",
"sphinx_search.extension",
"sphinxext.opengraph",
"scanpydoc", # needs to be before linkcode
"sphinx.ext.linkcode",
"IPython.sphinxext.ipython_console_highlighting",
"sphinx_toolbox.more_autodoc.autoprotocol",
*(p.stem for p in _extension_dir.glob("*.py")),
]
myst_enable_extensions = [
"html_image", # So README.md can be used on github and sphinx docs
"colon_fence",
"dollarmath",
]
myst_heading_anchors = 3
nb_execution_mode = "off"
# Generate the API documentation when building
autosummary_generate = True
autodoc_member_order = "bysource"
autodoc_mock_imports = ["torch"]
# autodoc_default_flags = ['members']
issues_github_path = "scverse/anndata"
rtd_links_prefix = PurePosixPath("src")
napoleon_google_docstring = False
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_use_rtype = True # having a separate entry generally helps readability
napoleon_use_param = True
napoleon_custom_sections = [("Params", "Parameters")]
typehints_defaults = "braces"
todo_include_todos = False
nitpicky = True # Report broken links
nitpick_ignore = [ # APIs without an intersphinx entry
# This API isn’t actually documented
("py:class", "anndata._core.raw.Raw"),
# TODO: remove zappy support; the zappy repo is archived
("py:class", "anndata.compat.ZappyArray"),
]
def setup(app: Sphinx):
app.add_generic_role("small", partial(nodes.inline, classes=["small"]))
app.add_generic_role("smaller", partial(nodes.inline, classes=["smaller"]))
intersphinx_mapping = dict(
awkward=("https://awkward-array.org/doc/stable", None),
cupy=("https://docs.cupy.dev/en/stable", None),
dask=("https://docs.dask.org/en/stable", None),
h5py=("https://docs.h5py.org/en/latest", None),
hdf5plugin=("https://hdf5plugin.readthedocs.io/en/latest", None),
loompy=("https://linnarssonlab.org/loompy", None),
numpy=("https://numpy.org/doc/stable", None),
pandas=("https://pandas.pydata.org/pandas-docs/stable", None),
python=("https://docs.python.org/3", None),
scipy=("https://docs.scipy.org/doc/scipy", None),
sklearn=("https://scikit-learn.org/stable", None),
# TODO: move back to stable once `ObjectStore` is released
zarr=("https://zarr.readthedocs.io/en/latest/", None),
xarray=("https://docs.xarray.dev/en/stable", None),
obstore=("https://developmentseed.org/obstore/latest/", None),
kvikio=("https://docs.rapids.ai/api/kvikio/stable/", None),
zarrs=("https://zarrs-python.readthedocs.io/en/stable/", None),
)
qualname_overrides = {
"h5py._hl.group.Group": "h5py.Group",
"h5py._hl.files.File": "h5py.File",
"h5py._hl.dataset.Dataset": "h5py.Dataset",
"anndata._core.anndata.AnnData": "anndata.AnnData",
**{
f"anndata._core.aligned_mapping.{cls}{kind}": "collections.abc.Mapping"
for cls in "Layers AxisArrays PairwiseArrays".split()
for kind in ["", "View"]
},
"anndata._types.ReadCallback": "anndata.experimental.ReadCallback",
"anndata._types.WriteCallback": "anndata.experimental.WriteCallback",
"anndata._types.Read": "anndata.experimental.Read",
"anndata._types.Write": "anndata.experimental.Write",
"zarr.core.array.Array": "zarr.Array",
"zarr.core.group.Group": "zarr.Group",
# Buffer is not yet exported, so the buffer class registry is the closest thing
"zarr.core.buffer.core.Buffer": "zarr.registry.Registry",
"zarr.storage._common.StorePath": "zarr.storage.StorePath",
"anndata.compat.DaskArray": "dask.array.Array",
"anndata.compat.CupyArray": "cupy.ndarray",
"anndata.compat.CupySparseMatrix": "cupyx.scipy.sparse.spmatrix",
"awkward.highlevel.Array": "ak.Array",
"numpy.int64": ("py:attr", "numpy.int64"),
"pandas.DataFrame.iloc": ("py:attr", "pandas.DataFrame.iloc"),
"pandas.DataFrame.loc": ("py:attr", "pandas.DataFrame.loc"),
# should be fixed soon: https://github.com/tox-dev/sphinx-autodoc-typehints/pull/516
"types.EllipsisType": ("py:data", "types.EllipsisType"),
"pathlib._local.Path": "pathlib.Path",
}
autodoc_type_aliases = dict(
NDArray=":data:`~numpy.typing.NDArray`",
AxisStorable=":data:`~anndata.typing.AxisStorable`",
**{
f"{v}variantRWAble": ":data:`~anndata.typing.RWAble`"
for v in ["In", "Co", "Contra"]
},
)
# -- Social cards ---------------------------------------------------------
ogp_site_url = "https://anndata.readthedocs.io/"
ogp_image = "https://anndata.readthedocs.io/en/latest/_static/img/anndata_schema.svg"
# -- Options for HTML output ----------------------------------------------
# The theme is sphinx-book-theme, with patches for readthedocs-sphinx-search
html_theme = "scanpydoc"
html_theme_options = dict(
use_repository_button=True,
repository_url="https://github.com/scverse/anndata",
repository_branch="main",
navigation_with_keys=False, # https://github.com/pydata/pydata-sphinx-theme/issues/1492
)
html_logo = "_static/img/anndata_schema.svg"
issues_github_path = "scverse/anndata"
html_show_sphinx = False
# -- Options for other output formats ------------------------------------------
htmlhelp_basename = f"{project}doc"
doc_title = f"{project} Documentation"
latex_documents = [(master_doc, f"{project}.tex", doc_title, author, "manual")]
man_pages = [(master_doc, project, doc_title, [author], 1)]
texinfo_documents = [
(
master_doc,
project,
doc_title,
author,
project,
"One line description of project.",
"Miscellaneous",
)
]
python-anndata-0.12.0~rc1/docs/contributing.md 0000664 0000000 0000000 00000000676 15003706322 0021313 0 ustar 00root root 0000000 0000000 # Contributing
AnnData follows the development practices outlined in the [Scanpy contribution guide](https://scanpy.readthedocs.io/en/latest/dev/release.html).
```{eval-rst}
.. include:: _key_contributors.rst
```
## CI
### GPU CI
To test GPU-specific code, we have a paid self-hosted runner that runs the GPU-specific tests.
This CI runs by default on the main branch, but for PRs requires the `run-gpu-ci` label to prevent unnecessary runs.
python-anndata-0.12.0~rc1/docs/extensions/ 0000775 0000000 0000000 00000000000 15003706322 0020450 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/docs/extensions/no_skip_abc_members.py 0000664 0000000 0000000 00000001242 15003706322 0025002 0 ustar 00root root 0000000 0000000 """Sphinx extension to not skip abstract methods."""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import Literal
from sphinx.application import Sphinx
from sphinx.ext.autodoc import Options
def autodoc_skip_member(
app: Sphinx,
what: Literal["module", "class", "exception", "function", "method", "attribute"],
name: str,
obj: object,
skip: bool, # noqa: FBT001
options: Options,
):
if what == "method" and getattr(obj, "__isabstractmethod__", False):
return False
return None
def setup(app: Sphinx):
app.connect("autodoc-skip-member", autodoc_skip_member)
python-anndata-0.12.0~rc1/docs/extensions/patch_myst_cite.py 0000664 0000000 0000000 00000001507 15003706322 0024204 0 ustar 00root root 0000000 0000000 """Override MyST’s cite role with one that works."""
from __future__ import annotations
from types import MappingProxyType
from typing import TYPE_CHECKING
from docutils import nodes, utils
if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
from typing import Any
from docutils.parsers.rst.states import Inliner
from sphinx.application import Sphinx
def cite_role( # noqa: PLR0917
name: str,
rawsource: str,
text: str,
lineno: int,
inliner: Inliner,
options: Mapping[str, Any] = MappingProxyType({}),
content: Sequence[str] = (),
) -> tuple[list[nodes.Node], list[nodes.system_message]]:
key = utils.unescape(text)
node = nodes.citation_reference(f"[{key}]_", key)
return [node], []
def setup(app: Sphinx):
app.add_role("cite", cite_role, override=True)
python-anndata-0.12.0~rc1/docs/fileformat-prose.md 0000664 0000000 0000000 00000055470 15003706322 0022064 0 ustar 00root root 0000000 0000000 # On-disk format
```{note}
These docs are written for anndata 0.8+.
Files written before this version may differ in some conventions,
but will still be read by newer versions of the library.
```
AnnData objects are saved on disk to hierarchical array stores like [HDF5]
(via {doc}`H5py <h5py:index>`) and {doc}`zarr:index`.
This allows us to have very similar structures on disk and in memory.
As an example we’ll look into a typical `.h5ad`/ `.zarr` object that’s been through an analysis.
The structures are largely equivalent, though there are a few minor differences when it comes to type encoding.
## Elements
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> import h5py
>>> store = h5py.File("for-ondisk-docs/cart-164k-processed.h5ad", mode="r")
>>> list(store.keys())
['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp']
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> import zarr
>>> store = zarr.open("for-ondisk-docs/cart-164k-processed.zarr", mode="r")
>>> list(store.keys())
['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp']
```
````
`````
In general, `AnnData` objects are composed of various types of elements.
Each element is encoded as either an Array (or Dataset in hdf5 terminology) or a collection of elements (e.g. Group) in the store.
We record the type of an element using the `encoding-type` and `encoding-version` keys in its attributes.
For example, we can see that this file represents an `AnnData` object from its metadata:
```python
>>> dict(store.attrs)
{'encoding-type': 'anndata', 'encoding-version': '0.1.0'}
```
Using this information, we're able to dispatch to the appropriate reader for each of the element types you'd find in an `AnnData` object.
### Element Specification
* An element can be any object within the storage hierarchy (typically an array or group) with associated metadata
* An element MUST have a string-valued field `"encoding-type"` in its metadata
* An element MUST have a string-valued field `"encoding-version"` in its metadata that can be evaluated to a version
### AnnData specification (v0.1.0)
* An `AnnData` object MUST be a group.
* The group's metadata MUST include entries: `"encoding-type": "anndata"`, `"encoding-version": "0.1.0"`.
* An `AnnData` group MUST contain entries `"obs"` and `"var"`, which MUST be dataframes (though these may have only an index and no columns).
* The group MAY contain an entry `X`, which MUST be either a dense or sparse array and whose shape MUST be (`n_obs`, `n_var`)
* The group MAY contain a mapping `layers`. Entries in `layers` MUST be dense or sparse arrays which have shapes (`n_obs`, `n_var`)
* The group MAY contain a mapping `obsm`. Entries in `obsm` MUST be sparse arrays, dense arrays, or dataframes. These entries MUST have a first dimension of size `n_obs`
* The group MAY contain a mapping `varm`. Entries in `varm` MUST be sparse arrays, dense arrays, or dataframes. These entries MUST have a first dimension of size `n_var`
* The group MAY contain a mapping `obsp`. Entries in `obsp` MUST be sparse or dense arrays. The entries' first two dimensions MUST be of size `n_obs`
* The group MAY contain a mapping `varp`. Entries in `varp` MUST be sparse or dense arrays. The entries' first two dimensions MUST be of size `n_var`
* The group MAY contain a mapping `uns`. Entries in `uns` MUST be anndata encoded types.
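Given such a group, the whole hierarchy can be read back in one call, since `anndata.io.read_elem` dispatches on the encoding metadata described above (a sketch reusing the `store` opened earlier):
```python
>>> from anndata.io import read_elem
>>> adata = read_elem(store)
>>> adata.shape
(164114, 40145)
```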
## Dense arrays
Dense numeric arrays have the most simple representation on disk,
as they have native equivalents in H5py {doc}`h5py:high/dataset` and Zarr {doc}`Arrays `.
We can see an example of this with dimensionality reductions stored in the `obsm` group:
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> store["obsm/X_pca"]
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> store["obsm/X_pca"]
```
````
`````
```python
>>> dict(store["obsm"]["X_pca"].attrs)
{'encoding-type': 'array', 'encoding-version': '0.2.0'}
```
### Dense arrays specification (v0.2.0)
* Dense arrays MUST be stored in an Array object
* Dense arrays MUST have the entries `'encoding-type': 'array'` and `'encoding-version': '0.2.0'` in their metadata
## Sparse arrays
Sparse arrays don’t have a native representation in HDF5 or Zarr,
so we've defined our own based on their in-memory structure.
Currently two sparse data formats are supported by `AnnData` objects, CSC and CSR
(corresponding to {class}`scipy.sparse.csc_matrix` and {class}`scipy.sparse.csr_matrix` respectively).
These formats represent a two-dimensional sparse array with
three one-dimensional arrays, `indptr`, `indices`, and `data`.
```{note}
Full descriptions of these formats are out of scope for this document,
but they are [easy to find].
```
We represent a sparse array as a `Group` on-disk,
where the kind and shape of the sparse array is defined in the `Group`'s attributes:
```python
>>> dict(store["X"].attrs)
{'encoding-type': 'csr_matrix',
'encoding-version': '0.1.0',
'shape': [164114, 40145]}
```
The group contains three arrays:
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> store["X"].visititems(print)
data
indices
indptr
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> store["X"].visititems(print)
data
indices
indptr
```
````
`````
### Sparse array specification (v0.1.0)
* Each sparse array MUST be its own group
* The group MUST contain arrays `indices`, `indptr`, and `data`
* The group's metadata MUST contain:
* `"encoding-type"`, which is set to `"csr_matrix"` or `"csc_matrix"` for compressed sparse row and compressed sparse column, respectively.
* `"encoding-version"`, which is set to `"0.1.0"`
* `"shape"` which is an integer array of length 2 whose values are the sizes of the array's dimensions
## DataFrames
DataFrames are saved in a columnar format in a group: each column of a DataFrame is saved as a separate array.
We save a little more information in the attributes here.
```python
>>> dict(store["var"].attrs)
{'_index': 'ensembl_id',
'column-order': ['highly_variable',
'means',
'variances',
'variances_norm',
'feature_is_filtered',
'feature_name',
'feature_reference',
'feature_biotype',
'mito'],
'encoding-type': 'dataframe',
'encoding-version': '0.2.0'}
```
These attributes identify the index of the dataframe, as well as the original order of the columns.
Each column in this dataframe is encoded as its own array.
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> store["var"].visititems(print)
ensembl_id
feature_biotype
feature_biotype/categories
feature_biotype/codes
feature_is_filtered
...
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> store["var"].visititems(print)
ensembl_id
feature_biotype
feature_biotype/categories
feature_biotype/codes
feature_is_filtered
...
```
````
`````
```python
>>> dict(store["var"]["feature_name"].attrs)
{'encoding-type': 'categorical', 'encoding-version': '0.2.0', 'ordered': False}
>>> dict(store["var"]["feature_is_filtered"].attrs)
{'encoding-type': 'array', 'encoding-version': '0.2.0'}
```
### Dataframe Specification (v0.2.0)
* A dataframe MUST be stored as a group
* The group's metadata:
* MUST contain the field `"_index"`, whose value is the key of the array to be used as an index/ row labels
* MUST contain encoding metadata `"encoding-type": "dataframe"`, `"encoding-version": "0.2.0"`
* MUST contain `"column-order"` an array of strings denoting the order of column entries
* The group MUST contain an array for the index
* Each entry in the group MUST correspond to an array with equivalent first dimensions
* Each entry SHOULD share chunk sizes (in the HDF5 or zarr container)
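The column arrays and attributes above are what `anndata.io.read_elem` uses to reconstruct the dataframe, for example:
```python
>>> from anndata.io import read_elem
>>> var_df = read_elem(store["var"])  # pandas.DataFrame with index and column order restored
```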
## Mappings
Mappings are simply stored as `Group`s on disk.
These are distinct from DataFrames and sparse arrays in that they don’t have any special attributes beyond the encoding metadata.
A `Group` is created for any `Mapping` in the AnnData object,
including the standard `obsm`, `varm`, `layers`, and `uns`.
Notably, this definition is used recursively within `uns`:
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> store["uns"].visititems(print)
[...]
pca
pca/variance
pca/variance_ratio
[...]
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> store["uns"].visititems(print)
[...]
pca
pca/variance
pca/variance_ratio
[...]
```
````
`````
### Mapping specifications (v0.1.0)
* Each mapping MUST be its own group
* The group's metadata MUST contain the encoding metadata `"encoding-type": "dict"`, `"encoding-version": "0.1.0"`
## Scalars
Zero dimensional arrays are used for scalar values (i.e. single values like strings, numbers or booleans).
These should only occur inside of `uns`, and are commonly saved parameters:
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> store["uns/neighbors/params"].visititems(print)
method
metric
n_neighbors
random_state
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> store["uns/neighbors/params"].visititems(print)
method
metric
n_neighbors
random_state
```
````
`````
```python
>>> store["uns/neighbors/params/metric"][()]
'euclidean'
>>> dict(store["uns/neighbors/params/metric"].attrs)
{'encoding-type': 'string', 'encoding-version': '0.2.0'}
```
### Scalar specification (v0.2.0)
* Scalars MUST be written as a 0 dimensional array
* Numeric scalars
* MUST have `"encoding-type": "numeric-scalar"`, `"encoding-version": "0.2.0"` in their metadata
* MUST be a single numeric value, including boolean, unsigned integer, signed integer, floating point, or complex floating point
* String scalars
* MUST have `"encoding-type": "string"`, `"encoding-version": "0.2.0"` in their metadata
* In zarr, scalar strings MUST be stored as a fixed length unicode dtype
* In HDF5, scalar strings MUST be stored as a variable length utf-8 encoded string dtype
## Categorical arrays
```python
>>> categorical = store["obs"]["development_stage"]
>>> dict(categorical.attrs)
{'encoding-type': 'categorical', 'encoding-version': '0.2.0', 'ordered': False}
```
Discrete values can be efficiently represented with categorical arrays (similar to `factors` in `R`).
These arrays encode the values as small width integers (`codes`), which map to the original label set (`categories`).
Each entry in the `codes` array is the zero-based index of the encoded value in the `categories` array.
To represent a missing value, a code of `-1` is used.
We store these two arrays separately.
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> categorical.visititems(print)
categories
codes
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> categorical.visititems(print)
categories
codes
```
````
`````
### Categorical array specification (v0.2.0)
* Categorical arrays MUST be stored as a group
* The group's metadata MUST contain the encoding metadata `"encoding-type": "categorical"`, `"encoding-version": "0.2.0"`
* The group's metadata MUST contain the boolean valued field `"ordered"`, which indicates whether the categories are ordered
* The group MUST contain an integer valued array named `"codes"` whose maximum value is the number of categories - 1
* The `"codes"` array MAY contain signed integer values. If so, the code `-1` denotes a missing value
* The group MUST contain an array called `"categories"`
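To illustrate, the two arrays can be decoded into a `pandas.Categorical` directly (a minimal sketch; with HDF5 the string categories may additionally need decoding from bytes):
```python
>>> import pandas as pd
>>> pd.Categorical.from_codes(
...     codes=categorical["codes"][...],
...     categories=categorical["categories"][...],
...     ordered=categorical.attrs["ordered"],
... )
```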
## String arrays
Arrays of strings are handled differently than numeric arrays since numpy doesn't really have a good way of representing arrays of unicode strings.
`anndata` assumes strings are text-like data, so it uses a variable length encoding.
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> store["var"][store["var"].attrs["_index"]]
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> store["var"][store["var"].attrs["_index"]]
```
````
`````
```python
>>> dict(categorical["categories"].attrs)
{'encoding-type': 'string-array', 'encoding-version': '0.2.0'}
```
### String array specifications (v0.2.0)
* String arrays MUST be stored in arrays
* The arrays's metadata MUST contain the encoding metadata `"encoding-type": "string-array"`, `"encoding-version": "0.2.0"`
* In `zarr`, string arrays MUST be stored using `numcodecs`' `VLenUTF8` codec
* In `HDF5`, string arrays MUST be stored using the variable length string data type, with a utf-8 encoding
## Nullable integers and booleans
We support IO with Pandas nullable integer and boolean arrays.
We represent these on disk similar to `numpy` masked arrays, `julia` nullable arrays, or `arrow` validity bitmaps (see {issue}`504` for more discussion).
That is, we store an indicator array (or mask) of null values alongside the array of all values.
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> from anndata import write_elem
>>> null_store = h5py.File("tmp.h5", mode="w")
>>> int_array = pd.array([1, None, 3, 4])
>>> int_array
<IntegerArray>
[1, <NA>, 3, 4]
Length: 4, dtype: Int64
>>> write_elem(null_store, "nullable_integer", int_array)
>>> null_store.visititems(print)
nullable_integer
nullable_integer/mask
nullable_integer/values
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> from anndata import write_elem
>>> null_store = zarr.open()
>>> int_array = pd.array([1, None, 3, 4])
>>> int_array
<IntegerArray>
[1, <NA>, 3, 4]
Length: 4, dtype: Int64
>>> write_elem(null_store, "nullable_integer", int_array)
>>> null_store.visititems(print)
nullable_integer
nullable_integer/mask
nullable_integer/values
```
````
`````
```python
>>> dict(null_store["nullable_integer"].attrs)
{'encoding-type': 'nullable-integer', 'encoding-version': '0.1.0'}
```
### Nullable integer specifications (v0.1.0)
* Nullable integers MUST be stored as a group
* The group's attributes MUST contain the encoding metadata `"encoding-type": "nullable-integer"`, `"encoding-version": "0.1.0"`
* The group MUST contain an integer valued array under the key `"values"`
* The group MUST contain a boolean valued array under the key `"mask"`
### Nullable boolean specifications (v0.1.0)
* Nullable booleans MUST be stored as a group
* The group's attributes MUST contain the encoding metadata `"encoding-type": "nullable-boolean"`, `"encoding-version": "0.1.0"`
* The group MUST contain a boolean valued array under the key `"values"`
* The group MUST contain a boolean valued array under the key `"mask"`
* The `"values"` and `"mask"` arrays MUST be the same shape
## AwkwardArrays
```{warning}
**Experimental**
Support for ragged arrays via awkward array is considered experimental under the 0.9.0 release series.
Please direct feedback on its implementation to [https://github.com/scverse/anndata](https://github.com/scverse/anndata).
```
Ragged arrays are supported in `anndata` through the [Awkward
Array](https://awkward-array.org/) library. For storage on disk, we
break down the awkward array into its constituent arrays using
[`ak.to_buffers`](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html)
and then write these arrays using `anndata`’s methods.
`````{tab-set}
````{tab-item} HDF5
:sync: hdf5
```python
>>> store["varm/transcript"].visititems(print)
node1-mask
node10-data
node11-mask
node12-offsets
node13-mask
node14-data
node16-offsets
node17-data
node2-offsets
node3-data
node4-mask
node5-offsets
node6-data
node7-mask
node8-offsets
node9-mask
```
````
````{tab-item} Zarr
:sync: zarr
```python
>>> store["varm/transcript"].visititems(print)
node1-mask
node10-data
node11-mask
node12-offsets
node13-mask
node14-data
node16-offsets
node17-data
node2-offsets
node3-data
node4-mask
node5-offsets
node6-data
node7-mask
node8-offsets
node9-mask
```
````
`````
The length of the array is saved to its own `"length"` attribute,
while metadata for the array structure is serialized and saved to the
`“form”` attribute.
```python
>>> dict(store["varm/transcript"].attrs)
{'encoding-type': 'awkward-array',
'encoding-version': '0.1.0',
'form': '{"class": "RecordArray", "fields": ["tx_id", "seq_name", '
'"exon_seq_start", "exon_seq_end", "ensembl_id"], "contents": '
'[{"class": "BitMaskedArray", "mask": "u8", "valid_when": true, '
'"lsb_order": true, "content": {"class": "ListOffsetArray", '
'"offsets": "i64", "content": {"class": "NumpyArray", "primitive": '
'"uint8", "inner_shape": [], "parameters": {"__array__": "char"}, '
'"form_key": "node3"}, "parameters": {"__array__": "string"}, '
'"form_key": "node2"}, "parameters": {}, "form_key": "node1"}, '
...
'length': 40145}
```
These can be read back as awkward arrays using the
[`ak.from_buffers`](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_buffers.html)
function:
```python
>>> import awkward as ak
>>> from anndata.io import read_elem
>>> awkward_group = store["varm/transcript"]
>>> ak.from_buffers(
... awkward_group.attrs["form"],
... awkward_group.attrs["length"],
... {k: read_elem(v) for k, v in awkward_group.items()}
... )
>>> transcript_models[:5]
[{tx_id: 'ENST00000450305', seq_name: '1', exon_seq_start: [...], ...},
{tx_id: 'ENST00000488147', seq_name: '1', exon_seq_start: [...], ...},
{tx_id: 'ENST00000473358', seq_name: '1', exon_seq_start: [...], ...},
{tx_id: 'ENST00000477740', seq_name: '1', exon_seq_start: [...], ...},
{tx_id: 'ENST00000495576', seq_name: '1', exon_seq_start: [...], ...}]
-----------------------------------------------------------------------
type: 5 * {
tx_id: ?string,
seq_name: ?string,
exon_seq_start: option[var * ?int64],
exon_seq_end: option[var * ?int64],
ensembl_id: ?string
}
>>> transcript_models[0]
{tx_id: 'ENST00000450305',
seq_name: '1',
exon_seq_start: [12010, 12179, 12613, 12975, 13221, 13453],
exon_seq_end: [12057, 12227, 12697, 13052, 13374, 13670],
ensembl_id: 'ENSG00000223972'}
------------------------------------------------------------
type: {
tx_id: ?string,
seq_name: ?string,
exon_seq_start: option[var * ?int64],
exon_seq_end: option[var * ?int64],
ensembl_id: ?string
}
```
[easy to find]: https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)
[hdf5]: https://en.wikipedia.org/wiki/Hierarchical_Data_Format
python-anndata-0.12.0~rc1/docs/index.md 0000664 0000000 0000000 00000000676 15003706322 0017713 0 ustar 00root root 0000000 0000000 ```{include} ../README.md
```
# Latest additions
See {doc}`/release-notes/index`,
particularly {ref}`v0.10` for the current release,
and [the `.feature` fragments](https://github.com/scverse/anndata/tree/main/docs) for the upcoming release.
```{toctree}
:hidden: true
:maxdepth: 1
tutorials/index
api
concatenation
fileformat-prose
interoperability
benchmarks
contributing
release-notes/index
references
```
# News
```{include} news.md
```
python-anndata-0.12.0~rc1/docs/interoperability.md 0000664 0000000 0000000 00000002715 15003706322 0022165 0 ustar 00root root 0000000 0000000 # Interoperability
The on-disk representation of anndata files can be read from other
languages. Here we list interfaces for working with AnnData from your
language of choice:
## R
- [zellkonverter](https://bioconductor.org/packages/release/bioc/html/zellkonverter.html) provides basilisk-based tooling for loading `h5ad` files into `SingleCellExperiment` objects.
- [anndata](https://anndata.dynverse.org) provides an R implementation of `AnnData` as well as IO for the HDF5 format.
- [MuData](https://bioconductor.org/packages/release/bioc/html/MuData.html) provides IO for `AnnData` and `MuData` stored in HDF5 to Bioconductor's `SingleCellExperiment` and `MultiAssayExperiment` objects.
- [MuDataSeurat](https://pmbio.github.io/MuDataSeurat/) provides IO from `AnnData` and `MuData` stored in HDF5 to `Seurat` objects.
## Julia
- [Muon.jl](https://docs.juliahub.com/Muon/QfqCh/0.1.1/objects/) provides Julia implementations of `AnnData` and `MuData` objects, as well as IO for the HDF5 format
- [scVI.jl](https://maren-ha.github.io/scVI.jl/index.html) provides a Julia implementation of `AnnData` as well as IO for the HDF5 format.
## Javascript
- [Vitessce](https://github.com/vitessce/vitessce) contains loaders from `AnnData`s stored as Zarr, and uses this to provide interactive visualization
## Rust
- [anndata-rs](https://github.com/kaizhang/anndata-rs) provides a Rust implementation of `AnnData` as well as advanced IO support for the HDF5 storage format.
python-anndata-0.12.0~rc1/docs/news.md 0000664 0000000 0000000 00000001111 15003706322 0017541 0 ustar 00root root 0000000 0000000 # Muon paper published {small}`2022-02-02`
Muon has been published in Genome Biology {cite}`Bredikhin22`.
Muon is a framework for multimodal data built on top of `AnnData`.
Check out [Muon](https://muon.readthedocs.io/en/latest/) and its datastructure [MuData](https://mudata.readthedocs.io/en/latest/).
# COVID-19 datasets distributed as `h5ad` {small}`2020-04-01`
In a joint initiative, the Wellcome Sanger Institute, the Human Cell Atlas, and the CZI distribute datasets related to COVID-19 via anndata's `h5ad` files: [covid19cellatlas.org](https://www.covid19cellatlas.org/).
python-anndata-0.12.0~rc1/docs/references.rst 0000664 0000000 0000000 00000001206 15003706322 0021123 0 ustar 00root root 0000000 0000000 References
----------
.. [Bredikhin22]
Bredikhin *et al.* (2022),
*MUON: multimodal omics analysis framework*,
Genome Biology https://doi.org/10.1186/s13059-021-02577-8.
.. [Hastie09]
Hastie *et al.* (2009),
*The Elements of Statistical Learning*,
Springer https://web.stanford.edu/~hastie/ElemStatLearn/.
.. [Huber15]
Huber *et al.* (2015),
*Orchestrating high-throughput genomic analysis with Bioconductor*,
Nature Methods https://doi.org/10.1038/nmeth.3252.
.. [Murphy12]
Murphy (2012),
*Machine Learning: A Probabilistic Perspective*,
MIT Press https://mitpress.mit.edu/9780262018029/machine-learning/.
python-anndata-0.12.0~rc1/docs/release-notes/ 0000775 0000000 0000000 00000000000 15003706322 0021017 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/docs/release-notes/0.10.0.md 0000664 0000000 0000000 00000004265 15003706322 0022064 0 ustar 00root root 0000000 0000000 (v0.10.0)=
### 0.10.0 {small}`2023-10-06`
#### Features
**GPU Support**
* Dense and sparse [`CuPy`](https://docs.cupy.dev/) arrays are now supported {pr}`1066` {user}`ivirshup`
* Once you have `CuPy` arrays in your anndata, use it with: [`rapids-singlecell`](https://rapids-singlecell.readthedocs.io/en/latest/index.html) from v0.9+
* anndata now has GPU enabled CI. Made possible by a grant from [CZI's EOSS program](https://chanzuckerberg.com/eoss/) and managed via [Cirun](https://Cirun.io) {pr}`1066` {pr}`1084` {user}`Zethson` {user}`ivirshup`
**Out of core**
* Concatenate on-disk anndata objects with {func}`anndata.experimental.concat_on_disk` {pr}`955` {user}`selmanozleyen`
* AnnData can now hold dask arrays with `scipy.sparse.spmatrix` chunks {pr}`1114` {user}`ivirshup`
* Public API for interacting with on disk sparse arrays: {func}`~anndata.io.sparse_dataset`, {class}`~anndata.abc.CSRDataset`, and {class}`~anndata.abc.CSCDataset` {pr}`765` {user}`ilan-gold` {user}`ivirshup`
* Improved performance for simple slices of OOC sparse arrays {pr}`1131` {user}`ivirshup`
**Improved errors and warnings**
* Improved error messages when combining dataframes with duplicated column names {pr}`1029` {user}`ivirshup`
* Improved warnings when modifying views of `AlignedMappings` {pr}`1016` {user}`flying-sheep` {user}`ivirshup`
* `AnnDataReadError`s have been removed. The original error is now thrown with additional information in a note {pr}`1055` {user}`ivirshup`
#### Documentation
* Added zarr examples to {doc}`file format docs </fileformat-prose>` {pr}`1162` {user}`ivirshup`
#### Breaking changes
* {meth}`anndata.AnnData.transpose` no longer copies unnecessarily. If you rely on the copying behavior, call `.copy` on the resulting object. {pr}`1114` {user}`ivirshup`
#### Other updates
* Bump minimum python version to 3.9 {pr}`1117` {user}`flying-sheep`
#### Deprecations
* Deprecate `anndata.read`, which was just an alias for {func}`anndata.io.read_h5ad` {pr}`1108` {user}`ivirshup`.
* `dtype` argument to `AnnData` constructor is now deprecated {pr}`1153` {user}`ivirshup`
#### Bug fixes
* Fix shape inference on initialization when `X=None` is specified {pr}`1121` {user}`flying-sheep`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.1.md 0000664 0000000 0000000 00000000245 15003706322 0022057 0 ustar 00root root 0000000 0000000 (v0.10.1)=
### 0.10.1 {small}`2023-10-08`
#### Bug fixes
* Fix `ad.concat` erroring when concatenating a categorical and object column {pr}`1171` {user}`ivirshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.2.md 0000664 0000000 0000000 00000001440 15003706322 0022056 0 ustar 00root root 0000000 0000000 (v0.10.2)=
### 0.10.2 {small}`2023-10-11`
#### Bug fixes
* Added compatibility layer for packages relying on `anndata._core.sparse_dataset.SparseDataset`.
Note that this API is *deprecated* and new code should use `anndata.CSRDataset`, `anndata.CSCDataset`, and `anndata.sparse_dataset` instead.
{pr}`1185` {user}`ivirshup`
* Handle deprecation warning from `pd.Categorical.map` thrown during `anndata.concat` {pr}`1189` {user}`flying-sheep` {user}`ivirshup`
* Fixed extra steps being included in IO tracebacks {pr}`1193` {user}`flying-sheep`
* `as_dense` argument of `write_h5ad` no longer writes an array without encoding metadata {pr}`1193` {user}`flying-sheep`
#### Performance
* Improved performance of `concat_on_disk` with dense arrays in some cases {pr}`1169` {user}`selmanozleyen`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.3.md 0000664 0000000 0000000 00000000765 15003706322 0022070 0 ustar 00root root 0000000 0000000 (v0.10.3)=
### 0.10.3 {small}`2023-10-31`
#### Bug fixes
* Prevent pandas from causing infinite recursion when setting a slice of a categorical column {pr}`1211` {user}`flying-sheep`
#### Documentation
* Stop showing “Support for Awkward Arrays is currently experimental” warnings when
reading, concatenating, slicing, or transposing AnnData objects {pr}`1182` {user}`flying-sheep`
#### Other updates
* Fail canary CI job when tests raise unexpected warnings. {pr}`1182` {user}`flying-sheep`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.4.md 0000664 0000000 0000000 00000001400 15003706322 0022054 0 ustar 00root root 0000000 0000000 (v0.10.4)=
### 0.10.4 {small}`2024-01-04`
#### Bug fixes
* Only try to use `Categorical.map(na_action=…)` in actually supported Pandas ≥2.1 {pr}`1226` {user}`flying-sheep`
* `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko`
* `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold`
* `adata.X[mask]` works in newer `numpy` versions when `X` is `backed` {pr}`1255` {user}`ilan-gold`
* `adata.X[...]` fixed for `X` as a `BaseCompressedSparseDataset` with `zarr` backend {pr}`1265` {user}`ilan-gold`
* Improve read/write error reporting {pr}`1273` {user}`flying-sheep`
#### Documentation
* Improve aligned mapping error messages {pr}`1252` {user}`flying-sheep`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.5.md 0000664 0000000 0000000 00000001477 15003706322 0022073 0 ustar 00root root 0000000 0000000 (v0.10.5)=
### 0.10.5 {small}`2024-01-25`
#### Bug fixes
* Fix outer concatenation along variables when only a subset of objects had an entry in layers {pr}`1291` {user}`ivirshup`
* Fix comparison of >2d arrays in `uns` during concatenation {pr}`1300` {user}`ivirshup`
* Fix IO with awkward array version 2.5.2 {pr}`1328` {user}`ivirshup`
* Fix bug (introduced in 0.10.4) where indexing an AnnData with `list[bool]` would return the wrong result {pr}`1332` {user}`ivirshup`
#### Documentation
* Re-add search-as-you-type, this time via `readthedocs-sphinx-search` {pr}`1311` {user}`flying-sheep`
#### Performance
* `BaseCompressedSparseDataset`'s `indptr` is cached {pr}`1266` {user}`ilan-gold`
* Improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1233` {user}`ilan-gold`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.6.md 0000664 0000000 0000000 00000002745 15003706322 0022073 0 ustar 00root root 0000000 0000000 (v0.10.6)=
### 0.10.6 {small}`2024-03-11`
#### Bug fixes
* Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold`
* Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup`
* Bring optimization from {pr}`1233` to indexing on the whole `AnnData` object, not just the sparse dataset itself {pr}`1365` {user}`ilan-gold`
* Fix mean slice length checking to use improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1366` {user}`ilan-gold`
* Fixed overflow occurring when writing dask arrays with sparse chunks by always writing dask arrays with 64 bit indptr and indices, and adding an overflow check to `.append` method of sparse on disk structures {pr}`1348` {user}`ivirshup`
* Modified `ValueError` message for invalid `.X` during construction to show more helpful list instead of ambiguous `__name__` {pr}`1395` {user}`eroell`
* Pin `array-api-compat!=1.5` to avoid incorrect implementation of `asarray` {pr}`1411` {user}`ivirshup`
#### Documentation
* Type hints and docstrings for `.to_df` method are updated and fixed {pr}`1402` {user}`WeilerP`
#### Development
* `anndata`'s CI now tests against minimum versions of its dependencies. As a result, several dependencies had their minimum required version bumped. See diff for details {pr}`1314` {user}`ivirshup`
* `anndata` now tests against Python 3.12 {pr}`1373` {user}`ivirshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.7.md 0000664 0000000 0000000 00000000651 15003706322 0022066 0 ustar 00root root 0000000 0000000 (v0.10.7)=
### 0.10.7 {small}`2024-04-09`
#### Bug fixes
* Handle upstream `numcodecs` bug where read-only string arrays cannot be encoded {user}`ivirshup` {pr}`1421`
* Use in-memory sparse matrix directly to fix compatibility with `scipy` `1.13` {user}`ilan-gold` {pr}`1435`
#### Performance
* Remove `vindex` for subsetting `dask.array.Array` because of its slowness and memory consumption {user}`ilan-gold` {pr}`1432`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.8.md 0000664 0000000 0000000 00000000722 15003706322 0022066 0 ustar 00root root 0000000 0000000 (v0.10.8)=
### 0.10.8 {small}`2024-06-20`
#### Bug fixes
* Write out `64bit` indptr when appropriate for {func}`~anndata.experimental.concat_on_disk` {pr}`1493` {user}`ilan-gold`
* Support for Numpy 2 {pr}`1499` {user}`flying-sheep`
* Fix {func}`~anndata.io.sparse_dataset` docstring test on account of new {mod}`scipy` version {pr}`1514` {user}`ilan-gold`
#### Documentation
* Improved example for {func}`~anndata.io.sparse_dataset` {pr}`1468` {user}`ivirshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.10.9.md 0000664 0000000 0000000 00000002507 15003706322 0022072 0 ustar 00root root 0000000 0000000 (v0.10.9)=
### 0.10.9 {small}`2024-08-28`
#### Bug fixes
- Fix writing large number of columns for `h5` files {user}`ilan-gold` {user}`selmanozleyen` ({pr}`1147`)
- Add warning for setting `X` on a view with repeated indices {user}`ilan-gold` ({pr}`1501`)
- Coerce {class}`numpy.matrix` classes to arrays when trying to store them in `AnnData` {user}`flying-sheep` ({pr}`1516`)
- Fix for setting a dense `X` view with a sparse matrix {user}`ilan-gold` ({pr}`1532`)
- Upper bound {mod}`numpy` for `gpu` installation on account of {issue}`cupy/cupy#8391` {user}`ilan-gold` ({pr}`1540`)
- Upper bound dask on account of {issue}`1579` {user}`ilan-gold` ({pr}`1580`)
- Ensure setting {attr}`pandas.DataFrame.index` on a view of a {class}`~anndata.AnnData` instantiates the {class}`~pandas.DataFrame` from the view {user}`ilan-gold` ({pr}`1586`)
- Disallow using {class}`~pandas.DataFrame`s with multi-index columns {user}`ilan-gold` ({pr}`1589`)
#### Development Process
- create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`)
#### Documentation
- add `callback` typing for {func}`~anndata.experimental.read_dispatched` and {func}`~anndata.experimental.write_dispatched` {user}`ilan-gold` ({pr}`1557`)
#### Performance
- Support for `concat_on_disk` outer join {user}`ilan-gold` ({pr}`1504`)
python-anndata-0.12.0~rc1/docs/release-notes/0.11.0.md 0000664 0000000 0000000 00000010764 15003706322 0022066 0 ustar 00root root 0000000 0000000 (v0.11.0)=
### 0.11.0 {small}`2024-11-07`
Release candidates:
- (v0.11.0rc3)=
{guilabel}`rc3` 2024-10-14
- (v0.11.0rc2)=
{guilabel}`rc2` 2024-09-24
- (v0.11.0rc1)=
{guilabel}`rc1` 2024-09-04
#### Bug fixes
- Ensure {func}`anndata.concat` of {class}`~anndata.AnnData` object with {class}`scipy.sparse.spmatrix` and {class}`scipy.sparse.sparray` dask arrays uses the correct fill value of 0. {user}`ilan-gold` ({pr}`1719`)
- Ensure that views of AwkwardArrays have their "view" attributes removed on saving an {class}`~anndata.AnnData` object to disk. {user}`grst` ({pr}`1736`)
#### Breaking changes
- {guilabel}`rc3` Drop support for `python` 3.9 {user}`ilan-gold` ({pr}`1712`)
- {guilabel}`rc2` A new `anndata.io` module contains all `read_*` and `write_*` functions, and all imports of such functions should go through this module. Old ways of importing these functions, e.g., `from anndata import read_csv` or `from anndata._io.specs import read_elem`, will still work, but are now considered deprecated and give a warning on import, with the exception of {func}`anndata.io.read_zarr` and {func}`anndata.io.read_h5ad`, which will remain at the top-level `anndata` without warning. {user}`ilan-gold` ({pr}`1682`)
- {guilabel}`rc1` Removed deprecated modules `anndata.core` and `anndata.readwrite` {user}`ivirshup` ({pr}`1197`)
- {guilabel}`rc1` No longer export `sparse_dataset` from `anndata.experimental`, instead exporting {func}`anndata.io.sparse_dataset` {user}`ilan-gold` ({pr}`1642`)
- {guilabel}`rc1` Move `RWAble` and `InMemoryElem` out of `experimental`, renaming `RWAble` to {type}`~anndata.typing.AxisStorable` and `InMemoryElem` to {type}`~anndata.typing.RWAble` {user}`ilan-gold` ({pr}`1643`)
#### Development Process
- {guilabel}`rc2` Add extra `dask` dependency for installation i.e., `pip install anndata[dask]` {user}`ilan-gold` ({pr}`1677`)
- {guilabel}`rc2` Remove `shall_` from variable names in `settings` {user}`ilan-gold` ({pr}`1685`)
- {guilabel}`rc1` Create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`)
#### Documentation
- {guilabel}`rc1` Correct {attr}`anndata.AnnData.X` type to include {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` as possible types and begin the deprecation process for non-csr/csc {class}`scipy.sparse.spmatrix` types in {attr}`anndata.AnnData.X` {user}`ilan-gold` ({pr}`1616`)
#### Features
- Add support for ellipsis indexing of the {class}`~anndata.AnnData` object {user}`ilan-gold` ({pr}`1729`)
- {guilabel}`rc1` `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {user}`ilan-gold` {user}`isaac-virshup` ({pr}`1028`)
- {guilabel}`rc1` Allow `axis` parameter of e.g. {func}`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` ({pr}`1244`)
- {guilabel}`rc1` Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` ({pr}`1270`)
- {guilabel}`rc1` Add {attr}`~anndata.settings.remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1340`)
- {guilabel}`rc1` Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`)
- {guilabel}`rc1` Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}`falexwolf` ({pr}`1474`)
- {guilabel}`rc1` Add {attr}`~anndata.settings.check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1507`)
- {guilabel}`rc1` Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` ({pr}`1550`)
- {guilabel}`rc1` Read and write support for nullable string arrays ({class}`pandas.arrays.StringArray`). Use pandas’ {doc}`pandas:user_guide/options` `mode.string_storage` to control which storage mode is used when reading `dtype="string"` columns. {user}`flying-sheep` ({pr}`1558`)
- {guilabel}`rc1` Export {func}`~anndata.io.write_elem` and {func}`~anndata.io.read_elem` directly from the main package instead of `experimental` {user}`ilan-gold` ({pr}`1598`)
- {guilabel}`rc1` Allow reading sparse data (via {func}`~anndata.io.read_elem` or {func}`~anndata.io.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` via {attr}`anndata.settings.use_sparse_array_on_read` {user}`ilan-gold` ({pr}`1633`)
python-anndata-0.12.0~rc1/docs/release-notes/0.11.1.md 0000664 0000000 0000000 00000000560 15003706322 0022060 0 ustar 00root root 0000000 0000000 (v0.11.1)=
### 0.11.1 {small}`2024-11-12`
#### Bug fixes
- Remove upper pin on `dask` and exclude versions broken with sparse indexing {user}`ilan-gold` ({pr}`1725`)
- Fix chunking with -1 in `chunks` argument of `~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` ({pr}`1743`)
- Fix `cupy<0.13` imports in non-gpu environments {user}`ilan-gold` ({pr}`1754`)
python-anndata-0.12.0~rc1/docs/release-notes/0.11.2.md 0000664 0000000 0000000 00000001266 15003706322 0022065 0 ustar 00root root 0000000 0000000 (v0.11.2)=
### 0.11.2 {small}`2025-01-07`
#### Bug fixes
- Cache accesses to the `data` and `indices` arrays in {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` {user}`ilan-gold` ({pr}`1744`)
- Error out on floating point indices that are not actually integers {user}`ilan-gold` ({pr}`1746`)
- `write_elem` now filters out incompatible `dataset_kwargs` when saving zero-dimensional arrays {user}`ilia-kats` ({pr}`1783`)
- Add {mod}`scipy` 1.5 compatibility {user}`flying-sheep` ({pr}`1806`)
#### Performance
- Batch slice-based indexing in {class}`anndata.abc.CSRDataset` and {class}`anndata.abc.CSCDataset` for performance boost in `zarr` {user}`ilan-gold` ({pr}`1790`)
python-anndata-0.12.0~rc1/docs/release-notes/0.11.3.md 0000664 0000000 0000000 00000000171 15003706322 0022060 0 ustar 00root root 0000000 0000000 (v0.11.3)=
### 0.11.3 {small}`2025-01-10`
#### Bug fixes
- Upper bound `zarr` at runtime {user}`ilan-gold` ({pr}`1819`)
python-anndata-0.12.0~rc1/docs/release-notes/0.11.4.md 0000664 0000000 0000000 00000001550 15003706322 0022063 0 ustar 00root root 0000000 0000000 (v0.11.4)=
### 0.11.4 {small}`2025-03-26`
#### Bug fixes
- Raise {class}`~anndata.ImplicitModificationWarning` when setting `X` on a view. {user}`ilan-gold` ({pr}`1853`)
- Bound `dask` due to {issue}`dask/dask#11752` {user}`ilan-gold` ({pr}`1859`)
- Fix concatenation of {class}`anndata.AnnData` objects along `var` using `join="outer"` when `varm` is not empty. {user}`ilia-kats` ({pr}`1911`)
- Add `convert_strings_to_categoricals` parameter also to {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` as intended {user}`flying-sheep` ({pr}`1914`)
- Allow initialization of {class}`anndata.AnnData` objects without `X` (since they could be constructed previously by deleting `X`) {user}`ilan-gold` ({pr}`1941`)
#### Development Process
- Fix version number inference in development environments (CI and local) {user}`flying-sheep` ({pr}`1831`)
python-anndata-0.12.0~rc1/docs/release-notes/0.12.0rc1.md 0000664 0000000 0000000 00000000111 15003706322 0022456 0 ustar 00root root 0000000 0000000 (v0.12.0rc1)=
### 0.12.0rc1 {small}`2025-04-09`
No significant changes.
python-anndata-0.12.0~rc1/docs/release-notes/0.4.0.md 0000664 0000000 0000000 00000001007 15003706322 0021776 0 ustar 00root root 0000000 0000000 (v0.4.0)=
### 0.4.0 {small}`23 December, 2017`
- read/write [.loom](https://loompy.org) files
- scalability beyond dataset sizes that fit into memory: see this [blog post]
- {class}`~anndata.AnnData` has a {class}`~anndata.AnnData.raw` attribute, which simplifies storing the data matrix when you consider it *raw*: see the [clustering tutorial]
[blog post]: http://falexwolf.de/blog/171223_AnnData_indexing_views_HDF5-backing/
[clustering tutorial]: https://github.com/scverse/scanpy_usage/tree/master/170505_seurat
python-anndata-0.12.0~rc1/docs/release-notes/0.5.0.md 0000664 0000000 0000000 00000000715 15003706322 0022004 0 ustar 00root root 0000000 0000000 (v0.5.0)=
### 0.5.0 {small}`9 February, 2018`
- inform about duplicates in {class}`~anndata.AnnData.var_names` and resolve them using {func}`~anndata.AnnData.var_names_make_unique`
- automatically remove unused categories after slicing
- read/write [.loom](https://loompy.org) files using loompy 2
- fixed read/write for a few text file formats
- read [UMI tools] files: {func}`~anndata.io.read_umi_tools`
[umi tools]: https://github.com/CGATOxford/UMI-tools
python-anndata-0.12.0~rc1/docs/release-notes/0.6.0.md 0000664 0000000 0000000 00000000634 15003706322 0022005 0 ustar 00root root 0000000 0000000 (v0.6.0)=
### 0.6.0 {small}`1 May, 2018`
- compatibility with Seurat converter
- tremendous speedup for {meth}`~anndata.AnnData.concatenate`
- bug fix for deep copy of unstructured annotation after slicing
- bug fix for reading HDF5 stored single-category annotations
- `'outer join'` concatenation: adds zeros for concatenation of sparse data and nans for dense data
- better memory efficiency in loom exports
python-anndata-0.12.0~rc1/docs/release-notes/0.6.x.md 0000664 0000000 0000000 00000002426 15003706322 0022116 0 ustar 00root root 0000000 0000000 (v0.6.x)=
### 0.6.\* {small}`2019-*-*`
- better support for aligned mappings (obsm, varm, layers)
`0.6.22` {pr}`155` {smaller}`I Virshup`
- convenience accessors {func}`~anndata.AnnData.obs_vector`, {func}`~anndata.AnnData.var_vector` for 1d arrays.
`0.6.21` {pr}`144` {smaller}`I Virshup`
- compatibility with Scipy >=1.3 by removing `IndexMixin` dependency.
`0.6.20` {pr}`151` {smaller}`P Angerer`
- bug fix for second-indexing into views.
`0.6.19` {smaller}`P Angerer`
- bug fix for reading excel files.
`0.6.19` {smaller}`A Wolf`
- changed default compression to `None` in {func}`~anndata.AnnData.write_h5ad` to speed up read and write, disk space use is usually less critical.
`0.6.16` {smaller}`A Wolf`
- maintain dtype upon copy.
`0.6.13` {smaller}`A Wolf`
- {attr}`~anndata.AnnData.layers` inspired by [.loom](https://loompy.org) files allows their information lossless reading via {func}`~anndata.io.read_loom`.
`0.6.7`–`0.6.9` {pr}`46` & {pr}`48` {smaller}`S Rybakov`
- support for reading zarr files: {func}`~anndata.io.read_zarr`
`0.6.7` {pr}`38` {smaller}`T White`
- initialization from pandas DataFrames
`0.6.` {smaller}`A Wolf`
- iteration over chunks {func}`~anndata.AnnData.chunked_X` and {func}`~anndata.AnnData.chunk_X`
`0.6.1` {pr}`20` {smaller}`S Rybakov`
python-anndata-0.12.0~rc1/docs/release-notes/0.7.0.md 0000664 0000000 0000000 00000004646 15003706322 0022015 0 ustar 00root root 0000000 0000000 (v0.7.0)=
### 0.7.0 {small}`22 January, 2020`
```{warning}
Breaking changes introduced between `0.6.22.post1` and `0.7`:
- Elements of {class}`~anndata.AnnData`s don’t have their dimensionality reduced when the main object is subset.
This is to maintain consistency when subsetting. See discussion in {issue}`145`.
- Internal modules like `anndata.core` are private and their contents are not stable: See {issue}`174`.
- The old deprecated attributes `.smp*`, `.add` and `.data` have been removed.
```
#### View overhaul {pr}`164`
- Indexing into a view no longer keeps a reference to intermediate view, see {issue}`62`.
- Views are now lazy. Elements of view of AnnData are not indexed until they’re accessed.
- Indexing with scalars no longer reduces dimensionality of contained arrays, see {issue}`145`.
- All elements of AnnData should now follow the same rules about how they’re subset, see {issue}`145`.
- Can now index by observations and variables at the same time.
#### IO overhaul {pr}`167`
- Reading and writing has been overhauled for simplification and speed.
- Time and memory usage can be half of previous in typical use cases
- Zarr backend now supports sparse arrays, and generally is closer to having the same features as HDF5.
- Backed mode should see significant speed and memory improvements for access along compressed dimensions and IO. PR {pr}`241`.
- {class}`~pandas.Categorical`s can now be ordered (PR {pr}`230`) and written to disk with a large number of categories (PR {pr}`217`).
#### Mapping attributes overhaul {smaller}`(obsm, varm, layers, ...)`
- New attributes {attr}`~anndata.AnnData.obsp` and {attr}`~anndata.AnnData.varp` have been added for two dimensional arrays where each axis corresponds to a single axis of the AnnData object. PR {pr}`207`.
- These are intended to store values like cell-by-cell graphs, which are currently stored in {attr}`~anndata.AnnData.uns`.
- Sparse arrays are now allowed as values in all mapping attributes.
- DataFrames are now allowed as values in {attr}`~anndata.AnnData.obsm` and {attr}`~anndata.AnnData.varm`.
- All mapping attributes now share an implementation and will have the same behaviour. PR {pr}`164`.
#### Miscellaneous improvements
- Mapping attributes now have ipython tab completion (e.g. `adata.obsm["\\t` can provide suggestions) PR {pr}`183`.
- {class}`~anndata.AnnData` attributes are now delete-able (e.g. `del adata.raw`) PR {pr}`242`.
- Many many bug fixes
python-anndata-0.12.0~rc1/docs/release-notes/0.7.2.md 0000664 0000000 0000000 00000003502 15003706322 0022005 0 ustar 00root root 0000000 0000000 (v0.7.2)=
### 0.7.2 {small}`15 May, 2020`
#### Concatenation overhaul {smaller}`I Virshup`
- Elements of `uns` can now be merged, see {pr}`350`
- Outer joins now work for `layers` and `obsm`, see {pr}`352`
- Fill value for outer joins can now be specified
- Expect improvements in performance, see {issue}`303`
#### Functionality
- {attr}`~anndata.AnnData.obsp` and {attr}`~anndata.AnnData.varp` can now be transposed {pr}`370` {smaller}`A Wolf`
- {meth}`~anndata.AnnData.obs_names_make_unique` is now better at making values unique, and will warn if ambiguities arise {pr}`345` {smaller}`M Weiden`
- {attr}`~anndata.AnnData.obsp` is now preferred for storing pairwise relationships between observations. In practice, this means there will be deprecation warnings and reformatting applied to objects which stored connectivities under `uns["neighbors"]`. Square matrices in {attr}`~anndata.AnnData.uns` will no longer be sliced (use `.{obs,var}p` instead). {pr}`337` {smaller}`I Virshup`
- {class}`~anndata.ImplicitModificationWarning` is now exported {pr}`315` {smaller}`P Angerer`
- Better support for {class}`~numpy.ndarray` subclasses stored in `AnnData` objects {pr}`335` {smaller}`michalk8`
#### Bug fixes
- Fixed inplace modification of {class}`~pandas.Index` objects by the make unique function {pr}`348` {smaller}`I Virshup`
- Passing ambiguous keys to {meth}`~anndata.AnnData.obs_vector` and {meth}`~anndata.AnnData.var_vector` now throws errors {pr}`340` {smaller}`I Virshup`
- Fix instantiating {class}`~anndata.AnnData` objects from {class}`~pandas.DataFrame` {pr}`316` {smaller}`P Angerer`
- Fixed indexing into `AnnData` objects with arrays like `adata[adata[:, gene].X > 0]` {pr}`332` {smaller}`I Virshup`
- Fixed type of version {pr}`315` {smaller}`P Angerer`
- Fixed deprecated import from {mod}`pandas` {pr}`319` {smaller}`P Angerer`
python-anndata-0.12.0~rc1/docs/release-notes/0.7.3.md 0000664 0000000 0000000 00000000225 15003706322 0022005 0 ustar 00root root 0000000 0000000 (v0.7.3)=
### 0.7.3 {small}`20 May, 2020`
#### Bug fixes
- Fixed bug where graphs used too much memory when copying {pr}`381` {smaller}`I Virshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.7.4.md 0000664 0000000 0000000 00000001135 15003706322 0022007 0 ustar 00root root 0000000 0000000 (v0.7.4)=
### 0.7.4 {small}`10 July, 2020`
#### Concatenation overhaul {pr}`378` {smaller}`I Virshup`
- New function {func}`anndata.concat` for concatenating `AnnData` objects along either observations or variables
- New documentation section: {doc}`/concatenation`
#### Functionality
- AnnData object created from dataframes with sparse values will have sparse `.X` {pr}`395` {smaller}`I Virshup`
#### Bug fixes
- Fixed error from `AnnData.concatenate` by bumping minimum versions of numpy and pandas {issue}`385`
- Fixed colors being incorrectly changed when `AnnData` object was subset {pr}`388`
python-anndata-0.12.0~rc1/docs/release-notes/0.7.5.md 0000664 0000000 0000000 00000000623 15003706322 0022011 0 ustar 00root root 0000000 0000000 (v0.7.5)=
### 0.7.5 {small}`12 November, 2020`
#### Functionality
- Added ipython tab completion and a useful return from `.keys` to `adata.uns` {pr}`415` {smaller}`I Virshup`
#### Bug fixes
- Compatibility with `h5py>=3` strings {pr}`444` {smaller}`I Virshup`
- Allow `adata.raw = None`, as is documented {pr}`447` {smaller}`I Virshup`
- Fix warnings from pandas 1.1 {pr}`425` {smaller}`I Virshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.7.6.md 0000664 0000000 0000000 00000002750 15003706322 0022015 0 ustar 00root root 0000000 0000000 (v0.7.6)=
### 0.7.6 {small}`11 April, 2021`
#### Features
- Added {meth}`anndata.AnnData.to_memory` for returning an in memory object from a backed one {pr}`470` {pr}`542` {smaller}`V Bergen` {smaller}`I Virshup`
- {meth}`anndata.AnnData.write_loom` now writes `obs_names` and `var_names` using the `Index`'s `.name` attribute, if set {pr}`538` {smaller}`I Virshup`
#### Bug fixes
- Fixed bug where `np.str_` column names errored at write time {pr}`457` {smaller}`I Virshup`
- Fixed "value.index does not match parent’s axis 0/1 names" error triggered when a data frame is stored in obsm/varm after obs_names/var_names is updated {pr}`461` {smaller}`G Eraslan`
- Fixed `adata.write_csvs` when `adata` is a view {pr}`462` {smaller}`I Virshup`
- Fixed null values being converted to strings when strings are converted to categorical {pr}`529` {smaller}`I Virshup`
- Fixed handling of compression key word arguments {pr}`536` {smaller}`I Virshup`
- Fixed copying a backed `AnnData` from changing which file the original object points at {pr}`533` {smaller}`ilia-kats`
- Fixed a bug where calling `AnnData.concatenate` an `AnnData` with no variables would error {pr}`537` {smaller}`I Virshup`
#### Deprecations
- Passing positional arguments to {func}`anndata.io.read_loom` besides the path is now deprecated {pr}`538` {smaller}`I Virshup`
- {func}`anndata.io.read_loom` arguments `obsm_names` and `varm_names` are now deprecated in favour of `obsm_mapping` and `varm_mapping` {pr}`538` {smaller}`I Virshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.7.7.md 0000664 0000000 0000000 00000001232 15003706322 0022010 0 ustar 00root root 0000000 0000000 (v0.7.7)=
### 0.7.7 {small}`9 November, 2021`
#### Bug fixes
- Fixed propagation of import error when importing `write_zarr` but not all dependencies are installed {pr}`579` {smaller}`R Hillje`
- Fixed issue with `.uns` sub-dictionaries being referenced by copies {pr}`576` {smaller}`I Virshup`
- Fixed out-of-bounds integer indices not raising {class}`IndexError` {pr}`630` {smaller}`M Klein`
- Fixed backed `SparseDataset` indexing with scipy 1.7.2 {pr}`638` {smaller}`I Virshup`
#### Development processes
- Use PEPs 621 (standardized project metadata), 631 (standardized dependencies), and 660 (standardized editable installs) {pr}`639` {smaller}`I Virshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.7.8.md 0000664 0000000 0000000 00000000170 15003706322 0022011 0 ustar 00root root 0000000 0000000 (v0.7.8)=
### 0.7.8 {small}`9 November, 2021`
#### Bug fixes
- Re-include test helpers {pr}`641` {smaller}`I Virshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.8.0.md 0000664 0000000 0000000 00000004372 15003706322 0022012 0 ustar 00root root 0000000 0000000 (v0.8.0)=
### 0.8.0 {small}`14th March, 2022`
#### IO Specification
```{warning}
The on disk format of AnnData objects has been updated with this release.
Previous releases of `anndata` will not be able to read all files written by this version.
For discussion of possible future solutions to this issue, see {issue}`698`
```
Internal handling of IO has been overhauled.
This should make it much easier to support new datatypes, use partial access, and use `AnnData` internally in other formats.
- Each element should be tagged with an `encoding_type` and `encoding_version`. See updated docs on the {doc}`file format `
- Support for nullable integer and boolean data arrays. More data types to come!
- Experimental support for low level access to the IO API via {func}`~anndata.io.read_elem` and {func}`~anndata.io.write_elem`
#### Features
- Added PyTorch dataloader {class}`~anndata.experimental.AnnLoader` and lazy concatenation object {class}`~anndata.experimental.AnnCollection`. See the [tutorials] {pr}`416` {smaller}`S Rybakov`
- Compatibility with `h5ad` files written from Julia {pr}`569` {smaller}`I Kats`
- Many logging messages that should have been warnings are now warnings {pr}`650` {smaller}`I Virshup`
- Significantly more efficient {func}`anndata.io.read_umi_tools` {pr}`661` {smaller}`I Virshup`
- Fixed deepcopy of a copy of a view retaining sparse matrix view mixin type {pr}`670` {smaller}`M Klein`
- In many cases {attr}`~anndata.AnnData.X` can now be `None` {pr}`463` {smaller}`R Cannoodt` {pr}`677` {smaller}`I Virshup`. Remaining work is documented in {issue}`467`.
- Removed hard `xlrd` dependency {smaller}`I Virshup`
- `obs` and `var` dataframes are no longer copied by default on `AnnData` instantiation {issue}`371` {smaller}`I Virshup`
#### Bug fixes
- Fixed issue where `.copy` was creating sparse matrices views when copying {pr}`670` {smaller}`michalk8`
- Fixed issue where `.X` matrix read in from `zarr` would always have `float32` values {pr}`701` {smaller}`I Virshup`
- `Raw.to_adata` now includes `obsp` in the output {pr}`404` {smaller}`G Eraslan`
#### Dependencies
- `xlrd` dropped as a hard dependency
- Now requires `h5py` `v3.0.0` or newer
[tutorials]: https://anndata-tutorials.readthedocs.io/en/latest/index.html
python-anndata-0.12.0~rc1/docs/release-notes/0.9.0.md 0000664 0000000 0000000 00000005754 15003706322 0022020 0 ustar 00root root 0000000 0000000 (v0.9.0)=
### 0.9.0 {small}`2023-04-11`
#### Features
- Added experimental support for dask arrays {pr}`813` {user}`syelman` {user}`rahulbshrestha`
- `obsm`, `varm` and `uns` can now hold [AwkwardArrays](https://awkward-array.org/quickstart.html) {pr}`647` {user}`giovp`, {user}`grst`, {user}`ivirshup`
- Added experimental functions {func}`anndata.experimental.read_dispatched` and {func}`anndata.experimental.write_dispatched` which allow customizing IO with a callback {pr}`873` {user}`ilan-gold` {user}`ivirshup`
- Better error messages during IO {pr}`734` {user}`flying-sheep`, {user}`ivirshup`
- Unordered categorical columns are no longer cast to object during {func}`anndata.concat` {pr}`763` {user}`ivirshup`
#### Documentation
- New tutorials for experimental features
> - {doc}`/tutorials/notebooks/anndata_dask_array` – {pr}`886` {user}`syelman`
> - {doc}`/tutorials/notebooks/{read,write}_dispatched` – {pr}`scverse/anndata-tutorials#17` {user}`ilan-gold`
> - {doc}`/tutorials/notebooks/awkward-arrays` – {pr}`scverse/anndata-tutorials#15` {user}`grst`
- {doc}`File format description ` now includes a more formal specification {pr}`882` {user}`ivirshup`
- {doc}`/interoperability`: new page on interoperability with other packages {pr}`831` {user}`ivirshup`
- Expanded docstring documentation for the `backed` argument of {func}`anndata.io.read_h5ad` {pr}`812` {user}`jeskowagner`
- Documented how to use alternative compression methods for the `h5ad` file format, see {meth}`AnnData.write_h5ad() ` {pr}`857` {user}`nigeil`
- General typo corrections 😅 {pr}`870` {user}`folded`
#### Breaking changes
- The `AnnData` `dtype` argument no longer defaults to `float32` {pr}`854` {user}`ivirshup`
- Previously deprecated `force_dense` argument {meth}`AnnData.write_h5ad() ` has been removed. {pr}`855` {user}`ivirshup`
- Previously deprecated behaviour around storing adjacency matrices in `uns` has been removed {pr}`866` {user}`ivirshup`
#### Other updates
- Bump minimum python version to 3.8 {pr}`820` {user}`ivirshup`
#### Deprecations
- {meth}`AnnData.concatenate() ` is now deprecated in favour of {func}`anndata.concat` {pr}`845` {user}`ivirshup`
#### Bug fixes
- Fix warning from `rename_categories` {pr}`790` {smaller}`I Virshup`
- Remove backwards compat checks for categories in `uns` when we can tell the file is new enough {pr}`790` {smaller}`I Virshup`
- Categorical arrays are now created with a python `bool` instead of a `numpy.bool_` {pr}`856`
- Fixed order dependent outer concatenation bug {pr}`904` {user}`ivirshup`, reported by {user}`szalata`
- Fixed bug in renaming categories {pr}`790` {user}`ivirshup`, reported by {user}`perrin-isir`
- Fixed IO bug when keys in `uns` ended in `_categories` {pr}`806` {user}`ivirshup`, reported by {user}`Hrovatin`
- Fixed `raw.to_adata` not populating `obs` aligned values when `raw` was assigned through the setter {pr}`939` {user}`ivirshup`
python-anndata-0.12.0~rc1/docs/release-notes/0.9.1.md 0000664 0000000 0000000 00000000154 15003706322 0022006 0 ustar 00root root 0000000 0000000 (v0.9.1)=
### 0.9.1 {small}`2023-04-11`
#### Bug fixes
* Fixing windows support {pr}`958` {user}`Koncopd`
python-anndata-0.12.0~rc1/docs/release-notes/0.9.2.md 0000664 0000000 0000000 00000000704 15003706322 0022010 0 ustar 00root root 0000000 0000000 (v0.9.2)=
### 0.9.2 {small}`2023-07-25`
#### Bug fixes
* Views of `awkward.Array`s now work with `awkward>=2.3` {pr}`1040` {user}`ivirshup`
* Fix ufuncs of views like `adata.X[:10].cov(axis=0)` returning views {pr}`1043` {user}`flying-sheep`
* Fix instantiating AnnData where `.X` is a `DataFrame` with an integer valued index {pr}`1002` {user}`flying-sheep`
* Fix {func}`~anndata.io.read_zarr` when used on `zarr.Group` {pr}`1057` {user}`ivirshup`
python-anndata-0.12.0~rc1/docs/release-notes/index.md 0000664 0000000 0000000 00000000052 15003706322 0022445 0 ustar 00root root 0000000 0000000 # Release notes
```{release-notes} .
```
python-anndata-0.12.0~rc1/docs/tutorials/ 0000775 0000000 0000000 00000000000 15003706322 0020277 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/docs/tutorials/index.md 0000664 0000000 0000000 00000000715 15003706322 0021733 0 ustar 00root root 0000000 0000000 # Tutorials
For a quick introduction to `AnnData`, check out {doc}`Getting Started with AnnData `.
For working with the experimental data loaders also see {ref}`experimental-api`.
```{toctree}
:maxdepth: 1
notebooks/getting-started
notebooks/annloader
notebooks/anncollection
notebooks/anncollection-annloader
notebooks/anndata_dask_array
notebooks/awkward-arrays
notebooks/{read,write}_dispatched
notebooks/read_lazy
zarr-v3
```
python-anndata-0.12.0~rc1/docs/tutorials/zarr-v3.md 0000664 0000000 0000000 00000016016 15003706322 0022131 0 ustar 00root root 0000000 0000000 # zarr-v3 Guide/Roadmap
`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format via {attr}`anndata.settings.zarr_write_format`, with the exception of structured arrays.
Users should notice a significant performance improvement, especially for cloud data, but also likely for local data as well.
Here is a quick guide on some of our learnings so far:
## Remote data
We now provide the {func}`anndata.experimental.read_lazy` feature for reading as much of the {class}`~anndata.AnnData` object as lazily as possible, using `dask` and {mod}`xarray`.
Please note that this feature is experimental and subject to change.
To enable this functionality in a performant and feature-complete way for remote data sources, we use {doc}`zarr:user-guide/consolidated_metadata` on the `zarr` store (written by default).
Please note that this introduces consistency issues – if you update the structure of the underlying `zarr` store, e.g., by removing a column from `obs`, the consolidated metadata will no longer be valid.
Further, note that without consolidated metadata, we cannot guarantee your stored `AnnData` object will be fully readable.
And even if it is fully readable, it will almost certainly be much slower to read.
There are two ways of opening remote `zarr` stores from the `zarr-python` package, {class}`zarr.storage.FsspecStore` and {class}`zarr.storage.ObjectStore`, and both can be used with `read_lazy`.
[`obstore` claims] to be more performant out-of-the-box, but notes that this claim has not been benchmarked with the `uvloop` event loop, which itself claims to be 2× more performant than the default event loop for `python`.
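As a minimal sketch of putting these pieces together (the bucket URL and `storage_options` below are hypothetical, and {class}`zarr.storage.ObjectStore` could be used in place of `FsspecStore`):
```python
import zarr
import anndata as ad

# Hypothetical public bucket; anonymous access is assumed via `storage_options`.
store = zarr.storage.FsspecStore.from_url(
    "s3://my-bucket/my-dataset.zarr",
    storage_options={"anon": True},
    read_only=True,
)
# Experimental: read as lazily as possible (dask/xarray-backed).
adata = ad.experimental.read_lazy(store)
```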
## Local data
Local data generally poses a different set of challenges.
First, write speeds can be somewhat slow, and second, creating many small files can slow down the filesystem.
For the "many small files" problem, `zarr` has introduced {ref}`sharding ` in the v3 file format.
Sharding requires knowledge of the array element you are writing (such as shape or data type), though, and therefore you will need {func}`anndata.experimental.write_dispatched` to enable it.
For example, you cannot shard a 1D array with `shard` sizes `(256, 256)`.
Here is a short example, although you should tune the sizes to your own use-case and also use the compression that makes the most sense for you:
```python
import zarr
import anndata as ad
from collections.abc import Mapping
from typing import Any

ad.settings.zarr_write_format = 3  # Absolutely crucial! Sharding is only for the v3 file format!


def write_sharded(group: zarr.Group, adata: ad.AnnData):
    def callback(
        func: ad.experimental.Write,
        g: zarr.Group,
        k: str,
        elem: ad.typing.RWAble,
        dataset_kwargs: Mapping[str, Any],
        iospec: ad.experimental.IOSpec,
    ):
        if iospec.encoding_type in {"array"}:
            dataset_kwargs = {
                "shards": tuple(int(2 ** (16 / len(elem.shape))) for _ in elem.shape),
                **dataset_kwargs,
            }
            dataset_kwargs["chunks"] = tuple(i // 2 for i in dataset_kwargs["shards"])
        elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}:
            dataset_kwargs = {"shards": (2**16,), "chunks": (2**8,), **dataset_kwargs}
        func(g, k, elem, dataset_kwargs=dataset_kwargs)

    return ad.experimental.write_dispatched(group, "/", adata, callback=callback)
```
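A usage sketch for the helper above (the output path is illustrative, and `zarr`, `ad`, and `write_sharded` come from the previous block):
```python
import numpy as np

adata = ad.AnnData(X=np.zeros((1_000, 100), dtype="float32"))
group = zarr.open_group("example-sharded.zarr", mode="w")
write_sharded(group, adata)
```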
However, `zarr-python` can be slow with sharding throughput as well as writing throughput.
Thus if you wish to speed up either writing, sharding, or both (or receive a modest speed-boost for reading), a bridge to the `zarr` implementation in Rust {doc}`zarrs-python ` can help with that (see the [zarr-benchmarks]):
```
uv pip install zarrs
```
```python
import zarr
import zarrs
zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
```
However, this pipeline is not compatible with all types of zarr store, especially remote stores, and there are limitations on where rust can give a performance boost for indexing.
We therefore recommend this pipeline for writing full datasets and reading contiguous regions of said written data.
## Codecs
The default `zarr-python` v3 codec for the v3 format is no longer `blosc` but `zstd`.
While `zstd` is more widespread, you may find its performance to not meet your old expectations.
Therefore, we recommend passing in the {class}`zarr.codecs.BloscCodec` to `compressor` on {func}`~anndata.AnnData.write_zarr` if you wish to return to the old behavior.
There is currently a bug with `numcodecs` that prevents data written from other non-numcodecs `zstd` implementations from being read in by the default zarr pipeline (to which the above rust pipeline falls back if it cannot handle a datatype or indexing scheme, like `vlen-string`): {issue}`zarr-developers/numcodecs#424`.
Thus it may be advisable to use `BloscCodec` with `zarr` v3 file format data if you wish to use the rust-accelerated pipeline until this issue is resolved.
The same issue with `zstd` applies to data that may eventually be written by the GPU `zstd` implementation (see below).
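For example, a minimal sketch of opting back into blosc compression on write (the file name and codec parameters are illustrative; the `compressor` keyword is the one referred to above):
```python
import numpy as np
import zarr
import anndata as ad

ad.settings.zarr_write_format = 3  # write the zarr v3 on-disk format

adata = ad.AnnData(X=np.zeros((100, 50), dtype="float32"))
# Use blosc (here wrapping lz4) instead of the default zstd codec.
adata.write_zarr(
    "example-blosc.zarr",
    compressor=zarr.codecs.BloscCodec(cname="lz4", clevel=5),
)
```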
## Dask
Zarr v3 should be compatible with dask, although dask's default behavior is to adopt zarr's chunking as its own.
With sharding, this may be undesirable, as shards often contain many small chunks, which slows down i/o because dask will need to index into the zarr store for every chunk.
Therefore it may be better to customize this behavior by passing `chunks=my_zarr_array.shards` as an argument to {func}`dask.array.from_zarr` or similar.
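A sketch of that pattern (the path is hypothetical and assumes `X` was written with shards as shown earlier):
```python
import dask.array as da
import zarr

x = zarr.open_array("example-sharded.zarr", path="X", mode="r")
# Align dask chunks with the zarr shards instead of the much smaller zarr chunks.
x_dask = da.from_zarr(x, chunks=x.shards)
```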
## GPU i/o
At the moment, it is unlikely your `anndata` i/o will work if you use {ref}`zarr.config.enable_gpu `.
It's *possible* that dense data i/o, i.e., using {func}`anndata.io.read_elem`, will work as expected, but this functionality is untested – sparse data, awkward arrays, and dataframes will not work.
`kvikio` currently provides a {class}`kvikio.zarr.GDSStore`, although there are no working compressors exported from the `zarr-python` package at the moment (work is underway for `Zstd`: {pr}`zarr-developers/zarr-python#2863`).
We anticipate officially supporting this functionality for dense data, sparse data, and possibly awkward arrays in the next minor release, 0.13.
## Asynchronous i/o
At the moment, `anndata` exports no `async` functions.
However, `zarr-python` has a fully `async` API and provides its own event loop so that users like `anndata` can interact with a synchronous API while still benefiting from `zarr-python`'s asynchronous functionality under that API.
We anticipate providing `async` versions of {func}`anndata.io.read_elem` and {func}`anndata.experimental.read_dispatched` so that users can download data asynchronously without using the `zarr-python` event loop.
We also would like to create an asynchronous partial reader to enable iterative streaming of a dataset.
[`obstore` claims]: https://developmentseed.org/obstore/latest/performance
[zarr-benchmarks]: https://github.com/LDeakin/zarr_benchmarks
python-anndata-0.12.0~rc1/hatch.toml 0000664 0000000 0000000 00000002250 15003706322 0017304 0 ustar 00root root 0000000 0000000 [envs.default]
installer = "uv"
features = [ "dev" ]
[envs.docs]
features = [ "doc" ]
scripts.build = "sphinx-build -M html docs docs/_build -W --keep-going {args}"
scripts.open = "python3 -m webbrowser -t docs/_build/html/index.html"
scripts.clean = "git clean -fdX -- {args:docs}"
[envs.towncrier]
scripts.create = "towncrier create {args}"
scripts.build = "python3 ci/scripts/towncrier_automation.py {args}"
scripts.clean = "git restore --source=HEAD --staged --worktree -- docs/release-notes"
[envs.hatch-test]
default-args = [ ]
features = [ "dev", "test" ]
extra-dependencies = [ "ipykernel" ]
env-vars.UV_CONSTRAINT = "ci/constraints.txt"
overrides.matrix.deps.env-vars = [
{ if = [ "pre" ], key = "UV_PRERELEASE", value = "allow" },
{ if = [ "min" ], key = "UV_CONSTRAINT", value = "ci/constraints.txt ci/min-deps.txt" },
]
overrides.matrix.deps.pre-install-commands = [
{ if = [ "min" ], value = "uv run ci/scripts/min-deps.py pyproject.toml --all-extras -o ci/min-deps.txt" },
]
overrides.matrix.deps.python = [
{ if = [ "min" ], value = "3.11" },
{ if = [ "stable", "pre" ], value = "3.13" },
]
[[envs.hatch-test.matrix]]
deps = [ "stable", "pre", "min" ]
python-anndata-0.12.0~rc1/pyproject.toml 0000664 0000000 0000000 00000017035 15003706322 0020243 0 ustar 00root root 0000000 0000000 [build-system]
build-backend = "hatchling.build"
requires = [ "hatchling", "hatch-vcs" ]
[project]
name = "anndata"
description = "Annotated data."
requires-python = ">=3.11"
license = "BSD-3-Clause"
authors = [
{ name = "Philipp Angerer" },
{ name = "Alex Wolf" },
{ name = "Isaac Virshup" },
{ name = "Sergei Rybakov" },
]
maintainers = [
{ name = "Isaac Virshup", email = "ivirshup@gmail.com" },
{ name = "Philipp Angerer", email = "philipp.angerer@helmholtz-munich.de" },
{ name = "Ilan Gold", email = "ilan.gold@helmholtz-munich.de" },
]
readme = "README.md"
classifiers = [
"Environment :: Console",
"Framework :: Jupyter",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Natural Language :: English",
"Operating System :: MacOS :: MacOS X",
"Operating System :: Microsoft :: Windows",
"Operating System :: POSIX :: Linux",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Topic :: Scientific/Engineering :: Visualization",
]
dependencies = [
# pandas 2.1.0rc0 has pandas/issues/54622
"pandas >=2.0.0, !=2.1.0rc0, !=2.1.2",
"numpy>=1.25",
# https://github.com/scverse/anndata/issues/1434
"scipy >1.11",
"h5py>=3.8",
"natsort",
"packaging>=24.2",
# array-api-compat 1.5 has https://github.com/scverse/anndata/issues/1410
"array_api_compat>1.4,!=1.5",
"legacy-api-wrap",
"zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3",
]
dynamic = [ "version" ]
[project.urls]
Documentation = "https://anndata.readthedocs.io/"
Source = "https://github.com/scverse/anndata"
Home-page = "https://github.com/scverse/anndata"
[project.optional-dependencies]
dev = [
# runtime dev version generation
"hatch-vcs",
"anndata[dev-doc,test]",
]
doc = [
"sphinx>=8.2.1",
"sphinx-book-theme>=1.1.0",
"sphinx-autodoc-typehints>=2.2.0",
"sphinx-issues",
"sphinx-copybutton",
"sphinx-toolbox>=3.8.0",
"sphinxext.opengraph",
"myst-nb",
"scanpydoc[theme,typehints] >=0.15.1",
"awkward>=2.3",
"IPython", # For syntax highlighting in notebooks
"myst_parser",
"sphinx_design>=0.5.0",
"readthedocs-sphinx-search",
# for unreleased changes
"anndata[dev-doc,dask]",
"awkward>=2.3",
]
dev-doc = [ "towncrier>=24.8.0" ] # release notes tool
test-full = [ "anndata[test,lazy]" ]
test = [
"loompy>=3.0.5",
"pytest>=8.2,<8.3.4",
"pytest-cov",
"pytest-randomly",
"pytest-memray",
"pytest-mock",
"pytest-xdist[psutil]",
"filelock",
"matplotlib",
"scikit-learn",
"openpyxl",
"joblib",
"boltons",
"scanpy>=1.10",
"httpx", # For data downloading
"dask[distributed]",
"awkward>=2.3",
"pyarrow",
"anndata[dask]",
]
gpu = [ "cupy" ]
cu12 = [ "cupy-cuda12x" ]
cu11 = [ "cupy-cuda11x" ]
# requests and aiohttp needed for zarr remote data
lazy = [ "xarray>=2024.06.0", "aiohttp", "requests", "anndata[dask]" ]
# https://github.com/dask/dask/issues/11290
# https://github.com/dask/dask/issues/11752
dask = [ "dask[array]>=2023.5.1,!=2024.8.*,!=2024.9.*,<2025.2.0" ]
[tool.hatch.version]
source = "vcs"
raw-options.version_scheme = "release-branch-semver"
[tool.hatch.build.targets.wheel]
packages = [ "src/anndata", "src/testing" ]
[tool.coverage.run]
data_file = "test-data/coverage"
source_pkgs = [ "anndata" ]
omit = [ "src/anndata/_version.py", "**/test_*.py" ]
concurrency = [ "multiprocessing" ]
parallel = "true"
[tool.coverage.xml]
output = "test-data/coverage.xml"
[tool.coverage.paths]
source = [ "./src", "**/site-packages" ]
[tool.coverage.report]
exclude_also = [
"if TYPE_CHECKING:",
]
[tool.pytest.ini_options]
addopts = [
"--import-mode=importlib",
"--strict-markers",
"--doctest-modules",
"--pyargs",
"-ptesting.anndata._pytest",
"--dist=loadgroup",
]
filterwarnings = [
"ignore::anndata._warnings.OldFormatWarning",
"ignore::anndata._warnings.ExperimentalFeatureWarning",
]
# When `--strict-warnings` is used, all warnings are treated as errors, except those:
filterwarnings_when_strict = [
"default::anndata._warnings.ImplicitModificationWarning",
"default:Transforming to str index:UserWarning",
"default:(Observation|Variable) names are not unique. To make them unique:UserWarning",
"default::scipy.sparse.SparseEfficiencyWarning",
"default::dask.array.core.PerformanceWarning",
"default:anndata will no longer support zarr v2:DeprecationWarning",
"default:The codec `vlen-utf8:UserWarning",
"default:The dtype `StringDType():UserWarning",
"default:Consolidated metadata is:UserWarning",
]
python_files = "test_*.py"
testpaths = [
"anndata", # docstrings (module name due to --pyargs)
"./tests", # unit tests
"./ci/scripts", # CI script tests
"./docs/concatenation.rst", # further doctests
]
# For some reason this effects how logging is shown when tests are run
xfail_strict = true
markers = [ "gpu: mark test to run on GPU" ]
[tool.ruff]
src = [ "src" ]
[tool.ruff.format]
docstring-code-format = true
[tool.ruff.lint]
select = [
"E", # Error detected by Pycodestyle
"EM", # Traceback-friendly error messages
"F", # Errors detected by Pyflakes
"FBT", # Boolean positional arguments
"W", # Warning detected by Pycodestyle
"PLW", # Pylint
"UP", # pyupgrade
"I", # isort
"TC", # manage type checking blocks
"TID", # Banned imports
"ICN", # Follow import conventions
"PTH", # Pathlib instead of os.path
"PT", # Pytest conventions
"PYI", # Typing
]
ignore = [
# line too long -> we accept long comment lines; formatter gets rid of long code lines
"E501",
# Do not assign a lambda expression, use a def -> AnnData allows lambda expression assignments,
"E731",
# allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation
"E741",
# We use relative imports from parent modules
"TID252",
# Shadowing loop variables isn’t a big deal
"PLW2901",
]
[tool.ruff.lint.per-file-ignores]
# E721 comparing types, but we specifically are checking that we aren't getting subtypes (views)
"tests/test_readwrite.py" = [ "E721" ]
[tool.ruff.lint.isort]
known-first-party = [ "anndata" ]
required-imports = [ "from __future__ import annotations" ]
[tool.ruff.lint.flake8-tidy-imports.banned-api]
"subprocess.call".msg = "Use `subprocess.run([…])` instead"
"subprocess.check_call".msg = "Use `subprocess.run([…], check=True)` instead"
"subprocess.check_output".msg = "Use `subprocess.run([…], check=True, capture_output=True)` instead"
"legacy_api_wrap.legacy_api".msg = "Use anndata.compat.old_positionals instead"
[tool.ruff.lint.flake8-type-checking]
exempt-modules = [ ]
strict = true
[tool.codespell]
skip = ".git,*.pdf,*.svg"
ignore-words-list = "theis,coo,homogenous"
[tool.towncrier]
package = "anndata"
directory = "docs/release-notes"
filename = "docs/release-notes/{version}.md"
single_file = false
package_dir = "src"
issue_format = "{{pr}}`{issue}`"
title_format = "(v{version})=\n### {version} {{small}}`{project_date}`"
fragment.bugfix.name = "Bug fixes"
fragment.doc.name = "Documentation"
fragment.feature.name = "Features"
fragment.misc.name = "Miscellaneous improvements"
fragment.performance.name = "Performance"
fragment.breaking.name = "Breaking changes"
fragment.dev.name = "Development Process"
python-anndata-0.12.0~rc1/src/ 0000775 0000000 0000000 00000000000 15003706322 0016110 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/ 0000775 0000000 0000000 00000000000 15003706322 0017516 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/__init__.py 0000664 0000000 0000000 00000003037 15003706322 0021632 0 ustar 00root root 0000000 0000000 """Annotated multivariate observation data."""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import Any
from ._core.anndata import AnnData
from ._core.extensions import register_anndata_namespace
from ._core.merge import concat
from ._core.raw import Raw
from ._settings import settings
from ._version import __version__
from ._warnings import (
ExperimentalFeatureWarning,
ImplicitModificationWarning,
OldFormatWarning,
WriteWarning,
)
from .io import read_h5ad, read_zarr
from .utils import module_get_attr_redirect
# Submodules need to be imported last
from . import abc, experimental, typing, io, types # noqa: E402 isort: skip
# We use these in tests by attribute access
from . import logging # noqa: F401, E402 isort: skip
_DEPRECATED_IO = (
"read_loom",
"read_hdf",
"read_excel",
"read_umi_tools",
"read_csv",
"read_text",
"read_mtx",
)
_DEPRECATED = dict((method, f"io.{method}") for method in _DEPRECATED_IO)
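# Redirect access to deprecated top-level functions (e.g. `anndata.read_loom`)
# to their new home in `anndata.io`, emitting a warning on use.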
def __getattr__(attr_name: str) -> Any:
return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED)
__all__ = [
# Attributes
"__version__",
"settings",
# Submodules
"abc",
"experimental",
"typing",
"types",
"io",
# Classes
"AnnData",
"Raw",
# Functions
"concat",
"read_zarr",
"read_h5ad",
"register_anndata_namespace",
# Warnings
"OldFormatWarning",
"WriteWarning",
"ImplicitModificationWarning",
"ExperimentalFeatureWarning",
]
python-anndata-0.12.0~rc1/src/anndata/_core/ 0000775 0000000 0000000 00000000000 15003706322 0020605 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/_core/__init__.py 0000664 0000000 0000000 00000000000 15003706322 0022704 0 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/_core/access.py 0000664 0000000 0000000 00000001560 15003706322 0022422 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from functools import reduce
from typing import TYPE_CHECKING, NamedTuple
if TYPE_CHECKING:
from anndata import AnnData
class ElementRef(NamedTuple):
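    """A reference to an element of an AnnData object (e.g. ``adata.obsm["X_pca"]``), given by attribute name and keys."""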
parent: AnnData
attrname: str
keys: tuple[str, ...] = ()
def __str__(self) -> str:
return f".{self.attrname}" + "".join(map(lambda x: f"['{x}']", self.keys))
@property
def _parent_el(self):
return reduce(
lambda d, k: d[k], self.keys[:-1], getattr(self.parent, self.attrname)
)
def get(self):
"""Get referenced value in self.parent."""
return reduce(lambda d, k: d[k], self.keys, getattr(self.parent, self.attrname))
def set(self, val):
"""Set referenced value in self.parent."""
self._parent_el[self.keys[-1]] = val
def delete(self):
del self._parent_el[self.keys[-1]]
python-anndata-0.12.0~rc1/src/anndata/_core/aligned_df.py 0000664 0000000 0000000 00000006277 15003706322 0023247 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import warnings
from collections.abc import Mapping
from functools import singledispatch
from typing import TYPE_CHECKING
import pandas as pd
from pandas.api.types import is_string_dtype
from .._warnings import ImplicitModificationWarning
if TYPE_CHECKING:
from collections.abc import Iterable
from typing import Any, Literal
@singledispatch
def _gen_dataframe(
anno: Any,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
) -> pd.DataFrame: # pragma: no cover
msg = f"Cannot convert {type(anno)} to {attr} DataFrame"
raise ValueError(msg)
@_gen_dataframe.register(Mapping)
@_gen_dataframe.register(type(None))
def _gen_dataframe_mapping(
anno: Mapping[str, Any] | None,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
) -> pd.DataFrame:
if anno is None or len(anno) == 0:
anno = {}
def mk_index(l: int) -> pd.Index:
return pd.RangeIndex(0, l, name=None).astype(str)
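    # Use the first entry of `index_names` present in `anno` as the index column;
    # otherwise fall back to a string RangeIndex (of `length`, if given).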
for index_name in index_names:
if index_name not in anno:
continue
df = pd.DataFrame(
anno,
index=anno[index_name],
columns=[k for k in anno.keys() if k != index_name],
)
break
else:
df = pd.DataFrame(
anno,
index=None if length is None else mk_index(length),
columns=None if len(anno) else [],
)
if length is None:
df.index = mk_index(len(df))
elif length != len(df):
raise _mk_df_error(source, attr, length, len(df))
return df
@_gen_dataframe.register(pd.DataFrame)
def _gen_dataframe_df(
anno: pd.DataFrame,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
if length is not None and length != len(anno):
raise _mk_df_error(source, attr, length, len(anno))
anno = anno.copy(deep=False)
if not is_string_dtype(anno.index):
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
anno.index = anno.index.astype(str)
if not len(anno.columns):
anno.columns = anno.columns.astype(str)
return anno
@_gen_dataframe.register(pd.Series)
@_gen_dataframe.register(pd.Index)
def _gen_dataframe_1d(
anno: pd.Series | pd.Index,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
msg = f"Cannot convert {type(anno)} to {attr} DataFrame"
raise ValueError(msg)
def _mk_df_error(
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
expected: int,
actual: int,
):
    what = "row" if attr == "obs" else "column"
    if source == "X":
        msg = (
            f"Observations annot. `{attr}` must have as many rows as `X` has {what}s "
            f"({expected}), but has {actual} rows."
        )
    else:
        msg = (
            f"`shape` is inconsistent with `{attr}` "
            f"({actual} {what}s instead of {expected})"
        )
    return ValueError(msg)
python-anndata-0.12.0~rc1/src/anndata/_core/aligned_mapping.py 0000664 0000000 0000000 00000033037 15003706322 0024303 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import warnings
from abc import ABC, abstractmethod
from collections.abc import MutableMapping, Sequence
from copy import copy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Generic, TypeVar
import numpy as np
import pandas as pd
from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning
from ..compat import AwkArray, CSArray, CSMatrix
from ..utils import (
axis_len,
convert_to_dict,
deprecated,
raise_value_error_if_multiindex_columns,
warn_once,
)
from .access import ElementRef
from .index import _subset
from .storage import coerce_array
from .views import as_view, view_update
if TYPE_CHECKING:
from collections.abc import Callable, Iterable, Iterator, Mapping
from typing import ClassVar, Literal, Self
from .anndata import AnnData
from .raw import Raw
OneDIdx = Sequence[int] | Sequence[bool] | slice
TwoDIdx = tuple[OneDIdx, OneDIdx]
# TODO: pd.DataFrame only allowed in AxisArrays?
Value = pd.DataFrame | CSMatrix | CSArray | np.ndarray
P = TypeVar("P", bound="AlignedMappingBase")
"""Parent mapping an AlignedView is based on."""
I = TypeVar("I", OneDIdx, TwoDIdx)
class AlignedMappingBase(MutableMapping[str, Value], ABC):
"""\
An abstract base class for Mappings containing array-like values aligned
to either one or both AnnData axes.
"""
_allow_df: ClassVar[bool]
"""If this mapping supports heterogeneous DataFrames"""
_view_class: ClassVar[type[AlignedView]]
"""The view class for this aligned mapping."""
_actual_class: ClassVar[type[AlignedActual]]
"""The actual class (which has it’s own data) for this aligned mapping."""
_parent: AnnData | Raw
"""The parent object that this mapping is aligned to."""
def __repr__(self):
return f"{type(self).__name__} with keys: {', '.join(self.keys())}"
def _ipython_key_completions_(self) -> list[str]:
return list(self.keys())
def _validate_value(self, val: Value, key: str) -> Value:
"""Raises an error if value is invalid"""
if isinstance(val, AwkArray):
warn_once(
"Support for Awkward Arrays is currently experimental. "
"Behavior may change in the future. Please report any issues you may encounter!",
ExperimentalFeatureWarning,
# stacklevel=3,
)
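        # Each axis this mapping is aligned to must match the parent's length along that axis.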
for i, axis in enumerate(self.axes):
if self.parent.shape[axis] == axis_len(val, i):
continue
right_shape = tuple(self.parent.shape[a] for a in self.axes)
actual_shape = tuple(axis_len(val, a) for a, _ in enumerate(self.axes))
if actual_shape[i] is None and isinstance(val, AwkArray):
dim = ("obs", "var")[i]
                msg = (
                    f"The AwkwardArray is of variable length in dimension {dim}. "
                    f"Try ak.to_regular(array, {i}) before including the array in AnnData"
                )
else:
dims = tuple(("obs", "var")[ax] for ax in self.axes)
msg = (
f"Value passed for key {key!r} is of incorrect shape. "
f"Values of {self.attrname} must match dimensions {dims} of parent. "
f"Value had shape {actual_shape} while it should have had {right_shape}."
)
raise ValueError(msg)
name = f"{self.attrname.title().rstrip('s')} {key!r}"
return coerce_array(val, name=name, allow_df=self._allow_df)
@property
@abstractmethod
def attrname(self) -> str:
"""What attr for the AnnData is this?"""
@property
@abstractmethod
def axes(self) -> tuple[Literal[0, 1], ...]:
"""Which axes of the parent is this aligned to?"""
@property
@abstractmethod
def is_view(self) -> bool: ...
@property
def parent(self) -> AnnData | Raw:
return self._parent
def copy(self) -> dict[str, Value]:
# Shallow copy for awkward array since their buffers are immutable
return {
k: copy(v) if isinstance(v, AwkArray) else v.copy() for k, v in self.items()
}
def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[Self, I]:
"""Returns a subset copy-on-write view of the object."""
return self._view_class(self, parent, subset_idx)
@deprecated("dict(obj)")
def as_dict(self) -> dict:
return dict(self)
class AlignedView(AlignedMappingBase, Generic[P, I]):
is_view: ClassVar[Literal[True]] = True
# override docstring
parent: AnnData
"""Reference to parent AnnData view"""
attrname: str
"""What attribute in the parent is this?"""
parent_mapping: P
"""The object this is a view of."""
subset_idx: I
"""The subset of the parent to view."""
def __init__(self, parent_mapping: P, parent_view: AnnData, subset_idx: I):
self.parent_mapping = parent_mapping
self._parent = parent_view
self.subset_idx = subset_idx
if hasattr(parent_mapping, "_axis"):
# LayersBase has no _axis, the rest does
self._axis = parent_mapping._axis # type: ignore
def __getitem__(self, key: str) -> Value:
return as_view(
_subset(self.parent_mapping[key], self.subset_idx),
ElementRef(self.parent, self.attrname, (key,)),
)
def __setitem__(self, key: str, value: Value) -> None:
value = self._validate_value(value, key) # Validate before mutating
warnings.warn(
f"Setting element `.{self.attrname}['{key}']` of view, "
"initializing view as actual.",
ImplicitModificationWarning,
stacklevel=2,
)
with view_update(self.parent, self.attrname, ()) as new_mapping:
new_mapping[key] = value
def __delitem__(self, key: str) -> None:
if key not in self:
msg = f"{key!r} not found in view of {self.attrname}"
raise KeyError(msg) # Make sure it exists before bothering with a copy
warnings.warn(
f"Removing element `.{self.attrname}['{key}']` of view, "
"initializing view as actual.",
ImplicitModificationWarning,
stacklevel=2,
)
with view_update(self.parent, self.attrname, ()) as new_mapping:
del new_mapping[key]
def __contains__(self, key: str) -> bool:
return key in self.parent_mapping
def __iter__(self) -> Iterator[str]:
return iter(self.parent_mapping)
def __len__(self) -> int:
return len(self.parent_mapping)
class AlignedActual(AlignedMappingBase):
is_view: ClassVar[Literal[False]] = False
_data: MutableMapping[str, Value]
"""Underlying mapping to the data"""
def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[str, Value]):
self._parent = parent
self._data = store
for k, v in self._data.items():
self._data[k] = self._validate_value(v, k)
def __getitem__(self, key: str) -> Value:
return self._data[key]
def __setitem__(self, key: str, value: Value):
value = self._validate_value(value, key)
self._data[key] = value
def __contains__(self, key: str) -> bool:
return key in self._data
def __delitem__(self, key: str):
del self._data[key]
def __iter__(self) -> Iterator[str]:
return iter(self._data)
def __len__(self) -> int:
return len(self._data)
class AxisArraysBase(AlignedMappingBase):
"""\
Mapping of key→array-like,
where array-like is aligned to an axis of parent AnnData.
"""
_allow_df: ClassVar = True
_dimnames: ClassVar = ("obs", "var")
_axis: Literal[0, 1]
@property
def attrname(self) -> str:
return f"{self.dim}m"
@property
def axes(self) -> tuple[Literal[0, 1]]:
"""Axes of the parent this is aligned to"""
return (self._axis,)
@property
def dim(self) -> str:
"""Name of the dimension this aligned to."""
return self._dimnames[self._axis]
def to_df(self) -> pd.DataFrame:
"""Convert to pandas dataframe."""
df = pd.DataFrame(index=self.dim_names)
for key in self.keys():
value = self[key]
for icolumn, column in enumerate(value.T):
df[f"{key}{icolumn + 1}"] = column
return df
def _validate_value(self, val: Value, key: str) -> Value:
if isinstance(val, pd.DataFrame):
raise_value_error_if_multiindex_columns(val, f"{self.attrname}[{key!r}]")
if not val.index.equals(self.dim_names):
# Could probably also re-order index if it’s contained
try:
pd.testing.assert_index_equal(val.index, self.dim_names)
except AssertionError as e:
msg = f"value.index does not match parent’s {self.dim} names:\n{e}"
raise ValueError(msg) from None
else:
msg = "Index.equals and pd.testing.assert_index_equal disagree"
raise AssertionError(msg)
return super()._validate_value(val, key)
@property
def dim_names(self) -> pd.Index:
return (self.parent.obs_names, self.parent.var_names)[self._axis]
class AxisArrays(AlignedActual, AxisArraysBase):
def __init__(
self,
parent: AnnData | Raw,
*,
axis: Literal[0, 1],
store: MutableMapping[str, Value] | AxisArraysBase,
):
if axis not in {0, 1}:
raise ValueError()
self._axis = axis
super().__init__(parent, store=store)
class AxisArraysView(AlignedView[AxisArraysBase, OneDIdx], AxisArraysBase):
pass
AxisArraysBase._view_class = AxisArraysView
AxisArraysBase._actual_class = AxisArrays
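# A minimal illustrative sketch (not part of the library): how AxisArrays
# validation surfaces through the public `adata.obsm`/`adata.varm` mappings.
# The names `adata`, "X_pca" and "bad" below are assumptions for illustration only.
#
#   >>> import numpy as np
#   >>> from anndata import AnnData
#   >>> adata = AnnData(np.zeros((3, 2)))
#   >>> adata.obsm["X_pca"] = np.zeros((3, 5))  # first axis matches n_obs: accepted
#   >>> adata.obsm["bad"] = np.zeros((4, 5))    # wrong length along obs
#   Traceback (most recent call last):
#       ...
#   ValueError: Value passed for key 'bad' is of incorrect shape. ...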
class LayersBase(AlignedMappingBase):
"""\
Mapping of key: array-like, where array-like is aligned to both axes of the
parent anndata.
"""
_allow_df: ClassVar = False
attrname: ClassVar[Literal["layers"]] = "layers"
axes: ClassVar[tuple[Literal[0], Literal[1]]] = (0, 1)
class Layers(AlignedActual, LayersBase):
pass
class LayersView(AlignedView[LayersBase, TwoDIdx], LayersBase):
pass
LayersBase._view_class = LayersView
LayersBase._actual_class = Layers
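# A minimal illustrative sketch (not part of the library): layer values must
# match the full (n_obs, n_vars) shape of the parent, unlike obsm/varm which
# align to a single axis. Names below are assumptions for illustration only.
#
#   >>> import numpy as np
#   >>> from anndata import AnnData
#   >>> adata = AnnData(np.zeros((3, 2)))
#   >>> adata.layers["counts"] = np.ones((3, 2))  # same shape as X: accepted
#   >>> adata.layers["bad"] = np.ones((3, 3))     # shape mismatch raises ValueError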
class PairwiseArraysBase(AlignedMappingBase):
"""\
Mapping of key: array-like, where both axes of array-like are aligned to
one axis of the parent anndata.
"""
_allow_df: ClassVar = False
_dimnames: ClassVar = ("obs", "var")
_axis: Literal[0, 1]
@property
def attrname(self) -> str:
return f"{self.dim}p"
@property
def axes(self) -> tuple[Literal[0], Literal[0]] | tuple[Literal[1], Literal[1]]:
"""Axes of the parent this is aligned to"""
return self._axis, self._axis # type: ignore
@property
def dim(self) -> str:
"""Name of the dimension this aligned to."""
return self._dimnames[self._axis]
class PairwiseArrays(AlignedActual, PairwiseArraysBase):
def __init__(
self,
parent: AnnData,
*,
axis: Literal[0, 1],
store: MutableMapping[str, Value],
):
if axis not in {0, 1}:
raise ValueError()
self._axis = axis
super().__init__(parent, store=store)
class PairwiseArraysView(AlignedView[PairwiseArraysBase, OneDIdx], PairwiseArraysBase):
pass
PairwiseArraysBase._view_class = PairwiseArraysView
PairwiseArraysBase._actual_class = PairwiseArrays
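# A minimal illustrative sketch (not part of the library): pairwise arrays in
# `adata.obsp`/`adata.varp` have both of their first two dimensions aligned to
# one axis of the parent. Names below are assumptions for illustration only.
#
#   >>> import numpy as np
#   >>> from anndata import AnnData
#   >>> adata = AnnData(np.zeros((3, 2)))
#   >>> adata.obsp["distances"] = np.zeros((3, 3))     # n_obs × n_obs: accepted
#   >>> adata.varp["correlations"] = np.zeros((2, 2))  # n_vars × n_vars: accepted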
AlignedMapping = (
AxisArrays
| AxisArraysView
| Layers
| LayersView
| PairwiseArrays
| PairwiseArraysView
)
T = TypeVar("T", bound=AlignedMapping)
"""Pair of types to be aligned."""
@dataclass
class AlignedMappingProperty(property, Generic[T]):
"""A :class:`property` that creates an ephemeral AlignedMapping.
The actual data is stored as `f'_{self.name}'` in the parent object.
"""
name: str
"""Name of the attribute in the parent object."""
cls: type[T]
"""Concrete type that will be constructed."""
axis: Literal[0, 1] | None = None
"""Axis of the parent to align to."""
def construct(self, obj: AnnData, *, store: MutableMapping[str, Value]) -> T:
if self.axis is None:
return self.cls(obj, store=store)
return self.cls(obj, axis=self.axis, store=store)
@property
def fget(self) -> Callable[[], None]:
"""Fake fget for sphinx-autodoc-typehints."""
def fake(): ...
fake.__annotations__ = {"return": self.cls._actual_class | self.cls._view_class}
return fake
def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T:
if obj is None:
# When accessed from the class, e.g. via `AnnData.obs`,
# this needs to return a `property` instance, e.g. for Sphinx
return self # type: ignore
if not obj.is_view:
return self.construct(obj, store=getattr(obj, f"_{self.name}"))
parent_anndata = obj._adata_ref
idxs = (obj._oidx, obj._vidx)
parent: AlignedMapping = getattr(parent_anndata, self.name)
return parent._view(obj, tuple(idxs[ax] for ax in parent.axes))
def __set__(
self, obj: AnnData, value: Mapping[str, Value] | Iterable[tuple[str, Value]]
) -> None:
value = convert_to_dict(value)
_ = self.construct(obj, store=value) # Validate
if obj.is_view:
obj._init_as_actual(obj.copy())
setattr(obj, f"_{self.name}", value)
def __delete__(self, obj) -> None:
setattr(obj, self.name, dict())
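# A minimal illustrative sketch (not part of the library): assigning to an
# aligned mapping of a view triggers the copy-on-write path implemented in
# `AlignedView.__setitem__` above, turning the view into an actual AnnData and
# emitting an ImplicitModificationWarning. Names below are illustrative assumptions.
#
#   >>> import numpy as np
#   >>> from anndata import AnnData
#   >>> adata = AnnData(np.zeros((4, 2)))
#   >>> view = adata[:2]
#   >>> view.is_view
#   True
#   >>> view.obsm["X_umap"] = np.zeros((2, 2))  # warns and copies the parent data
#   >>> view.is_view
#   False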
python-anndata-0.12.0~rc1/src/anndata/_core/anndata.py 0000664 0000000 0000000 00000227103 15003706322 0022572 0 ustar 00root root 0000000 0000000 """\
Main class and helper functions.
"""
from __future__ import annotations
import warnings
from collections import OrderedDict
from collections.abc import Mapping, MutableMapping, Sequence
from copy import copy, deepcopy
from functools import partial, singledispatch
from pathlib import Path
from textwrap import dedent
from typing import TYPE_CHECKING, cast
import h5py
import numpy as np
import pandas as pd
from natsort import natsorted
from numpy import ma
from pandas.api.types import infer_dtype
from scipy import sparse
from scipy.sparse import issparse
from anndata._warnings import ImplicitModificationWarning
from .. import utils
from .._settings import settings
from ..compat import CSArray, DaskArray, ZarrArray, _move_adj_mtx, old_positionals
from ..logging import anndata_logger as logger
from ..utils import (
axis_len,
deprecated,
ensure_df_homogeneous,
raise_value_error_if_multiindex_columns,
)
from .access import ElementRef
from .aligned_df import _gen_dataframe
from .aligned_mapping import AlignedMappingProperty, AxisArrays, Layers, PairwiseArrays
from .file_backing import AnnDataFileManager, to_memory
from .index import _normalize_indices, _subset, get_vector
from .raw import Raw
from .sparse_dataset import BaseCompressedSparseDataset, sparse_dataset
from .storage import coerce_array
from .views import (
DictView,
_resolve_idxs,
as_view,
)
if TYPE_CHECKING:
from collections.abc import Iterable
from os import PathLike
from typing import Any, ClassVar, Literal
from zarr.storage import StoreLike
from ..compat import Index1D
from ..typing import XDataType
from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView
from .index import Index
class AnnData(metaclass=utils.DeprecationMixinMeta):
"""\
An annotated data matrix.
.. figure:: ../_static/img/anndata_schema.svg
:width: 260px
:align: right
:class: dark-light
:class:`~anndata.AnnData` stores a data matrix :attr:`X` together with annotations
of observations :attr:`obs` (:attr:`obsm`, :attr:`obsp`),
variables :attr:`var` (:attr:`varm`, :attr:`varp`),
and unstructured annotations :attr:`uns`.
An :class:`~anndata.AnnData` object `adata` can be sliced like a
:class:`~pandas.DataFrame`,
for instance `adata_subset = adata[:, list_of_variable_names]`.
:class:`~anndata.AnnData`’s basic structure is similar to R’s ExpressionSet
[Huber15]_. If setting an `.h5ad`-formatted HDF5 backing file `.filename`,
data remains on the disk but is automatically loaded into memory if needed.
Parameters
----------
X
A #observations × #variables data matrix. A view of the data is used if the
data type matches, otherwise, a copy is made.
obs
Key-indexed one-dimensional observations annotation of length #observations.
var
Key-indexed one-dimensional variables annotation of length #variables.
uns
Key-indexed unstructured annotation.
obsm
Key-indexed multi-dimensional observations annotation of length #observations.
If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype.
varm
Key-indexed multi-dimensional variables annotation of length #variables.
If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype.
layers
Key-indexed multi-dimensional arrays aligned to dimensions of `X`.
shape
Shape tuple (#observations, #variables). Can only be provided if `X` is `None`.
filename
Name of backing file. See :class:`h5py.File`.
filemode
Open mode of backing file. See :class:`h5py.File`.
See Also
--------
io.read_h5ad
io.read_csv
io.read_excel
io.read_hdf
io.read_loom
io.read_zarr
io.read_mtx
io.read_text
io.read_umi_tools
Notes
-----
:class:`~anndata.AnnData` stores observations (samples) of variables/features
in the rows of a matrix.
This is the convention of the modern classics of statistics [Hastie09]_
and machine learning [Murphy12]_,
the convention of dataframes both in R and Python and the established statistics
and machine learning packages in Python (statsmodels_, scikit-learn_).
Single dimensional annotations of the observation and variables are stored
in the :attr:`obs` and :attr:`var` attributes as :class:`~pandas.DataFrame`\\ s.
This is intended for metrics calculated over their axes.
Multi-dimensional annotations are stored in :attr:`obsm` and :attr:`varm`,
which are aligned to the objects observation and variable dimensions respectively.
Square matrices representing graphs are stored in :attr:`obsp` and :attr:`varp`,
with both of their own dimensions aligned to their associated axis.
Additional measurements across both observations and variables are stored in
:attr:`layers`.
Indexing into an AnnData object can be performed by relative position
with numeric indices (like pandas’ :meth:`~pandas.DataFrame.iloc`),
or by labels (like :meth:`~pandas.DataFrame.loc`).
To avoid ambiguity with numeric indexing into observations or variables,
indexes of the AnnData object are converted to strings by the constructor.
Subsetting an AnnData object by indexing into it will also subset its elements
according to the dimensions they were aligned to.
This means an operation like `adata[list_of_obs, :]` will also subset :attr:`obs`,
:attr:`obsm`, and :attr:`layers`.
Subsetting an AnnData object returns a view into the original object,
meaning very little additional memory is used upon subsetting.
This is achieved lazily, meaning that the constituent arrays are subset on access.
Copying a view causes an equivalent “real” AnnData object to be generated.
Attempting to modify a view (at any attribute except X) is handled
in a copy-on-modify manner, meaning the object is initialized in place.
Here’s an example::
batch1 = adata[adata.obs["batch"] == "batch1", :]
batch1.obs["value"] = 0 # This makes batch1 a “real” AnnData object
At the end of this snippet: `adata` was not modified,
and `batch1` is its own AnnData object with its own data.
Similar to Bioconductor’s `ExpressionSet` and :mod:`scipy.sparse` matrices,
subsetting an AnnData object retains the dimensionality of its constituent arrays.
Therefore, unlike with the classes exposed by :mod:`pandas`, :mod:`numpy`,
and `xarray`, there is no concept of a one dimensional AnnData object.
AnnDatas always have two inherent dimensions, :attr:`obs` and :attr:`var`.
Additionally, maintaining the dimensionality of the AnnData object allows for
consistent handling of :mod:`scipy.sparse` matrices and :mod:`numpy` arrays.
.. _statsmodels: http://www.statsmodels.org/stable/index.html
.. _scikit-learn: http://scikit-learn.org/
"""
_BACKED_ATTRS = ["X", "raw.X"]
# backwards compat
_H5_ALIASES = dict(
X={"X", "_X", "data", "_data"},
obs={"obs", "_obs", "smp", "_smp"},
var={"var", "_var"},
uns={"uns"},
obsm={"obsm", "_obsm", "smpm", "_smpm"},
varm={"varm", "_varm"},
layers={"layers", "_layers"},
)
_H5_ALIASES_NAMES = dict(
obs={"obs_names", "smp_names", "row_names", "index"},
var={"var_names", "col_names", "index"},
)
_accessors: ClassVar[set[str]] = set()
@old_positionals(
"obsm",
"varm",
"layers",
"raw",
"dtype",
"shape",
"filename",
"filemode",
"asview",
)
def __init__(
self,
X: XDataType | pd.DataFrame | None = None,
obs: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None,
var: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None,
uns: Mapping[str, Any] | None = None,
*,
obsm: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
varm: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
layers: Mapping[str, XDataType] | None = None,
raw: Mapping[str, Any] | None = None,
dtype: np.dtype | type | str | None = None,
shape: tuple[int, int] | None = None,
filename: PathLike[str] | str | None = None,
filemode: Literal["r", "r+"] | None = None,
asview: bool = False,
obsp: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
varp: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
oidx: Index1D | None = None,
vidx: Index1D | None = None,
):
# check for any multi-indices that aren’t later checked in coerce_array
for attr, key in [(obs, "obs"), (var, "var"), (X, "X")]:
if isinstance(attr, pd.DataFrame):
raise_value_error_if_multiindex_columns(attr, key)
if asview:
if not isinstance(X, AnnData):
msg = "`X` has to be an AnnData object."
raise ValueError(msg)
self._init_as_view(X, oidx, vidx)
else:
self._init_as_actual(
X=X,
obs=obs,
var=var,
uns=uns,
obsm=obsm,
varm=varm,
raw=raw,
layers=layers,
dtype=dtype,
shape=shape,
obsp=obsp,
varp=varp,
filename=filename,
filemode=filemode,
)
def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index):
if adata_ref.isbacked and adata_ref.is_view:
msg = (
"Currently, you cannot index repeatedly into a backed AnnData, "
"that is, you cannot make a view of a view."
)
raise ValueError(msg)
self._is_view = True
if isinstance(oidx, int | np.integer):
if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs):
msg = f"Observation index `{oidx}` is out of range."
raise IndexError(msg)
oidx += adata_ref.n_obs * (oidx < 0)
oidx = slice(oidx, oidx + 1, 1)
if isinstance(vidx, int | np.integer):
if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars):
msg = f"Variable index `{vidx}` is out of range."
raise IndexError(msg)
vidx += adata_ref.n_vars * (vidx < 0)
vidx = slice(vidx, vidx + 1, 1)
if adata_ref.is_view:
prev_oidx, prev_vidx = adata_ref._oidx, adata_ref._vidx
adata_ref = adata_ref._adata_ref
oidx, vidx = _resolve_idxs((prev_oidx, prev_vidx), (oidx, vidx), adata_ref)
# self._adata_ref is never a view
self._adata_ref = adata_ref
self._oidx = oidx
self._vidx = vidx
# the file is the same as of the reference object
self.file = adata_ref.file
# views on attributes of adata_ref
obs_sub = adata_ref.obs.iloc[oidx]
var_sub = adata_ref.var.iloc[vidx]
# fix categories
uns = copy(adata_ref._uns)
if settings.remove_unused_categories:
self._remove_unused_categories(adata_ref.obs, obs_sub, uns)
self._remove_unused_categories(adata_ref.var, var_sub, uns)
# set attributes
self._obs = as_view(obs_sub, view_args=(self, "obs"))
self._var = as_view(var_sub, view_args=(self, "var"))
self._uns = uns
# set data
if self.isbacked:
self._X = None
# set raw, easy, as it’s immutable anyways...
if adata_ref._raw is not None:
# slicing along variables axis is ignored
self._raw = adata_ref.raw[oidx]
self._raw._adata = self
else:
self._raw = None
def _init_as_actual(
self,
X=None,
obs=None,
var=None,
uns=None,
obsm=None,
varm=None,
varp=None,
obsp=None,
raw=None,
layers=None,
dtype=None,
shape=None,
filename=None,
filemode=None,
):
# view attributes
self._is_view = False
self._adata_ref = None
self._oidx = None
self._vidx = None
# ----------------------------------------------------------------------
# various ways of initializing the data
# ----------------------------------------------------------------------
# If X is a data frame, we store its indices for verification
x_indices = []
# init from file
if filename is not None:
self.file = AnnDataFileManager(self, filename, filemode)
else:
self.file = AnnDataFileManager(self, None)
# init from AnnData
if isinstance(X, AnnData):
if any((obs, var, uns, obsm, varm, obsp, varp)):
msg = "If `X` is a dict no further arguments must be provided."
raise ValueError(msg)
X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = (
X._X,
X.obs,
X.var,
X.uns,
X.obsm,
X.varm,
X.obsp,
X.varp,
X.layers,
X.raw,
)
# init from DataFrame
elif isinstance(X, pd.DataFrame):
# to verify index matching, we wait until obs and var are DataFrames
if obs is None:
obs = pd.DataFrame(index=X.index)
elif not isinstance(X.index, pd.RangeIndex):
x_indices.append(("obs", "index", X.index.astype(str)))
if var is None:
var = pd.DataFrame(index=X.columns)
elif not isinstance(X.columns, pd.RangeIndex):
x_indices.append(("var", "columns", X.columns.astype(str)))
X = ensure_df_homogeneous(X, "X")
# ----------------------------------------------------------------------
# actually process the data
# ----------------------------------------------------------------------
# check data type of X
if X is not None:
X = coerce_array(X, name="X")
if shape is not None:
msg = "`shape` needs to be `None` if `X` is not `None`."
raise ValueError(msg)
_check_2d_shape(X)
# if type doesn’t match, a copy is made, otherwise, use a view
if dtype is not None:
warnings.warn(
"The dtype argument is deprecated and will be removed in late 2024.",
FutureWarning,
)
if issparse(X) or isinstance(X, ma.MaskedArray):
# TODO: maybe use view on data attribute of sparse matrix
# as in readwrite.read_10x_h5
if X.dtype != np.dtype(dtype):
X = X.astype(dtype)
elif isinstance(X, ZarrArray | DaskArray):
X = X.astype(dtype)
else: # is np.ndarray or a subclass, convert to true np.ndarray
X = np.asarray(X, dtype)
# data matrix and shape
self._X = X
n_obs, n_vars = X.shape
source = "X"
else:
self._X = None
n_obs, n_vars = (
shape
if shape is not None
else _infer_shape(obs, var, obsm, varm, layers, obsp, varp)
)
source = "shape"
# annotations
self._obs = _gen_dataframe(
obs, ["obs_names", "row_names"], source=source, attr="obs", length=n_obs
)
self._var = _gen_dataframe(
var, ["var_names", "col_names"], source=source, attr="var", length=n_vars
)
# now we can verify if indices match!
for attr_name, x_name, idx in x_indices:
attr = getattr(self, attr_name)
if isinstance(attr.index, pd.RangeIndex):
attr.index = idx
elif not idx.equals(attr.index):
msg = f"Index of {attr_name} must match {x_name} of X."
raise ValueError(msg)
# unstructured annotations
self.uns = uns or OrderedDict()
self.obsm = obsm
self.varm = varm
self.obsp = obsp
self.varp = varp
# Backwards compat for connectivities matrices in uns["neighbors"]
_move_adj_mtx({"uns": self._uns, "obsp": self._obsp})
self._check_dimensions()
if settings.check_uniqueness:
self._check_uniqueness()
if self.filename:
assert not isinstance(raw, Raw), (
"got raw from other adata but also filename?"
)
if {"raw", "raw.X"} & set(self.file):
raw = dict(X=None, **raw)
if not raw:
self._raw = None
elif isinstance(raw, Mapping):
self._raw = Raw(self, **raw)
else: # is a Raw from another AnnData
self._raw = Raw(self, raw._X, raw.var, raw.varm)
# clean up old formats
self._clean_up_old_format(uns)
# layers
self.layers = layers
@old_positionals("show_stratified", "with_disk")
def __sizeof__(
self, *, show_stratified: bool = False, with_disk: bool = False
) -> int:
def get_size(X) -> int:
def cs_to_bytes(X) -> int:
return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)
if isinstance(X, h5py.Dataset) and with_disk:
return int(np.array(X.shape).prod() * X.dtype.itemsize)
elif isinstance(X, BaseCompressedSparseDataset) and with_disk:
return cs_to_bytes(X._to_backed())
elif issparse(X):
return cs_to_bytes(X)
else:
return X.__sizeof__()
sizes = {}
attrs = ["X", "_obs", "_var"]
attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"]
for attr in attrs + attrs_multi:
if attr in attrs_multi:
keys = getattr(self, attr).keys()
s = sum(get_size(getattr(self, attr)[k]) for k in keys)
else:
s = get_size(getattr(self, attr))
if s > 0 and show_stratified:
from tqdm import tqdm
print(
f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}"
)
sizes[attr] = s
return sum(sizes.values())
def _gen_repr(self, n_obs, n_vars) -> str:
if self.isbacked:
backed_at = f" backed at {str(self.filename)!r}"
else:
backed_at = ""
descr = f"AnnData object with n_obs × n_vars = {n_obs} × {n_vars}{backed_at}"
for attr in [
"obs",
"var",
"uns",
"obsm",
"varm",
"layers",
"obsp",
"varp",
]:
keys = getattr(self, attr).keys()
if len(keys) > 0:
descr += f"\n {attr}: {str(list(keys))[1:-1]}"
return descr
def __repr__(self) -> str:
if self.is_view:
return "View of " + self._gen_repr(self.n_obs, self.n_vars)
else:
return self._gen_repr(self.n_obs, self.n_vars)
def __eq__(self, other):
"""Equality testing"""
msg = (
"Equality comparisons are not supported for AnnData objects, "
"instead compare the desired attributes."
)
raise NotImplementedError(msg)
@property
def shape(self) -> tuple[int, int]:
"""Shape of data matrix (:attr:`n_obs`, :attr:`n_vars`)."""
return self.n_obs, self.n_vars
@property
def X(self) -> XDataType | None:
"""Data matrix of shape :attr:`n_obs` × :attr:`n_vars`."""
if self.isbacked:
if not self.file.is_open:
self.file.open()
X = self.file["X"]
if isinstance(X, h5py.Group):
X = sparse_dataset(X)
# This is so that we can index into a backed dense dataset with
# indices that aren’t strictly increasing
if self.is_view:
X = _subset(X, (self._oidx, self._vidx))
elif self.is_view and self._adata_ref.X is None:
X = None
elif self.is_view:
X = as_view(
_subset(self._adata_ref.X, (self._oidx, self._vidx)),
ElementRef(self, "X"),
)
else:
X = self._X
return X
# if self.n_obs == 1 and self.n_vars == 1:
# return X[0, 0]
# elif self.n_obs == 1 or self.n_vars == 1:
# if issparse(X): X = X.toarray()
# return X.flatten()
# else:
# return X
@X.setter
def X(self, value: XDataType | None):
if value is None:
if self.isbacked:
msg = "Cannot currently remove data matrix from backed object."
raise NotImplementedError(msg)
if self.is_view:
self._init_as_actual(self.copy())
self._X = None
return
value = coerce_array(value, name="X", allow_array_like=True)
# If indices are both arrays, we need to modify them
# so we don’t set values like coordinates
# This can occur if there are successive views
if (
self.is_view
and isinstance(self._oidx, np.ndarray)
and isinstance(self._vidx, np.ndarray)
):
oidx, vidx = np.ix_(self._oidx, self._vidx)
else:
oidx, vidx = self._oidx, self._vidx
if (
np.isscalar(value)
or (hasattr(value, "shape") and (self.shape == value.shape))
or (self.n_vars == 1 and self.n_obs == len(value))
or (self.n_obs == 1 and self.n_vars == len(value))
):
if not np.isscalar(value):
if self.is_view and any(
isinstance(idx, np.ndarray)
and len(np.unique(idx)) != len(idx.ravel())
for idx in [oidx, vidx]
):
msg = (
"You are attempting to set `X` to a matrix on a view which has non-unique indices. "
"The resulting `adata.X` will likely not equal the value to which you set it. "
"To avoid this potential issue, please make a copy of the data first. "
"In the future, this operation will throw an error."
)
warnings.warn(msg, FutureWarning, stacklevel=1)
if self.shape != value.shape:
# For assigning vector of values to 2d array or matrix
# Not necessary for row of 2d array
value = value.reshape(self.shape)
if self.isbacked:
if self.is_view:
X = self.file["X"]
if isinstance(X, h5py.Group):
X = sparse_dataset(X)
X[oidx, vidx] = value
else:
self._set_backed("X", value)
else:
if self.is_view:
if sparse.issparse(self._adata_ref._X) and isinstance(
value, np.ndarray
):
if isinstance(self._adata_ref.X, CSArray):
memory_class = sparse.coo_array
else:
memory_class = sparse.coo_matrix
value = memory_class(value)
elif sparse.issparse(value) and isinstance(
self._adata_ref._X, np.ndarray
):
warnings.warn(
"Trying to set a dense array with a sparse array on a view."
"Densifying the sparse array."
"This may incur excessive memory usage",
stacklevel=2,
)
value = value.toarray()
warnings.warn(
"Modifying `X` on a view results in data being overridden",
ImplicitModificationWarning,
stacklevel=2,
)
self._adata_ref._X[oidx, vidx] = value
else:
self._X = value
else:
msg = f"Data matrix has wrong shape {value.shape}, need to be {self.shape}."
raise ValueError(msg)
@X.deleter
def X(self):
self.X = None
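    # A minimal illustrative sketch (not part of the library): `X` accepts any
    # value whose shape matches (n_obs, n_vars); assigning `None` or using
    # `del adata.X` removes the matrix. Names below are illustrative assumptions.
    #
    #   >>> import numpy as np
    #   >>> from anndata import AnnData
    #   >>> adata = AnnData(np.zeros((2, 3)))
    #   >>> adata.X = np.ones((2, 3))  # matching shape: accepted
    #   >>> del adata.X                # equivalent to `adata.X = None`
    #   >>> adata.X is None
    #   True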
layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty(
"layers", Layers
)
"""\
Dictionary-like object with values of the same dimensions as :attr:`X`.
Layers in AnnData are inspired by loompy’s :ref:`loomlayers`.
Return the layer named `"unspliced"`::
adata.layers["unspliced"]
Create or replace the `"spliced"` layer::
adata.layers["spliced"] = ...
Assign the 10th column of layer `"spliced"` to the variable a::
a = adata.layers["spliced"][:, 10]
Delete the `"spliced"` layer::
del adata.layers["spliced"]
Return layers’ names::
adata.layers.keys()
"""
@property
def raw(self) -> Raw:
"""\
Store raw version of :attr:`X` and :attr:`var` as `.raw.X` and `.raw.var`.
The :attr:`raw` attribute is initialized with the current content
of an object by setting::
adata.raw = adata.copy()
Its content can be deleted::
adata.raw = None
# or
del adata.raw
Upon slicing an AnnData object along the obs (row) axis, :attr:`raw`
is also sliced. Slicing an AnnData object along the vars (columns) axis
leaves :attr:`raw` unaffected. Note that you can call::
adata.raw[:, 'orig_variable_name'].X
to retrieve the data associated with a variable that might have been
filtered out or "compressed away" in :attr:`X`.
"""
return self._raw
@raw.setter
def raw(self, value: AnnData):
if value is None:
del self.raw
elif not isinstance(value, AnnData):
msg = "Can only init raw attribute with an AnnData object."
raise ValueError(msg)
else:
if self.is_view:
self._init_as_actual(self.copy())
self._raw = Raw(self, X=value.X, var=value.var, varm=value.varm)
@raw.deleter
def raw(self):
if self.is_view:
self._init_as_actual(self.copy())
self._raw = None
@property
def n_obs(self) -> int:
"""Number of observations."""
return len(self.obs_names)
@property
def n_vars(self) -> int:
"""Number of variables/features."""
return len(self.var_names)
def _set_dim_df(self, value: pd.DataFrame, attr: Literal["obs", "var"]):
if not isinstance(value, pd.DataFrame):
msg = f"Can only assign pd.DataFrame to {attr}."
raise ValueError(msg)
raise_value_error_if_multiindex_columns(value, attr)
value_idx = self._prep_dim_index(value.index, attr)
if self.is_view:
self._init_as_actual(self.copy())
setattr(self, f"_{attr}", value)
self._set_dim_index(value_idx, attr)
if not len(value.columns):
value.columns = value.columns.astype(str)
def _prep_dim_index(self, value, attr: str) -> pd.Index:
"""Prepares index to be uses as obs_names or var_names for AnnData object.AssertionError
If a pd.Index is passed, this will use a reference, otherwise a new index object is created.
"""
if self.shape[attr == "var"] != len(value):
msg = f"Length of passed value for {attr}_names is {len(value)}, but this AnnData has shape: {self.shape}"
raise ValueError(msg)
if isinstance(value, pd.Index) and not isinstance(value.name, str | type(None)):
msg = (
f"AnnData expects .{attr}.index.name to be a string or None, "
f"but you passed a name of type {type(value.name).__name__!r}"
)
raise ValueError(msg)
else:
value = pd.Index(value)
if not isinstance(value.name, str | type(None)):
value.name = None
if (
len(value) > 0
and not isinstance(value, pd.RangeIndex)
and infer_dtype(value) not in {"string", "bytes"}
):
sample = list(value[: min(len(value), 5)])
msg = dedent(
f"""
AnnData expects .{attr}.index to contain strings, but got values like:
{sample}
Inferred to be: {infer_dtype(value)}
"""
)
warnings.warn(msg, stacklevel=2)
return value
def _set_dim_index(self, value: pd.Index, attr: str):
# Assumes _prep_dim_index has been run
if self.is_view:
self._init_as_actual(self.copy())
getattr(self, attr).index = value
for v in getattr(self, f"_{attr}m").values():
if isinstance(v, pd.DataFrame):
v.index = value
@property
def obs(self) -> pd.DataFrame:
"""One-dimensional annotation of observations (`pd.DataFrame`)."""
return self._obs
@obs.setter
def obs(self, value: pd.DataFrame):
self._set_dim_df(value, "obs")
@obs.deleter
def obs(self):
self.obs = pd.DataFrame({}, index=self.obs_names)
@property
def obs_names(self) -> pd.Index:
"""Names of observations (alias for `.obs.index`)."""
return self.obs.index
@obs_names.setter
def obs_names(self, names: Sequence[str]):
names = self._prep_dim_index(names, "obs")
self._set_dim_index(names, "obs")
@property
def var(self) -> pd.DataFrame:
"""One-dimensional annotation of variables/ features (`pd.DataFrame`)."""
return self._var
@var.setter
def var(self, value: pd.DataFrame):
self._set_dim_df(value, "var")
@var.deleter
def var(self):
self.var = pd.DataFrame({}, index=self.var_names)
@property
def var_names(self) -> pd.Index:
"""Names of variables (alias for `.var.index`)."""
return self.var.index
@var_names.setter
def var_names(self, names: Sequence[str]):
names = self._prep_dim_index(names, "var")
self._set_dim_index(names, "var")
@property
def uns(self) -> MutableMapping:
"""Unstructured annotation (ordered dictionary)."""
uns = self._uns
if self.is_view:
uns = DictView(uns, view_args=(self, "_uns"))
return uns
@uns.setter
def uns(self, value: MutableMapping):
if not isinstance(value, MutableMapping):
msg = "Only mutable mapping types (e.g. dict) are allowed for `.uns`."
raise ValueError(msg)
if isinstance(value, DictView):
value = value.copy()
if self.is_view:
self._init_as_actual(self.copy())
self._uns = value
@uns.deleter
def uns(self):
self.uns = OrderedDict()
obsm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
"obsm", AxisArrays, 0
)
"""\
Multi-dimensional annotation of observations
(mutable structured :class:`~numpy.ndarray`).
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
of length `n_obs`.
Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`.
"""
varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
"varm", AxisArrays, 1
)
"""\
Multi-dimensional annotation of variables/features
(mutable structured :class:`~numpy.ndarray`).
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
of length `n_vars`.
Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`.
"""
obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = (
AlignedMappingProperty("obsp", PairwiseArrays, 0)
)
"""\
Pairwise annotation of observations,
a mutable mapping with array-like values.
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
whose first two dimensions are of length `n_obs`.
Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`.
"""
varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = (
AlignedMappingProperty("varp", PairwiseArrays, 1)
)
"""\
Pairwise annotation of variables/features,
a mutable mapping with array-like values.
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
whose first two dimensions are of length `n_var`.
Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`.
"""
def obs_keys(self) -> list[str]:
"""List keys of observation annotation :attr:`obs`."""
return self._obs.keys().tolist()
def var_keys(self) -> list[str]:
"""List keys of variable annotation :attr:`var`."""
return self._var.keys().tolist()
def obsm_keys(self) -> list[str]:
"""List keys of observation annotation :attr:`obsm`."""
return list(self.obsm.keys())
def varm_keys(self) -> list[str]:
"""List keys of variable annotation :attr:`varm`."""
return list(self.varm.keys())
def uns_keys(self) -> list[str]:
"""List keys of unstructured annotation."""
return sorted(list(self._uns.keys()))
@property
def isbacked(self) -> bool:
"""`True` if object is backed on disk, `False` otherwise."""
return self.filename is not None
@property
def is_view(self) -> bool:
"""`True` if object is view of another AnnData object, `False` otherwise."""
return self._is_view
@property
def filename(self) -> Path | None:
"""\
Change to backing mode by setting the filename of a `.h5ad` file.
- Setting the filename writes the stored data to disk.
- Setting the filename when the filename was previously another name
moves the backing file from the previous file to the new file.
If you want to copy the previous file, use `copy(filename='new_filename')`.
"""
return self.file.filename
@filename.setter
def filename(self, filename: PathLike[str] | str | None):
# convert early for later comparison
filename = None if filename is None else Path(filename)
# change from backing-mode back to full loading into memory
if filename is None:
if self.filename is not None:
self.file._to_memory_mode()
else:
# both filename and self.filename are None
# do nothing
return
else:
if self.filename is not None:
if self.filename != filename:
# write the content of self to the old file
# and close the file
self.write()
self.filename.rename(filename)
else:
# do nothing
return
else:
# change from memory to backing-mode
# write the content of self to disk
if self.raw is not None:
as_dense = ("X", "raw/X")
else:
as_dense = ("X",)
self.write(filename, as_dense=as_dense)
# open new file for accessing
self.file.open(filename, "r+")
# as the data is stored on disk, we can safely set self._X to None
self._X = None
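    # A minimal illustrative sketch (not part of the library): assigning
    # `.filename` writes the object to disk and switches it to backed mode,
    # while resetting it to `None` loads everything back into memory.
    # The path below is a hypothetical example.
    #
    #   >>> adata.filename = "backed.h5ad"  # write to disk, X now lives in the file
    #   >>> adata.isbacked
    #   True
    #   >>> adata.filename = None           # read the data back into memory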
def _set_backed(self, attr, value):
from .._io.utils import write_attribute
write_attribute(self.file._file, attr, value)
def _normalize_indices(self, index: Index | None) -> tuple[slice, slice]:
return _normalize_indices(index, self.obs_names, self.var_names)
# TODO: this is not quite complete...
def __delitem__(self, index: Index):
obs, var = self._normalize_indices(index)
# TODO: does this really work?
if not self.isbacked:
del self._X[obs, var]
else:
X = self.file["X"]
del X[obs, var]
self._set_backed("X", X)
if var == slice(None):
del self._obs.iloc[obs, :]
if obs == slice(None):
del self._var.iloc[var, :]
def __getitem__(self, index: Index) -> AnnData:
"""Returns a sliced view of the object."""
oidx, vidx = self._normalize_indices(index)
return AnnData(self, oidx=oidx, vidx=vidx, asview=True)
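    # A minimal illustrative sketch (not part of the library): indexing returns
    # a lazy view whose aligned elements are subset on access; copying the view
    # materializes an independent object. Names below are illustrative assumptions.
    #
    #   >>> import numpy as np
    #   >>> from anndata import AnnData
    #   >>> adata = AnnData(np.arange(6, dtype=float).reshape(3, 2))
    #   >>> subset = adata[adata.obs_names[:2], :]
    #   >>> subset.is_view, subset.shape
    #   (True, (2, 2))
    #   >>> subset.copy().is_view
    #   False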
@staticmethod
@singledispatch
def _remove_unused_categories(
df_full: pd.DataFrame, df_sub: pd.DataFrame, uns: dict[str, Any]
):
for k in df_full:
if not isinstance(df_full[k].dtype, pd.CategoricalDtype):
continue
all_categories = df_full[k].cat.categories
with pd.option_context("mode.chained_assignment", None):
df_sub[k] = df_sub[k].cat.remove_unused_categories()
# also correct the colors...
color_key = f"{k}_colors"
if color_key not in uns:
continue
color_vec = uns[color_key]
if np.array(color_vec).ndim == 0:
# Make 0D arrays into 1D ones
uns[color_key] = np.array(color_vec)[(None,)]
elif len(color_vec) != len(all_categories):
# Reset colors
del uns[color_key]
else:
idx = np.where(np.isin(all_categories, df_sub[k].cat.categories))[0]
uns[color_key] = np.array(color_vec)[(idx,)]
def rename_categories(self, key: str, categories: Sequence[Any]):
"""\
Rename categories of annotation `key` in :attr:`obs`, :attr:`var`,
and :attr:`uns`.
Only supports passing a list/array-like `categories` argument.
Besides calling `self.obs[key].cat.categories = categories` –
similar for :attr:`var` - this also renames categories in unstructured
annotation that uses the categorical annotation `key`.
Parameters
----------
key
Key for observations or variables annotation.
categories
New categories, the same number as the old categories.
"""
if isinstance(categories, Mapping):
msg = "Only list-like `categories` is supported."
raise ValueError(msg)
if key in self.obs:
old_categories = self.obs[key].cat.categories.tolist()
self.obs[key] = self.obs[key].cat.rename_categories(categories)
elif key in self.var:
old_categories = self.var[key].cat.categories.tolist()
self.var[key] = self.var[key].cat.rename_categories(categories)
else:
msg = f"{key} is neither in `.obs` nor in `.var`."
raise ValueError(msg)
# this is not a good solution
# but depends on the scanpy conventions for storing the categorical key
# as `groupby` in the `params` slot
for k1, v1 in self.uns.items():
if not (
isinstance(v1, Mapping)
and "params" in v1
and "groupby" in v1["params"]
and v1["params"]["groupby"] == key
):
continue
for k2, v2 in v1.items():
# picks out the recarrays that are named according to the old
# categories
if isinstance(v2, np.ndarray) and v2.dtype.names is not None:
if list(v2.dtype.names) == old_categories:
self.uns[k1][k2].dtype.names = categories
else:
logger.warning(
f"Omitting {k1}/{k2} as old categories do not match."
)
def strings_to_categoricals(self, df: pd.DataFrame | None = None):
"""\
Transform string annotations to categoricals.
        Only affects string annotations that lead to fewer categories than the
total number of observations.
Params
------
df
If `df` is `None`, modifies both :attr:`obs` and :attr:`var`,
otherwise modifies `df` inplace.
Notes
-----
Turns the view of an :class:`~anndata.AnnData` into an actual
:class:`~anndata.AnnData`.
"""
dont_modify = False # only necessary for backed views
if df is None:
dfs = [self.obs, self.var]
if self.is_view and self.isbacked:
dont_modify = True
else:
dfs = [df]
for df in dfs:
string_cols = [
key for key in df.columns if infer_dtype(df[key]) == "string"
]
for key in string_cols:
c = pd.Categorical(df[key])
# TODO: We should only check if non-null values are unique, but
# this would break cases where string columns with nulls could
# be written as categorical, but not as string.
# Possible solution: https://github.com/scverse/anndata/issues/504
if len(c.categories) >= len(c):
continue
# Ideally this could be done inplace
sorted_categories = natsorted(c.categories)
if not np.array_equal(c.categories, sorted_categories):
c = c.reorder_categories(sorted_categories)
if dont_modify:
msg = (
"Please call `.strings_to_categoricals()` on full "
"AnnData, not on this view. You might encounter this"
"error message while copying or writing to disk."
)
raise RuntimeError(msg)
df[key] = c
logger.info(f"... storing {key!r} as categorical")
_sanitize = strings_to_categoricals # backwards compat
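    # A minimal illustrative sketch (not part of the library): string columns
    # with fewer unique values than rows become categorical in place.
    # Names below are assumptions for illustration only.
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> from anndata import AnnData
    #   >>> adata = AnnData(
    #   ...     np.zeros((4, 1)),
    #   ...     obs=pd.DataFrame({"group": ["a", "a", "b", "b"]}, index=list("wxyz")),
    #   ... )
    #   >>> adata.strings_to_categoricals()
    #   >>> adata.obs["group"].dtype.name
    #   'category'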
def _inplace_subset_var(self, index: Index1D):
"""\
Inplace subsetting along variables dimension.
Same as `adata = adata[:, index]`, but inplace.
"""
adata_subset = self[:, index].copy()
self._init_as_actual(adata_subset)
def _inplace_subset_obs(self, index: Index1D):
"""\
        Inplace subsetting along observations dimension.
Same as `adata = adata[index, :]`, but inplace.
"""
adata_subset = self[index].copy()
self._init_as_actual(adata_subset)
# TODO: Update, possibly remove
def __setitem__(self, index: Index, val: float | XDataType):
if self.is_view:
msg = "Object is view and cannot be accessed with `[]`."
raise ValueError(msg)
obs, var = self._normalize_indices(index)
if not self.isbacked:
self._X[obs, var] = val
else:
X = self.file["X"]
X[obs, var] = val
self._set_backed("X", X)
def __len__(self) -> int:
return self.shape[0]
def transpose(self) -> AnnData:
"""\
Transpose whole object.
Data matrix is transposed, observations and variables are interchanged.
Ignores `.raw`.
"""
from anndata.compat import _safe_transpose
if not self.isbacked:
X = self.X
else:
X = self.file["X"]
if self.is_view:
msg = (
"You’re trying to transpose a view of an `AnnData`, "
"which is currently not implemented. Call `.copy()` before transposing."
)
raise ValueError(msg)
return AnnData(
X=_safe_transpose(X) if X is not None else None,
layers={k: _safe_transpose(v) for k, v in self.layers.items()},
obs=self.var,
var=self.obs,
uns=self._uns,
obsm=self.varm,
varm=self.obsm,
obsp=self.varp,
varp=self.obsp,
filename=self.filename,
)
T = property(transpose)
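    # A minimal illustrative sketch (not part of the library): `adata.T` swaps
    # the obs and var dimensions, exchanging obs/var, obsm/varm and obsp/varp.
    # Names below are assumptions for illustration only.
    #
    #   >>> import numpy as np
    #   >>> from anndata import AnnData
    #   >>> adata = AnnData(np.zeros((3, 2)))
    #   >>> adata.T.shape
    #   (2, 3)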
def to_df(self, layer: str | None = None) -> pd.DataFrame:
"""\
Generate shallow :class:`~pandas.DataFrame`.
The data matrix :attr:`X` is returned as
:class:`~pandas.DataFrame`, where :attr:`obs_names` initializes the
index, and :attr:`var_names` the columns.
* No annotations are maintained in the returned object.
* The data matrix is densified in case it is sparse.
Params
------
layer
Key for `.layers`.
Returns
-------
Pandas DataFrame of specified data matrix.
"""
if layer is not None:
X = self.layers[layer]
elif not self._has_X():
msg = "X is None, cannot convert to dataframe."
raise ValueError(msg)
else:
X = self.X
if issparse(X):
X = X.toarray()
return pd.DataFrame(X, index=self.obs_names, columns=self.var_names)
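    # A minimal illustrative sketch (not part of the library): `to_df()`
    # densifies `X` (or a layer) into a DataFrame indexed by obs_names with
    # var_names as columns. Names below are assumptions for illustration only.
    #
    #   >>> import numpy as np
    #   >>> from anndata import AnnData
    #   >>> adata = AnnData(np.arange(6, dtype=float).reshape(3, 2))
    #   >>> adata.to_df().shape
    #   (3, 2)
    #   >>> adata.layers["counts"] = adata.X.copy()
    #   >>> adata.to_df(layer="counts").shape
    #   (3, 2)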
def _get_X(self, *, use_raw: bool = False, layer: str | None = None):
"""\
Convenience method for getting expression values
with common arguments and error handling.
"""
is_layer = layer is not None
if use_raw and is_layer:
msg = (
"Cannot use expression from both layer and raw. You provided:"
f"`use_raw={use_raw}` and `layer={layer}`"
)
raise ValueError(msg)
if is_layer:
return self.layers[layer]
elif use_raw:
if self.raw is None:
msg = "This AnnData doesn’t have a value in `.raw`."
raise ValueError(msg)
return self.raw.X
else:
return self.X
def obs_vector(self, k: str, *, layer: str | None = None) -> np.ndarray:
"""\
Convenience function for returning a 1 dimensional ndarray of values
from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`.
Made for convenience, not performance.
Intentionally permissive about arguments, for easy iterative use.
Params
------
k
Key to use. Should be in :attr:`var_names` or :attr:`obs`\\ `.columns`.
layer
What layer values should be returned from. If `None`, :attr:`X` is used.
Returns
-------
A one dimensional ndarray, with values for each obs in the same order
as :attr:`obs_names`.
"""
if layer == "X":
if "X" in self.layers:
pass
else:
warnings.warn(
"In a future version of AnnData, access to `.X` by passing"
" `layer='X'` will be removed. Instead pass `layer=None`.",
FutureWarning,
)
layer = None
return get_vector(self, k, "obs", "var", layer=layer)
def var_vector(self, k, *, layer: str | None = None) -> np.ndarray:
"""\
Convenience function for returning a 1 dimensional ndarray of values
        from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`var`.
Made for convenience, not performance. Intentionally permissive about
arguments, for easy iterative use.
Params
------
k
Key to use. Should be in :attr:`obs_names` or :attr:`var`\\ `.columns`.
layer
What layer values should be returned from. If `None`, :attr:`X` is used.
Returns
-------
A one dimensional ndarray, with values for each var in the same order
as :attr:`var_names`.
"""
if layer == "X":
if "X" in self.layers:
pass
else:
warnings.warn(
"In a future version of AnnData, access to `.X` by passing "
"`layer='X'` will be removed. Instead pass `layer=None`.",
FutureWarning,
)
layer = None
return get_vector(self, k, "var", "obs", layer=layer)
@deprecated("obs_vector")
def _get_obs_array(self, k, use_raw=False, layer=None): # noqa: FBT002
"""\
Get an array from the layer (default layer='X') along the :attr:`obs`
dimension by first looking up `obs.keys` and then :attr:`obs_names`.
"""
if not use_raw or k in self.obs.columns:
return self.obs_vector(k=k, layer=layer)
else:
return self.raw.obs_vector(k)
@deprecated("var_vector")
def _get_var_array(self, k, use_raw=False, layer=None): # noqa: FBT002
"""\
Get an array from the layer (default layer='X') along the :attr:`var`
dimension by first looking up `var.keys` and then :attr:`var_names`.
"""
if not use_raw or k in self.var.columns:
return self.var_vector(k=k, layer=layer)
else:
return self.raw.var_vector(k)
def _mutated_copy(self, **kwargs):
"""Creating AnnData with attributes optionally specified via kwargs."""
if self.isbacked:
if "X" not in kwargs or (self.raw is not None and "raw" not in kwargs):
msg = (
"This function does not currently handle backed objects "
"internally, this should be dealt with before."
)
raise NotImplementedError(msg)
new = {}
for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "layers"]:
if key in kwargs:
new[key] = kwargs[key]
else:
new[key] = getattr(self, key).copy()
if "X" in kwargs:
new["X"] = kwargs["X"]
elif self._has_X():
new["X"] = self.X.copy()
if "uns" in kwargs:
new["uns"] = kwargs["uns"]
else:
new["uns"] = deepcopy(self._uns)
if "raw" in kwargs:
new["raw"] = kwargs["raw"]
elif self.raw is not None:
new["raw"] = self.raw.copy()
return AnnData(**new)
@old_positionals("copy")
def to_memory(self, *, copy: bool = False) -> AnnData:
"""Return a new AnnData object with all backed arrays loaded into memory.
Params
------
copy
Whether the arrays that are already in-memory should be copied.
Example
-------
.. code:: python
import anndata
backed = anndata.io.read_h5ad("file.h5ad", backed="r")
mem = backed[backed.obs["cluster"] == "a", :].to_memory()
"""
new = {}
for attr_name in [
"X",
"obs",
"var",
"obsm",
"varm",
"obsp",
"varp",
"layers",
"uns",
]:
attr = getattr(self, attr_name, None)
if attr is not None:
new[attr_name] = to_memory(attr, copy=copy)
if self.raw is not None:
new["raw"] = {
"X": to_memory(self.raw.X, copy=copy),
"var": to_memory(self.raw.var, copy=copy),
"varm": to_memory(self.raw.varm, copy=copy),
}
if self.isbacked:
self.file.close()
return AnnData(**new)
def copy(self, filename: PathLike[str] | str | None = None) -> AnnData:
"""Full copy, optionally on disk."""
if not self.isbacked:
if self.is_view and self._has_X():
# TODO: How do I unambiguously check if this is a copy?
# Subsetting this way means we don’t have to have a view type
# defined for the matrix, which is needed for some of the
# current distributed backend. Specifically Dask.
return self._mutated_copy(
X=_subset(self._adata_ref.X, (self._oidx, self._vidx)).copy()
)
else:
return self._mutated_copy()
else:
from ..io import read_h5ad, write_h5ad
if filename is None:
msg = (
"To copy an AnnData object in backed mode, "
"pass a filename: `.copy(filename='myfilename.h5ad')`. "
"To load the object into memory, use `.to_memory()`."
)
raise ValueError(msg)
mode = self.file._filemode
write_h5ad(filename, self)
return read_h5ad(filename, backed=mode)
@deprecated(
"anndata.concat",
add_msg="See the tutorial for concat at: "
"https://anndata.readthedocs.io/en/latest/concatenation.html",
hide=False,
)
def concatenate(
self,
*adatas: AnnData,
join: str = "inner",
batch_key: str = "batch",
batch_categories: Sequence[Any] = None,
uns_merge: str | None = None,
index_unique: str | None = "-",
fill_value=None,
) -> AnnData:
"""\
Concatenate along the observations axis.
The :attr:`uns`, :attr:`varm` and :attr:`obsm` attributes are ignored.
Currently, this works only in `'memory'` mode.
.. note::
For more flexible and efficient concatenation, see: :func:`~anndata.concat`.
Parameters
----------
adatas
AnnData matrices to concatenate with. Each matrix is referred to as
a “batch”.
join
Use intersection (`'inner'`) or union (`'outer'`) of variables.
batch_key
Add the batch annotation to :attr:`obs` using this key.
batch_categories
Use these as categories for the batch annotation. By default, use increasing numbers.
uns_merge
            Strategy to use for merging entries of uns. These strategies are applied recursively.
Currently implemented strategies include:
* `None`: The default. The concatenated object will just have an empty dict for `uns`.
* `"same"`: Only entries which have the same value in all AnnData objects are kept.
* `"unique"`: Only entries which have one unique value in all AnnData objects are kept.
* `"first"`: The first non-missing value is used.
* `"only"`: A value is included if only one of the AnnData objects has a value at this
path.
index_unique
Make the index unique by joining the existing index names with the
batch category, using `index_unique='-'`, for instance. Provide
`None` to keep existing indices.
fill_value
Scalar value to fill newly missing values in arrays with. Note: only applies to arrays
and sparse matrices (not dataframes) and will only be used if `join="outer"`.
.. note::
If not provided, the default value is `0` for sparse matrices and `np.nan`
for numpy arrays. See the examples below for more information.
Returns
-------
:class:`~anndata.AnnData`
The concatenated :class:`~anndata.AnnData`, where `adata.obs[batch_key]`
stores a categorical variable labeling the batch.
Notes
-----
.. warning::
If you use `join='outer'` this fills 0s for sparse data when
variables are absent in a batch. Use this with care. Dense data is
filled with `NaN`. See the examples.
Examples
--------
Joining on intersection of variables.
>>> adata1 = AnnData(
... np.array([[1, 2, 3], [4, 5, 6]]),
... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
... dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]),
... )
>>> adata2 = AnnData(
... np.array([[1, 2, 3], [4, 5, 6]]),
... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
... )
>>> adata3 = AnnData(
... np.array([[1, 2, 3], [4, 5, 6]]),
... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']),
... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]),
... )
>>> adata = adata1.concatenate(adata2, adata3)
>>> adata
AnnData object with n_obs × n_vars = 6 × 2
obs: 'anno1', 'anno2', 'batch'
var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2'
>>> adata.X
array([[2, 3],
[5, 6],
[3, 2],
[6, 5],
[3, 2],
[6, 5]])
>>> adata.obs
anno1 anno2 batch
s1-0 c1 NaN 0
s2-0 c2 NaN 0
s3-1 c3 NaN 1
s4-1 c4 NaN 1
s1-2 NaN d3 2
s2-2 NaN d4 2
>>> adata.var.T
b c
annoA-0 1 2
annoA-1 2 1
annoA-2 3 2
annoB-2 2 1
Joining on the union of variables.
>>> outer = adata1.concatenate(adata2, adata3, join='outer')
>>> outer
AnnData object with n_obs × n_vars = 6 × 4
obs: 'anno1', 'anno2', 'batch'
var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2'
>>> outer.var.T
a b c d
annoA-0 0.0 1.0 2.0 NaN
annoA-1 NaN 2.0 1.0 0.0
annoA-2 NaN 3.0 2.0 0.0
annoB-2 NaN 2.0 1.0 0.0
>>> outer.var_names
Index(['a', 'b', 'c', 'd'], dtype='object')
>>> outer.X
array([[ 1., 2., 3., nan],
[ 4., 5., 6., nan],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.]])
>>> outer.X.sum(axis=0)
array([nan, 25., 23., nan])
>>> import pandas as pd
>>> Xdf = pd.DataFrame(outer.X, columns=outer.var_names)
>>> Xdf
a b c d
0 1.0 2.0 3.0 NaN
1 4.0 5.0 6.0 NaN
2 NaN 3.0 2.0 1.0
3 NaN 6.0 5.0 4.0
4 NaN 3.0 2.0 1.0
5 NaN 6.0 5.0 4.0
>>> Xdf.sum()
a 5.0
b 25.0
c 23.0
d 10.0
dtype: float64
One way to deal with missing values is to use masked arrays:
>>> from numpy import ma
>>> outer.X = ma.masked_invalid(outer.X)
>>> outer.X
masked_array(
data=[[1.0, 2.0, 3.0, --],
[4.0, 5.0, 6.0, --],
[--, 3.0, 2.0, 1.0],
[--, 6.0, 5.0, 4.0],
[--, 3.0, 2.0, 1.0],
[--, 6.0, 5.0, 4.0]],
mask=[[False, False, False, True],
[False, False, False, True],
[ True, False, False, False],
[ True, False, False, False],
[ True, False, False, False],
[ True, False, False, False]],
fill_value=1e+20)
>>> outer.X.sum(axis=0).data
array([ 5., 25., 23., 10.])
The masked array is not saved but has to be reinstantiated after saving.
>>> outer.write('./test.h5ad')
>>> from anndata import read_h5ad
>>> outer = read_h5ad('./test.h5ad')
>>> outer.X
array([[ 1., 2., 3., nan],
[ 4., 5., 6., nan],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.],
[nan, 3., 2., 1.],
[nan, 6., 5., 4.]])
For sparse data, everything behaves similarly,
except that for `join='outer'`, zeros are added.
>>> from scipy.sparse import csr_matrix
>>> adata1 = AnnData(
... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32),
... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']),
... dict(var_names=['a', 'b', 'c']),
... )
>>> adata2 = AnnData(
... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32),
... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']),
... dict(var_names=['d', 'c', 'b']),
... )
>>> adata3 = AnnData(
... csr_matrix([[1, 2, 0], [0, 5, 6]], dtype=np.float32),
... dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']),
... dict(var_names=['d', 'c', 'b']),
... )
>>> adata = adata1.concatenate(adata2, adata3, join='outer')
>>> adata.var_names
Index(['a', 'b', 'c', 'd'], dtype='object')
>>> adata.X.toarray()
array([[0., 2., 3., 0.],
[0., 5., 6., 0.],
[0., 3., 2., 0.],
[0., 6., 5., 0.],
[0., 0., 2., 1.],
[0., 6., 5., 0.]], dtype=float32)
"""
from .merge import concat, merge_dataframes, merge_outer, merge_same
if self.isbacked:
msg = "Currently, concatenate only works in memory mode."
raise ValueError(msg)
if len(adatas) == 0:
return self.copy()
elif len(adatas) == 1 and not isinstance(adatas[0], AnnData):
adatas = adatas[0] # backwards compatibility
all_adatas = (self,) + tuple(adatas)
out = concat(
all_adatas,
axis=0,
join=join,
label=batch_key,
keys=batch_categories,
uns_merge=uns_merge,
fill_value=fill_value,
index_unique=index_unique,
pairwise=False,
)
# Backwards compat (some of this could be more efficient)
# obs used to always be an outer join
sparse_class = sparse.csr_matrix
if any(isinstance(a.X, CSArray) for a in all_adatas):
sparse_class = sparse.csr_array
out.obs = concat(
[AnnData(sparse_class(a.shape), obs=a.obs) for a in all_adatas],
axis=0,
join="outer",
label=batch_key,
keys=batch_categories,
index_unique=index_unique,
).obs
# Removing varm
del out.varm
# Implementing old-style merging of var
if batch_categories is None:
batch_categories = np.arange(len(all_adatas)).astype(str)
pat = rf"-({'|'.join(batch_categories)})$"
out.var = merge_dataframes(
[a.var for a in all_adatas],
out.var_names,
partial(merge_outer, batch_keys=batch_categories, merge=merge_same),
)
out.var = out.var.iloc[
:,
(
out.var.columns.str.extract(pat, expand=False)
.fillna("")
.argsort(kind="stable")
),
]
return out
def var_names_make_unique(self, join: str = "-"):
        # Important to go through the setter so varm dataframes are updated too
self.var_names = utils.make_index_unique(self.var.index, join)
var_names_make_unique.__doc__ = utils.make_index_unique.__doc__
def obs_names_make_unique(self, join: str = "-"):
# Important to go through the setter so obsm dataframes are updated too
self.obs_names = utils.make_index_unique(self.obs.index, join)
obs_names_make_unique.__doc__ = utils.make_index_unique.__doc__
def _check_uniqueness(self):
if not self.obs.index.is_unique:
utils.warn_names_duplicates("obs")
if not self.var.index.is_unique:
utils.warn_names_duplicates("var")
def __contains__(self, key: Any):
msg = "AnnData has no attribute __contains__, don’t check `in adata`."
raise AttributeError(msg)
def _check_dimensions(self, key=None):
if key is None:
key = {"obsm", "varm"}
else:
key = {key}
if "obsm" in key:
if (
not all([axis_len(o, 0) == self.n_obs for o in self.obsm.values()])
and len(self.obsm.dim_names) != self.n_obs
):
msg = (
"Observations annot. `obsm` must have number of rows of `X`"
f" ({self.n_obs}), but has {len(self.obsm)} rows."
)
raise ValueError(msg)
if "varm" in key:
if (
not all([axis_len(v, 0) == self.n_vars for v in self.varm.values()])
and len(self.varm.dim_names) != self.n_vars
):
msg = (
"Variables annot. `varm` must have number of columns of `X`"
f" ({self.n_vars}), but has {len(self.varm)} rows."
)
raise ValueError(msg)
@old_positionals("compression", "compression_opts", "as_dense")
def write_h5ad(
self,
filename: PathLike[str] | str | None = None,
*,
convert_strings_to_categoricals: bool = True,
compression: Literal["gzip", "lzf"] | None = None,
compression_opts: int | Any = None,
as_dense: Sequence[str] = (),
):
"""\
Write `.h5ad`-formatted hdf5 file.
.. note::
Setting compression to `'gzip'` can save disk space
but will slow down writing and subsequent reading.
Prior to v0.6.16, this was the default for parameter `compression`.
Generally, if you have sparse data that are stored as a dense matrix,
you can dramatically improve performance and reduce disk space
by converting to a :class:`~scipy.sparse.csr_matrix`::
from scipy.sparse import csr_matrix
adata.X = csr_matrix(adata.X)
Parameters
----------
filename
Filename of data file. Defaults to backing file.
convert_strings_to_categoricals
Convert string columns to categorical.
compression
For [`lzf`, `gzip`], see the h5py :ref:`dataset_compression`.
Alternative compression filters such as `zstd` can be passed
from the :doc:`hdf5plugin ` library.
Experimental.
Usage example::
import hdf5plugin
adata.write_h5ad(
filename,
compression=hdf5plugin.FILTERS["zstd"]
)
.. note::
Datasets written with hdf5plugin-provided compressors
cannot be opened without first loading the hdf5plugin
library using `import hdf5plugin`. When using alternative
compression filters such as `zstd`, consider writing to
`zarr` format instead of `h5ad`, as the `zarr` library
provides a more transparent compression pipeline.
compression_opts
For [`lzf`, `gzip`], see the h5py :ref:`dataset_compression`.
Alternative compression filters such as `zstd` can be configured
using helpers from the :doc:`hdf5plugin `
library. Experimental.
Usage example (setting `zstd` compression level to 5)::
import hdf5plugin
adata.write_h5ad(
filename,
compression=hdf5plugin.FILTERS["zstd"],
compression_opts=hdf5plugin.Zstd(clevel=5).filter_options
)
as_dense
Sparse arrays in AnnData object to write as dense. Currently only
supports `X` and `raw/X`.
"""
from ..io import write_h5ad
if filename is None and not self.isbacked:
msg = "Provide a filename!"
raise ValueError(msg)
if filename is None:
filename = self.filename
write_h5ad(
Path(filename),
self,
convert_strings_to_categoricals=convert_strings_to_categoricals,
compression=compression,
compression_opts=compression_opts,
as_dense=as_dense,
)
if self.isbacked:
self.file.filename = filename
write = write_h5ad # a shortcut and backwards compat
@old_positionals("skip_data", "sep")
def write_csvs(
self, dirname: PathLike[str] | str, *, skip_data: bool = True, sep: str = ","
):
"""\
Write annotation to `.csv` files.
It is not possible to recover the full :class:`~anndata.AnnData` from
these files. Use :meth:`write` for this.
Parameters
----------
dirname
Name of directory to which to export.
skip_data
Skip the data matrix :attr:`X`.
sep
Separator for the data.
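Usage example (a minimal sketch; the directory name is illustrative)::
adata.write_csvs("adata_csvs")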
"""
from ..io import write_csvs
write_csvs(dirname, self, skip_data=skip_data, sep=sep)
@old_positionals("write_obsm_varm")
def write_loom(
self, filename: PathLike[str] | str, *, write_obsm_varm: bool = False
):
"""\
Write `.loom`-formatted hdf5 file.
Parameters
----------
filename
The filename.
write_obsm_varm
Whether to also write `obsm` and `varm` to the loom file.
"""
from ..io import write_loom
write_loom(filename, self, write_obsm_varm=write_obsm_varm)
@old_positionals("chunks")
def write_zarr(
self,
store: StoreLike,
*,
chunks: tuple[int, ...] | None = None,
convert_strings_to_categoricals: bool = True,
):
"""\
Write a hierarchical Zarr array store.
Parameters
----------
store
The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class.
chunks
Chunk shape.
convert_strings_to_categoricals
Convert string columns to categorical.
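Usage example (a minimal sketch; the store path and chunk shape are illustrative)::
adata.write_zarr("results.zarr", chunks=(1000, 100))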
"""
from ..io import write_zarr
# TODO: What is a bool for chunks supposed to do?
if isinstance(chunks, bool):
msg = (
"Passing `write_zarr(adata, chunks=True)` is no longer supported. "
"Please pass `write_zarr(adata)` instead."
)
raise ValueError(msg)
write_zarr(
store,
self,
chunks=chunks,
convert_strings_to_categoricals=convert_strings_to_categoricals,
)
def chunked_X(self, chunk_size: int | None = None):
"""\
Return an iterator over the rows of the data matrix :attr:`X`.
Parameters
----------
chunk_size
Row size of a single chunk.
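Usage example (a minimal sketch; the chunk size is illustrative and `process` is a placeholder)::
for chunk, start, end in adata.chunked_X(1000):
process(chunk)  # `chunk` holds rows `start:end` of `X`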
"""
if chunk_size is None:
# Should be some adaptive code
chunk_size = 6000
start = 0
n = self.n_obs
for _ in range(int(n // chunk_size)):
end = start + chunk_size
yield (self.X[start:end], start, end)
start = end
if start < n:
yield (self.X[start:n], start, n)
@old_positionals("replace")
def chunk_X(
self,
select: int | Sequence[int] | np.ndarray = 1000,
*,
replace: bool = True,
):
"""\
Return a chunk of the data matrix :attr:`X` with random or specified indices.
Parameters
----------
select
Depending on the type:
:class:`int`
A random chunk with `select` rows will be returned.
:term:`sequence` (e.g. a list, tuple or numpy array) of :class:`int`
A chunk with these indices will be returned.
replace
If `select` is an integer then `True` means random sampling of
indices with replacement, `False` without replacement.
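Usage example (a minimal sketch)::
random_rows = adata.chunk_X(100)  # up to 100 randomly sampled rows of `X` (with replacement by default)
chosen_rows = adata.chunk_X([0, 5, 7])  # rows 0, 5 and 7 of `X`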
"""
if isinstance(select, int):
select = select if select < self.n_obs else self.n_obs
choice = np.random.choice(self.n_obs, select, replace)
elif isinstance(select, np.ndarray | Sequence):
choice = np.asarray(select)
else:
msg = "select should be int or array"
raise ValueError(msg)
reverse = None
if self.isbacked:
# h5py can only slice with a sorted list of unique index values
# so random batch with indices [2, 2, 5, 3, 8, 10, 8] will fail
# this fixes the problem
indices, reverse = np.unique(choice, return_inverse=True)
selection = self.X[indices.tolist()]
else:
selection = self.X[choice]
selection = selection.toarray() if issparse(selection) else selection
return selection if reverse is None else selection[reverse]
def _has_X(self) -> bool:
"""
Check whether `X` is set (i.e. not `None`).
This is more efficient than trying `adata.X is None` for views, since creating
views (at least anndata's kind) can be expensive.
"""
if not self.is_view:
return self.X is not None
else:
return self._adata_ref.X is not None
# --------------------------------------------------------------------------
# all of the following is for backwards compat
# --------------------------------------------------------------------------
@property
@deprecated("is_view")
def isview(self):
return self.is_view
def _clean_up_old_format(self, uns):
# multicolumn keys
# all of the rest is only for backwards compat
for bases in [["obs", "smp"], ["var"]]:
axis = bases[0]
for k in [f"{p}{base}_keys_multicol" for p in ["", "_"] for base in bases]:
if uns and k in uns:
keys = list(uns[k])
del uns[k]
break
else:
keys = []
# now, for compat, fill the old multicolumn entries into obsm and varm
# and remove them from obs and var
m_attr = getattr(self, f"_{axis}m")
for key in keys:
m_attr[key] = self._get_and_delete_multicol_field(axis, key)
def _get_and_delete_multicol_field(self, a, key_multicol):
keys = []
for k in getattr(self, a).columns:
if k.startswith(key_multicol):
keys.append(k)
values = getattr(self, a)[keys].values
getattr(self, a).drop(keys, axis=1, inplace=True)
return values
def _check_2d_shape(X):
"""\
Check shape of array or sparse matrix.
Ensure that X is always 2D: unlike numpy, we always deal with 2D arrays.
"""
if X.dtype.names is None and len(X.shape) != 2:
msg = f"X needs to be 2-dimensional, not {len(X.shape)}-dimensional."
raise ValueError(msg)
def _infer_shape_for_axis(
xxx: pd.DataFrame | Mapping[str, Iterable[Any]] | None,
xxxm: np.ndarray | Mapping[str, Sequence[Any]] | None,
layers: Mapping[str, np.ndarray | sparse.spmatrix] | None,
xxxp: np.ndarray | Mapping[str, Sequence[Any]] | None,
axis: Literal[0, 1],
) -> int | None:
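# First try annotations that expose `.shape` directly (the obs/var dataframe, or
# the aligned mappings passed as arrays); otherwise look inside the mappings and
# use the first value that has a shape (dimension `axis` for layers, dimension 0
# for the *m/*p mappings).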
for elem in [xxx, xxxm, xxxp]:
if elem is not None and hasattr(elem, "shape"):
return elem.shape[0]
for elem, id in zip([layers, xxxm, xxxp], ["layers", "xxxm", "xxxp"]):
if elem is not None:
elem = cast("Mapping", elem)
for sub_elem in elem.values():
if hasattr(sub_elem, "shape"):
size = cast("int", sub_elem.shape[axis if id == "layers" else 0])
return size
return None
def _infer_shape(
obs: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None,
var: pd.DataFrame | Mapping[str, Iterable[Any]] | None = None,
obsm: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
varm: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
layers: Mapping[str, np.ndarray | sparse.spmatrix] | None = None,
obsp: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
varp: np.ndarray | Mapping[str, Sequence[Any]] | None = None,
):
return (
_infer_shape_for_axis(obs, obsm, layers, obsp, 0),
_infer_shape_for_axis(var, varm, layers, varp, 1),
)
python-anndata-0.12.0~rc1/src/anndata/_core/extensions.py 0000664 0000000 0000000 00000024271 15003706322 0023364 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import inspect
from pathlib import Path
from typing import TYPE_CHECKING, Generic, TypeVar, get_type_hints, overload
from warnings import warn
from ..types import ExtensionNamespace
from .anndata import AnnData
if TYPE_CHECKING:
from collections.abc import Callable
# Based off of the extension framework in Polars
# https://github.com/pola-rs/polars/blob/main/py-polars/polars/api.py
__all__ = ["register_anndata_namespace"]
def find_stacklevel() -> int:
"""
Find the first place in the stack that is not inside AnnData.
Taken from:
https://github.com/pola-rs/polars/blob/main/py-polars/polars/_utils/various.py#L447
"""
pkg_dir = str(Path(__file__).parent.parent)
# https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
frame = inspect.currentframe()
n = 0
try:
while frame:
fname = inspect.getfile(frame)
if fname.startswith(pkg_dir) or (
(qualname := getattr(frame.f_code, "co_qualname", None))
# ignore @singledispatch wrappers
and qualname.startswith("singledispatch.")
):
frame = frame.f_back
n += 1
else:
break
finally:
# https://docs.python.org/3/library/inspect.html
# > Though the cycle detector will catch these, destruction of the frames
# > (and local variables) can be made deterministic by removing the cycle
# > in a finally clause.
del frame
return n
# Reserved namespaces include accessors built into AnnData (currently there are none)
# and all current attributes of AnnData
_reserved_namespaces: set[str] = set(dir(AnnData))
NameSpT = TypeVar("NameSpT", bound=ExtensionNamespace)
T = TypeVar("T")
class AccessorNameSpace(ExtensionNamespace, Generic[NameSpT]):
"""Establish property-like namespace object for user-defined functionality."""
def __init__(self, name: str, namespace: type[NameSpT]) -> None:
self._accessor = name
self._ns = namespace
@overload
def __get__(self, instance: None, cls: type[T]) -> type[NameSpT]: ...
@overload
def __get__(self, instance: T, cls: type[T]) -> NameSpT: ...
def __get__(self, instance: T | None, cls: type[T]) -> NameSpT | type[NameSpT]:
if instance is None:
return self._ns
ns_instance = self._ns(instance) # type: ignore[call-arg]
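# Cache the instantiated namespace on the AnnData object so that repeated
# attribute access reuses it instead of re-running the namespace's __init__.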
setattr(instance, self._accessor, ns_instance)
return ns_instance
def _check_namespace_signature(ns_class: type) -> None:
"""Validate the signature of a namespace class for AnnData extensions.
This function ensures that any class intended to be used as an extension namespace
has a properly formatted `__init__` method, i.e. one that:
1. Accepts at least two parameters (self and adata)
2. Has 'adata' as the name of the second parameter
3. Has the second parameter properly type-annotated as 'AnnData' or any equivalent import alias
The function performs runtime validation of these requirements before a namespace
can be registered through the `register_anndata_namespace` decorator.
Parameters
----------
ns_class
The namespace class to validate.
Raises
------
TypeError
If the `__init__` method has fewer than 2 parameters (missing the AnnData parameter).
AttributeError
If the second parameter of `__init__` lacks a type annotation.
TypeError
If the second parameter of `__init__` is not named 'adata'.
TypeError
If the second parameter of `__init__` is not annotated as the 'AnnData' class.
TypeError
If both the name and type annotation of the second parameter are incorrect.
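Examples
--------
A conforming namespace initializer (a minimal sketch)::
class MyNamespace:
def __init__(self, adata: AnnData) -> None:
self._adata = adata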
"""
sig = inspect.signature(ns_class.__init__)
params = list(sig.parameters.values())
# Ensure there are at least two parameters (self and adata)
if len(params) < 2:
error_msg = "Namespace initializer must accept an AnnData instance as the second parameter."
raise TypeError(error_msg)
# Get the second parameter (expected to be 'adata')
param = params[1]
if param.annotation is inspect._empty:
err_msg = "Namespace initializer's second parameter must be annotated as the 'AnnData' class, got empty annotation."
raise AttributeError(err_msg)
name_ok = param.name == "adata"
# Resolve the annotation using get_type_hints to handle forward references and aliases.
try:
type_hints = get_type_hints(ns_class.__init__)
resolved_type = type_hints.get(param.name, param.annotation)
except NameError as e:
err_msg = f"Namespace initializer's second parameter must be named 'adata', got '{param.name}'."
raise NameError(err_msg) from e
type_ok = resolved_type is AnnData
match (name_ok, type_ok):
case (True, True):
return # Signature is correct.
case (False, True):
msg = f"Namespace initializer's second parameter must be named 'adata', got {param.name!r}."
raise TypeError(msg)
case (True, False):
type_repr = getattr(resolved_type, "__name__", str(resolved_type))
msg = f"Namespace initializer's second parameter must be annotated as the 'AnnData' class, got '{type_repr}'."
raise TypeError(msg)
case _:
type_repr = getattr(resolved_type, "__name__", str(resolved_type))
msg = (
f"Namespace initializer's second parameter must be named 'adata', got {param.name!r}. "
f"And must be annotated as 'AnnData', got {type_repr!r}."
)
raise TypeError(msg)
def _create_namespace(
name: str, cls: type[AnnData]
) -> Callable[[type[NameSpT]], type[NameSpT]]:
"""Register custom namespace against the underlying AnnData class."""
def namespace(ns_class: type[NameSpT]) -> type[NameSpT]:
_check_namespace_signature(ns_class) # Perform the runtime signature check
if name in _reserved_namespaces:
msg = f"cannot override reserved attribute {name!r}"
raise AttributeError(msg)
elif name in cls._accessors:
warn(
f"Overriding existing custom namespace {name!r} (on {cls.__name__!r})",
UserWarning,
stacklevel=find_stacklevel(),
)
setattr(cls, name, AccessorNameSpace(name, ns_class))
cls._accessors.add(name)
return ns_class
return namespace
def register_anndata_namespace(
name: str,
) -> Callable[[type[NameSpT]], type[NameSpT]]:
"""Decorator for registering custom functionality with an :class:`~anndata.AnnData` object.
This decorator allows you to extend AnnData objects with custom methods and properties
organized under a namespace. The namespace becomes accessible as an attribute on AnnData
instances, providing a clean way for you to add domain-specific functionality without modifying
the AnnData class itself or extending it with additional methods by hand in your workflow.
Parameters
----------
name
Name under which the accessor should be registered. This will be the attribute name
used to access your namespace's functionality on AnnData objects (e.g., `adata.{name}`).
Cannot conflict with existing AnnData attributes like `obs`, `var`, `X`, etc. The list of reserved
attributes includes everything outputted by `dir(AnnData)`.
Returns
-------
A decorator that registers the decorated class as a custom namespace.
Notes
-----
Implementation requirements:
1. The decorated class must have an `__init__` method that accepts exactly one parameter
(besides `self`) named `adata` and annotated with type :class:`~anndata.AnnData`.
2. The namespace will be initialized with the AnnData object on first access and then
cached on the instance.
3. If the namespace name conflicts with an existing namespace, a warning is issued.
4. If the namespace name conflicts with a built-in AnnData attribute, an AttributeError is raised.
Examples
--------
Simple transformation namespace with two methods:
>>> import anndata as ad
>>> import numpy as np
>>>
>>> @ad.register_anndata_namespace("transform")
... class TransformX:
... def __init__(self, adata: ad.AnnData):
... self._adata = adata
...
... def log1p(
... self, layer: str = None, inplace: bool = False
... ) -> ad.AnnData | None:
... '''Log1p transform the data.'''
... data = self._adata.layers[layer] if layer else self._adata.X
... log1p_data = np.log1p(data)
...
... if layer:
... layer_name = f"{layer}_log1p" if not inplace else layer
... else:
... layer_name = "log1p"
...
... self._adata.layers[layer_name] = log1p_data
...
... if not inplace:
... return self._adata
...
... def arcsinh(
... self, layer: str = None, scale: float = 1.0, inplace: bool = False
... ) -> ad.AnnData | None:
... '''Arcsinh transform the data with optional scaling.'''
... data = self._adata.layers[layer] if layer else self._adata.X
... asinh_data = np.arcsinh(data / scale)
...
... if layer:
... layer_name = f"{layer}_arcsinh" if not inplace else layer
... else:
... layer_name = "arcsinh"
...
... self._adata.layers[layer_name] = asinh_data
...
... if not inplace:
... return self._adata
>>>
>>> # Create an AnnData object
>>> rng = np.random.default_rng(42)
>>> adata = ad.AnnData(X=rng.poisson(1, size=(100, 2000)))
>>>
>>> # Use the registered namespace
>>> adata.transform.log1p() # Transforms X and returns the AnnData object
AnnData object with n_obs × n_vars = 100 × 2000
layers: 'log1p'
>>> adata.transform.arcsinh() # Transforms X and returns the AnnData object
AnnData object with n_obs × n_vars = 100 × 2000
layers: 'log1p', 'arcsinh'
"""
return _create_namespace(name, AnnData)
python-anndata-0.12.0~rc1/src/anndata/_core/file_backing.py 0000664 0000000 0000000 00000011554 15003706322 0023562 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import weakref
from collections.abc import Mapping
from functools import singledispatch
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING
import h5py
from ..compat import AwkArray, DaskArray, ZarrArray, ZarrGroup
from .sparse_dataset import BaseCompressedSparseDataset
if TYPE_CHECKING:
from collections.abc import Iterator
from os import PathLike
from typing import Literal
from .._types import ArrayStorageType
from . import anndata
class AnnDataFileManager:
"""Backing file manager for AnnData."""
def __init__(
self,
adata: anndata.AnnData,
filename: PathLike[str] | str | None = None,
filemode: Literal["r", "r+"] | None = None,
):
self._adata_ref = weakref.ref(adata)
self.filename = filename
self._filemode = filemode
self._file = None
if filename:
self.open()
def __getstate__(self):
state = self.__dict__.copy()
state["_adata_ref"] = state["_adata_ref"]()
return state
def __setstate__(self, state):
self.__dict__ = state.copy()
self.__dict__["_adata_ref"] = weakref.ref(state["_adata_ref"])
@property
def _adata(self):
return self._adata_ref()
def __repr__(self) -> str:
if self.filename is None:
return "Backing file manager: no file is set."
else:
return f"Backing file manager of file {self.filename}."
def __contains__(self, x) -> bool:
return x in self._file
def __iter__(self) -> Iterator[str]:
return iter(self._file)
def __getitem__(
self, key: str
) -> h5py.Group | h5py.Dataset | BaseCompressedSparseDataset:
return self._file[key]
def __setitem__(
self,
key: str,
value: h5py.Group | h5py.Dataset | BaseCompressedSparseDataset,
):
self._file[key] = value
def __delitem__(self, key: str):
del self._file[key]
@property
def filename(self) -> Path:
return self._filename
@filename.setter
def filename(self, filename: PathLike[str] | str | None):
self._filename = None if filename is None else Path(filename)
def open(
self,
filename: PathLike[str] | str | None = None,
filemode: Literal["r", "r+"] | None = None,
):
if filename is not None:
self.filename = filename
if filemode is not None:
self._filemode = filemode
if self.filename is None:
msg = "Cannot open backing file if backing not initialized."
raise ValueError(msg)
self._file = h5py.File(self.filename, self._filemode)
def close(self):
"""Close the backing file, remember filename, do *not* change to memory mode."""
if self._file is not None:
self._file.close()
def _to_memory_mode(self):
"""Close the backing file, forget filename, *do* change to memory mode."""
self._adata._X = self._adata.X[()]
self._file.close()
self._file = None
self._filename = None
@property
def is_open(self) -> bool:
"""State of backing file."""
if self._file is None:
return False
# try accessing the id attribute to see if the file is open
return bool(self._file.id)
@singledispatch
def to_memory(x, *, copy: bool = False):
"""Permissivley convert objects to in-memory representation.
If they already are in-memory, (or are just unrecognized) pass a copy through.
"""
if copy and hasattr(x, "copy"):
return x.copy()
else:
return x
@to_memory.register(ZarrArray)
@to_memory.register(h5py.Dataset)
def _(x: ArrayStorageType, *, copy: bool = False):
return x[...]
@to_memory.register(BaseCompressedSparseDataset)
def _(x: BaseCompressedSparseDataset, *, copy: bool = False):
return x.to_memory()
@to_memory.register(DaskArray)
def _(x: DaskArray, *, copy: bool = False):
return x.compute()
@to_memory.register(Mapping)
def _(x: Mapping, *, copy: bool = False):
return {k: to_memory(v, copy=copy) for k, v in x.items()}
@to_memory.register(AwkArray)
def _(x: AwkArray, *, copy: bool = False):
from copy import copy as _copy
if copy:
return _copy(x)
else:
return x
@singledispatch
def filename(x):
msg = f"Not implemented for {type(x)}"
raise NotImplementedError(msg)
@filename.register(h5py.Group)
@filename.register(h5py.Dataset)
def _(x):
return x.file.filename
@filename.register(ZarrArray)
@filename.register(ZarrGroup)
def _(x):
return x.store.path
@singledispatch
def get_elem_name(x):
msg = f"Not implemented for {type(x)}"
raise NotImplementedError(msg)
@get_elem_name.register(h5py.Group)
def _(x):
return x.name
@get_elem_name.register(ZarrGroup)
def _(x):
return PurePosixPath(x.path).name
python-anndata-0.12.0~rc1/src/anndata/_core/index.py 0000664 0000000 0000000 00000022151 15003706322 0022267 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from collections.abc import Iterable, Sequence
from functools import singledispatch
from itertools import repeat
from typing import TYPE_CHECKING
import h5py
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from ..compat import AwkArray, CSArray, CSMatrix, DaskArray
if TYPE_CHECKING:
from ..compat import Index, Index1D
def _normalize_indices(
index: Index | None, names0: pd.Index, names1: pd.Index
) -> tuple[slice, slice]:
# deal with tuples of length 1
if isinstance(index, tuple) and len(index) == 1:
index = index[0]
# deal with pd.Series
if isinstance(index, pd.Series):
index: Index = index.values
if isinstance(index, tuple):
# TODO: The series should probably be aligned first
index = tuple(i.values if isinstance(i, pd.Series) else i for i in index)
ax0, ax1 = unpack_index(index)
ax0 = _normalize_index(ax0, names0)
ax1 = _normalize_index(ax1, names1)
return ax0, ax1
def _normalize_index(
indexer: slice
| np.integer
| int
| str
| Sequence[bool | int | np.integer]
| np.ndarray
| pd.Index,
index: pd.Index,
) -> slice | int | np.ndarray: # ndarray of int or bool
from ..experimental.backed._compat import DataArray
# TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough.
if not isinstance(index, pd.RangeIndex) and (
index.dtype == float or index.dtype == int
):
msg = f"Don’t call _normalize_index with non-categorical/string names and non-range index {index}"
raise TypeError(msg)
# the following is insanely slow for sequences,
# we replaced it using pandas below
def name_idx(i):
if isinstance(i, str):
i = index.get_loc(i)
return i
if isinstance(indexer, slice):
start = name_idx(indexer.start)
stop = name_idx(indexer.stop)
# string slices can only be inclusive, so +1 in that case
if isinstance(indexer.stop, str):
stop = None if stop is None else stop + 1
step = indexer.step
return slice(start, stop, step)
elif isinstance(indexer, np.integer | int):
return indexer
elif isinstance(indexer, str):
return index.get_loc(indexer) # int
elif isinstance(
indexer, Sequence | np.ndarray | pd.Index | CSMatrix | np.matrix | CSArray
):
if hasattr(indexer, "shape") and (
(indexer.shape == (index.shape[0], 1))
or (indexer.shape == (1, index.shape[0]))
):
if isinstance(indexer, CSMatrix | CSArray):
indexer = indexer.toarray()
indexer = np.ravel(indexer)
if not isinstance(indexer, np.ndarray | pd.Index):
indexer = np.array(indexer)
if len(indexer) == 0:
indexer = indexer.astype(int)
if isinstance(indexer, np.ndarray) and np.issubdtype(
indexer.dtype, np.floating
):
indexer_int = indexer.astype(int)
if np.all((indexer - indexer_int) != 0):
msg = f"Indexer {indexer!r} has floating point values."
raise IndexError(msg)
if issubclass(indexer.dtype.type, np.integer | np.floating):
return indexer # Might not work for range indexes
elif issubclass(indexer.dtype.type, np.bool_):
if indexer.shape != index.shape:
msg = (
f"Boolean index does not match AnnData’s shape along this "
f"dimension. Boolean index has shape {indexer.shape} while "
f"AnnData index has shape {index.shape}."
)
raise IndexError(msg)
return indexer
else: # indexer should be string array
positions = index.get_indexer(indexer)
if np.any(positions < 0):
not_found = indexer[positions < 0]
msg = (
f"Values {list(not_found)}, from {list(indexer)}, "
"are not valid obs/ var names or indices."
)
raise KeyError(msg)
return positions # np.ndarray[int]
elif isinstance(indexer, DataArray):
if isinstance(indexer.data, DaskArray):
return indexer.data.compute()
return indexer.data
msg = f"Unknown indexer {indexer!r} of type {type(indexer)}"
raise IndexError(msg)
def _fix_slice_bounds(s: slice, length: int) -> slice:
"""The slice will be clipped to length, and the step won't be None.
E.g. infer None valued attributes.
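For example (illustrative)::
_fix_slice_bounds(slice(None), 10)  # -> slice(0, 10, 1)
_fix_slice_bounds(slice(None, None, -1), 10)  # -> slice(10, 0, -1)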
"""
step = s.step if s.step is not None else 1
# slice constructor would have errored if step was 0
if step > 0:
start = s.start if s.start is not None else 0
stop = s.stop if s.stop is not None else length
elif step < 0:
# Reverse
start = s.start if s.start is not None else length
stop = s.stop if s.stop is not None else 0
return slice(start, stop, step)
def unpack_index(index: Index) -> tuple[Index1D, Index1D]:
if not isinstance(index, tuple):
if index is Ellipsis:
index = slice(None)
return index, slice(None)
num_ellipsis = sum(i is Ellipsis for i in index)
if num_ellipsis > 1:
msg = "an index can only have a single ellipsis ('...')"
raise IndexError(msg)
# If index has Ellipsis, filter it out (and if not, error)
if len(index) > 2:
if not num_ellipsis:
msg = "Received a length 3 index without an ellipsis"
raise IndexError(msg)
index = tuple(i for i in index if i is not Ellipsis)
return index
# If index has Ellipsis, replace it with slice
if len(index) == 2:
index = tuple(slice(None) if i is Ellipsis else i for i in index)
return index
if len(index) == 1:
index = index[0]
if index is Ellipsis:
index = slice(None)
return index, slice(None)
msg = "invalid number of indices"
raise IndexError(msg)
@singledispatch
def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):
# Select as combination of indexes, not coordinates
# Correcting for indexing behaviour of np.ndarray
if all(isinstance(x, Iterable) for x in subset_idx):
subset_idx = np.ix_(*subset_idx)
return a[subset_idx]
@_subset.register(DaskArray)
def _subset_dask(a: DaskArray, subset_idx: Index):
if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx):
if issparse(a._meta) and a._meta.format == "csc":
return a[:, subset_idx[1]][subset_idx[0], :]
return a[subset_idx[0], :][:, subset_idx[1]]
return a[subset_idx]
@_subset.register(CSMatrix)
@_subset.register(CSArray)
def _subset_sparse(a: CSMatrix | CSArray, subset_idx: Index):
# Correcting for indexing behaviour of sparse.spmatrix
if len(subset_idx) > 1 and all(isinstance(x, Iterable) for x in subset_idx):
first_idx = subset_idx[0]
if issubclass(first_idx.dtype.type, np.bool_):
first_idx = np.where(first_idx)[0]
subset_idx = (first_idx.reshape(-1, 1), *subset_idx[1:])
return a[subset_idx]
@_subset.register(pd.DataFrame)
def _subset_df(df: pd.DataFrame, subset_idx: Index):
return df.iloc[subset_idx]
@_subset.register(AwkArray)
def _subset_awkarray(a: AwkArray, subset_idx: Index):
if all(isinstance(x, Iterable) for x in subset_idx):
subset_idx = np.ix_(*subset_idx)
return a[subset_idx]
# Registration for SparseDataset occurs in sparse_dataset.py
@_subset.register(h5py.Dataset)
def _subset_dataset(d, subset_idx):
if not isinstance(subset_idx, tuple):
subset_idx = (subset_idx,)
ordered = list(subset_idx)
rev_order = [slice(None) for _ in range(len(subset_idx))]
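# h5py fancy indexing requires indices in increasing order, so sort each axis'
# indices, read in sorted order, and restore the requested order via `rev_order`.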
for axis, axis_idx in enumerate(ordered.copy()):
if isinstance(axis_idx, np.ndarray):
if axis_idx.dtype == bool:
axis_idx = np.where(axis_idx)[0]
order = np.argsort(axis_idx)
ordered[axis] = axis_idx[order]
rev_order[axis] = np.argsort(order)
# from hdf5, then to real order
return d[tuple(ordered)][tuple(rev_order)]
def make_slice(idx, dimidx, n=2):
mut = list(repeat(slice(None), n))
mut[dimidx] = idx
return tuple(mut)
def get_vector(adata, k, coldim, idxdim, layer=None):
# adata could be self if Raw and AnnData shared a parent
dims = ("obs", "var")
col = getattr(adata, coldim).columns
idx = getattr(adata, f"{idxdim}_names")
in_col = k in col
in_idx = k in idx
if (in_col + in_idx) == 2:
msg = f"Key {k} could be found in both .{idxdim}_names and .{coldim}.columns"
raise ValueError(msg)
elif (in_col + in_idx) == 0:
msg = f"Could not find key {k} in .{idxdim}_names or .{coldim}.columns."
raise KeyError(msg)
elif in_col:
return getattr(adata, coldim)[k].values
elif in_idx:
selected_dim = dims.index(idxdim)
idx = adata._normalize_indices(make_slice(k, selected_dim))
a = adata._get_X(layer=layer)[idx]
if issparse(a):
a = a.toarray()
return np.ravel(a)
python-anndata-0.12.0~rc1/src/anndata/_core/merge.py 0000664 0000000 0000000 00000155734 15003706322 0022275 0 ustar 00root root 0000000 0000000 """
Code for merging/ concatenating AnnData objects.
"""
from __future__ import annotations
from collections import OrderedDict
from collections.abc import Callable, Mapping, MutableSet
from functools import partial, reduce, singledispatch
from itertools import repeat
from operator import and_, or_, sub
from typing import TYPE_CHECKING, Literal, TypeVar
from warnings import warn
import numpy as np
import pandas as pd
import scipy
from natsort import natsorted
from packaging.version import Version
from scipy import sparse
from anndata._core.file_backing import to_memory
from anndata._warnings import ExperimentalFeatureWarning
from ..compat import (
AwkArray,
CSArray,
CSMatrix,
CupyArray,
CupyCSRMatrix,
CupySparseMatrix,
DaskArray,
_map_cat_to_str,
)
from ..utils import asarray, axis_len, warn_once
from .anndata import AnnData
from .index import _subset, make_slice
if TYPE_CHECKING:
from collections.abc import Collection, Generator, Iterable, Sequence
from typing import Any
from pandas.api.extensions import ExtensionDtype
from anndata._types import Join_T
from anndata.experimental.backed._compat import DataArray, Dataset2D
T = TypeVar("T")
###################
# Utilities
###################
# Pretty much just for maintaining order of keys
class OrderedSet(MutableSet):
def __init__(self, vals=()):
self.dict = OrderedDict(zip(vals, repeat(None)))
def __contains__(self, val):
return val in self.dict
def __iter__(self):
return iter(self.dict)
def __len__(self):
return len(self.dict)
def __repr__(self):
return "OrderedSet: {" + ", ".join(map(str, self)) + "}"
def copy(self):
return OrderedSet(self.dict.copy())
def add(self, val):
self.dict[val] = None
def union(self, *vals) -> OrderedSet:
return reduce(or_, vals, self)
def discard(self, val):
if val in self:
del self.dict[val]
def difference(self, *vals) -> OrderedSet:
return reduce(sub, vals, self)
def union_keys(ds: Collection) -> OrderedSet:
return reduce(or_, ds, OrderedSet())
def intersect_keys(ds: Collection) -> OrderedSet:
return reduce(and_, map(OrderedSet, ds))
class MissingVal:
"""Represents a missing value."""
def is_missing(v) -> bool:
return v is MissingVal
def not_missing(v) -> bool:
return v is not MissingVal
# We need to be able to check for equality of arrays to know which are the same.
# Unfortunately equality of arrays is poorly defined.
# * `np.array_equal` does not work for sparse arrays
# * `np.array_equal(..., equal_nan=True)` does not work for null values at the moment
# (see https://github.com/numpy/numpy/issues/16377)
# So we have to define it ourselves with these two issues in mind.
# TODO: Hopefully this will stop being an issue in the future and this code can be removed.
@singledispatch
def equal(a, b) -> bool:
a = asarray(a)
b = asarray(b)
if a.ndim == b.ndim == 0:
return bool(a == b)
return np.array_equal(a, b)
@equal.register(pd.DataFrame)
def equal_dataframe(a, b) -> bool:
return a.equals(b)
@equal.register(DaskArray)
def equal_dask_array(a, b) -> bool:
import dask.array as da
from dask.base import tokenize
if a is b:
return True
if a.shape != b.shape:
return False
if isinstance(b, DaskArray):
if tokenize(a) == tokenize(b):
return True
if isinstance(a._meta, CSMatrix):
# TODO: Maybe also do this in the other case?
return da.map_blocks(equal, a, b, drop_axis=(0, 1)).all()
else:
return da.equal(a, b, where=~(da.isnan(a) == da.isnan(b))).all()
@equal.register(np.ndarray)
def equal_array(a, b) -> bool:
# Reshaping allows us to compare inputs with >2 dimensions
# We cast to pandas since it will still work with non-numeric types
b = asarray(b)
if a.shape != b.shape:
return False
return equal(pd.DataFrame(a.reshape(-1)), pd.DataFrame(b.reshape(-1)))
@equal.register(CupyArray)
def equal_cupyarray(a, b) -> bool:
import cupy as cp
return bool(cp.array_equal(a, b, equal_nan=True))
@equal.register(pd.Series)
def equal_series(a, b) -> bool:
return a.equals(b)
@equal.register(CSMatrix)
@equal.register(CSArray)
@equal.register(CupySparseMatrix)
def equal_sparse(a, b) -> bool:
# It's a weird api, don't blame me
import array_api_compat
xp = array_api_compat.array_namespace(a.data)
if isinstance(b, CupySparseMatrix | CSMatrix | CSArray):
if isinstance(a, CupySparseMatrix):
# Comparison broken for CSC matrices
# https://github.com/cupy/cupy/issues/7757
a, b = CupyCSRMatrix(a), CupyCSRMatrix(b)
comp = a != b
if isinstance(comp, bool):
return not comp
if isinstance(comp, CupySparseMatrix):
# https://github.com/cupy/cupy/issues/7751
comp = comp.get()
# fmt: off
return (
(len(comp.data) == 0)
or (
xp.isnan(a[comp]).all()
and xp.isnan(b[comp]).all()
)
)
# fmt: on
else:
return False
@equal.register(AwkArray)
def equal_awkward(a, b) -> bool:
from ..compat import awkward as ak
return ak.almost_equal(a, b)
def as_sparse(x, *, use_sparse_array: bool = False) -> CSMatrix | CSArray:
if not isinstance(x, CSMatrix | CSArray):
in_memory_array_class = (
sparse.csr_array if use_sparse_array else sparse.csr_matrix
)
if isinstance(x, DaskArray):
x = x.map_blocks(
sparse.csr_matrix,
meta=sparse.csr_matrix(x._meta),
dtype=x.dtype,
).compute()
return in_memory_array_class(x)
return x
def as_cp_sparse(x) -> CupySparseMatrix:
import cupyx.scipy.sparse as cpsparse
if isinstance(x, cpsparse.spmatrix):
return x
elif isinstance(x, np.ndarray):
return cpsparse.csr_matrix(as_sparse(x))
else:
return cpsparse.csr_matrix(x)
def unify_dtypes(
dfs: Iterable[pd.DataFrame | Dataset2D],
) -> list[pd.DataFrame | Dataset2D]:
"""
Attempts to unify datatypes from multiple dataframes.
This catches cases where pandas would otherwise convert the values to `object` dtype.
"""
dfs = list(dfs)
# Get shared categorical columns
df_dtypes = [dict(df.dtypes) for df in dfs]
columns = reduce(lambda x, y: x.union(y), [df.columns for df in dfs])
dtypes: dict[str, list[np.dtype | ExtensionDtype]] = {col: [] for col in columns}
for col in columns:
for df in df_dtypes:
dtypes[col].append(df.get(col, None))
if len(dtypes) == 0:
return dfs
else:
dfs = [df.copy(deep=False) for df in dfs]
new_dtypes = {}
for col in dtypes.keys():
target_dtype = try_unifying_dtype(dtypes[col])
if target_dtype is not None:
new_dtypes[col] = target_dtype
for df in dfs:
for col, dtype in new_dtypes.items():
if col in df:
df[col] = df[col].astype(dtype)
return dfs
def try_unifying_dtype(
col: Sequence[np.dtype | ExtensionDtype],
) -> pd.core.dtypes.base.ExtensionDtype | None:
"""
If dtypes can be unified, returns the dtype they would be unified to.
Returns None if they can't be unified, or if we can expect pandas to unify them for
us.
Params
------
col:
A list of dtypes to unify. Can be numpy/ pandas dtypes, or None (which denotes
a missing value)
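For example, two unordered categoricals unify to a dtype covering the union of their categories (illustrative)::
try_unifying_dtype([pd.CategoricalDtype(["a", "b"]), pd.CategoricalDtype(["b", "c"])])
# -> CategoricalDtype(categories=["a", "b", "c"], ordered=False)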
"""
dtypes: set[pd.CategoricalDtype] = set()
# Categorical
if any(isinstance(dtype, pd.CategoricalDtype) for dtype in col):
ordered = False
for dtype in col:
if isinstance(dtype, pd.CategoricalDtype):
dtypes.add(dtype)
ordered = ordered | dtype.ordered
elif not pd.isnull(dtype):
return None
if len(dtypes) > 0 and not ordered:
categories = reduce(
lambda x, y: x.union(y),
[dtype.categories for dtype in dtypes if not pd.isnull(dtype)],
)
return pd.CategoricalDtype(natsorted(categories), ordered=False)
# Boolean
elif all(pd.api.types.is_bool_dtype(dtype) or dtype is None for dtype in col):
if any(dtype is None for dtype in col):
return pd.BooleanDtype()
else:
return None
else:
return None
def check_combinable_cols(cols: list[pd.Index], join: Join_T):
"""Given columns for a set of dataframes, checks if the can be combined.
Looks for if there are duplicated column names that would show up in the result.
"""
repeated_cols = reduce(lambda x, y: x.union(y[y.duplicated()]), cols, set())
if join == "inner":
intersecting_cols = intersect_keys(cols)
problem_cols = repeated_cols.intersection(intersecting_cols)
elif join == "outer":
problem_cols = repeated_cols
else:
raise ValueError()
if len(problem_cols) > 0:
problem_cols = list(problem_cols)
msg = (
f"Cannot combine dataframes as some contained duplicated column names - "
"causing ambiguity.\n\n"
f"The problem columns are: {problem_cols}"
)
raise pd.errors.InvalidIndexError(msg)
# TODO: open PR or feature request to cupy
def _cp_block_diag(mats, format=None, dtype=None):
"""
Modified version of scipy.sparse.block_diag for cupy sparse.
"""
import cupy as cp
from cupyx.scipy import sparse as cpsparse
row = []
col = []
data = []
r_idx = 0
c_idx = 0
for a in mats:
# if isinstance(a, (list, numbers.Number)):
# a = cpsparse.coo_matrix(a)
nrows, ncols = a.shape
if cpsparse.issparse(a):
a = a.tocoo()
row.append(a.row + r_idx)
col.append(a.col + c_idx)
data.append(a.data)
else:
a_row, a_col = cp.divmod(cp.arange(nrows * ncols), ncols)
row.append(a_row + r_idx)
col.append(a_col + c_idx)
data.append(a.reshape(-1))
r_idx += nrows
c_idx += ncols
row = cp.concatenate(row)
col = cp.concatenate(col)
data = cp.concatenate(data)
return cpsparse.coo_matrix(
(data, (row, col)), shape=(r_idx, c_idx), dtype=dtype
).asformat(format)
def _dask_block_diag(mats):
from itertools import permutations
import dask.array as da
blocks = np.zeros((len(mats), len(mats)), dtype=object)
for i, j in permutations(range(len(mats)), 2):
blocks[i, j] = da.from_array(
sparse.csr_matrix((mats[i].shape[0], mats[j].shape[1]))
)
for i, x in enumerate(mats):
if not isinstance(x._meta, sparse.csr_matrix):
x = x.map_blocks(sparse.csr_matrix)
blocks[i, i] = x
return da.block(blocks.tolist())
###################
# Per element logic
###################
def unique_value(vals: Collection[T]) -> T | MissingVal:
"""
Given a collection of vals, returns the unique value (if one exists), otherwise
returns MissingVal.
"""
unique_val = vals[0]
for v in vals[1:]:
if not equal(v, unique_val):
return MissingVal
return unique_val
def first(vals: Collection[T]) -> T | MissingVal:
"""
Given a collection of vals, return the first non-missing one. If they're all missing,
return MissingVal.
"""
for val in vals:
if not_missing(val):
return val
return MissingVal
def only(vals: Collection[T]) -> T | MissingVal:
"""Return the only value in the collection, otherwise MissingVal."""
if len(vals) == 1:
return vals[0]
else:
return MissingVal
###################
# Merging
###################
def merge_nested(ds: Collection[Mapping], keys_join: Callable, value_join: Callable):
out = {}
for k in keys_join(ds):
v = _merge_nested(ds, k, keys_join, value_join)
if not_missing(v):
out[k] = v
return out
def _merge_nested(
ds: Collection[Mapping], k, keys_join: Callable, value_join: Callable
):
vals = [d[k] for d in ds if k in d]
if len(vals) == 0:
return MissingVal
elif all(isinstance(v, Mapping) for v in vals):
new_map = merge_nested(vals, keys_join, value_join)
if len(new_map) == 0:
return MissingVal
else:
return new_map
else:
return value_join(vals)
def merge_unique(ds: Collection[Mapping]) -> Mapping:
return merge_nested(ds, union_keys, unique_value)
def merge_same(ds: Collection[Mapping]) -> Mapping:
return merge_nested(ds, intersect_keys, unique_value)
def merge_first(ds: Collection[Mapping]) -> Mapping:
return merge_nested(ds, union_keys, first)
def merge_only(ds: Collection[Mapping]) -> Mapping:
return merge_nested(ds, union_keys, only)
###################
# Interface
###################
# Leaving out for now, it's ugly in the rendered docs and would be adding a dependency.
# from typing_extensions import Literal
# UNS_STRATEGIES_TYPE = Literal[None, "same", "unique", "first", "only"]
MERGE_STRATEGIES = {
None: lambda x: {},
"same": merge_same,
"unique": merge_unique,
"first": merge_first,
"only": merge_only,
}
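# Illustrative behaviour of the strategies on two mappings (example values):
#   merge_same([{"a": 1, "b": 2}, {"a": 1, "b": 3}])   -> {"a": 1}
#   merge_unique([{"a": 1, "b": 2}, {"a": 1, "b": 3}]) -> {"a": 1}
#   merge_first([{"a": 1, "b": 2}, {"a": 1, "b": 3}])  -> {"a": 1, "b": 2}
#   merge_only([{"a": 1}, {"b": 2}])                   -> {"a": 1, "b": 2}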
StrategiesLiteral = Literal["same", "unique", "first", "only"]
def resolve_merge_strategy(
strategy: str | Callable | None,
) -> Callable[[Collection[Mapping]], Mapping]:
if not isinstance(strategy, Callable):
strategy = MERGE_STRATEGIES[strategy]
return strategy
#####################
# Concatenation
#####################
class Reindexer:
"""
Indexing to be applied to axis of 2d array orthogonal to the axis being concatenated.
Attrs
-----
old_idx
Original index
new_idx
Target index
old_pos
Indices of original index which will be kept
new_pos
Indices of new index which data from old_pos will be placed in.
Together with `old_pos` this forms a mapping.
"""
def __init__(self, old_idx, new_idx):
self.old_idx = old_idx
self.new_idx = new_idx
self.no_change = new_idx.equals(old_idx)
new_pos = new_idx.get_indexer(old_idx)
old_pos = np.arange(len(new_pos))
mask = new_pos != -1
self.new_pos = new_pos[mask]
self.old_pos = old_pos[mask]
def __call__(self, el, *, axis=1, fill_value=None):
return self.apply(el, axis=axis, fill_value=fill_value)
def apply(self, el, *, axis, fill_value=None):
"""
Reindex element so el[axis] is aligned to self.new_idx.
Missing values are to be replaced with `fill_value`.
"""
if self.no_change and (axis_len(el, axis) == len(self.old_idx)):
return el
if isinstance(el, pd.DataFrame):
return self._apply_to_df(el, axis=axis, fill_value=fill_value)
elif isinstance(el, CSMatrix | CSArray | CupySparseMatrix):
return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
elif isinstance(el, AwkArray):
return self._apply_to_awkward(el, axis=axis, fill_value=fill_value)
elif isinstance(el, DaskArray):
return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value)
elif isinstance(el, CupyArray):
return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value)
else:
return self._apply_to_array(el, axis=axis, fill_value=fill_value)
def _apply_to_df(self, el: pd.DataFrame, *, axis, fill_value=None):
if fill_value is None:
fill_value = np.nan
return el.reindex(self.new_idx, axis=axis, fill_value=fill_value)
def _apply_to_dask_array(self, el: DaskArray, *, axis, fill_value=None):
import dask.array as da
if fill_value is None:
fill_value = default_fill_value([el])
shape = list(el.shape)
if el.shape[axis] == 0:
# Presumably faster since it won't allocate the full array
shape[axis] = len(self.new_idx)
return da.broadcast_to(fill_value, tuple(shape))
indexer = self.idx
sub_el = _subset(el, make_slice(indexer, axis, len(shape)))
if any(indexer == -1):
sub_el[make_slice(indexer == -1, axis, len(shape))] = fill_value
return sub_el
def _apply_to_cupy_array(self, el, *, axis, fill_value=None):
import cupy as cp
if fill_value is None:
fill_value = default_fill_value([el])
if el.shape[axis] == 0:
# Presumably faster since it won't allocate the full array
shape = list(el.shape)
shape[axis] = len(self.new_idx)
return cp.broadcast_to(cp.asarray(fill_value), tuple(shape))
old_idx_tuple = [slice(None)] * len(el.shape)
old_idx_tuple[axis] = self.old_pos
old_idx_tuple = tuple(old_idx_tuple)
new_idx_tuple = [slice(None)] * len(el.shape)
new_idx_tuple[axis] = self.new_pos
new_idx_tuple = tuple(new_idx_tuple)
out_shape = list(el.shape)
out_shape[axis] = len(self.new_idx)
out = cp.full(tuple(out_shape), fill_value)
out[new_idx_tuple] = el[old_idx_tuple]
return out
def _apply_to_array(self, el, *, axis, fill_value=None):
if fill_value is None:
fill_value = default_fill_value([el])
if el.shape[axis] == 0:
# Presumably faster since it won't allocate the full array
shape = list(el.shape)
shape[axis] = len(self.new_idx)
return np.broadcast_to(fill_value, tuple(shape))
indexer = self.idx
# Indexes real fast, and does outer indexing
return pd.api.extensions.take(
el, indexer, axis=axis, allow_fill=True, fill_value=fill_value
)
def _apply_to_sparse(
self, el: CSMatrix | CSArray, *, axis, fill_value=None
) -> CSMatrix:
if isinstance(el, CupySparseMatrix):
from cupyx.scipy import sparse
else:
from scipy import sparse
import array_api_compat
xp = array_api_compat.array_namespace(el.data)
if fill_value is None:
fill_value = default_fill_value([el])
if fill_value != 0:
to_fill = self.new_idx.get_indexer(self.new_idx.difference(self.old_idx))
else:
to_fill = xp.array([])
# Fixing outer indexing for missing values
if el.shape[axis] == 0:
shape = list(el.shape)
shape[axis] = len(self.new_idx)
shape = tuple(shape)
if fill_value == 0:
if isinstance(el, CSArray):
memory_class = sparse.csr_array
else:
memory_class = sparse.csr_matrix
return memory_class(shape)
else:
return type(el)(xp.broadcast_to(xp.asarray(fill_value), shape))
fill_idxer = None
if len(to_fill) > 0 or isinstance(el, CupySparseMatrix):
idxmtx_dtype = xp.promote_types(el.dtype, xp.array(fill_value).dtype)
else:
idxmtx_dtype = bool
if isinstance(el, CSArray):
memory_class = sparse.coo_array
else:
memory_class = sparse.coo_matrix
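# Reindexing along the off-axis is expressed as a sparse product with a one-hot
# "indexing matrix" that maps old positions to new ones; new positions absent from
# the old index are later set to `fill_value` via `fill_idxer` (only needed when
# `fill_value != 0`).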
if axis == 1:
idxmtx = memory_class(
(
xp.ones(len(self.new_pos), dtype=idxmtx_dtype),
(xp.asarray(self.old_pos), xp.asarray(self.new_pos)),
),
shape=(len(self.old_idx), len(self.new_idx)),
dtype=idxmtx_dtype,
)
out = el @ idxmtx
if len(to_fill) > 0:
out = out.tocsc()
fill_idxer = (slice(None), to_fill)
elif axis == 0:
idxmtx = memory_class(
(
xp.ones(len(self.new_pos), dtype=idxmtx_dtype),
(xp.asarray(self.new_pos), xp.asarray(self.old_pos)),
),
shape=(len(self.new_idx), len(self.old_idx)),
dtype=idxmtx_dtype,
)
out = idxmtx @ el
if len(to_fill) > 0:
out = out.tocsr()
fill_idxer = (to_fill, slice(None))
if fill_idxer is not None:
out[fill_idxer] = fill_value
return out
def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None):
import awkward as ak
if self.no_change:
return el
elif axis == 1: # Indexing by field
if self.new_idx.isin(self.old_idx).all(): # inner join
return el[self.new_idx]
else: # outer join
# TODO: this code isn't actually hit, we should refactor
msg = "This should be unreachable, please open an issue."
raise Exception(msg)
else:
if len(self.new_idx) > len(self.old_idx):
el = ak.pad_none(el, 1, axis=axis) # axis == 0
return el[self.idx]
@property
def idx(self):
return self.old_idx.get_indexer(self.new_idx)
def merge_indices(inds: Iterable[pd.Index], join: Join_T) -> pd.Index:
if join == "inner":
return reduce(lambda x, y: x.intersection(y), inds)
elif join == "outer":
return reduce(lambda x, y: x.union(y), inds)
else:
msg = f"`join` must be one of 'inner' or 'outer', got {join!r}"
raise ValueError(msg)
def default_fill_value(els):
"""Given some arrays, returns what the default fill value should be.
This is largely due to backwards compat, and might not be the ideal solution.
"""
if any(
isinstance(el, CSMatrix | CSArray)
or (isinstance(el, DaskArray) and isinstance(el._meta, CSMatrix | CSArray))
for el in els
):
return 0
else:
return np.nan
def gen_reindexer(new_var: pd.Index, cur_var: pd.Index):
"""
Given a new set of var_names, and a current set, generates a function which will reindex
a matrix to be aligned with the new set.
Usage
-----
>>> a = AnnData(sparse.eye(3, format="csr"), var=pd.DataFrame(index=list("abc")))
>>> b = AnnData(sparse.eye(2, format="csr"), var=pd.DataFrame(index=list("ba")))
>>> reindexer = gen_reindexer(a.var_names, b.var_names)
>>> sparse.vstack([a.X, reindexer(b.X)]).toarray()
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.],
[0., 1., 0.],
[1., 0., 0.]])
"""
return Reindexer(cur_var, new_var)
def np_bool_to_pd_bool_array(df: pd.DataFrame):
for col_name, col_type in dict(df.dtypes).items():
if col_type is np.dtype(bool):
df[col_name] = pd.array(df[col_name].values)
return df
def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None):
from anndata.experimental.backed._compat import Dataset2D
arrays = list(arrays)
if fill_value is None:
fill_value = default_fill_value(arrays)
if any(isinstance(a, Dataset2D) for a in arrays):
if any(isinstance(a, pd.DataFrame) for a in arrays):
arrays = [to_memory(a) if isinstance(a, Dataset2D) else a for a in arrays]
elif not all(isinstance(a, Dataset2D) for a in arrays):
msg = f"Cannot concatenate a Dataset2D with other array types {[type(a) for a in arrays if not isinstance(a, Dataset2D)]}."
raise ValueError(msg)
else:
return concat_dataset2d_on_annot_axis(arrays, join="outer")
if any(isinstance(a, pd.DataFrame) for a in arrays):
# TODO: This is hacky, 0 is a sentinel for outer_concat_aligned_mapping
if not all(
isinstance(a, pd.DataFrame) or a is MissingVal or 0 in a.shape
for a in arrays
):
msg = "Cannot concatenate a dataframe with other array types."
raise NotImplementedError(msg)
# TODO: behaviour here should be chosen through a merge strategy
df = pd.concat(
unify_dtypes(f(x) for f, x in zip(reindexers, arrays)),
axis=axis,
ignore_index=True,
)
df.index = index
return df
elif any(isinstance(a, AwkArray) for a in arrays):
from ..compat import awkward as ak
if not all(
isinstance(a, AwkArray) or a is MissingVal or 0 in a.shape for a in arrays
):
msg = "Cannot concatenate an AwkwardArray with other array types."
raise NotImplementedError(msg)
return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)], axis=axis)
elif any(isinstance(a, CupySparseMatrix) for a in arrays):
import cupyx.scipy.sparse as cpsparse
if not all(
isinstance(a, CupySparseMatrix | CupyArray) or 0 in a.shape for a in arrays
):
msg = "Cannot concatenate a cupy array with other array types."
raise NotImplementedError(msg)
sparse_stack = (cpsparse.vstack, cpsparse.hstack)[axis]
return sparse_stack(
[
f(as_cp_sparse(a), axis=1 - axis, fill_value=fill_value)
for f, a in zip(reindexers, arrays)
],
format="csr",
)
elif any(isinstance(a, CupyArray) for a in arrays):
import cupy as cp
if not all(isinstance(a, CupyArray) or 0 in a.shape for a in arrays):
msg = "Cannot concatenate a cupy array with other array types."
raise NotImplementedError(msg)
return cp.concatenate(
[
f(cp.asarray(x), fill_value=fill_value, axis=1 - axis)
for f, x in zip(reindexers, arrays)
],
axis=axis,
)
elif any(isinstance(a, CSMatrix | CSArray) for a in arrays):
sparse_stack = (sparse.vstack, sparse.hstack)[axis]
use_sparse_array = any(issubclass(type(a), CSArray) for a in arrays)
mat = sparse_stack(
[
f(
as_sparse(a, use_sparse_array=use_sparse_array),
axis=1 - axis,
fill_value=fill_value,
)
for f, a in zip(reindexers, arrays)
],
format="csr",
)
scipy_version = Version(scipy.__version__)
# Bug where xstack produces a matrix not an array in 1.11.*
if use_sparse_array and (scipy_version.major, scipy_version.minor) == (1, 11):
if mat.format == "csc":
return sparse.csc_array(mat)
return sparse.csr_array(mat)
return mat
else:
return np.concatenate(
[
f(x, fill_value=fill_value, axis=1 - axis)
for f, x in zip(reindexers, arrays)
],
axis=axis,
)
def inner_concat_aligned_mapping(
mappings, *, reindexers=None, index=None, axis=0, concat_axis=None
):
if concat_axis is None:
concat_axis = axis
result = {}
for k in intersect_keys(mappings):
els = [m[k] for m in mappings]
if reindexers is None:
cur_reindexers = gen_inner_reindexers(
els, new_index=index, axis=concat_axis
)
else:
cur_reindexers = reindexers
result[k] = concat_arrays(els, cur_reindexers, index=index, axis=concat_axis)
return result
def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0):
alt_axis = 1 - axis
if axis == 0:
df_indices = lambda x: x.columns
elif axis == 1:
df_indices = lambda x: x.indices
if all(isinstance(el, pd.DataFrame) for el in els if not_missing(el)):
common_ind = reduce(
lambda x, y: x.intersection(y), (df_indices(el) for el in els)
)
reindexers = [Reindexer(df_indices(el), common_ind) for el in els]
elif any(isinstance(el, AwkArray) for el in els if not_missing(el)):
if not all(isinstance(el, AwkArray) for el in els if not_missing(el)):
msg = "Cannot concatenate an AwkwardArray with other array types."
raise NotImplementedError(msg)
common_keys = intersect_keys(el.fields for el in els)
reindexers = [
Reindexer(pd.Index(el.fields), pd.Index(list(common_keys))) for el in els
]
else:
min_ind = min(el.shape[alt_axis] for el in els)
reindexers = [
gen_reindexer(pd.RangeIndex(min_ind), pd.RangeIndex(el.shape[alt_axis]))
for el in els
]
return reindexers
def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0):
if all(isinstance(el, pd.DataFrame) for el in els if not_missing(el)):
reindexers = [
(lambda x: x)
if not_missing(el)
else (lambda _, shape=shape: pd.DataFrame(index=range(shape)))
for el, shape in zip(els, shapes)
]
elif any(isinstance(el, AwkArray) for el in els if not_missing(el)):
import awkward as ak
if not all(isinstance(el, AwkArray) for el in els if not_missing(el)):
msg = "Cannot concatenate an AwkwardArray with other array types."
raise NotImplementedError(msg)
warn_once(
"Outer joins on awkward.Arrays will have different return values in the future. "
"For details, and to offer input, please see:\n\n\t"
"https://github.com/scverse/anndata/issues/898",
ExperimentalFeatureWarning,
)
# all_keys = union_keys(el.fields for el in els if not_missing(el))
reindexers = []
for el in els:
if not_missing(el):
reindexers.append(lambda x: x)
else:
reindexers.append(
lambda x: ak.pad_none(
ak.Array([]),
len(x),
0,
)
)
else:
max_col = max(el.shape[1] for el in els if not_missing(el))
orig_cols = [el.shape[1] if not_missing(el) else 0 for el in els]
reindexers = [
gen_reindexer(pd.RangeIndex(max_col), pd.RangeIndex(n)) for n in orig_cols
]
return reindexers
def missing_element(
n: int,
els: list[CSArray | CSMatrix | np.ndarray | DaskArray],
axis: Literal[0, 1] = 0,
fill_value: Any | None = None,
off_axis_size: int = 0,
) -> np.ndarray | DaskArray:
"""Generates value to use when there is a missing element."""
should_return_dask = any(isinstance(el, DaskArray) for el in els)
# 0 sized array for in-memory prevents allocating unnecessary memory while preserving broadcasting.
shape = (n, off_axis_size) if axis == 0 else (off_axis_size, n)
if should_return_dask:
import dask.array as da
return da.full(
shape, default_fill_value(els) if fill_value is None else fill_value
)
return np.zeros(shape, dtype=bool)
def outer_concat_aligned_mapping(
mappings, *, reindexers=None, index=None, axis=0, concat_axis=None, fill_value=None
):
if concat_axis is None:
concat_axis = axis
result = {}
ns = [m.parent.shape[axis] for m in mappings]
for k in union_keys(mappings):
els = [m.get(k, MissingVal) for m in mappings]
if reindexers is None:
cur_reindexers = gen_outer_reindexers(
els, ns, new_index=index, axis=concat_axis
)
else:
cur_reindexers = reindexers
# Dask needs to create a full array and can't do the size-0 trick
off_axis_size = 0
if any(isinstance(e, DaskArray) for e in els):
if not isinstance(cur_reindexers[0], Reindexer): # pragma: no cover
msg = "Cannot re-index a dask array without a Reindexer"
raise ValueError(msg)
off_axis_size = cur_reindexers[0].idx.shape[0]
# Handling of missing values here is hacky for dataframes
# We should probably just handle missing elements for all types
result[k] = concat_arrays(
[
el
if not_missing(el)
else missing_element(
n,
axis=concat_axis,
els=els,
fill_value=fill_value,
off_axis_size=off_axis_size,
)
for el, n in zip(els, ns)
],
cur_reindexers,
axis=concat_axis,
index=index,
fill_value=fill_value,
)
return result
def concat_pairwise_mapping(
mappings: Collection[Mapping], shapes: Collection[int], join_keys=intersect_keys
):
result = {}
if any(any(isinstance(v, CSArray) for v in m.values()) for m in mappings):
sparse_class = sparse.csr_array
else:
sparse_class = sparse.csr_matrix
for k in join_keys(mappings):
els = [
m.get(k, sparse_class((s, s), dtype=bool)) for m, s in zip(mappings, shapes)
]
if all(isinstance(el, CupySparseMatrix | CupyArray) for el in els):
result[k] = _cp_block_diag(els, format="csr")
elif all(isinstance(el, DaskArray) for el in els):
result[k] = _dask_block_diag(els)
else:
result[k] = sparse.block_diag(els, format="csr")
return result
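# A minimal sketch (illustration only, not used by the library): pairwise
# elements (e.g. ``obsp`` graphs) are combined block-diagonally, so entries
# relating observations from different objects stay empty.
#
#     import numpy as np
#     from scipy import sparse
#     a = sparse.csr_matrix(np.ones((2, 2)))
#     b = sparse.csr_matrix(np.ones((3, 3)))
#     sparse.block_diag([a, b], format="csr").shape   # (5, 5), off-diagonal blocks empty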
def merge_dataframes(
dfs: Iterable[pd.DataFrame], new_index, merge_strategy=merge_unique
) -> pd.DataFrame:
dfs = [df.reindex(index=new_index) for df in dfs]
# New dataframe with all shared data
new_df = pd.DataFrame(merge_strategy(dfs), index=new_index)
return new_df
def merge_outer(mappings, batch_keys, *, join_index="-", merge=merge_unique):
"""
Combine elements of multiple mappings, such that non-overlapping entries are added with their batch key appended.
Note: this currently does NOT work for nested mappings. Additionally, values are not promised to be unique, and may be overwritten.
"""
all_keys = union_keys(mappings)
out = merge(mappings)
for key in all_keys.difference(out.keys()):
for b, m in zip(batch_keys, mappings):
val = m.get(key, None)
if val is not None:
out[f"{key}{join_index}{b}"] = val
return out
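# A minimal sketch (illustration only, not used by the library): keys whose
# values conflict are dropped by the merge strategy and re-added with their
# batch key appended via ``join_index``.
#
#     mappings = [{"knn": "graph_a"}, {"knn": "graph_b"}]
#     # merge_outer(mappings, batch_keys=["a", "b"])
#     # -> {"knn-a": "graph_a", "knn-b": "graph_b"}
#     # (with the default merge_unique strategy, which drops the conflicting "knn")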
def _resolve_axis(
axis: Literal["obs", 0, "var", 1],
) -> tuple[Literal[0], Literal["obs"]] | tuple[Literal[1], Literal["var"]]:
if axis in {0, "obs"}:
return (0, "obs")
if axis in {1, "var"}:
return (1, "var")
msg = f"`axis` must be either 0, 1, 'obs', or 'var', was {axis}"
raise ValueError(msg)
def axis_indices(adata: AnnData, axis: Literal["obs", 0, "var", 1]) -> pd.Index:
"""Helper function to get adata.{dim}_names."""
_, axis_name = _resolve_axis(axis)
return getattr(adata, f"{axis_name}_names")
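# A minimal sketch (illustration only, not used by the library): both helpers
# accept the integer or string form of an axis (the ``adata`` below is hypothetical).
#
#     _resolve_axis("obs")             # -> (0, "obs")
#     _resolve_axis(1)                 # -> (1, "var")
#     axis_indices(adata, axis="var")  # same as adata.var_names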
# TODO: Resolve https://github.com/scverse/anndata/issues/678 and remove this function
def concat_Xs(adatas, reindexers, axis, fill_value):
"""
Shim until support for some missing X's is implemented.
Basically just checks if it's one of the two supported cases, or throws an error.
This is not done inline in `concat` because we don't want to maintain references
to the values of a.X.
"""
Xs = [a.X for a in adatas]
if all(X is None for X in Xs):
return None
elif any(X is None for X in Xs):
msg = (
"Some (but not all) of the AnnData's to be concatenated had no .X value. "
"Concatenation is currently only implemented for cases where all or none of"
" the AnnData's have .X assigned."
)
raise NotImplementedError(msg)
else:
return concat_arrays(Xs, reindexers, axis=axis, fill_value=fill_value)
def make_dask_col_from_extension_dtype(
col: DataArray, *, use_only_object_dtype: bool = False
) -> DaskArray:
"""
Creates dask arrays from :class:`pandas.api.extensions.ExtensionArray` dtype :class:`xarray.DataArray`s.
Parameters
----------
col
The column to be converted
use_only_object_dtype
Whether or not to cast all :class:`pandas.api.extensions.ExtensionArray` dtypes to `object` type, by default False
Returns
-------
A :class:`dask.array.Array` representation of the column.
"""
import dask.array as da
from anndata._io.specs.lazy_methods import (
compute_chunk_layout_for_axis_size,
get_chunksize,
maybe_open_h5,
)
from anndata.experimental import read_elem_lazy
from anndata.experimental.backed._compat import DataArray
from anndata.experimental.backed._compat import xarray as xr
base_path_or_zarr_group = col.attrs.get("base_path_or_zarr_group")
elem_name = col.attrs.get("elem_name")
dims = col.dims
coords = col.coords.copy()
with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
maybe_chunk_size = get_chunksize(read_elem_lazy(f))
chunk_size = (
compute_chunk_layout_for_axis_size(
1000 if maybe_chunk_size is None else maybe_chunk_size[0], col.shape[0]
),
)
def get_chunk(block_info=None):
# reopening is important to get around h5py's unserializable lock in processes
with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
v = read_elem_lazy(f)
variable = xr.Variable(
data=xr.core.indexing.LazilyIndexedArray(v), dims=dims
)
data_array = DataArray(
variable,
coords=coords,
dims=dims,
)
idx = tuple(
slice(start, stop) for start, stop in block_info[None]["array-location"]
)
chunk = np.array(data_array.data[idx].array)
return chunk
if col.dtype == "category" or col.dtype == "string" or use_only_object_dtype:
dtype = "object"
else:
dtype = col.dtype.numpy_dtype
return da.map_blocks(
get_chunk,
chunks=chunk_size,
meta=np.array([], dtype=dtype),
dtype=dtype,
)
def make_xarray_extension_dtypes_dask(
annotations: Iterable[Dataset2D], *, use_only_object_dtype: bool = False
) -> Generator[Dataset2D, None, None]:
"""
Creates a generator of Dataset2D objects with dask arrays in place of :class:`pandas.api.extensions.ExtensionArray` dtype columns.
Parameters
----------
annotations
The datasets to be altered
use_only_object_dtype
Whether or not to cast all :class:`pandas.api.extensions.ExtensionArray` dtypes to `object` type, by default False
Yields
------
An altered dataset.
"""
for a in annotations:
extension_cols = {
col for col in a.columns if pd.api.types.is_extension_array_dtype(a[col])
}
yield a.copy(
data={
name: (
make_dask_col_from_extension_dtype(
col, use_only_object_dtype=use_only_object_dtype
)
if name in extension_cols
else col
)
for name, col in a.items()
}
)
DS_CONCAT_DUMMY_INDEX_NAME = "concat_index"
def concat_dataset2d_on_annot_axis(
annotations: Iterable[Dataset2D],
join: Join_T,
) -> Dataset2D:
"""Create a concatenate dataset from a list of :class:`~anndata.experimental.backed._xarray.Dataset2D` objects.
The goal of this function is to mimic `pd.concat(..., ignore_index=True)` so has some complicated logic
for handling the "index" to ensure (a) nothing is loaded into memory and (b) the true index is always tracked.
Parameters
----------
annotations
The :class:`~anndata.experimental.backed._xarray.Dataset2D` objects to be concatenated.
join
Type of join operation
Returns
-------
Concatenated :class:`~anndata.experimental.backed._xarray.Dataset2D`
"""
from anndata._io.specs.lazy_methods import DUMMY_RANGE_INDEX_KEY
from anndata.experimental.backed._compat import Dataset2D
from anndata.experimental.backed._compat import xarray as xr
annotations_re_indexed = []
for a in make_xarray_extension_dtypes_dask(annotations):
old_key = list(a.coords.keys())[0]
# First create a dummy index
a.coords[DS_CONCAT_DUMMY_INDEX_NAME] = (
old_key,
pd.RangeIndex(a[a.attrs["indexing_key"]].shape[0]).astype("str"),
)
# Set all the dimensions to this new dummy index
a = a.swap_dims({old_key: DS_CONCAT_DUMMY_INDEX_NAME})
# Move the old coordinate into a variable
old_coord = a.coords[old_key]
del a.coords[old_key]
a[old_key] = old_coord
annotations_re_indexed.append(a)
# Concat along the dummy index
ds = Dataset2D(
xr.concat(annotations_re_indexed, join=join, dim=DS_CONCAT_DUMMY_INDEX_NAME),
attrs={"indexing_key": f"true_{DS_CONCAT_DUMMY_INDEX_NAME}"},
)
ds.coords[DS_CONCAT_DUMMY_INDEX_NAME] = pd.RangeIndex(
ds.coords[DS_CONCAT_DUMMY_INDEX_NAME].shape[0]
).astype("str")
# Drop any lingering dimensions (swap doesn't delete)
ds = ds.drop_dims(d for d in ds.dims if d != DS_CONCAT_DUMMY_INDEX_NAME)
# Create a new true index and then delete the columns resulting from the concatenation for each index.
# This includes the dummy column (which is neither a dimension nor a true indexing column)
index = xr.concat(
[a[a.attrs["indexing_key"]] for a in annotations_re_indexed],
dim=DS_CONCAT_DUMMY_INDEX_NAME,
)
# prevent duplicate values
index.coords[DS_CONCAT_DUMMY_INDEX_NAME] = ds.coords[DS_CONCAT_DUMMY_INDEX_NAME]
ds[f"true_{DS_CONCAT_DUMMY_INDEX_NAME}"] = index
for key in set(a.attrs["indexing_key"] for a in annotations_re_indexed):
del ds[key]
if DUMMY_RANGE_INDEX_KEY in ds:
del ds[DUMMY_RANGE_INDEX_KEY]
return ds
def concat(
adatas: Collection[AnnData] | Mapping[str, AnnData],
*,
axis: Literal["obs", 0, "var", 1] = "obs",
join: Join_T = "inner",
merge: StrategiesLiteral | Callable | None = None,
uns_merge: StrategiesLiteral | Callable | None = None,
label: str | None = None,
keys: Collection | None = None,
index_unique: str | None = None,
fill_value: Any | None = None,
pairwise: bool = False,
) -> AnnData:
"""Concatenates AnnData objects along an axis.
See the :doc:`concatenation <../concatenation>` section in the docs for a more in-depth description.
Parameters
----------
adatas
The objects to be concatenated. If a Mapping is passed, keys are used for the `keys`
argument and values are concatenated.
axis
Which axis to concatenate along.
join
How to align values when concatenating. If "outer", the union of the other axis
is taken. If "inner", the intersection. See :doc:`concatenation <../concatenation>`
for more.
merge
How elements not aligned to the axis being concatenated along are selected.
Currently implemented strategies include:
* `None`: No elements are kept.
* `"same"`: Elements that are the same in each of the objects.
* `"unique"`: Elements for which there is only one possible value.
* `"first"`: The first element seen at each from each position.
* `"only"`: Elements that show up in only one of the objects.
For :class:`xarray.Dataset` objects, we use their :func:`xarray.merge` with `override` to stay lazy.
uns_merge
How the elements of `.uns` are selected. Uses the same set of strategies as
the `merge` argument, except applied recursively.
label
Column in axis annotation (i.e. `.obs` or `.var`) to place batch information in.
If it's None, no column is added.
keys
Names for each object being added. These values are used for column values for
`label` or appended to the index if `index_unique` is not `None`. Defaults to
incrementing integer labels.
index_unique
Whether to make the index unique by using the keys. If provided, this
is the delimiter between "{orig_idx}{index_unique}{key}". When `None`,
the original indices are kept.
fill_value
When `join="outer"`, this is the value that will be used to fill the introduced
indices. By default, sparse arrays are padded with zeros, while dense arrays and
DataFrames are padded with missing values.
pairwise
Whether pairwise elements along the concatenated dimension should be included.
This is False by default, since the resulting arrays are often not meaningful.
Notes
-----
.. warning::
If you use `join='outer'` this fills 0s for sparse data when
variables are absent in a batch. Use this with care. Dense data is
filled with `NaN`.
Examples
--------
Preparing example objects
>>> import anndata as ad, pandas as pd, numpy as np
>>> from scipy import sparse
>>> a = ad.AnnData(
... X=sparse.csr_matrix(np.array([[0, 1], [2, 3]])),
... obs=pd.DataFrame({"group": ["a", "b"]}, index=["s1", "s2"]),
... var=pd.DataFrame(index=["var1", "var2"]),
... varm={
... "ones": np.ones((2, 5)),
... "rand": np.random.randn(2, 3),
... "zeros": np.zeros((2, 5)),
... },
... uns={"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4}},
... )
>>> b = ad.AnnData(
... X=sparse.csr_matrix(np.array([[4, 5, 6], [7, 8, 9]])),
... obs=pd.DataFrame(
... {"group": ["b", "c"], "measure": [1.2, 4.3]}, index=["s3", "s4"]
... ),
... var=pd.DataFrame(index=["var1", "var2", "var3"]),
... varm={"ones": np.ones((3, 5)), "rand": np.random.randn(3, 5)},
... uns={"a": 1, "b": 3, "c": {"c.b": 4}},
... )
>>> c = ad.AnnData(
... X=sparse.csr_matrix(np.array([[10, 11], [12, 13]])),
... obs=pd.DataFrame({"group": ["a", "b"]}, index=["s1", "s2"]),
... var=pd.DataFrame(index=["var3", "var4"]),
... uns={"a": 1, "b": 4, "c": {"c.a": 3, "c.b": 4, "c.c": 5}},
... )
Concatenating along different axes
>>> ad.concat([a, b]).to_df()
var1 var2
s1 0 1
s2 2 3
s3 4 5
s4 7 8
>>> ad.concat([a, c], axis="var").to_df()
var1 var2 var3 var4
s1 0 1 10 11
s2 2 3 12 13
Inner and outer joins
>>> inner = ad.concat([a, b]) # Joining on intersection of variables
>>> inner
AnnData object with n_obs × n_vars = 4 × 2
obs: 'group'
>>> (inner.obs_names, inner.var_names) # doctest: +NORMALIZE_WHITESPACE
(Index(['s1', 's2', 's3', 's4'], dtype='object'),
Index(['var1', 'var2'], dtype='object'))
>>> outer = ad.concat([a, b], join="outer") # Joining on union of variables
>>> outer
AnnData object with n_obs × n_vars = 4 × 3
obs: 'group', 'measure'
>>> outer.var_names
Index(['var1', 'var2', 'var3'], dtype='object')
>>> outer.to_df() # Sparse arrays are padded with zeroes by default
var1 var2 var3
s1 0 1 0
s2 2 3 0
s3 4 5 6
s4 7 8 9
Using the axis’ index instead of its name
>>> ad.concat([a, b], axis=0).to_df() # Equivalent to axis="obs"
var1 var2
s1 0 1
s2 2 3
s3 4 5
s4 7 8
>>> ad.concat([a, c], axis=1).to_df() # Equivalent to axis="var"
var1 var2 var3 var4
s1 0 1 10 11
s2 2 3 12 13
Keeping track of source objects
>>> ad.concat({"a": a, "b": b}, label="batch").obs
group batch
s1 a a
s2 b a
s3 b b
s4 c b
>>> ad.concat([a, b], label="batch", keys=["a", "b"]).obs # Equivalent to previous
group batch
s1 a a
s2 b a
s3 b b
s4 c b
>>> ad.concat({"a": a, "b": b}, index_unique="-").obs
group
s1-a a
s2-a b
s3-b b
s4-b c
Combining values not aligned to axis of concatenation
>>> ad.concat([a, b], merge="same")
AnnData object with n_obs × n_vars = 4 × 2
obs: 'group'
varm: 'ones'
>>> ad.concat([a, b], merge="unique")
AnnData object with n_obs × n_vars = 4 × 2
obs: 'group'
varm: 'ones', 'zeros'
>>> ad.concat([a, b], merge="first")
AnnData object with n_obs × n_vars = 4 × 2
obs: 'group'
varm: 'ones', 'rand', 'zeros'
>>> ad.concat([a, b], merge="only")
AnnData object with n_obs × n_vars = 4 × 2
obs: 'group'
varm: 'zeros'
The same merge strategies can be used for elements in `.uns`
>>> dict(ad.concat([a, b, c], uns_merge="same").uns)
{'a': 1, 'c': {'c.b': 4}}
>>> dict(ad.concat([a, b, c], uns_merge="unique").uns)
{'a': 1, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
>>> dict(ad.concat([a, b, c], uns_merge="only").uns)
{'c': {'c.c': 5}}
>>> dict(ad.concat([a, b, c], uns_merge="first").uns)
{'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}}
"""
from anndata.experimental.backed._compat import Dataset2D
from anndata.experimental.backed._compat import xarray as xr
# Argument normalization
merge = resolve_merge_strategy(merge)
uns_merge = resolve_merge_strategy(uns_merge)
if isinstance(adatas, Mapping):
if keys is not None:
msg = (
"Cannot specify categories in both mapping keys and using `keys`. "
"Only specify this once."
)
raise TypeError(msg)
keys, adatas = list(adatas.keys()), list(adatas.values())
else:
adatas = list(adatas)
if keys is None:
keys = np.arange(len(adatas)).astype(str)
axis, axis_name = _resolve_axis(axis)
alt_axis, alt_axis_name = _resolve_axis(axis=1 - axis)
# Label column
label_col = pd.Categorical.from_codes(
np.repeat(np.arange(len(adatas)), [a.shape[axis] for a in adatas]),
categories=keys,
)
# Combining indexes
concat_indices = pd.concat(
[pd.Series(axis_indices(a, axis=axis)) for a in adatas], ignore_index=True
)
if index_unique is not None:
concat_indices = concat_indices.str.cat(
_map_cat_to_str(label_col), sep=index_unique
)
concat_indices = pd.Index(concat_indices)
alt_indices = merge_indices(
[axis_indices(a, axis=alt_axis) for a in adatas], join=join
)
reindexers = [
gen_reindexer(alt_indices, axis_indices(a, axis=alt_axis)) for a in adatas
]
# Annotation for concatenation axis
check_combinable_cols([getattr(a, axis_name).columns for a in adatas], join=join)
annotations = [getattr(a, axis_name) for a in adatas]
are_any_annotations_dataframes = any(
isinstance(a, pd.DataFrame) for a in annotations
)
if are_any_annotations_dataframes:
annotations_in_memory = (
to_memory(a) if isinstance(a, Dataset2D) else a for a in annotations
)
concat_annot = pd.concat(
unify_dtypes(annotations_in_memory),
join=join,
ignore_index=True,
)
concat_annot.index = concat_indices
else:
concat_annot = concat_dataset2d_on_annot_axis(annotations, join)
concat_indices.name = DS_CONCAT_DUMMY_INDEX_NAME
if label is not None:
concat_annot[label] = label_col
# Annotation for other axis
alt_annotations = [getattr(a, alt_axis_name) for a in adatas]
are_any_alt_annotations_dataframes = any(
isinstance(a, pd.DataFrame) for a in alt_annotations
)
if are_any_alt_annotations_dataframes:
alt_annotations_in_memory = [
to_memory(a) if isinstance(a, Dataset2D) else a for a in alt_annotations
]
alt_annot = merge_dataframes(alt_annotations_in_memory, alt_indices, merge)
else:
# TODO: figure out mapping of our merge to theirs instead of just taking first, although this appears to be
# the only "lazy" setting so I'm not sure we really want that.
# Because of xarray's merge upcasting, it's safest to simply assume that all dtypes are objects.
annotations_with_only_dask = list(
make_xarray_extension_dtypes_dask(
alt_annotations, use_only_object_dtype=True
)
)
annotations_with_only_dask = [
a.rename({a.attrs["indexing_key"]: "merge_index"})
for a in annotations_with_only_dask
]
alt_annot = Dataset2D(
xr.merge(annotations_with_only_dask, join=join, compat="override"),
attrs={"indexing_key": "merge_index"},
)
X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value)
if join == "inner":
concat_aligned_mapping = inner_concat_aligned_mapping
join_keys = intersect_keys
elif join == "outer":
concat_aligned_mapping = partial(
outer_concat_aligned_mapping, fill_value=fill_value
)
join_keys = union_keys
else:
msg = f"{join=} should have been validated above by pd.concat"
raise AssertionError(msg)
layers = concat_aligned_mapping(
[a.layers for a in adatas], axis=axis, reindexers=reindexers
)
concat_mapping = concat_aligned_mapping(
[getattr(a, f"{axis_name}m") for a in adatas],
axis=axis,
concat_axis=0,
index=concat_indices,
)
if pairwise:
concat_pairwise = concat_pairwise_mapping(
mappings=[getattr(a, f"{axis_name}p") for a in adatas],
shapes=[a.shape[axis] for a in adatas],
join_keys=join_keys,
)
else:
concat_pairwise = {}
# TODO: Reindex lazily, so we don't have to make those copies until we're sure we need the element
alt_mapping = merge(
[
{k: r(v, axis=0) for k, v in getattr(a, f"{alt_axis_name}m").items()}
for r, a in zip(reindexers, adatas)
],
)
alt_pairwise = merge(
[
{
k: r(r(v, axis=0), axis=1)
for k, v in getattr(a, f"{alt_axis_name}p").items()
}
for r, a in zip(reindexers, adatas)
]
)
uns = uns_merge([a.uns for a in adatas])
raw = None
has_raw = [a.raw is not None for a in adatas]
if all(has_raw):
raw = concat(
[
AnnData(
X=a.raw.X,
obs=pd.DataFrame(index=a.obs_names),
var=a.raw.var,
varm=a.raw.varm,
)
for a in adatas
],
join=join,
label=label,
keys=keys,
index_unique=index_unique,
fill_value=fill_value,
axis=axis,
)
elif any(has_raw):
warn(
"Only some AnnData objects have `.raw` attribute, "
"not concatenating `.raw` attributes.",
UserWarning,
)
return AnnData(
**{
"X": X,
"layers": layers,
axis_name: concat_annot,
alt_axis_name: alt_annot,
f"{axis_name}m": concat_mapping,
f"{alt_axis_name}m": alt_mapping,
f"{axis_name}p": concat_pairwise,
f"{alt_axis_name}p": alt_pairwise,
"uns": uns,
"raw": raw,
}
)
python-anndata-0.12.0~rc1/src/anndata/_core/raw.py 0000664 0000000 0000000 00000017233 15003706322 0021756 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from typing import TYPE_CHECKING
import h5py
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from ..compat import CupyArray, CupySparseMatrix
from .aligned_df import _gen_dataframe
from .aligned_mapping import AlignedMappingProperty, AxisArrays
from .index import _normalize_index, _subset, get_vector, unpack_index
from .sparse_dataset import sparse_dataset
if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
from typing import ClassVar
from ..compat import CSMatrix
from .aligned_mapping import AxisArraysView
from .anndata import AnnData
from .sparse_dataset import BaseCompressedSparseDataset
# TODO: Implement views for Raw
class Raw:
is_view: ClassVar = False
def __init__(
self,
adata: AnnData,
X: np.ndarray | CSMatrix | None = None,
var: pd.DataFrame | Mapping[str, Sequence] | None = None,
varm: AxisArrays | Mapping[str, np.ndarray] | None = None,
):
self._adata = adata
self._n_obs = adata.n_obs
# construct manually
if adata.isbacked == (X is None):
# Move from GPU to CPU since it's large and not always used
if isinstance(X, CupyArray | CupySparseMatrix):
self._X = X.get()
else:
self._X = X
n_var = None if self._X is None else self._X.shape[1]
self._var = _gen_dataframe(
var, ["var_names"], source="X", attr="var", length=n_var
)
self.varm = varm
elif X is None: # construct from adata
# Move from GPU to CPU since it's large and not always used
if isinstance(adata.X, CupyArray | CupySparseMatrix):
self._X = adata.X.get()
else:
self._X = adata.X.copy()
self._var = adata.var.copy()
self.varm = adata.varm.copy()
elif adata.isbacked:
msg = "Cannot specify X if adata is backed"
raise ValueError(msg)
def _get_X(self, layer=None):
if layer is not None:
raise ValueError()
return self.X
@property
def X(self) -> BaseCompressedSparseDataset | np.ndarray | CSMatrix:
# TODO: Handle unsorted array of integer indices for h5py.Datasets
if not self._adata.isbacked:
return self._X
if not self._adata.file.is_open:
self._adata.file.open()
# Handle legacy file formats:
if "raw/X" in self._adata.file:
X = self._adata.file["raw/X"]
elif "raw.X" in self._adata.file:
X = self._adata.file["raw.X"] # Backwards compat
else:
msg = (
f"Could not find dataset for raw X in file: "
f"{self._adata.file.filename}."
)
raise AttributeError(msg)
if isinstance(X, h5py.Group):
X = sparse_dataset(X)
# Check if we need to subset
if self._adata.is_view:
# TODO: As noted above, implement views of raw
# so we can know if we need to subset by var
return _subset(X, (self._adata._oidx, slice(None)))
else:
return X
@property
def shape(self) -> tuple[int, int]:
return self.n_obs, self.n_vars
@property
def var(self) -> pd.DataFrame:
return self._var
@property
def n_vars(self) -> int:
return self._var.shape[0]
@property
def n_obs(self) -> int:
return self._n_obs
varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
"varm", AxisArrays, 1
)
@property
def var_names(self) -> pd.Index[str]:
return self.var.index
@property
def obs_names(self) -> pd.Index[str]:
return self._adata.obs_names
def __getitem__(self, index):
oidx, vidx = self._normalize_indices(index)
# To preserve two dimensional shape
if isinstance(vidx, int | np.integer):
vidx = slice(vidx, vidx + 1, 1)
if isinstance(oidx, int | np.integer):
oidx = slice(oidx, oidx + 1, 1)
if not self._adata.isbacked:
X = _subset(self.X, (oidx, vidx))
else:
X = None
var = self._var.iloc[vidx]
new = Raw(self._adata, X=X, var=var)
if self.varm is not None:
# Since there is no view of raws
new.varm = self.varm._view(_RawViewHack(self, vidx), (vidx,)).copy()
return new
def __str__(self) -> str:
descr = f"Raw AnnData with n_obs × n_vars = {self.n_obs} × {self.n_vars}"
for attr in ["var", "varm"]:
keys = getattr(self, attr).keys()
if len(keys) > 0:
descr += f"\n {attr}: {str(list(keys))[1:-1]}"
return descr
def copy(self) -> Raw:
return Raw(
self._adata,
X=self.X.copy(),
var=self.var.copy(),
varm=None if self._varm is None else self._varm.copy(),
)
def to_adata(self) -> AnnData:
"""Create full AnnData object."""
from anndata import AnnData
return AnnData(
X=self.X.copy(),
var=self.var.copy(),
varm=None if self._varm is None else self._varm.copy(),
obs=self._adata.obs.copy(),
obsm=self._adata.obsm.copy(),
obsp=self._adata.obsp.copy(),
uns=self._adata.uns.copy(),
)
def _normalize_indices(self, packed_index):
# deal with slicing with pd.Series
if isinstance(packed_index, pd.Series):
packed_index = packed_index.values
if isinstance(packed_index, tuple):
if len(packed_index) != 2:
raise IndexDimError(len(packed_index))
if isinstance(packed_index[1], pd.Series):
packed_index = packed_index[0], packed_index[1].values
if isinstance(packed_index[0], pd.Series):
packed_index = packed_index[0].values, packed_index[1]
obs, var = unpack_index(packed_index)
obs = _normalize_index(obs, self._adata.obs_names)
var = _normalize_index(var, self.var_names)
return obs, var
def var_vector(self, k: str) -> np.ndarray:
# TODO decorator to copy AnnData.var_vector docstring
return get_vector(self, k, "var", "obs")
def obs_vector(self, k: str) -> np.ndarray:
# TODO decorator to copy AnnData.obs_vector docstring
idx = self._normalize_indices((slice(None), k))
a = self.X[idx]
if issparse(a):
a = a.toarray()
return np.ravel(a)
# This exists to accommodate AlignedMappings,
# until we implement a proper RawView or get rid of Raw in favor of modes.
class _RawViewHack:
def __init__(self, raw: Raw, vidx: slice | np.ndarray):
self.parent_raw = raw
self.vidx = vidx
@property
def shape(self) -> tuple[int, int]:
return self.parent_raw.n_obs, len(self.var_names)
@property
def obs_names(self) -> pd.Index:
return self.parent_raw.obs_names
@property
def var_names(self) -> pd.Index:
return self.parent_raw.var_names[self.vidx]
class IndexDimError(IndexError):
MSG = (
"You tried to slice an AnnData(View) object with an"
"{}-dimensional index, but only 2 dimensions exist in such an object."
)
MSG_1D = (
"\nIf you tried to slice cells using adata[cells, ], "
"note that Python (unlike R) uses adata[cells, :] as slicing syntax."
)
def __init__(self, n_dims: int):
msg = self.MSG.format(n_dims)
if n_dims == 1:
msg += self.MSG_1D
super().__init__(msg)
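# A minimal usage sketch (illustration only, not part of this module): ``.raw``
# freezes a copy of X/var/varm, and slicing it by variable names returns a new
# Raw restricted to those variables for all observations. The ``adata`` object,
# the "highly_variable" column, and the gene names below are hypothetical.
#
#     adata.raw = adata                                  # snapshot before filtering
#     adata = adata[:, adata.var["highly_variable"]]     # filter the working object
#     subset = adata.raw[:, ["gene_a", "gene_b"]].X      # unfiltered values for two genes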
python-anndata-0.12.0~rc1/src/anndata/_core/sparse_dataset.py 0000664 0000000 0000000 00000062357 15003706322 0024176 0 ustar 00root root 0000000 0000000 """\
This module implements on disk sparse datasets.
This code is based on and uses the conventions of h5sparse_ by `Appier Inc.`_.
See the copyright and license note in this directory's source code.
.. _h5sparse: https://github.com/appier/h5sparse
.. _Appier Inc.: https://www.appier.com/
"""
# TODO:
# - think about supporting the COO format
from __future__ import annotations
import warnings
from abc import ABC
from collections.abc import Iterable
from functools import cached_property
from itertools import accumulate, chain, pairwise
from math import floor
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple
import h5py
import numpy as np
import scipy
import scipy.sparse as ss
from packaging.version import Version
from scipy.sparse import _sparsetools
from .. import abc
from .._settings import settings
from ..compat import (
CSArray,
CSMatrix,
H5Group,
ZarrArray,
ZarrGroup,
_read_attr,
is_zarr_v2,
)
from .index import _fix_slice_bounds, _subset, unpack_index
if TYPE_CHECKING:
from collections.abc import Sequence
from typing import Literal
from scipy.sparse._compressed import _cs_matrix
from .._types import GroupStorageType
from ..compat import H5Array
from .index import Index, Index1D
else:
from scipy.sparse import spmatrix as _cs_matrix
SCIPY_1_15 = Version(scipy.__version__) >= Version("1.15rc0")
class BackedFormat(NamedTuple):
format: Literal["csr", "csc"]
backed_type: type[BackedSparseMatrix]
memory_type: type[_cs_matrix]
class BackedSparseMatrix(_cs_matrix):
"""\
Mixin class for backed sparse matrices.
Largely needed for the case `backed_sparse_csr(...)[:]`,
since that calls copy on `.data`, `.indices`, and `.indptr`.
"""
data: GroupStorageType
indices: GroupStorageType
indptr: np.ndarray
def copy(self) -> CSMatrix:
if isinstance(self.data, h5py.Dataset):
return sparse_dataset(self.data.parent).to_memory()
if isinstance(self.data, ZarrArray):
import zarr
if is_zarr_v2():
sparse_group = zarr.open(
store=self.data.store,
mode="r",
chunk_store=self.data.chunk_store, # chunk_store is needed, not clear why
)[Path(self.data.path).parent]
else:
anndata_group = zarr.open_group(store=self.data.store, mode="r")
sparse_group = anndata_group[
str(
Path(str(self.data.store_path))
.relative_to(str(anndata_group.store_path))
.parent
)
]
return sparse_dataset(sparse_group).to_memory()
return super().copy()
def _set_many(self, i: Iterable[int], j: Iterable[int], x):
"""\
Sets value at each (i, j) to x
Here (i,j) index major and minor respectively,
and must not contain duplicate entries.
"""
# Scipy 1.3+ compat
n_samples = 1 if np.isscalar(x) else len(x)
offsets = self._offsets(i, j, n_samples)
if -1 not in offsets:
# make a list for interaction with h5py
offsets = list(offsets)
# only affects existing non-zero cells
self.data[offsets] = x
return
else:
msg = "You cannot change the sparsity structure of a SparseDataset."
raise ValueError(msg)
# replace where possible
# mask = offsets > -1
# # offsets[mask]
# bool_data_mask = np.zeros(len(self.data), dtype=bool)
# bool_data_mask[offsets[mask]] = True
# self.data[bool_data_mask] = x[mask]
# # self.data[offsets[mask]] = x[mask]
# # only insertions remain
# mask = ~mask
# i = i[mask]
# i[i < 0] += M
# j = j[mask]
# j[j < 0] += N
# self._insert_many(i, j, x[mask])
def _zero_many(self, i: Sequence[int], j: Sequence[int]):
"""\
Sets value at each (i, j) to zero, preserving sparsity structure.
Here (i,j) index major and minor respectively.
"""
offsets = self._offsets(i, j, len(i))
# only assign zeros to the existing sparsity structure
self.data[list(offsets[offsets > -1])] = 0
def _offsets(
self, i: Iterable[int], j: Iterable[int], n_samples: int
) -> np.ndarray:
i, j, M, N = self._prepare_indices(i, j)
offsets = np.empty(n_samples, dtype=self.indices.dtype)
ret = _sparsetools.csr_sample_offsets(
M, N, self.indptr, self.indices, n_samples, i, j, offsets
)
if ret == 1:
# rinse and repeat
self.sum_duplicates()
_sparsetools.csr_sample_offsets(
M, N, self.indptr, self.indices, n_samples, i, j, offsets
)
return offsets
def _get_contiguous_compressed_slice(
self, s: slice
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
new_indptr = self.indptr[s.start : s.stop + 1].copy()
start = new_indptr[0]
stop = new_indptr[-1]
new_indptr -= start
new_data = self.data[start:stop]
new_indices = self.indices[start:stop]
return new_data, new_indices, new_indptr
class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix):
def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix:
return ss.csr_matrix(
get_compressed_vector(self, row), shape=(1, self.shape[1])
)[:, col]
def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix:
row = _fix_slice_bounds(row, self.shape[0])
col = _fix_slice_bounds(col, self.shape[1])
out_shape = (
slice_len(row, self.shape[0]),
slice_len(col, self.shape[1]),
)
if out_shape[0] == 1:
return self._get_intXslice(slice_as_int(row, self.shape[0]), col)
if row.step != 1:
return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col)
res = ss.csr_matrix(
self._get_contiguous_compressed_slice(row),
shape=(out_shape[0], self.shape[1]),
)
return res if out_shape[1] == self.shape[1] else res[:, col]
def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix:
idxs = np.asarray(row)
if len(idxs) == 0:
return ss.csr_matrix((0, self.shape[1]))
if idxs.dtype == bool:
idxs = np.where(idxs)
return ss.csr_matrix(
get_compressed_vectors(self, idxs), shape=(len(idxs), self.shape[1])
)[:, col]
class backed_csc_matrix(BackedSparseMatrix, ss.csc_matrix):
def _get_sliceXint(self, row: slice, col: int) -> ss.csc_matrix:
return ss.csc_matrix(
get_compressed_vector(self, col), shape=(self.shape[0], 1)
)[row, :]
def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix:
row = _fix_slice_bounds(row, self.shape[0])
col = _fix_slice_bounds(col, self.shape[1])
out_shape = (
slice_len(row, self.shape[0]),
slice_len(col, self.shape[1]),
)
if out_shape[1] == 1:
return self._get_sliceXint(row, slice_as_int(col, self.shape[1]))
if col.step != 1:
return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1])))
res = ss.csc_matrix(
self._get_contiguous_compressed_slice(col),
shape=(self.shape[0], out_shape[1]),
)
return res if out_shape[0] == self.shape[0] else res[row, :]
def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
idxs = np.asarray(col)
if len(idxs) == 0:
return ss.csc_matrix((self.shape[0], 0))
if idxs.dtype == bool:
idxs = np.where(idxs)
return ss.csc_matrix(
get_compressed_vectors(self, idxs), shape=(self.shape[0], len(idxs))
)[row, :]
FORMATS = [
BackedFormat("csr", backed_csr_matrix, ss.csr_matrix),
BackedFormat("csc", backed_csc_matrix, ss.csc_matrix),
BackedFormat("csr", backed_csr_matrix, ss.csr_array),
BackedFormat("csc", backed_csc_matrix, ss.csc_array),
]
def slice_len(s: slice, l: int) -> int:
"""Returns length of `a[s]` where `len(a) == l`."""
return len(range(*s.indices(l)))
def slice_as_int(s: slice, l: int) -> int:
"""Converts slices of length 1 to the integer index they’ll access."""
out = list(range(*s.indices(l)))
assert len(out) == 1
return out[0]
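# A minimal sketch (illustration only, not used by the library): how the two
# slice helpers behave for a container of length 10.
#
#     slice_len(slice(2, 8, 2), 10)   # -> 3, i.e. len(range(2, 8, 2))
#     slice_as_int(slice(4, 5), 10)   # -> 4, the single selected index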
def get_compressed_vectors(
x: BackedSparseMatrix, row_idxs: Iterable[int]
) -> tuple[Sequence, Sequence, Sequence]:
indptr_slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs]
# HDF5 cannot handle out-of-order integer indexing
if isinstance(x.data, ZarrArray):
as_np_indptr = np.concatenate(
[np.arange(s.start, s.stop) for s in indptr_slices]
)
data = x.data[as_np_indptr]
indices = x.indices[as_np_indptr]
else:
data = np.concatenate([x.data[s] for s in indptr_slices])
indices = np.concatenate([x.indices[s] for s in indptr_slices])
indptr = list(accumulate(chain((0,), (s.stop - s.start for s in indptr_slices))))
return data, indices, indptr
def get_compressed_vectors_for_slices(
x: BackedSparseMatrix, slices: Iterable[slice]
) -> tuple[Sequence, Sequence, Sequence]:
indptr_indices = [x.indptr[slice(s.start, s.stop + 1)] for s in slices]
indptr_limits = [slice(i[0], i[-1]) for i in indptr_indices]
# HDF5 cannot handle out-of-order integer indexing
if isinstance(x.data, ZarrArray):
indptr_int = np.concatenate([np.arange(s.start, s.stop) for s in indptr_limits])
data = x.data[indptr_int]
indices = x.indices[indptr_int]
else:
data = np.concatenate([x.data[s] for s in indptr_limits])
indices = np.concatenate([x.indices[s] for s in indptr_limits])
# Need to track the size of the gaps in the slices to each indptr subselection
gaps = (s1.start - s0.stop for s0, s1 in pairwise(indptr_limits))
offsets = accumulate(chain([indptr_limits[0].start], gaps))
start_indptr = indptr_indices[0] - next(offsets)
if len(slices) < 2: # there is only one slice so no need to concatenate
return data, indices, start_indptr
end_indptr = np.concatenate(
[s[1:] - o for s, o in zip(indptr_indices[1:], offsets)]
)
indptr = np.concatenate([start_indptr, end_indptr])
return data, indices, indptr
def get_compressed_vector(
x: BackedSparseMatrix, idx: int
) -> tuple[Sequence, Sequence, Sequence]:
s = slice(*(x.indptr[idx : idx + 2]))
data = x.data[s]
indices = x.indices[s]
indptr = [0, len(data)]
return data, indices, indptr
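# A minimal sketch (illustration only, not used by the library): in CSR layout,
# row ``i`` occupies ``data[indptr[i]:indptr[i + 1]]`` (same for ``indices``),
# which is exactly the slice the helpers above read from disk.
#
#     import numpy as np
#     from scipy import sparse
#     m = sparse.csr_matrix(np.array([[0, 1], [2, 0], [0, 3]]))
#     s = slice(m.indptr[1], m.indptr[2])
#     m.data[s], m.indices[s]   # row 1 -> value [2] at column [0]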
def subset_by_major_axis_mask(
mtx: _cs_matrix, mask: np.ndarray
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
slices = np.ma.extras._ezclump(mask)
def mean_slice_length(slices):
return floor(sum(s.stop - s.start for s in slices) / len(slices))
# heuristic for whether slicing should be optimized
if len(slices) > 0:
if mean_slice_length(slices) <= 7:
return get_compressed_vectors(mtx, np.where(mask)[0])
else:
return get_compressed_vectors_for_slices(mtx, slices)
return [], [], [0]
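# A minimal sketch (illustration only, not used by the library): the boolean
# mask is clumped into contiguous slices first; long runs are read as slices,
# while very short ones fall back to per-row gathering.
#
#     import numpy as np
#     mask = np.array([False, True, True, True, False, True])
#     np.ma.extras._ezclump(mask)   # -> [slice(1, 4, None), slice(5, 6, None)]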
def get_memory_class(
format: Literal["csr", "csc"], *, use_sparray_in_io: bool = False
) -> type[_cs_matrix]:
for fmt, _, memory_class in FORMATS:
if format == fmt:
if use_sparray_in_io and issubclass(memory_class, CSArray):
return memory_class
elif not use_sparray_in_io and issubclass(memory_class, CSMatrix):
return memory_class
msg = f"Format string {format} is not supported."
raise ValueError(msg)
def get_backed_class(
format: Literal["csr", "csc"], *, use_sparray_in_io: bool = False
) -> type[BackedSparseMatrix]:
for fmt, backed_class, _ in FORMATS:
if format == fmt:
if use_sparray_in_io and issubclass(backed_class, CSArray):
return backed_class
elif not use_sparray_in_io and issubclass(backed_class, CSMatrix):
return backed_class
msg = f"Format string {format} is not supported."
raise ValueError(msg)
def _get_group_format(group: GroupStorageType) -> str:
if "h5sparse_format" in group.attrs:
# TODO: Warn about an old format
# If this is only just going to be public, I could insist it's not like this
return _read_attr(group.attrs, "h5sparse_format")
else:
# Should this be an extra field?
return _read_attr(group.attrs, "encoding-type").replace("_matrix", "")
# Check for the overridden few methods above in our BackedSparseMatrix subclasses
def is_sparse_indexing_overridden(
format: Literal["csr", "csc"], row: Index1D, col: Index1D
):
major_indexer, minor_indexer = (row, col) if format == "csr" else (col, row)
return isinstance(minor_indexer, slice) and (
(isinstance(major_indexer, int | np.integer))
or (isinstance(major_indexer, slice))
or (isinstance(major_indexer, np.ndarray) and major_indexer.ndim == 1)
)
def validate_indices(
mtx: BackedSparseMatrix, indices: tuple[Index1D, Index1D]
) -> tuple[Index1D, Index1D]:
res = mtx._validate_indices(indices)
return res[0] if SCIPY_1_15 else res
class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC):
_group: GroupStorageType
def __init__(self, group: GroupStorageType):
type(self)._check_group_format(group)
self._group = group
@property
def group(self) -> GroupStorageType:
"""The group underlying the backed matrix."""
return self._group
@group.setter
def group(self, val):
msg = f"Do not reset group on a {type(self)} with {val}. Instead use `sparse_dataset` to make a new class."
raise AttributeError(msg)
@property
def backend(self) -> Literal["zarr", "hdf5"]:
"""Which file type is used on-disk."""
if isinstance(self.group, ZarrGroup):
return "zarr"
elif isinstance(self.group, H5Group):
return "hdf5"
else:
msg = f"Unknown group type {type(self.group)}"
raise ValueError(msg)
@property
def dtype(self) -> np.dtype:
"""The :class:`numpy.dtype` of the `data` attribute of the sparse matrix."""
return self._data.dtype
@classmethod
def _check_group_format(cls, group):
group_format = _get_group_format(group)
assert group_format == cls.format
@property
def _name(self) -> str:
"""Name of the group."""
return self.group.name
@property
def shape(self) -> tuple[int, int]:
"""Shape of the matrix read off disk."""
shape = _read_attr(self.group.attrs, "shape", None)
if shape is None:
# TODO warn
shape = self.group.attrs.get("h5sparse_shape")
return tuple(map(int, shape))
def __repr__(self) -> str:
name = type(self).__name__.removeprefix("_")
return f"{name}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}"
def __getitem__(self, index: Index | tuple[()]) -> float | CSMatrix | CSArray:
indices = self._normalize_index(index)
row, col = indices
mtx = self._to_backed()
row_sp_matrix_validated, col_sp_matrix_validated = validate_indices(
mtx, indices
)
# Handle masked indexing along major axis
if self.format == "csr" and np.array(row).dtype == bool:
sub = ss.csr_matrix(
subset_by_major_axis_mask(mtx, row), shape=(row.sum(), mtx.shape[1])
)[:, col]
elif self.format == "csc" and np.array(col).dtype == bool:
sub = ss.csc_matrix(
subset_by_major_axis_mask(mtx, col), shape=(mtx.shape[0], col.sum())
)[row, :]
# read into memory data if we do not override access methods
elif not is_sparse_indexing_overridden(
self.format, row_sp_matrix_validated, col_sp_matrix_validated
):
sub = self.to_memory()[row_sp_matrix_validated, col_sp_matrix_validated]
else:
sub = mtx[row, col]
# If indexing is array x array it returns a backed_sparse_matrix
# Not sure what the performance is on that operation
# Also need to check if memory format is not matrix
mtx_fmt = get_memory_class(
self.format, use_sparray_in_io=settings.use_sparse_array_on_read
)
must_convert_to_array = issubclass(mtx_fmt, CSArray) and not isinstance(
sub, CSArray
)
if isinstance(sub, BackedSparseMatrix) or must_convert_to_array:
return mtx_fmt(sub)
else:
return sub
def _normalize_index(
self, index: Index | tuple[()]
) -> tuple[np.ndarray, np.ndarray]:
if isinstance(index, tuple) and not len(index):
index = slice(None)
row, col = unpack_index(index)
if all(isinstance(x, Iterable) for x in (row, col)):
row, col = np.ix_(row, col)
return row, col
def __setitem__(self, index: Index | tuple[()], value) -> None:
warnings.warn(
"__setitem__ for backed sparse will be removed in the next anndata release.",
FutureWarning,
)
row, col = self._normalize_index(index)
mock_matrix = self._to_backed()
mock_matrix[row, col] = value
# TODO: split to other classes?
def append(self, sparse_matrix: CSMatrix | CSArray) -> None:
"""Append an in-memory or on-disk sparse matrix to the current object's store.
Parameters
----------
sparse_matrix
The matrix to append.
Raises
------
NotImplementedError
If the matrix to append is not one of :class:`~scipy.sparse.csr_array`, :class:`~scipy.sparse.csc_array`, :class:`~scipy.sparse.csr_matrix`, or :class:`~scipy.sparse.csc_matrix`.
ValueError
If the on-disk and to-append matrices are not of the same format, i.e., `csr` or `csc`.
OverflowError
If the underlying data store has a 32 bit indptr and the new matrix is too large to fit in it, i.e., appending would require writing a 64 bit `indptr`.
AssertionError
If the on-disk data does not have `csc` or `csr` format.
"""
# Prep variables
shape = self.shape
if isinstance(sparse_matrix, BaseCompressedSparseDataset):
sparse_matrix = sparse_matrix._to_backed()
# Check input
if not ss.issparse(sparse_matrix):
msg = (
"Currently, only sparse matrices of equivalent format can be "
"appended to a SparseDataset."
)
raise NotImplementedError(msg)
if self.format not in {"csr", "csc"}:
msg = f"The append method for format {self.format} is not implemented."
raise NotImplementedError(msg)
if self.format != sparse_matrix.format:
msg = (
f"Matrices must have same format. Currently are "
f"{self.format!r} and {sparse_matrix.format!r}"
)
raise ValueError(msg)
[indptr_offset] = self.group["indices"].shape
if self.group["indptr"].dtype == np.int32:
new_nnz = indptr_offset + sparse_matrix.indices.shape[0]
if new_nnz >= np.iinfo(np.int32).max:
msg = (
"This array was written with a 32 bit intptr, but is now large "
"enough to require 64 bit values. Please recreate the array with "
"a 64 bit indptr."
)
raise OverflowError(msg)
# shape
if self.format == "csr":
assert shape[1] == sparse_matrix.shape[1], (
"CSR matrices must have same size of dimension 1 to be appended."
)
new_shape = (shape[0] + sparse_matrix.shape[0], shape[1])
elif self.format == "csc":
assert shape[0] == sparse_matrix.shape[0], (
"CSC matrices must have same size of dimension 0 to be appended."
)
new_shape = (shape[0], shape[1] + sparse_matrix.shape[1])
else:
msg = "We forgot to update this branching to a new format"
raise AssertionError(msg)
if "h5sparse_shape" in self.group.attrs:
del self.group.attrs["h5sparse_shape"]
self.group.attrs["shape"] = new_shape
# data
data = self.group["data"]
orig_data_size = data.shape[0]
data.resize((orig_data_size + sparse_matrix.data.shape[0],))
# see https://github.com/zarr-developers/zarr-python/discussions/2712 for why we need to read first
append_data = sparse_matrix.data
append_indices = sparse_matrix.indices
if isinstance(sparse_matrix.data, ZarrArray) and not is_zarr_v2():
data[orig_data_size:] = append_data[...]
else:
data[orig_data_size:] = append_data
# indptr
indptr = self.group["indptr"]
orig_data_size = indptr.shape[0]
indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,))
indptr[orig_data_size:] = (
sparse_matrix.indptr[1:].astype(np.int64) + indptr_offset
)
# indices
if isinstance(sparse_matrix.data, ZarrArray) and not is_zarr_v2():
append_indices = append_indices[...]
indices = self.group["indices"]
orig_data_size = indices.shape[0]
indices.resize((orig_data_size + sparse_matrix.indices.shape[0],))
indices[orig_data_size:] = append_indices
# Clear cached property
for attr in ["_indptr", "_indices", "_data"]:
if hasattr(self, attr):
delattr(self, attr)
@cached_property
def _indptr(self) -> np.ndarray:
"""\
Unlike `data` and `indices`, this is only as long as the major axis.
It should therefore fit into memory, so we cache it for faster access.
"""
arr = self.group["indptr"][...]
return arr
@cached_property
def _indices(self) -> H5Array | ZarrArray:
"""\
Cache access to the indices to prevent unnecessary reads of the zarray
"""
return self.group["indices"]
@cached_property
def _data(self) -> H5Array | ZarrArray:
"""\
Cache access to the data to prevent unnecessary reads of the zarray
"""
return self.group["data"]
def _to_backed(self) -> BackedSparseMatrix:
format_class = get_backed_class(self.format)
mtx = format_class(self.shape, dtype=self.dtype)
mtx.data = self._data
mtx.indices = self._indices
mtx.indptr = self._indptr
return mtx
def to_memory(self) -> CSMatrix | CSArray:
format_class = get_memory_class(
self.format, use_sparray_in_io=settings.use_sparse_array_on_read
)
mtx = format_class(self.shape, dtype=self.dtype)
mtx.data = self._data[...]
mtx.indices = self._indices[...]
mtx.indptr = self._indptr
return mtx
class _CSRDataset(BaseCompressedSparseDataset, abc.CSRDataset):
"""Internal concrete version of :class:`anndata.abc.CSRDataset`."""
class _CSCDataset(BaseCompressedSparseDataset, abc.CSCDataset):
"""Internal concrete version of :class:`anndata.abc.CSRDataset`."""
def sparse_dataset(group: GroupStorageType) -> abc.CSRDataset | abc.CSCDataset:
"""Generates a backed mode-compatible sparse dataset class.
Parameters
----------
group
The backing group store.
Returns
-------
The sparse dataset.
Example
-------
First we'll need a stored dataset:
>>> import scanpy as sc
>>> import h5py
>>> from anndata.io import sparse_dataset
>>> from anndata.io import read_elem
>>> sc.datasets.pbmc68k_reduced().raw.to_adata().write_h5ad("pbmc.h5ad")
Initialize a sparse dataset from storage
>>> f = h5py.File("pbmc.h5ad")
>>> X = sparse_dataset(f["X"])
>>> X
CSRDataset: backend hdf5, shape (700, 765), data_dtype float32
Indexing returns sparse matrices
>>> X[100:200] # doctest: +ELLIPSIS
<...sparse matrix of...float32...with 25003 stored elements...>
These can also be used inside of an AnnData object, no need for backed mode
>>> from anndata import AnnData
>>> adata = AnnData(
... layers={"backed": X}, obs=read_elem(f["obs"]), var=read_elem(f["var"])
... )
>>> adata.layers["backed"]
CSRDataset: backend hdf5, shape (700, 765), data_dtype float32
Indexing access (i.e., from views) brings selection into memory
>>> adata[adata.obs["bulk_labels"] == "CD56+ NK"].layers[
... "backed"
... ] # doctest: +ELLIPSIS
<...sparse matrix of...float32...with 7340 stored elements...>
"""
encoding_type = _get_group_format(group)
if encoding_type == "csr":
return _CSRDataset(group)
elif encoding_type == "csc":
return _CSCDataset(group)
msg = f"Unknown encoding type {encoding_type}"
raise ValueError(msg)
@_subset.register(BaseCompressedSparseDataset)
def subset_sparsedataset(d, subset_idx):
return d[subset_idx]
python-anndata-0.12.0~rc1/src/anndata/_core/storage.py 0000664 0000000 0000000 00000005143 15003706322 0022626 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import warnings
from typing import TYPE_CHECKING, get_args
import numpy as np
import pandas as pd
from scipy import sparse
from anndata.compat import CSArray, CSMatrix
from .._warnings import ImplicitModificationWarning
from ..utils import (
ensure_df_homogeneous,
join_english,
raise_value_error_if_multiindex_columns,
)
if TYPE_CHECKING:
from typing import Any
def coerce_array(
value: Any,
*,
name: str,
allow_df: bool = False,
allow_array_like: bool = False,
):
try:
from anndata.experimental.backed._compat import Dataset2D
except ImportError:
class Dataset2D:
@staticmethod
def __repr__():
return "mock anndata.experimental.backed._xarray."
"""Coerce arrays stored in layers/X, and aligned arrays ({obs,var}{m,p})."""
from ..typing import ArrayDataStructureTypes
# If value is a scalar and we allow that, return it
if allow_array_like and np.isscalar(value):
return value
# If value is one of the allowed types, return it
array_data_structure_types = get_args(ArrayDataStructureTypes)
if isinstance(value, (*array_data_structure_types, Dataset2D)):
if isinstance(value, np.matrix):
msg = f"{name} should not be a np.matrix, use np.ndarray instead."
warnings.warn(msg, ImplicitModificationWarning)
value = value.A
return value
is_non_csc_r_array_or_matrix = (
(isinstance(value, base) and not isinstance(value, csr_c_format))
for base, csr_c_format in [
(sparse.spmatrix, CSMatrix),
(sparse.sparray, CSArray),
]
)
if any(is_non_csc_r_array_or_matrix):
msg = f"Only CSR and CSC {'matrices' if isinstance(value, sparse.spmatrix) else 'arrays'} are supported."
raise ValueError(msg)
if isinstance(value, pd.DataFrame):
if allow_df:
raise_value_error_if_multiindex_columns(value, name)
return value if allow_df else ensure_df_homogeneous(value, name)
# if value is an array-like object, try to convert it
e = None
if allow_array_like:
try:
# TODO: asarray? asanyarray?
return np.array(value)
except (ValueError, TypeError) as _e:
e = _e
# if value isn’t the right type or convertible, raise an error
msg = f"{name} needs to be of one of {join_english(map(str, array_data_structure_types))}, not {type(value)}."
if e is not None:
msg += " (Failed to convert it to an array, see above for details.)"
raise ValueError(msg) from e
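# A minimal usage sketch (illustration only, not part of this module):
#
#     import numpy as np
#     coerce_array(np.eye(2), name="X")                                 # returned unchanged
#     coerce_array([[0, 1], [2, 3]], name="X", allow_array_like=True)   # converted to np.ndarray
#     coerce_array(np.matrix([[1, 2], [3, 4]]), name="X")               # warns, returns np.ndarray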
python-anndata-0.12.0~rc1/src/anndata/_core/views.py 0000664 0000000 0000000 00000034232 15003706322 0022320 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import warnings
from contextlib import contextmanager
from copy import deepcopy
from functools import reduce, singledispatch, wraps
from typing import TYPE_CHECKING, Literal
import numpy as np
import pandas as pd
from pandas.api.types import is_bool_dtype
from scipy import sparse
from anndata._warnings import ImplicitModificationWarning
from .._settings import settings
from ..compat import (
AwkArray,
CupyArray,
CupyCSCMatrix,
CupyCSRMatrix,
DaskArray,
ZappyArray,
)
from .access import ElementRef
if TYPE_CHECKING:
from collections.abc import Callable, Iterable, KeysView, Sequence
from typing import Any
from anndata import AnnData
@contextmanager
def view_update(adata_view: AnnData, attr_name: str, keys: tuple[str, ...]):
"""Context manager for updating a view of an AnnData object.
Contains logic for "actualizing" a view. Yields the object to be modified in-place.
Parameters
----------
adata_view
A view of an AnnData
attr_name
Name of the attribute being updated
keys
Keys to the attribute being updated
Yields
------
`adata.attr[key1][key2][keyn]...`
"""
new = adata_view.copy()
attr = getattr(new, attr_name)
container = reduce(lambda d, k: d[k], keys, attr)
yield container
adata_view._init_as_actual(new)
class _SetItemMixin:
"""\
Class which (when values are being set) lets their parent AnnData view know,
so it can make a copy of itself.
This implements copy-on-modify semantics for views of AnnData objects.
"""
_view_args: ElementRef | None
def __setitem__(self, idx: Any, value: Any):
if self._view_args is None:
super().__setitem__(idx, value)
else:
warnings.warn(
f"Trying to modify attribute `.{self._view_args.attrname}` of view, "
"initializing view as actual.",
ImplicitModificationWarning,
stacklevel=2,
)
with view_update(*self._view_args) as container:
container[idx] = value
class _ViewMixin(_SetItemMixin):
def __init__(
self,
*args,
view_args: tuple[AnnData, str, tuple[str, ...]] = None,
**kwargs,
):
if view_args is not None:
view_args = ElementRef(*view_args)
self._view_args = view_args
super().__init__(*args, **kwargs)
# TODO: This makes `deepcopy(obj)` return `obj._view_args.parent._adata_ref`, fix it
def __deepcopy__(self, memo):
parent, attrname, keys = self._view_args
return deepcopy(getattr(parent._adata_ref, attrname))
_UFuncMethod = Literal["__call__", "reduce", "reduceat", "accumulate", "outer", "inner"]
class ArrayView(_SetItemMixin, np.ndarray):
def __new__(
cls,
input_array: Sequence[Any],
view_args: tuple[AnnData, str, tuple[str, ...]] = None,
):
arr = np.asanyarray(input_array).view(cls)
if view_args is not None:
view_args = ElementRef(*view_args)
arr._view_args = view_args
return arr
def __array_finalize__(self, obj: np.ndarray | None):
if obj is not None:
self._view_args = getattr(obj, "_view_args", None)
def __array_ufunc__(
self: ArrayView,
ufunc: Callable[..., Any],
method: _UFuncMethod,
*inputs,
out: tuple[np.ndarray, ...] | None = None,
**kwargs,
) -> np.ndarray:
"""Makes numpy ufuncs convert all instances of views to plain arrays.
See https://numpy.org/devdocs/user/basics.subclassing.html#array-ufunc-for-ufuncs
"""
def convert_all(arrs: Iterable[np.ndarray]) -> Iterable[np.ndarray]:
return (
arr.view(np.ndarray) if isinstance(arr, ArrayView) else arr
for arr in arrs
)
if out is None:
outputs = (None,) * ufunc.nout
else:
out = outputs = tuple(convert_all(out))
results = super().__array_ufunc__(
ufunc, method, *convert_all(inputs), out=out, **kwargs
)
if results is NotImplemented:
return NotImplemented
if ufunc.nout == 1:
results = (results,)
results = tuple(
(np.asarray(result) if output is None else output)
for result, output in zip(results, outputs)
)
return results[0] if len(results) == 1 else results
def keys(self) -> KeysView[str]:
# it’s a structured array
return self.dtype.names
def copy(self, order: str = "C") -> np.ndarray:
# we want a conventional array
return np.array(self)
def toarray(self) -> np.ndarray:
return self.copy()
# Extends DaskArray
# Calls parent __new__ constructor since
# even calling astype on a dask array
# needs a .compute() call to actually happen.
# So no construction by view casting like ArrayView
class DaskArrayView(_SetItemMixin, DaskArray):
def __new__(
cls,
input_array: DaskArray,
view_args: tuple[AnnData, str, tuple[str, ...]] = None,
):
arr = super().__new__(
cls,
dask=input_array.dask,
name=input_array.name,
chunks=input_array.chunks,
dtype=input_array.dtype,
meta=input_array._meta,
shape=input_array.shape,
)
if view_args is not None:
view_args = ElementRef(*view_args)
arr._view_args = view_args
return arr
def __array_finalize__(self, obj: DaskArray | None):
if obj is not None:
self._view_args = getattr(obj, "_view_args", None)
def keys(self) -> KeysView[str]:
# it’s a structured array
return self.dtype.names
# Unlike array views, SparseCSRMatrixView and SparseCSCMatrixView
# do not propagate through subsetting
class SparseCSRMatrixView(_ViewMixin, sparse.csr_matrix):
# https://github.com/scverse/anndata/issues/656
def copy(self) -> sparse.csr_matrix:
return sparse.csr_matrix(self).copy()
class SparseCSCMatrixView(_ViewMixin, sparse.csc_matrix):
# https://github.com/scverse/anndata/issues/656
def copy(self) -> sparse.csc_matrix:
return sparse.csc_matrix(self).copy()
class SparseCSRArrayView(_ViewMixin, sparse.csr_array):
# https://github.com/scverse/anndata/issues/656
def copy(self) -> sparse.csr_array:
return sparse.csr_array(self).copy()
class SparseCSCArrayView(_ViewMixin, sparse.csc_array):
# https://github.com/scverse/anndata/issues/656
def copy(self) -> sparse.csc_array:
return sparse.csc_array(self).copy()
class CupySparseCSRView(_ViewMixin, CupyCSRMatrix):
def copy(self) -> CupyCSRMatrix:
return CupyCSRMatrix(self).copy()
class CupySparseCSCView(_ViewMixin, CupyCSCMatrix):
def copy(self) -> CupyCSCMatrix:
return CupyCSCMatrix(self).copy()
class CupyArrayView(_ViewMixin, CupyArray):
def __new__(
cls,
input_array: Sequence[Any],
view_args: tuple[AnnData, str, tuple[str, ...]] = None,
):
import cupy as cp
arr = cp.asarray(input_array).view(type=cls)
if view_args is not None:
view_args = ElementRef(*view_args)
arr._view_args = view_args
return arr
def copy(self) -> CupyArray:
import cupy as cp
return cp.array(self).copy()
class DictView(_ViewMixin, dict):
pass
class DataFrameView(_ViewMixin, pd.DataFrame):
_metadata = ["_view_args"]
@wraps(pd.DataFrame.drop)
def drop(self, *args, inplace: bool = False, **kw):
if not inplace:
return self.copy().drop(*args, **kw)
with view_update(*self._view_args) as df:
df.drop(*args, inplace=True, **kw)
def __setattr__(self, key: str, value: Any):
if key == "index":
warnings.warn(
f"Trying to modify {key} of attribute `.{self._view_args.attrname}` of view, "
"initializing view as actual.",
ImplicitModificationWarning,
stacklevel=2,
)
with view_update(*self._view_args) as container:
setattr(container, key, value)
else:
super().__setattr__(key, value)
@singledispatch
def as_view(obj, view_args):
msg = f"No view type has been registered for {type(obj)}"
raise NotImplementedError(msg)
@as_view.register(np.ndarray)
def as_view_array(array, view_args):
return ArrayView(array, view_args=view_args)
@as_view.register(DaskArray)
def as_view_dask_array(array, view_args):
return DaskArrayView(array, view_args=view_args)
@as_view.register(pd.DataFrame)
def as_view_df(df, view_args):
if settings.remove_unused_categories:
for col in df.columns:
if isinstance(df[col].dtype, pd.CategoricalDtype):
with pd.option_context("mode.chained_assignment", None):
df[col] = df[col].cat.remove_unused_categories()
return DataFrameView(df, view_args=view_args)
@as_view.register(sparse.csr_matrix)
def as_view_csr_matrix(mtx, view_args):
return SparseCSRMatrixView(mtx, view_args=view_args)
@as_view.register(sparse.csc_matrix)
def as_view_csc_matrix(mtx, view_args):
return SparseCSCMatrixView(mtx, view_args=view_args)
@as_view.register(sparse.csr_array)
def as_view_csr_array(mtx, view_args):
return SparseCSRArrayView(mtx, view_args=view_args)
@as_view.register(sparse.csc_array)
def as_view_csc_array(mtx, view_args):
return SparseCSCArrayView(mtx, view_args=view_args)
@as_view.register(dict)
def as_view_dict(d, view_args):
return DictView(d, view_args=view_args)
@as_view.register(ZappyArray)
def as_view_zappy(z, view_args):
# Previous code says ZappyArray works as view,
# but as far as I can tell they’re immutable.
return z
@as_view.register(CupyArray)
def as_view_cupy(array, view_args):
return CupyArrayView(array, view_args=view_args)
@as_view.register(CupyCSRMatrix)
def as_view_cupy_csr(mtx, view_args):
return CupySparseCSRView(mtx, view_args=view_args)
@as_view.register(CupyCSCMatrix)
def as_view_cupy_csc(mtx, view_args):
return CupySparseCSCView(mtx, view_args=view_args)
try:
import weakref
from ..compat import awkward as ak
# Registry to store weak references from AwkwardArrayViews to their parent AnnData container
_registry = weakref.WeakValueDictionary()
_PARAM_NAME = "_view_args"
class AwkwardArrayView(_ViewMixin, AwkArray):
@property
def _view_args(self):
"""Override _view_args to retrieve the values from awkward arrays parameters.
Awkward arrays cannot be subclassed like other python objects. Instead subclasses need
to be attached as "behavior". These "behaviors" cannot take any additional parameters (as we do
for other data types to store `_view_args`). Therefore, we need to store `_view_args` using awkward's
parameter mechanism. These parameters need to be json-serializable, which is why we can't store
ElementRef directly, but need to replace the reference to the parent AnnDataView container with a weak
reference.
"""
parent_key, attrname, keys = self.layout.parameter(_PARAM_NAME)
parent = _registry[parent_key]
return ElementRef(parent, attrname, keys)
def __copy__(self) -> AwkArray:
"""
Turn the AwkwardArrayView into an actual AwkwardArray with no special behavior.
Need to override __copy__ instead of `.copy()` as awkward arrays don't implement `.copy()`
and are copied using python's standard copy mechanism in `aligned_mapping.py`.
"""
array = self
# makes a shallow copy and removes the reference to the original AnnData object
array = ak.with_parameter(self, _PARAM_NAME, None)
array = ak.with_parameter(array, "__list__", None)
return array
@as_view.register(AwkArray)
def as_view_awkarray(array, view_args):
parent, attrname, keys = view_args
parent_key = f"target-{id(parent)}"
_registry[parent_key] = parent
# TODO: See https://github.com/scverse/anndata/pull/647#discussion_r963494798_ for more details and
# possible strategies to stack behaviors.
# A better solution might be based on xarray-style "attrs", once this is implemented
# https://github.com/scikit-hep/awkward/issues/1391#issuecomment-1412297114
if type(array).__name__ != "Array":
msg = (
"Cannot create a view of an awkward array with __array__ parameter. "
"Please open an issue in the AnnData repo and describe your use-case."
)
raise NotImplementedError(msg)
array = ak.with_parameter(array, _PARAM_NAME, (parent_key, attrname, keys))
array = ak.with_parameter(array, "__list__", "AwkwardArrayView")
return array
ak.behavior["AwkwardArrayView"] = AwkwardArrayView
except ImportError:
class AwkwardArrayView:
pass
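# Illustrative sketch (comments only, not executed): how view metadata travels with
# an awkward array when awkward is installed. `adata` and the "awk" obsm key are
# hypothetical placeholders.
#
#     view = as_view(adata.obsm["awk"], (adata, "obsm", ("awk",)))
#     view.layout.parameter("_view_args")  # stored (parent_key, attrname, keys) triple
#     from copy import copy
#     actual = copy(view)  # plain ak.Array again, view behavior and parameters stripped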
def _resolve_idxs(old, new, adata):
t = tuple(_resolve_idx(old[i], new[i], adata.shape[i]) for i in (0, 1))
return t
@singledispatch
def _resolve_idx(old, new, l):
return old[new]
@_resolve_idx.register(np.ndarray)
def _resolve_idx_ndarray(old, new, l):
if is_bool_dtype(old) and is_bool_dtype(new):
mask_new = np.zeros_like(old)
mask_new[np.flatnonzero(old)[new]] = True
return mask_new
if is_bool_dtype(old):
old = np.where(old)[0]
return old[new]
@_resolve_idx.register(np.integer)
@_resolve_idx.register(int)
def _resolve_idx_scalar(old, new, l):
return np.array([old])[new]
@_resolve_idx.register(slice)
def _resolve_idx_slice(old, new, l):
if isinstance(new, slice):
return _resolve_idx_slice_slice(old, new, l)
else:
return np.arange(*old.indices(l))[new]
def _resolve_idx_slice_slice(old, new, l):
r = range(*old.indices(l))[new]
# Convert back to slice
start, stop, step = r.start, r.stop, r.step
if len(r) == 0:
stop = start
elif stop < 0:
stop = None
return slice(start, stop, step)
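# Worked examples (comments only, not executed): composing an existing view index
# with a new subsetting index via `_resolve_idx`.
#
#     _resolve_idx(slice(2, 10), np.array([0, 2]), 10)            # -> array([2, 4])
#     _resolve_idx_slice_slice(slice(0, 10, 2), slice(1, 3), 10)  # -> slice(2, 6, 2)
#     old = np.array([True, False, True, True])
#     new = np.array([True, False, True])  # indexes the 3 True positions of `old`
#     _resolve_idx(old, new, 4)            # -> array([ True, False, False,  True])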
python-anndata-0.12.0~rc1/src/anndata/_io/ 0000775 0000000 0000000 00000000000 15003706322 0020264 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/_io/__init__.py 0000664 0000000 0000000 00000000505 15003706322 0022375 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import warnings
__all__: list[str] = []
def __getattr__(key: str):
from .. import io
attr = getattr(io, key)
warnings.warn(
f"Importing {key} from `anndata._io` is deprecated. "
"Please use anndata.io instead.",
FutureWarning,
)
return attr
python-anndata-0.12.0~rc1/src/anndata/_io/h5ad.py 0000664 0000000 0000000 00000032505 15003706322 0021464 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import re
from functools import partial
from pathlib import Path
from types import MappingProxyType
from typing import TYPE_CHECKING, TypeVar
from warnings import warn
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
from anndata._warnings import OldFormatWarning
from .._core.anndata import AnnData
from .._core.file_backing import filename
from .._core.sparse_dataset import BaseCompressedSparseDataset
from ..compat import (
CSMatrix,
_clean_uns,
_decode_structured_array,
_from_fixed_length_strings,
)
from ..experimental import read_dispatched
from .specs import read_elem, write_elem
from .specs.registry import IOSpec, write_spec
from .utils import (
H5PY_V3,
_read_legacy_raw,
idx_chunks_along_axis,
no_write_dataset_2d,
report_read_key_on_error,
report_write_key_on_error,
)
if TYPE_CHECKING:
from collections.abc import Callable, Collection, Mapping, Sequence
from os import PathLike
from typing import Any, Literal
from .._core.file_backing import AnnDataFileManager
T = TypeVar("T")
@no_write_dataset_2d
def write_h5ad(
filepath: PathLike[str] | str,
adata: AnnData,
*,
as_dense: Sequence[str] = (),
convert_strings_to_categoricals: bool = True,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
**kwargs,
) -> None:
"""See :meth:`~anndata.AnnData.write_h5ad`."""
if isinstance(as_dense, str):
as_dense = [as_dense]
if "raw.X" in as_dense:
as_dense = list(as_dense)
as_dense[as_dense.index("raw.X")] = "raw/X"
if any(val not in {"X", "raw/X"} for val in as_dense):
msg = "Currently, only `X` and `raw/X` are supported values in `as_dense`"
raise NotImplementedError(msg)
if "raw/X" in as_dense and adata.raw is None:
msg = "Cannot specify writing `raw/X` to dense if it doesn’t exist."
raise ValueError(msg)
if convert_strings_to_categoricals:
adata.strings_to_categoricals()
if adata.raw is not None:
adata.strings_to_categoricals(adata.raw.var)
dataset_kwargs = {**dataset_kwargs, **kwargs}
filepath = Path(filepath)
mode = "a" if adata.isbacked else "w"
if adata.isbacked: # close so that we can reopen below
adata.file.close()
with h5py.File(filepath, mode) as f:
# TODO: Use spec writing system for this
# Currently can't use write_dispatched here because this function is also called to do an
# inplace update of a backed object, which would delete "/"
f = f["/"]
f.attrs.setdefault("encoding-type", "anndata")
f.attrs.setdefault("encoding-version", "0.1.0")
if "X" in as_dense and isinstance(
adata.X, CSMatrix | BaseCompressedSparseDataset
):
write_sparse_as_dense(f, "X", adata.X, dataset_kwargs=dataset_kwargs)
elif not (adata.isbacked and Path(adata.filename) == Path(filepath)):
# If adata.isbacked, X should already be up to date
write_elem(f, "X", adata.X, dataset_kwargs=dataset_kwargs)
if "raw/X" in as_dense and isinstance(
adata.raw.X, CSMatrix | BaseCompressedSparseDataset
):
write_sparse_as_dense(
f, "raw/X", adata.raw.X, dataset_kwargs=dataset_kwargs
)
write_elem(f, "raw/var", adata.raw.var, dataset_kwargs=dataset_kwargs)
write_elem(
f, "raw/varm", dict(adata.raw.varm), dataset_kwargs=dataset_kwargs
)
elif adata.raw is not None:
write_elem(f, "raw", adata.raw, dataset_kwargs=dataset_kwargs)
write_elem(f, "obs", adata.obs, dataset_kwargs=dataset_kwargs)
write_elem(f, "var", adata.var, dataset_kwargs=dataset_kwargs)
write_elem(f, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs)
write_elem(f, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs)
write_elem(f, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs)
write_elem(f, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs)
write_elem(f, "layers", dict(adata.layers), dataset_kwargs=dataset_kwargs)
write_elem(f, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs)
@report_write_key_on_error
@write_spec(IOSpec("array", "0.2.0"))
def write_sparse_as_dense(
f: h5py.Group,
key: str,
value: CSMatrix | BaseCompressedSparseDataset,
*,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
real_key = None # Flag for if temporary key was used
if key in f:
if isinstance(value, BaseCompressedSparseDataset) and (
filename(value.group) == filename(f)
): # Write to temporary key before overwriting
real_key = key
# Transform key to temporary, e.g. raw/X -> raw/_X, or X -> _X
key = re.sub(r"(.*)(\w(?!.*/))", r"\1_\2", key.rstrip("/"))
else:
del f[key] # Wipe before write
dset = f.create_dataset(key, shape=value.shape, dtype=value.dtype, **dataset_kwargs)
compressed_axis = int(isinstance(value, sparse.csc_matrix))
for idx in idx_chunks_along_axis(value.shape, compressed_axis, 1000):
dset[idx] = value[idx].toarray()
if real_key is not None:
del f[real_key]
f[real_key] = f[key]
del f[key]
def read_h5ad_backed(
filename: str | PathLike[str], mode: Literal["r", "r+"]
) -> AnnData:
d = dict(filename=filename, filemode=mode)
f = h5py.File(filename, mode)
attributes = ["obsm", "varm", "obsp", "varp", "uns", "layers"]
df_attributes = ["obs", "var"]
if "encoding-type" in f.attrs:
attributes.extend(df_attributes)
else:
for k in df_attributes:
if k in f: # Backwards compat
d[k] = read_dataframe(f[k])
d.update({k: read_elem(f[k]) for k in attributes if k in f})
d["raw"] = _read_raw(f, attrs={"var", "varm"})
adata = AnnData(**d)
# Backwards compat to <0.7
if isinstance(f["obs"], h5py.Dataset):
_clean_uns(adata)
return adata
def read_h5ad(
filename: PathLike[str] | str,
backed: Literal["r", "r+"] | bool | None = None,
*,
as_sparse: Sequence[str] = (),
as_sparse_fmt: type[CSMatrix] = sparse.csr_matrix,
chunk_size: int = 6000, # TODO, probably make this 2d chunks
) -> AnnData:
"""\
Read `.h5ad`-formatted hdf5 file.
Parameters
----------
filename
File name of data file.
backed
If `'r'`, load :class:`~anndata.AnnData` in `backed` mode
instead of fully loading it into memory (`memory` mode).
If you want to modify backed attributes of the AnnData object,
you need to choose `'r+'`.
Currently, `backed` only supports updates to `X`. That means any
changes to other slots like `obs` will not be written to disk in
`backed` mode. If you would like to save changes made to these slots
of a `backed` :class:`~anndata.AnnData`, write them to a new file
(see :meth:`~anndata.AnnData.write`). For an example, see
:ref:`read-partial`.
as_sparse
If an array was saved as dense, passing its name here will read it as
a sparse matrix in chunks of size `chunk_size`.
as_sparse_fmt
Sparse format class in which to read the elements listed in `as_sparse`.
chunk_size
Used only when loading sparse dataset that is stored as dense.
Loading iterates through chunks of the dataset of this row size
until it reads the whole dataset.
Higher size means higher memory consumption and higher (to a point)
loading speed.
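Example
-------
Illustrative only; the file path below is a placeholder.

.. code:: python

    # fully load into memory
    adata = anndata.io.read_h5ad("pbmc.h5ad")
    # or keep X on disk and only load the annotations
    adata_backed = anndata.io.read_h5ad("pbmc.h5ad", backed="r")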
"""
if backed not in {None, False}:
mode = backed
if mode is True:
mode = "r+"
assert mode in {"r", "r+"}
return read_h5ad_backed(filename, mode)
if as_sparse_fmt not in (sparse.csr_matrix, sparse.csc_matrix):
msg = "Dense formats can only be read to CSR or CSC matrices at this time."
raise NotImplementedError(msg)
if isinstance(as_sparse, str):
as_sparse = [as_sparse]
else:
as_sparse = list(as_sparse)
for i in range(len(as_sparse)):
if as_sparse[i] in {("raw", "X"), "raw.X"}:
as_sparse[i] = "raw/X"
elif as_sparse[i] not in {"raw/X", "X"}:
msg = "Currently only `X` and `raw/X` can be read as sparse."
raise NotImplementedError(msg)
rdasp = partial(
read_dense_as_sparse, sparse_format=as_sparse_fmt, axis_chunk=chunk_size
)
with h5py.File(filename, "r") as f:
def callback(func, elem_name: str, elem, iospec):
if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
return AnnData(
**{
# This is covering up backwards compat in the anndata initializer
# In most cases we should be able to call `func(elen[k])` instead
k: read_dispatched(elem[k], callback)
for k in elem.keys()
if not k.startswith("raw.")
}
)
elif elem_name.startswith("/raw."):
return None
elif elem_name == "/X" and "X" in as_sparse:
return rdasp(elem)
elif elem_name == "/raw":
return _read_raw(f, as_sparse, rdasp)
elif elem_name in {"/obs", "/var"}:
# Backwards compat
return read_dataframe(elem)
return func(elem)
adata = read_dispatched(f, callback=callback)
# Backwards compat (should figure out which version)
if "raw.X" in f:
raw = AnnData(**_read_raw(f, as_sparse, rdasp))
raw.obs_names = adata.obs_names
adata.raw = raw
# Backwards compat to <0.7
if isinstance(f["obs"], h5py.Dataset):
_clean_uns(adata)
return adata
def _read_raw(
f: h5py.File | AnnDataFileManager,
as_sparse: Collection[str] = (),
rdasp: Callable[[h5py.Dataset], CSMatrix] | None = None,
*,
attrs: Collection[str] = ("X", "var", "varm"),
) -> dict:
if as_sparse:
assert rdasp is not None, "must supply rdasp if as_sparse is supplied"
raw = {}
if "X" in attrs and "raw/X" in f:
read_x = rdasp if "raw/X" in as_sparse else read_elem
raw["X"] = read_x(f["raw/X"])
for v in ("var", "varm"):
if v in attrs and f"raw/{v}" in f:
raw[v] = read_elem(f[f"raw/{v}"])
return _read_legacy_raw(f, raw, read_dataframe, read_elem, attrs=attrs)
@report_read_key_on_error
def read_dataframe_legacy(dataset: h5py.Dataset) -> pd.DataFrame:
"""Read pre-anndata 0.7 dataframes."""
warn(
f"'{dataset.name}' was written with a very old version of AnnData. "
"Consider rewriting it.",
OldFormatWarning,
)
if H5PY_V3:
df = pd.DataFrame(
_decode_structured_array(
_from_fixed_length_strings(dataset[()]), dtype=dataset.dtype
)
)
else:
df = pd.DataFrame(_from_fixed_length_strings(dataset[()]))
df.set_index(df.columns[0], inplace=True)
return df
def read_dataframe(group: h5py.Group | h5py.Dataset) -> pd.DataFrame:
"""Backwards compat function"""
if not isinstance(group, h5py.Group):
return read_dataframe_legacy(group)
else:
return read_elem(group)
@report_read_key_on_error
def read_dataset(dataset: h5py.Dataset):
if H5PY_V3:
string_dtype = h5py.check_string_dtype(dataset.dtype)
if (string_dtype is not None) and (string_dtype.encoding == "utf-8"):
dataset = dataset.asstr()
value = dataset[()]
if not hasattr(value, "dtype"):
return value
elif isinstance(value.dtype, str):
pass
elif issubclass(value.dtype.type, np.bytes_):
value = value.astype(str)
# Backwards compat, old datasets have strings as one element 1d arrays
if len(value) == 1:
return value[0]
elif len(value.dtype.descr) > 1: # Compound dtype
# For backwards compat, now strings are written as variable length
dtype = value.dtype
value = _from_fixed_length_strings(value)
if H5PY_V3:
value = _decode_structured_array(value, dtype=dtype)
if value.shape == ():
value = value[()]
return value
@report_read_key_on_error
def read_dense_as_sparse(
dataset: h5py.Dataset, sparse_format: CSMatrix, axis_chunk: int
):
if sparse_format == sparse.csr_matrix:
return read_dense_as_csr(dataset, axis_chunk)
elif sparse_format == sparse.csc_matrix:
return read_dense_as_csc(dataset, axis_chunk)
else:
msg = f"Cannot read dense array as type: {sparse_format}"
raise ValueError(msg)
def read_dense_as_csr(dataset: h5py.Dataset, axis_chunk: int = 6000):
sub_matrices = []
for idx in idx_chunks_along_axis(dataset.shape, 0, axis_chunk):
dense_chunk = dataset[idx]
sub_matrix = sparse.csr_matrix(dense_chunk)
sub_matrices.append(sub_matrix)
return sparse.vstack(sub_matrices, format="csr")
def read_dense_as_csc(dataset: h5py.Dataset, axis_chunk: int = 6000):
sub_matrices = []
for idx in idx_chunks_along_axis(dataset.shape, 1, axis_chunk):
sub_matrix = sparse.csc_matrix(dataset[idx])
sub_matrices.append(sub_matrix)
return sparse.hstack(sub_matrices, format="csc")
python-anndata-0.12.0~rc1/src/anndata/_io/read.py 0000664 0000000 0000000 00000036004 15003706322 0021554 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import bz2
import gzip
from collections import OrderedDict
from os import PathLike, fspath
from pathlib import Path
from types import MappingProxyType
from typing import TYPE_CHECKING
from warnings import warn
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
from .. import AnnData
from ..compat import _deprecate_positional_args
from .utils import is_float
if TYPE_CHECKING:
from collections.abc import Generator, Iterable, Iterator, Mapping
def read_csv(
filename: PathLike[str] | str | Iterator[str],
delimiter: str | None = ",",
first_column_names: bool | None = None,
dtype: str = "float32",
) -> AnnData:
"""\
Read `.csv` file.
Same as :func:`~anndata.io.read_text` but with default delimiter `','`.
Parameters
----------
filename
Data file.
delimiter
Delimiter that separates data within text file.
If `None`, will split at arbitrary number of white spaces,
which is different from enforcing splitting at single white space `' '`.
first_column_names
Assume the first column stores row names.
dtype
Numpy data type.
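Example
-------
Illustrative only; the file path is a placeholder.

.. code:: python

    adata = anndata.io.read_csv("counts.csv", first_column_names=True)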
"""
return read_text(filename, delimiter, first_column_names, dtype)
def read_excel(
filename: PathLike[str] | str, sheet: str | int, dtype: str = "float32"
) -> AnnData:
"""\
Read `.xlsx` (Excel) file.
Assumes that the first columns stores the row names and the first row the
column names.
Parameters
----------
filename
File name to read from.
sheet
Name of sheet in Excel file.
"""
# rely on pandas for reading an excel file
from pandas import read_excel
df = read_excel(fspath(filename), sheet)
X = df.values[:, 1:]
row = dict(row_names=df.iloc[:, 0].values.astype(str))
col = dict(col_names=np.array(df.columns[1:], dtype=str))
return AnnData(X, row, col)
def read_umi_tools(filename: PathLike[str] | str, dtype=None) -> AnnData:
"""\
Read a gzipped condensed count matrix from umi_tools.
Parameters
----------
filename
File name to read from.
"""
# import pandas for conversion of a dict of dicts into a matrix
# import gzip to read a gzipped file :-)
table = pd.read_table(filename, dtype={"gene": "category", "cell": "category"})
X = sparse.csr_matrix(
(table["count"], (table["cell"].cat.codes, table["gene"].cat.codes)),
dtype=dtype,
)
obs = pd.DataFrame(index=pd.Index(table["cell"].cat.categories, name="cell"))
var = pd.DataFrame(index=pd.Index(table["gene"].cat.categories, name="gene"))
return AnnData(X=X, obs=obs, var=var)
def read_hdf(filename: PathLike[str] | str, key: str) -> AnnData:
"""\
Read `.h5` (hdf5) file.
Note: Also looks for fields `row_names` and `col_names`.
Parameters
----------
filename
Filename of data file.
key
Name of dataset in the file.
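Example
-------
Illustrative only; the file path and dataset key are placeholders.

.. code:: python

    adata = anndata.io.read_hdf("data.h5", key="matrix")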
"""
with h5py.File(filename, "r") as f:
# the following is necessary in Python 3, because only
# a view and not a list is returned
keys = [k for k in f.keys()]
if key == "":
msg = (
f"The file {filename} stores the following sheets:\n{keys}\n"
f"Call read/read_hdf5 with one of them."
)
raise ValueError(msg)
# read array
X = f[key][()]
# try to find row and column names
rows_cols = [{}, {}]
for iname, name in enumerate(["row_names", "col_names"]):
if name in keys:
rows_cols[iname][name] = f[name][()]
adata = AnnData(X, rows_cols[0], rows_cols[1])
return adata
def _fmt_loom_axis_attrs(
input: Mapping, idx_name: str, dimm_mapping: Mapping[str, Iterable[str]]
) -> tuple[pd.DataFrame, Mapping[str, np.ndarray]]:
axis_df = pd.DataFrame()
axis_mapping = {}
for key, names in dimm_mapping.items():
axis_mapping[key] = np.array([input.pop(name) for name in names]).T
for k, v in input.items():
if v.ndim > 1 and v.shape[1] > 1:
axis_mapping[k] = v
else:
axis_df[k] = v
if idx_name in axis_df:
axis_df.set_index(idx_name, drop=True, inplace=True)
return axis_df, axis_mapping
@_deprecate_positional_args(version="0.9")
def read_loom(
filename: PathLike[str] | str,
*,
sparse: bool = True,
cleanup: bool = False,
X_name: str = "spliced",
obs_names: str = "CellID",
obsm_names: Mapping[str, Iterable[str]] | None = None,
var_names: str = "Gene",
varm_names: Mapping[str, Iterable[str]] | None = None,
dtype: str = "float32",
obsm_mapping: Mapping[str, Iterable[str]] = MappingProxyType({}),
varm_mapping: Mapping[str, Iterable[str]] = MappingProxyType({}),
**kwargs,
) -> AnnData:
"""\
Read `.loom`-formatted hdf5 file.
This reads the whole file into memory.
Beware that you have to explicitly state when you want to read the file as
sparse data.
Parameters
----------
filename
The filename.
sparse
Whether to read the data matrix as sparse.
cleanup
Whether to collapse all obs/var fields that only store
one unique value into `.uns['loom-.']`.
X_name
Loompy key with which the data matrix :attr:`~anndata.AnnData.X` is initialized.
obs_names
Loompy key where the observation/cell names are stored.
obsm_mapping
Loompy keys which will be constructed into observation matrices
var_names
Loompy key where the variable/gene names are stored.
varm_mapping
Loompy keys which will be constructed into variable matrices
**kwargs:
Arguments to loompy.connect
Example
-------
.. code:: python
pbmc = anndata.io.read_loom(
"pbmc.loom",
sparse=True,
X_name="lognorm",
obs_names="cell_names",
var_names="gene_names",
obsm_mapping={
"X_umap": ["umap_1", "umap_2"]
}
)
"""
# Deprecations
if obsm_names is not None:
warn(
"Argument obsm_names has been deprecated in favour of `obsm_mapping`. "
"In 0.9 this will be an error.",
FutureWarning,
)
if obsm_mapping != {}:
msg = (
"Received values for both `obsm_names` and `obsm_mapping`. This is "
"ambiguous, only pass `obsm_mapping`."
)
raise ValueError(msg)
obsm_mapping = obsm_names
if varm_names is not None:
warn(
"Argument varm_names has been deprecated in favour of `varm_mapping`. "
"In 0.9 this will be an error.",
FutureWarning,
)
if varm_mapping != {}:
msg = (
"Received values for both `varm_names` and `varm_mapping`. This is "
"ambiguous, only pass `varm_mapping`."
)
raise ValueError(msg)
varm_mapping = varm_names
filename = fspath(filename) # allow passing pathlib.Path objects
from loompy import connect
with connect(filename, "r", **kwargs) as lc:
if X_name not in lc.layers.keys():
X_name = ""
X = lc.layers[X_name].sparse().T.tocsr() if sparse else lc.layers[X_name][()].T
X = X.astype(dtype, copy=False)
layers = OrderedDict()
if X_name != "":
layers["matrix"] = (
lc.layers[""].sparse().T.tocsr() if sparse else lc.layers[""][()].T
)
for key in lc.layers.keys():
if key != "":
layers[key] = (
lc.layers[key].sparse().T.tocsr()
if sparse
else lc.layers[key][()].T
)
# TODO: Figure out the singleton obs elements
obs, obsm = _fmt_loom_axis_attrs(dict(lc.col_attrs), obs_names, obsm_mapping)
var, varm = _fmt_loom_axis_attrs(dict(lc.row_attrs), var_names, varm_mapping)
uns = {}
if cleanup:
uns_obs = {}
for key in obs.columns:
if len(obs[key].unique()) == 1:
uns_obs[key] = obs[key].iloc[0]
del obs[key]
if uns_obs:
uns["loom-obs"] = uns_obs
uns_var = {}
for key in var.columns:
if len(var[key].unique()) == 1:
uns_var[key] = var[key].iloc[0]
del var[key]
if uns_var:
uns["loom-var"] = uns_var
adata = AnnData(
X,
obs=obs,
var=var,
layers=layers,
obsm=obsm if obsm else None,
varm=varm if varm else None,
uns=uns,
)
return adata
def read_mtx(filename: PathLike[str] | str, dtype: str = "float32") -> AnnData:
"""\
Read `.mtx` file.
Parameters
----------
filename
The filename.
dtype
Numpy data type.
"""
from scipy.io import mmread
# could be rewritten accounting for dtype to be more performant
X = mmread(fspath(filename)).astype(dtype)
from scipy.sparse import csr_matrix
X = csr_matrix(X)
return AnnData(X)
def read_text(
filename: PathLike[str] | str | Iterator[str],
delimiter: str | None = None,
first_column_names: bool | None = None,
dtype: str = "float32",
) -> AnnData:
"""\
Read `.txt`, `.tab`, `.data` (text) file.
Same as :func:`~anndata.io.read_csv` but with default delimiter `None`.
Parameters
----------
filename
Data file, filename or stream.
delimiter
Delimiter that separates data within text file. If `None`, will split at
arbitrary number of white spaces, which is different from enforcing
splitting at single white space `' '`.
first_column_names
Assume the first column stores row names.
dtype
Numpy data type.
"""
if not isinstance(filename, PathLike | str | bytes):
return _read_text(filename, delimiter, first_column_names, dtype)
filename = Path(filename)
if filename.suffix == ".gz":
with gzip.open(str(filename), mode="rt") as f:
return _read_text(f, delimiter, first_column_names, dtype)
elif filename.suffix == ".bz2":
with bz2.open(str(filename), mode="rt") as f:
return _read_text(f, delimiter, first_column_names, dtype)
else:
with filename.open() as f:
return _read_text(f, delimiter, first_column_names, dtype)
def _iter_lines(file_like: Iterable[str]) -> Generator[str, None, None]:
"""Helper for iterating only nonempty lines without line breaks"""
for line in file_like:
line = line.rstrip("\r\n")
if line:
yield line
def _read_text(
f: Iterator[str],
delimiter: str | None,
first_column_names: bool | None,
dtype: str,
) -> AnnData:
comments = []
data = []
lines = _iter_lines(f)
col_names = []
row_names = []
# read header and column names
for line in lines:
if line.startswith("#"):
comment = line.lstrip("# ")
if comment:
comments.append(comment)
else:
if delimiter is not None and delimiter not in line:
msg = f"Did not find delimiter {delimiter!r} in first line."
raise ValueError(msg)
line_list = line.split(delimiter)
# the first column might be row names, so check the last
if not is_float(line_list[-1]):
col_names = line_list
# logg.msg(" assuming first line in file stores column names", v=4)
else:
if not is_float(line_list[0]) or first_column_names:
first_column_names = True
row_names.append(line_list[0])
data.append(np.array(line_list[1:], dtype=dtype))
else:
data.append(np.array(line_list, dtype=dtype))
break
if not col_names:
# try reading col_names from the last comment line
if len(comments) > 0:
# logg.msg(" assuming last comment line stores variable names", v=4)
col_names = np.array(comments[-1].split())
# just numbers as col_names
else:
# logg.msg(" did not find column names in file", v=4)
col_names = np.arange(len(data[0])).astype(str)
col_names = np.array(col_names, dtype=str)
# read another line to check if first column contains row names or not
if first_column_names is None:
first_column_names = False
for line in lines:
line_list = line.split(delimiter)
if first_column_names or not is_float(line_list[0]):
# logg.msg(" assuming first column in file stores row names", v=4)
first_column_names = True
row_names.append(line_list[0])
data.append(np.array(line_list[1:], dtype=dtype))
else:
data.append(np.array(line_list, dtype=dtype))
break
# if row names are just integers
if len(data) > 1 and data[0].size != data[1].size:
# logg.msg(
# " assuming first row stores column names and first column row names",
# v=4,
# )
first_column_names = True
col_names = np.array(data[0]).astype(int).astype(str)
row_names.append(data[1][0].astype(int).astype(str))
data = [data[1][1:]]
# parse the file
for line in lines:
line_list = line.split(delimiter)
if first_column_names:
row_names.append(line_list[0])
data.append(np.array(line_list[1:], dtype=dtype))
else:
data.append(np.array(line_list, dtype=dtype))
# logg.msg(" read data into list of lists", t=True, v=4)
# transform to array, this takes a long time and a lot of memory
# but it’s actually the same thing as np.genfromtxt does
# - we don’t use the latter as it would involve another slicing step
# in the end, to separate row_names from float data, slicing takes
# a lot of memory and CPU time
if data[0].size != data[-1].size:
msg = (
f"Length of first line ({data[0].size}) is different "
f"from length of last line ({data[-1].size})."
)
raise ValueError(msg)
data = np.array(data, dtype=dtype)
# logg.msg(" constructed array from list of list", t=True, v=4)
# transform row_names
if not row_names:
row_names = np.arange(len(data)).astype(str)
# logg.msg(" did not find row names in file", v=4)
else:
row_names = np.array(row_names)
for iname, name in enumerate(row_names):
row_names[iname] = name.strip('"')
# adapt col_names if necessary
if col_names.size > data.shape[1]:
col_names = col_names[1:]
for iname, name in enumerate(col_names):
col_names[iname] = name.strip('"')
return AnnData(
data,
obs=dict(obs_names=row_names),
var=dict(var_names=col_names),
)
python-anndata-0.12.0~rc1/src/anndata/_io/specs/ 0000775 0000000 0000000 00000000000 15003706322 0021401 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/_io/specs/__init__.py 0000664 0000000 0000000 00000000653 15003706322 0023516 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from . import lazy_methods, methods
from .registry import (
_LAZY_REGISTRY, # noqa: F401
_REGISTRY, # noqa: F401
IOSpec,
Reader,
Writer,
get_spec,
read_elem,
read_elem_lazy,
write_elem,
)
__all__ = [
"methods",
"lazy_methods",
"write_elem",
"get_spec",
"read_elem",
"read_elem_lazy",
"Reader",
"Writer",
"IOSpec",
]
python-anndata-0.12.0~rc1/src/anndata/_io/specs/lazy_methods.py 0000664 0000000 0000000 00000026141 15003706322 0024461 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from contextlib import contextmanager
from functools import partial, singledispatch
from pathlib import Path
from typing import TYPE_CHECKING, overload
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
import anndata as ad
from anndata._core.file_backing import filename, get_elem_name
from anndata.abc import CSCDataset, CSRDataset
from anndata.compat import DaskArray, H5Array, H5Group, ZarrArray, ZarrGroup
from .registry import _LAZY_REGISTRY, IOSpec
if TYPE_CHECKING:
from collections.abc import Generator, Mapping, Sequence
from typing import Literal, ParamSpec, TypeVar
from anndata.experimental.backed._compat import DataArray, Dataset2D
from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray
from ...compat import CSArray, CSMatrix, H5File
from .registry import LazyDataStructures, LazyReader
BlockInfo = Mapping[
Literal[None],
dict[str, Sequence[tuple[int, int]]],
]
P = ParamSpec("P")
R = TypeVar("R")
D = TypeVar("D")
@overload
@contextmanager
def maybe_open_h5(
path_or_other: Path, elem_name: str
) -> Generator[H5File, None, None]: ...
@overload
@contextmanager
def maybe_open_h5(path_or_other: D, elem_name: str) -> Generator[D, None, None]: ...
@contextmanager
def maybe_open_h5(
path_or_other: H5File | D, elem_name: str
) -> Generator[H5File | D, None, None]:
if not isinstance(path_or_other, Path):
yield path_or_other
return
file = h5py.File(path_or_other, "r")
try:
yield file[elem_name]
finally:
file.close()
_DEFAULT_STRIDE = 1000
def compute_chunk_layout_for_axis_size(
chunk_axis_size: int, full_axis_size: int
) -> tuple[int, ...]:
n_strides, rest = np.divmod(full_axis_size, chunk_axis_size)
chunk = (chunk_axis_size,) * n_strides
if rest > 0:
chunk += (rest,)
return chunk
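# Worked example (comment only): a stride of 1000 over an axis of length 2500
# yields the chunk layout (1000, 1000, 500).
#
#     compute_chunk_layout_for_axis_size(1000, 2500)  # -> (1000, 1000, 500)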
def make_dask_chunk(
path_or_sparse_dataset: Path | D,
elem_name: str,
block_info: BlockInfo | None = None,
) -> CSMatrix | CSArray:
if block_info is None:
msg = "Block info is required"
raise ValueError(msg)
# We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed`
# https://github.com/scverse/anndata/issues/1105
with maybe_open_h5(path_or_sparse_dataset, elem_name) as f:
mtx = ad.io.sparse_dataset(f) if isinstance(f, H5Group) else f
idx = tuple(
slice(start, stop) for start, stop in block_info[None]["array-location"]
)
chunk = mtx[idx]
return chunk
@singledispatch
def get_chunksize(obj) -> tuple[int, ...]:
if hasattr(obj, "chunks"):
return obj.chunks
msg = "object of type {type(obj)} has no recognized chunks"
raise ValueError(msg)
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
def read_sparse_as_dask(
elem: H5Group | ZarrGroup,
*,
_reader: LazyReader,
chunks: tuple[int, ...] | None = None, # only tuple[int, int] is supported here
) -> DaskArray:
import dask.array as da
path_or_sparse_dataset = (
Path(filename(elem))
if isinstance(elem, H5Group)
else ad.io.sparse_dataset(elem)
)
elem_name = get_elem_name(elem)
shape: tuple[int, int] = tuple(elem.attrs["shape"])
if isinstance(path_or_sparse_dataset, CSRDataset | CSCDataset):
dtype = path_or_sparse_dataset.dtype
else:
dtype = elem["data"].dtype
is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix"
stride: int = _DEFAULT_STRIDE
major_dim, minor_dim = (1, 0) if is_csc else (0, 1)
if chunks is not None:
if len(chunks) != 2:
msg = "`chunks` must be a tuple of two integers"
raise ValueError(msg)
if chunks[minor_dim] not in {shape[minor_dim], -1, None}:
msg = (
"Only the major axis can be chunked. "
f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}"
)
raise ValueError(msg)
stride = (
chunks[major_dim]
if chunks[major_dim] not in {None, -1}
else shape[major_dim]
)
shape_minor, shape_major = shape if is_csc else shape[::-1]
chunks_major = compute_chunk_layout_for_axis_size(stride, shape_major)
chunks_minor = (shape_minor,)
chunk_layout = (
(chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor)
)
memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix
make_chunk = partial(make_dask_chunk, path_or_sparse_dataset, elem_name)
da_mtx = da.map_blocks(
make_chunk,
dtype=dtype,
chunks=chunk_layout,
meta=memory_format((0, 0), dtype=dtype),
)
return da_mtx
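# Illustrative sketch (comments only, not executed): only the major axis of the
# on-disk sparse matrix may be chunked. Assuming `anndata.experimental.read_elem_lazy`
# forwards `chunks` to this reader and `g["X"]` is a stored csr_matrix element:
#
#     x = anndata.experimental.read_elem_lazy(g["X"], chunks=(500, -1))  # rows in blocks of 500
#     # for a csc_matrix element the chunkable axis is the second one, e.g. chunks=(-1, 500)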
@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0"))
def read_h5_string_array(
elem: H5Array,
*,
_reader: LazyReader,
chunks: tuple[int, int] | None = None,
) -> DaskArray:
import dask.array as da
from anndata._io.h5ad import read_dataset
return da.from_array(
read_dataset(elem),
chunks=chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(elem.shape),
)
@_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
def read_h5_array(
elem: H5Array, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None
) -> DaskArray:
import dask.array as da
path = Path(elem.file.filename)
elem_name: str = elem.name
shape = tuple(elem.shape)
dtype = elem.dtype
chunks: tuple[int, ...] = (
tuple(
c if c not in {None, -1} else s for c, s in zip(chunks, shape, strict=True)
)
if chunks is not None
else tuple(min(_DEFAULT_STRIDE, s) for s in shape)
)
chunk_layout = tuple(
compute_chunk_layout_for_axis_size(chunks[i], shape[i])
for i in range(len(shape))
)
make_chunk = partial(make_dask_chunk, path, elem_name)
return da.map_blocks(
make_chunk, dtype=dtype, chunks=chunk_layout, meta=np.array([])
)
@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0"))
@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
def read_zarr_array(
elem: ZarrArray, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None
) -> DaskArray:
chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks
import dask.array as da
return da.from_zarr(elem, chunks=chunks)
def _gen_xarray_dict_iterator_from_elems(
elem_dict: dict[str, LazyDataStructures],
dim_name: str,
index: np.ndarray,
) -> Generator[tuple[str, DataArray], None, None]:
from anndata.experimental.backed._compat import DataArray
from anndata.experimental.backed._compat import xarray as xr
from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray
for k, v in elem_dict.items():
if isinstance(v, DaskArray) and k != dim_name:
data_array = DataArray(v, coords=[index], dims=[dim_name], name=k)
elif isinstance(v, CategoricalArray | MaskedArray) and k != dim_name:
variable = xr.Variable(
data=xr.core.indexing.LazilyIndexedArray(v), dims=[dim_name]
)
data_array = DataArray(
variable,
coords=[index],
dims=[dim_name],
name=k,
attrs={
"base_path_or_zarr_group": v.base_path_or_zarr_group,
"elem_name": v.elem_name,
},
)
elif k == dim_name:
data_array = DataArray(
index, coords=[index], dims=[dim_name], name=dim_name
)
else:
msg = f"Could not read {k}: {v} from into xarray Dataset2D"
raise ValueError(msg)
yield k, data_array
DUMMY_RANGE_INDEX_KEY = "_anndata_dummy_range_index"
@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0"))
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0"))
def read_dataframe(
elem: H5Group | ZarrGroup,
*,
_reader: LazyReader,
use_range_index: bool = False,
) -> Dataset2D:
from anndata.experimental.backed._compat import DataArray, Dataset2D
elem_dict = {
k: _reader.read_elem(elem[k])
for k in [*elem.attrs["column-order"], elem.attrs["_index"]]
}
# If we use a range index, the coord axis needs to have the special dim name
# which is used below as well.
if not use_range_index:
dim_name = elem.attrs["_index"]
# no sense in reading this in multiple times
index = elem_dict[dim_name].compute()
else:
dim_name = DUMMY_RANGE_INDEX_KEY
index = pd.RangeIndex(len(elem_dict[elem.attrs["_index"]])).astype("str")
elem_xarray_dict = dict(
_gen_xarray_dict_iterator_from_elems(elem_dict, dim_name, index)
)
if use_range_index:
elem_xarray_dict[DUMMY_RANGE_INDEX_KEY] = DataArray(
index,
coords=[index],
dims=[DUMMY_RANGE_INDEX_KEY],
name=DUMMY_RANGE_INDEX_KEY,
)
# We ensure the indexing_key attr always points to the true index
# so that the roundtrip works even for the `use_range_index` `True` case
ds = Dataset2D(elem_xarray_dict, attrs={"indexing_key": elem.attrs["_index"]})
return ds
@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0"))
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0"))
def read_categorical(
elem: H5Group | ZarrGroup,
*,
_reader: LazyReader,
) -> CategoricalArray:
from anndata.experimental.backed._lazy_arrays import CategoricalArray
base_path_or_zarr_group = (
Path(filename(elem)) if isinstance(elem, H5Group) else elem
)
elem_name = get_elem_name(elem)
return CategoricalArray(
codes=elem["codes"],
categories=elem["categories"],
ordered=elem.attrs["ordered"],
base_path_or_zarr_group=base_path_or_zarr_group,
elem_name=elem_name,
)
def read_nullable(
elem: H5Group | ZarrGroup,
*,
encoding_type: Literal[
"nullable-integer", "nullable-boolean", "nullable-string-array"
],
_reader: LazyReader,
) -> MaskedArray:
from anndata.experimental.backed._lazy_arrays import MaskedArray
base_path_or_zarr_group = (
Path(filename(elem)) if isinstance(elem, H5Group) else elem
)
elem_name = get_elem_name(elem)
return MaskedArray(
values=elem["values"],
mask=elem["mask"] if "mask" in elem else None,
dtype_str=encoding_type,
base_path_or_zarr_group=base_path_or_zarr_group,
elem_name=elem_name,
)
for dtype in ["integer", "boolean", "string-array"]:
for group_type in [ZarrGroup, H5Group]:
_LAZY_REGISTRY.register_read(group_type, IOSpec(f"nullable-{dtype}", "0.1.0"))(
partial(read_nullable, encoding_type=f"nullable-{dtype}")
)
python-anndata-0.12.0~rc1/src/anndata/_io/specs/methods.py 0000664 0000000 0000000 00000132310 15003706322 0023416 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import warnings
from collections.abc import Mapping
from copy import copy
from functools import partial
from itertools import product
from types import MappingProxyType
from typing import TYPE_CHECKING
from warnings import warn
import h5py
import numpy as np
import pandas as pd
from packaging.version import Version
from scipy import sparse
import anndata as ad
from anndata import AnnData, Raw
from anndata._core import views
from anndata._core.index import _normalize_indices
from anndata._core.merge import intersect_keys
from anndata._core.sparse_dataset import _CSCDataset, _CSRDataset, sparse_dataset
from anndata._io.utils import H5PY_V3, check_key, zero_dim_array_as_scalar
from anndata._warnings import OldFormatWarning
from anndata.compat import (
AwkArray,
CupyArray,
CupyCSCMatrix,
CupyCSRMatrix,
DaskArray,
H5Array,
H5File,
H5Group,
ZarrArray,
ZarrGroup,
_decode_structured_array,
_from_fixed_length_strings,
_read_attr,
_require_group_write_dataframe,
)
from ..._settings import settings
from ...compat import is_zarr_v2
from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial
if TYPE_CHECKING:
from collections.abc import Callable, Iterator
from os import PathLike
from typing import Any, Literal
from numpy import typing as npt
from numpy.typing import NDArray
from anndata._types import ArrayStorageType, GroupStorageType
from anndata.compat import CSArray, CSMatrix
from anndata.typing import AxisStorable, InMemoryArrayOrScalarType
from .registry import Reader, Writer
####################
# Dask utils #
####################
try:
from dask.utils import SerializableLock as Lock
except ImportError:
from threading import Lock
# to fix https://github.com/dask/distributed/issues/780
GLOBAL_LOCK = Lock()
####################
# Dispatch methods #
####################
# def is_full_slice(idx):
# if isinstance(idx, tuple) and len(idx) == 1:
# if isinstance(idx, type(None)):
# return True
# elif idx is Ellipsis:
# return True
# elif isinstance(idx, tuple):
# for el in idx:
# if isinstance(el, type(None)):
# pass
# elif isinstance(el, slice):
# if el != slice(None):
# return False
# else:
# return False
# return True
# return False
def zarr_v3_compressor_compat(dataset_kwargs) -> dict:
if not is_zarr_v2() and (compressor := dataset_kwargs.pop("compressor", None)):
dataset_kwargs["compressors"] = compressor
return dataset_kwargs
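# Worked example (comment only): under zarr v3 the legacy "compressor" keyword is
# renamed to "compressors"; under zarr v2 the kwargs pass through unchanged.
#
#     zarr_v3_compressor_compat({"compressor": codec})  # -> {"compressors": codec} on zarr v3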
def _to_cpu_mem_wrapper(write_func):
"""
Wrapper to bring cupy types into cpu memory before writing.
Ideally we do direct writing at some point.
"""
def wrapper(
f,
k,
cupy_val: CupyArray | CupyCSCMatrix | CupyCSRMatrix,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
return write_func(
f, k, cupy_val.get(), _writer=_writer, dataset_kwargs=dataset_kwargs
)
return wrapper
################################
# Fallbacks / backwards compat #
################################
# Note: there is no need for writing in a backwards compatible format, maybe
@_REGISTRY.register_read(H5File, IOSpec("", ""))
@_REGISTRY.register_read(H5Group, IOSpec("", ""))
@_REGISTRY.register_read(H5Array, IOSpec("", ""))
def read_basic(
elem: H5File | H5Group | H5Array, *, _reader: Reader
) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | CSMatrix | CSArray:
from anndata._io import h5ad
warn(
f"Element '{elem.name}' was written without encoding metadata.",
OldFormatWarning,
stacklevel=3,
)
if isinstance(elem, Mapping):
# Backwards compat sparse arrays
if "h5sparse_format" in elem.attrs:
return sparse_dataset(elem).to_memory()
return {k: _reader.read_elem(v) for k, v in dict(elem).items()}
elif isinstance(elem, h5py.Dataset):
return h5ad.read_dataset(elem) # TODO: Handle legacy
@_REGISTRY.register_read(ZarrGroup, IOSpec("", ""))
@_REGISTRY.register_read(ZarrArray, IOSpec("", ""))
def read_basic_zarr(
elem: ZarrGroup | ZarrArray, *, _reader: Reader
) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | CSMatrix | CSArray:
from anndata._io import zarr
warn(
f"Element '{elem.name}' was written without encoding metadata.",
OldFormatWarning,
stacklevel=3,
)
if isinstance(elem, ZarrGroup):
# Backwards compat sparse arrays
if "h5sparse_format" in elem.attrs:
return sparse_dataset(elem).to_memory()
return {k: _reader.read_elem(v) for k, v in dict(elem).items()}
elif isinstance(elem, ZarrArray):
return zarr.read_dataset(elem) # TODO: Handle legacy
# @_REGISTRY.register_read_partial(IOSpec("", ""))
# def read_basic_partial(elem, *, items=None, indices=(slice(None), slice(None))):
# if isinstance(elem, Mapping):
# return _read_partial(elem, items=items, indices=indices)
# elif indices != (slice(None), slice(None)):
# return elem[indices]
# else:
# return elem[()]
###########
# AnnData #
###########
def read_indices(group):
obs_group = group["obs"]
obs_idx_elem = obs_group[_read_attr(obs_group.attrs, "_index")]
obs_idx = read_elem(obs_idx_elem)
var_group = group["var"]
var_idx_elem = var_group[_read_attr(var_group.attrs, "_index")]
var_idx = read_elem(var_idx_elem)
return obs_idx, var_idx
def read_partial(
pth: PathLike[str] | str,
*,
obs_idx=slice(None),
var_idx=slice(None),
X=True,
obs=None,
var=None,
obsm=None,
varm=None,
obsp=None,
varp=None,
layers=None,
uns=None,
) -> ad.AnnData:
result = {}
with h5py.File(pth, "r") as f:
obs_idx, var_idx = _normalize_indices((obs_idx, var_idx), *read_indices(f))
result["obs"] = read_elem_partial(
f["obs"], items=obs, indices=(obs_idx, slice(None))
)
result["var"] = read_elem_partial(
f["var"], items=var, indices=(var_idx, slice(None))
)
if X:
result["X"] = read_elem_partial(f["X"], indices=(obs_idx, var_idx))
else:
result["X"] = sparse.csr_matrix((len(result["obs"]), len(result["var"])))
if "obsm" in f:
result["obsm"] = _read_partial(
f["obsm"], items=obsm, indices=(obs_idx, slice(None))
)
if "varm" in f:
result["varm"] = _read_partial(
f["varm"], items=varm, indices=(var_idx, slice(None))
)
if "obsp" in f:
result["obsp"] = _read_partial(
f["obsp"], items=obsp, indices=(obs_idx, obs_idx)
)
if "varp" in f:
result["varp"] = _read_partial(
f["varp"], items=varp, indices=(var_idx, var_idx)
)
if "layers" in f:
result["layers"] = _read_partial(
f["layers"], items=layers, indices=(obs_idx, var_idx)
)
if "uns" in f:
result["uns"] = _read_partial(f["uns"], items=uns)
return ad.AnnData(**result)
def _read_partial(group, *, items=None, indices=(slice(None), slice(None))):
if group is None:
return None
if items is None:
keys = intersect_keys((group,))
else:
keys = intersect_keys((group, items))
result = {}
for k in keys:
if isinstance(items, Mapping):
next_items = items.get(k, None)
else:
next_items = None
result[k] = read_elem_partial(group[k], items=next_items, indices=indices)
return result
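# Illustrative sketch (comment only, not executed): reading only the first 100
# observations of a file. The path is a hypothetical placeholder.
#
#     adata_head = read_partial("data.h5ad", obs_idx=slice(0, 100))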
@_REGISTRY.register_write(ZarrGroup, AnnData, IOSpec("anndata", "0.1.0"))
@_REGISTRY.register_write(H5Group, AnnData, IOSpec("anndata", "0.1.0"))
def write_anndata(
f: GroupStorageType,
k: str,
adata: AnnData,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
g = f.require_group(k)
_writer.write_elem(g, "X", adata.X, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "obs", adata.obs, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "var", adata.var, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "layers", dict(adata.layers), dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "raw", adata.raw, dataset_kwargs=dataset_kwargs)
@_REGISTRY.register_read(H5Group, IOSpec("anndata", "0.1.0"))
@_REGISTRY.register_read(H5Group, IOSpec("raw", "0.1.0"))
@_REGISTRY.register_read(H5File, IOSpec("anndata", "0.1.0"))
@_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0"))
def read_anndata(elem: GroupStorageType | H5File, *, _reader: Reader) -> AnnData:
d = {}
for k in [
"X",
"obs",
"var",
"obsm",
"varm",
"obsp",
"varp",
"layers",
"uns",
"raw",
]:
if k in elem:
d[k] = _reader.read_elem(elem[k])
return AnnData(**d)
@_REGISTRY.register_write(H5Group, Raw, IOSpec("raw", "0.1.0"))
@_REGISTRY.register_write(ZarrGroup, Raw, IOSpec("raw", "0.1.0"))
def write_raw(
f: GroupStorageType,
k: str,
raw: Raw,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
g = f.require_group(k)
_writer.write_elem(g, "X", raw.X, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "var", raw.var, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "varm", dict(raw.varm), dataset_kwargs=dataset_kwargs)
########
# Null #
########
@_REGISTRY.register_read(H5Array, IOSpec("null", "0.1.0"))
@_REGISTRY.register_read(ZarrArray, IOSpec("null", "0.1.0"))
def read_null(_elem, _reader) -> None:
return None
@_REGISTRY.register_write(H5Group, type(None), IOSpec("null", "0.1.0"))
def write_null_h5py(f, k, _v, _writer, dataset_kwargs=MappingProxyType({})):
f.create_dataset(k, data=h5py.Empty("f"), **dataset_kwargs)
@_REGISTRY.register_write(ZarrGroup, type(None), IOSpec("null", "0.1.0"))
def write_null_zarr(f, k, _v, _writer, dataset_kwargs=MappingProxyType({})):
# zarr has no first-class null dataset
if is_zarr_v2():
import zarr
# zarr has no first-class null dataset
f.create_dataset(k, data=zarr.empty(()), **dataset_kwargs)
else:
# TODO: why is this not actually storing the empty info with a f.empty call?
# It fails complaining that k doesn't exist when updating the attributes.
f.create_array(k, shape=(), dtype="bool")
############
# Mappings #
############
@_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0"))
def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, AxisStorable]:
return {k: _reader.read_elem(v) for k, v in dict(elem).items()}
@_REGISTRY.register_write(H5Group, dict, IOSpec("dict", "0.1.0"))
@_REGISTRY.register_write(ZarrGroup, dict, IOSpec("dict", "0.1.0"))
def write_mapping(
f: GroupStorageType,
k: str,
v: dict[str, AxisStorable],
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
g = f.require_group(k)
for sub_k, sub_v in v.items():
_writer.write_elem(g, sub_k, sub_v, dataset_kwargs=dataset_kwargs)
##############
# np.ndarray #
##############
@_REGISTRY.register_write(H5Group, list, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, list, IOSpec("array", "0.2.0"))
def write_list(
f: GroupStorageType,
k: str,
elem: list[AxisStorable],
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
_writer.write_elem(f, k, np.array(elem), dataset_kwargs=dataset_kwargs)
# TODO: Is this the right behavior for MaskedArrays?
# It's in the `AnnData.concatenate` docstring, but should we keep it?
@_REGISTRY.register_write(H5Group, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.ArrayView, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ndarray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, H5Array, IOSpec("array", "0.2.0"))
@zero_dim_array_as_scalar
def write_basic(
f: GroupStorageType,
k: str,
elem: views.ArrayView | np.ndarray | h5py.Dataset | np.ma.MaskedArray | ZarrArray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
"""Write methods which underlying library handles natively."""
dataset_kwargs = dataset_kwargs.copy()
dtype = dataset_kwargs.pop("dtype", elem.dtype)
if isinstance(f, H5Group) or is_zarr_v2():
f.create_dataset(k, data=elem, shape=elem.shape, dtype=dtype, **dataset_kwargs)
else:
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
f.create_array(k, shape=elem.shape, dtype=dtype, **dataset_kwargs)
# see https://github.com/zarr-developers/zarr-python/discussions/2712
if isinstance(elem, ZarrArray):
f[k][...] = elem[...]
else:
f[k][...] = elem
def _iter_chunks_for_copy(
elem: ArrayStorageType, dest: ArrayStorageType
) -> Iterator[slice | tuple[list[slice]]]:
"""
Returns an iterator of tuples of slices for copying chunks from `elem` to `dest`.
* If `dest` has chunks, it will return the chunks of `dest`.
* If `dest` is not chunked, we write it in ~100MB chunks or 1000 rows, whichever is larger.
"""
if dest.chunks and hasattr(dest, "iter_chunks"):
return dest.iter_chunks()
else:
shape = elem.shape
# Number of rows per copied block: at least the configured minimum, or the source chunk height if larger
n_rows = max(
ad.settings.min_rows_for_chunked_h5_copy,
elem.chunks[0] if elem.chunks is not None else 1,
)
return (slice(i, min(i + n_rows, shape[0])) for i in range(0, shape[0], n_rows))
@_REGISTRY.register_write(H5Group, H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(H5Group, ZarrArray, IOSpec("array", "0.2.0"))
def write_chunked_dense_array_to_group(
f: H5Group,
k: str,
elem: ArrayStorageType,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
"""Write to a h5py.Dataset in chunks.
`h5py.Group.create_dataset(..., data: h5py.Dataset)` will load all of `data` into memory
before writing. Instead, we will write in chunks to avoid this. We don't need to do this for
zarr since zarr handles this automatically.
"""
dtype = dataset_kwargs.get("dtype", elem.dtype)
kwargs = {**dataset_kwargs, "dtype": dtype}
dest = f.create_dataset(k, shape=elem.shape, **kwargs)
for chunk in _iter_chunks_for_copy(elem, dest):
dest[chunk] = elem[chunk]
_REGISTRY.register_write(H5Group, CupyArray, IOSpec("array", "0.2.0"))(
_to_cpu_mem_wrapper(write_basic)
)
_REGISTRY.register_write(ZarrGroup, CupyArray, IOSpec("array", "0.2.0"))(
_to_cpu_mem_wrapper(write_basic)
)
@_REGISTRY.register_write(ZarrGroup, DaskArray, IOSpec("array", "0.2.0"))
def write_basic_dask_zarr(
f: ZarrGroup,
k: str,
elem: DaskArray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
import dask.array as da
dataset_kwargs = dataset_kwargs.copy()
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
if is_zarr_v2():
g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
else:
g = f.require_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
da.store(elem, g, lock=GLOBAL_LOCK)
# Adding this separately because h5py isn't serializable
# https://github.com/pydata/xarray/issues/4242
@_REGISTRY.register_write(H5Group, DaskArray, IOSpec("array", "0.2.0"))
def write_basic_dask_h5(
f: H5Group,
k: str,
elem: DaskArray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
import dask.array as da
import dask.config as dc
if dc.get("scheduler", None) == "dask.distributed":
msg = "Cannot write dask arrays to hdf5 when using distributed scheduler"
raise ValueError(msg)
g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
da.store(elem, g)
@_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
@_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0"))
def read_array(elem: ArrayStorageType, *, _reader: Reader) -> npt.NDArray:
return elem[()]
@_REGISTRY.register_read_partial(H5Array, IOSpec("array", "0.2.0"))
@_REGISTRY.register_read_partial(ZarrArray, IOSpec("string-array", "0.2.0"))
def read_array_partial(elem, *, items=None, indices=(slice(None, None))):
return elem[indices]
@_REGISTRY.register_read_partial(ZarrArray, IOSpec("array", "0.2.0"))
def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))):
return elem.oindex[indices]
# arrays of strings
@_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0"))
def read_string_array(d: H5Array, *, _reader: Reader):
return read_array(d.asstr(), _reader=_reader)
@_REGISTRY.register_read_partial(H5Array, IOSpec("string-array", "0.2.0"))
def read_string_array_partial(d, items=None, indices=slice(None)):
return read_array_partial(d.asstr(), items=items, indices=indices)
@_REGISTRY.register_write(
H5Group, (views.ArrayView, "U"), IOSpec("string-array", "0.2.0")
)
@_REGISTRY.register_write(
H5Group, (views.ArrayView, "O"), IOSpec("string-array", "0.2.0")
)
@_REGISTRY.register_write(H5Group, (np.ndarray, "U"), IOSpec("string-array", "0.2.0"))
@_REGISTRY.register_write(H5Group, (np.ndarray, "O"), IOSpec("string-array", "0.2.0"))
@_REGISTRY.register_write(H5Group, (np.ndarray, "T"), IOSpec("string-array", "0.2.0"))
@zero_dim_array_as_scalar
def write_vlen_string_array(
f: H5Group,
k: str,
elem: np.ndarray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write methods which the underlying library handles natively."""
str_dtype = h5py.special_dtype(vlen=str)
f.create_dataset(k, data=elem.astype(str_dtype), dtype=str_dtype, **dataset_kwargs)
@_REGISTRY.register_write(
ZarrGroup, (views.ArrayView, "U"), IOSpec("string-array", "0.2.0")
)
@_REGISTRY.register_write(
ZarrGroup, (views.ArrayView, "O"), IOSpec("string-array", "0.2.0")
)
@_REGISTRY.register_write(ZarrGroup, (np.ndarray, "U"), IOSpec("string-array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, (np.ndarray, "O"), IOSpec("string-array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, (np.ndarray, "T"), IOSpec("string-array", "0.2.0"))
@zero_dim_array_as_scalar
def write_vlen_string_array_zarr(
f: ZarrGroup,
k: str,
elem: np.ndarray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
if is_zarr_v2():
import numcodecs
if Version(numcodecs.__version__) < Version("0.13"):
msg = "Old numcodecs version detected. Please update for improved performance and stability."
warnings.warn(msg)
# Workaround for https://github.com/zarr-developers/numcodecs/issues/514
if hasattr(elem, "flags") and not elem.flags.writeable:
elem = elem.copy()
f.create_dataset(
k,
shape=elem.shape,
dtype=object,
object_codec=numcodecs.VLenUTF8(),
**dataset_kwargs,
)
f[k][:] = elem
else:
from numcodecs import VLenUTF8
dataset_kwargs = dataset_kwargs.copy()
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
match (
ad.settings.zarr_write_format,
Version(np.__version__) >= Version("2.0.0"),
):
case 2, _:
filters, dtype = [VLenUTF8()], object
case 3, True:
filters, dtype = None, np.dtypes.StringDType()
case 3, False:
filters, dtype = None, np.dtypes.ObjectDType()
f.create_array(
k,
shape=elem.shape,
dtype=dtype,
filters=filters,
**dataset_kwargs,
)
f[k][:] = elem
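# Hedged, illustrative sketch (not part of the upstream module): writing a small
# unicode array to a throwaway zarr store goes through the writer above, and
# reading it back yields the same strings.
def _example_write_string_array_zarr() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import numpy as np
    import zarr
    import anndata as ad
    with TemporaryDirectory() as tmp:
        g = zarr.open(f"{tmp}/strings.zarr", mode="w")
        ad.io.write_elem(g, "names", np.array(["gene1", "gene2", "gene3"]))
        assert list(ad.io.read_elem(g["names"])) == ["gene1", "gene2", "gene3"]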
###############
# np.recarray #
###############
def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray:
"""This corrects compound dtypes to work with hdf5 files."""
new_dtype = []
for dt_name, (dt_type, _) in value.dtype.fields.items():
if dt_type.kind in {"U", "O"}:
new_dtype.append((dt_name, h5py.special_dtype(vlen=str)))
else:
new_dtype.append((dt_name, dt_type))
return value.astype(new_dtype)
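# Hedged, illustrative sketch (not part of the upstream module): a compound dtype
# with a fixed-width unicode field has that field rewritten to h5py's
# variable-length string dtype, while numeric fields are left untouched.
def _example_to_hdf5_vlen_strings() -> None:  # pragma: no cover
    import numpy as np
    rec = np.array(
        [("geneA", 1.5), ("geneB", 2.5)], dtype=[("name", "U5"), ("score", "f8")]
    )
    fixed = _to_hdf5_vlen_strings(rec)
    assert fixed.dtype["name"].kind == "O"  # variable-length string field
    assert fixed.dtype["score"].kind == "f"  # numeric field unchanged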
@_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0"))
@_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0"))
def read_recarray(d: ArrayStorageType, *, _reader: Reader) -> np.recarray | npt.NDArray:
value = d[()]
dtype = value.dtype
value = _from_fixed_length_strings(value)
if H5PY_V3:
value = _decode_structured_array(value, dtype=dtype)
return value
@_REGISTRY.register_write(H5Group, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0"))
@_REGISTRY.register_write(H5Group, np.recarray, IOSpec("rec-array", "0.2.0"))
def write_recarray(
f: H5Group,
k: str,
elem: np.ndarray | np.recarray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
f.create_dataset(k, data=_to_hdf5_vlen_strings(elem), **dataset_kwargs)
@_REGISTRY.register_write(ZarrGroup, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, np.recarray, IOSpec("rec-array", "0.2.0"))
def write_recarray_zarr(
f: ZarrGroup,
k: str,
elem: np.ndarray | np.recarray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
from anndata.compat import _to_fixed_length_strings
elem = _to_fixed_length_strings(elem)
if isinstance(f, H5Group) or is_zarr_v2():
f.create_dataset(k, data=elem, shape=elem.shape, **dataset_kwargs)
else:
dataset_kwargs = dataset_kwargs.copy()
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
# TODO: zarr’s on-disk format v3 doesn’t support this dtype
f.create_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
f[k][...] = elem
#################
# Sparse arrays #
#################
def write_sparse_compressed(
f: GroupStorageType,
key: str,
value: CSMatrix | CSArray,
*,
_writer: Writer,
fmt: Literal["csr", "csc"],
dataset_kwargs=MappingProxyType({}),
):
g = f.require_group(key)
g.attrs["shape"] = value.shape
dataset_kwargs = dict(dataset_kwargs)
indptr_dtype = dataset_kwargs.pop("indptr_dtype", value.indptr.dtype)
# Allow resizing for hdf5
if isinstance(f, H5Group):
dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs)
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
for attr_name in ["data", "indices", "indptr"]:
attr = getattr(value, attr_name)
dtype = indptr_dtype if attr_name == "indptr" else attr.dtype
if isinstance(f, H5Group) or is_zarr_v2():
g.create_dataset(
attr_name, data=attr, shape=attr.shape, dtype=dtype, **dataset_kwargs
)
else:
arr = g.create_array(
attr_name, shape=attr.shape, dtype=dtype, **dataset_kwargs
)
# see https://github.com/zarr-developers/zarr-python/discussions/2712
arr[...] = attr[...]
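# Hedged, illustrative sketch (not part of the upstream module): a CSR matrix is
# stored as a group holding `data`, `indices` and `indptr` datasets, and reads
# back equal. Shape, density and compression settings are arbitrary choices.
def _example_sparse_roundtrip_h5() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import h5py
    from scipy import sparse
    import anndata as ad
    X = sparse.random(100, 50, density=0.1, format="csr")
    with TemporaryDirectory() as tmp, h5py.File(f"{tmp}/sparse.h5", "w") as f:
        ad.io.write_elem(f, "X", X, dataset_kwargs={"compression": "gzip"})
        assert set(f["X"].keys()) == {"data", "indices", "indptr"}
        assert (ad.io.read_elem(f["X"]) != X).nnz == 0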
write_csr = partial(write_sparse_compressed, fmt="csr")
write_csc = partial(write_sparse_compressed, fmt="csc")
for store_type, (cls, spec, func) in product(
(H5Group, ZarrGroup),
[
# spmatrix
(sparse.csr_matrix, IOSpec("csr_matrix", "0.1.0"), write_csr),
(views.SparseCSRMatrixView, IOSpec("csr_matrix", "0.1.0"), write_csr),
(sparse.csc_matrix, IOSpec("csc_matrix", "0.1.0"), write_csc),
(views.SparseCSCMatrixView, IOSpec("csc_matrix", "0.1.0"), write_csc),
# sparray
(sparse.csr_array, IOSpec("csr_matrix", "0.1.0"), write_csr),
(views.SparseCSRArrayView, IOSpec("csr_matrix", "0.1.0"), write_csr),
(sparse.csc_array, IOSpec("csc_matrix", "0.1.0"), write_csc),
(views.SparseCSCArrayView, IOSpec("csc_matrix", "0.1.0"), write_csc),
# cupy spmatrix
(CupyCSRMatrix, IOSpec("csr_matrix", "0.1.0"), _to_cpu_mem_wrapper(write_csr)),
(
views.CupySparseCSRView,
IOSpec("csr_matrix", "0.1.0"),
_to_cpu_mem_wrapper(write_csr),
),
(CupyCSCMatrix, IOSpec("csc_matrix", "0.1.0"), _to_cpu_mem_wrapper(write_csc)),
(
views.CupySparseCSCView,
IOSpec("csc_matrix", "0.1.0"),
_to_cpu_mem_wrapper(write_csc),
),
],
):
_REGISTRY.register_write(store_type, cls, spec)(func)
@_REGISTRY.register_write(H5Group, _CSRDataset, IOSpec("", "0.1.0"))
@_REGISTRY.register_write(H5Group, _CSCDataset, IOSpec("", "0.1.0"))
@_REGISTRY.register_write(ZarrGroup, _CSRDataset, IOSpec("", "0.1.0"))
@_REGISTRY.register_write(ZarrGroup, _CSCDataset, IOSpec("", "0.1.0"))
def write_sparse_dataset(
f: GroupStorageType,
k: str,
elem: _CSCDataset | _CSRDataset,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
write_sparse_compressed(
f,
k,
elem._to_backed(),
_writer=_writer,
fmt=elem.format,
dataset_kwargs=dataset_kwargs,
)
# TODO: Cleaner way to do this
f[k].attrs["encoding-type"] = f"{elem.format}_matrix"
f[k].attrs["encoding-version"] = "0.1.0"
@_REGISTRY.register_write(H5Group, (DaskArray, CupyArray), IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, (DaskArray, CupyArray), IOSpec("array", "0.2.0"))
@_REGISTRY.register_write(
H5Group, (DaskArray, CupyCSRMatrix), IOSpec("csr_matrix", "0.1.0")
)
@_REGISTRY.register_write(
H5Group, (DaskArray, CupyCSCMatrix), IOSpec("csc_matrix", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, (DaskArray, CupyCSRMatrix), IOSpec("csr_matrix", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, (DaskArray, CupyCSCMatrix), IOSpec("csc_matrix", "0.1.0")
)
def write_cupy_dask_sparse(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})):
_writer.write_elem(
f,
k,
elem.map_blocks(lambda x: x.get(), dtype=elem.dtype, meta=elem._meta.get()),
dataset_kwargs=dataset_kwargs,
)
@_REGISTRY.register_write(
H5Group, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0")
)
@_REGISTRY.register_write(
H5Group, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, (DaskArray, sparse.csr_matrix), IOSpec("csr_matrix", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0")
)
def write_dask_sparse(
f: GroupStorageType,
k: str,
elem: DaskArray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
sparse_format = elem._meta.format
def as_int64_indices(x):
x.indptr = x.indptr.astype(np.int64, copy=False)
x.indices = x.indices.astype(np.int64, copy=False)
return x
if sparse_format == "csr":
axis = 0
elif sparse_format == "csc":
axis = 1
else:
msg = f"Cannot write dask sparse arrays with format {sparse_format}"
raise NotImplementedError(msg)
def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]:
result = [slice(None), slice(None)]
result[axis] = slice(start, stop)
return tuple(result)
axis_chunks = elem.chunks[axis]
chunk_start = 0
chunk_stop = axis_chunks[0]
_writer.write_elem(
f,
k,
as_int64_indices(elem[chunk_slice(chunk_start, chunk_stop)].compute()),
dataset_kwargs=dataset_kwargs,
)
disk_mtx = sparse_dataset(f[k])
for chunk_size in axis_chunks[1:]:
chunk_start = chunk_stop
chunk_stop += chunk_size
disk_mtx.append(elem[chunk_slice(chunk_start, chunk_stop)].compute())
@_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
@_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
def read_sparse(elem: GroupStorageType, *, _reader: Reader) -> CSMatrix | CSArray:
return sparse_dataset(elem).to_memory()
@_REGISTRY.register_read_partial(H5Group, IOSpec("csc_matrix", "0.1.0"))
@_REGISTRY.register_read_partial(H5Group, IOSpec("csr_matrix", "0.1.0"))
@_REGISTRY.register_read_partial(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
@_REGISTRY.register_read_partial(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))):
return sparse_dataset(elem)[indices]
#################
# Awkward array #
#################
@_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0"))
@_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0"))
@_REGISTRY.register_write(
H5Group, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0")
)
def write_awkward(
f: GroupStorageType,
k: str,
v: views.AwkwardArrayView | AwkArray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
from anndata.compat import awkward as ak
group = f.require_group(k)
if isinstance(v, views.AwkwardArrayView):
# copy to remove the view attributes
v = copy(v)
form, length, container = ak.to_buffers(ak.to_packed(v))
group.attrs["length"] = length
group.attrs["form"] = form.to_json()
for k, v in container.items():
_writer.write_elem(group, k, v, dataset_kwargs=dataset_kwargs)
@_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0"))
def read_awkward(elem: GroupStorageType, *, _reader: Reader) -> AwkArray:
from anndata.compat import awkward as ak
form = _read_attr(elem.attrs, "form")
length = _read_attr(elem.attrs, "length")
container = {k: _reader.read_elem(elem[k]) for k in elem.keys()}
return ak.from_buffers(form, int(length), container)
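# Hedged, illustrative sketch (not part of the upstream module), assuming the
# optional `awkward` dependency is installed: a ragged array round-trips through
# the buffer form written above.
def _example_awkward_roundtrip() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import awkward as ak
    import h5py
    import anndata as ad
    ragged = ak.Array([[1, 2, 3], [], [4]])
    with TemporaryDirectory() as tmp, h5py.File(f"{tmp}/awkward.h5", "w") as f:
        ad.io.write_elem(f, "ragged", ragged)
        assert ad.io.read_elem(f["ragged"]).to_list() == [[1, 2, 3], [], [4]]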
##############
# DataFrames #
##############
@_REGISTRY.register_write(H5Group, views.DataFrameView, IOSpec("dataframe", "0.2.0"))
@_REGISTRY.register_write(H5Group, pd.DataFrame, IOSpec("dataframe", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, views.DataFrameView, IOSpec("dataframe", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, pd.DataFrame, IOSpec("dataframe", "0.2.0"))
def write_dataframe(
f: GroupStorageType,
key: str,
df: views.DataFrameView | pd.DataFrame,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
# Check arguments
for reserved in ("_index",):
if reserved in df.columns:
msg = f"{reserved!r} is a reserved name for dataframe columns."
raise ValueError(msg)
group = _require_group_write_dataframe(f, key, df)
if not df.columns.is_unique:
duplicates = list(df.columns[df.columns.duplicated()])
msg = f"Found repeated column names: {duplicates}. Column names must be unique."
raise ValueError(msg)
col_names = [check_key(c) for c in df.columns]
group.attrs["column-order"] = col_names
if df.index.name is not None:
if df.index.name in col_names and not pd.Series(
df.index, index=df.index
).equals(df[df.index.name]):
msg = (
f"DataFrame.index.name ({df.index.name!r}) is also used by a column "
"whose values are different. This is not supported. Please make sure "
"the values are the same, or use a different name."
)
raise ValueError(msg)
index_name = df.index.name
else:
index_name = "_index"
group.attrs["_index"] = check_key(index_name)
    # `._values` is the best array representation: it is the array actually backing
    # the object, whereas `.values` always coerces to a np.ndarray and `.array`
    # always returns a pandas extension array.
_writer.write_elem(
group, index_name, df.index._values, dataset_kwargs=dataset_kwargs
)
for colname, series in df.items():
# TODO: this should write the "true" representation of the series (i.e. the underlying array or ndarray depending)
_writer.write_elem(
group, colname, series._values, dataset_kwargs=dataset_kwargs
)
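# Hedged, illustrative sketch (not part of the upstream module): a dataframe is
# stored as a group with one sub-element per column, plus the index under the
# key recorded in the `_index` attribute. Column names and values are made up.
def _example_dataframe_roundtrip() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import h5py
    import pandas as pd
    import anndata as ad
    df = pd.DataFrame(
        {"total_counts": [100, 250], "condition": ["ctrl", "treated"]},
        index=["cell_1", "cell_2"],
    )
    with TemporaryDirectory() as tmp, h5py.File(f"{tmp}/df.h5", "w") as f:
        ad.io.write_elem(f, "obs", df)
        assert f["obs"].attrs["_index"] == "_index"
        pd.testing.assert_frame_equal(ad.io.read_elem(f["obs"]), df)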
@_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0"))
def read_dataframe(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame:
columns = list(_read_attr(elem.attrs, "column-order"))
idx_key = _read_attr(elem.attrs, "_index")
df = pd.DataFrame(
{k: _reader.read_elem(elem[k]) for k in columns},
index=_reader.read_elem(elem[idx_key]),
columns=columns if len(columns) else None,
)
if idx_key != "_index":
df.index.name = idx_key
return df
# TODO: Figure out what indices is allowed to be at each element
@_REGISTRY.register_read_partial(H5Group, IOSpec("dataframe", "0.2.0"))
@_REGISTRY.register_read_partial(ZarrGroup, IOSpec("dataframe", "0.2.0"))
def read_dataframe_partial(
elem, *, items=None, indices=(slice(None, None), slice(None, None))
):
if items is not None:
columns = [
col for col in _read_attr(elem.attrs, "column-order") if col in items
]
else:
columns = list(_read_attr(elem.attrs, "column-order"))
idx_key = _read_attr(elem.attrs, "_index")
df = pd.DataFrame(
{k: read_elem_partial(elem[k], indices=indices[0]) for k in columns},
index=read_elem_partial(elem[idx_key], indices=indices[0]),
columns=columns if len(columns) else None,
)
if idx_key != "_index":
df.index.name = idx_key
return df
# Backwards compat dataframe reading
@_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0"))
def read_dataframe_0_1_0(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame:
columns = _read_attr(elem.attrs, "column-order")
idx_key = _read_attr(elem.attrs, "_index")
df = pd.DataFrame(
{k: read_series(elem[k]) for k in columns},
index=read_series(elem[idx_key]),
columns=columns if len(columns) else None,
)
if idx_key != "_index":
df.index.name = idx_key
return df
def read_series(dataset: h5py.Dataset) -> np.ndarray | pd.Categorical:
# For reading older dataframes
if "categories" in dataset.attrs:
if isinstance(dataset, ZarrArray):
import zarr
parent_name = dataset.name.rstrip(dataset.basename).strip("/")
parent = zarr.open(dataset.store, mode="r")[parent_name]
else:
parent = dataset.parent
categories_dset = parent[_read_attr(dataset.attrs, "categories")]
categories = read_elem(categories_dset)
ordered = bool(_read_attr(categories_dset.attrs, "ordered", default=False))
return pd.Categorical.from_codes(
read_elem(dataset), categories, ordered=ordered
)
else:
return read_elem(dataset)
@_REGISTRY.register_read_partial(H5Group, IOSpec("dataframe", "0.1.0"))
@_REGISTRY.register_read_partial(ZarrGroup, IOSpec("dataframe", "0.1.0"))
def read_partial_dataframe_0_1_0(
elem, *, items=None, indices=(slice(None), slice(None))
):
if items is None:
items = slice(None)
else:
items = list(items)
return read_elem(elem)[items].iloc[indices[0]]
###############
# Categorical #
###############
@_REGISTRY.register_write(H5Group, pd.Categorical, IOSpec("categorical", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, pd.Categorical, IOSpec("categorical", "0.2.0"))
def write_categorical(
f: GroupStorageType,
k: str,
v: pd.Categorical,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
g = f.require_group(k)
g.attrs["ordered"] = bool(v.ordered)
_writer.write_elem(g, "codes", v.codes, dataset_kwargs=dataset_kwargs)
_writer.write_elem(
g, "categories", v.categories._values, dataset_kwargs=dataset_kwargs
)
@_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0"))
def read_categorical(elem: GroupStorageType, *, _reader: Reader) -> pd.Categorical:
return pd.Categorical.from_codes(
codes=_reader.read_elem(elem["codes"]),
categories=_reader.read_elem(elem["categories"]),
ordered=bool(_read_attr(elem.attrs, "ordered")),
)
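# Hedged, illustrative sketch (not part of the upstream module): a categorical is
# stored as integer `codes` plus its `categories` and an `ordered` flag, and
# reads back unchanged.
def _example_categorical_roundtrip() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import h5py
    import pandas as pd
    import anndata as ad
    cat = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=True)
    with TemporaryDirectory() as tmp, h5py.File(f"{tmp}/cat.h5", "w") as f:
        ad.io.write_elem(f, "cell_type", cat)
        roundtrip = ad.io.read_elem(f["cell_type"])
    assert (roundtrip == cat).all()
    assert list(roundtrip.categories) == ["a", "b", "c"] and roundtrip.ordered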
@_REGISTRY.register_read_partial(H5Group, IOSpec("categorical", "0.2.0"))
@_REGISTRY.register_read_partial(ZarrGroup, IOSpec("categorical", "0.2.0"))
def read_partial_categorical(elem, *, items=None, indices=(slice(None),)):
return pd.Categorical.from_codes(
codes=read_elem_partial(elem["codes"], indices=indices),
categories=read_elem(elem["categories"]),
ordered=bool(_read_attr(elem.attrs, "ordered")),
)
####################
# Pandas nullables #
####################
@_REGISTRY.register_write(
H5Group, pd.arrays.IntegerArray, IOSpec("nullable-integer", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, pd.arrays.IntegerArray, IOSpec("nullable-integer", "0.1.0")
)
@_REGISTRY.register_write(
H5Group, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0")
)
@_REGISTRY.register_write(
H5Group, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
)
@_REGISTRY.register_write(
ZarrGroup, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
)
def write_nullable(
f: GroupStorageType,
k: str,
v: pd.arrays.IntegerArray | pd.arrays.BooleanArray | pd.arrays.StringArray,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
if (
isinstance(v, pd.arrays.StringArray)
and not settings.allow_write_nullable_strings
):
        msg = (
            "`anndata.settings.allow_write_nullable_strings` is False, "
            "because writing of `pd.arrays.StringArray` is new "
            "and not supported in anndata < 0.11, which is still used by many people. "
            "Opt in to writing these arrays by toggling the setting to True."
        )
raise RuntimeError(msg)
g = f.require_group(k)
values = (
v.to_numpy(na_value="")
if isinstance(v, pd.arrays.StringArray)
else v.to_numpy(na_value=0, dtype=v.dtype.numpy_dtype)
)
_writer.write_elem(g, "values", values, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "mask", v.isna(), dataset_kwargs=dataset_kwargs)
def _read_nullable(
elem: GroupStorageType,
*,
_reader: Reader,
# BaseMaskedArray
array_type: Callable[
[NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray
],
) -> pd.api.extensions.ExtensionArray:
return array_type(
_reader.read_elem(elem["values"]),
mask=_reader.read_elem(elem["mask"]),
)
def _string_array(
values: np.ndarray, mask: np.ndarray
) -> pd.api.extensions.ExtensionArray:
"""Construct a string array from values and mask."""
arr = pd.array(values, dtype=pd.StringDtype())
arr[mask] = pd.NA
return arr
_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))(
read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray)
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))(
read_nullable_integer
)
_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))(
read_nullable_boolean := partial(_read_nullable, array_type=pd.arrays.BooleanArray)
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))(
read_nullable_boolean
)
_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))(
read_nullable_string := partial(_read_nullable, array_type=_string_array)
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))(
read_nullable_string
)
###########
# Scalars #
###########
@_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0"))
@_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0"))
def read_scalar(elem: ArrayStorageType, *, _reader: Reader) -> np.number:
# TODO: `item` ensures the return is in fact a scalar (needed after zarr v3 which now returns a 1 elem array)
# https://github.com/zarr-developers/zarr-python/issues/2713
return elem[()].item()
def _remove_scalar_compression_args(dataset_kwargs: Mapping[str, Any]) -> dict:
# Can’t compress scalars, error is thrown
dataset_kwargs = dict(dataset_kwargs)
for arg in (
"compression",
"compression_opts",
"chunks",
"shuffle",
"fletcher32",
"scaleoffset",
"compressor",
):
dataset_kwargs.pop(arg, None)
return dataset_kwargs
def write_scalar_zarr(
f: ZarrGroup,
key: str,
value,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
# these args are ignored in v2: https://zarr.readthedocs.io/en/v2.18.4/api/hierarchy.html#zarr.hierarchy.Group.create_dataset
# and error out in v3
dataset_kwargs = _remove_scalar_compression_args(dataset_kwargs)
if is_zarr_v2():
return f.create_dataset(key, data=np.array(value), shape=(), **dataset_kwargs)
else:
from numcodecs import VLenUTF8
match ad.settings.zarr_write_format, value:
case 2, str():
filters, dtype = [VLenUTF8()], object
case 3, str():
filters, dtype = None, np.dtypes.StringDType()
case _, _:
filters, dtype = None, np.array(value).dtype
a = f.create_array(
key,
shape=(),
dtype=dtype,
filters=filters,
**dataset_kwargs,
)
a[...] = np.array(value)
def write_hdf5_scalar(
f: H5Group,
key: str,
value,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
# Can’t compress scalars, error is thrown
dataset_kwargs = _remove_scalar_compression_args(dataset_kwargs)
f.create_dataset(key, data=np.array(value), **dataset_kwargs)
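# Hedged, illustrative sketch (not part of the upstream module): numeric scalars
# (and string scalars, handled further below) are written as zero-dimensional
# datasets and read back as plain Python values.
def _example_scalar_roundtrip() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import h5py
    import anndata as ad
    with TemporaryDirectory() as tmp, h5py.File(f"{tmp}/scalars.h5", "w") as f:
        ad.io.write_elem(f, "n_cells", 2700)
        ad.io.write_elem(f, "title", "pbmc3k")
        assert ad.io.read_elem(f["n_cells"]) == 2700
        assert ad.io.read_elem(f["title"]) == "pbmc3k"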
for numeric_scalar_type in [
*(bool, np.bool_),
*(np.uint8, np.uint16, np.uint32, np.uint64),
*(int, np.int8, np.int16, np.int32, np.int64),
*(float, *np.floating.__subclasses__()),
*np.complexfloating.__subclasses__(),
]:
_REGISTRY.register_write(
H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
)(write_hdf5_scalar)
_REGISTRY.register_write(
ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
)(write_scalar_zarr)
_REGISTRY.register_write(ZarrGroup, str, IOSpec("string", "0.2.0"))(write_scalar_zarr)
_REGISTRY.register_write(ZarrGroup, np.str_, IOSpec("string", "0.2.0"))(
write_scalar_zarr
)
@_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0"))
def read_hdf5_string(elem: H5Array, *, _reader: Reader) -> str:
return elem.asstr()[()]
@_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0"))
def read_zarr_string(elem: ZarrArray, *, _reader: Reader) -> str:
return str(elem[()])
_REGISTRY.register_read(H5Array, IOSpec("bytes", "0.2.0"))(read_scalar)
_REGISTRY.register_read(ZarrArray, IOSpec("bytes", "0.2.0"))(read_scalar)
@_REGISTRY.register_write(H5Group, np.str_, IOSpec("string", "0.2.0"))
@_REGISTRY.register_write(H5Group, str, IOSpec("string", "0.2.0"))
def write_string(
f: H5Group,
k: str,
v: np.str_ | str,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any],
):
dataset_kwargs = dataset_kwargs.copy()
dataset_kwargs.pop("compression", None)
dataset_kwargs.pop("compression_opts", None)
f.create_dataset(
k, data=np.array(v, dtype=h5py.string_dtype(encoding="utf-8")), **dataset_kwargs
)
# @_REGISTRY.register_write(np.bytes_, IOSpec("bytes", "0.2.0"))
# @_REGISTRY.register_write(bytes, IOSpec("bytes", "0.2.0"))
# def write_string(f, k, v, dataset_kwargs):
# if "compression" in dataset_kwargs:
# dataset_kwargs = dict(dataset_kwargs)
# dataset_kwargs.pop("compression")
# f.create_dataset(k, data=np.array(v), **dataset_kwargs)
python-anndata-0.12.0~rc1/src/anndata/_io/specs/registry.py 0000664 0000000 0000000 00000041112 15003706322 0023622 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import inspect
import warnings
from collections.abc import Mapping
from dataclasses import dataclass
from functools import partial, singledispatch, wraps
from types import MappingProxyType
from typing import TYPE_CHECKING, Generic, TypeVar
from anndata._io.utils import report_read_key_on_error, report_write_key_on_error
from anndata._types import Read, ReadLazy, _ReadInternal, _ReadLazyInternal
from anndata.compat import DaskArray, ZarrGroup, _read_attr, is_zarr_v2
if TYPE_CHECKING:
from collections.abc import Callable, Generator, Iterable
from typing import Any
from anndata._types import (
GroupStorageType,
ReadCallback,
StorageType,
Write,
WriteCallback,
_WriteInternal,
)
from anndata.experimental.backed._compat import Dataset2D
from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray
from anndata.typing import RWAble
T = TypeVar("T")
W = TypeVar("W", bound=_WriteInternal)
LazyDataStructures = DaskArray | Dataset2D | CategoricalArray | MaskedArray
# TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-"
# TODO: Should filetype be included in the IOSpec if it changes the encoding? Or does the intent that these things be "the same" overrule that?
@dataclass(frozen=True)
class IOSpec:
encoding_type: str
encoding_version: str
# TODO: Should this subclass from LookupError?
class IORegistryError(Exception):
@classmethod
def _from_write_parts(
cls, dest_type: type, typ: type | tuple[type, str], modifiers: frozenset[str]
) -> IORegistryError:
msg = f"No method registered for writing {typ} into {dest_type}"
if modifiers:
msg += f" with {modifiers}"
return cls(msg)
@classmethod
def _from_read_parts(
cls,
method: str,
registry: Mapping,
src_typ: type[StorageType],
spec: IOSpec,
) -> IORegistryError:
# TODO: Improve error message if type exists, but version does not
msg = (
f"No {method} method registered for {spec} from {src_typ}. "
"You may need to update your installation of anndata."
)
return cls(msg)
def write_spec(spec: IOSpec):
def decorator(func: W) -> W:
@wraps(func)
def wrapper(g: GroupStorageType, k: str, *args, **kwargs):
result = func(g, k, *args, **kwargs)
g[k].attrs.setdefault("encoding-type", spec.encoding_type)
g[k].attrs.setdefault("encoding-version", spec.encoding_version)
return result
return wrapper
return decorator
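# Hedged, illustrative sketch (not part of the upstream module): because every
# registered writer is wrapped by `write_spec`, written elements carry encoding
# metadata in their attributes. The file name below is a throwaway placeholder.
def _example_encoding_metadata() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import h5py
    import numpy as np
    import anndata as ad
    with TemporaryDirectory() as tmp, h5py.File(f"{tmp}/spec.h5", "w") as f:
        ad.io.write_elem(f, "x", np.arange(3))
        assert f["x"].attrs["encoding-type"] == "array"
        assert f["x"].attrs["encoding-version"] == "0.2.0"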
_R = TypeVar("_R", _ReadInternal, _ReadLazyInternal)
R = TypeVar("R", Read, ReadLazy)
class IORegistry(Generic[_R, R]):
def __init__(self):
self.read: dict[tuple[type, IOSpec, frozenset[str]], _R] = {}
self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {}
self.write: dict[
tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal
] = {}
self.write_specs: dict[type | tuple[type, str] | tuple[type, type], IOSpec] = {}
def register_write(
self,
dest_type: type,
src_type: type | tuple[type, str],
spec: IOSpec | Mapping[str, str],
modifiers: Iterable[str] = frozenset(),
) -> Callable[[_WriteInternal[T]], _WriteInternal[T]]:
spec = proc_spec(spec)
modifiers = frozenset(modifiers)
# Record specification for src_type
if src_type in self.write_specs and (spec != self.write_specs[src_type]):
# First check for consistency
current_spec = self.write_specs[src_type]
msg = (
"Cannot overwrite IO specifications. Attempted to overwrite encoding "
f"for {src_type} from {current_spec} to {spec}"
)
raise TypeError(msg)
else:
self.write_specs[src_type] = spec
def _register(func):
self.write[(dest_type, src_type, modifiers)] = write_spec(spec)(func)
return func
return _register
def get_write(
self,
dest_type: type,
src_type: type | tuple[type, str],
modifiers: frozenset[str] = frozenset(),
*,
writer: Writer,
) -> Write:
import h5py
if dest_type is h5py.File:
dest_type = h5py.Group
if (dest_type, src_type, modifiers) not in self.write:
raise IORegistryError._from_write_parts(dest_type, src_type, modifiers)
internal = self.write[(dest_type, src_type, modifiers)]
return partial(internal, _writer=writer)
def has_write(
self,
dest_type: type,
src_type: type | tuple[type, str],
modifiers: frozenset[str],
) -> bool:
return (dest_type, src_type, modifiers) in self.write
def register_read(
self,
src_type: type,
spec: IOSpec | Mapping[str, str],
modifiers: Iterable[str] = frozenset(),
) -> Callable[[_R], _R]:
spec = proc_spec(spec)
modifiers = frozenset(modifiers)
def _register(func):
self.read[(src_type, spec, modifiers)] = func
return func
return _register
def get_read(
self,
src_type: type,
spec: IOSpec,
modifiers: frozenset[str] = frozenset(),
*,
reader: Reader,
) -> R:
if (src_type, spec, modifiers) not in self.read:
raise IORegistryError._from_read_parts("read", self.read, src_type, spec) # noqa: EM101
internal = self.read[(src_type, spec, modifiers)]
return partial(internal, _reader=reader)
def has_read(
self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset()
) -> bool:
return (src_type, spec, modifiers) in self.read
def register_read_partial(
self,
src_type: type,
spec: IOSpec | Mapping[str, str],
modifiers: Iterable[str] = frozenset(),
):
spec = proc_spec(spec)
modifiers = frozenset(modifiers)
def _register(func):
self.read_partial[(src_type, spec, modifiers)] = func
return func
return _register
def get_partial_read(
self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset()
):
if (src_type, spec, modifiers) in self.read_partial:
return self.read_partial[(src_type, spec, modifiers)]
name = "read_partial"
raise IORegistryError._from_read_parts(name, self.read_partial, src_type, spec)
def get_spec(self, elem: Any) -> IOSpec:
if isinstance(elem, DaskArray):
if (typ_meta := (DaskArray, type(elem._meta))) in self.write_specs:
return self.write_specs[typ_meta]
elif hasattr(elem, "dtype"):
if (typ_kind := (type(elem), elem.dtype.kind)) in self.write_specs:
return self.write_specs[typ_kind]
return self.write_specs[type(elem)]
_REGISTRY: IORegistry[_ReadInternal, Read] = IORegistry()
_LAZY_REGISTRY: IORegistry[_ReadLazyInternal, ReadLazy] = IORegistry()
@singledispatch
def proc_spec(spec) -> IOSpec:
msg = f"proc_spec not defined for type: {type(spec)}."
raise NotImplementedError(msg)
@proc_spec.register(IOSpec)
def proc_spec_spec(spec: IOSpec) -> IOSpec:
return spec
@proc_spec.register(Mapping)
def proc_spec_mapping(spec: Mapping[str, str]) -> IOSpec:
return IOSpec(**{k.replace("-", "_"): v for k, v in spec.items()})
def get_spec(
elem: StorageType,
) -> IOSpec:
return proc_spec(
{
k: _read_attr(elem.attrs, k, "")
for k in ["encoding-type", "encoding-version"]
}
)
def _iter_patterns(
elem,
) -> Generator[tuple[type, type | str] | tuple[type, type, str], None, None]:
"""Iterates over possible patterns for an element in order of precedence."""
from anndata.compat import DaskArray
t = type(elem)
if isinstance(elem, DaskArray):
yield (t, type(elem._meta), elem.dtype.kind)
yield (t, type(elem._meta))
if hasattr(elem, "dtype"):
yield (t, elem.dtype.kind)
yield t
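# Hedged, illustrative sketch (not part of the upstream module): the most
# specific pattern is yielded first, so a unicode numpy array is matched as
# (np.ndarray, "U") before falling back to plain np.ndarray.
def _example_iter_patterns() -> None:  # pragma: no cover
    import numpy as np
    patterns = list(_iter_patterns(np.array(["a", "b"])))
    assert patterns == [(np.ndarray, "U"), np.ndarray]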
class Reader:
def __init__(
self, registry: IORegistry, callback: ReadCallback | None = None
) -> None:
self.registry = registry
self.callback = callback
@report_read_key_on_error
def read_elem(
self,
elem: StorageType,
modifiers: frozenset[str] = frozenset(),
) -> RWAble:
"""Read an element from a store. See exported function for more details."""
iospec = get_spec(elem)
read_func: Read = self.registry.get_read(
type(elem), iospec, modifiers, reader=self
)
if self.callback is None:
return read_func(elem)
return self.callback(read_func, elem.name, elem, iospec=iospec)
class LazyReader(Reader):
@report_read_key_on_error
def read_elem(
self,
elem: StorageType,
modifiers: frozenset[str] = frozenset(),
chunks: tuple[int, ...] | None = None,
**kwargs,
) -> LazyDataStructures:
"""Read a dask element from a store. See exported function for more details."""
iospec = get_spec(elem)
read_func: ReadLazy = self.registry.get_read(
type(elem), iospec, modifiers, reader=self
)
if self.callback is not None:
msg = "Dask reading does not use a callback. Ignoring callback."
warnings.warn(msg, stacklevel=2)
read_params = inspect.signature(read_func).parameters
for kwarg in kwargs:
if kwarg not in read_params:
                msg = (
                    f"Keyword argument {kwarg} passed to read_elem_lazy is not "
                    "supported by the registered read function."
                )
raise ValueError(msg)
if "chunks" in read_params:
kwargs["chunks"] = chunks
return read_func(elem, **kwargs)
class Writer:
def __init__(self, registry: IORegistry, callback: WriteCallback | None = None):
self.registry = registry
self.callback = callback
def find_write_func(
self, dest_type: type, elem: Any, modifiers: frozenset[str]
) -> Write:
for pattern in _iter_patterns(elem):
if self.registry.has_write(dest_type, pattern, modifiers):
return self.registry.get_write(
dest_type, pattern, modifiers, writer=self
)
# Raises IORegistryError
return self.registry.get_write(dest_type, type(elem), modifiers, writer=self)
@report_write_key_on_error
def write_elem(
self,
store: GroupStorageType,
k: str,
elem: RWAble,
*,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
modifiers: frozenset[str] = frozenset(),
):
from pathlib import PurePosixPath
import h5py
# we allow stores to have a prefix like /uns which are then written to with keys like /uns/foo
if "/" in k.split(store.name)[-1][1:]:
msg = "Forward slashes are not allowed in keys."
raise ValueError(msg)
if isinstance(store, h5py.File):
store = store["/"]
dest_type = type(store)
# Normalize k to absolute path
if (
(isinstance(store, ZarrGroup) and is_zarr_v2())
or isinstance(store, h5py.Group)
and not PurePosixPath(k).is_absolute()
):
k = str(PurePosixPath(store.name) / k)
if k == "/":
if isinstance(store, ZarrGroup) and not is_zarr_v2():
from zarr.core.sync import sync
sync(store.store.clear())
else:
store.clear()
elif k in store:
del store[k]
write_func = self.find_write_func(dest_type, elem, modifiers)
if self.callback is None:
return write_func(store, k, elem, dataset_kwargs=dataset_kwargs)
return self.callback(
write_func,
store,
k,
elem,
dataset_kwargs=dataset_kwargs,
iospec=self.registry.get_spec(elem),
)
def read_elem(elem: StorageType) -> RWAble:
"""
Read an element from a store.
Assumes that the element is encoded using the anndata encoding. This function will
determine the encoded type using the encoding metadata stored in elem's attributes.
Params
------
elem
The stored element.
"""
return Reader(_REGISTRY).read_elem(elem)
def read_elem_lazy(
elem: StorageType, chunks: tuple[int, ...] | None = None, **kwargs
) -> LazyDataStructures:
"""
Read an element from a store lazily.
Assumes that the element is encoded using the anndata encoding. This function will
determine the encoded type using the encoding metadata stored in elem's attributes.
Parameters
----------
elem
The stored element.
chunks, optional
        a tuple of length `n`, where `n` is the number of dimensions of the underlying array.
Note that the minor axis dimension must match the shape for sparse.
Defaults to `(1000, adata.shape[1])` for CSR sparse,
`(adata.shape[0], 1000)` for CSC sparse,
and the on-disk chunking otherwise for dense.
Can use `-1` or `None` to indicate use of the size of the corresponding dimension.
Returns
-------
A "lazy" elem
Examples
--------
Setting up our example:
>>> from scanpy.datasets import pbmc3k
>>> import tempfile
>>> import anndata as ad
>>> import zarr
>>> tmp_path = tempfile.gettempdir()
>>> zarr_path = tmp_path + "/adata.zarr"
>>> adata = pbmc3k()
>>> adata.layers["dense"] = adata.X.toarray()
>>> adata.write_zarr(zarr_path)
Reading a sparse matrix from a zarr store lazily, with custom chunk size and default:
>>> g = zarr.open(zarr_path)
>>> adata.X = ad.experimental.read_elem_lazy(g["X"])
>>> adata.X
dask.array
>>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, adata.shape[1]))
>>> adata.X
dask.array
Reading a dense matrix from a zarr store lazily:
>>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"])
>>> adata.layers["dense"]
dask.array
Making a new anndata object from on-disk, with custom chunks:
>>> adata = ad.AnnData(
... obs=ad.io.read_elem(g["obs"]),
... var=ad.io.read_elem(g["var"]),
... uns=ad.io.read_elem(g["uns"]),
... obsm=ad.io.read_elem(g["obsm"]),
... varm=ad.io.read_elem(g["varm"]),
... )
>>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, adata.shape[1]))
>>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"])
    We also support using -1 and None as a chunk size to signify reading the whole axis:
>>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, -1))
>>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, None))
"""
return LazyReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks, **kwargs)
def write_elem(
store: GroupStorageType,
k: str,
elem: RWAble,
*,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
) -> None:
"""
Write an element to a storage group using anndata encoding.
Params
------
store
The group to write to.
k
The key to write to in the group. Note that absolute paths will be written
from the root.
elem
The element to write. Typically an in-memory object, e.g. an AnnData, pandas
dataframe, scipy sparse matrix, etc.
dataset_kwargs
        Keyword arguments to pass to the store's dataset creation function.
E.g. for zarr this would be `chunks`, `compressor`.
"""
Writer(_REGISTRY).write_elem(store, k, elem, dataset_kwargs=dataset_kwargs)
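# Hedged, illustrative usage sketch (not part of the upstream module): writing an
# array to a zarr group with custom dataset keyword arguments and reading it
# back. The store path and chunk size are arbitrary choices.
def _example_write_and_read_elem() -> None:  # pragma: no cover
    from tempfile import TemporaryDirectory
    import numpy as np
    import zarr
    with TemporaryDirectory() as tmp:
        g = zarr.open(f"{tmp}/example.zarr", mode="w")
        write_elem(g, "x", np.arange(10), dataset_kwargs={"chunks": (5,)})
        assert (read_elem(g["x"]) == np.arange(10)).all()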
# TODO: If all items would be read, just call normal read method
def read_elem_partial(
elem,
*,
items=None,
indices=(slice(None), slice(None)),
modifiers: frozenset[str] = frozenset(),
):
"""Read part of an element from an on disk store."""
read_partial = _REGISTRY.get_partial_read(
type(elem), get_spec(elem), frozenset(modifiers)
)
return read_partial(elem, items=items, indices=indices)
python-anndata-0.12.0~rc1/src/anndata/_io/utils.py 0000664 0000000 0000000 00000022567 15003706322 0022012 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from functools import WRAPPER_ASSIGNMENTS, wraps
from itertools import pairwise
from typing import TYPE_CHECKING, cast
from warnings import warn
import h5py
from packaging.version import Version
from .._core.sparse_dataset import BaseCompressedSparseDataset
if TYPE_CHECKING:
from collections.abc import Callable, Mapping
from typing import Any, Literal
from .._types import ContravariantRWAble, StorageType, _WriteInternal
from ..compat import H5Group, ZarrGroup
from .specs.registry import Writer
Storage = StorageType | BaseCompressedSparseDataset
# For allowing h5py v3
# https://github.com/scverse/anndata/issues/442
H5PY_V3 = Version(h5py.__version__).major >= 3
# -------------------------------------------------------------------------------
# Type conversion
# -------------------------------------------------------------------------------
# Could be numba’d if it returned tuples instead of slices
def idx_chunks_along_axis(shape: tuple, axis: int, chunk_size: int):
"""\
Gives indexer tuples chunked along an axis.
Params
------
shape
Shape of array to be chunked
axis
Axis to chunk along
chunk_size
Size of chunk along axis
Returns
-------
An iterator of tuples for indexing into an array of passed shape.
"""
total = shape[axis]
cur = 0
mutable_idx = [slice(None) for i in range(len(shape))]
while cur + chunk_size < total:
mutable_idx[axis] = slice(cur, cur + chunk_size)
yield tuple(mutable_idx)
cur += chunk_size
mutable_idx[axis] = slice(cur, None)
yield tuple(mutable_idx)
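# Hedged, illustrative sketch (not part of the upstream module): chunking a
# (5, 2)-shaped array along axis 0 with chunk_size=2 yields three indexer
# tuples, the last one covering the remainder.
def _example_idx_chunks_along_axis() -> None:  # pragma: no cover
    chunks = list(idx_chunks_along_axis((5, 2), axis=0, chunk_size=2))
    assert chunks == [
        (slice(0, 2), slice(None)),
        (slice(2, 4), slice(None)),
        (slice(4, None), slice(None)),
    ]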
def is_float(string):
"""\
Check whether string is float.
See also
--------
http://stackoverflow.com/questions/736043/checking-if-a-string-can-be-converted-to-float-in-python
"""
try:
float(string)
return True
except ValueError:
return False
def is_int(string):
"""Check whether string is integer."""
try:
int(string)
return True
except ValueError:
return False
def convert_bool(string):
"""Check whether string is boolean."""
if string == "True":
return True, True
elif string == "False":
return True, False
else:
return False, False
def convert_string(string):
"""Convert string to int, float or bool."""
if is_int(string):
return int(string)
elif is_float(string):
return float(string)
elif convert_bool(string)[0]:
return convert_bool(string)[1]
elif string == "None":
return None
else:
return string
def check_key(key):
    """Check that the passed value is a valid h5py key.
    Converts it if there is an obvious conversion path, errors otherwise.
"""
typ = type(key)
if issubclass(typ, str):
return str(key)
# TODO: Should I try to decode bytes? It's what h5py would do,
# but it will be read out as a str.
# elif issubclass(typ, bytes):
# return key
else:
msg = f"{key} of type {typ} is an invalid key. Should be str."
raise TypeError(msg)
# -------------------------------------------------------------------------------
# Generic functions
# -------------------------------------------------------------------------------
def read_attribute(*args, **kwargs):
from .specs import read_elem
warn(
"This internal function has been deprecated, please use read_elem instead",
FutureWarning,
)
return read_elem(*args, **kwargs)
def write_attribute(*args, **kwargs):
from .specs import write_elem
warn(
"This internal function has been deprecated, please use write_elem instead",
FutureWarning,
)
return write_elem(*args, **kwargs)
# -------------------------------------------------------------------------------
# Errors handling
# -------------------------------------------------------------------------------
# TODO: Is there a consistent way to do this which just modifies the previously
# thrown error? Could do a warning?
class AnnDataReadError(OSError):
"""Error caused while trying to read in AnnData."""
pass
def _get_display_path(store: Storage) -> str:
"""Return an absolute path of an element (always starts with “/”)."""
if isinstance(store, BaseCompressedSparseDataset):
store = store.group
path = store.name or "??" # can be None
return f"/{path.removeprefix('/')}"
def add_key_note(
e: BaseException, store: Storage, path: str, key: str, op: Literal["read", "writ"]
) -> None:
if any(
f"Error raised while {op}ing key" in note
for note in getattr(e, "__notes__", [])
):
return
dir = "to" if op == "writ" else "from"
msg = f"Error raised while {op}ing key {key!r} of {type(store)} {dir} {path}"
e.add_note(msg)
def report_read_key_on_error(func):
"""\
    A decorator for hdf5/zarr element reading which reports the key involved when an error is raised.
Example
-------
>>> import zarr
>>> import numpy as np
>>> @report_read_key_on_error
... def read_arr(group):
... raise NotImplementedError()
>>> z = zarr.open("tmp.zarr", mode="w")
>>> z["X"] = np.array([1, 2, 3])
>>> read_arr(z["X"]) # doctest: +SKIP
"""
@wraps(func)
def func_wrapper(*args, **kwargs):
from anndata._io.specs import Reader
# Figure out signature (method vs function) by going through args
for arg in args:
if not isinstance(arg, Reader):
store = cast("Storage", arg)
break
else:
msg = "No element found in args."
raise ValueError(msg)
try:
return func(*args, **kwargs)
except Exception as e:
path, key = _get_display_path(store).rsplit("/", 1)
add_key_note(e, store, path or "/", key, "read")
raise
return func_wrapper
def report_write_key_on_error(func):
"""\
    A decorator for hdf5/zarr element writing which reports the key involved when an error is raised.
Example
-------
>>> import zarr
>>> @report_write_key_on_error
... def write_arr(group, key, val):
... raise NotImplementedError()
>>> z = zarr.open("tmp.zarr", mode="w")
>>> X = [1, 2, 3]
>>> write_arr(z, "X", X) # doctest: +SKIP
"""
@wraps(func)
def func_wrapper(*args, **kwargs):
from anndata._io.specs import Writer
# Figure out signature (method vs function) by going through args
for arg, key in pairwise(args):
if not isinstance(arg, Writer):
store = cast("Storage", arg)
break
else:
msg = "No element found in args."
raise ValueError(msg)
try:
return func(*args, **kwargs)
except Exception as e:
path = _get_display_path(store)
add_key_note(e, store, path, key, "writ")
raise
return func_wrapper
# -------------------------------------------------------------------------------
# Common h5ad/zarr stuff
# -------------------------------------------------------------------------------
def _read_legacy_raw(
f: ZarrGroup | H5Group,
modern_raw, # TODO: type
read_df: Callable,
read_attr: Callable,
*,
attrs=("X", "var", "varm"),
) -> dict:
"""\
Backwards compat for reading legacy raw.
Makes sure that no modern raw group coexists with legacy raw.* groups.
"""
if modern_raw:
if any(k.startswith("raw.") for k in f):
what = f"File {f.filename}" if hasattr(f, "filename") else "Store"
msg = f"{what} has both legacy and current raw formats."
raise ValueError(msg)
return modern_raw
raw = {}
if "X" in attrs and "raw.X" in f:
raw["X"] = read_attr(f["raw.X"])
if "var" in attrs and "raw.var" in f:
raw["var"] = read_df(f["raw.var"]) # Backwards compat
if "varm" in attrs and "raw.varm" in f:
raw["varm"] = read_attr(f["raw.varm"])
return raw
def zero_dim_array_as_scalar(func: _WriteInternal):
"""\
A decorator for write_elem implementations of arrays where zero-dimensional arrays need special handling.
"""
@wraps(func, assigned=WRAPPER_ASSIGNMENTS + ("__defaults__", "__kwdefaults__"))
def func_wrapper(
f: StorageType,
k: str,
elem: ContravariantRWAble,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any],
):
if elem.shape == ():
_writer.write_elem(f, k, elem[()], dataset_kwargs=dataset_kwargs)
else:
func(f, k, elem, _writer=_writer, dataset_kwargs=dataset_kwargs)
return func_wrapper
def no_write_dataset_2d(write):
def raise_error_if_dataset_2d_present(store, adata, *args, **kwargs):
from anndata.experimental.backed._compat import has_dataset_2d
if has_dataset_2d(adata):
        msg = (
            "Writing AnnData objects with a Dataset2D is not supported yet. "
            "Please use `ds.to_memory` to bring the dataset into memory. "
            "Note that if you have generated this object by concatenating "
            "several `AnnData` objects, the original types may be lost."
        )
raise NotImplementedError(msg)
return write(store, adata, *args, **kwargs)
return raise_error_if_dataset_2d_present
python-anndata-0.12.0~rc1/src/anndata/_io/write.py 0000664 0000000 0000000 00000011420 15003706322 0021766 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import math
import warnings
from os import fspath
from pathlib import Path
from typing import TYPE_CHECKING
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from anndata._io.utils import no_write_dataset_2d
from .._warnings import WriteWarning
from ..compat import old_positionals
from ..logging import get_logger
if TYPE_CHECKING:
from os import PathLike
from .. import AnnData
logger = get_logger(__name__)
@no_write_dataset_2d
@old_positionals("skip_data", "sep")
def write_csvs(
dirname: PathLike[str] | str,
adata: AnnData,
*,
skip_data: bool = True,
sep: str = ",",
):
"""See :meth:`~anndata.AnnData.write_csvs`."""
dirname = Path(dirname)
if dirname.suffix == ".csv":
dirname = dirname.with_suffix("")
logger.info(f"writing .csv files to {dirname}")
if not dirname.is_dir():
dirname.mkdir(parents=True, exist_ok=True)
dir_uns = dirname / "uns"
if not dir_uns.is_dir():
dir_uns.mkdir(parents=True, exist_ok=True)
d = dict(
obs=adata._obs,
var=adata._var,
obsm=adata.obsm.to_df(),
varm=adata.varm.to_df(),
)
if not skip_data:
d["X"] = pd.DataFrame(adata.X.toarray() if issparse(adata.X) else adata.X)
d_write = {**d, **adata._uns}
not_yet_raised_sparse_warning = True
for key, value in d_write.items():
if issparse(value):
if not_yet_raised_sparse_warning:
warnings.warn("Omitting to write sparse annotation.", WriteWarning)
not_yet_raised_sparse_warning = False
continue
filename = dirname
if key not in {"X", "var", "obs", "obsm", "varm"}:
filename = dir_uns
filename /= f"{key}.csv"
df = value
if not isinstance(value, pd.DataFrame):
value = np.array(value)
if np.ndim(value) == 0:
value = value[None]
try:
df = pd.DataFrame(value)
except Exception as e:
warnings.warn(
f"Omitting to write {key!r} of type {type(e)}.",
WriteWarning,
)
continue
df.to_csv(
filename,
sep=sep,
header=key in {"obs", "var", "obsm", "varm"},
index=key in {"obs", "var"},
)
@no_write_dataset_2d
@old_positionals("write_obsm_varm")
def write_loom(
filename: PathLike[str] | str, adata: AnnData, *, write_obsm_varm: bool = False
) -> None:
"""See :meth:`~anndata.AnnData.write_loom`."""
filename = Path(filename)
row_attrs = {k: np.array(v) for k, v in adata.var.to_dict("list").items()}
row_names = adata.var_names
row_dim = row_names.name if row_names.name is not None else "var_names"
row_attrs[row_dim] = row_names.values
col_attrs = {k: np.array(v) for k, v in adata.obs.to_dict("list").items()}
col_names = adata.obs_names
col_dim = col_names.name if col_names.name is not None else "obs_names"
col_attrs[col_dim] = col_names.values
if adata.X is None:
msg = "loompy does not accept empty matrices as data"
raise ValueError(msg)
if write_obsm_varm:
for key in adata.obsm.keys():
col_attrs[key] = adata.obsm[key]
for key in adata.varm.keys():
row_attrs[key] = adata.varm[key]
elif len(adata.obsm.keys()) > 0 or len(adata.varm.keys()) > 0:
logger.warning(
f"The loom file will lack these fields:\n"
f"{adata.obsm.keys() | adata.varm.keys()}\n"
f"Use write_obsm_varm=True to export multi-dimensional annotations"
)
layers = {"": adata.X.T}
for key in adata.layers.keys():
layers[key] = adata.layers[key].T
from loompy import create
if filename.exists():
filename.unlink()
create(fspath(filename), layers, row_attrs=row_attrs, col_attrs=col_attrs)
def _get_chunk_indices(za):
# TODO: does zarr provide code for this?
"""\
Return all the indices (coordinates) for the chunks in a zarr array,
even empty ones.
"""
return [
(i, j)
for i in range(int(math.ceil(float(za.shape[0]) / za.chunks[0])))
for j in range(int(math.ceil(float(za.shape[1]) / za.chunks[1])))
]
def _write_in_zarr_chunks(za, key, value):
if key != "X":
za[:] = value # don’t chunk metadata
else:
for ci in _get_chunk_indices(za):
s0, e0 = za.chunks[0] * ci[0], za.chunks[0] * (ci[0] + 1)
s1, e1 = za.chunks[1] * ci[1], za.chunks[1] * (ci[1] + 1)
            print(ci, s0, e0, s1, e1)
if issparse(value):
za[s0:e0, s1:e1] = value[s0:e0, s1:e1].todense()
else:
za[s0:e0, s1:e1] = value[s0:e0, s1:e1]
python-anndata-0.12.0~rc1/src/anndata/_io/zarr.py 0000664 0000000 0000000 00000013033 15003706322 0021614 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar
from warnings import warn
import numpy as np
import pandas as pd
import zarr
from scipy import sparse
from .._core.anndata import AnnData
from .._settings import settings
from .._warnings import OldFormatWarning
from ..compat import _clean_uns, _from_fixed_length_strings, is_zarr_v2
from ..experimental import read_dispatched, write_dispatched
from .specs import read_elem
from .utils import _read_legacy_raw, no_write_dataset_2d, report_read_key_on_error
if TYPE_CHECKING:
from collections.abc import MutableMapping
from os import PathLike
from zarr.core.common import AccessModeLiteral
from zarr.storage import StoreLike
T = TypeVar("T")
def _check_rec_array(adata):
if settings.zarr_write_format == 3 and len(
structured_dtype_keys := {
k
for k in adata.uns.keys()
if isinstance(adata.uns[k], np.recarray)
or (isinstance(adata.uns[k], np.ndarray) and adata.uns[k].dtype.kind == "V")
}
):
msg = f"zarr v3 does not support structured dtypes. Found keys {structured_dtype_keys}"
raise NotImplementedError(msg)
@no_write_dataset_2d
def write_zarr(
store: StoreLike,
adata: AnnData,
*,
chunks: tuple[int, ...] | None = None,
convert_strings_to_categoricals: bool = True,
**ds_kwargs,
) -> None:
"""See :meth:`~anndata.AnnData.write_zarr`."""
_check_rec_array(adata)
if isinstance(store, Path):
store = str(store)
if convert_strings_to_categoricals:
adata.strings_to_categoricals()
if adata.raw is not None:
adata.strings_to_categoricals(adata.raw.var)
# TODO: Use spec writing system for this
f = open_write_group(store)
f.attrs.setdefault("encoding-type", "anndata")
f.attrs.setdefault("encoding-version", "0.1.0")
def callback(func, s, k: str, elem, dataset_kwargs, iospec):
if (
chunks is not None
and not isinstance(elem, sparse.spmatrix)
and k.lstrip("/") == "X"
):
dataset_kwargs = dict(dataset_kwargs, chunks=chunks)
func(s, k, elem, dataset_kwargs=dataset_kwargs)
write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs)
if is_zarr_v2():
zarr.convenience.consolidate_metadata(f.store)
else:
zarr.consolidate_metadata(f.store)
def read_zarr(store: PathLike[str] | str | MutableMapping | zarr.Group) -> AnnData:
"""\
Read from a hierarchical Zarr array store.
Parameters
----------
store
The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class.
"""
if isinstance(store, Path):
store = str(store)
if isinstance(store, zarr.Group):
f = store
else:
f = zarr.open(store, mode="r")
# Read with handling for backwards compat
def callback(func, elem_name: str, elem, iospec):
if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
return AnnData(
**{
k: read_dispatched(v, callback)
for k, v in dict(elem).items()
if not k.startswith("raw.")
}
)
elif elem_name.startswith("/raw."):
return None
elif elem_name in {"/obs", "/var"}:
return read_dataframe(elem)
elif elem_name == "/raw":
# Backwards compat
return _read_legacy_raw(f, func(elem), read_dataframe, func)
return func(elem)
adata = read_dispatched(f, callback=callback)
# Backwards compat (should figure out which version)
if "raw.X" in f:
raw = AnnData(**_read_legacy_raw(f, adata.raw, read_dataframe, read_elem))
raw.obs_names = adata.obs_names
adata.raw = raw
# Backwards compat for <0.7
if isinstance(f["obs"], zarr.Array):
_clean_uns(adata)
return adata
@report_read_key_on_error
def read_dataset(dataset: zarr.Array):
"""Legacy method for reading datasets without encoding_type."""
value = dataset[...]
if not hasattr(value, "dtype"):
return value
elif isinstance(value.dtype, str):
pass
elif issubclass(value.dtype.type, np.bytes_):
value = value.astype(str).astype(object) # bytestring -> unicode -> str
elif len(value.dtype.descr) > 1: # Compound dtype
# For backwards compat, now strings are written as variable length
value = _from_fixed_length_strings(value)
if value.shape == ():
value = value[()]
return value
@report_read_key_on_error
def read_dataframe_legacy(dataset: zarr.Array) -> pd.DataFrame:
"""Reads old format of dataframes"""
# NOTE: Likely that categoricals need to be removed from uns
warn(
f"'{dataset.name}' was written with a very old version of AnnData. "
"Consider rewriting it.",
OldFormatWarning,
)
df = pd.DataFrame(_from_fixed_length_strings(dataset[()]))
df.set_index(df.columns[0], inplace=True)
return df
@report_read_key_on_error
def read_dataframe(group: zarr.Group | zarr.Array) -> pd.DataFrame:
# Fast paths
if isinstance(group, zarr.Array):
return read_dataframe_legacy(group)
else:
return read_elem(group)
def open_write_group(
store: StoreLike, *, mode: AccessModeLiteral = "w", **kwargs
) -> zarr.Group:
if not is_zarr_v2() and "zarr_format" not in kwargs:
kwargs["zarr_format"] = settings.zarr_write_format
return zarr.open_group(store, mode=mode, **kwargs)
python-anndata-0.12.0~rc1/src/anndata/_settings.py 0000664 0000000 0000000 00000035757 15003706322 0022110 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import inspect
import os
import textwrap
import warnings
from collections.abc import Iterable
from contextlib import contextmanager
from dataclasses import dataclass, field, fields
from enum import Enum
from functools import partial
from inspect import Parameter, signature
from types import GenericAlias
from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar, cast
if TYPE_CHECKING:
from collections.abc import Callable, Sequence
from typing import Any, TypeGuard
T = TypeVar("T")
class DeprecatedOption(NamedTuple):
option: str
message: str | None
removal_version: str | None
def _is_plain_type(obj: object) -> TypeGuard[type]:
return isinstance(obj, type) and not isinstance(obj, GenericAlias)
def describe(self: RegisteredOption, *, as_rst: bool = False) -> str:
type_str = self.type.__name__ if _is_plain_type(self.type) else str(self.type)
if as_rst:
default_str = repr(self.default_value).replace("\\", "\\\\")
doc = f"""\
.. attribute:: settings.{self.option}
:type: {type_str}
:value: {default_str}
{self.description}
"""
else:
doc = f"""\
{self.option}: `{type_str}`
{self.description} (default: `{self.default_value!r}`).
"""
return textwrap.dedent(doc)
class RegisteredOption(NamedTuple, Generic[T]):
option: str
default_value: T
description: str
validate: Callable[[T], None]
type: object
describe = describe
def check_and_get_environ_var(
key: str,
default_value: str,
allowed_values: Sequence[str] | None = None,
cast: Callable[[Any], T] | type[Enum] = lambda x: x,
) -> T:
"""Get the environment variable and return it is a (potentially) non-string, usable value.
Parameters
----------
key
The environment variable name.
default_value
The default value for `os.environ.get`.
allowed_values
        Allowable string values, by default None (i.e., no restriction).
cast
        Casting function from the string to a (potentially different) Python object, by default `lambda x: x`.
Returns
-------
The casted value.
"""
environ_value_or_default_value = os.environ.get(key, default_value)
if (
allowed_values is not None
and environ_value_or_default_value not in allowed_values
):
msg = (
f"Value {environ_value_or_default_value!r} is not in allowed {allowed_values} for environment variable {key}. "
f"Default {default_value} will be used."
)
warnings.warn(msg)
environ_value_or_default_value = default_value
return (
cast(environ_value_or_default_value)
if not isinstance(cast, type(Enum))
else cast[environ_value_or_default_value]
)
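# Illustrative sketch of how the helper above is meant to be used: the raw environment
# string is validated against `allowed_values` and then cast to a Python value. The
# variable name "ANNDATA_EXAMPLE_FLAG" is a hypothetical placeholder, not a real option.
def _example_check_and_get_environ_var() -> bool:
    os.environ["ANNDATA_EXAMPLE_FLAG"] = "1"
    return check_and_get_environ_var(
        "ANNDATA_EXAMPLE_FLAG",
        default_value="0",
        allowed_values=["0", "1"],
        cast=lambda x: bool(int(x)),
    )  # -> True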
def check_and_get_bool(option, default_value):
return check_and_get_environ_var(
f"ANNDATA_{option.upper()}",
str(int(default_value)),
["0", "1"],
lambda x: bool(int(x)),
)
def check_and_get_int(option, default_value):
return check_and_get_environ_var(
f"ANNDATA_{option.upper()}",
str(int(default_value)),
None,
lambda x: int(x),
)
_docstring = """
This manager allows users to customize settings for the anndata package.
Settings here will generally be for advanced use-cases and should be used with caution.
The following options are available:
{options_description}
For setting an option please use :func:`~anndata.settings.override` (local) or set the above attributes directly (global) i.e., `anndata.settings.my_setting = foo`.
For assignment by environment variable, use the variable name in all caps with `ANNDATA_` as the prefix before import of :mod:`anndata`.
For boolean environment variable setting, use 1 for `True` and 0 for `False`.
"""
@dataclass
class SettingsManager:
_registered_options: dict[str, RegisteredOption] = field(default_factory=dict)
_deprecated_options: dict[str, DeprecatedOption] = field(default_factory=dict)
_config: dict[str, object] = field(default_factory=dict)
__doc_tmpl__: str = _docstring
def describe(
self,
option: str | Iterable[str] | None = None,
*,
should_print_description: bool = True,
as_rst: bool = False,
) -> str:
"""Print and/or return a (string) description of the option(s).
Parameters
----------
option
            Option(s) to be described, by default None (i.e., describe all options).
should_print_description
Whether or not to print the description in addition to returning it.
Returns
-------
The description.
"""
describe = partial(
self.describe,
should_print_description=should_print_description,
as_rst=as_rst,
)
if option is None:
return describe(self._registered_options.keys())
if isinstance(option, Iterable) and not isinstance(option, str):
return "\n".join([describe(k) for k in option])
registered_option = self._registered_options[option]
doc = registered_option.describe(as_rst=as_rst).rstrip("\n")
if option in self._deprecated_options:
opt = self._deprecated_options[option]
if opt.message is not None:
doc += f" *{opt.message}"
doc += f" {option} will be removed in {opt.removal_version}.*"
if should_print_description:
print(doc)
return doc
def deprecate(
self, option: str, removal_version: str, message: str | None = None
) -> None:
"""Deprecate options with a message at a version.
Parameters
----------
option
Which option should be deprecated.
removal_version
The version targeted for removal.
message
A custom message.
"""
self._deprecated_options[option] = DeprecatedOption(
option, message, removal_version
)
def register(
self,
option: str,
default_value: T,
description: str,
validate: Callable[[T], None],
option_type: object | None = None,
get_from_env: Callable[[str, T], T] = lambda x, y: y,
) -> None:
"""Register an option so it can be set/described etc. by end-users
Parameters
----------
option
Option to be set.
default_value
Default value with which to set the option.
description
Description to be used in the docstring.
validate
A function which raises a `ValueError` or `TypeError` if the value is invalid.
option_type
Optional override for the option type to be displayed. Otherwise `type(default_value)`.
get_from_env
            An optional function which takes as arguments the name of the option and a default value and returns the value from the environment variable `ANNDATA_<OPTION_IN_CAPS>` (or the default if not present).
Default behavior is to return `default_value` without checking the environment.
"""
try:
validate(default_value)
except (ValueError, TypeError) as e:
e.add_note(f"for option {option!r}")
raise e
option_type = type(default_value) if option_type is None else option_type
self._registered_options[option] = RegisteredOption(
option, default_value, description, validate, option_type
)
self._config[option] = get_from_env(option, default_value)
self._update_override_function_for_new_option(option)
def _update_override_function_for_new_option(
self,
option: str,
):
"""This function updates the keyword arguments, docstring, and annotations of the `SettingsManager.override` function as the `SettingsManager.register` method is called.
Parameters
----------
option
The option being registered for which the override function needs updating.
"""
option_type = self._registered_options[option].type
# Update annotations for type checking.
self.override.__annotations__[option] = option_type
# __signature__ needs to be updated for tab autocompletion in IPython.
# See https://github.com/ipython/ipython/issues/11624 for inspiration.
self.override.__func__.__signature__ = signature(self.override).replace(
parameters=[
Parameter(name="self", kind=Parameter.POSITIONAL_ONLY),
*[
Parameter(
name=k,
annotation=option_type,
kind=Parameter.KEYWORD_ONLY,
)
for k in self._registered_options
],
]
)
# Update docstring for `SettingsManager.override` as well.
doc = cast("str", self.override.__doc__)
insert_index = doc.find("\n Yields")
option_docstring = "\t" + "\t".join(
self.describe(option, should_print_description=False).splitlines(
keepends=True
)
)
self.override.__func__.__doc__ = (
f"{doc[:insert_index]}\n{option_docstring}{doc[insert_index:]}"
)
def __setattr__(self, option: str, val: object) -> None:
"""
        Set an option to a value. To see the available options and their descriptions,
        use the `describe` method.
Parameters
----------
option
Option to be set.
val
Value with which to set the option.
Raises
------
AttributeError
If the option has not been registered, this function will raise an error.
"""
if option in {f.name for f in fields(self)}:
return super().__setattr__(option, val)
elif option not in self._registered_options:
msg = (
f"{option} is not an available option for anndata. "
"Please open an issue if you believe this is a mistake."
)
raise AttributeError(msg)
registered_option = self._registered_options[option]
registered_option.validate(val)
self._config[option] = val
def __getattr__(self, option: str) -> object:
"""
Gets the option's value.
Parameters
----------
option
            Option whose value should be retrieved.
Returns
-------
Value of the option.
"""
if option in self._deprecated_options:
deprecated = self._deprecated_options[option]
msg = f"{option!r} will be removed in {deprecated.removal_version}. {deprecated.message}"
warnings.warn(msg, FutureWarning)
if option in self._config:
return self._config[option]
msg = f"{option} not found."
raise AttributeError(msg)
def __dir__(self) -> Iterable[str]:
return sorted((*dir(super()), *self._config.keys()))
def reset(self, option: Iterable[str] | str) -> None:
"""
Resets option(s) to its (their) default value(s).
Parameters
----------
option
The option(s) to be reset.
"""
if isinstance(option, Iterable) and not isinstance(option, str):
for opt in option:
self.reset(opt)
else:
self._config[option] = self._registered_options[option].default_value
@contextmanager
def override(self, **overrides):
"""
Provides local override via keyword arguments as a context manager.
Parameters
----------
Yields
------
None
"""
restore = {a: getattr(self, a) for a in overrides}
try:
for attr, value in overrides.items():
setattr(self, attr, value)
yield None
finally:
for attr, value in restore.items():
setattr(self, attr, value)
def __repr__(self) -> str:
params = "".join(f"\t{k}={v!r},\n" for k, v in self._config.items())
return f"{type(self).__name__}(\n{params}\n)"
@property
def __doc__(self):
in_sphinx = any("/sphinx/" in frame.filename for frame in inspect.stack())
options_description = self.describe(
should_print_description=False, as_rst=in_sphinx
)
return self.__doc_tmpl__.format(
options_description=options_description,
)
settings = SettingsManager()
##################################################################################
# PLACE REGISTERED SETTINGS HERE SO THEY CAN BE PICKED UP FOR DOCSTRING CREATION #
##################################################################################
V = TypeVar("V")
def gen_validator(_type: type[V]) -> Callable[[V], None]:
def validate_type(val: V) -> None:
if not isinstance(val, _type):
msg = f"{val} not valid {_type}"
raise TypeError(msg)
return validate_type
validate_bool = gen_validator(bool)
validate_int = gen_validator(int)
settings.register(
"remove_unused_categories",
default_value=True,
description="Whether or not to remove unused categories with :class:`~pandas.Categorical`.",
validate=validate_bool,
get_from_env=check_and_get_bool,
)
settings.register(
"check_uniqueness",
default_value=True,
description=(
"Whether or not to check uniqueness of the `obs` indices on `__init__` of :class:`~anndata.AnnData`."
),
validate=validate_bool,
get_from_env=check_and_get_bool,
)
settings.register(
"allow_write_nullable_strings",
default_value=False,
description="Whether or not to allow writing of `pd.arrays.StringArray`.",
validate=validate_bool,
get_from_env=check_and_get_bool,
)
def validate_zarr_write_format(format: int):
validate_int(format)
if format not in {2, 3}:
msg = "non-v2 zarr on-disk format not supported"
raise ValueError(msg)
settings.register(
"zarr_write_format",
default_value=2,
description="Which version of zarr to write to.",
validate=validate_zarr_write_format,
get_from_env=lambda name, default: check_and_get_environ_var(
f"ANNDATA_{name.upper()}",
str(default),
["2", "3"],
lambda x: int(x),
),
)
def validate_sparse_settings(val: Any) -> None:
validate_bool(val)
settings.register(
"use_sparse_array_on_read",
default_value=False,
description="Whether or not to use :class:`scipy.sparse.sparray` as the default class when reading in data",
validate=validate_bool,
get_from_env=check_and_get_bool,
)
settings.register(
"min_rows_for_chunked_h5_copy",
default_value=1000,
description="Minimum number of rows at a time to copy when writing out an H5 Dataset to a new location",
validate=validate_int,
get_from_env=check_and_get_int,
)
##################################################################################
##################################################################################
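# Illustrative usage sketch for the settings manager defined above (not executed at import
# time): options can be overridden locally with the `override` context manager or assigned
# globally as attributes. Environment-variable configuration (e.g. ANNDATA_ZARR_WRITE_FORMAT=3)
# must happen before anndata is imported, so it is only mentioned in this comment.
def _example_settings_usage() -> None:
    import anndata as ad
    with ad.settings.override(remove_unused_categories=False):
        assert ad.settings.remove_unused_categories is False
    # The previous value is restored once the context manager exits.
    ad.settings.zarr_write_format = 3  # global assignment; validated against {2, 3}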
python-anndata-0.12.0~rc1/src/anndata/_types.py 0000664 0000000 0000000 00000012330 15003706322 0021372 0 ustar 00root root 0000000 0000000 """
Defines some useful types for this library. Should probably be cleaned up before thinking about exporting.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Literal, Protocol, TypeVar
from .compat import H5Array, H5Group, ZarrArray, ZarrGroup
from .typing import RWAble
if TYPE_CHECKING:
from collections.abc import Mapping
from typing import Any, TypeAlias
from ._io.specs.registry import (
IOSpec,
LazyDataStructures,
LazyReader,
Reader,
Writer,
)
__all__ = [
"ArrayStorageType",
"GroupStorageType",
"StorageType",
"_ReadInternal",
"_ReadLazyInternal",
"_WriteInternal",
]
ArrayStorageType: TypeAlias = ZarrArray | H5Array
GroupStorageType: TypeAlias = ZarrGroup | H5Group
StorageType: TypeAlias = ArrayStorageType | GroupStorageType
# NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py!
ContravariantRWAble = TypeVar("ContravariantRWAble", bound=RWAble, contravariant=True)
CovariantRWAble = TypeVar("CovariantRWAble", bound=RWAble, covariant=True)
InvariantRWAble = TypeVar("InvariantRWAble", bound=RWAble)
SCo = TypeVar("SCo", covariant=True, bound=StorageType)
SCon = TypeVar("SCon", contravariant=True, bound=StorageType)
class _ReadInternal(Protocol[SCon, CovariantRWAble]):
def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantRWAble: ...
class _ReadLazyInternal(Protocol[SCon]):
def __call__(
self, elem: SCon, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None
) -> LazyDataStructures: ...
class Read(Protocol[SCon, CovariantRWAble]):
def __call__(self, elem: SCon) -> CovariantRWAble:
"""Low-level reading function for an element.
Parameters
----------
elem
The element to read from.
Returns
-------
The element read from the store.
"""
...
class ReadLazy(Protocol[SCon]):
def __call__(
self, elem: SCon, *, chunks: tuple[int, ...] | None = None
) -> LazyDataStructures:
"""Low-level reading function for a lazy element.
Parameters
----------
elem
The element to read from.
chunks
The chunk size to be used.
Returns
-------
The lazy element read from the store.
"""
...
class _WriteInternal(Protocol[ContravariantRWAble]):
def __call__(
self,
f: StorageType,
k: str,
v: ContravariantRWAble,
*,
_writer: Writer,
dataset_kwargs: Mapping[str, Any],
) -> None: ...
class Write(Protocol[ContravariantRWAble]):
def __call__(
self,
f: StorageType,
k: str,
v: ContravariantRWAble,
*,
dataset_kwargs: Mapping[str, Any],
) -> None:
"""Low-level writing function for an element.
Parameters
----------
f
The store to which `elem` should be written.
k
            The key to write to within the group.
v
The element to write out.
dataset_kwargs
Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`.
"""
...
class ReadCallback(Protocol[SCo, InvariantRWAble]):
def __call__(
self,
/,
read_func: Read[SCo, InvariantRWAble],
elem_name: str,
elem: StorageType,
*,
iospec: IOSpec,
) -> InvariantRWAble:
"""
Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store.
Params
------
read_func
:func:`anndata.io.read_elem` function to call to read the current element given the ``iospec``.
elem_name
The key to read in from the group.
elem
The element to read from.
iospec
Internal AnnData encoding specification for the element.
Returns
-------
The element read from the store.
"""
...
class WriteCallback(Protocol[InvariantRWAble]):
def __call__(
self,
/,
write_func: Write[InvariantRWAble],
store: StorageType,
elem_name: str,
elem: InvariantRWAble,
*,
iospec: IOSpec,
dataset_kwargs: Mapping[str, Any],
) -> None:
"""
Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store.
Params
------
write_func
            :func:`anndata.io.write_elem` function to call to write the current element given the ``iospec``.
store
The store to which `elem` should be written.
elem_name
            The key to write to within the store.
elem
The element to write out.
iospec
Internal AnnData encoding specification for the element.
dataset_kwargs
Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`.
"""
...
AnnDataElem = Literal[
"obs",
"var",
"obsm",
"varm",
"obsp",
"varp",
"layers",
"X",
"raw",
"uns",
]
Join_T = Literal["inner", "outer"]
python-anndata-0.12.0~rc1/src/anndata/_version.py 0000664 0000000 0000000 00000002361 15003706322 0021716 0 ustar 00root root 0000000 0000000 """Get version from VCS in a dev environment or from package metadata in production.
See .
"""
from __future__ import annotations
from pathlib import Path
__all__ = ["__version__"]
def _get_version_from_vcs() -> str: # pragma: no cover
from hatchling.metadata.core import ProjectMetadata
from hatchling.plugin.exceptions import UnknownPluginError
from hatchling.plugin.manager import PluginManager
from hatchling.utils.fs import locate_file
if (pyproject_toml := locate_file(__file__, "pyproject.toml")) is None:
msg = "pyproject.toml not found although hatchling is installed"
raise LookupError(msg)
root = Path(pyproject_toml).parent
metadata = ProjectMetadata(root=str(root), plugin_manager=PluginManager())
try:
# Version can be either statically set in pyproject.toml or computed dynamically:
return metadata.core.version or metadata.hatch.version.cached
except UnknownPluginError:
msg = "Unable to import hatch plugin."
raise ImportError(msg)
try:
__version__ = _get_version_from_vcs()
except (ImportError, LookupError):
import importlib.metadata
__version__ = importlib.metadata.version("anndata")
python-anndata-0.12.0~rc1/src/anndata/_warnings.py 0000664 0000000 0000000 00000001334 15003706322 0022060 0 ustar 00root root 0000000 0000000 from __future__ import annotations
class WriteWarning(UserWarning):
pass
class OldFormatWarning(PendingDeprecationWarning):
"""Raised when a file in an old file format is read."""
pass
class ImplicitModificationWarning(UserWarning):
"""\
Raised whenever initializing an object or assigning a property changes
the type of a part of a parameter or the value being assigned.
Examples
========
>>> import pandas as pd
>>> adata = AnnData(obs=pd.DataFrame(index=[0, 1, 2])) # doctest: +SKIP
ImplicitModificationWarning: Transforming to str index.
"""
pass
class ExperimentalFeatureWarning(Warning):
"""Raised when an unstable experimental feature is used."""
pass
python-anndata-0.12.0~rc1/src/anndata/abc.py 0000664 0000000 0000000 00000003155 15003706322 0020621 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import ClassVar, Literal
import numpy as np
from .compat import CSArray, CSMatrix, Index
__all__ = ["CSRDataset", "CSCDataset"]
class _AbstractCSDataset(ABC):
"""Base for the public API for CSRDataset/CSCDataset."""
format: ClassVar[Literal["csr", "csc"]]
"""The format of the sparse matrix."""
shape: tuple[int, int]
"""Shape of the matrix."""
dtype: np.dtype
"""The :class:`numpy.dtype` of the `data` attribute of the sparse matrix."""
backend: Literal["zarr", "hdf5"]
"""Which file type is used on-disk."""
@abstractmethod
def __getitem__(self, index: Index) -> float | CSMatrix | CSArray:
"""Load a slice or an element from the sparse dataset into memory.
Parameters
----------
index
Index to load.
Returns
-------
The desired data read off disk.
"""
@abstractmethod
def to_memory(self) -> CSMatrix | CSArray:
"""Load the sparse dataset into memory.
Returns
-------
The in-memory representation of the sparse dataset.
"""
_sparse_dataset_doc = """\
On disk {format} sparse matrix.
Analogous to :class:`h5py.Dataset` or :class:`zarr.Array`, but for sparse matrices.
"""
class CSRDataset(_AbstractCSDataset, ABC):
__doc__ = _sparse_dataset_doc.format(format="CSR")
format = "csr"
class CSCDataset(_AbstractCSDataset, ABC):
__doc__ = _sparse_dataset_doc.format(format="CSC")
format = "csc"
python-anndata-0.12.0~rc1/src/anndata/compat/ 0000775 0000000 0000000 00000000000 15003706322 0021001 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/compat/__init__.py 0000664 0000000 0000000 00000031351 15003706322 0023115 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from codecs import decode
from collections.abc import Mapping
from functools import cache, partial, singledispatch, wraps
from importlib.util import find_spec
from inspect import Parameter, signature
from types import EllipsisType
from typing import TYPE_CHECKING, TypeVar
from warnings import warn
import h5py
import numpy as np
import pandas as pd
import scipy
from packaging.version import Version
from zarr import Array as ZarrArray # noqa: F401
from zarr import Group as ZarrGroup
if TYPE_CHECKING:
from typing import Any
#############################
# scipy sparse array comapt #
#############################
CSMatrix = scipy.sparse.csr_matrix | scipy.sparse.csc_matrix
CSArray = scipy.sparse.csr_array | scipy.sparse.csc_array
class Empty:
pass
Index1D = slice | int | str | np.int64 | np.ndarray | pd.Series
IndexRest = Index1D | EllipsisType
Index = (
IndexRest
| tuple[Index1D, IndexRest]
| tuple[IndexRest, Index1D]
| tuple[Index1D, Index1D, EllipsisType]
| tuple[EllipsisType, Index1D, Index1D]
| tuple[Index1D, EllipsisType, Index1D]
| CSMatrix
| CSArray
)
H5Group = h5py.Group
H5Array = h5py.Dataset
H5File = h5py.File
#############################
# Optional deps
#############################
@cache
def is_zarr_v2() -> bool:
import zarr
from packaging.version import Version
return Version(zarr.__version__) < Version("3.0.0")
if is_zarr_v2():
msg = "anndata will no longer support zarr v2 in the near future. Please prepare to upgrade to zarr>=3."
warn(msg, DeprecationWarning)
if find_spec("awkward") or TYPE_CHECKING:
import awkward # noqa: F401
from awkward import Array as AwkArray
else:
class AwkArray:
@staticmethod
def __repr__():
return "mock awkward.highlevel.Array"
if find_spec("zappy") or TYPE_CHECKING:
from zappy.base import ZappyArray
else:
class ZappyArray:
@staticmethod
def __repr__():
return "mock zappy.base.ZappyArray"
if TYPE_CHECKING:
# type checkers are confused and can only see …core.Array
from dask.array.core import Array as DaskArray
elif find_spec("dask"):
from dask.array import Array as DaskArray
else:
class DaskArray:
@staticmethod
def __repr__():
return "mock dask.array.core.Array"
# https://github.com/scverse/anndata/issues/1749
def is_cupy_importable() -> bool:
try:
import cupy # noqa: F401
except ImportError:
return False
return True
if is_cupy_importable() or TYPE_CHECKING:
from cupy import ndarray as CupyArray
from cupyx.scipy.sparse import csc_matrix as CupyCSCMatrix
from cupyx.scipy.sparse import csr_matrix as CupyCSRMatrix
from cupyx.scipy.sparse import spmatrix as CupySparseMatrix
try:
import dask.array as da
except ImportError:
pass
else:
da.register_chunk_type(CupyCSRMatrix)
da.register_chunk_type(CupyCSCMatrix)
else:
class CupySparseMatrix:
@staticmethod
def __repr__():
return "mock cupyx.scipy.sparse.spmatrix"
class CupyCSRMatrix:
@staticmethod
def __repr__():
return "mock cupyx.scipy.sparse.csr_matrix"
class CupyCSCMatrix:
@staticmethod
def __repr__():
return "mock cupyx.scipy.sparse.csc_matrix"
class CupyArray:
@staticmethod
def __repr__():
return "mock cupy.ndarray"
if find_spec("legacy_api_wrap") or TYPE_CHECKING:
from legacy_api_wrap import legacy_api # noqa: TID251
old_positionals = partial(legacy_api, category=FutureWarning)
else:
def old_positionals(*old_positionals):
return lambda func: func
#############################
# IO helpers
#############################
@singledispatch
def _read_attr(attrs: Mapping, name: str, default: Any | None = Empty):
if default is Empty:
return attrs[name]
else:
return attrs.get(name, default=default)
@_read_attr.register(h5py.AttributeManager)
def _read_attr_hdf5(
attrs: h5py.AttributeManager, name: str, default: Any | None = Empty
):
"""
Read an HDF5 attribute and perform all necessary conversions.
    At the moment, this only implements conversions for string attributes; other types
    are passed through. String conversion is needed for compatibility with other languages.
For example Julia's HDF5.jl writes string attributes as fixed-size strings, which
are read as bytes by h5py.
"""
if name not in attrs and default is not Empty:
return default
attr = attrs[name]
attr_id = attrs.get_id(name)
dtype = h5py.check_string_dtype(attr_id.dtype)
if dtype is None:
return attr
else:
if dtype.length is None: # variable-length string, no problem
return attr
elif len(attr_id.shape) == 0: # Python bytestring
return attr.decode("utf-8")
else: # NumPy array
return [decode(s, "utf-8") for s in attr]
def _from_fixed_length_strings(value):
"""\
Convert from fixed length strings to unicode.
For backwards compatibility with older h5ad and zarr files.
"""
new_dtype = []
for dt in value.dtype.descr:
dt_list = list(dt)
dt_type = dt[1]
# could probably match better
is_annotated = isinstance(dt_type, tuple)
if is_annotated:
dt_type = dt_type[0]
# Fixing issue introduced with h5py v2.10.0, see:
# https://github.com/h5py/h5py/issues/1307
if issubclass(np.dtype(dt_type).type, np.bytes_):
dt_list[1] = f"U{int(dt_type[2:])}"
elif is_annotated or np.issubdtype(np.dtype(dt_type), np.str_):
dt_list[1] = "O" # Assumption that it’s a vlen str
new_dtype.append(tuple(dt_list))
return value.astype(new_dtype)
def _decode_structured_array(
arr: np.ndarray, *, dtype: np.dtype | None = None, copy: bool = False
) -> np.ndarray:
"""
h5py 3.0 now reads all strings as bytes. There is a helper method which can convert these to strings,
but there isn't anything for fields of structured dtypes.
Params
------
arr
An array with structured dtype
dtype
dtype of the array. This is checked for h5py string data types.
        Passing this is allowed for cases where the array may have been processed by another function beforehand.
"""
if copy:
arr = arr.copy()
if dtype is None:
dtype = arr.dtype
# codecs.decode is 2x slower than this lambda, go figure
decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
for k, (dt, _) in dtype.fields.items():
check = h5py.check_string_dtype(dt)
if check is not None and check.encoding == "utf-8":
decode(arr[k], out=arr[k])
return arr
def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray:
"""\
Convert variable length strings to fixed length.
Currently a workaround for
https://github.com/zarr-developers/zarr-python/pull/422
"""
new_dtype = []
for dt_name, (dt_type, dt_offset) in value.dtype.fields.items():
if dt_type.kind == "O":
# Assuming the objects are str
size = max(len(x.encode()) for x in value.getfield("O", dt_offset))
new_dtype.append((dt_name, ("U", size)))
else:
new_dtype.append((dt_name, dt_type))
return value.astype(new_dtype)
Group_T = TypeVar("Group_T", bound=ZarrGroup | h5py.Group)
# TODO: This is a workaround for https://github.com/scverse/anndata/issues/874
# See https://github.com/h5py/h5py/pull/2311#issuecomment-1734102238 for why this is done this way.
def _require_group_write_dataframe(
f: Group_T, name: str, df: pd.DataFrame, *args, **kwargs
) -> Group_T:
if len(df.columns) > 5_000 and isinstance(f, H5Group):
# actually 64kb is the limit, but this should be a conservative estimate
return f.create_group(name, track_order=True, *args, **kwargs)
return f.require_group(name, *args, **kwargs)
#############################
# Dealing with uns
#############################
def _clean_uns(adata: AnnData): # noqa: F821
"""
Compat function for when categorical keys were stored in uns.
This used to be buggy because when storing categorical columns in obs and var with
the same column name, only one `_categories` is retained.
"""
k_to_delete = set()
for cats_name, cats in adata.uns.items():
if not cats_name.endswith("_categories"):
continue
name = cats_name.replace("_categories", "")
# fix categories with a single category
if isinstance(cats, str | int):
cats = [cats]
for ann in [adata.obs, adata.var]:
if name not in ann:
continue
codes: np.ndarray = ann[name].values
# hack to maybe find the axis the categories were for
if not np.all(codes < len(cats)):
continue
ann[name] = pd.Categorical.from_codes(codes, cats)
k_to_delete.add(cats_name)
for cats_name in k_to_delete:
del adata.uns[cats_name]
def _move_adj_mtx(d):
"""
Read-time fix for moving adjacency matrices from uns to obsp
"""
n = d.get("uns", {}).get("neighbors", {})
obsp = d.setdefault("obsp", {})
for k in ("distances", "connectivities"):
if (
(k in n)
and isinstance(n[k], scipy.sparse.spmatrix | np.ndarray)
and len(n[k].shape) == 2
):
warn(
f"Moving element from .uns['neighbors']['{k}'] to .obsp['{k}'].\n\n"
"This is where adjacency matrices should go now.",
FutureWarning,
)
obsp[k] = n.pop(k)
def _find_sparse_matrices(d: Mapping, n: int, keys: tuple, paths: list):
"""Find paths to sparse matrices with shape (n, n)."""
for k, v in d.items():
if isinstance(v, Mapping):
_find_sparse_matrices(v, n, (*keys, k), paths)
elif scipy.sparse.issparse(v) and v.shape == (n, n):
paths.append((*keys, k))
return paths
# This function was adapted from scikit-learn
# github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py
def _deprecate_positional_args(func=None, *, version: str = "1.0 (renaming of 0.25)"):
"""Decorator for methods that issues warnings for positional arguments.
Using the keyword-only argument syntax in pep 3102, arguments after the
* will issue a warning when passed as a positional argument.
Parameters
----------
func
Function to check arguments on.
version
The version when positional arguments will result in error.
"""
def _inner_deprecate_positional_args(f):
sig = signature(f)
kwonly_args = []
all_args = []
for name, param in sig.parameters.items():
if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
all_args.append(name)
elif param.kind == Parameter.KEYWORD_ONLY:
kwonly_args.append(name)
@wraps(f)
def inner_f(*args, **kwargs):
extra_args = len(args) - len(all_args)
if extra_args <= 0:
return f(*args, **kwargs)
# extra_args > 0
args_msg = [
f"{name}={arg}"
for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
]
args_msg = ", ".join(args_msg)
warn(
f"Pass {args_msg} as keyword args. From version {version} passing "
"these as positional arguments will result in an error",
FutureWarning,
)
kwargs.update(zip(sig.parameters, args))
return f(**kwargs)
return inner_f
if func is not None:
return _inner_deprecate_positional_args(func)
return _inner_deprecate_positional_args
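# Illustrative sketch of the decorator above: `copy` is keyword-only, and passing it
# positionally emits a FutureWarning while still forwarding the value. The function name
# `_example_positional_shim` is a hypothetical placeholder, not part of the library.
@_deprecate_positional_args(version="2.0")
def _example_positional_shim(adata, *, copy: bool = False) -> bool:
    return copy
# `_example_positional_shim(adata, True)` warns and returns True;
# `_example_positional_shim(adata, copy=True)` is the supported spelling.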
def _transpose_by_block(dask_array: DaskArray) -> DaskArray:
import dask.array as da
b = dask_array.blocks
b_raveled = b.ravel()
block_layout = np.zeros(b.shape, dtype=object)
for i in range(block_layout.size):
block_layout.flat[i] = b_raveled[i].map_blocks(
lambda x: x.T, chunks=b_raveled[i].chunks[::-1]
)
return da.block(block_layout.T.tolist())
def _safe_transpose(x):
"""Safely transpose x
This is a workaround for: https://github.com/scipy/scipy/issues/19161
"""
if isinstance(x, DaskArray) and scipy.sparse.issparse(x._meta):
return _transpose_by_block(x)
else:
return x.T
def _map_cat_to_str(cat: pd.Categorical) -> pd.Categorical:
if Version(pd.__version__) >= Version("2.1"):
# Argument added in pandas 2.1
return cat.map(str, na_action="ignore")
else:
return cat.map(str)
python-anndata-0.12.0~rc1/src/anndata/experimental/ 0000775 0000000 0000000 00000000000 15003706322 0022213 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/experimental/__init__.py 0000664 0000000 0000000 00000002672 15003706322 0024333 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from types import MappingProxyType
from typing import TYPE_CHECKING
from .._io.specs import IOSpec, read_elem_lazy
from .._types import Read, ReadCallback, StorageType, Write, WriteCallback
from ..utils import module_get_attr_redirect
from ._dispatch_io import read_dispatched, write_dispatched
from .backed import read_lazy
from .merge import concat_on_disk
from .multi_files import AnnCollection
from .pytorch import AnnLoader
if TYPE_CHECKING:
from typing import Any
# Map old name in `anndata.experimental` to new name in `anndata`
_DEPRECATED = MappingProxyType(
dict(
(kv if isinstance(kv, tuple) else (kv, kv))
for kv in (
("CSRDataset", "abc.CSRDataset"),
("CSCDataset", "abc.CSCDataset"),
("sparse_dataset", "io.sparse_dataset"),
("read_elem", "io.read_elem"),
("write_elem", "io.write_elem"),
("RWAble", "typing.AxisStorable"),
("InMemoryElem", "typing.RWAble"),
)
)
)
def __getattr__(attr_name: str) -> Any:
return module_get_attr_redirect(
attr_name, deprecated_mapping=_DEPRECATED, old_module_path="experimental"
)
__all__ = [
"AnnCollection",
"AnnLoader",
"read_elem_lazy",
"read_dispatched",
"write_dispatched",
"IOSpec",
"concat_on_disk",
"Read",
"read_lazy",
"Write",
"ReadCallback",
"WriteCallback",
"StorageType",
]
python-anndata-0.12.0~rc1/src/anndata/experimental/_dispatch_io.py 0000664 0000000 0000000 00000003515 15003706322 0025216 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from types import MappingProxyType
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Mapping
from typing import Any
from anndata._types import (
GroupStorageType,
ReadCallback,
StorageType,
WriteCallback,
)
from anndata.typing import RWAble
def read_dispatched(
elem: StorageType,
callback: ReadCallback,
) -> RWAble:
"""
Read elem, calling the callback at each sub-element.
Params
------
elem
Storage container (e.g. `h5py.Group`, `zarr.Group`).
This must have anndata element specifications.
callback
Function to call at each anndata encoded element.
See Also
--------
:doc:`/tutorials/notebooks/{read,write}_dispatched`
"""
from anndata._io.specs import _REGISTRY, Reader
reader = Reader(_REGISTRY, callback=callback)
return reader.read_elem(elem)
def write_dispatched(
store: GroupStorageType,
key: str,
elem: RWAble,
callback: WriteCallback,
*,
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
) -> None:
"""
Write elem to store, recursively calling callback at each sub-element.
Params
------
store
Storage container to be written to.
key
Key to write element to. To write to the root group, use "/".
elem
The element to write. Probably an AnnData.
callback
Function called when writing each element.
dataset_kwargs
Keyword arguments to pass to the dataset creation function.
See Also
--------
:doc:`/tutorials/notebooks/{read,write}_dispatched`
"""
from anndata._io.specs import _REGISTRY, Writer
writer = Writer(_REGISTRY, callback=callback)
writer.write_elem(store, key, elem, dataset_kwargs=dataset_kwargs)
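# Illustrative sketch of the callback API documented above: a read callback that skips the
# `obsm` slot and defers to the default reader for everything else. The zarr path is a
# hypothetical placeholder; nested elements are routed through the same callback.
def _example_read_dispatched_skip_obsm(path: str = "example.zarr"):
    import zarr
    def skip_obsm(read_func, elem_name: str, elem, *, iospec):
        if elem_name.endswith("/obsm"):
            return None  # pretend this slot does not exist
        return read_func(elem)
    return read_dispatched(zarr.open(path, mode="r"), callback=skip_obsm)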
python-anndata-0.12.0~rc1/src/anndata/experimental/backed/ 0000775 0000000 0000000 00000000000 15003706322 0023424 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/experimental/backed/__init__.py 0000664 0000000 0000000 00000000130 15003706322 0025527 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from ._io import read_lazy
__all__ = ["read_lazy"]
python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_compat.py 0000664 0000000 0000000 00000002103 15003706322 0025414 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from importlib.util import find_spec
from typing import TYPE_CHECKING
if find_spec("xarray") or TYPE_CHECKING:
import xarray
from xarray import DataArray
from xarray.backends import BackendArray
from xarray.backends.zarr import ZarrArrayWrapper
else:
class DataArray:
def __repr__(self) -> str:
return "mock DataArray"
xarray = None
class ZarrArrayWrapper:
def __repr__(self) -> str:
return "mock ZarrArrayWrapper"
class BackendArray:
def __repr__(self) -> str:
return "mock BackendArray"
from ._xarray import Dataset, Dataset2D # noqa: F401
if TYPE_CHECKING:
from anndata import AnnData
def has_dataset_2d(adata: AnnData) -> bool:
if any(isinstance(annot_df, Dataset2D) for annot_df in [adata.obs, adata.var]):
return True
for annot_m_key in ["varm", "obsm"]:
annot_m = getattr(adata, annot_m_key)
if any(isinstance(maybe_df, Dataset2D) for maybe_df in annot_m.values()):
return True
return False
python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_io.py 0000664 0000000 0000000 00000013572 15003706322 0024554 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import typing
import warnings
from os import PathLike
from pathlib import Path
from typing import TYPE_CHECKING
import h5py
from anndata._io.specs.registry import read_elem_lazy
from anndata._types import AnnDataElem
from testing.anndata._doctest import doctest_needs
from ..._core.anndata import AnnData
from ..._settings import settings
from ...compat import ZarrGroup, is_zarr_v2
from .. import read_dispatched
if TYPE_CHECKING:
from collections.abc import MutableMapping
from anndata._io.specs.registry import IOSpec
from anndata._types import Read, StorageType
@doctest_needs("xarray")
def read_lazy(
store: PathLike[str] | str | MutableMapping | ZarrGroup | h5py.Dataset,
*,
load_annotation_index: bool = True,
) -> AnnData:
"""
Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`.
No array data should need to be read into memory with the exception of :class:`ak.Array`, scalars, and some older-encoding arrays.
Parameters
----------
store
A store-like object to be read in. If :class:`zarr.Group`, it is best for it to be consolidated.
load_annotation_index
Whether or not to use a range index for the `{obs,var}` :class:`xarray.Dataset` so as not to load the index into memory.
If `False`, the real `index` will be inserted as `{obs,var}_names` in the object but not be one of the `coords` thereby preventing read operations.
Access to `adata.obs.index` will also only give the dummy index, and not the "real" index that is file-backed.
Returns
-------
A lazily read-in :class:`~anndata.AnnData` object.
Examples
--------
Preparing example objects
>>> import anndata as ad
>>> from urllib.request import urlretrieve
>>> import scanpy as sc
>>> base_url = "https://datasets.cellxgene.cziscience.com"
>>> def get_cellxgene_data(id_: str):
... out_path = sc.settings.datasetdir / f"{id_}.h5ad"
... if out_path.exists():
... return out_path
... file_url = f"{base_url}/{id_}.h5ad"
... sc.settings.datasetdir.mkdir(parents=True, exist_ok=True)
... urlretrieve(file_url, out_path)
... return out_path
>>> path_b_cells = get_cellxgene_data("a93eab58-3d82-4b61-8a2f-d7666dcdb7c4")
>>> path_fetal = get_cellxgene_data("d170ff04-6da0-4156-a719-f8e1bbefbf53")
>>> b_cells_adata = ad.experimental.read_lazy(path_b_cells)
>>> fetal_adata = ad.experimental.read_lazy(path_fetal)
>>> print(b_cells_adata)
AnnData object with n_obs × n_vars = 146 × 33452
obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', ...
>>> print(fetal_adata)
AnnData object with n_obs × n_vars = 344 × 15585
obs: 'nCount_Spatial', 'nFeature_Spatial', 'Cluster', 'adult_pred_type'...
This functionality is compatible with :func:`anndata.concat`
>>> ad.concat([b_cells_adata, fetal_adata], join="outer")
AnnData object with n_obs × n_vars = 490 × 33452
obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id'...
"""
try:
import xarray # noqa: F401
except ImportError:
msg = (
"xarray is required to use the `read_lazy` function. Please install xarray."
)
raise ImportError(msg)
is_h5_store = isinstance(store, h5py.Dataset | h5py.File | h5py.Group)
is_h5 = (
isinstance(store, PathLike | str) and Path(store).suffix == ".h5ad"
) or is_h5_store
has_keys = True # true if consolidated or h5ad
if not is_h5:
import zarr
if not isinstance(store, ZarrGroup):
try:
f = zarr.open_consolidated(store, mode="r")
except (
KeyError if is_zarr_v2() else ValueError
): # v3 returns a ValueError for consolidated metadata not found
msg = "Did not read zarr as consolidated. Consider consolidating your metadata."
warnings.warn(msg)
has_keys = False
f = zarr.open_group(store, mode="r")
else:
f = store
else:
if is_h5_store:
f = store
else:
f = h5py.File(store, mode="r")
def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec):
if iospec.encoding_type in {"anndata", "raw"} or elem_name.endswith("/"):
iter_object = (
dict(elem).items()
if has_keys
else (
(k, v)
for k, v in (
(k, elem.get(k, None)) for k in typing.get_args(AnnDataElem)
)
if v
is not None # need to do this instead of `k in elem` to prevent unnecessary metadata accesses
)
)
return AnnData(**{k: read_dispatched(v, callback) for k, v in iter_object})
elif (
iospec.encoding_type
in {
"csr_matrix",
"csc_matrix",
"array",
"string-array",
"dataframe",
"categorical",
}
or "nullable" in iospec.encoding_type
):
if "dataframe" == iospec.encoding_type and elem_name in {"/obs", "/var"}:
return read_elem_lazy(elem, use_range_index=not load_annotation_index)
return read_elem_lazy(elem)
elif iospec.encoding_type in {"awkward-array"}:
return read_dispatched(elem, None)
elif iospec.encoding_type == "dict":
return {
k: read_dispatched(v, callback=callback) for k, v in dict(elem).items()
}
return func(elem)
with settings.override(check_uniqueness=load_annotation_index):
adata = read_dispatched(f, callback=callback)
return adata
python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_lazy_arrays.py 0000664 0000000 0000000 00000013771 15003706322 0026506 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from functools import cached_property
from typing import TYPE_CHECKING, Generic, TypeVar
import pandas as pd
from anndata._core.index import _subset
from anndata._core.views import as_view
from anndata._io.specs.lazy_methods import get_chunksize
from anndata.compat import H5Array, ZarrArray
from ..._settings import settings
from ._compat import BackendArray, DataArray, ZarrArrayWrapper
from ._compat import xarray as xr
if TYPE_CHECKING:
from pathlib import Path
from typing import Literal
import numpy as np
from anndata._core.index import Index
from anndata.compat import ZarrGroup
K = TypeVar("K", H5Array, ZarrArray)
class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]):
def __init__(self, array: K):
self.chunks = array.chunks
if isinstance(array, ZarrArray):
return super().__init__(array)
self._array = array
self.shape = self._array.shape
self.dtype = self._array.dtype
def __getitem__(self, key: xr.core.indexing.ExplicitIndexer):
if isinstance(self._array, ZarrArray):
return super().__getitem__(key)
return xr.core.indexing.explicit_indexing_adapter(
key,
self.shape,
xr.core.indexing.IndexingSupport.OUTER_1VECTOR,
lambda key: self._array[key],
)
class CategoricalArray(BackendArray, Generic[K]):
"""
A wrapper class meant to enable working with lazy categorical data.
We do not guarantee the stability of this API beyond that guaranteed
by :class:`xarray.backends.BackendArray`.
"""
_codes: ZarrOrHDF5Wrapper[K]
_categories: ZarrArray | H5Array
shape: tuple[int, ...]
base_path_or_zarr_group: Path | ZarrGroup
elem_name: str
def __init__(
self,
codes: K,
categories: ZarrArray | H5Array,
base_path_or_zarr_group: Path | ZarrGroup,
elem_name: str,
*args,
ordered: bool,
**kwargs,
):
self._categories = categories
self._ordered = ordered
self._codes = ZarrOrHDF5Wrapper(codes)
self.shape = self._codes.shape
self.base_path_or_zarr_group = base_path_or_zarr_group
self.file_format = "zarr" if isinstance(codes, ZarrArray) else "h5"
self.elem_name = elem_name
@cached_property
def categories(self) -> np.ndarray:
if isinstance(self._categories, ZarrArray):
return self._categories[...]
from ..._io.h5ad import read_dataset
return read_dataset(self._categories)
def __getitem__(
self, key: xr.core.indexing.ExplicitIndexer
) -> xr.core.extension_array.PandasExtensionArray:
codes = self._codes[key]
categorical_array = pd.Categorical.from_codes(
codes=codes, categories=self.categories, ordered=self._ordered
)
if settings.remove_unused_categories:
categorical_array = categorical_array.remove_unused_categories()
return xr.core.extension_array.PandasExtensionArray(categorical_array)
@cached_property
def dtype(self):
return pd.CategoricalDtype(categories=self.categories, ordered=self._ordered)
class MaskedArray(BackendArray, Generic[K]):
"""
A wrapper class meant to enable working with lazy masked data.
We do not guarantee the stability of this API beyond that guaranteed
by :class:`xarray.backends.BackendArray`.
"""
_mask: ZarrOrHDF5Wrapper[K]
_values: ZarrOrHDF5Wrapper[K]
_dtype_str: Literal["nullable-integer", "nullable-boolean", "nullable-string-array"]
shape: tuple[int, ...]
base_path_or_zarr_group: Path | ZarrGroup
elem_name: str
def __init__(
self,
values: ZarrArray | H5Array,
dtype_str: Literal[
"nullable-integer", "nullable-boolean", "nullable-string-array"
],
mask: ZarrArray | H5Array,
base_path_or_zarr_group: Path | ZarrGroup,
elem_name: str,
):
self._mask = ZarrOrHDF5Wrapper(mask)
self._values = ZarrOrHDF5Wrapper(values)
self._dtype_str = dtype_str
self.shape = self._values.shape
self.base_path_or_zarr_group = base_path_or_zarr_group
self.file_format = "zarr" if isinstance(mask, ZarrArray) else "h5"
self.elem_name = elem_name
def __getitem__(
self, key: xr.core.indexing.ExplicitIndexer
) -> xr.core.extension_array.PandasExtensionArray:
values = self._values[key]
mask = self._mask[key]
if self._dtype_str == "nullable-integer":
# numpy does not support nan ints
extension_array = pd.arrays.IntegerArray(values, mask=mask)
elif self._dtype_str == "nullable-boolean":
extension_array = pd.arrays.BooleanArray(values, mask=mask)
elif self._dtype_str == "nullable-string-array":
values[mask] = pd.NA
extension_array = pd.array(values, dtype=pd.StringDtype())
else:
msg = f"Invalid dtype_str {self._dtype_str}"
raise RuntimeError(msg)
return xr.core.extension_array.PandasExtensionArray(extension_array)
@cached_property
def dtype(self):
if self._dtype_str == "nullable-integer":
return pd.array(
[],
dtype=str(pd.api.types.pandas_dtype(self._values.dtype)).capitalize(),
).dtype
elif self._dtype_str == "nullable-boolean":
return pd.BooleanDtype()
elif self._dtype_str == "nullable-string-array":
return pd.StringDtype()
msg = f"Invalid dtype_str {self._dtype_str}"
raise RuntimeError(msg)
@_subset.register(DataArray)
def _subset_masked(a: DataArray, subset_idx: Index):
return a[subset_idx]
@as_view.register(DataArray)
def _view_pd_boolean_array(a: DataArray, view_args):
return a
@get_chunksize.register(MaskedArray)
def _(a: MaskedArray):
return get_chunksize(a._values)
@get_chunksize.register(CategoricalArray)
def _(a: CategoricalArray):
return get_chunksize(a._codes)
python-anndata-0.12.0~rc1/src/anndata/experimental/backed/_xarray.py 0000664 0000000 0000000 00000010235 15003706322 0025444 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from typing import TYPE_CHECKING
import pandas as pd
from ..._core.anndata import AnnData, _gen_dataframe
from ..._core.file_backing import to_memory
from ..._core.index import _subset
from ..._core.views import as_view
try:
from xarray import Dataset
except ImportError:
class Dataset:
def __repr__(self) -> str:
return "mock Dataset"
if TYPE_CHECKING:
from collections.abc import Hashable, Iterable
from typing import Any, Literal
from ..._core.index import Index
from ._compat import xarray as xr
def get_index_dim(ds: xr.DataArray) -> Hashable:
if len(ds.sizes) != 1:
msg = f"xarray Dataset should not have more than 1 dims, found {len(ds.sizes)} {ds.sizes}, {ds}"
raise ValueError(msg)
return list(ds.indexes.keys())[0]
class Dataset2D(Dataset):
"""
A wrapper class meant to enable working with lazy dataframe data.
We do not guarantee the stability of this API beyond that guaranteed
by :class:`xarray.Dataset` and the `to_memory` function, a thin wrapper
around :meth:`xarray.Dataset.to_dataframe` to ensure roundtrip
compatibility here.
"""
__slots__ = ()
@property
def index(self) -> pd.Index:
""":attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.index` so this ensures usability
Returns
-------
        The index of the dataframe as resolved from :attr:`~xarray.Dataset.coords`.
"""
coord = get_index_dim(self)
return self.indexes[coord]
@index.setter
def index(self, val) -> None:
coord = get_index_dim(self)
self.coords[coord] = val
@property
def shape(self) -> tuple[int, int]:
""":attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.shape` so this ensures usability
Returns
-------
The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`.
"""
return (self.sizes[get_index_dim(self)], len(self))
@property
def iloc(self):
""":attr:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.iloc` so this ensures usability
Returns
-------
Handler class for doing the iloc-style indexing using :meth:`~xarray.Dataset.isel`.
"""
class IlocGetter:
def __init__(self, ds):
self._ds = ds
def __getitem__(self, idx):
coord = get_index_dim(self._ds)
return self._ds.isel(**{coord: idx})
return IlocGetter(self)
def to_memory(self, *, copy=False) -> pd.DataFrame:
df = self.to_dataframe()
index_key = self.attrs.get("indexing_key", None)
if df.index.name != index_key and index_key is not None:
df = df.set_index(index_key)
df.index.name = None # matches old AnnData object
return df
@property
def columns(self) -> pd.Index:
"""
:class:`~anndata.AnnData` internally looks for :attr:`~pandas.DataFrame.columns` so this ensures usability
Returns
-------
:class:`pandas.Index` that represents the "columns."
"""
columns_list = list(self.keys())
return pd.Index(columns_list)
@_subset.register(Dataset2D)
def _(a: Dataset2D, subset_idx: Index):
key = get_index_dim(a)
# xarray seems to have some code looking for a second entry in tuples
if isinstance(subset_idx, tuple) and len(subset_idx) == 1:
subset_idx = subset_idx[0]
return a.isel(**{key: subset_idx})
@as_view.register(Dataset2D)
def _(a: Dataset2D, view_args):
return a
@_gen_dataframe.register(Dataset2D)
def _gen_dataframe_xr(
anno: Dataset2D,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
return anno
@AnnData._remove_unused_categories.register(Dataset2D)
def _remove_unused_categories_xr(
df_full: Dataset2D, df_sub: Dataset2D, uns: dict[str, Any]
):
pass # this is handled automatically by the categorical arrays themselves i.e., they dedup upon access.
to_memory.register(Dataset2D, Dataset2D.to_memory)
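# Illustrative sketch (assuming xarray is installed): constructing a tiny Dataset2D by hand
# and exercising the pandas-like accessors defined above. In practice these objects are
# produced by `read_lazy` rather than built directly.
def _example_dataset2d() -> None:
    ds = Dataset2D(
        data_vars={"cell_type": ("obs_names", ["B", "T", "T"])},
        coords={"obs_names": ["c1", "c2", "c3"]},
    )
    assert ds.shape == (3, 1)
    assert list(ds.columns) == ["cell_type"]
    first_two = ds.iloc[[0, 1]]  # row selection via xarray's `isel`
    assert first_two.to_memory().shape == (2, 1)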
python-anndata-0.12.0~rc1/src/anndata/experimental/merge.py 0000664 0000000 0000000 00000052314 15003706322 0023671 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import shutil
from collections.abc import Mapping
from functools import singledispatch
from os import PathLike
from pathlib import Path
from typing import TYPE_CHECKING
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix, csr_matrix
from .._core.file_backing import to_memory
from .._core.merge import (
MissingVal,
_resolve_axis,
concat_arrays,
gen_inner_reindexers,
gen_reindexer,
intersect_keys,
merge_dataframes,
merge_indices,
resolve_merge_strategy,
unify_dtypes,
)
from .._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset
from .._io.specs import read_elem, write_elem
from ..compat import H5Array, H5Group, ZarrArray, ZarrGroup, _map_cat_to_str
from . import read_dispatched
if TYPE_CHECKING:
from collections.abc import Callable, Collection, Iterable, Sequence
from typing import Any, Literal
from .._core.merge import Reindexer, StrategiesLiteral
SPARSE_MATRIX = {"csc_matrix", "csr_matrix"}
EAGER_TYPES = {"dataframe", "awkward-array"}
###################
# Utilities
###################
# Wrapper to reindexer that stores if there is a change
# and won't do anything if there is
class IdentityReindexer:
def __init__(self):
self.no_change = True
def __call__(self, x, *args, **kwargs):
return x
# Checks if given indices are equal to each other in the whole list.
def _indices_equal(indices: Iterable[pd.Index]) -> bool:
init_elem = indices[0]
return all(np.array_equal(init_elem, elem) for elem in indices[1:])
def _gen_slice_to_append(
datasets: Sequence[BaseCompressedSparseDataset],
reindexers,
max_loaded_elems: int,
axis=0,
fill_value=None,
):
for ds, ri in zip(datasets, reindexers):
n_slices = ds.shape[axis] * ds.shape[1 - axis] // max_loaded_elems
if n_slices < 2:
yield (csr_matrix, csc_matrix)[axis](
ri(to_memory(ds), axis=1 - axis, fill_value=fill_value)
)
else:
slice_size = max_loaded_elems // ds.shape[1 - axis]
if slice_size == 0:
slice_size = 1
rem_slices = ds.shape[axis]
idx = 0
while rem_slices > 0:
ds_part = None
if axis == 0:
ds_part = ds[idx : idx + slice_size, :]
elif axis == 1:
ds_part = ds[:, idx : idx + slice_size]
yield (csr_matrix, csc_matrix)[axis](
ri(ds_part, axis=1 - axis, fill_value=fill_value)
)
rem_slices -= slice_size
idx += slice_size
###################
# File Management
###################
@singledispatch
def as_group(store, *, mode: str) -> ZarrGroup | H5Group:
msg = "This is not yet implemented."
raise NotImplementedError(msg)
@as_group.register(PathLike)
@as_group.register(str)
def _(store: PathLike[str] | str, *, mode: str) -> ZarrGroup | H5Group:
store = Path(store)
if store.suffix == ".h5ad":
import h5py
return h5py.File(store, mode=mode)
if mode == "r": # others all write: r+, a, w, w-
import zarr
return zarr.open_group(store, mode=mode)
from anndata._io.zarr import open_write_group
return open_write_group(store, mode=mode)
@as_group.register(ZarrGroup)
@as_group.register(H5Group)
def _(store, *, mode: str) -> ZarrGroup | H5Group:
del mode
return store
###################
# Reading
###################
def read_as_backed(group: ZarrGroup | H5Group):
"""
Read the group until
BaseCompressedSparseDataset, Array or EAGER_TYPES are encountered.
"""
def callback(func, elem_name: str, elem, iospec):
if iospec.encoding_type in SPARSE_MATRIX:
return sparse_dataset(elem)
elif iospec.encoding_type in EAGER_TYPES:
return read_elem(elem)
elif iospec.encoding_type == "array":
return elem
elif iospec.encoding_type == "dict":
return {k: read_as_backed(v) for k, v in dict(elem).items()}
else:
return func(elem)
return read_dispatched(group, callback=callback)
def _df_index(df: ZarrGroup | H5Group) -> pd.Index:
index_key = df.attrs["_index"]
return pd.Index(read_elem(df[index_key]))
###################
# Writing
###################
def write_concat_dense(
arrays: Sequence[ZarrArray | H5Array],
output_group: ZarrGroup | H5Group,
output_path: ZarrGroup | H5Group,
axis: Literal[0, 1] = 0,
reindexers: Reindexer = None,
fill_value=None,
):
"""
Writes the concatenation of given dense arrays to disk using dask.
"""
import dask.array as da
darrays = (
da.from_array(a, chunks="auto" if a.chunks is None else a.chunks)
for a in arrays
)
res = da.concatenate(
[
ri(a, axis=1 - axis, fill_value=fill_value)
for a, ri in zip(darrays, reindexers)
],
axis=axis,
)
write_elem(output_group, output_path, res)
output_group[output_path].attrs.update(
{"encoding-type": "array", "encoding-version": "0.2.0"}
)
def write_concat_sparse(
datasets: Sequence[BaseCompressedSparseDataset],
output_group: ZarrGroup | H5Group,
output_path: ZarrGroup | H5Group,
max_loaded_elems: int,
axis: Literal[0, 1] = 0,
reindexers: Reindexer = None,
fill_value=None,
):
"""
    Write and concatenate sparse datasets into a single output dataset.
    Parameters
    ----------
    datasets
        A sequence of BaseCompressedSparseDataset objects to be concatenated.
    output_group
        The output group where the concatenated dataset will be written.
    output_path
        The key within `output_group` under which the concatenated dataset will be written.
    max_loaded_elems
        The maximum number of sparse elements to load at once.
    axis
        The axis along which the datasets should be concatenated, by default 0.
    reindexers
        Reindexing operations to apply to each dataset before appending, by default None.
    fill_value
        The fill value to use for missing elements, by default None.
"""
elems = None
if all(ri.no_change for ri in reindexers):
elems = iter(datasets)
else:
elems = _gen_slice_to_append(
datasets, reindexers, max_loaded_elems, axis, fill_value
)
number_non_zero = sum(d.group["indices"].shape[0] for d in datasets)
init_elem = next(elems)
indptr_dtype = "int64" if number_non_zero >= np.iinfo(np.int32).max else "int32"
write_elem(
output_group,
output_path,
init_elem,
dataset_kwargs=dict(indptr_dtype=indptr_dtype),
)
del init_elem
out_dataset: BaseCompressedSparseDataset = read_as_backed(output_group[output_path])
for temp_elem in elems:
out_dataset.append(temp_elem)
del temp_elem
def _write_concat_mappings(
mappings,
output_group: ZarrGroup | H5Group,
keys,
path,
max_loaded_elems,
axis=0,
index=None,
reindexers=None,
fill_value=None,
):
"""
Write a list of mappings to a zarr/h5 group.
"""
mapping_group = output_group.create_group(path)
mapping_group.attrs.update(
{
"encoding-type": "dict",
"encoding-version": "0.1.0",
}
)
for k in keys:
elems = [m[k] for m in mappings]
_write_concat_sequence(
elems,
output_group=mapping_group,
output_path=k,
axis=axis,
index=index,
reindexers=reindexers,
fill_value=fill_value,
max_loaded_elems=max_loaded_elems,
)
def _write_concat_arrays(
arrays: Sequence[ZarrArray | H5Array | BaseCompressedSparseDataset],
output_group,
output_path,
max_loaded_elems,
axis=0,
reindexers=None,
fill_value=None,
join="inner",
):
init_elem = arrays[0]
init_type = type(init_elem)
if not all(isinstance(a, init_type) for a in arrays):
msg = f"All elements must be the same type instead got types: {[type(a) for a in arrays]}"
raise NotImplementedError(msg)
if reindexers is None:
if join == "inner":
reindexers = gen_inner_reindexers(arrays, new_index=None, axis=axis)
else:
msg = "Cannot reindex arrays with outer join."
raise NotImplementedError(msg)
if isinstance(init_elem, BaseCompressedSparseDataset):
expected_sparse_fmt = ["csr", "csc"][axis]
if all(a.format == expected_sparse_fmt for a in arrays):
write_concat_sparse(
arrays,
output_group,
output_path,
max_loaded_elems,
axis,
reindexers,
fill_value,
)
else:
msg = f"Concat of following not supported: {[a.format for a in arrays]}"
raise NotImplementedError(msg)
else:
write_concat_dense(
arrays, output_group, output_path, axis, reindexers, fill_value
)
def _write_concat_sequence(
arrays: Sequence[pd.DataFrame | BaseCompressedSparseDataset | H5Array | ZarrArray],
output_group,
output_path,
max_loaded_elems,
axis=0,
index=None,
reindexers=None,
fill_value=None,
join="inner",
):
"""
    array, dataframe, csr_matrix, csc_matrix
"""
if any(isinstance(a, pd.DataFrame) for a in arrays):
if reindexers is None:
if join == "inner":
reindexers = gen_inner_reindexers(arrays, None, axis=axis)
else:
msg = "Cannot reindex dataframes with outer join."
raise NotImplementedError(msg)
if not all(
isinstance(a, pd.DataFrame) or a is MissingVal or 0 in a.shape
for a in arrays
):
msg = "Cannot concatenate a dataframe with other array types."
raise NotImplementedError(msg)
df = concat_arrays(
arrays=arrays,
reindexers=reindexers,
axis=axis,
index=index,
fill_value=fill_value,
)
write_elem(output_group, output_path, df)
elif all(
isinstance(a, pd.DataFrame | BaseCompressedSparseDataset | H5Array | ZarrArray)
for a in arrays
):
_write_concat_arrays(
arrays,
output_group,
output_path,
max_loaded_elems,
axis,
reindexers,
fill_value,
join,
)
else:
msg = f"Concatenation of these types is not yet implemented: {[type(a) for a in arrays]} with axis={axis}."
raise NotImplementedError(msg)
def _write_alt_mapping(groups, output_group, alt_axis_name, alt_indices, merge):
alt_mapping = merge([read_as_backed(g[alt_axis_name]) for g in groups])
# If its empty, we need to write an empty dataframe with the correct index
if not alt_mapping:
alt_df = pd.DataFrame(index=alt_indices)
write_elem(output_group, alt_axis_name, alt_df)
else:
write_elem(output_group, alt_axis_name, alt_mapping)
def _write_alt_annot(groups, output_group, alt_axis_name, alt_indices, merge):
# Annotation for other axis
alt_annot = merge_dataframes(
[read_elem(g[alt_axis_name]) for g in groups], alt_indices, merge
)
write_elem(output_group, alt_axis_name, alt_annot)
def _write_axis_annot(
groups, output_group, axis_name, concat_indices, label, label_col, join
):
concat_annot = pd.concat(
unify_dtypes(read_elem(g[axis_name]) for g in groups),
join=join,
ignore_index=True,
)
concat_annot.index = concat_indices
if label is not None:
concat_annot[label] = label_col
write_elem(output_group, axis_name, concat_annot)
def concat_on_disk(
in_files: Collection[PathLike[str] | str] | Mapping[str, PathLike[str] | str],
out_file: PathLike[str] | str,
*,
max_loaded_elems: int = 100_000_000,
axis: Literal["obs", 0, "var", 1] = 0,
join: Literal["inner", "outer"] = "inner",
merge: StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None = None,
uns_merge: (
StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None
) = None,
label: str | None = None,
keys: Collection[str] | None = None,
index_unique: str | None = None,
fill_value: Any | None = None,
pairwise: bool = False,
) -> None:
"""\
Concatenates multiple AnnData objects along a specified axis using their
corresponding stores or paths, and writes the resulting AnnData object
to a target location on disk.
Unlike :func:`anndata.concat`, this method does not require
loading the input AnnData objects into memory,
making it a memory-efficient alternative for large datasets.
The resulting object written to disk should be equivalent
to the concatenation of the loaded AnnData objects using
:func:`anndata.concat`.
    To adjust the maximum amount of data loaded in memory: for sparse
    arrays, use the `max_loaded_elems` argument; for dense arrays,
    see the Dask documentation, since the Dask concatenation function is used
    to concatenate dense arrays in this function.
Params
------
in_files
The corresponding stores or paths of AnnData objects to
be concatenated. If a Mapping is passed, keys are used for the `keys`
argument and values are concatenated.
out_file
The target path or store to write the result in.
max_loaded_elems
The maximum number of elements to load in memory when concatenating
sparse arrays. Note that this number also includes the empty entries.
        Set to 100 million by default, meaning roughly 400 MB will be loaded
        into memory simultaneously.
axis
Which axis to concatenate along.
join
How to align values when concatenating. If `"outer"`, the union of the other axis
is taken. If `"inner"`, the intersection. See :doc:`concatenation <../concatenation>`
for more.
merge
How elements not aligned to the axis being concatenated along are selected.
Currently implemented strategies include:
* `None`: No elements are kept.
* `"same"`: Elements that are the same in each of the objects.
* `"unique"`: Elements for which there is only one possible value.
        * `"first"`: The first element seen at each position.
* `"only"`: Elements that show up in only one of the objects.
uns_merge
How the elements of `.uns` are selected. Uses the same set of strategies as
the `merge` argument, except applied recursively.
label
Column in axis annotation (i.e. `.obs` or `.var`) to place batch information in.
If it's None, no column is added.
keys
Names for each object being added. These values are used for column values for
`label` or appended to the index if `index_unique` is not `None`. Defaults to
incrementing integer labels.
index_unique
Whether to make the index unique by using the keys. If provided, this
is the delimiter between `"{orig_idx}{index_unique}{key}"`. When `None`,
the original indices are kept.
fill_value
When `join="outer"`, this is the value that will be used to fill the introduced
indices. By default, sparse arrays are padded with zeros, while dense arrays and
DataFrames are padded with missing values.
pairwise
Whether pairwise elements along the concatenated dimension should be included.
This is False by default, since the resulting arrays are often not meaningful.
Notes
-----
.. warning::
If you use `join='outer'` this fills 0s for sparse data when
variables are absent in a batch. Use this with care. Dense data is
filled with `NaN`.
Examples
--------
See :func:`anndata.concat` for the semantics.
The following examples highlight the differences this function has.
First, let’s get some “big” datasets with a compatible ``var`` axis:
>>> import httpx
>>> import scanpy as sc
>>> base_url = "https://datasets.cellxgene.cziscience.com"
>>> def get_cellxgene_data(id_: str):
... out_path = sc.settings.datasetdir / f'{id_}.h5ad'
... if out_path.exists():
... return out_path
... file_url = f"{base_url}/{id_}.h5ad"
... sc.settings.datasetdir.mkdir(parents=True, exist_ok=True)
... out_path.write_bytes(httpx.get(file_url).content)
... return out_path
>>> path_b_cells = get_cellxgene_data('a93eab58-3d82-4b61-8a2f-d7666dcdb7c4')
>>> path_fetal = get_cellxgene_data('d170ff04-6da0-4156-a719-f8e1bbefbf53')
Now we can concatenate them on-disk:
>>> import anndata as ad
>>> ad.experimental.concat_on_disk(
... dict(b_cells=path_b_cells, fetal=path_fetal),
... 'merged.h5ad',
... label='dataset',
... )
>>> adata = ad.read_h5ad('merged.h5ad', backed=True)
>>> adata.X
CSRDataset: backend hdf5, shape (490, 15585), data_dtype float32
>>> adata.obs['dataset'].value_counts() # doctest: +SKIP
dataset
fetal 344
b_cells 146
Name: count, dtype: int64
"""
if len(in_files) == 0:
msg = "No objects to concatenate."
raise ValueError(msg)
# Argument normalization
if pairwise:
msg = "pairwise concatenation not yet implemented"
raise NotImplementedError(msg)
merge = resolve_merge_strategy(merge)
uns_merge = resolve_merge_strategy(uns_merge)
out_file = Path(out_file)
if not out_file.parent.exists():
msg = f"Parent directory of {out_file} does not exist."
raise FileNotFoundError(msg)
if isinstance(in_files, Mapping):
if keys is not None:
msg = (
"Cannot specify categories in both mapping keys and using `keys`. "
"Only specify this once."
)
raise TypeError(msg)
keys, in_files = list(in_files.keys()), list(in_files.values())
else:
in_files = list(in_files)
if len(in_files) == 1:
shutil.copy2(in_files[0], out_file)
return
if keys is None:
keys = np.arange(len(in_files)).astype(str)
axis, axis_name = _resolve_axis(axis)
_, alt_axis_name = _resolve_axis(1 - axis)
output_group = as_group(out_file, mode="w")
groups = [as_group(f, mode="r") for f in in_files]
use_reindexing = False
alt_idxs = [_df_index(g[alt_axis_name]) for g in groups]
# All {axis_name}_names must be equal if reindexing not applied
if not _indices_equal(alt_idxs):
use_reindexing = True
# All groups must be anndata
if not all(g.attrs.get("encoding-type") == "anndata" for g in groups):
msg = "All groups must be anndata"
raise ValueError(msg)
# Write metadata
output_group.attrs.update({"encoding-type": "anndata", "encoding-version": "0.1.0"})
# Read the backed objects of Xs
Xs = [read_as_backed(g["X"]) for g in groups]
# Label column
label_col = pd.Categorical.from_codes(
np.repeat(np.arange(len(groups)), [x.shape[axis] for x in Xs]),
categories=keys,
)
# Combining indexes
concat_indices = pd.concat(
[pd.Series(_df_index(g[axis_name])) for g in groups], ignore_index=True
)
if index_unique is not None:
concat_indices = concat_indices.str.cat(
_map_cat_to_str(label_col), sep=index_unique
)
# Resulting indices for {axis_name} and {alt_axis_name}
concat_indices = pd.Index(concat_indices)
alt_index = merge_indices(alt_idxs, join=join)
reindexers = None
if use_reindexing:
reindexers = [
gen_reindexer(alt_index, alt_old_index) for alt_old_index in alt_idxs
]
else:
reindexers = [IdentityReindexer()] * len(groups)
# Write {axis_name}
_write_axis_annot(
groups, output_group, axis_name, concat_indices, label, label_col, join
)
# Write {alt_axis_name}
_write_alt_annot(groups, output_group, alt_axis_name, alt_index, merge)
# Write {alt_axis_name}m
_write_alt_mapping(groups, output_group, alt_axis_name, alt_index, merge)
# Write X
_write_concat_arrays(
arrays=Xs,
output_group=output_group,
output_path="X",
axis=axis,
reindexers=reindexers,
fill_value=fill_value,
max_loaded_elems=max_loaded_elems,
)
# Write Layers and {axis_name}m
mapping_names = [
(
f"{axis_name}m",
concat_indices,
0,
None if use_reindexing else [IdentityReindexer()] * len(groups),
),
("layers", None, axis, reindexers),
]
for m, m_index, m_axis, m_reindexers in mapping_names:
maps = [read_as_backed(g[m]) for g in groups]
_write_concat_mappings(
maps,
output_group,
intersect_keys(maps),
m,
max_loaded_elems=max_loaded_elems,
axis=m_axis,
index=m_index,
reindexers=m_reindexers,
fill_value=fill_value,
)
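# --- Illustrative sketch (not part of the library) --------------------------
# A hedged usage sketch of `concat_on_disk` beyond the docstring example:
# an outer join with a batch label and a lowered `max_loaded_elems` to cap the
# amount of sparse data held in memory.  The input paths are placeholders for
# existing .h5ad files that share (part of) their `var` axis.
def _demo_concat_on_disk_outer(
    paths=("batch_a.h5ad", "batch_b.h5ad"),  # hypothetical existing files
    out="merged_outer.h5ad",  # hypothetical output path
):
    import anndata as ad

    ad.experimental.concat_on_disk(
        dict(a=paths[0], b=paths[1]),
        out,
        join="outer",  # union of variables; sparse gaps are filled with 0
        label="batch",
        max_loaded_elems=10_000_000,  # ~40 MB of sparse data in memory at once
    )
    # Open the result backed to inspect it without loading X into memory.
    return ad.read_h5ad(out, backed="r")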
python-anndata-0.12.0~rc1/src/anndata/experimental/multi_files/ 0000775 0000000 0000000 00000000000 15003706322 0024527 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/experimental/multi_files/__init__.py 0000664 0000000 0000000 00000000153 15003706322 0026637 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from ._anncollection import AnnCollection
__all__ = ["AnnCollection"]
python-anndata-0.12.0~rc1/src/anndata/experimental/multi_files/_anncollection.py 0000664 0000000 0000000 00000105176 15003706322 0030102 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import warnings
from collections.abc import Callable, Mapping
from functools import reduce
from typing import TYPE_CHECKING
import numpy as np
import pandas as pd
from h5py import Dataset
from ..._core.aligned_mapping import AxisArrays
from ..._core.anndata import AnnData
from ..._core.index import _normalize_index, _normalize_indices
from ..._core.merge import concat_arrays, inner_concat_aligned_mapping
from ..._core.sparse_dataset import BaseCompressedSparseDataset
from ..._core.views import _resolve_idx
from ...compat import _map_cat_to_str, old_positionals
if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
from typing import Literal
from ..._core.index import Index
ATTRS = ["obs", "obsm", "layers"]
def _merge(arrs):
rxers = [lambda x, fill_value, axis: x] * len(arrs)
return concat_arrays(arrs, rxers)
def _select_convert(key, convert, arr=None):
key_convert = None
if callable(convert):
key_convert = convert
elif isinstance(convert, dict) and key in convert:
key_convert = convert[key]
if arr is not None:
return key_convert(arr) if key_convert is not None else arr
else:
return key_convert
def _harmonize_types(attrs_keys, adatas):
attrs_keys_types = {}
def check_type(attr, key=None):
arrs = []
for a in adatas:
attr_arr = getattr(a, attr)
if key is not None:
attr_arr = attr_arr[key]
arrs.append(attr_arr)
# hacky but numpy find_common_type doesn't work with categoricals
try:
dtype = _merge([arr[:1] for arr in arrs]).dtype
except ValueError:
dtype = _merge([arr[:1, :1] for arr in arrs]).dtype
return dtype
for attr, keys in attrs_keys.items():
if len(keys) == 0:
continue
attrs_keys_types[attr] = {}
for key in keys:
attrs_keys_types[attr][key] = check_type(attr, key)
attrs_keys_types["X"] = check_type("X")
return attrs_keys_types
class _ConcatViewMixin:
def _resolve_idx(self, oidx, vidx):
adatas_oidx = []
reverse = None
old_oidx = getattr(self, "oidx", None)
if old_oidx is not None:
oidx = _resolve_idx(old_oidx, oidx, self.limits[-1])
if isinstance(oidx, slice):
start, stop, step = oidx.indices(self.limits[-1])
oidx = np.arange(start, stop, step)
else:
oidx = np.array([oidx]) if isinstance(oidx, int) else oidx
u_oidx = oidx
if len(self.adatas) == 1:
return [u_oidx], oidx, vidx, reverse
iter_limits = list(zip([0] + self.limits, self.limits))
n_adatas_used = 0
for lower, upper in iter_limits:
if np.any((u_oidx >= lower) & (u_oidx < upper)):
n_adatas_used += 1
need_reverse = (
self.indices_strict
and n_adatas_used > 1
and u_oidx.size > 1
and np.any(u_oidx[:-1] > u_oidx[1:])
)
if need_reverse:
u_oidx, reverse = np.unique(u_oidx, return_inverse=True)
for lower, upper in iter_limits:
mask = (u_oidx >= lower) & (u_oidx < upper)
adatas_oidx.append(u_oidx[mask] - lower if mask.any() else None)
old_vidx = getattr(self, "vidx", None)
if old_vidx is not None:
vidx = _resolve_idx(old_vidx, vidx, self.adatas[0].n_vars)
if isinstance(vidx, int):
vidx = np.array([vidx])
return adatas_oidx, oidx, vidx, reverse
class _IterateViewMixin:
@old_positionals("axis", "shuffle", "drop_last")
def iterate_axis(
self,
batch_size: int,
*,
axis: Literal[0, 1] = 0,
shuffle: bool = False,
drop_last: bool = False,
):
"""Iterate the lazy object over an axis.
Parameters
----------
batch_size
How many samples to put into a batch when iterating.
axis
The axis to iterate over.
shuffle
Set to `True` to have the indices reshuffled before iterating.
drop_last
Set to `True` to drop a batch with the length lower than `batch_size`.
"""
if axis not in {0, 1}:
msg = "Axis should be either 0 or 1."
raise ValueError(msg)
n = self.shape[axis]
if shuffle:
indices = np.random.permutation(n).tolist()
else:
indices = list(range(n))
for i in range(0, n, batch_size):
idx = indices[i : min(i + batch_size, n)]
if axis == 1:
batch = self[:, idx]
else:
batch = self[idx]
# only happens if the last batch is smaller than batch_size
if len(batch) < batch_size and drop_last:
continue
yield batch, idx
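# --- Illustrative sketch (not part of the library) --------------------------
# `iterate_axis` is inherited by `AnnCollection` and its views, so minibatch
# iteration looks like the hedged sketch below.  The AnnData objects are tiny
# and random; only the iteration pattern matters.
def _demo_iterate_axis():
    import numpy as np
    import pandas as pd

    from anndata import AnnData
    from anndata.experimental import AnnCollection

    adatas = [
        AnnData(
            X=np.random.rand(10, 5),
            obs=pd.DataFrame(index=[f"cell{i}_{j}" for i in range(10)]),
        )
        for j in range(2)
    ]
    collection = AnnCollection(adatas, join_vars="inner")

    for batch, idx in collection.iterate_axis(batch_size=4, shuffle=True):
        # `batch` is an AnnCollectionView, `idx` the observation indices used.
        assert batch.shape[0] == len(idx)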
class MapObsView:
def __init__(
self,
attr,
adatas,
keys,
adatas_oidx,
adatas_vidx=None,
convert=None,
reverse=None,
dtypes=None,
obs_names=None,
):
self.adatas = adatas
self._keys = keys
self.adatas_oidx = adatas_oidx
self.adatas_vidx = adatas_vidx
self.attr = attr
self.convert = convert
self.reverse = reverse
self.dtypes = dtypes
self.obs_names = obs_names
def __getitem__(self, key: str, *, use_convert: bool = True):
if self._keys is not None and key not in self._keys:
msg = f"No {key} in {self.attr} view"
raise KeyError(msg)
arrs = []
for i, oidx in enumerate(self.adatas_oidx):
if oidx is None:
continue
arr = getattr(self.adatas[i], self.attr)[key]
if self.adatas_vidx is not None:
vidx = self.adatas_vidx[i]
else:
vidx = None
if vidx is not None:
idx = oidx, vidx
else:
idx = oidx
if isinstance(arr, pd.DataFrame):
arrs.append(arr.iloc[idx])
else:
if vidx is not None:
idx = np.ix_(*idx) if not isinstance(idx[1], slice) else idx
arrs.append(arr.iloc[idx] if isinstance(arr, pd.Series) else arr[idx])
if len(arrs) > 1:
_arr = _merge(arrs)
_arr = _arr if self.reverse is None else _arr[self.reverse]
else:
_arr = arrs[0]
# what if it is a dataframe?
if self.dtypes is not None:
_arr = _arr.astype(self.dtypes[key], copy=False)
if self.convert is not None and use_convert:
_arr = _select_convert(key, self.convert, _arr)
return _arr
def keys(self):
if self._keys is not None:
return self._keys
else:
return list(getattr(self.adatas[0], self.attr).keys())
@old_positionals("use_convert")
def to_dict(self, keys: Iterable[str] | None = None, *, use_convert=True):
dct = {}
keys = self.keys() if keys is None else keys
for key in keys:
dct[key] = self.__getitem__(key, use_convert=use_convert)
return dct
@property
def df(self):
if self.attr != "obs":
return None
return pd.DataFrame(self.to_dict(use_convert=False), index=self.obs_names)
def __repr__(self):
descr = f"View of {self.attr} with keys: {str(self.keys())[1:-1]}"
return descr
class AnnCollectionView(_ConcatViewMixin, _IterateViewMixin):
"""\
An object to access the observation attributes of `adatas` in AnnCollection.
Created as a result of subsetting an :class:`~anndata.experimental.AnnCollection` object.
An object of this class can have `.obs`, `.obsm`, `.layers`, `.X` depending on the
results of joins in the reference AnnCollection object.
Notes
-----
Nothing is copied until keys of the attributes or `.X` are accessed.
"""
def __init__(self, reference, convert, resolved_idx):
self.reference = reference
self.indices_strict = self.reference.indices_strict
self.adatas = self.reference.adatas
self.limits = self.reference.limits
self.adatas_oidx, self.oidx, self.vidx, self.reverse = resolved_idx
self.adatas_vidx = []
for i, vidx in enumerate(self.reference.adatas_vidx):
if vidx is None:
self.adatas_vidx.append(self.vidx)
else:
new_vidx = _resolve_idx(vidx, self.vidx, self.adatas[i].n_vars)
self.adatas_vidx.append(new_vidx)
self._view_attrs_keys = self.reference._view_attrs_keys
self._attrs = self.reference._attrs
self._dtypes = self.reference._dtypes
self._layers_view, self._obsm_view, self._obs_view = None, None, None
self._X = None
self._convert = None
self._convert_X = None
self.convert = convert
def _lazy_init_attr(self, attr: str, *, set_vidx: bool = False):
if getattr(self, f"_{attr}_view") is not None:
return
keys = None
attr_dtypes = None
if attr in self._view_attrs_keys:
reverse = self.reverse
keys = self._view_attrs_keys[attr]
if len(keys) == 0:
return
adatas = self.adatas
adatas_oidx = self.adatas_oidx
if self._dtypes is not None:
attr_dtypes = self._dtypes[attr]
else:
reverse = None
adatas = [self.reference]
adatas_oidx = [self.oidx]
adatas_vidx = self.adatas_vidx if set_vidx else None
attr_convert = None
if self.convert is not None:
attr_convert = _select_convert(attr, self.convert)
if attr == "obs":
obs_names = self.obs_names
else:
obs_names = None
setattr(
self,
f"_{attr}_view",
MapObsView(
attr,
adatas,
keys,
adatas_oidx,
adatas_vidx,
attr_convert,
reverse,
attr_dtypes,
obs_names,
),
)
def _gather_X(self):
if self._X is not None:
return self._X
Xs = []
for i, oidx in enumerate(self.adatas_oidx):
if oidx is None:
continue
adata = self.adatas[i]
X = adata.X
vidx = self.adatas_vidx[i]
if isinstance(X, Dataset):
reverse = None
if oidx.size > 1 and np.any(oidx[:-1] >= oidx[1:]):
oidx, reverse = np.unique(oidx, return_inverse=True)
if isinstance(vidx, slice):
arr = X[oidx, vidx]
else:
# this is a very memory inefficient approach
# todo: fix
arr = X[oidx][:, vidx]
Xs.append(arr if reverse is None else arr[reverse])
elif isinstance(X, BaseCompressedSparseDataset):
# very slow indexing with two arrays
if isinstance(vidx, slice) or len(vidx) <= 1000:
Xs.append(X[oidx, vidx])
else:
Xs.append(X[oidx][:, vidx])
else:
# if vidx is present it is less memory efficient
idx = oidx, vidx
idx = np.ix_(*idx) if not isinstance(vidx, slice) else idx
Xs.append(X[idx])
if len(Xs) > 1:
_X = _merge(Xs)
# todo: get rid of reverse for dense arrays
_X = _X if self.reverse is None else _X[self.reverse]
else:
_X = Xs[0]
if self._dtypes is not None:
_X = _X.astype(self._dtypes["X"], copy=False)
self._X = _X
return _X
@property
def X(self):
"""Lazy subset of data matrix.
The data matrix formed from the `.X` attributes of the underlying `adatas`,
properly reindexed and lazily merged.
Nothing is copied until `.X` is accessed, no real concatenation of the
underlying `.X` attributes is done.
"""
# inconsistent behavior here, _X can be changed,
# but the other attributes can't be changed.
# maybe do return ... _X.copy() or _X.setflags(write=False)
_X = self._gather_X()
return self._convert_X(_X) if self._convert_X is not None else _X
@property
def layers(self):
"""Lazy subset of layers.
The layers attribute formed from lazy inner join and subsetting of the `.layers`
of the underlying `adatas`. No copy is made until you access a key from `.layers`,
only the subset of the accessed key is copied.
To get `.layers` as a dictionary, use `.layers.to_dict()`. You can also specify keys
to include in the dict `.layers.to_dict(keys=['key1', 'key2'])` and if you want
converters to be turned off when copying to dict `.layers.to_dict(use_convert=False)`.
"""
self._lazy_init_attr("layers", set_vidx=True)
return self._layers_view
@property
def obsm(self):
"""Lazy subset of multi-dimensional annotation of observations.
Points to the `.obsm` attributes of the underlying adatas to `.obsm` of the parent
AnnCollection object depending on the `join_obsm` option of the AnnCollection object.
See the docs of :class:`~anndata.experimental.AnnCollection` for details.
Copy rules are the same as for `.layers`, i.e. everything is lazy.
To get `.obsm` as a dictionary, use `.obsm.to_dict()`. You can also specify keys
to include in the dict `.obsm.to_dict(keys=['key1', 'key2'])` and if you want
converters to be turned off when copying to dict `.obsm.to_dict(use_convert=False)`.
"""
self._lazy_init_attr("obsm")
return self._obsm_view
@property
def obs(self):
"""Lazy suset of one-dimensional annotation of observations.
Points to the `.obs` attributes of the underlying adatas to `.obs` of the parent
AnnCollection object depending on the `join_obs` option of the AnnCollection object.
        See the docs of :class:`~anndata.experimental.AnnCollection` for details.
Copy rules are the same as for `.layers`, i.e. everything is lazy.
To get `.obs` as a DataFrame, use `.obs.df`.
To get `.obs` as a dictionary, use `.obs.to_dict()`. You can also specify keys
to include in the dict `.obs.to_dict(keys=['key1', 'key2'])` and if you want
converters to be turned off when copying to dict `.obs.to_dict(use_convert=False)`.
"""
self._lazy_init_attr("obs")
return self._obs_view
@property
def obs_names(self):
"""Names of observations of this subset object."""
return self.reference.obs_names[self.oidx]
@property
def var_names(self):
"""Names of variables of this subset object."""
return self.reference.var_names[self.vidx]
@property
def shape(self):
"""Shape of the lazily concatenated subset of the data matrix."""
return len(self.obs_names), len(self.var_names)
@property
def n_obs(self):
"""Number of observations."""
return self.shape[0]
@property
def n_vars(self):
"""Number of variables/features."""
return self.shape[1]
@property
def convert(self):
"""On the fly converters for keys of attributes and data matrix.
A function or a Mapping of functions which will be applied
to the values of attributes (`.X`) or to specific keys of these attributes
(`.obs`, `.obsm`, `.layers`).
The keys of the Mapping should correspond to the attributes or keys of the
attributes (hierarchically) and the values should be functions used for conversion.
Examples
        --------
::
{
# densify .X
"X": lambda a: a.toarray() if issparse(a) else a,
# change dtype for all keys of .obsm
"obsm": lambda a: np.asarray(a, dtype="float32"),
# change type only for one key of .obs
"obs": dict(key1=lambda c: c.astype(str)),
}
"""
return self._convert
@convert.setter
def convert(self, value):
self._convert = value
self._convert_X = _select_convert("X", self._convert)
for attr in ATTRS:
setattr(self, f"_{attr}_view", None)
def __len__(self):
return len(self.obs_names)
def __getitem__(self, index: Index):
oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names)
resolved_idx = self._resolve_idx(oidx, vidx)
return AnnCollectionView(self.reference, self.convert, resolved_idx)
@property
def has_backed(self):
"""`True` if the current subset of `adatas` has backed objects, `False` otherwise."""
for i, adata in enumerate(self.adatas):
if adata.isbacked and self.adatas_oidx[i] is not None:
return True
return False
def __repr__(self):
n_obs, n_vars = self.shape
descr = f"AnnCollectionView object with n_obs × n_vars = {n_obs} × {n_vars}"
all_attrs_keys = self._view_attrs_keys.copy()
for attr in self._attrs:
all_attrs_keys[attr] = list(getattr(self.reference, attr).keys())
for attr, keys in all_attrs_keys.items():
if len(keys) > 0:
descr += f"\n {attr}: {str(keys)[1:-1]}"
return descr
@old_positionals("ignore_X", "ignore_layers")
def to_adata(self, *, ignore_X: bool = False, ignore_layers: bool = False):
"""Convert this AnnCollectionView object to an AnnData object.
Parameters
----------
ignore_X
            if `True`, do not add `.X` to the resulting AnnData object.
        ignore_layers
            if `True`, do not copy `.layers` to the resulting AnnData object.
"""
if ignore_layers or self.layers is None:
layers = None
else:
layers = self.layers.to_dict(use_convert=False)
obsm = None if self.obsm is None else self.obsm.to_dict(use_convert=False)
obs = (
None
if self.obs is None
else pd.DataFrame(self.obs.to_dict(use_convert=False))
)
if ignore_X:
X = None
shape = self.shape
else:
X = self._gather_X()
shape = None
adata = AnnData(X, obs=obs, obsm=obsm, layers=layers, shape=shape)
adata.obs_names = self.obs_names
adata.var_names = self.var_names
return adata
@property
def attrs_keys(self):
"""Dict of all accessible attributes and their keys."""
return self.reference.attrs_keys
DictCallable = dict[str, Callable]
ConvertType = Callable | dict[str, Callable | DictCallable]
class AnnCollection(_ConcatViewMixin, _IterateViewMixin):
"""\
Lazily concatenate AnnData objects along the `obs` axis.
This class doesn't copy data from underlying AnnData objects, but lazily subsets using a joint
index of observations and variables. It also allows on-the-fly application of prespecified
converters to `.obs` attributes of the AnnData objects.
Subsetting of this object returns an `AnnCollectionView`, which provides views of `.obs`,
`.obsm`, `.layers`, `.X` from the underlying AnnData objects.
Parameters
----------
adatas
The objects to be lazily concatenated.
If a Mapping is passed, keys are used for the `keys` argument and values are concatenated.
join_obs
        If "inner" is specified, all `.obs` attributes from `adatas` will be inner joined
        and copied to this object.
        If "outer" is specified, all `.obs` attributes from `adatas` will be outer joined
        and copied to this object.
For "inner" and "outer" subset objects will access `.obs` of this object,
not the original `.obs` attributes of `adatas`.
If `None`, nothing is copied to this object's `.obs`, a subset object will directly
access `.obs` attributes of `adatas` (with proper reindexing and dtype conversions).
        For `None` the inner join rule is used to select columns of `.obs` of `adatas`.
join_obsm
        If "inner" is specified, all `.obsm` attributes from `adatas` will be inner joined
and copied to this object. Subset objects will access `.obsm` of this object,
not the original `.obsm` attributes of `adatas`.
If `None`, nothing is copied to this object's `.obsm`, a subset object will directly
access `.obsm` attributes of `adatas` (with proper reindexing and dtype conversions).
For both options the inner join rule for the underlying `.obsm` attributes is used.
join_vars
Specify how to join `adatas` along the var axis. If `None`, assumes all `adatas`
have the same variables. If "inner", the intersection of all variables in
`adatas` will be used.
label
Column in `.obs` to place batch information in.
If it's None, no column is added.
keys
Names for each object being added. These values are used for column values for
`label` or appended to the index if `index_unique` is not `None`. Defaults to
incrementing integer labels.
index_unique
Whether to make the index unique by using the keys. If provided, this
is the delimiter between "{orig_idx}{index_unique}{key}". When `None`,
the original indices are kept.
convert
You can pass a function or a Mapping of functions which will be applied
to the values of attributes (`.obs`, `.obsm`, `.layers`, `.X`) or to specific
keys of these attributes in the subset object.
Specify an attribute and a key (if needed) as keys of the passed Mapping
and a function to be applied as a value.
harmonize_dtypes
If `True`, all retrieved arrays from subset objects will have the same dtype.
indices_strict
If `True`, arrays from the subset objects will always have the same order
of indices as in selection used to subset.
This parameter can be set to `False` if the order in the returned arrays
is not important, for example, when using them for stochastic gradient descent.
In this case the performance of subsetting can be a bit better.
Examples
    --------
>>> from scanpy.datasets import pbmc68k_reduced, pbmc3k_processed
>>> adata1, adata2 = pbmc68k_reduced(), pbmc3k_processed()
>>> adata1.shape
(700, 765)
>>> adata2.shape
(2638, 1838)
>>> dc = AnnCollection([adata1, adata2], join_vars='inner')
>>> dc
AnnCollection object with n_obs × n_vars = 3338 × 208
constructed from 2 AnnData objects
view of obsm: 'X_pca', 'X_umap'
obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain'
>>> batch = dc[100:200] # AnnCollectionView
>>> batch
AnnCollectionView object with n_obs × n_vars = 100 × 208
obsm: 'X_pca', 'X_umap'
obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain'
>>> batch.X.shape
(100, 208)
>>> len(batch.obs['louvain'])
100
"""
@old_positionals(
"join_obs",
"join_obsm",
"join_vars",
"label",
"keys",
"index_unique",
"convert",
"harmonize_dtypes",
"indices_strict",
)
def __init__(
self,
adatas: Sequence[AnnData] | dict[str, AnnData],
*,
join_obs: Literal["inner", "outer"] | None = "inner",
join_obsm: Literal["inner"] | None = None,
join_vars: Literal["inner"] | None = None,
label: str | None = None,
keys: Sequence[str] | None = None,
index_unique: str | None = None,
convert: ConvertType | None = None,
harmonize_dtypes: bool = True,
indices_strict: bool = True,
):
if isinstance(adatas, Mapping):
if keys is not None:
msg = (
"Cannot specify categories in both mapping keys and using `keys`. "
"Only specify this once."
)
raise TypeError(msg)
keys, adatas = list(adatas.keys()), list(adatas.values())
else:
adatas = list(adatas)
# check if the variables are the same in all adatas
self.adatas_vidx = [None for adata in adatas]
vars_names_list = [adata.var_names for adata in adatas]
vars_eq = all([adatas[0].var_names.equals(vrs) for vrs in vars_names_list[1:]])
if vars_eq:
self.var_names = adatas[0].var_names
elif join_vars == "inner":
var_names = reduce(pd.Index.intersection, vars_names_list)
self.adatas_vidx = []
for adata in adatas:
if var_names.equals(adata.var_names):
self.adatas_vidx.append(None)
else:
adata_vidx = _normalize_index(var_names, adata.var_names)
self.adatas_vidx.append(adata_vidx)
self.var_names = var_names
else:
msg = (
"Adatas have different variables. "
"Please specify join_vars='inner' for intersection."
)
raise ValueError(msg)
concat_indices = pd.concat(
[pd.Series(a.obs_names) for a in adatas], ignore_index=True
)
if keys is None:
keys = np.arange(len(adatas)).astype(str)
label_col = pd.Categorical.from_codes(
np.repeat(np.arange(len(adatas)), [a.shape[0] for a in adatas]),
categories=keys,
)
if index_unique is not None:
concat_indices = concat_indices.str.cat(
_map_cat_to_str(label_col), sep=index_unique
)
self.obs_names = pd.Index(concat_indices)
if not self.obs_names.is_unique:
warnings.warn("Observation names are not unique.", UserWarning)
view_attrs = ATTRS.copy()
self._attrs = []
# process obs joins
if join_obs is not None:
view_attrs.remove("obs")
self._attrs.append("obs")
concat_annot = pd.concat(
[a.obs for a in adatas], join=join_obs, ignore_index=True
)
concat_annot.index = self.obs_names
self._obs = concat_annot
else:
self._obs = pd.DataFrame(index=self.obs_names)
if label is not None:
self._obs[label] = label_col
# process obsm inner join
self._obsm = None
if join_obsm == "inner":
view_attrs.remove("obsm")
self._attrs.append("obsm")
self._obsm = inner_concat_aligned_mapping(
[a.obsm for a in adatas], index=self.obs_names
)
self._obsm = (
AxisArrays(self, axis=0, store={}) if self._obsm == {} else self._obsm
)
# process inner join of views
self._view_attrs_keys = {}
for attr in view_attrs:
self._view_attrs_keys[attr] = list(getattr(adatas[0], attr).keys())
for a in adatas[1:]:
for attr, keys in self._view_attrs_keys.items():
ai_attr = getattr(a, attr)
a0_attr = getattr(adatas[0], attr)
new_keys = []
for key in keys:
if key in ai_attr.keys():
a0_ashape = a0_attr[key].shape
ai_ashape = ai_attr[key].shape
if (
len(a0_ashape) < 2
or a0_ashape[1] == ai_ashape[1]
or attr == "layers"
):
new_keys.append(key)
self._view_attrs_keys[attr] = new_keys
self.adatas = adatas
self.limits = [adatas[0].n_obs]
for i in range(len(adatas) - 1):
self.limits.append(self.limits[i] + adatas[i + 1].n_obs)
# init converter
self._convert = convert
self._dtypes = None
if len(adatas) > 1 and harmonize_dtypes:
self._dtypes = _harmonize_types(self._view_attrs_keys, self.adatas)
self.indices_strict = indices_strict
def __getitem__(self, index: Index):
oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names)
resolved_idx = self._resolve_idx(oidx, vidx)
return AnnCollectionView(self, self.convert, resolved_idx)
@property
def convert(self):
"""On the fly converters for keys of attributes and data matrix.
A function or a Mapping of functions which will be applied
to the values of attributes (`.X`) or to specific keys of these attributes
(`.obs`, `.obsm`, `.layers`) of subset objects. The converters are not
applied to `.obs` and `.obsm` (if present) of this object, only to the attributes
of subset objects.
The keys of the Mapping should correspond to the attributes or keys of the
attributes (hierarchically) and the values should be functions used for conversion.
Examples
--------
::
{
# densify .X
"X": lambda a: a.toarray() if issparse(a) else a,
# change dtype for all keys of .obsm
"obsm": lambda a: np.asarray(a, dtype="float32"),
# change type only for one key of .obs
"obs": dict(key1=lambda c: c.astype(str)),
}
"""
return self._convert
@convert.setter
def convert(self, value):
self._convert = value
@property
def obs(self):
"""One-dimensional annotation of observations.
        If `join_obs` was set to "inner" or "outer", subset objects' `.obs`
will point to this `.obs`; otherwise, to `.obs` of the underlying objects (`adatas`).
"""
return self._obs
@property
def obsm(self):
"""Multi-dimensional annotation of observations.
If `join_obsm` was set to "inner", subset objects' `.obsm`
will point to this `.obsm`; otherwise, to `.obsm` of the underlying objects (`adatas`).
In the latter case, `.obsm` of this object will be `None`.
"""
return self._obsm
@property
def shape(self):
"""Shape of the lazily concatenated data matrix"""
return self.limits[-1], len(self.var_names)
@property
def n_obs(self):
"""Number of observations."""
return self.shape[0]
@property
def n_vars(self):
"""Number of variables/features."""
return self.shape[1]
def __len__(self):
return self.limits[-1]
def to_adata(self):
"""Convert this AnnCollection object to an AnnData object.
The AnnData object won't have `.X`, only `.obs` and `.obsm`.
"""
if "obs" in self._view_attrs_keys or "obsm" in self._view_attrs_keys:
concat_view = self[self.obs_names]
if "obsm" in self._view_attrs_keys:
obsm = (
concat_view.obsm.to_dict(use_convert=False)
if concat_view.obsm is not None
else None
)
else:
obsm = self.obsm.copy()
obs = self.obs.copy()
if "obs" in self._view_attrs_keys and concat_view.obs is not None:
for key, value in concat_view.obs.to_dict(use_convert=False).items():
obs[key] = value
adata = AnnData(X=None, obs=obs, obsm=obsm, shape=self.shape)
adata.obs_names = self.obs_names
adata.var_names = self.var_names
return adata
def lazy_attr(self, attr, key=None):
"""Get a subsettable key from an attribute (array-like) or an attribute.
Returns a LazyAttrData object which provides subsetting over the specified
attribute (`.obs` or `.obsm`) or over a key from this attribute.
In the latter case, it acts as a lazy array.
"""
return LazyAttrData(self, attr, key)
@property
def has_backed(self):
"""`True` if `adatas` have backed AnnData objects, `False` otherwise."""
return any([adata.isbacked for adata in self.adatas])
@property
def attrs_keys(self):
"""Dict of all accessible attributes and their keys."""
_attrs_keys = {}
for attr in self._attrs:
keys = list(getattr(self, attr).keys())
_attrs_keys[attr] = keys
_attrs_keys.update(self._view_attrs_keys)
return _attrs_keys
def __repr__(self):
n_obs, n_vars = self.shape
descr = f"AnnCollection object with n_obs × n_vars = {n_obs} × {n_vars}"
descr += f"\n constructed from {len(self.adatas)} AnnData objects"
for attr, keys in self._view_attrs_keys.items():
if len(keys) > 0:
descr += f"\n view of {attr}: {str(keys)[1:-1]}"
for attr in self._attrs:
keys = list(getattr(self, attr).keys())
if len(keys) > 0:
descr += f"\n {attr}: {str(keys)[1:-1]}"
if "obs" in self._view_attrs_keys:
keys = list(self.obs.keys())
if len(keys) > 0:
descr += f"\n own obs: {str(keys)[1:-1]}"
return descr
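# --- Illustrative sketch (not part of the library) --------------------------
# A hedged, self-contained sketch of the converter mechanism described in the
# `convert` docstrings above: converters are applied lazily, only when a subset
# view materialises `.X` or a key of `.obs`.  All data below is random.
def _demo_anncollection_convert():
    import pandas as pd
    from scipy import sparse

    from anndata import AnnData
    from anndata.experimental import AnnCollection

    adatas = [
        AnnData(
            X=sparse.random(20, 6, density=0.3, format="csr"),
            obs=pd.DataFrame(
                {"group": ["a"] * 20}, index=[f"c{i}_{j}" for i in range(20)]
            ),
        )
        for j in range(2)
    ]
    convert = {
        # densify `.X` on access
        "X": lambda a: a.toarray() if sparse.issparse(a) else a,
        # convert one `.obs` column on access
        "obs": {"group": lambda c: c.astype("category")},
    }
    collection = AnnCollection(adatas, join_obs="inner", convert=convert)
    view = collection[:8]
    assert not sparse.issparse(view.X)  # converter already applied
    return view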
class LazyAttrData(_IterateViewMixin):
def __init__(self, adset: AnnCollection, attr: str, key: str | None = None):
self.adset = adset
self.attr = attr
self.key = key
def __getitem__(self, index):
oidx = None
vidx = None
if isinstance(index, tuple) and self.attr in {"obs", "obsm"}:
oidx = index[0]
if len(index) > 1:
vidx = index[1]
if oidx is None:
view = self.adset[index]
else:
view = self.adset[oidx]
attr_arr = getattr(view, self.attr)
if self.key is not None:
attr_arr = attr_arr[self.key]
return attr_arr if vidx is None else attr_arr[:, vidx]
@property
def shape(self):
shape = self.adset.shape
if self.attr in {"X", "layers"}:
return shape
elif self.attr == "obs":
return (shape[0],)
elif self.attr == "obsm" and self.key is not None:
return shape[0], self[:1].shape[1]
else:
return None
@property
def ndim(self):
return len(self.shape) if self.shape is not None else 0
@property
def dtype(self):
_dtypes = self.adset._dtypes
if _dtypes is not None and self.attr in _dtypes:
return _dtypes[self.attr][self.key]
attr = self[:1]
if hasattr(attr, "dtype"):
return attr.dtype
else:
return None
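# --- Illustrative sketch (not part of the library) --------------------------
# Hedged sketch of `AnnCollection.lazy_attr`: wrapping a single `.obs` column
# as a lazily subsettable, array-like object (e.g. labels for a training loop).
def _demo_lazy_attr():
    import numpy as np
    import pandas as pd

    from anndata import AnnData
    from anndata.experimental import AnnCollection

    adata = AnnData(
        X=np.random.rand(30, 4),
        obs=pd.DataFrame(
            {"label": np.random.randint(0, 3, 30)}, index=[f"c{i}" for i in range(30)]
        ),
    )
    collection = AnnCollection([adata], join_obs="inner")

    labels = collection.lazy_attr("obs", "label")
    assert labels.shape == (30,)
    assert len(labels[:10]) == 10  # nothing is copied until subsetted like this
    return labels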
python-anndata-0.12.0~rc1/src/anndata/experimental/pytorch/ 0000775 0000000 0000000 00000000000 15003706322 0023703 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/experimental/pytorch/__init__.py 0000664 0000000 0000000 00000000137 15003706322 0026015 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from ._annloader import AnnLoader
__all__ = ["AnnLoader"]
python-anndata-0.12.0~rc1/src/anndata/experimental/pytorch/_annloader.py 0000664 0000000 0000000 00000017745 15003706322 0026375 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from collections.abc import Mapping
from copy import copy
from functools import partial
from importlib.util import find_spec
from math import ceil
from typing import TYPE_CHECKING
import numpy as np
from scipy.sparse import issparse
from ..._core.anndata import AnnData
from ...compat import old_positionals
from ..multi_files._anncollection import AnnCollection, _ConcatViewMixin
if find_spec("torch") or TYPE_CHECKING:
import torch
from torch.utils.data import BatchSampler, DataLoader, Sampler
else:
Sampler, BatchSampler, DataLoader = object, object, object
if TYPE_CHECKING:
from collections.abc import Callable, Generator, Sequence
from typing import TypeAlias, Union
from scipy.sparse import spmatrix
# need to use Union because of autodoc_mock_imports
Array: TypeAlias = Union[torch.Tensor, np.ndarray, spmatrix] # noqa: UP007
# Custom sampler to get proper batches instead of joined separate indices
# maybe move to multi_files
class BatchIndexSampler(Sampler):
@old_positionals("batch_size", "shuffle", "drop_last")
def __init__(
self,
n_obs: int,
*,
batch_size: int,
shuffle: bool = False,
drop_last: bool = False,
) -> None:
self.n_obs = n_obs
self.batch_size = batch_size if batch_size < n_obs else n_obs
self.shuffle = shuffle
self.drop_last = drop_last
def __iter__(self) -> Generator[list[int], None, None]:
indices: list[int]
if self.shuffle:
indices = np.random.permutation(self.n_obs).tolist()
else:
indices = list(range(self.n_obs))
for i in range(0, self.n_obs, self.batch_size):
batch = indices[i : min(i + self.batch_size, self.n_obs)]
# only happens if the last batch is smaller than batch_size
if len(batch) < self.batch_size and self.drop_last:
continue
yield batch
def __len__(self) -> int:
if self.drop_last:
length = self.n_obs // self.batch_size
else:
length = ceil(self.n_obs / self.batch_size)
return length
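# --- Illustrative sketch (not part of the library) --------------------------
# Hedged sketch of what `BatchIndexSampler` yields: whole lists of indices per
# batch rather than single indices, which is what lets `AnnLoader` subset an
# AnnCollection once per batch instead of once per observation.
def _demo_batch_index_sampler():
    sampler = BatchIndexSampler(10, batch_size=4, shuffle=False, drop_last=False)
    assert list(sampler) == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    assert len(sampler) == 3  # the short final batch is kept when drop_last=False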
# maybe replace use_cuda with explicit device option
def default_converter(arr: Array, *, use_cuda: bool, pin_memory: bool):
if isinstance(arr, torch.Tensor):
if use_cuda:
arr = arr.cuda()
elif pin_memory:
arr = arr.pin_memory()
elif arr.dtype.name != "category" and np.issubdtype(arr.dtype, np.number):
if issparse(arr):
arr = arr.toarray()
if use_cuda:
arr = torch.tensor(arr, device="cuda")
else:
arr = torch.tensor(arr)
arr = arr.pin_memory() if pin_memory else arr
return arr
def _convert_on_top(
convert: Callable[[Array], Array] | None | Mapping[str, Callable[[Array], Array]],
top_convert: Callable[[Array], Array],
attrs_keys: Sequence[str] | Mapping[str, Sequence[str]],
):
if convert is None:
new_convert = top_convert
elif callable(convert):
def compose_convert(arr):
return top_convert(convert(arr))
new_convert = compose_convert
else:
new_convert = {}
for attr in attrs_keys:
if attr not in convert:
new_convert[attr] = top_convert
else:
as_ks: Sequence[str] | None
if not isinstance(attrs_keys, Mapping):
as_ks = None
else:
as_ks = attrs_keys[attr]
new_convert[attr] = _convert_on_top(convert[attr], top_convert, as_ks)
return new_convert
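# --- Illustrative sketch (not part of the library) --------------------------
# Hedged sketch of `_convert_on_top`: a user-supplied converter is composed
# with a "top" converter (in `AnnLoader`, the default tensor converter) so that
# the user's function runs first and the top converter runs on its output.
def _demo_convert_on_top():
    import numpy as np

    user_convert = {"X": lambda a: a.astype(np.float32)}  # user converter for .X only

    def top(arr):  # stands in for `default_converter`
        return arr * 2

    composed = _convert_on_top(user_convert, top, {"X": [], "obs": ["col"]})
    x = composed["X"](np.ones(3, dtype=np.float64))
    assert x.dtype == np.float32 and (x == 2).all()
    obs_col = composed["obs"](np.arange(3))  # no user converter here -> only `top`
    assert (obs_col == np.array([0, 2, 4])).all()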
# AnnLoader has the same arguments as DataLoader, but uses BatchIndexSampler by default
class AnnLoader(DataLoader):
"""\
PyTorch DataLoader for AnnData objects.
Builds DataLoader from a sequence of AnnData objects, from an
:class:`~anndata.experimental.AnnCollection` object or from an `AnnCollectionView` object.
Takes care of the required conversions.
Parameters
----------
adatas
`AnnData` objects or an `AnnCollection` object from which to load the data.
batch_size
How many samples per batch to load.
shuffle
Set to `True` to have the data reshuffled at every epoch.
use_default_converter
Use the default converter to convert arrays to pytorch tensors, transfer to
the default cuda device (if `use_cuda=True`), do memory pinning (if `pin_memory=True`).
If you pass an AnnCollection object with prespecified converters, the default converter
won't overwrite these converters but will be applied on top of them.
use_cuda
Transfer pytorch tensors to the default cuda device after conversion.
Only works if `use_default_converter=True`
**kwargs
Arguments for PyTorch DataLoader. If `adatas` is not an `AnnCollection` object, then also
arguments for `AnnCollection` initialization.
"""
@old_positionals("batch_size", "shuffle", "use_default_converter", "use_cuda")
def __init__(
self,
adatas: Sequence[AnnData] | dict[str, AnnData],
*,
batch_size: int = 1,
shuffle: bool = False,
use_default_converter: bool = True,
use_cuda: bool = False,
**kwargs,
):
if isinstance(adatas, AnnData):
adatas = [adatas]
        if isinstance(adatas, (list, tuple, dict)):
join_obs = kwargs.pop("join_obs", "inner")
join_obsm = kwargs.pop("join_obsm", None)
label = kwargs.pop("label", None)
keys = kwargs.pop("keys", None)
index_unique = kwargs.pop("index_unique", None)
convert = kwargs.pop("convert", None)
harmonize_dtypes = kwargs.pop("harmonize_dtypes", True)
indices_strict = kwargs.pop("indices_strict", True)
dataset = AnnCollection(
adatas,
join_obs=join_obs,
join_obsm=join_obsm,
label=label,
keys=keys,
index_unique=index_unique,
convert=convert,
harmonize_dtypes=harmonize_dtypes,
indices_strict=indices_strict,
)
elif isinstance(adatas, _ConcatViewMixin):
dataset = copy(adatas)
else:
            msg = (
                "`adatas` should be an AnnData object, a sequence or mapping "
                "of AnnData objects, or an AnnCollection."
            )
raise ValueError(msg)
if use_default_converter:
pin_memory = kwargs.pop("pin_memory", False)
_converter = partial(
default_converter, use_cuda=use_cuda, pin_memory=pin_memory
)
dataset.convert = _convert_on_top(
dataset.convert, _converter, dict(dataset.attrs_keys, X=[])
)
has_sampler = "sampler" in kwargs
has_batch_sampler = "batch_sampler" in kwargs
has_worker_init_fn = (
"worker_init_fn" in kwargs and kwargs["worker_init_fn"] is not None
)
has_workers = "num_workers" in kwargs and kwargs["num_workers"] > 0
use_parallel = has_worker_init_fn or has_workers
if (
batch_size is not None
and batch_size > 1
and not has_batch_sampler
and not use_parallel
):
drop_last = kwargs.pop("drop_last", False)
if has_sampler:
sampler = kwargs.pop("sampler")
sampler = BatchSampler(
sampler, batch_size=batch_size, drop_last=drop_last
)
else:
sampler = BatchIndexSampler(
len(dataset),
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last,
)
super().__init__(dataset, batch_size=None, sampler=sampler, **kwargs)
else:
super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs)
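# --- Illustrative sketch (not part of the library) --------------------------
# Hedged end-to-end sketch of `AnnLoader`.  It assumes `torch` is installed and
# only demonstrates the iteration contract: each batch is an AnnCollectionView
# whose `.X` comes back as a tensor thanks to the default converter.
def _demo_annloader():
    import numpy as np
    import pandas as pd

    from anndata import AnnData

    adatas = [
        AnnData(
            X=np.random.rand(16, 8).astype(np.float32),
            obs=pd.DataFrame(index=[f"cell{i}_{j}" for i in range(16)]),
        )
        for j in range(2)
    ]
    loader = AnnLoader(adatas, batch_size=8, shuffle=True)
    for batch in loader:
        x = batch.X  # torch.Tensor via the default converter
        assert x.shape == (8, 8)
    return loader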
python-anndata-0.12.0~rc1/src/anndata/io.py 0000664 0000000 0000000 00000001272 15003706322 0020501 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from ._core.sparse_dataset import sparse_dataset
from ._io.h5ad import read_h5ad, write_h5ad
from ._io.read import (
read_csv,
read_excel,
read_hdf,
read_loom,
read_mtx,
read_text,
read_umi_tools,
)
from ._io.specs import read_elem, write_elem
from ._io.write import write_csvs, write_loom
from ._io.zarr import read_zarr, write_zarr
__all__ = [
"read_csv",
"read_excel",
"read_h5ad",
"read_hdf",
"read_loom",
"read_mtx",
"read_text",
"read_umi_tools",
"read_zarr",
"write_csvs",
"write_h5ad",
"write_loom",
"write_zarr",
"write_elem",
"read_elem",
"sparse_dataset",
]
python-anndata-0.12.0~rc1/src/anndata/logging.py 0000664 0000000 0000000 00000003176 15003706322 0021525 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import logging
import os
from .compat import old_positionals
_previous_memory_usage = None
anndata_logger = logging.getLogger("anndata")
# Don’t pass log messages on to logging.root and its handler
anndata_logger.propagate = False
anndata_logger.addHandler(logging.StreamHandler()) # Logs go to stderr
anndata_logger.handlers[-1].setFormatter(logging.Formatter("%(message)s"))
anndata_logger.handlers[-1].setLevel("INFO")
def get_logger(name: str) -> logging.Logger:
"""\
Creates a child logger that delegates to anndata_logger
    instead of logging.root
"""
return anndata_logger.manager.getLogger(name)
def get_memory_usage() -> tuple[float, float]:
import psutil
process = psutil.Process(os.getpid())
try:
meminfo = process.memory_info()
except AttributeError:
meminfo = process.get_memory_info()
mem = meminfo[0] / 2**30 # output in GB
mem_diff = mem
global _previous_memory_usage # noqa: PLW0603
if _previous_memory_usage is not None:
mem_diff = mem - _previous_memory_usage
_previous_memory_usage = mem
return mem, mem_diff
@old_positionals("newline")
def format_memory_usage(
mem_usage: tuple[float, float], msg: str = "", *, newline: bool = False
):
nl = "\n" if newline else ""
more = " \n... " if msg != "" else ""
mem, diff = mem_usage
return (
f"{nl}{msg}{more}Memory usage: current {mem:.2f} GB, difference {diff:+.2f} GB"
)
@old_positionals("newline")
def print_memory_usage(msg: str = "", *, newline: bool = False):
    print(format_memory_usage(get_memory_usage(), msg, newline=newline))
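# --- Illustrative sketch (not part of the library) --------------------------
# Hedged usage sketch of the memory helpers above; requires the optional
# `psutil` dependency and simply brackets an allocation with two reports.
def _demo_memory_logging():
    import numpy as np

    print_memory_usage("before allocation")
    arr = np.zeros((1000, 1000), dtype=np.float64)  # ~8 MB, just to move the needle
    print_memory_usage("after allocating an array", newline=True)
    return arr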
python-anndata-0.12.0~rc1/src/anndata/tests/ 0000775 0000000 0000000 00000000000 15003706322 0020660 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/tests/__init__.py 0000664 0000000 0000000 00000000000 15003706322 0022757 0 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/anndata/tests/helpers.py 0000664 0000000 0000000 00000105521 15003706322 0022700 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import itertools
import random
import re
import warnings
from collections import Counter, defaultdict
from collections.abc import Mapping
from contextlib import contextmanager
from functools import partial, singledispatch, wraps
from string import ascii_letters
from typing import TYPE_CHECKING
import h5py
import numpy as np
import pandas as pd
import pytest
from pandas.api.types import is_numeric_dtype
from scipy import sparse
import anndata
from anndata import AnnData, ExperimentalFeatureWarning, Raw
from anndata._core.aligned_mapping import AlignedMappingBase
from anndata._core.sparse_dataset import BaseCompressedSparseDataset
from anndata._core.views import ArrayView
from anndata.compat import (
AwkArray,
CSArray,
CSMatrix,
CupyArray,
CupyCSCMatrix,
CupyCSRMatrix,
CupySparseMatrix,
DaskArray,
ZarrArray,
is_zarr_v2,
)
from anndata.utils import asarray
if TYPE_CHECKING:
from collections.abc import Callable, Collection, Iterable
from typing import Literal, TypeGuard, TypeVar
from zarr.abc.store import ByteRequest
from zarr.core.buffer import BufferPrototype
from .._types import ArrayStorageType
DT = TypeVar("DT")
try:
from pandas.core.arrays.integer import IntegerDtype
except ImportError:
IntegerDtype = (
*(pd.Int8Dtype, pd.Int16Dtype, pd.Int32Dtype, pd.Int64Dtype),
*(pd.UInt8Dtype, pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype),
)
# Give this to gen_adata when dask array support is expected.
GEN_ADATA_DASK_ARGS = dict(
obsm_types=(
sparse.csr_matrix,
np.ndarray,
pd.DataFrame,
DaskArray,
sparse.csr_array,
),
varm_types=(
sparse.csr_matrix,
np.ndarray,
pd.DataFrame,
DaskArray,
sparse.csr_array,
),
layers_types=(
sparse.csr_matrix,
np.ndarray,
pd.DataFrame,
DaskArray,
sparse.csr_array,
),
)
DEFAULT_KEY_TYPES = (
sparse.csr_matrix,
np.ndarray,
pd.DataFrame,
sparse.csr_array,
)
DEFAULT_COL_TYPES = (
pd.CategoricalDtype(ordered=False),
pd.CategoricalDtype(ordered=True),
np.int64,
np.float64,
np.uint8,
np.bool_,
pd.BooleanDtype,
pd.Int32Dtype,
)
def gen_vstr_recarray(m, n, dtype=None):
size = m * n
lengths = np.random.randint(3, 5, size)
letters = np.array(list(ascii_letters))
gen_word = lambda l: "".join(np.random.choice(letters, l))
arr = np.array([gen_word(l) for l in lengths]).reshape(m, n)
return pd.DataFrame(arr, columns=[gen_word(5) for i in range(n)]).to_records(
index=False, column_dtypes=dtype
)
def issubdtype(
a: np.dtype | pd.api.extensions.ExtensionDtype | type,
b: type[DT] | tuple[type[DT], ...],
) -> TypeGuard[DT]:
if isinstance(b, tuple):
return any(issubdtype(a, t) for t in b)
if isinstance(a, type) and issubclass(a, pd.api.extensions.ExtensionDtype):
return issubclass(a, b)
if isinstance(a, pd.api.extensions.ExtensionDtype):
return isinstance(a, b)
try:
return np.issubdtype(a, b)
except TypeError: # pragma: no cover
pytest.fail(f"issubdtype can’t handle everything yet: {a} {b}")
def gen_random_column(
n: int, dtype: np.dtype | pd.api.extensions.ExtensionDtype
) -> tuple[str, np.ndarray | pd.api.extensions.ExtensionArray]:
if issubdtype(dtype, pd.CategoricalDtype):
# TODO: Think about allowing index to be passed for n
letters = np.fromiter(iter(ascii_letters), "U1")
if n > len(letters):
letters = letters[: n // 2] # Make sure categories are repeated
key = "cat" if dtype.ordered else "cat_unordered"
return key, pd.Categorical(np.random.choice(letters, n), dtype=dtype)
if issubdtype(dtype, pd.BooleanDtype):
return (
"nullable-bool",
pd.arrays.BooleanArray(
np.random.randint(0, 2, size=n, dtype=bool),
mask=np.random.randint(0, 2, size=n, dtype=bool),
),
)
if issubdtype(dtype, IntegerDtype):
return (
"nullable-int",
pd.arrays.IntegerArray(
np.random.randint(0, 1000, size=n, dtype=np.int32),
mask=np.random.randint(0, 2, size=n, dtype=bool),
),
)
if issubdtype(dtype, pd.StringDtype):
letters = np.fromiter(iter(ascii_letters), "U1")
array = pd.array(np.random.choice(letters, n), dtype=pd.StringDtype())
array[np.random.randint(0, 2, size=n, dtype=bool)] = pd.NA
return "string", array
# if issubdtype(dtype, pd.DatetimeTZDtype):
# return "datetime", pd.to_datetime(np.random.randint(0, 1000, size=n))
if issubdtype(dtype, np.bool_):
return "bool", np.random.randint(0, 2, size=n, dtype=dtype)
if not issubdtype(dtype, np.number): # pragma: no cover
pytest.fail(f"Unexpected dtype: {dtype}")
n_bits = 8 * (dtype().itemsize if isinstance(dtype, type) else dtype.itemsize)
if issubdtype(dtype, np.unsignedinteger):
return f"uint{n_bits}", np.random.randint(0, 255, n, dtype=dtype)
if issubdtype(dtype, np.signedinteger):
return f"int{n_bits}", np.random.randint(-50, 50, n, dtype=dtype)
if issubdtype(dtype, np.floating):
return f"float{n_bits}", np.random.random(n).astype(dtype)
pytest.fail(f"Unexpected numeric dtype: {dtype}") # pragma: no cover
def gen_typed_df(
n: int,
index: pd.Index[str] | None = None,
dtypes: Collection[np.dtype | pd.api.extensions.ExtensionDtype] = DEFAULT_COL_TYPES,
):
columns = [gen_random_column(n, dtype) for dtype in dtypes]
col_names = [n for n, _ in columns]
assert len(col_names) == len(set(col_names)), "Duplicate column names generated!"
return pd.DataFrame(dict(columns), index=index)
def _gen_awkward_inner(shape, rng, dtype):
# the maximum length a ragged dimension can take
MAX_RAGGED_DIM_LEN = 20
if not len(shape):
# abort condition -> no dimension left, return an actual value instead
return dtype(rng.randrange(1000))
else:
curr_dim_len = shape[0]
lil = []
if curr_dim_len is None:
# ragged dimension, set random length
curr_dim_len = rng.randrange(MAX_RAGGED_DIM_LEN)
for _ in range(curr_dim_len):
lil.append(_gen_awkward_inner(shape[1:], rng, dtype))
return lil
def gen_awkward(shape, dtype=np.int32):
"""Function to generate an awkward array with random values.
Awkward array dimensions can either be fixed-length ("regular") or variable length ("ragged")
(the first dimension is always fixed-length).
Parameters
----------
shape
shape of the array to be generated. Any dimension specified as `None` will be simulated as ragged.
"""
import awkward as ak
if shape[0] is None:
msg = "The first dimension must be fixed-length."
raise ValueError(msg)
rng = random.Random(123)
shape = np.array(shape)
if np.any(shape == 0):
# use empty numpy array for fixed dimensions, then add empty singletons for ragged dimensions
var_dims = [i for i, s in enumerate(shape) if s is None]
shape = [s for s in shape if s is not None]
arr = ak.Array(np.empty(shape, dtype=dtype))
for d in var_dims:
arr = ak.singletons(arr, axis=d - 1)
return arr
else:
lil = _gen_awkward_inner(shape, rng, dtype)
arr = ak.values_astype(AwkArray(lil), dtype)
# make fixed-length dimensions regular
for i, d in enumerate(shape):
if d is not None:
arr = ak.to_regular(arr, i)
return arr
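# --- Illustrative sketch (not part of the library) --------------------------
# Hedged sketch of `gen_awkward`: dimensions given as `None` become ragged,
# with lengths drawn below MAX_RAGGED_DIM_LEN (20).  Requires the optional
# `awkward` dependency.
def _demo_gen_awkward():
    arr = gen_awkward((4, None))  # 4 records with ragged lengths
    assert len(arr) == 4
    assert all(0 <= len(record) < 20 for record in arr)
    return arr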
def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame:
s = 0
df = pd.DataFrame()
new_vals = gen_typed_df(m)
while s < (n / new_vals.shape[1]):
new_vals = gen_typed_df(m, index=index)
new_vals.columns = new_vals.columns + "_" + str(s)
df[new_vals.columns] = new_vals
s += 1
df = df.iloc[:m, :n].copy()
if columns is not None:
df.columns = columns
return df
def maybe_add_sparse_array(
mapping: Mapping,
types: Collection[type],
format: Literal["csr", "csc"],
random_state: np.random.Generator,
shape: tuple[int, int],
):
if sparse.csr_array in types or sparse.csr_matrix in types:
mapping["sparse_array"] = sparse.csr_array(
sparse.random(*shape, format=format, random_state=random_state)
)
return mapping
# TODO: Use hypothesis for this?
def gen_adata(
shape: tuple[int, int],
X_type: Callable[[np.ndarray], object] = sparse.csr_matrix,
*,
X_dtype: np.dtype = np.float32,
obs_dtypes: Collection[
np.dtype | pd.api.extensions.ExtensionDtype
] = DEFAULT_COL_TYPES,
var_dtypes: Collection[
np.dtype | pd.api.extensions.ExtensionDtype
] = DEFAULT_COL_TYPES,
obsm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,),
varm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,),
layers_types: Collection[type] = DEFAULT_KEY_TYPES,
random_state: np.random.Generator | None = None,
sparse_fmt: Literal["csr", "csc"] = "csr",
) -> AnnData:
"""\
Helper function to generate a random AnnData for testing purposes.
Note: `obsm_types`, `varm_types`, and `layers_types` currently only filter
already-created objects.
In the future, they should determine which objects get created.
Params
------
shape
What shape you want the anndata to be.
X_type
What kind of container should `X` be? This will be called on a randomly
generated 2d array.
X_dtype
What should the dtype of the `.X` container be?
obs_dtypes
Column dtypes to generate for `.obs`.
var_dtypes
Column dtypes to generate for `.var`.
obsm_types
What kinds of containers should be in `.obsm`?
varm_types
What kinds of containers should be in `.varm`?
layers_types
What kinds of containers should be in `.layers`?
random_state
Random number generator to use. A fresh default generator is created if not given.
sparse_fmt
What sparse format should be used for sparse matrices?
(csr, csc)
"""
import dask.array as da
if random_state is None:
random_state = np.random.default_rng()
M, N = shape
obs_names = pd.Index(f"cell{i}" for i in range(shape[0]))
var_names = pd.Index(f"gene{i}" for i in range(shape[1]))
obs = gen_typed_df(M, obs_names, dtypes=obs_dtypes)
var = gen_typed_df(N, var_names, dtypes=var_dtypes)
# For #147
obs.rename(columns=dict(cat="obs_cat"), inplace=True)
var.rename(columns=dict(cat="var_cat"), inplace=True)
if X_type is None:
X = None
else:
X = X_type(random_state.binomial(100, 0.005, (M, N)).astype(X_dtype))
obsm = dict(
array=np.random.random((M, 50)),
sparse=sparse.random(M, 100, format=sparse_fmt, random_state=random_state),
df=gen_typed_df(M, obs_names, dtypes=obs_dtypes),
awk_2d_ragged=gen_awkward((M, None)),
da=da.random.random((M, 50)),
)
obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types}
obsm = maybe_add_sparse_array(
mapping=obsm,
types=obsm_types,
format=sparse_fmt,
random_state=random_state,
shape=(M, 100),
)
varm = dict(
array=np.random.random((N, 50)),
sparse=sparse.random(N, 100, format=sparse_fmt, random_state=random_state),
df=gen_typed_df(N, var_names, dtypes=var_dtypes),
awk_2d_ragged=gen_awkward((N, None)),
da=da.random.random((N, 50)),
)
varm = {k: v for k, v in varm.items() if type(v) in varm_types}
varm = maybe_add_sparse_array(
mapping=varm,
types=varm_types,
format=sparse_fmt,
random_state=random_state,
shape=(N, 100),
)
layers = dict(
array=np.random.random((M, N)),
sparse=sparse.random(M, N, format=sparse_fmt, random_state=random_state),
da=da.random.random((M, N)),
)
layers = maybe_add_sparse_array(
mapping=layers,
types=layers_types,
format=sparse_fmt,
random_state=random_state,
shape=(M, N),
)
layers = {k: v for k, v in layers.items() if type(v) in layers_types}
obsp = dict(
array=np.random.random((M, M)),
sparse=sparse.random(M, M, format=sparse_fmt, random_state=random_state),
)
obsp["sparse_array"] = sparse.csr_array(
sparse.random(M, M, format=sparse_fmt, random_state=random_state)
)
varp = dict(
array=np.random.random((N, N)),
sparse=sparse.random(N, N, format=sparse_fmt, random_state=random_state),
)
varp["sparse_array"] = sparse.csr_array(
sparse.random(N, N, format=sparse_fmt, random_state=random_state)
)
uns = dict(
O_recarray=gen_vstr_recarray(N, 5),
nested=dict(
scalar_str="str",
scalar_int=42,
scalar_float=3.0,
nested_further=dict(array=np.arange(5)),
),
awkward_regular=gen_awkward((10, 5)),
awkward_ragged=gen_awkward((12, None, None)),
# U_recarray=gen_vstr_recarray(N, 5, "U4")
)
# https://github.com/zarr-developers/zarr-python/issues/2134
# zarr v3 on-disk does not write structured dtypes
if anndata.settings.zarr_write_format == 3:
del uns["O_recarray"]
with warnings.catch_warnings():
warnings.simplefilter("ignore", ExperimentalFeatureWarning)
adata = AnnData(
X=X,
obs=obs,
var=var,
obsm=obsm,
varm=varm,
layers=layers,
obsp=obsp,
varp=varp,
uns=uns,
)
return adata
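# Illustrative usage of gen_adata (a sketch, not executed here): a small dense
# AnnData whose aligned mappings are restricted to plain numpy arrays.
#
#     adata = gen_adata(
#         (10, 20),
#         X_type=np.asarray,
#         obsm_types=(np.ndarray,),
#         varm_types=(np.ndarray,),
#         layers_types=(np.ndarray,),
#     )
#     assert adata.shape == (10, 20)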
def array_bool_subset(index, min_size=2):
b = np.zeros(len(index), dtype=bool)
selected = np.random.choice(
range(len(index)),
size=np.random.randint(min_size, len(index), ()),
replace=False,
)
b[selected] = True
return b
def list_bool_subset(index, min_size=2):
return array_bool_subset(index, min_size=min_size).tolist()
def matrix_bool_subset(index, min_size=2):
with warnings.catch_warnings():
warnings.simplefilter("ignore", PendingDeprecationWarning)
indexer = np.matrix(
array_bool_subset(index, min_size=min_size).reshape(len(index), 1)
)
return indexer
def spmatrix_bool_subset(index, min_size=2):
return sparse.csr_matrix(
array_bool_subset(index, min_size=min_size).reshape(len(index), 1)
)
def sparray_bool_subset(index, min_size=2):
return sparse.csr_array(
array_bool_subset(index, min_size=min_size).reshape(len(index), 1)
)
def array_subset(index, min_size=2):
if len(index) < min_size:
msg = f"min_size (={min_size}) must be smaller than len(index) (={len(index)}"
raise ValueError(msg)
return np.random.choice(
index, size=np.random.randint(min_size, len(index), ()), replace=False
)
def array_int_subset(index, min_size=2):
if len(index) < min_size:
msg = f"min_size (={min_size}) must be smaller than len(index) (={len(index)}"
raise ValueError(msg)
return np.random.choice(
np.arange(len(index)),
size=np.random.randint(min_size, len(index), ()),
replace=False,
)
def list_int_subset(index, min_size=2):
return array_int_subset(index, min_size=min_size).tolist()
def slice_subset(index, min_size=2):
while True:
points = np.random.choice(np.arange(len(index) + 1), size=2, replace=False)
s = slice(*sorted(points))
if len(range(*s.indices(len(index)))) >= min_size:
break
return s
def single_subset(index):
return index[np.random.randint(0, len(index))]
@pytest.fixture(
params=[
array_subset,
slice_subset,
single_subset,
array_int_subset,
list_int_subset,
array_bool_subset,
list_bool_subset,
matrix_bool_subset,
spmatrix_bool_subset,
sparray_bool_subset,
]
)
def subset_func(request):
return request.param
###################
# Checking equality
###################
def format_msg(elem_name: str | None) -> str:
if elem_name is not None:
return f"Error raised from element {elem_name!r}."
else:
return ""
# TODO: it would be better to modify the other exception
def report_name(func):
"""Report name of element being tested if test fails."""
@wraps(func)
def func_wrapper(*args, _elem_name: str | None = None, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
if _elem_name is not None and not hasattr(e, "_name_attached"):
msg = format_msg(_elem_name)
args = list(e.args)
if len(args) == 0:
args = [msg]
else:
args[0] = f"{args[0]}\n\n{msg}"
e.args = tuple(args)
e._name_attached = True
raise e
return func_wrapper
@report_name
def _assert_equal(a, b):
"""Allows reporting elem name for simple assertion."""
assert a == b
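# Sketch of what report_name adds on failure (illustrative): the element name
# passed via `_elem_name` is appended to the raised exception's message.
#
#     _assert_equal(1, 2, _elem_name="obs/foo")
#     # AssertionError: Error raised from element 'obs/foo'.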
@singledispatch
def assert_equal(
a: object, b: object, *, exact: bool = False, elem_name: str | None = None
):
_assert_equal(a, b, _elem_name=elem_name)
@assert_equal.register(CupyArray)
def assert_equal_cupy(
a: CupyArray, b: object, *, exact: bool = False, elem_name: str | None = None
):
assert_equal(b, a.get(), exact=exact, elem_name=elem_name)
@assert_equal.register(np.ndarray)
def assert_equal_ndarray(
a: np.ndarray, b: object, *, exact: bool = False, elem_name: str | None = None
):
b = asarray(b)
if not exact and is_numeric_dtype(a) and is_numeric_dtype(b):
assert a.shape == b.shape, format_msg(elem_name)
np.testing.assert_allclose(a, b, equal_nan=True, err_msg=format_msg(elem_name))
elif ( # Structured dtype
not exact
and hasattr(a, "dtype")
and hasattr(b, "dtype")
and len(a.dtype) > 1
and len(b.dtype) > 0
):
# Reshaping to allow >2d arrays
assert a.shape == b.shape, format_msg(elem_name)
assert_equal(
pd.DataFrame(a.reshape(-1)),
pd.DataFrame(b.reshape(-1)),
exact=exact,
elem_name=elem_name,
)
else:
assert np.all(a == b), format_msg(elem_name)
@assert_equal.register(ArrayView)
def assert_equal_arrayview(
a: ArrayView, b: object, *, exact: bool = False, elem_name: str | None = None
):
assert_equal(asarray(a), asarray(b), exact=exact, elem_name=elem_name)
@assert_equal.register(BaseCompressedSparseDataset)
@assert_equal.register(sparse.spmatrix)
def assert_equal_sparse(
a: BaseCompressedSparseDataset | sparse.spmatrix,
b: object,
*,
exact: bool = False,
elem_name: str | None = None,
):
a = asarray(a)
assert_equal(b, a, exact=exact, elem_name=elem_name)
@assert_equal.register(CSArray)
def assert_equal_sparse_array(
a: CSArray, b: object, *, exact: bool = False, elem_name: str | None = None
):
return assert_equal_sparse(a, b, exact=exact, elem_name=elem_name)
@assert_equal.register(CupySparseMatrix)
def assert_equal_cupy_sparse(
a: CupySparseMatrix, b: object, *, exact: bool = False, elem_name: str | None = None
):
a = a.toarray()
assert_equal(b, a, exact=exact, elem_name=elem_name)
@assert_equal.register(h5py.Dataset)
@assert_equal.register(ZarrArray)
def assert_equal_h5py_dataset(
a: ArrayStorageType, b: object, *, exact: bool = False, elem_name: str | None = None
):
a = asarray(a)
assert_equal(b, a, exact=exact, elem_name=elem_name)
@assert_equal.register(DaskArray)
def assert_equal_dask_array(
a: DaskArray, b: object, *, exact: bool = False, elem_name: str | None = None
):
assert_equal(b, a.compute(), exact=exact, elem_name=elem_name)
@assert_equal.register(pd.DataFrame)
def are_equal_dataframe(
a: pd.DataFrame, b: object, *, exact: bool = False, elem_name: str | None = None
):
if not isinstance(b, pd.DataFrame):
assert_equal(b, a, exact=exact, elem_name=elem_name) # , a.values maybe?
report_name(pd.testing.assert_frame_equal)(
a,
b,
check_exact=exact,
check_column_type=exact,
check_index_type=exact,
_elem_name=elem_name,
check_frame_type=False,
)
@assert_equal.register(AwkArray)
def assert_equal_awkarray(
a: AwkArray, b: object, *, exact: bool = False, elem_name: str | None = None
):
import awkward as ak
if exact:
assert isinstance(b, AwkArray)
assert a.type == b.type, f"{a.type} != {b.type}, {format_msg(elem_name)}"
assert ak.to_list(a) == ak.to_list(b), format_msg(elem_name)
@assert_equal.register(Mapping)
def assert_equal_mapping(
a: Mapping, b: object, *, exact: bool = False, elem_name: str | None = None
):
assert isinstance(b, Mapping)
assert set(a.keys()) == set(b.keys()), format_msg(elem_name)
for k in a.keys():
if elem_name is None:
elem_name = ""
assert_equal(a[k], b[k], exact=exact, elem_name=f"{elem_name}/{k}")
@assert_equal.register(AlignedMappingBase)
def assert_equal_aligned_mapping(
a: AlignedMappingBase,
b: object,
*,
exact: bool = False,
elem_name: str | None = None,
):
assert isinstance(b, AlignedMappingBase)
a_indices = (a.parent.obs_names, a.parent.var_names)
b_indices = (b.parent.obs_names, b.parent.var_names)
for axis_idx in a.axes:
assert_equal(
a_indices[axis_idx], b_indices[axis_idx], exact=exact, elem_name=axis_idx
)
assert a.attrname == b.attrname, format_msg(elem_name)
assert_equal_mapping(a, b, exact=exact, elem_name=elem_name)
@assert_equal.register(pd.Index)
def assert_equal_index(
a: pd.Index, b: object, *, exact: bool = False, elem_name: str | None = None
):
params = dict(check_categorical=False) if not exact else {}
report_name(pd.testing.assert_index_equal)(
a, b, check_names=False, **params, _elem_name=elem_name
)
@assert_equal.register(pd.api.extensions.ExtensionArray)
def assert_equal_extension_array(
a: pd.api.extensions.ExtensionArray,
b: object,
*,
exact: bool = False,
elem_name: str | None = None,
):
report_name(pd.testing.assert_extension_array_equal)(
a,
b,
check_dtype=exact,
check_exact=exact,
_elem_name=elem_name,
)
@assert_equal.register(Raw)
def assert_equal_raw(
a: Raw, b: object, *, exact: bool = False, elem_name: str | None = None
):
def assert_is_not_none(x): # can't put an assert in a lambda
assert x is not None
report_name(assert_is_not_none)(b, _elem_name=elem_name)
for attr in ["X", "var", "varm", "obs_names"]:
assert_equal(
getattr(a, attr),
getattr(b, attr),
exact=exact,
elem_name=f"{elem_name}/{attr}",
)
@assert_equal.register(AnnData)
def assert_adata_equal(
a: AnnData, b: object, *, exact: bool = False, elem_name: str | None = None
):
"""\
Check whether two AnnData objects are equivalent,
raising an AssertionError if they aren’t.
Params
------
a
The reference AnnData object.
b
The object to compare against `a`; must also be an AnnData.
exact
Whether comparisons should be exact or not. This has a somewhat flexible
meaning and should probably get refined in the future.
"""
def fmt_name(x):
if elem_name is None:
return x
else:
return f"{elem_name}/{x}"
assert isinstance(b, AnnData)
# There may be issues comparing views, since np.allclose
# can modify ArrayViews if they contain `nan`s
assert_equal(a.obs_names, b.obs_names, exact=exact, elem_name=fmt_name("obs_names"))
assert_equal(a.var_names, b.var_names, exact=exact, elem_name=fmt_name("var_names"))
if not exact:
# Reorder all elements if necessary
idx = [slice(None), slice(None)]
# Since it’s a pain to compare a list of pandas objects
change_flag = False
if not np.all(a.obs_names == b.obs_names):
idx[0] = a.obs_names
change_flag = True
if not np.all(a.var_names == b.var_names):
idx[1] = a.var_names
change_flag = True
if change_flag:
b = b[tuple(idx)].copy()
for attr in [
"X",
"obs",
"var",
"obsm",
"varm",
"layers",
"uns",
"obsp",
"varp",
"raw",
]:
assert_equal(
getattr(a, attr),
getattr(b, attr),
exact=exact,
elem_name=fmt_name(attr),
)
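# Illustrative round-trip check built on these helpers (a sketch; the file name
# is hypothetical):
#
#     orig = gen_adata((10, 20))
#     orig.write_h5ad("example.h5ad")
#     assert_adata_equal(orig, anndata.read_h5ad("example.h5ad"))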
def _half_chunk_size(a: tuple[int, ...]) -> tuple[int, ...]:
def half_rounded_up(x):
div, mod = divmod(x, 2)
return div + (mod > 0)
return tuple(half_rounded_up(x) for x in a)
@singledispatch
def as_dense_dask_array(a):
import dask.array as da
a = asarray(a)
return da.asarray(a, chunks=_half_chunk_size(a.shape))
@as_dense_dask_array.register(CSMatrix)
def _(a):
return as_dense_dask_array(a.toarray())
@as_dense_dask_array.register(DaskArray)
def _(a):
return a.map_blocks(asarray, dtype=a.dtype, meta=np.ndarray)
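# Sketch of the dask conversion helpers (illustrative): inputs are densified and
# wrapped in a dask array chunked at roughly half the original shape.
#
#     x = sparse.random(20, 10, density=0.1, format="csr")
#     d = as_dense_dask_array(x)
#     assert d.chunks == ((10, 10), (5, 5))
#     np.testing.assert_array_equal(d.compute(), x.toarray())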
@singledispatch
def as_sparse_dask_array(a) -> DaskArray:
import dask.array as da
return da.from_array(sparse.csr_matrix(a), chunks=_half_chunk_size(a.shape))
@as_sparse_dask_array.register(CSMatrix)
def _(a):
import dask.array as da
return da.from_array(a, _half_chunk_size(a.shape))
@as_sparse_dask_array.register(CSArray)
def _(a):
import dask.array as da
return da.from_array(sparse.csr_matrix(a), _half_chunk_size(a.shape))
@as_sparse_dask_array.register(DaskArray)
def _(a):
return a.map_blocks(sparse.csr_matrix)
@singledispatch
def as_dense_cupy_dask_array(a):
import cupy as cp
return as_dense_dask_array(a).map_blocks(
cp.array, meta=cp.array((1.0), dtype=a.dtype), dtype=a.dtype
)
@as_dense_cupy_dask_array.register(CupyArray)
def _(a):
import cupy as cp
import dask.array as da
return da.from_array(
a,
chunks=_half_chunk_size(a.shape),
meta=cp.array((1.0), dtype=a.dtype),
)
@as_dense_cupy_dask_array.register(DaskArray)
def _(a):
import cupy as cp
if isinstance(a._meta, cp.ndarray):
return a.copy()
return a.map_blocks(
partial(as_cupy, typ=CupyArray),
dtype=a.dtype,
meta=cp.array((1.0), dtype=a.dtype),
)
try:
import cupyx.scipy.sparse as cpsparse
format_to_memory_class = {"csr": cpsparse.csr_matrix, "csc": cpsparse.csc_matrix}
except ImportError:
format_to_memory_class = {}
# TODO: If there are chunks which divide along columns, then a coo_matrix is returned by compute
# We should try and fix this upstream in dask/ cupy
@singledispatch
def as_cupy_sparse_dask_array(a, format="csr"):
memory_class = format_to_memory_class[format]
cpu_da = as_sparse_dask_array(a)
return cpu_da.rechunk((cpu_da.chunks[0], -1)).map_blocks(
memory_class, dtype=a.dtype, meta=memory_class(cpu_da._meta)
)
@as_cupy_sparse_dask_array.register(CupyArray)
@as_cupy_sparse_dask_array.register(CupySparseMatrix)
def _(a, format="csr"):
import dask.array as da
memory_class = format_to_memory_class[format]
return da.from_array(memory_class(a), chunks=(_half_chunk_size(a.shape)[0], -1))
@as_cupy_sparse_dask_array.register(DaskArray)
def _(a, format="csr"):
memory_class = format_to_memory_class[format]
if isinstance(a._meta, memory_class):
return a.copy()
return a.rechunk((a.chunks[0], -1)).map_blocks(
partial(as_cupy, typ=memory_class), dtype=a.dtype
)
@contextmanager
def pytest_8_raises(exc_cls, *, match: str | re.Pattern | None = None):
"""Error handling using pytest 8's support for __notes__.
See: https://github.com/pytest-dev/pytest/pull/11227
Remove once pytest 8 is out!
"""
with pytest.raises(exc_cls) as exc_info:
yield exc_info
check_error_or_notes_match(exc_info, match)
def check_error_or_notes_match(e: pytest.ExceptionInfo, pattern: str | re.Pattern):
"""
Checks whether the printed error message or the notes contains the given pattern.
DOES NOT WORK IN IPYTHON - because of the way IPython handles exceptions
"""
import traceback
message = "".join(traceback.format_exception_only(e.type, e.value))
assert re.search(pattern, message), (
f"Could not find pattern: '{pattern}' in error:\n\n{message}\n"
)
def resolve_cupy_type(val):
if not isinstance(val, type):
input_typ = type(val)
else:
input_typ = val
if issubclass(input_typ, np.ndarray):
typ = CupyArray
elif issubclass(input_typ, sparse.csr_matrix):
typ = CupyCSRMatrix
elif issubclass(input_typ, sparse.csc_matrix):
typ = CupyCSCMatrix
else:
msg = f"No default target type for input type {input_typ}"
raise NotImplementedError(msg)
return typ
@singledispatch
def as_cupy(val, typ=None):
"""
Rough conversion function
Will try to infer target type from input type if not specified.
"""
if typ is None:
typ = resolve_cupy_type(val)
if issubclass(typ, CupyArray):
import cupy as cp
if isinstance(val, CSMatrix):
val = val.toarray()
return cp.array(val)
elif issubclass(typ, CupyCSRMatrix):
import cupy as cp
import cupyx.scipy.sparse as cpsparse
if isinstance(val, np.ndarray):
return cpsparse.csr_matrix(cp.array(val))
else:
return cpsparse.csr_matrix(val)
elif issubclass(typ, CupyCSCMatrix):
import cupy as cp
import cupyx.scipy.sparse as cpsparse
if isinstance(val, np.ndarray):
return cpsparse.csc_matrix(cp.array(val))
else:
return cpsparse.csc_matrix(val)
else:
msg = f"Conversion from {type(val)} to {typ} not implemented"
raise NotImplementedError(msg)
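# Illustrative usage of as_cupy (a sketch; requires cupy and a GPU):
#
#     dense_gpu = as_cupy(np.eye(3))                    # -> cupy.ndarray
#     csr_gpu = as_cupy(sparse.csr_matrix(np.eye(3)))   # -> cupyx csr_matrix
#     csc_gpu = as_cupy(np.eye(3), typ=CupyCSCMatrix)   # force a CSC target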
# TODO: test
@as_cupy.register(DaskArray)
def as_cupy_dask(a, typ=None):
if typ is None:
typ = resolve_cupy_type(a._meta)
return a.map_blocks(partial(as_cupy, typ=typ), dtype=a.dtype)
@singledispatch
def shares_memory(x, y) -> bool:
return np.shares_memory(x, y)
@shares_memory.register(CSMatrix)
def shares_memory_sparse(x, y):
return (
np.shares_memory(x.data, y.data)
and np.shares_memory(x.indices, y.indices)
and np.shares_memory(x.indptr, y.indptr)
)
BASE_MATRIX_PARAMS = [
pytest.param(asarray, id="np_array"),
pytest.param(sparse.csr_matrix, id="scipy_csr_matrix"),
pytest.param(sparse.csc_matrix, id="scipy_csc_matrix"),
pytest.param(sparse.csr_array, id="scipy_csr_array"),
pytest.param(sparse.csc_array, id="scipy_csc_array"),
]
DASK_MATRIX_PARAMS = [
pytest.param(as_dense_dask_array, id="dense_dask_array"),
pytest.param(as_sparse_dask_array, id="sparse_dask_array"),
]
CUPY_MATRIX_PARAMS = [
pytest.param(
partial(as_cupy, typ=CupyArray), id="cupy_array", marks=pytest.mark.gpu
),
pytest.param(
partial(as_cupy, typ=CupyCSRMatrix),
id="cupy_csr",
marks=pytest.mark.gpu,
),
pytest.param(
partial(as_cupy, typ=CupyCSCMatrix),
id="cupy_csc",
marks=pytest.mark.gpu,
),
]
DASK_CUPY_MATRIX_PARAMS = [
pytest.param(
as_dense_cupy_dask_array,
id="cupy_dense_dask_array",
marks=pytest.mark.gpu,
),
pytest.param(
as_cupy_sparse_dask_array, id="cupy_csr_dask_array", marks=pytest.mark.gpu
),
]
if is_zarr_v2():
from zarr.storage import DirectoryStore as LocalStore
else:
from zarr.storage import LocalStore
class AccessTrackingStoreBase(LocalStore):
_access_count: Counter[str]
_accessed: defaultdict[str, set]
_accessed_keys: defaultdict[str, list[str]]
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._access_count = Counter()
self._accessed = defaultdict(set)
self._accessed_keys = defaultdict(list)
def _check_and_track_key(self, key: str):
for tracked in self._access_count:
if tracked in key:
self._access_count[tracked] += 1
self._accessed[tracked].add(key)
self._accessed_keys[tracked] += [key]
def get_access_count(self, key: str) -> int:
# Accessing a missing key on the defaultdict would silently create it and
# start tracking it, so check for membership explicitly first.
if key not in self._access_count:
msg = f"{key} not found among access count"
raise KeyError(msg)
return self._access_count[key]
def get_subkeys_accessed(self, key: str) -> set[str]:
if key not in self._accessed:
msg = f"{key} not found among accessed"
raise KeyError(msg)
return self._accessed[key]
def get_accessed_keys(self, key: str) -> list[str]:
if key not in self._accessed_keys:
msg = f"{key} not found among accessed keys"
raise KeyError(msg)
return self._accessed_keys[key]
def initialize_key_trackers(self, keys_to_track: Iterable[str]) -> None:
for k in keys_to_track:
self._access_count[k] = 0
self._accessed_keys[k] = []
self._accessed[k] = set()
def reset_key_trackers(self) -> None:
self.initialize_key_trackers(self._access_count.keys())
def assert_access_count(self, key: str, count: int):
keys_accessed = self.get_subkeys_accessed(key)
access_count = self.get_access_count(key)
assert self.get_access_count(key) == count, (
f"Found {access_count} accesses at {keys_accessed}"
)
if is_zarr_v2():
class AccessTrackingStore(AccessTrackingStoreBase):
def __getitem__(self, key: str) -> bytes:
self._check_and_track_key(key)
return super().__getitem__(key)
else:
class AccessTrackingStore(AccessTrackingStoreBase):
async def get(
self,
key: str,
prototype: BufferPrototype | None = None,
byte_range: ByteRequest | None = None,
) -> object:
self._check_and_track_key(key)
return await super().get(key, prototype=prototype, byte_range=byte_range)
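# Illustrative usage of AccessTrackingStore (a sketch; the store path and the
# reading step are hypothetical): register the key prefixes of interest, read
# from the store, then inspect or assert how often each prefix was requested.
#
#     store = AccessTrackingStore("adata.zarr")
#     store.initialize_key_trackers(["obs/cat", "X"])
#     # ... open the store with zarr and read elements here ...
#     store.assert_access_count("X", 0)  # nothing under "X" touched yet
#     print(store.get_subkeys_accessed("obs/cat"))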
def get_multiindex_columns_df(shape: tuple[int, int]) -> pd.DataFrame:
return pd.DataFrame(
np.random.rand(shape[0], shape[1]),
columns=pd.MultiIndex.from_tuples(
list(itertools.product(["a"], range(shape[1] - (shape[1] // 2))))
+ list(itertools.product(["b"], range(shape[1] // 2)))
),
)
python-anndata-0.12.0~rc1/src/anndata/types.py 0000664 0000000 0000000 00000001272 15003706322 0021236 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from typing import TYPE_CHECKING, Protocol, runtime_checkable
if TYPE_CHECKING:
from ._core.anndata import AnnData
@runtime_checkable
class ExtensionNamespace(Protocol):
"""Protocol for extension namespaces.
Enforces that the namespace initializer accepts a class with the proper `__init__` method.
Protocols can't enforce that the `__init__` accepts the correct types. See
`_check_namespace_signature` for that. This is mainly useful for static type
checking with mypy and IDEs.
"""
def __init__(self, adata: AnnData) -> None:
"""
Used to enforce the correct signature for extension namespaces.
"""
python-anndata-0.12.0~rc1/src/anndata/typing.py 0000664 0000000 0000000 00000002766 15003706322 0021415 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
import pandas as pd
from numpy import ma
from . import abc
from ._core.anndata import AnnData
from .compat import (
AwkArray,
CSArray,
CSMatrix,
CupyArray,
CupySparseMatrix,
DaskArray,
H5Array,
ZappyArray,
ZarrArray,
)
from .compat import Index as _Index
if TYPE_CHECKING:
from typing import TypeAlias
__all__ = ["Index", "RWAble", "AxisStorable"]
Index = _Index
"""1D or 2D index an :class:`~anndata.AnnData` object can be sliced with."""
XDataType: TypeAlias = (
np.ndarray
| ma.MaskedArray
| CSMatrix
| CSArray
| H5Array
| ZarrArray
| ZappyArray
| abc.CSRDataset
| abc.CSCDataset
| DaskArray
| CupyArray
| CupySparseMatrix
)
ArrayDataStructureTypes: TypeAlias = XDataType | AwkArray
InMemoryArrayOrScalarType: TypeAlias = (
pd.DataFrame | np.number | str | ArrayDataStructureTypes
)
AxisStorable: TypeAlias = (
InMemoryArrayOrScalarType | dict[str, "AxisStorable"] | list["AxisStorable"]
)
"""A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`."""
RWAble: TypeAlias = (
AxisStorable | AnnData | pd.Categorical | pd.api.extensions.ExtensionArray
)
"""A superset of :type:`anndata.typing.AxisStorable` (i.e., including :class:`anndata.AnnData`) which is everything can be read/written by :func:`anndata.io.read_elem` and :func:`anndata.io.write_elem`."""
python-anndata-0.12.0~rc1/src/anndata/utils.py 0000664 0000000 0000000 00000034141 15003706322 0021233 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import re
import warnings
from functools import singledispatch, wraps
from typing import TYPE_CHECKING
import h5py
import numpy as np
import pandas as pd
from scipy import sparse
import anndata
from ._core.sparse_dataset import BaseCompressedSparseDataset
from .compat import CSArray, CupyArray, CupySparseMatrix, DaskArray
from .logging import get_logger
if TYPE_CHECKING:
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Literal
logger = get_logger(__name__)
def import_name(name: str) -> Any:
from importlib import import_module
parts = name.split(".")
obj = import_module(parts[0])
for i, name in enumerate(parts[1:]):
try:
obj = import_module(f"{obj.__name__}.{name}")
except ModuleNotFoundError:
break
for name in parts[i + 1 :]:
try:
obj = getattr(obj, name)
except AttributeError:
msg = f"{parts[:i]}, {parts[i + 1 :]}, {obj} {name}"
raise RuntimeError(msg)
return obj
@singledispatch
def asarray(x):
"""Convert x to a numpy array"""
return np.asarray(x)
@asarray.register(CSArray)
@asarray.register(sparse.spmatrix)
def asarray_sparse(x):
return x.toarray()
@asarray.register(BaseCompressedSparseDataset)
def asarray_sparse_dataset(x):
return asarray(x.to_memory())
@asarray.register(h5py.Dataset)
def asarray_h5py_dataset(x):
return x[...]
@asarray.register(CupyArray)
def asarray_cupy(x):
return x.get()
@asarray.register(CupySparseMatrix)
def asarray_cupy_sparse(x):
return x.toarray().get()
@asarray.register(DaskArray)
def asarray_dask(x):
return asarray(x.compute())
@singledispatch
def convert_to_dict(obj) -> dict:
return dict(obj)
@convert_to_dict.register(dict)
def convert_to_dict_dict(obj: dict):
return obj
@convert_to_dict.register(np.ndarray)
def convert_to_dict_ndarray(obj: np.ndarray):
if obj.dtype.fields is None:
msg = (
"Can only convert np.ndarray with compound dtypes to dict, "
f"passed array had “{obj.dtype}”."
)
raise TypeError(msg)
return {k: obj[k] for k in obj.dtype.fields.keys()}
@convert_to_dict.register(type(None))
def convert_to_dict_nonetype(obj: None):
return dict()
@singledispatch
def axis_len(x, axis: Literal[0, 1]) -> int | None:
"""\
Return the size of an array in dimension `axis`.
Returns None if `x` is an awkward array with variable length in the requested dimension.
"""
return x.shape[axis]
try:
from .compat import awkward as ak
def _size_at_depth(layout, depth, lateral_context, **kwargs):
"""Callback function for dim_len_awkward, resolving the dim_len for a given level"""
if layout.is_numpy:
# if it's an embedded rectilinear array, we have to deal with its shape
# which might not be 1-dimensional
if layout.is_unknown:
shape = (0,)
else:
shape = layout.shape
numpy_axis = lateral_context["axis"] - depth + 1
if not (1 <= numpy_axis < len(shape)):
msg = f"axis={lateral_context['axis']} is too deep"
raise TypeError(msg)
lateral_context["out"] = shape[numpy_axis]
return ak.contents.EmptyArray()
elif layout.is_list and depth == lateral_context["axis"]:
if layout.parameter("__array__") in {"string", "bytestring"}:
# Strings are implemented like an array of lists of uint8 (ListType(NumpyType(...)))
# which results in an extra hierarchy-level that shouldn't show up in dim_len
# See https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3736747
msg = f"axis={lateral_context['axis']} is too deep"
raise TypeError(msg)
if layout.is_regular:
# if it's a regular list, you want the size
lateral_context["out"] = layout.size
else:
# if it's an irregular list, you want a null token
lateral_context["out"] = -1
return ak.contents.EmptyArray()
elif layout.is_record and depth == lateral_context["axis"]:
lateral_context["out"] = len(layout.fields)
return ak.contents.EmptyArray()
elif layout.is_record:
# currently, we don't recurse into records
# in theory we could, just not sure how to do it at the moment
# Would need to consider cases like: scalars, unevenly sized values
msg = f"Cannot recurse into record type found at axis={lateral_context['axis']}"
raise TypeError(msg)
elif layout.is_union:
# if it's a union, you could get the result of each union branch
# separately and see if they're all the same; if not, it's an error
result = None
for content in layout.contents:
context = {"axis": lateral_context["axis"]}
ak.transform(
_size_at_depth,
content,
lateral_context=context,
)
if result is None:
result = context["out"]
elif result != context["out"]:
# Union branches have different lengths -> return null token
lateral_context["out"] = -1
return ak.contents.EmptyArray()
lateral_context["out"] = result
return ak.contents.EmptyArray()
@axis_len.register(ak.Array)
def axis_len_awkward(array, axis: Literal[0, 1]) -> int | None:
"""Get the length of an awkward array in a given axis
Returns None if the axis is of variable length.
Code adapted from @jpivarski's solution in https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3521574
"""
if axis < 0: # negative axis is another can of worms... maybe later
msg = "Does not support negative axis"
raise NotImplementedError(msg)
elif axis == 0:
return len(array)
else:
# communicate with the recursive function using a context (lateral)
context = {"axis": axis}
# "transform" but we don't care what kind of array it returns
ak.transform(
_size_at_depth,
array,
lateral_context=context,
)
# Use `None` as null token.
return None if context["out"] == -1 else context["out"]
@asarray.register(ak.Array)
def asarray_awkward(x):
return x
except ImportError:
pass
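# Illustrative behavior of axis_len on awkward arrays (a sketch; requires the
# optional awkward dependency):
#
#     import awkward as ak
#     arr = ak.Array([[1, 2, 3], [4]])
#     assert axis_len(arr, 0) == 2      # outer axis has a fixed length
#     assert axis_len(arr, 1) is None   # ragged axis reports None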
def make_index_unique(index: pd.Index, join: str = "-"):
"""
Makes the index unique by appending a number string to each duplicate index element:
'1', '2', etc.
If a tentative name created by the algorithm already exists in the index, it tries
the next integer in the sequence.
The first occurrence of a non-unique value is ignored.
Parameters
----------
join
The connecting string between name and integer.
Examples
--------
>>> from anndata import AnnData
>>> adata = AnnData(np.ones((2, 3)), var=pd.DataFrame(index=["a", "a", "b"]))
>>> adata.var_names
Index(['a', 'a', 'b'], dtype='object')
>>> adata.var_names_make_unique()
>>> adata.var_names
Index(['a', 'a-1', 'b'], dtype='object')
"""
if index.is_unique:
return index
from collections import Counter
values = index.values.copy()
indices_dup = index.duplicated(keep="first")
values_dup = values[indices_dup]
values_set = set(values)
counter = Counter()
issue_interpretation_warning = False
example_colliding_values = []
for i, v in enumerate(values_dup):
while True:
counter[v] += 1
tentative_new_name = v + join + str(counter[v])
if tentative_new_name not in values_set:
values_set.add(tentative_new_name)
values_dup[i] = tentative_new_name
break
issue_interpretation_warning = True
if len(example_colliding_values) < 5:
example_colliding_values.append(tentative_new_name)
if issue_interpretation_warning:
warnings.warn(
f"Suffix used ({join}[0-9]+) to deduplicate index values may make index "
+ "values difficult to interpret. There values with a similar suffixes in "
+ "the index. Consider using a different delimiter by passing "
+ "`join={delimiter}`"
+ "Example key collisions generated by the make_index_unique algorithm: "
+ str(example_colliding_values)
)
values[indices_dup] = values_dup
index = pd.Index(values, name=index.name)
return index
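# Illustrative corner case (a sketch): if a deduplicated name collides with an
# existing value, the next integer is tried and a warning about
# hard-to-interpret suffixes is emitted.
#
#     make_index_unique(pd.Index(["a", "a", "a-1"]))
#     # -> Index(['a', 'a-2', 'a-1'], dtype='object'), plus a UserWarning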
def join_english(words: Iterable[str], conjunction: str = "or") -> str:
words = list(words) # no need to be efficient
if len(words) == 0:
return ""
if len(words) == 1:
return words[0]
if len(words) == 2:
return f"{words[0]} {conjunction} {words[1]}"
return ", ".join(words[:-1]) + f", {conjunction} {words[-1]}"
def warn_names_duplicates(attr: str):
names = "Observation" if attr == "obs" else "Variable"
warnings.warn(
f"{names} names are not unique. "
f"To make them unique, call `.{attr}_names_make_unique`.",
UserWarning,
stacklevel=2,
)
def ensure_df_homogeneous(
df: pd.DataFrame, name: str
) -> np.ndarray | sparse.csr_matrix:
# TODO: rename this function, I would not expect this to return a non-dataframe
if all(isinstance(dt, pd.SparseDtype) for dt in df.dtypes):
arr = df.sparse.to_coo().tocsr()
else:
arr = df.to_numpy()
if df.dtypes.nunique() != 1:
warnings.warn(f"{name} converted to numpy array with dtype {arr.dtype}")
return arr
def convert_dictionary_to_structured_array(source: Mapping[str, Sequence[Any]]):
names = list(source.keys())
try: # transform to byte-strings
cols = [
np.asarray(col)
if np.array(col[0]).dtype.char not in {"U", "S"}
else np.asarray(col).astype("U")
for col in source.values()
]
except UnicodeEncodeError:
msg = (
"Currently only support ascii strings. "
"Don’t use “ö” etc. for sample annotation."
)
raise ValueError(msg)
# if old_index_key not in source:
# names.append(new_index_key)
# cols.append(np.arange(len(cols[0]) if cols else n_row).astype("U"))
# else:
# names[names.index(old_index_key)] = new_index_key
# cols[names.index(old_index_key)] = cols[names.index(old_index_key)].astype("U")
dtype_list = list(
zip(names, [str(c.dtype) for c in cols], [(c.shape[1],) for c in cols])
)
# might be unnecessary
dtype = np.dtype(dtype_list)
arr = np.zeros((len(cols[0]),), dtype)
# here, we do not want to call BoundStructArray.__getitem__
# but np.ndarray.__getitem__, therefore we avoid the following line
# arr = np.ndarray.__new__(cls, (len(cols[0]),), dtype)
for i, name in enumerate(dtype.names):
arr[name] = np.array(cols[i], dtype=dtype_list[i][1])
return arr
def warn_once(msg: str, category: type[Warning], stacklevel: int = 1):
warnings.warn(msg, category, stacklevel=stacklevel)
# Prevent from showing up every time an awkward array is used
# You'd think `'once'` works, but it doesn't at the repl and in notebooks
warnings.filterwarnings("ignore", category=category, message=re.escape(msg))
def deprecated(
new_name: str,
category: type[Warning] = FutureWarning,
add_msg: str = "",
*,
hide: bool = True,
):
"""\
Decorator that marks a function as deprecated.
A warning (FutureWarning by default) is emitted whenever the decorated
function is called.
"""
def decorator(func):
name = func.__qualname__
msg = (
f"Use {new_name} instead of {name}, "
f"{name} is deprecated and will be removed in the future."
)
if add_msg:
msg += f" {add_msg}"
@wraps(func)
def new_func(*args, **kwargs):
warnings.warn(msg, category=category, stacklevel=2)
return func(*args, **kwargs)
setattr(new_func, "__deprecated", (category, msg, hide))
return new_func
return decorator
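# Illustrative usage (a sketch; `old_name` and the replacement path are
# hypothetical):
#
#     @deprecated("anndata.new_name")
#     def old_name():
#         ...
#
#     old_name()  # emits a FutureWarning pointing callers at anndata.new_name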
class DeprecationMixinMeta(type):
"""\
Use this as a metaclass so deprecated methods and properties
do not appear in vars(MyClass)/dir(MyClass).
"""
def __dir__(cls):
def is_hidden(attr) -> bool:
if isinstance(attr, property):
attr = attr.fget
_, _, hide = getattr(attr, "__deprecated", (None, None, False))
return hide
return [
item
for item in type.__dir__(cls)
if not is_hidden(getattr(cls, item, None))
]
def raise_value_error_if_multiindex_columns(df: pd.DataFrame, attr: str):
if isinstance(df.columns, pd.MultiIndex):
msg = (
"MultiIndex columns are not supported in AnnData. "
f"Please use a single-level index for {attr}."
)
raise ValueError(msg)
def module_get_attr_redirect(
attr_name: str,
deprecated_mapping: Mapping[str, str],
old_module_path: str | None = None,
) -> Any:
full_old_module_path = (
f"anndata{'.' + old_module_path if old_module_path is not None else ''}"
)
if new_path := deprecated_mapping.get(attr_name):
msg = (
f"Importing {attr_name} from `{full_old_module_path}` is deprecated. "
f"Import anndata.{new_path} instead."
)
warnings.warn(msg, FutureWarning)
# hacky import_object_by_name, but we test all these
mod = anndata
while "." in new_path:
mod_name, new_path = new_path.split(".", 1)
mod = getattr(mod, mod_name)
return getattr(mod, new_path)
msg = f"module {full_old_module_path} has no attribute {attr_name!r}"
raise AttributeError(msg)
python-anndata-0.12.0~rc1/src/testing/ 0000775 0000000 0000000 00000000000 15003706322 0017565 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/testing/anndata/ 0000775 0000000 0000000 00000000000 15003706322 0021173 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/testing/anndata/__init__.py 0000664 0000000 0000000 00000000000 15003706322 0023272 0 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/src/testing/anndata/_doctest.py 0000664 0000000 0000000 00000000530 15003706322 0023347 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from collections.abc import Callable
from typing import TypeVar
F = TypeVar("F", bound=Callable)
def doctest_needs(mod: str) -> Callable[[F], F]:
"""Mark function with doctest dependency."""
def decorator(func: F) -> F:
func._doctest_needs = mod
return func
return decorator
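# Illustrative usage (a sketch): skip a function's doctest when an optional
# dependency is missing; the `_doctest_env` fixture in `_pytest.py` reads the
# attribute set here.
#
#     @doctest_needs("zarr")
#     def write_zarr_example():
#         """
#         >>> import zarr  # doctest only runs when zarr is importable
#         """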
python-anndata-0.12.0~rc1/src/testing/anndata/_pytest.py 0000664 0000000 0000000 00000007756 15003706322 0023253 0 ustar 00root root 0000000 0000000 """Private anndata pytest plugin.
This file exists
1. to allow ignoring warnings without test collection failing on CI
2. as a pytest plugin/config that applies to doctests as well
It lives outside of the anndata package in order to avoid importing anndata too early.
"""
from __future__ import annotations
import re
import warnings
from importlib.util import find_spec
from typing import TYPE_CHECKING, cast
import pytest
if TYPE_CHECKING:
from collections.abc import Generator, Iterable
from pathlib import Path
@pytest.fixture(autouse=True)
def _anndata_test_env(request: pytest.FixtureRequest) -> None:
import anndata
if isinstance(request.node, pytest.DoctestItem):
request.getfixturevalue("_doctest_env")
anndata.settings.reset(anndata.settings._registered_options.keys())
@pytest.fixture
def _doctest_env(
request: pytest.FixtureRequest, cache: pytest.Cache, tmp_path: Path
) -> Generator[None, None, None]:
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", message=r"Importing read_.* from `anndata` is deprecated"
)
from scanpy import settings
from contextlib import chdir
from anndata.utils import import_name
assert isinstance(request.node.parent, pytest.Module)
# request.node.parent is either a DoctestModule or a DoctestTextFile.
# Only DoctestModule has a .obj attribute (the imported module).
if request.node.parent.obj:
func = import_name(request.node.name)
warning_detail: tuple[type[Warning], str, bool] | None
if warning_detail := getattr(func, "__deprecated", None):
cat, msg, _ = warning_detail
warnings.filterwarnings("ignore", category=cat, message=re.escape(msg))
if (mod := getattr(func, "_doctest_needs", None)) is not None and not find_spec(
mod
):
request.applymarker(pytest.skip(reason=f"doctest needs {mod} to run"))
old_dd, settings.datasetdir = settings.datasetdir, cache.mkdir("scanpy-data")
with chdir(tmp_path):
yield
settings.datasetdir = old_dd
def pytest_itemcollected(item: pytest.Item) -> None:
"""Define behavior of pytest.mark.gpu."""
is_gpu = len([mark for mark in item.iter_markers(name="gpu")]) > 0
if is_gpu:
item.add_marker(
pytest.mark.skipif(not find_spec("cupy"), reason="Cupy not installed.")
)
def pytest_addoption(parser: pytest.Parser) -> None:
"""Hook to register custom CLI options and config values"""
parser.addoption(
"--strict-warnings",
action="store_true",
default=False,
help="Turn warnings into errors that are not overridden by `filterwarnings` or `filterwarnings_when_strict`.",
)
parser.addini(
"filterwarnings_when_strict",
"Filters to apply after `-Werror` when --strict-warnings is active",
type="linelist",
default=[],
)
def pytest_collection_modifyitems(
session: pytest.Session, config: pytest.Config, items: Iterable[pytest.Item]
):
if not config.getoption("--strict-warnings"):
return
warning_filters = [
"error",
*_config_get_strlist(config, "filterwarnings"),
*_config_get_strlist(config, "filterwarnings_when_strict"),
]
warning_marks = [pytest.mark.filterwarnings(f) for f in warning_filters]
# Add warning filters defined in the config to all tests items.
# Test items might already have @pytest.mark.filterwarnings applied,
# so we prepend ours to ensure that an item’s explicit filters override these.
# Reversing then individually prepending ensures that the order is preserved.
for item in items:
for mark in reversed(warning_marks):
item.add_marker(mark, append=False)
def _config_get_strlist(config: pytest.Config, name: str) -> list[str]:
if strs := config.getini(name):
assert isinstance(strs, list)
assert all(isinstance(item, str) for item in strs)
return cast("list[str]", strs)
return []
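# Illustrative configuration (a sketch; the filter shown is hypothetical):
# `pytest --strict-warnings` turns warnings into errors, except for filters
# listed under `filterwarnings` or `filterwarnings_when_strict`, e.g. in
# pyproject.toml:
#
#     [tool.pytest.ini_options]
#     filterwarnings_when_strict = [
#         "default::DeprecationWarning",
#     ]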
python-anndata-0.12.0~rc1/src/testing/anndata/py.typed 0000664 0000000 0000000 00000000000 15003706322 0022660 0 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/tests/ 0000775 0000000 0000000 00000000000 15003706322 0016463 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/tests/conftest.py 0000664 0000000 0000000 00000012304 15003706322 0020662 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from functools import partial
from typing import TYPE_CHECKING
import dask
import joblib
import pytest
from dask.base import normalize_token, tokenize
from packaging.version import Version
if Version(dask.__version__) < Version("2024.8.0"):
from dask.base import normalize_seq
else:
from dask.tokenize import normalize_seq
from filelock import FileLock
from scipy import sparse
import anndata as ad
from anndata.tests.helpers import subset_func # noqa: F401
if TYPE_CHECKING:
from collections.abc import Generator
from types import EllipsisType
@pytest.fixture
def backing_h5ad(tmp_path):
return tmp_path / "test.h5ad"
@pytest.fixture(
params=[("h5ad", None), ("zarr", 2), ("zarr", 3)], ids=["h5ad", "zarr2", "zarr3"]
)
def diskfmt(request):
if (fmt := request.param[0]) == "h5ad":
yield fmt
else:
with ad.settings.override(zarr_write_format=request.param[1]):
yield fmt
@pytest.fixture
def diskfmt2(diskfmt):
if diskfmt == "h5ad":
with ad.settings.override(zarr_write_format=2):
yield "zarr"
else:
yield "h5ad"
@pytest.fixture(
params=[
pytest.param((..., (slice(None), slice(None))), id="ellipsis"),
pytest.param(((...,), (slice(None), slice(None))), id="ellipsis_tuple"),
pytest.param(
((..., slice(0, 10)), (slice(None), slice(0, 10))), id="obs-ellipsis"
),
pytest.param(
((slice(0, 10), ...), (slice(0, 10), slice(None))), id="var-ellipsis"
),
pytest.param(
((slice(0, 10), slice(0, 10), ...), (slice(0, 10), slice(0, 10))),
id="obs-var-ellipsis",
),
pytest.param(
((..., slice(0, 10), slice(0, 10)), (slice(0, 10), slice(0, 10))),
id="ellipsis-obs-var",
),
pytest.param(
((slice(0, 10), ..., slice(0, 10)), (slice(0, 10), slice(0, 10))),
id="obs-ellipsis-var",
),
]
)
def ellipsis_index_with_equivalent(
request,
) -> tuple[tuple[EllipsisType | slice, ...] | EllipsisType, tuple[slice, slice]]:
return request.param
@pytest.fixture
def ellipsis_index(
ellipsis_index_with_equivalent: tuple[
tuple[EllipsisType | slice, ...] | EllipsisType, tuple[slice, slice]
],
) -> tuple[EllipsisType | slice, ...] | EllipsisType:
return ellipsis_index_with_equivalent[0]
@pytest.fixture
def equivalent_ellipsis_index(
ellipsis_index_with_equivalent: tuple[
tuple[EllipsisType | slice, ...] | EllipsisType, tuple[slice, slice]
],
) -> tuple[slice, slice]:
return ellipsis_index_with_equivalent[1]
@pytest.fixture(scope="session")
def local_cluster_addr(
tmp_path_factory: pytest.TempPathFactory, worker_id: str
) -> Generator[str, None, None]:
# Adapted from https://pytest-xdist.readthedocs.io/en/latest/how-to.html#making-session-scoped-fixtures-execute-only-once
import dask.distributed as dd
def make_cluster() -> dd.LocalCluster:
return dd.LocalCluster(n_workers=1, threads_per_worker=1)
if worker_id == "master":
with make_cluster() as cluster:
yield cluster.scheduler_address
return
# get the temp directory shared by all workers
root_tmp_dir = tmp_path_factory.getbasetemp().parent
fn = root_tmp_dir / "dask_scheduler_address.txt"
lock = FileLock(str(fn) + ".lock")
lock.acquire() # can’t use context manager, because we need to release the lock before yielding
address = fn.read_text() if fn.is_file() else None
if address:
lock.release()
yield address
return
with make_cluster() as cluster:
fn.write_text(cluster.scheduler_address)
lock.release()
yield cluster.scheduler_address
#####################
# Dask tokenization #
#####################
# TODO: Should we be exporting this?
# sparray classes don't have tokenize defined yet, see: https://github.com/dask/dask/issues/10375
def normalize_sparse_matrix(x, attrs):
return (
type(x).__name__,
normalize_seq(normalize_token(getattr(x, key)) for key in attrs),
)
for cls, attrs in [
(sparse.dia_array, ("data", "offsets", "shape")),
(sparse.bsr_array, ("data", "indices", "indptr", "blocksize", "shape")),
(sparse.coo_array, ("data", "row", "col", "shape")),
(sparse.csr_array, ("data", "indices", "indptr", "shape")),
(sparse.csc_array, ("data", "indices", "indptr", "shape")),
(sparse.lil_array, ("data", "rows", "shape")),
]:
normalize_token.register(cls, partial(normalize_sparse_matrix, attrs=attrs))
@normalize_token.register(sparse.dok_array)
def normalize_dok_matrix(x):
return type(x).__name__, normalize_token(sorted(x.items()))
@normalize_token.register(ad.AnnData)
def tokenize_anndata(adata: ad.AnnData):
res = []
if adata.X is not None:
res.append(tokenize(adata.X))
res.extend([tokenize(adata.obs), tokenize(adata.var)])
for attr in ["obsm", "varm", "obsp", "varp", "layers"]:
elem = getattr(adata, attr)
res.append(tokenize(list(dict(elem).items())))
res.append(joblib.hash(adata.uns))
if adata.raw is not None:
res.append(tokenize(adata.raw.to_adata()))
return tuple(res)
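# Illustrative property of the tokenizer above (a sketch): equal AnnData objects
# produce equal dask tokens, while modifying an element changes the token.
#
#     a = ad.AnnData(X=sparse.random(10, 5, format="csr"))
#     b = a.copy()
#     assert tokenize(a) == tokenize(b)
#     b.obs["x"] = range(10)
#     assert tokenize(a) != tokenize(b)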
python-anndata-0.12.0~rc1/tests/data/ 0000775 0000000 0000000 00000000000 15003706322 0017374 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/tests/data/adata-comments.tsv 0000664 0000000 0000000 00000000145 15003706322 0023027 0 ustar 00root root 0000000 0000000 # A regular comment
# The next comment is actually colnames
# c1 c2
r1 1.0 0.0
r2 3.0 0.0
r3 5.0 6.0
python-anndata-0.12.0~rc1/tests/data/adata.csv 0000664 0000000 0000000 00000000050 15003706322 0021156 0 ustar 00root root 0000000 0000000 ,c1,c2
r1,1.0,0.0
r2,3.0,0.0
r3,5.0,6.0
python-anndata-0.12.0~rc1/tests/data/archives/ 0000775 0000000 0000000 00000000000 15003706322 0021200 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/tests/data/archives/readme.md 0000664 0000000 0000000 00000000477 15003706322 0022767 0 ustar 00root root 0000000 0000000 # archives
This directory contains an archive of anndata files written by older versions of the library.
It's for testing backwards compat.
This should really live somewhere else, but it's here for now.
## Directories
Directories with version numbers contain files written by the corresponding version of `anndata`.
python-anndata-0.12.0~rc1/tests/data/archives/v0.7.0/ 0000775 0000000 0000000 00000000000 15003706322 0022030 5 ustar 00root root 0000000 0000000 python-anndata-0.12.0~rc1/tests/data/archives/v0.7.0/adata.h5ad 0000664 0000000 0000000 00001015260 15003706322 0023652 0 ustar 00root root 0000000 0000000 HDF