pax_global_header00006660000000000000000000000064141425574120014517gustar00rootroot0000000000000052 comment=30c1fce6c17b53c14e3b754dae10906429273fea anndata-0.7.8/000077500000000000000000000000001414255741200131415ustar00rootroot00000000000000anndata-0.7.8/.azure-pipelines.yml000066400000000000000000000045141414255741200170620ustar00rootroot00000000000000trigger: - master variables: PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip RUN_COVERAGE: no jobs: - job: PyTest pool: vmImage: 'ubuntu-18.04' strategy: matrix: Python38: python.version: '3.8' RUN_COVERAGE: yes Python36: python.version: '3.6' steps: - task: UsePythonVersion@0 inputs: versionSpec: '$(python.version)' displayName: 'Use Python $(python.version)' - task: Cache@2 inputs: key: '"python $(python.version)" | "$(Agent.OS)" | pyproject.toml' restoreKeys: | python | "$(Agent.OS)" python path: $(PIP_CACHE_DIR) displayName: Cache pip packages - script: | python -m pip install --upgrade pip pip install pytest-cov wheel pip install .[dev,test] displayName: 'Install dependencies' - script: | pip list displayName: 'Display installed versions' - script: | pytest --color=yes --junitxml=junit/test-results.xml displayName: 'PyTest' condition: eq(variables['RUN_COVERAGE'], 'no') - script: | pytest --color=yes --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-context=test displayName: 'PyTest (coverage)' condition: eq(variables['RUN_COVERAGE'], 'yes') - task: PublishCodeCoverageResults@1 inputs: codeCoverageTool: Cobertura summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml' reportDirectory: '$(System.DefaultWorkingDirectory)/**/htmlcov' condition: eq(variables['RUN_COVERAGE'], 'yes') - task: PublishTestResults@2 condition: succeededOrFailed() inputs: testResultsFiles: 'junit/test-*.xml' testRunTitle: 'Publish test results for Python $(python.version)' - script: bash <(curl -s https://codecov.io/bash) displayName: 'Upload to codecov.io' condition: eq(variables['RUN_COVERAGE'], 'yes') - job: CheckBuild pool: vmImage: 'ubuntu-18.04' steps: - task: UsePythonVersion@0 inputs: versionSpec: '3.8' displayName: 'Use Python 3.8' - script: | python -m pip install --upgrade pip pip install build twine displayName: 'Install build tools and requirements' - script: pip list displayName: 'Display installed versions' - script: | python -m build --sdist --wheel . 
twine check dist/* displayName: 'Build & Twine check' anndata-0.7.8/.codecov.yml000066400000000000000000000004431414255741200153650ustar00rootroot00000000000000# Based on pydata/xarray codecov: require_ci_to_pass: no coverage: status: project: default: # Require 1% coverage, i.e., always succeed target: 1 patch: false changes: false comment: layout: "diff, flags, files" behavior: once require_base: no anndata-0.7.8/.editorconfig000066400000000000000000000002611414255741200156150ustar00rootroot00000000000000root = true [*] charset = utf-8 end_of_line = lf insert_final_newline = true trim_trailing_whitespace = true max_line_length = 88 [*.py] indent_size = 4 indent_style = space anndata-0.7.8/.flake8000066400000000000000000000035241414255741200143200ustar00rootroot00000000000000# Can't yet be moved to the pyproject.toml due to https://github.com/PyCQA/flake8/issues/234 [flake8] max-line-length = 88 ignore = # module imported but unused -> required for Scanpys API F401, # line break before a binary operator -> black does not adhere to PEP8 W503, # line break occured after a binary operator -> black does not adhere to PEP8 W504, # line too long -> we accept long comment lines; black gets rid of long code lines E501, # whitespace before : -> black does not adhere to PEP8 E203, # missing whitespace after ,', ';', or ':' -> black does not adhere to PEP8 E231, # module level import not at top of file -> required to circumvent circular imports for Scanpys API E402, # continuation line over-indented for hanging indent -> black does not adhere to PEP8 E126, # E266 too many leading '#' for block comment -> Scanpy allows them for comments into sections E262, # inline comment should start with '# ' -> Scanpy allows them for specific explanations E266, # Do not assign a lambda expression, use a def -> Scanpy allows lambda expression assignments, E731, # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation E741 per-file-ignores = # F811 Redefinition of unused name from line, does not play nice with pytest fixtures tests/test*.py: F811 # F821 Undefined name, can't import AnnData or it'd be a circular import anndata/compat/_overloaded_dict.py: F821 # E721 comparing types, but we specifically are checking that we aren't getting subtypes (views) anndata/tests/test_readwrite.py: E721 exclude = .git, __pycache__, build, docs/_build, dist, anndata-0.7.8/.github/000077500000000000000000000000001414255741200145015ustar00rootroot00000000000000anndata-0.7.8/.github/workflows/000077500000000000000000000000001414255741200165365ustar00rootroot00000000000000anndata-0.7.8/.github/workflows/pre-commit.yml000066400000000000000000000003511414255741200213340ustar00rootroot00000000000000name: pre-commit on: pull_request: push: branches: [master] jobs: pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 - uses: pre-commit/action@v2.0.0 anndata-0.7.8/.gitignore000066400000000000000000000003641414255741200151340ustar00rootroot00000000000000# Temp files .DS_Store *~ # Compiled files __pycache__/ # Distribution / packaging /build/ /dist/ /*.egg-info/ # Tests and coverage /.pytest_cache/ /.cache/ /data/ /tmp.zarr/ test.h5ad # docs /docs/generated/ /docs/_build/ # IDEs /.idea/ anndata-0.7.8/.pre-commit-config.yaml000066400000000000000000000004601414255741200174220ustar00rootroot00000000000000repos: - repo: https://github.com/psf/black rev: 21.8b0 hooks: - id: black - repo: https://github.com/PyCQA/flake8 rev: 3.9.2 hooks: - id: 
flake8 - repo: https://github.com/pre-commit/mirrors-autopep8 rev: v1.5.7 hooks: - id: autopep8 args: ["-i"] anndata-0.7.8/.readthedocs.yml000066400000000000000000000002501414255741200162240ustar00rootroot00000000000000version: 2 build: image: latest sphinx: configuration: docs/conf.py python: version: 3.7 install: - method: pip path: . extra_requirements: - doc anndata-0.7.8/LICENSE000066400000000000000000000030101414255741200141400ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2017-2018 P. Angerer, F. Alexander Wolf, Theis Lab All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. anndata-0.7.8/README.rst000066400000000000000000000032401414255741200146270ustar00rootroot00000000000000|Stars| |PyPI| |PyPIDownloadsTotal| |PyPIDownloadsMonth| |Conda| |Docs| |Build Status| |Coverage| .. |Stars| image:: https://img.shields.io/github/stars/theislab/anndata?logo=GitHub&color=yellow :target: https://github.com/theislab/anndata/stargazers .. |PyPI| image:: https://img.shields.io/pypi/v/anndata.svg :target: https://pypi.org/project/anndata .. |PyPIDownloadsTotal| image:: https://pepy.tech/badge/anndata :target: https://pepy.tech/project/anndata .. |PyPIDownloadsMonth| image:: https://img.shields.io/pypi/dm/scanpy?logo=PyPI&color=blue :target: https://pypi.org/project/anndata .. |Conda| image:: https://img.shields.io/conda/vn/conda-forge/anndata.svg :target: https://anaconda.org/conda-forge/anndata .. |Docs| image:: https://readthedocs.com/projects/icb-anndata/badge/?version=latest :target: https://anndata.readthedocs.io .. |Build Status| image:: https://dev.azure.com/theislab/anndata/_apis/build/status/theislab.anndata?branchName=master :target: https://dev.azure.com/theislab/anndata/_build .. |Coverage| image:: https://api.codacy.com/project/badge/Coverage/b92ae35b691141ceb5f2ee74beaf39d3 :target: https://www.codacy.com/manual/theislab/anndata anndata - Annotated Data ======================== ``AnnData`` provides a scalable way of keeping track of data and learned annotations. * Read the `documentation `_. * Install via ``pip install anndata`` or ``conda install anndata -c conda-forge``. .. 
would be nice to have http://falexwolf.de/img/scanpy/anndata.svg also on GitHub, but it’s much too wide there; .. GitHub doesn’t plan to resolve scaling images: https://github.com/github/markup/issues/295 anndata-0.7.8/anndata/000077500000000000000000000000001414255741200145475ustar00rootroot00000000000000anndata-0.7.8/anndata/__init__.py000066400000000000000000000010721414255741200166600ustar00rootroot00000000000000"""Annotated multivariate observation data.""" from ._metadata import __version__, within_flit if not within_flit(): del within_flit from ._core.anndata import AnnData, ImplicitModificationWarning from ._core.merge import concat from ._core.raw import Raw from ._io import ( read_h5ad, read_loom, read_hdf, read_excel, read_umi_tools, read_csv, read_text, read_mtx, read_zarr, ) # backwards compat / shortcut for default format from ._io import read_h5ad as read anndata-0.7.8/anndata/_core/000077500000000000000000000000001414255741200156365ustar00rootroot00000000000000anndata-0.7.8/anndata/_core/__init__.py000066400000000000000000000000001414255741200177350ustar00rootroot00000000000000anndata-0.7.8/anndata/_core/access.py000066400000000000000000000014621414255741200174540ustar00rootroot00000000000000from functools import reduce from typing import NamedTuple, Tuple from . import anndata class ElementRef(NamedTuple): parent: "anndata.AnnData" attrname: str keys: Tuple[str, ...] = () def __str__(self) -> str: return f".{self.attrname}" + "".join(map(lambda x: f"['{x}']", self.keys)) @property def _parent_el(self): return reduce( lambda d, k: d[k], self.keys[:-1], getattr(self.parent, self.attrname) ) def get(self): """Get referenced value in self.parent.""" return reduce(lambda d, k: d[k], self.keys, getattr(self.parent, self.attrname)) def set(self, val): """Set referenced value in self.parent.""" self._parent_el[self.keys[-1]] = val def delete(self): del self._parent_el[self.keys[-1]] anndata-0.7.8/anndata/_core/aligned_mapping.py000066400000000000000000000235331414255741200213340ustar00rootroot00000000000000from abc import ABC, abstractmethod from collections import abc as cabc from typing import Union, Optional, Type, ClassVar, TypeVar # Special types from typing import Iterator, Mapping, Sequence # ABCs from typing import Tuple, List, Dict # Generic base types import numpy as np import pandas as pd from scipy.sparse import spmatrix from ..utils import deprecated, ensure_df_homogeneous from . import raw, anndata from .views import as_view from .access import ElementRef from .index import _subset OneDIdx = Union[Sequence[int], Sequence[bool], slice] TwoDIdx = Tuple[OneDIdx, OneDIdx] I = TypeVar("I", OneDIdx, TwoDIdx, covariant=True) # TODO: pd.DataFrame only allowed in AxisArrays? V = Union[pd.DataFrame, spmatrix, np.ndarray] class AlignedMapping(cabc.MutableMapping, ABC): """\ An abstract base class for Mappings containing array-like values aligned to either one or both AnnData axes. 
""" _allow_df: ClassVar[bool] """If this mapping supports heterogeneous DataFrames""" _view_class: ClassVar[Type["AlignedViewMixin"]] """The view class for this aligned mapping.""" _actual_class: ClassVar[Type["AlignedActualMixin"]] """The actual class (which has it’s own data) for this aligned mapping.""" def __repr__(self): return f"{type(self).__name__} with keys: {', '.join(self.keys())}" def _ipython_key_completions_(self) -> List[str]: return list(self.keys()) def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): if self.parent.shape[axis] != val.shape[i]: right_shape = tuple(self.parent.shape[a] for a in self.axes) raise ValueError( f"Value passed for key {key!r} is of incorrect shape. " f"Values of {self.attrname} must match dimensions " f"{self.axes} of parent. Value had shape {val.shape} while " f"it should have had {right_shape}." ) if not self._allow_df and isinstance(val, pd.DataFrame): name = self.attrname.title().rstrip("s") val = ensure_df_homogeneous(val, f"{name} {key!r}") return val @property @abstractmethod def attrname(self) -> str: """What attr for the AnnData is this?""" pass @property @abstractmethod def axes(self) -> Tuple[int, ...]: """Which axes of the parent is this aligned to?""" pass @property @abstractmethod def is_view(self) -> bool: pass @property def parent(self) -> Union["anndata.AnnData", "raw.Raw"]: return self._parent def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): d[k] = v.copy() return d def _view(self, parent: "anndata.AnnData", subset_idx: I): """Returns a subset copy-on-write view of the object.""" return self._view_class(self, parent, subset_idx) @deprecated("dict(obj)") def as_dict(self) -> dict: return dict(self) class AlignedViewMixin: parent: "anndata.AnnData" """Reference to parent AnnData view""" attrname: str """What attribute in the parent is this?""" parent_mapping: Mapping[str, V] """The object this is a view of.""" is_view = True def __getitem__(self, key: str) -> V: return as_view( _subset(self.parent_mapping[key], self.subset_idx), ElementRef(self.parent, self.attrname, (key,)), ) def __setitem__(self, key: str, value: V): value = self._validate_value(value, key) # Validate before mutating adata = self.parent.copy() new_mapping = getattr(adata, self.attrname) new_mapping[key] = value self.parent._init_as_actual(adata) def __delitem__(self, key: str): self[key] # Make sure it exists before bothering with a copy adata = self.parent.copy() new_mapping = getattr(adata, self.attrname) del new_mapping[key] self.parent._init_as_actual(adata) def __contains__(self, key: str) -> bool: return key in self.parent_mapping def __iter__(self) -> Iterator[str]: return iter(self.parent_mapping) def __len__(self) -> int: return len(self.parent_mapping) class AlignedActualMixin: _data: Dict[str, V] """Underlying mapping to the data""" is_view = False def __getitem__(self, key: str) -> V: return self._data[key] def __setitem__(self, key: str, value: V): value = self._validate_value(value, key) self._data[key] = value def __contains__(self, key: str) -> bool: return key in self._data def __delitem__(self, key: str): del self._data[key] def __iter__(self) -> Iterator[str]: return iter(self._data) def __len__(self) -> int: return len(self._data) class AxisArraysBase(AlignedMapping): """\ Mapping of key→array-like, where array-like is aligned to an axis of parent AnnData. 
""" _allow_df = True _dimnames = ("obs", "var") @property def attrname(self) -> str: return f"{self.dim}m" @property def axes(self) -> Tuple[int]: """Axes of the parent this is aligned to""" return (self._axis,) @property def dim(self) -> str: """Name of the dimension this aligned to.""" return self._dimnames[self._axis] def flipped(self) -> "AxisArraysBase": """Transpose.""" new = self.copy() new.dimension = abs(self._axis - 1) return new def to_df(self) -> pd.DataFrame: """Convert to pandas dataframe.""" df = pd.DataFrame(index=self.dim_names) for key in self.keys(): value = self[key] for icolumn, column in enumerate(value.T): df[f"{key}{icolumn + 1}"] = column return df def _validate_value(self, val: V, key: str) -> V: if ( hasattr(val, "index") and isinstance(val.index, cabc.Collection) and not (val.index == self.dim_names).all() ): # Could probably also re-order index if it’s contained raise ValueError( f"value.index does not match parent’s axis {self.axes[0]} names" ) return super()._validate_value(val, key) @property def dim_names(self) -> pd.Index: return (self.parent.obs_names, self.parent.var_names)[self._axis] class AxisArrays(AlignedActualMixin, AxisArraysBase): def __init__( self, parent: Union["anndata.AnnData", "raw.Raw"], axis: int, vals: Union[Mapping, AxisArraysBase, None] = None, ): self._parent = parent if axis not in (0, 1): raise ValueError() self._axis = axis self._data = dict() if vals is not None: self.update(vals) class AxisArraysView(AlignedViewMixin, AxisArraysBase): def __init__( self, parent_mapping: AxisArraysBase, parent_view: "anndata.AnnData", subset_idx: OneDIdx, ): self.parent_mapping = parent_mapping self._parent = parent_view self.subset_idx = subset_idx self._axis = parent_mapping._axis AxisArraysBase._view_class = AxisArraysView AxisArraysBase._actual_class = AxisArrays class LayersBase(AlignedMapping): """\ Mapping of key: array-like, where array-like is aligned to both axes of the parent anndata. """ _allow_df = False attrname = "layers" axes = (0, 1) # TODO: I thought I had a more elegant solution to overiding this... def copy(self) -> "Layers": d = self._actual_class(self.parent) for k, v in self.items(): d[k] = v.copy() return d class Layers(AlignedActualMixin, LayersBase): def __init__(self, parent: "anndata.AnnData", vals: Optional[Mapping] = None): self._parent = parent self._data = dict() if vals is not None: self.update(vals) class LayersView(AlignedViewMixin, LayersBase): def __init__( self, parent_mapping: LayersBase, parent_view: "anndata.AnnData", subset_idx: TwoDIdx, ): self.parent_mapping = parent_mapping self._parent = parent_view self.subset_idx = subset_idx LayersBase._view_class = LayersView LayersBase._actual_class = Layers class PairwiseArraysBase(AlignedMapping): """\ Mapping of key: array-like, where both axes of array-like are aligned to one axis of the parent anndata. 
""" _allow_df = False _dimnames = ("obs", "var") @property def attrname(self) -> str: return f"{self.dim}p" @property def axes(self) -> Tuple[int, int]: """Axes of the parent this is aligned to""" return self._axis, self._axis @property def dim(self) -> str: """Name of the dimension this aligned to.""" return self._dimnames[self._axis] class PairwiseArrays(AlignedActualMixin, PairwiseArraysBase): def __init__( self, parent: "anndata.AnnData", axis: int, vals: Optional[Mapping] = None, ): self._parent = parent if axis not in (0, 1): raise ValueError() self._axis = axis self._data = dict() if vals is not None: self.update(vals) class PairwiseArraysView(AlignedViewMixin, PairwiseArraysBase): def __init__( self, parent_mapping: PairwiseArraysBase, parent_view: "anndata.AnnData", subset_idx: OneDIdx, ): self.parent_mapping = parent_mapping self._parent = parent_view self.subset_idx = (subset_idx, subset_idx) self._axis = parent_mapping._axis PairwiseArraysBase._view_class = PairwiseArraysView PairwiseArraysBase._actual_class = PairwiseArrays anndata-0.7.8/anndata/_core/anndata.py000066400000000000000000002220401414255741200176160ustar00rootroot00000000000000"""\ Main class and helper functions. """ import warnings import collections.abc as cabc from collections import OrderedDict from copy import copy, deepcopy from enum import Enum from functools import partial, singledispatch from pathlib import Path from os import PathLike from textwrap import dedent from typing import Any, Union, Optional # Meta from typing import Iterable, Sequence, Mapping, MutableMapping # Generic ABCs from typing import Tuple, List # Generic import h5py from natsort import natsorted import numpy as np from numpy import ma import pandas as pd from pandas.api.types import infer_dtype, is_string_dtype, is_categorical_dtype from scipy import sparse from scipy.sparse import issparse, csr_matrix from .raw import Raw from .index import _normalize_indices, _subset, Index, Index1D, get_vector from .file_backing import AnnDataFileManager, to_memory from .access import ElementRef from .aligned_mapping import ( AxisArrays, AxisArraysView, PairwiseArrays, PairwiseArraysView, Layers, LayersView, ) from .views import ( ArrayView, DictView, DataFrameView, as_view, _resolve_idxs, ) from .sparse_dataset import SparseDataset from .. import utils from ..utils import convert_to_dict, ensure_df_homogeneous from ..logging import anndata_logger as logger from ..compat import ( ZarrArray, ZappyArray, DaskArray, Literal, _slice_uns_sparse_matrices, _move_adj_mtx, _overloaded_uns, OverloadedDict, ) class StorageType(Enum): Array = np.ndarray Masked = ma.MaskedArray Sparse = sparse.spmatrix ZarrArray = ZarrArray ZappyArray = ZappyArray DaskArray = DaskArray @classmethod def classes(cls): return tuple(c.value for c in cls.__members__.values()) # for backwards compat def _find_corresponding_multicol_key(key, keys_multicol): """Find the corresponding multicolumn key.""" for mk in keys_multicol: if key.startswith(mk) and "of" in key: return mk return None # for backwards compat def _gen_keys_from_multicol_key(key_multicol, n_keys): """Generates single-column keys from multicolumn key.""" keys = [f"{key_multicol}{i + 1:03}of{n_keys:03}" for i in range(n_keys)] return keys def _check_2d_shape(X): """\ Check shape of array or sparse matrix. Assure that X is always 2D: Unlike numpy we always deal with 2D arrays. """ if X.dtype.names is None and len(X.shape) != 2: raise ValueError( f"X needs to be 2-dimensional, not {len(X.shape)}-dimensional." 
) @singledispatch def _gen_dataframe(anno, length, index_names): if anno is None or len(anno) == 0: return pd.DataFrame(index=pd.RangeIndex(0, length, name=None).astype(str)) for index_name in index_names: if index_name in anno: return pd.DataFrame( anno, index=anno[index_name], columns=[k for k in anno.keys() if k != index_name], ) return pd.DataFrame(anno, index=pd.RangeIndex(0, length, name=None).astype(str)) @_gen_dataframe.register(pd.DataFrame) def _(anno, length, index_names): anno = anno.copy() if not is_string_dtype(anno.index): warnings.warn("Transforming to str index.", ImplicitModificationWarning) anno.index = anno.index.astype(str) return anno @_gen_dataframe.register(pd.Series) @_gen_dataframe.register(pd.Index) def _(anno, length, index_names): raise ValueError(f"Cannot convert {type(anno)} to DataFrame") class ImplicitModificationWarning(UserWarning): """\ Raised whenever initializing an object or assigning a property changes the type of a part of a parameter or the value being assigned. Examples ======== >>> import pandas as pd >>> adata = AnnData(obs=pd.DataFrame(index=[0, 1, 2])) # doctest: +SKIP ImplicitModificationWarning: Transforming to str index. """ pass class AnnData(metaclass=utils.DeprecationMixinMeta): """\ An annotated data matrix. :class:`~anndata.AnnData` stores a data matrix :attr:`X` together with annotations of observations :attr:`obs` (:attr:`obsm`, :attr:`obsp`), variables :attr:`var` (:attr:`varm`, :attr:`varp`), and unstructured annotations :attr:`uns`. .. figure:: https://falexwolf.de/img/scanpy/anndata.svg :width: 350px An :class:`~anndata.AnnData` object `adata` can be sliced like a :class:`~pandas.DataFrame`, for instance `adata_subset = adata[:, list_of_variable_names]`. :class:`~anndata.AnnData`’s basic structure is similar to R’s ExpressionSet [Huber15]_. If setting an `.h5ad`-formatted HDF5 backing file `.filename`, data remains on the disk but is automatically loaded into memory if needed. See this `blog post`_ for more details. .. _blog post: http://falexwolf.de/blog/171223_AnnData_indexing_views_HDF5-backing/ Parameters ---------- X A #observations × #variables data matrix. A view of the data is used if the data type matches, otherwise, a copy is made. obs Key-indexed one-dimensional observations annotation of length #observations. var Key-indexed one-dimensional variables annotation of length #variables. uns Key-indexed unstructured annotation. obsm Key-indexed multi-dimensional observations annotation of length #observations. If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype. varm Key-indexed multi-dimensional variables annotation of length #variables. If passing a :class:`~numpy.ndarray`, it needs to have a structured datatype. layers Key-indexed multi-dimensional arrays aligned to dimensions of `X`. dtype Data type used for storage. shape Shape tuple (#observations, #variables). Can only be provided if `X` is `None`. filename Name of backing file. See :class:`h5py.File`. filemode Open mode of backing file. See :class:`h5py.File`. See Also -------- read_h5ad read_csv read_excel read_hdf read_loom read_zarr read_mtx read_text read_umi_tools Notes ----- :class:`~anndata.AnnData` stores observations (samples) of variables/features in the rows of a matrix. This is the convention of the modern classics of statistics [Hastie09]_ and machine learning [Murphy12]_, the convention of dataframes both in R and Python and the established statistics and machine learning packages in Python (statsmodels_, scikit-learn_). 
Single dimensional annotations of the observation and variables are stored in the :attr:`obs` and :attr:`var` attributes as :class:`~pandas.DataFrame`\\ s. This is intended for metrics calculated over their axes. Multi-dimensional annotations are stored in :attr:`obsm` and :attr:`varm`, which are aligned to the objects observation and variable dimensions respectively. Square matrices representing graphs are stored in :attr:`obsp` and :attr:`varp`, with both of their own dimensions aligned to their associated axis. Additional measurements across both observations and variables are stored in :attr:`layers`. Indexing into an AnnData object can be performed by relative position with numeric indices (like pandas’ :meth:`~pandas.DataFrame.iloc`), or by labels (like :meth:`~pandas.DataFrame.loc`). To avoid ambiguity with numeric indexing into observations or variables, indexes of the AnnData object are converted to strings by the constructor. Subsetting an AnnData object by indexing into it will also subset its elements according to the dimensions they were aligned to. This means an operation like `adata[list_of_obs, :]` will also subset :attr:`obs`, :attr:`obsm`, and :attr:`layers`. Subsetting an AnnData object returns a view into the original object, meaning very little additional memory is used upon subsetting. This is achieved lazily, meaning that the constituent arrays are subset on access. Copying a view causes an equivalent “real” AnnData object to be generated. Attempting to modify a view (at any attribute except X) is handled in a copy-on-modify manner, meaning the object is initialized in place. Here’s an example:: batch1 = adata[adata.obs["batch"] == "batch1", :] batch1.obs["value"] = 0 # This makes batch1 a “real” AnnData object At the end of this snippet: `adata` was not modified, and `batch1` is its own AnnData object with its own data. Similar to Bioconductor’s `ExpressionSet` and :mod:`scipy.sparse` matrices, subsetting an AnnData object retains the dimensionality of its constituent arrays. Therefore, unlike with the classes exposed by :mod:`pandas`, :mod:`numpy`, and `xarray`, there is no concept of a one dimensional AnnData object. AnnDatas always have two inherent dimensions, :attr:`obs` and :attr:`var`. Additionally, maintaining the dimensionality of the AnnData object allows for consistent handling of :mod:`scipy.sparse` matrices and :mod:`numpy` arrays. .. _statsmodels: http://www.statsmodels.org/stable/index.html .. 
_scikit-learn: http://scikit-learn.org/ """ _BACKED_ATTRS = ["X", "raw.X"] # backwards compat _H5_ALIASES = dict( X={"X", "_X", "data", "_data"}, obs={"obs", "_obs", "smp", "_smp"}, var={"var", "_var"}, uns={"uns"}, obsm={"obsm", "_obsm", "smpm", "_smpm"}, varm={"varm", "_varm"}, layers={"layers", "_layers"}, ) _H5_ALIASES_NAMES = dict( obs={"obs_names", "smp_names", "row_names", "index"}, var={"var_names", "col_names", "index"}, ) def __init__( self, X: Optional[Union[np.ndarray, sparse.spmatrix, pd.DataFrame]] = None, obs: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, var: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, uns: Optional[Mapping[str, Any]] = None, obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, layers: Optional[Mapping[str, Union[np.ndarray, sparse.spmatrix]]] = None, raw: Optional[Mapping[str, Any]] = None, dtype: Union[np.dtype, str] = "float32", shape: Optional[Tuple[int, int]] = None, filename: Optional[PathLike] = None, filemode: Optional[Literal["r", "r+"]] = None, asview: bool = False, *, obsp: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, varp: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, oidx: Index1D = None, vidx: Index1D = None, ): if asview: if not isinstance(X, AnnData): raise ValueError("`X` has to be an AnnData object.") self._init_as_view(X, oidx, vidx) else: self._init_as_actual( X=X, obs=obs, var=var, uns=uns, obsm=obsm, varm=varm, raw=raw, layers=layers, dtype=dtype, shape=shape, obsp=obsp, varp=varp, filename=filename, filemode=filemode, ) def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index): if adata_ref.isbacked and adata_ref.is_view: raise ValueError( "Currently, you cannot index repeatedly into a backed AnnData, " "that is, you cannot make a view of a view." ) self._is_view = True if isinstance(oidx, (int, np.integer)): if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): raise IndexError(f"Observation index `{oidx}` is out of range.") oidx += adata_ref.n_obs * (oidx < 0) oidx = slice(oidx, oidx + 1, 1) if isinstance(vidx, (int, np.integer)): if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars): raise IndexError(f"Variable index `{vidx}` is out of range.") vidx += adata_ref.n_vars * (vidx < 0) vidx = slice(vidx, vidx + 1, 1) if adata_ref.is_view: prev_oidx, prev_vidx = adata_ref._oidx, adata_ref._vidx adata_ref = adata_ref._adata_ref oidx, vidx = _resolve_idxs((prev_oidx, prev_vidx), (oidx, vidx), adata_ref) # self._adata_ref is never a view self._adata_ref = adata_ref self._oidx = oidx self._vidx = vidx # the file is the same as of the reference object self.file = adata_ref.file # views on attributes of adata_ref obs_sub = adata_ref.obs.iloc[oidx] var_sub = adata_ref.var.iloc[vidx] self._obsm = adata_ref.obsm._view(self, (oidx,)) self._varm = adata_ref.varm._view(self, (vidx,)) self._layers = adata_ref.layers._view(self, (oidx, vidx)) self._obsp = adata_ref.obsp._view(self, oidx) self._varp = adata_ref.varp._view(self, vidx) # Speical case for old neighbors, backwards compat. Remove in anndata 0.8. 
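        # `_slice_uns_sparse_matrices` (judging from its name and arguments) subsets
        # any sparse matrix in `uns` whose length matches `n_obs` along `oidx`, so
        # neighbor graphs that older anndata versions kept directly under
        # `uns["neighbors"]` stay aligned with the view; current objects store
        # those graphs in `.obsp` instead.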
uns_new = _slice_uns_sparse_matrices( copy(adata_ref._uns), self._oidx, adata_ref.n_obs ) # fix categories self._remove_unused_categories(adata_ref.obs, obs_sub, uns_new) self._remove_unused_categories(adata_ref.var, var_sub, uns_new) # set attributes self._obs = DataFrameView(obs_sub, view_args=(self, "obs")) self._var = DataFrameView(var_sub, view_args=(self, "var")) self._uns = uns_new self._n_obs = len(self.obs) self._n_vars = len(self.var) # set data if self.isbacked: self._X = None # set raw, easy, as it’s immutable anyways... if adata_ref._raw is not None: # slicing along variables axis is ignored self._raw = adata_ref.raw[oidx] self._raw._adata = self else: self._raw = None def _init_as_actual( self, X=None, obs=None, var=None, uns=None, obsm=None, varm=None, varp=None, obsp=None, raw=None, layers=None, dtype="float32", shape=None, filename=None, filemode=None, ): # view attributes self._is_view = False self._adata_ref = None self._oidx = None self._vidx = None # ---------------------------------------------------------------------- # various ways of initializing the data # ---------------------------------------------------------------------- # If X is a data frame, we store its indices for verification x_indices = [] # init from file if filename is not None: self.file = AnnDataFileManager(self, filename, filemode) else: self.file = AnnDataFileManager(self, None) # init from AnnData if isinstance(X, AnnData): if any((obs, var, uns, obsm, varm, obsp, varp)): raise ValueError( "If `X` is a dict no further arguments must be provided." ) X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( X._X, X.obs, X.var, X.uns, X.obsm, X.varm, X.obsp, X.varp, X.layers, X.raw, ) # init from DataFrame elif isinstance(X, pd.DataFrame): # to verify index matching, we wait until obs and var are DataFrames if obs is None: obs = pd.DataFrame(index=X.index) elif not isinstance(X.index, pd.RangeIndex): x_indices.append(("obs", "index", X.index)) if var is None: var = pd.DataFrame(index=X.columns) elif not isinstance(X.columns, pd.RangeIndex): x_indices.append(("var", "columns", X.columns)) X = ensure_df_homogeneous(X, "X") # ---------------------------------------------------------------------- # actually process the data # ---------------------------------------------------------------------- # check data type of X if X is not None: for s_type in StorageType: if isinstance(X, s_type.value): break else: class_names = ", ".join(c.__name__ for c in StorageType.classes()) raise ValueError( f"`X` needs to be of one of {class_names}, not {type(X)}." 
) if shape is not None: raise ValueError("`shape` needs to be `None` if `X` is not `None`.") _check_2d_shape(X) # if type doesn’t match, a copy is made, otherwise, use a view if issparse(X) or isinstance(X, ma.MaskedArray): # TODO: maybe use view on data attribute of sparse matrix # as in readwrite.read_10x_h5 if X.dtype != np.dtype(dtype): X = X.astype(dtype) elif isinstance(X, ZarrArray): X = X.astype(dtype) else: # is np.ndarray or a subclass, convert to true np.ndarray X = np.array(X, dtype, copy=False) # data matrix and shape self._X = X self._n_obs, self._n_vars = self._X.shape else: self._X = None self._n_obs = len([] if obs is None else obs) self._n_vars = len([] if var is None else var) # check consistency with shape if shape is not None: if self._n_obs == 0: self._n_obs = shape[0] else: if self._n_obs != shape[0]: raise ValueError("`shape` is inconsistent with `obs`") if self._n_vars == 0: self._n_vars = shape[1] else: if self._n_vars != shape[1]: raise ValueError("`shape` is inconsistent with `var`") # annotations self._obs = _gen_dataframe(obs, self._n_obs, ["obs_names", "row_names"]) self._var = _gen_dataframe(var, self._n_vars, ["var_names", "col_names"]) # now we can verify if indices match! for attr_name, x_name, idx in x_indices: attr = getattr(self, attr_name) if isinstance(attr.index, pd.RangeIndex): attr.index = idx elif not idx.equals(attr.index): raise ValueError(f"Index of {attr_name} must match {x_name} of X.") # unstructured annotations self.uns = uns or OrderedDict() # TODO: Think about consequences of making obsm a group in hdf self._obsm = AxisArrays(self, 0, vals=convert_to_dict(obsm)) self._varm = AxisArrays(self, 1, vals=convert_to_dict(varm)) self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp)) self._varp = PairwiseArrays(self, 1, vals=convert_to_dict(varp)) # Backwards compat for connectivities matrices in uns["neighbors"] _move_adj_mtx({"uns": self._uns, "obsp": self._obsp}) self._check_dimensions() self._check_uniqueness() if self.filename: assert not isinstance( raw, Raw ), "got raw from other adata but also filename?" 
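            # The backing file already holds a raw group ("raw" or "raw.X"), so X is
            # left as None below and the Raw object defers to the on-disk matrix
            # rather than keeping a second copy in memory.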
if {"raw", "raw.X"} & set(self.file): raw = dict(X=None, **raw) if not raw: self._raw = None elif isinstance(raw, cabc.Mapping): self._raw = Raw(self, **raw) else: # is a Raw from another AnnData self._raw = Raw(self, raw._X, raw.var, raw.varm) # clean up old formats self._clean_up_old_format(uns) # layers self._layers = Layers(self, layers) def __sizeof__(self, show_stratified=None) -> int: def get_size(X): if issparse(X): X_csr = csr_matrix(X) return X_csr.data.nbytes + X_csr.indptr.nbytes + X_csr.indices.nbytes else: return X.__sizeof__() size = 0 attrs = list(["_X", "_obs", "_var"]) attrs_multi = list(["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"]) for attr in attrs + attrs_multi: if attr in attrs_multi: keys = getattr(self, attr).keys() s = sum([get_size(getattr(self, attr)[k]) for k in keys]) else: s = get_size(getattr(self, attr)) if s > 0 and show_stratified: str_attr = attr.replace("_", ".") + " " * (7 - len(attr)) print(f"Size of {str_attr}: {'%3.2f' % (s / (1024 ** 2))} MB") size += s return size def _gen_repr(self, n_obs, n_vars) -> str: if self.isbacked: backed_at = f" backed at {str(self.filename)!r}" else: backed_at = "" descr = f"AnnData object with n_obs × n_vars = {n_obs} × {n_vars}{backed_at}" for attr in [ "obs", "var", "uns", "obsm", "varm", "layers", "obsp", "varp", ]: keys = getattr(self, attr).keys() if len(keys) > 0: descr += f"\n {attr}: {str(list(keys))[1:-1]}" return descr def __repr__(self) -> str: if self.is_view: return "View of " + self._gen_repr(self.n_obs, self.n_vars) else: return self._gen_repr(self.n_obs, self.n_vars) def __eq__(self, other): """Equality testing""" raise NotImplementedError( "Equality comparisons are not supported for AnnData objects, " "instead compare the desired attributes." ) @property def shape(self) -> Tuple[int, int]: """Shape of data matrix (:attr:`n_obs`, :attr:`n_vars`).""" return self.n_obs, self.n_vars @property def X(self) -> Optional[Union[np.ndarray, sparse.spmatrix, ArrayView]]: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" if self.isbacked: if not self.file.is_open: self.file.open() X = self.file["X"] if isinstance(X, h5py.Group): X = SparseDataset(X) # This is so that we can index into a backed dense dataset with # indices that aren’t strictly increasing if self.is_view: X = _subset(X, (self._oidx, self._vidx)) elif self.is_view: X = as_view( _subset(self._adata_ref.X, (self._oidx, self._vidx)), ElementRef(self, "X"), ) else: X = self._X return X # if self.n_obs == 1 and self.n_vars == 1: # return X[0, 0] # elif self.n_obs == 1 or self.n_vars == 1: # if issparse(X): X = X.toarray() # return X.flatten() # else: # return X @X.setter def X(self, value: Optional[Union[np.ndarray, sparse.spmatrix]]): if value is None: if self.isbacked: raise NotImplementedError( "Cannot currently remove data matrix from backed object." ) if self.is_view: self._init_as_actual(self.copy()) self._X = None return if not isinstance(value, StorageType.classes()) and not np.isscalar(value): if hasattr(value, "to_numpy") and hasattr(value, "dtypes"): value = ensure_df_homogeneous(value, "X") else: # TODO: asarray? asanyarray? 
value = np.array(value) # If indices are both arrays, we need to modify them # so we don’t set values like coordinates # This can occur if there are succesive views if ( self.is_view and isinstance(self._oidx, np.ndarray) and isinstance(self._vidx, np.ndarray) ): oidx, vidx = np.ix_(self._oidx, self._vidx) else: oidx, vidx = self._oidx, self._vidx if ( np.isscalar(value) or (hasattr(value, "shape") and (self.shape == value.shape)) or (self.n_vars == 1 and self.n_obs == len(value)) or (self.n_obs == 1 and self.n_vars == len(value)) ): if not np.isscalar(value) and self.shape != value.shape: # For assigning vector of values to 2d array or matrix # Not neccesary for row of 2d array value = value.reshape(self.shape) if self.isbacked: if self.is_view: X = self.file["X"] if isinstance(X, h5py.Group): X = SparseDataset(X) X[oidx, vidx] = value else: self._set_backed("X", value) else: if self.is_view: if sparse.issparse(self._adata_ref._X) and isinstance( value, np.ndarray ): value = sparse.coo_matrix(value) self._adata_ref._X[oidx, vidx] = value else: self._X = value else: raise ValueError( f"Data matrix has wrong shape {value.shape}, " f"need to be {self.shape}." ) @X.deleter def X(self): self.X = None @property def layers(self) -> Union[Layers, LayersView]: """\ Dictionary-like object with values of the same dimensions as :attr:`X`. Layers in AnnData are inspired by loompy’s :ref:`loomlayers`. Return the layer named `"unspliced"`:: adata.layers["unspliced"] Create or replace the `"spliced"` layer:: adata.layers["spliced"] = ... Assign the 10th column of layer `"spliced"` to the variable a:: a = adata.layers["spliced"][:, 10] Delete the `"spliced"` layer:: del adata.layers["spliced"] Return layers’ names:: adata.layers.keys() """ return self._layers @layers.setter def layers(self, value): layers = Layers(self, vals=convert_to_dict(value)) if self.is_view: self._init_as_actual(self.copy()) self._layers = layers @layers.deleter def layers(self): self.layers = dict() @property def raw(self) -> Raw: """\ Store raw version of :attr:`X` and :attr:`var` as `.raw.X` and `.raw.var`. The :attr:`raw` attribute is initialized with the current content of an object by setting:: adata.raw = adata Its content can be deleted:: adata.raw = None # or del adata.raw Upon slicing an AnnData object along the obs (row) axis, :attr:`raw` is also sliced. Slicing an AnnData object along the vars (columns) axis leaves :attr:`raw` unaffected. Note that you can call:: adata.raw[:, 'orig_variable_name'].X to retrieve the data associated with a variable that might have been filtered out or "compressed away" in :attr:`X`. 
""" return self._raw @raw.setter def raw(self, value: "AnnData"): if value is None: del self.raw elif not isinstance(value, AnnData): raise ValueError("Can only init raw attribute with an AnnData object.") else: if self.is_view: self._init_as_actual(self.copy()) self._raw = Raw(value) @raw.deleter def raw(self): if self.is_view: self._init_as_actual(self.copy()) self._raw = None @property def n_obs(self) -> int: """Number of observations.""" return self._n_obs @property def n_vars(self) -> int: """Number of variables/features.""" return self._n_vars def _set_dim_df(self, value: pd.DataFrame, attr: str): if not isinstance(value, pd.DataFrame): raise ValueError(f"Can only assign pd.DataFrame to {attr}.") value_idx = self._prep_dim_index(value.index, attr) if self.is_view: self._init_as_actual(self.copy()) setattr(self, f"_{attr}", value) self._set_dim_index(value_idx, attr) def _prep_dim_index(self, value, attr: str) -> pd.Index: """Prepares index to be uses as obs_names or var_names for AnnData object.AssertionError If a pd.Index is passed, this will use a reference, otherwise a new index object is created. """ if self.shape[attr == "var"] != len(value): raise ValueError( f"Length of passed value for {attr}_names is {len(value)}, but this AnnData has shape: {self.shape}" ) if isinstance(value, pd.Index) and not isinstance( value.name, (str, type(None)) ): raise ValueError( f"AnnData expects .{attr}.index.name to be a string or None, " f"but you passed a name of type {type(value.name).__name__!r}" ) else: value = pd.Index(value) if not isinstance(value.name, (str, type(None))): value.name = None # fmt: off if ( not isinstance(value, pd.RangeIndex) and not infer_dtype(value) in ("string", "bytes") ): sample = list(value[: min(len(value), 5)]) warnings.warn(dedent( f""" AnnData expects .{attr}.index to contain strings, but got values like: {sample} Inferred to be: {infer_dtype(value)} """ ), # noqa stacklevel=2, ) # fmt: on return value def _set_dim_index(self, value: pd.Index, attr: str): # Assumes _prep_dim_index has been run if self.is_view: self._init_as_actual(self.copy()) getattr(self, attr).index = value for v in getattr(self, f"{attr}m").values(): if isinstance(v, pd.DataFrame): v.index = value @property def obs(self) -> pd.DataFrame: """One-dimensional annotation of observations (`pd.DataFrame`).""" return self._obs @obs.setter def obs(self, value: pd.DataFrame): self._set_dim_df(value, "obs") @obs.deleter def obs(self): self.obs = pd.DataFrame(index=self.obs_names) @property def obs_names(self) -> pd.Index: """Names of observations (alias for `.obs.index`).""" return self.obs.index @obs_names.setter def obs_names(self, names: Sequence[str]): names = self._prep_dim_index(names, "obs") self._set_dim_index(names, "obs") @property def var(self) -> pd.DataFrame: """One-dimensional annotation of variables/ features (`pd.DataFrame`).""" return self._var @var.setter def var(self, value: pd.DataFrame): self._set_dim_df(value, "var") @var.deleter def var(self): self.var = pd.DataFrame(index=self.var_names) @property def var_names(self) -> pd.Index: """Names of variables (alias for `.var.index`).""" return self.var.index @var_names.setter def var_names(self, names: Sequence[str]): names = self._prep_dim_index(names, "var") self._set_dim_index(names, "var") @property def uns(self) -> MutableMapping: """Unstructured annotation (ordered dictionary).""" uns = self._uns if self.is_view: uns = DictView(uns, view_args=(self, "_uns")) uns = _overloaded_uns(self, uns) return uns @uns.setter def 
uns(self, value: MutableMapping): if not isinstance(value, MutableMapping): raise ValueError( "Only mutable mapping types (e.g. dict) are allowed for `.uns`." ) if isinstance(value, (OverloadedDict, DictView)): value = value.copy() if self.is_view: self._init_as_actual(self.copy()) self._uns = value @uns.deleter def uns(self): self.uns = OrderedDict() @property def obsm(self) -> Union[AxisArrays, AxisArraysView]: """\ Multi-dimensional annotation of observations (mutable structured :class:`~numpy.ndarray`). Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` of length `n_obs`. Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ return self._obsm @obsm.setter def obsm(self, value): obsm = AxisArrays(self, 0, vals=convert_to_dict(value)) if self.is_view: self._init_as_actual(self.copy()) self._obsm = obsm @obsm.deleter def obsm(self): self.obsm = dict() @property def varm(self) -> Union[AxisArrays, AxisArraysView]: """\ Multi-dimensional annotation of variables/features (mutable structured :class:`~numpy.ndarray`). Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` of length `n_vars`. Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. """ return self._varm @varm.setter def varm(self, value): varm = AxisArrays(self, 1, vals=convert_to_dict(value)) if self.is_view: self._init_as_actual(self.copy()) self._varm = varm @varm.deleter def varm(self): self.varm = dict() @property def obsp(self) -> Union[PairwiseArrays, PairwiseArraysView]: """\ Pairwise annotation of observations, a mutable mapping with array-like values. Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` whose first two dimensions are of length `n_obs`. Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ return self._obsp @obsp.setter def obsp(self, value): obsp = PairwiseArrays(self, 0, vals=convert_to_dict(value)) if self.is_view: self._init_as_actual(self.copy()) self._obsp = obsp @obsp.deleter def obsp(self): self.obsp = dict() @property def varp(self) -> Union[PairwiseArrays, PairwiseArraysView]: """\ Pairwise annotation of observations, a mutable mapping with array-like values. Stores for each key a two or higher-dimensional :class:`~numpy.ndarray` whose first two dimensions are of length `n_var`. Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. 
""" return self._varp @varp.setter def varp(self, value): varp = PairwiseArrays(self, 1, vals=convert_to_dict(value)) if self.is_view: self._init_as_actual(self.copy()) self._varp = varp @varp.deleter def varp(self): self.varp = dict() def obs_keys(self) -> List[str]: """List keys of observation annotation :attr:`obs`.""" return self._obs.keys().tolist() def var_keys(self) -> List[str]: """List keys of variable annotation :attr:`var`.""" return self._var.keys().tolist() def obsm_keys(self) -> List[str]: """List keys of observation annotation :attr:`obsm`.""" return list(self._obsm.keys()) def varm_keys(self) -> List[str]: """List keys of variable annotation :attr:`varm`.""" return list(self._varm.keys()) def uns_keys(self) -> List[str]: """List keys of unstructured annotation.""" return sorted(list(self._uns.keys())) @property def isbacked(self) -> bool: """`True` if object is backed on disk, `False` otherwise.""" return self.filename is not None @property def is_view(self) -> bool: """`True` if object is view of another AnnData object, `False` otherwise.""" return self._is_view @property def filename(self) -> Optional[Path]: """\ Change to backing mode by setting the filename of a `.h5ad` file. - Setting the filename writes the stored data to disk. - Setting the filename when the filename was previously another name moves the backing file from the previous file to the new file. If you want to copy the previous file, use `copy(filename='new_filename')`. """ return self.file.filename @filename.setter def filename(self, filename: Optional[PathLike]): # convert early for later comparison filename = None if filename is None else Path(filename) # change from backing-mode back to full loading into memory if filename is None: if self.filename is not None: self.file._to_memory_mode() else: # both filename and self.filename are None # do nothing return else: if self.filename is not None: if self.filename != filename: # write the content of self to the old file # and close the file self.write() self.filename.rename(filename) else: # do nothing return else: # change from memory to backing-mode # write the content of self to disk self.write(filename, force_dense=True) # open new file for accessing self.file.open(filename, "r+") # as the data is stored on disk, we can safely set self._X to None self._X = None def _set_backed(self, attr, value): from .._io.utils import write_attribute write_attribute(self.file._file, attr, value) def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: return _normalize_indices(index, self.obs_names, self.var_names) # TODO: this is not quite complete... def __delitem__(self, index: Index): obs, var = self._normalize_indices(index) # TODO: does this really work? if not self.isbacked: del self._X[obs, var] else: X = self.file["X"] del X[obs, var] self._set_backed("X", X) if var == slice(None): del self._obs.iloc[obs, :] if obs == slice(None): del self._var.iloc[var, :] def __getitem__(self, index: Index) -> "AnnData": """Returns a sliced view of the object.""" oidx, vidx = self._normalize_indices(index) return AnnData(self, oidx=oidx, vidx=vidx, asview=True) def _remove_unused_categories(self, df_full, df_sub, uns): for k in df_full: if not is_categorical_dtype(df_full[k]): continue all_categories = df_full[k].cat.categories with pd.option_context("mode.chained_assignment", None): df_sub[k] = df_sub[k].cat.remove_unused_categories() # also correct the colors... 
color_key = f"{k}_colors" if color_key not in uns: continue color_vec = uns[color_key] if np.array(color_vec).ndim == 0: # Make 0D arrays into 1D ones uns[color_key] = np.array(color_vec)[(None,)] elif len(color_vec) != len(all_categories): # Reset colors del uns[color_key] else: idx = np.where(np.in1d(all_categories, df_sub[k].cat.categories))[0] uns[color_key] = np.array(color_vec)[(idx,)] def rename_categories(self, key: str, categories: Sequence[Any]): """\ Rename categories of annotation `key` in :attr:`obs`, :attr:`var`, and :attr:`uns`. Only supports passing a list/array-like `categories` argument. Besides calling `self.obs[key].cat.categories = categories` – similar for :attr:`var` - this also renames categories in unstructured annotation that uses the categorical annotation `key`. Parameters ---------- key Key for observations or variables annotation. categories New categories, the same number as the old categories. """ if isinstance(categories, Mapping): raise ValueError("Only list-like `categories` is supported.") if key in self.obs: old_categories = self.obs[key].cat.categories.tolist() self.obs[key].cat.rename_categories(categories, inplace=True) elif key in self.var: old_categories = self.var[key].cat.categories.tolist() self.var[key].cat.rename_categories(categories, inplace=True) else: raise ValueError(f"{key} is neither in `.obs` nor in `.var`.") # this is not a good solution # but depends on the scanpy conventions for storing the categorical key # as `groupby` in the `params` slot for k1, v1 in self.uns.items(): if not ( isinstance(v1, Mapping) and "params" in v1 and "groupby" in v1["params"] and v1["params"]["groupby"] == key ): continue for k2, v2 in v1.items(): # picks out the recarrays that are named according to the old # categories if isinstance(v2, np.ndarray) and v2.dtype.names is not None: if list(v2.dtype.names) == old_categories: self.uns[k1][k2].dtype.names = categories else: logger.warning( f"Omitting {k1}/{k2} as old categories do not match." ) def strings_to_categoricals(self, df: Optional[pd.DataFrame] = None): """\ Transform string annotations to categoricals. Only affects string annotations that lead to less categories than the total number of observations. Params ------ df If `df` is `None`, modifies both :attr:`obs` and :attr:`var`, otherwise modifies `df` inplace. Notes ----- Turns the view of an :class:`~anndata.AnnData` into an actual :class:`~anndata.AnnData`. """ dont_modify = False # only necessary for backed views if df is None: dfs = [self.obs, self.var] if self.is_view and self.isbacked: dont_modify = True else: dfs = [df] for df in dfs: string_cols = [ key for key in df.columns if infer_dtype(df[key]) == "string" ] for key in string_cols: c = pd.Categorical(df[key]) # TODO: We should only check if non-null values are unique, but # this would break cases where string columns with nulls could # be written as categorical, but not as string. # Possible solution: https://github.com/theislab/anndata/issues/504 if len(c.categories) >= len(c): continue c.reorder_categories(natsorted(c.categories), inplace=True) if dont_modify: raise RuntimeError( "Please call `.strings_to_categoricals()` on full " "AnnData, not on this view. You might encounter this" "error message while copying or writing to disk." ) if self.is_view: warnings.warn( "Initializing view as actual.", ImplicitModificationWarning ) # If `self` is a view, it will be actualized in the next line, # therefore the previous warning df[key] = c logger.info(f"... 
storing {key!r} as categorical") _sanitize = strings_to_categoricals # backwards compat def _inplace_subset_var(self, index: Index1D): """\ Inplace subsetting along variables dimension. Same as `adata = adata[:, index]`, but inplace. """ adata_subset = self[:, index].copy() self._init_as_actual(adata_subset, dtype=self._X.dtype) def _inplace_subset_obs(self, index: Index1D): """\ Inplace subsetting along variables dimension. Same as `adata = adata[index, :]`, but inplace. """ adata_subset = self[index].copy() self._init_as_actual(adata_subset, dtype=self.X.dtype) # TODO: Update, possibly remove def __setitem__( self, index: Index, val: Union[int, float, np.ndarray, sparse.spmatrix] ): if self.is_view: raise ValueError("Object is view and cannot be accessed with `[]`.") obs, var = self._normalize_indices(index) if not self.isbacked: self._X[obs, var] = val else: X = self.file["X"] X[obs, var] = val self._set_backed("X", X) def __len__(self) -> int: return self.shape[0] def transpose(self) -> "AnnData": """\ Transpose whole object. Data matrix is transposed, observations and variables are interchanged. Ignores `.raw`. """ if not self.isbacked: X = self.X else: X = self.file["X"] if self.is_view: raise ValueError( "You’re trying to transpose a view of an `AnnData`, " "which is currently not implemented. Call `.copy()` before transposing." ) def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix: return m.T.tocsr() if sparse.isspmatrix_csr(m) else m.T return AnnData( X=t_csr(X) if X is not None else None, obs=self.var, var=self.obs, # we're taking a private attributes here to be able to modify uns of the original object uns=self._uns, obsm=self.varm.flipped(), varm=self.obsm.flipped(), obsp=self.varp.copy(), varp=self.obsp.copy(), filename=self.filename, layers={k: t_csr(v) for k, v in self.layers.items()}, dtype=self.X.dtype.name if X is not None else "float32", ) T = property(transpose) def to_df(self, layer=None) -> pd.DataFrame: """\ Generate shallow :class:`~pandas.DataFrame`. The data matrix :attr:`X` is returned as :class:`~pandas.DataFrame`, where :attr:`obs_names` initializes the index, and :attr:`var_names` the columns. * No annotations are maintained in the returned object. * The data matrix is densified in case it is sparse. Params ------ layer : str Key for `.layers`. """ if layer is not None: X = self.layers[layer] else: X = self.X if issparse(X): X = X.toarray() return pd.DataFrame(X, index=self.obs_names, columns=self.var_names) def _get_X(self, use_raw=False, layer=None): """\ Convenience method for getting expression values with common arguments and error handling. """ is_layer = layer is not None if use_raw and is_layer: raise ValueError( "Cannot use expression from both layer and raw. You provided:" f"`use_raw={use_raw}` and `layer={layer}`" ) if is_layer: return self.layers[layer] elif use_raw: if self.raw is None: raise ValueError("This AnnData doesn’t have a value in `.raw`.") return self.raw.X else: return self.X def obs_vector(self, k: str, *, layer: Optional[str] = None) -> np.ndarray: """\ Convenience function for returning a 1 dimensional ndarray of values from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. Made for convenience, not performance. Intentionally permissive about arguments, for easy iterative use. Params ------ k Key to use. Should be in :attr:`var_names` or :attr:`obs`\\ `.columns`. layer What layer values should be returned from. If `None`, :attr:`X` is used. 
Returns ------- A one dimensional ndarray, with values for each obs in the same order as :attr:`obs_names`. """ if layer == "X": if "X" in self.layers: pass else: warnings.warn( "In a future version of AnnData, access to `.X` by passing" " `layer='X'` will be removed. Instead pass `layer=None`.", FutureWarning, ) layer = None return get_vector(self, k, "obs", "var", layer=layer) def var_vector(self, k, *, layer: Optional[str] = None) -> np.ndarray: """\ Convenience function for returning a 1 dimensional ndarray of values from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`var`. Made for convenience, not performance. Intentionally permissive about arguments, for easy iterative use. Params ------ k Key to use. Should be in :attr:`obs_names` or :attr:`var`\\ `.columns`. layer What layer values should be returned from. If `None`, :attr:`X` is used. Returns ------- A one dimensional ndarray, with values for each var in the same order as :attr:`var_names`. """ if layer == "X": if "X" in self.layers: pass else: warnings.warn( "In a future version of AnnData, access to `.X` by passing " "`layer='X'` will be removed. Instead pass `layer=None`.", FutureWarning, ) layer = None return get_vector(self, k, "var", "obs", layer=layer) @utils.deprecated("obs_vector") def _get_obs_array(self, k, use_raw=False, layer=None): """\ Get an array from the layer (default layer='X') along the :attr:`obs` dimension by first looking up `obs.keys` and then :attr:`obs_names`. """ if not use_raw or k in self.obs.columns: return self.obs_vector(k=k, layer=layer) else: return self.raw.obs_vector(k) @utils.deprecated("var_vector") def _get_var_array(self, k, use_raw=False, layer=None): """\ Get an array from the layer (default layer='X') along the :attr:`var` dimension by first looking up `var.keys` and then :attr:`var_names`. """ if not use_raw or k in self.var.columns: return self.var_vector(k=k, layer=layer) else: return self.raw.var_vector(k) def _mutated_copy(self, **kwargs): """Creating AnnData with attributes optionally specified via kwargs.""" if self.isbacked: if "X" not in kwargs or (self.raw is not None and "raw" not in kwargs): raise NotImplementedError( "This function does not currently handle backed objects " "internally, this should be dealt with before." ) new = {} for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "layers"]: if key in kwargs: new[key] = kwargs[key] else: new[key] = getattr(self, key).copy() if "X" in kwargs: new["X"] = kwargs["X"] else: new["X"] = self.X.copy() if "uns" in kwargs: new["uns"] = kwargs["uns"] else: new["uns"] = deepcopy(self._uns) if "raw" in kwargs: new["raw"] = kwargs["raw"] elif self.raw is not None: new["raw"] = self.raw.copy() new["dtype"] = new["X"].dtype return AnnData(**new) def to_memory(self) -> "AnnData": """Load backed AnnData object into memory. Example ------- .. code:: python import anndata backed = anndata.read_h5ad("file.h5ad", backed="r") mem = backed[backed.obs["cluster"] == "a", :].to_memory() """ if not self.isbacked: raise ValueError("Object is already in memory.") else: elems = {"X": to_memory(self.X)} if self.raw is not None: elems["raw"] = { "X": to_memory(self.raw.X), "var": self.raw.var, "varm": self.raw.varm, } adata = self._mutated_copy(**elems) self.file.close() return adata def copy(self, filename: Optional[PathLike] = None) -> "AnnData": """Full copy, optionally on disk.""" if not self.isbacked: if self.is_view: # TODO: How do I unambiguously check if this is a copy?
# Subsetting this way means we don’t have to have a view type # defined for the matrix, which is needed for some of the # current distributed backend. X = _subset(self._adata_ref.X, (self._oidx, self._vidx)).copy() else: X = self.X.copy() return self._mutated_copy(X=X) else: from .._io import read_h5ad from .._io.write import _write_h5ad if filename is None: raise ValueError( "To copy an AnnData object in backed mode, " "pass a filename: `.copy(filename='myfilename.h5ad')`. " "To load the object into memory, use `.to_memory()`." ) mode = self.file._filemode _write_h5ad(filename, self) return read_h5ad(filename, backed=mode) def concatenate( self, *adatas: "AnnData", join: str = "inner", batch_key: str = "batch", batch_categories: Sequence[Any] = None, uns_merge: Optional[str] = None, index_unique: Optional[str] = "-", fill_value=None, ) -> "AnnData": """\ Concatenate along the observations axis. The :attr:`uns`, :attr:`varm` and :attr:`obsm` attributes are ignored. Currently, this works only in `'memory'` mode. .. note:: For more flexible and efficient concatenation, see: :func:`~anndata.concat`. Parameters ---------- adatas AnnData matrices to concatenate with. Each matrix is referred to as a “batch”. join Use intersection (`'inner'`) or union (`'outer'`) of variables. batch_key Add the batch annotation to :attr:`obs` using this key. batch_categories Use these as categories for the batch annotation. By default, use increasing numbers. uns_merge Strategy to use for merging entries of uns. These strategies are applied recursively. Currently implemented strategies include: * `None`: The default. The concatenated object will just have an empty dict for `uns`. * `"same"`: Only entries which have the same value in all AnnData objects are kept. * `"unique"`: Only entries which have one unique value in all AnnData objects are kept. * `"first"`: The first non-missing value is used. * `"only"`: A value is included if only one of the AnnData objects has a value at this path. index_unique Make the index unique by joining the existing index names with the batch category, using `index_unique='-'`, for instance. Provide `None` to keep existing indices. fill_value Scalar value to fill newly missing values in arrays with. Note: only applies to arrays and sparse matrices (not dataframes) and will only be used if `join="outer"`. .. note:: If not provided, the default value is `0` for sparse matrices and `np.nan` for numpy arrays. See the examples below for more information. Returns ------- :class:`~anndata.AnnData` The concatenated :class:`~anndata.AnnData`, where `adata.obs[batch_key]` stores a categorical variable labeling the batch. Notes ----- .. warning:: If you use `join='outer'` this fills 0s for sparse data when variables are absent in a batch. Use this with care. Dense data is filled with `NaN`. See the examples. Examples -------- Joining on intersection of variables. >>> adata1 = AnnData( ... np.array([[1, 2, 3], [4, 5, 6]]), ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), ... dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]), ... ) >>> adata2 = AnnData( ... np.array([[1, 2, 3], [4, 5, 6]]), ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), ... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]), ... ) >>> adata3 = AnnData( ... np.array([[1, 2, 3], [4, 5, 6]]), ... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']), ... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]), ...
) >>> adata = adata1.concatenate(adata2, adata3) >>> adata AnnData object with n_obs × n_vars = 6 × 2 obs: 'anno1', 'anno2', 'batch' var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' >>> adata.X array([[2., 3.], [5., 6.], [3., 2.], [6., 5.], [3., 2.], [6., 5.]], dtype=float32) >>> adata.obs anno1 anno2 batch s1-0 c1 NaN 0 s2-0 c2 NaN 0 s3-1 c3 NaN 1 s4-1 c4 NaN 1 s1-2 NaN d3 2 s2-2 NaN d4 2 >>> adata.var.T b c annoA-0 1 2 annoA-1 2 1 annoA-2 3 2 annoB-2 2 1 Joining on the union of variables. >>> outer = adata1.concatenate(adata2, adata3, join='outer') >>> outer AnnData object with n_obs × n_vars = 6 × 4 obs: 'anno1', 'anno2', 'batch' var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' >>> outer.var.T a b c d annoA-0 0.0 1.0 2.0 NaN annoA-1 NaN 2.0 1.0 0.0 annoA-2 NaN 3.0 2.0 0.0 annoB-2 NaN 2.0 1.0 0.0 >>> outer.var_names Index(['a', 'b', 'c', 'd'], dtype='object') >>> outer.X array([[ 1., 2., 3., nan], [ 4., 5., 6., nan], [nan, 3., 2., 1.], [nan, 6., 5., 4.], [nan, 3., 2., 1.], [nan, 6., 5., 4.]], dtype=float32) >>> outer.X.sum(axis=0) array([nan, 25., 23., nan], dtype=float32) >>> import pandas as pd >>> Xdf = pd.DataFrame(outer.X, columns=outer.var_names) >>> Xdf a b c d 0 1.0 2.0 3.0 NaN 1 4.0 5.0 6.0 NaN 2 NaN 3.0 2.0 1.0 3 NaN 6.0 5.0 4.0 4 NaN 3.0 2.0 1.0 5 NaN 6.0 5.0 4.0 >>> Xdf.sum() a 5.0 b 25.0 c 23.0 d 10.0 dtype: float32 One way to deal with missing values is to use masked arrays: >>> from numpy import ma >>> outer.X = ma.masked_invalid(outer.X) >>> outer.X masked_array( data=[[1.0, 2.0, 3.0, --], [4.0, 5.0, 6.0, --], [--, 3.0, 2.0, 1.0], [--, 6.0, 5.0, 4.0], [--, 3.0, 2.0, 1.0], [--, 6.0, 5.0, 4.0]], mask=[[False, False, False, True], [False, False, False, True], [ True, False, False, False], [ True, False, False, False], [ True, False, False, False], [ True, False, False, False]], fill_value=1e+20, dtype=float32) >>> outer.X.sum(axis=0).data array([ 5., 25., 23., 10.], dtype=float32) The masked array is not saved but has to be reinstantiated after saving. >>> outer.write('./test.h5ad') >>> from anndata import read_h5ad >>> outer = read_h5ad('./test.h5ad') >>> outer.X array([[ 1., 2., 3., nan], [ 4., 5., 6., nan], [nan, 3., 2., 1.], [nan, 6., 5., 4.], [nan, 3., 2., 1.], [nan, 6., 5., 4.]], dtype=float32) For sparse data, everything behaves similarly, except that for `join='outer'`, zeros are added. >>> from scipy.sparse import csr_matrix >>> adata1 = AnnData( ... csr_matrix([[0, 2, 3], [0, 5, 6]]), ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), ... dict(var_names=['a', 'b', 'c']), ... ) >>> adata2 = AnnData( ... csr_matrix([[0, 2, 3], [0, 5, 6]]), ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), ... dict(var_names=['d', 'c', 'b']), ... ) >>> adata3 = AnnData( ... csr_matrix([[1, 2, 0], [0, 5, 6]]), ... dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']), ... dict(var_names=['d', 'c', 'b']), ... 
) >>> adata = adata1.concatenate(adata2, adata3, join='outer') >>> adata.var_names Index(['a', 'b', 'c', 'd'], dtype='object') >>> adata.X.toarray() array([[0., 2., 3., 0.], [0., 5., 6., 0.], [0., 3., 2., 0.], [0., 6., 5., 0.], [0., 0., 2., 1.], [0., 6., 5., 0.]], dtype=float32) """ from .merge import concat, merge_outer, merge_dataframes, merge_same if self.isbacked: raise ValueError("Currently, concatenate does only work in memory mode.") if len(adatas) == 0: return self.copy() elif len(adatas) == 1 and not isinstance(adatas[0], AnnData): adatas = adatas[0] # backwards compatibility all_adatas = (self,) + tuple(adatas) out = concat( all_adatas, axis=0, join=join, label=batch_key, keys=batch_categories, uns_merge=uns_merge, fill_value=fill_value, index_unique=index_unique, pairwise=False, ) # Backwards compat (some of this could be more efficient) # obs used to always be an outer join out.obs = concat( [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], axis=0, join="outer", label=batch_key, keys=batch_categories, index_unique=index_unique, ).obs # Removing varm del out.varm # Implementing old-style merging of var if batch_categories is None: batch_categories = np.arange(len(all_adatas)).astype(str) pat = rf"-({'|'.join(batch_categories)})$" out.var = merge_dataframes( [a.var for a in all_adatas], out.var_names, partial(merge_outer, batch_keys=batch_categories, merge=merge_same), ) out.var = out.var.iloc[ :, ( out.var.columns.str.extract(pat, expand=False) .fillna("") .argsort(kind="stable") ), ] return out def var_names_make_unique(self, join: str = "-"): # Important to go through the setter so obsm dataframes are updated too self.var_names = utils.make_index_unique(self.var.index, join) var_names_make_unique.__doc__ = utils.make_index_unique.__doc__ def obs_names_make_unique(self, join: str = "-"): # Important to go through the setter so obsm dataframes are updated too self.obs_names = utils.make_index_unique(self.obs.index, join) obs_names_make_unique.__doc__ = utils.make_index_unique.__doc__ def _check_uniqueness(self): if not self.obs.index.is_unique: utils.warn_names_duplicates("obs") if not self.var.index.is_unique: utils.warn_names_duplicates("var") def __contains__(self, key: Any): raise AttributeError( "AnnData has no attribute __contains__, don’t check `in adata`." ) def _check_dimensions(self, key=None): if key is None: key = {"obs", "var", "obsm", "varm"} else: key = {key} if "obs" in key and len(self._obs) != self._n_obs: raise ValueError( "Observations annot. `obs` must have number of rows of `X`" f" ({self._n_obs}), but has {self._obs.shape[0]} rows." ) if "var" in key and len(self._var) != self._n_vars: raise ValueError( "Variables annot. `var` must have number of columns of `X`" f" ({self._n_vars}), but has {self._var.shape[0]} rows." ) if "obsm" in key: obsm = self._obsm if ( not all([o.shape[0] == self._n_obs for o in obsm.values()]) and len(obsm.dim_names) != self._n_obs ): raise ValueError( "Observations annot. `obsm` must have number of rows of `X`" f" ({self._n_obs}), but has {len(obsm)} rows." ) if "varm" in key: varm = self._varm if ( not all([v.shape[0] == self._n_vars for v in varm.values()]) and len(varm.dim_names) != self._n_vars ): raise ValueError( "Variables annot. `varm` must have number of columns of `X`" f" ({self._n_vars}), but has {len(varm)} rows." 
) def write_h5ad( self, filename: Optional[PathLike] = None, compression: Optional[Literal["gzip", "lzf"]] = None, compression_opts: Union[int, Any] = None, force_dense: Optional[bool] = None, as_dense: Sequence[str] = (), ): """\ Write `.h5ad`-formatted hdf5 file. .. note:: Setting compression to `'gzip'` can save disk space but will slow down writing and subsequent reading. Prior to v0.6.16, this was the default for parameter `compression`. Generally, if you have sparse data that are stored as a dense matrix, you can dramatically improve performance and reduce disk space by converting to a :class:`~scipy.sparse.csr_matrix`:: from scipy.sparse import csr_matrix adata.X = csr_matrix(adata.X) Parameters ---------- filename Filename of data file. Defaults to backing file. compression See the h5py :ref:`dataset_compression`. compression_opts See the h5py :ref:`dataset_compression`. as_dense Sparse arrays in AnnData object to write as dense. Currently only supports `X` and `raw/X`. force_dense Write sparse data as a dense matrix. Defaults to `True` if object is backed, otherwise to `False`. """ from .._io.write import _write_h5ad if filename is None and not self.isbacked: raise ValueError("Provide a filename!") if filename is None: filename = self.filename _write_h5ad( Path(filename), self, compression=compression, compression_opts=compression_opts, force_dense=force_dense, as_dense=as_dense, ) if self.isbacked: self.file.filename = filename write = write_h5ad # a shortcut and backwards compat def write_csvs(self, dirname: PathLike, skip_data: bool = True, sep: str = ","): """\ Write annotation to `.csv` files. It is not possible to recover the full :class:`~anndata.AnnData` from these files. Use :meth:`write` for this. Parameters ---------- dirname Name of directory to which to export. skip_data Skip the data matrix :attr:`X`. sep Separator for the data. """ from .._io.write import write_csvs write_csvs(dirname, self, skip_data=skip_data, sep=sep) def write_loom(self, filename: PathLike, write_obsm_varm: bool = False): """\ Write `.loom`-formatted hdf5 file. Parameters ---------- filename The filename. """ from .._io.write import write_loom write_loom(filename, self, write_obsm_varm=write_obsm_varm) def write_zarr( self, store: Union[MutableMapping, PathLike], chunks: Union[bool, int, Tuple[int, ...], None] = None, ): """\ Write a hierarchical Zarr array store. Parameters ---------- store The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class. chunks Chunk shape. """ from .._io.write import write_zarr write_zarr(store, self, chunks=chunks) def chunked_X(self, chunk_size: Optional[int] = None): """\ Return an iterator over the rows of the data matrix :attr:`X`. Parameters ---------- chunk_size Row size of a single chunk. """ if chunk_size is None: # Should be some adaptive code chunk_size = 6000 start = 0 n = self.n_obs for _ in range(int(n // chunk_size)): end = start + chunk_size yield (self.X[start:end], start, end) start = end if start < n: yield (self.X[start:n], start, n) def chunk_X( self, select: Union[int, Sequence[int], np.ndarray] = 1000, replace: bool = True, ): """\ Return a chunk of the data matrix :attr:`X` with random or specified indices. Parameters ---------- select Depending on the type: :class:`int` A random chunk with `select` rows will be returned. :term:`sequence` (e.g. a list, tuple or numpy array) of :class:`int` A chunk with these indices will be returned. 
replace If `select` is an integer then `True` means random sampling of indices with replacement, `False` without replacement. """ if isinstance(select, int): select = select if select < self.n_obs else self.n_obs choice = np.random.choice(self.n_obs, select, replace) elif isinstance(select, (np.ndarray, cabc.Sequence)): choice = np.asarray(select) else: raise ValueError("select should be int or array") reverse = None if self.isbacked: # h5py can only slice with a sorted list of unique index values # so random batch with indices [2, 2, 5, 3, 8, 10, 8] will fail # this fixes the problem indices, reverse = np.unique(choice, return_inverse=True) selection = self.X[indices.tolist()] else: selection = self.X[choice] selection = selection.toarray() if issparse(selection) else selection return selection if reverse is None else selection[reverse] # -------------------------------------------------------------------------- # all of the following is for backwards compat # -------------------------------------------------------------------------- @property @utils.deprecated("is_view") def isview(self): return self.is_view def _clean_up_old_format(self, uns): # multicolumn keys # all of the rest is only for backwards compat for bases in [["obs", "smp"], ["var"]]: axis = bases[0] for k in [f"{p}{base}_keys_multicol" for p in ["", "_"] for base in bases]: if uns and k in uns: keys = list(uns[k]) del uns[k] break else: keys = [] # now, for compat, fill the old multicolumn entries into obsm and varm # and remove them from obs and var m_attr = getattr(self, f"_{axis}m") for key in keys: m_attr[key] = self._get_and_delete_multicol_field(axis, key) def _get_and_delete_multicol_field(self, a, key_multicol): keys = [] for k in getattr(self, a).columns: if k.startswith(key_multicol): keys.append(k) values = getattr(self, a)[keys].values getattr(self, a).drop(keys, axis=1, inplace=True) return values anndata-0.7.8/anndata/_core/file_backing.py000066400000000000000000000060021414255741200206030ustar00rootroot00000000000000from functools import singledispatch from os import PathLike from pathlib import Path from typing import Optional, Union, Iterator import h5py from . import anndata from .sparse_dataset import SparseDataset from ..compat import Literal, ZarrArray class AnnDataFileManager: """Backing file manager for AnnData.""" def __init__( self, adata: "anndata.AnnData", filename: Optional[PathLike] = None, filemode: Optional[Literal["r", "r+"]] = None, ): self._adata = adata self.filename = filename self._filemode = filemode self._file = None if filename: self.open() def __repr__(self) -> str: if self.filename is None: return "Backing file manager: no file is set." else: return f"Backing file manager of file {self.filename}." 
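    # The mapping-style dunder methods below simply forward to the wrapped
    # h5py.File, so a backed AnnData can address datasets in its backing file
    # by key. A minimal sketch of typical use, kept as a comment so that
    # nothing runs here; the "X" key reflects the usual .h5ad layout and is an
    # assumption, not something this class enforces:
    #
    #     adata = anndata.read_h5ad("data.h5ad", backed="r")
    #     if "X" in adata.file:         # __contains__
    #         dset = adata.file["X"]    # __getitem__
    #     adata.file.close()            # release the file handle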
def __contains__(self, x) -> bool: return x in self._file def __iter__(self) -> Iterator[str]: return iter(self._file) def __getitem__(self, key: str) -> Union[h5py.Group, h5py.Dataset, SparseDataset]: return self._file[key] def __setitem__( self, key: str, value: Union[h5py.Group, h5py.Dataset, SparseDataset] ): self._file[key] = value def __delitem__(self, key: str): del self._file[key] @property def filename(self) -> Path: return self._filename @filename.setter def filename(self, filename: Optional[PathLike]): self._filename = None if filename is None else Path(filename) def open( self, filename: Optional[PathLike] = None, filemode: Optional[Literal["r", "r+"]] = None, ): if filename is not None: self.filename = filename if filemode is not None: self._filemode = filemode if self.filename is None: raise ValueError("Cannot open backing file if backing not initialized.") self._file = h5py.File(self.filename, self._filemode) def close(self): """Close the backing file, remember filename, do *not* change to memory mode.""" if self._file is not None: self._file.close() def _to_memory_mode(self): """Close the backing file, forget filename, *do* change to memory mode.""" self._adata.__X = self._adata.X[()] self._file.close() self._file = None self._filename = None @property def is_open(self) -> bool: """State of backing file.""" if self._file is None: return False # try accessing the id attribute to see if the file is open return bool(self._file.id) @singledispatch def to_memory(x): """Permissivley convert objects to in-memory representation. If they already are in-memory, (or are just unrecognized) pass a copy through. """ return x.copy() @to_memory.register(ZarrArray) @to_memory.register(h5py.Dataset) def _(x): return x[...] @to_memory.register(SparseDataset) def _(x: SparseDataset): return x.to_memory() anndata-0.7.8/anndata/_core/index.py000066400000000000000000000150341414255741200173220ustar00rootroot00000000000000import collections.abc as cabc from functools import singledispatch from itertools import repeat from typing import Union, Sequence, Optional, Tuple import h5py import numpy as np import pandas as pd from scipy.sparse import spmatrix, issparse Index1D = Union[slice, int, str, np.int64, np.ndarray] Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix] def _normalize_indices( index: Optional[Index], names0: pd.Index, names1: pd.Index ) -> Tuple[slice, slice]: # deal with tuples of length 1 if isinstance(index, tuple) and len(index) == 1: index = index[0] # deal with pd.Series if isinstance(index, pd.Series): index: Index = index.values if isinstance(index, tuple): if len(index) > 2: raise ValueError("AnnData can only be sliced in rows and columns.") # deal with pd.Series # TODO: The series should probably be aligned first if isinstance(index[1], pd.Series): index = index[0], index[1].values if isinstance(index[0], pd.Series): index = index[0].values, index[1] ax0, ax1 = unpack_index(index) ax0 = _normalize_index(ax0, names0) ax1 = _normalize_index(ax1, names1) return ax0, ax1 def _normalize_index( indexer: Union[ slice, np.integer, int, str, Sequence[Union[int, np.integer]], np.ndarray, pd.Index, ], index: pd.Index, ) -> Union[slice, int, np.ndarray]: # ndarray of int if not isinstance(index, pd.RangeIndex): assert ( index.dtype != float and index.dtype != int ), "Don’t call _normalize_index with non-categorical/string names" # the following is insanely slow for sequences, # we replaced it using pandas below def name_idx(i): if isinstance(i, str): i = index.get_loc(i) return i 
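    # What follows dispatches on the indexer type: slices keep their slice
    # form (label bounds are resolved through name_idx above), integers and
    # strings become single positions, and array-likes (boolean masks, label
    # arrays, integer arrays, or column/row shaped sparse and np.matrix
    # inputs) are normalized to integer position arrays.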
if isinstance(indexer, slice): start = name_idx(indexer.start) stop = name_idx(indexer.stop) # string slices can only be inclusive, so +1 in that case if isinstance(indexer.stop, str): stop = None if stop is None else stop + 1 step = indexer.step return slice(start, stop, step) elif isinstance(indexer, (np.integer, int)): return indexer elif isinstance(indexer, str): return index.get_loc(indexer) # int elif isinstance(indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)): if hasattr(indexer, "shape") and ( (indexer.shape == (index.shape[0], 1)) or (indexer.shape == (1, index.shape[0])) ): if isinstance(indexer, spmatrix): indexer = indexer.toarray() indexer = np.ravel(indexer) if not isinstance(indexer, (np.ndarray, pd.Index)): indexer = np.array(indexer) if issubclass(indexer.dtype.type, (np.integer, np.floating)): return indexer # Might not work for range indexes elif issubclass(indexer.dtype.type, np.bool_): if indexer.shape != index.shape: raise IndexError( f"Boolean index does not match AnnData’s shape along this " f"dimension. Boolean index has shape {indexer.shape} while " f"AnnData index has shape {index.shape}." ) positions = np.where(indexer)[0] return positions # np.ndarray[int] else: # indexer should be string array positions = index.get_indexer(indexer) if np.any(positions < 0): not_found = indexer[positions < 0] raise KeyError( f"Values {list(not_found)}, from {list(indexer)}, " "are not valid obs/ var names or indices." ) return positions # np.ndarray[int] else: raise IndexError(f"Unknown indexer {indexer!r} of type {type(indexer)}") def unpack_index(index: Index) -> Tuple[Index1D, Index1D]: if not isinstance(index, tuple): return index, slice(None) elif len(index) == 2: return index elif len(index) == 1: return index[0], slice(None) else: raise IndexError("invalid number of indices") @singledispatch def _subset(a: Union[np.ndarray, pd.DataFrame], subset_idx: Index): # Select as combination of indexes, not coordinates # Correcting for indexing behaviour of np.ndarray if all(isinstance(x, cabc.Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) return a[subset_idx] @_subset.register(spmatrix) def _subset_spmatrix(a: spmatrix, subset_idx: Index): # Correcting for indexing behaviour of sparse.spmatrix if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx): subset_idx = (subset_idx[0].reshape(-1, 1), *subset_idx[1:]) return a[subset_idx] @_subset.register(pd.DataFrame) def _subset_df(df: pd.DataFrame, subset_idx: Index): return df.iloc[subset_idx] # Registration for SparseDataset occurs in sparse_dataset.py @_subset.register(h5py.Dataset) def _subset_dataset(d, subset_idx): if not isinstance(subset_idx, tuple): subset_idx = (subset_idx,) ordered = list(subset_idx) rev_order = [slice(None) for _ in range(len(subset_idx))] for axis, axis_idx in enumerate(ordered.copy()): if isinstance(axis_idx, np.ndarray) and axis_idx.dtype.type != bool: order = np.argsort(axis_idx) ordered[axis] = axis_idx[order] rev_order[axis] = np.argsort(order) # from hdf5, then to real order return d[tuple(ordered)][tuple(rev_order)] def make_slice(idx, dimidx, n=2): mut = list(repeat(slice(None), n)) mut[dimidx] = idx return tuple(mut) def get_vector(adata, k, coldim, idxdim, layer=None): # adata could be self if Raw and AnnData shared a parent dims = ("obs", "var") col = getattr(adata, coldim).columns idx = getattr(adata, f"{idxdim}_names") in_col = k in col in_idx = k in idx if (in_col + in_idx) == 2: raise ValueError( f"Key {k} could be found in both 
.{idxdim}_names and .{coldim}.columns" ) elif (in_col + in_idx) == 0: raise KeyError( f"Could not find key {k} in .{idxdim}_names or .{coldim}.columns." ) elif in_col: return getattr(adata, coldim)[k].values elif in_idx: selected_dim = dims.index(idxdim) idx = adata._normalize_indices(make_slice(k, selected_dim)) a = adata._get_X(layer=layer)[idx] if issparse(a): a = a.toarray() return np.ravel(a) anndata-0.7.8/anndata/_core/merge.py000066400000000000000000000716651414255741200173260ustar00rootroot00000000000000""" Code for merging/ concatenating AnnData objects. """ from collections import OrderedDict from collections.abc import Mapping, MutableSet from functools import reduce, singledispatch from itertools import repeat from operator import and_, or_, sub from typing import Any, Callable, Collection, Iterable, Optional, Tuple, TypeVar, Union import typing from warnings import warn import numpy as np import pandas as pd from scipy import sparse from scipy.sparse.base import spmatrix from .anndata import AnnData from ..compat import Literal from ..utils import asarray T = TypeVar("T") ################### # Utilities ################### # Pretty much just for maintaining order of keys class OrderedSet(MutableSet): def __init__(self, vals=()): self.dict = OrderedDict(zip(vals, repeat(None))) def __contains__(self, val): return val in self.dict def __iter__(self): return iter(self.dict) def __len__(self): return len(self.dict) def __repr__(self): return "OrderedSet: {" + ", ".join(map(str, self)) + "}" def copy(self): return OrderedSet(self.dict.copy()) def add(self, val): self.dict[val] = None def union(self, *vals) -> "OrderedSet": return reduce(or_, vals, self) def discard(self, val): if val in self: del self.dict[val] def difference(self, *vals) -> "OrderedSet": return reduce(sub, vals, self) def union_keys(ds: Collection) -> OrderedSet: return reduce(or_, ds, OrderedSet()) def intersect_keys(ds: Collection) -> OrderedSet: return reduce(and_, map(OrderedSet, ds)) class MissingVal: """Represents a missing value.""" def is_missing(v) -> bool: return v is MissingVal def not_missing(v) -> bool: return v is not MissingVal # We need to be able to check for equality of arrays to know which are the same. # Unfortunatley equality of arrays is poorly defined. # * `np.array_equal` does not work for sparse arrays # * `np.array_equal(..., equal_nan=True)` does not work for null values at the moment # (see https://github.com/numpy/numpy/issues/16377) # So we have to define it ourselves with these two issues in mind. # TODO: Hopefully this will stop being an issue in the future and this code can be removed. 
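# A minimal illustration of the NaN half of the problem, kept entirely in a
# comment so that nothing runs at import time (the exact failure mode for
# sparse inputs depends on the installed numpy/scipy versions):
#
#     >>> import numpy as np
#     >>> np.array_equal(np.array([1.0, np.nan]), np.array([1.0, np.nan]))
#     False
#
# and `equal_nan=True` raises for non-float (e.g. object or string) arrays,
# which is why `equal` below falls back to pandas-based comparisons.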
@singledispatch def equal(a, b) -> bool: return np.array_equal(a, asarray(b)) @equal.register(pd.DataFrame) def equal_dataframe(a, b) -> bool: return a.equals(b) @equal.register(np.ndarray) def equal_array(a, b) -> bool: return equal(pd.DataFrame(a), pd.DataFrame(asarray(b))) @equal.register(sparse.spmatrix) def equal_sparse(a, b) -> bool: # It's a weird api, don't blame me if isinstance(b, sparse.spmatrix): comp = a != b if isinstance(comp, bool): return not comp # fmt: off return ( (len(comp.data) == 0) or ( np.isnan(a[comp]).all() and np.isnan(b[comp]).all() ) ) # fmt: on else: return False def as_sparse(x): if not isinstance(x, sparse.spmatrix): return sparse.csr_matrix(x) else: return x ################### # Per element logic ################### def unique_value(vals: Collection[T]) -> Union[T, MissingVal]: """ Given a collection vals, returns the unique value (if one exists), otherwise returns MissingValue. """ unique_val = vals[0] for v in vals[1:]: if not equal(v, unique_val): return MissingVal return unique_val def first(vals: Collection[T]) -> Union[T, MissingVal]: """ Given a collection of vals, return the first non-missing one.If they're all missing, return MissingVal. """ for val in vals: if not_missing(val): return val return MissingVal def only(vals: Collection[T]) -> Union[T, MissingVal]: """Return the only value in the collection, otherwise MissingVal.""" if len(vals) == 1: return vals[0] else: return MissingVal ################### # Merging ################### def merge_nested(ds: Collection[Mapping], keys_join: Callable, value_join: Callable): out = {} for k in keys_join(ds): v = _merge_nested(ds, k, keys_join, value_join) if not_missing(v): out[k] = v return out def _merge_nested( ds: Collection[Mapping], k, keys_join: Callable, value_join: Callable ): vals = [d[k] for d in ds if k in d] if len(vals) == 0: return MissingVal elif all(isinstance(v, Mapping) for v in vals): new_map = merge_nested(vals, keys_join, value_join) if len(new_map) == 0: return MissingVal else: return new_map else: return value_join(vals) def merge_unique(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, union_keys, unique_value) def merge_same(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, intersect_keys, unique_value) def merge_first(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, union_keys, first) def merge_only(ds: Collection[Mapping]) -> Mapping: return merge_nested(ds, union_keys, only) ################### # Interface ################### # Leaving out for now, it's ugly in the rendered docs and would be adding a dependency. # from typing_extensions import Literal # UNS_STRATEGIES_TYPE = Literal[None, "same", "unique", "first", "only"] MERGE_STRATEGIES = { None: lambda x: {}, "same": merge_same, "unique": merge_unique, "first": merge_first, "only": merge_only, } StrategiesLiteral = Literal["same", "unique", "first", "only"] def resolve_merge_strategy( strategy: Union[str, Callable, None] ) -> Callable[[Collection[Mapping]], Mapping]: if not isinstance(strategy, Callable): strategy = MERGE_STRATEGIES[strategy] return strategy ##################### # Concatenation ##################### class Reindexer(object): """ Indexing to be applied to axis of 2d array orthogonal to the axis being concatenated. Attrs ----- old_idx Original index new_idx Target index old_pos Indices of original index which will be kept new_pos Indices of new index which data from old_pos will be placed in. Together with `old_pos` this forms a mapping. 
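    Example
    -------
    As a small illustration (prose only, not a doctest): reindexing from
    `old_idx = ['a', 'b', 'c']` to `new_idx = ['b', 'c', 'd']` gives
    `old_pos = [1, 2]` and `new_pos = [0, 1]`: the data for "b" and "c" is
    kept and placed at positions 0 and 1 of the result, while "d" has no
    source column and is filled with `fill_value` when the reindexer is
    applied.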
""" def __init__(self, old_idx, new_idx): self.old_idx = old_idx self.new_idx = new_idx self.no_change = new_idx.equals(old_idx) new_pos = new_idx.get_indexer(old_idx) old_pos = np.arange(len(new_pos)) mask = new_pos != -1 self.new_pos = new_pos[mask] self.old_pos = old_pos[mask] def __call__(self, el, *, axis=1, fill_value=None): return self.apply(el, axis=axis, fill_value=fill_value) def apply(self, el, *, axis, fill_value=None): """ Reindex element so el[axis] is aligned to self.new_idx. Missing values are to be replaced with `fill_value`. """ if self.no_change and (el.shape[axis] == len(self.old_idx)): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) elif isinstance(el, sparse.spmatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) def _apply_to_df(self, el: pd.DataFrame, *, axis, fill_value=None): if fill_value is None: fill_value = np.NaN return el.reindex(self.new_idx, axis=axis, fill_value=fill_value) def _apply_to_array(self, el, *, axis, fill_value=None): if fill_value is None: fill_value = default_fill_value([el]) if el.shape[axis] == 0: # Presumably faster since it won't allocate the full array shape = list(el.shape) shape[axis] = len(self.new_idx) return np.broadcast_to(fill_value, tuple(shape)) indexer = self.old_idx.get_indexer(self.new_idx) # Indexes real fast, and does outer indexing return pd.api.extensions.take( el, indexer, axis=axis, allow_fill=True, fill_value=fill_value ) def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: if fill_value is None: fill_value = default_fill_value([el]) if fill_value != 0: to_fill = self.new_idx.get_indexer(self.new_idx.difference(self.old_idx)) else: to_fill = np.array([]) # Fixing outer indexing for missing values if el.shape[axis] == 0: shape = list(el.shape) shape[axis] = len(self.new_idx) shape = tuple(shape) if fill_value == 0: return sparse.csr_matrix(shape) else: return np.broadcast_to(fill_value, shape) fill_idxer = None if len(to_fill) > 0: idxmtx_dtype = np.promote_types(el.dtype, np.array(fill_value).dtype) else: idxmtx_dtype = bool if axis == 1: idxmtx = sparse.coo_matrix( (np.ones(len(self.new_pos), dtype=bool), (self.old_pos, self.new_pos)), shape=(len(self.old_idx), len(self.new_idx)), dtype=idxmtx_dtype, ) out = el @ idxmtx if len(to_fill) > 0: out = out.tocsc() fill_idxer = (slice(None), to_fill) elif axis == 0: idxmtx = sparse.coo_matrix( (np.ones(len(self.new_pos), dtype=bool), (self.new_pos, self.old_pos)), shape=(len(self.new_idx), len(self.old_idx)), dtype=idxmtx_dtype, ) out = idxmtx @ el if len(to_fill) > 0: out = out.tocsr() fill_idxer = (to_fill, slice(None)) if fill_idxer is not None: out[fill_idxer] = fill_value return out def merge_indices( inds: Iterable[pd.Index], join: Literal["inner", "outer"] ) -> pd.Index: if join == "inner": return reduce(lambda x, y: x.intersection(y), inds) elif join == "outer": return reduce(lambda x, y: x.union(y), inds) else: raise ValueError() def default_fill_value(els): """Given some arrays, returns what the default fill value should be. This is largely due to backwards compat, and might not be the ideal solution. """ if any(isinstance(el, sparse.spmatrix) for el in els): return 0 else: return np.nan def gen_reindexer(new_var: pd.Index, cur_var: pd.Index): """ Given a new set of var_names, and a current set, generates a function which will reindex a matrix to be aligned with the new set. 
Usage ----- >>> a = AnnData(sparse.eye(3), var=pd.DataFrame(index=list("abc"))) >>> b = AnnData(sparse.eye(2), var=pd.DataFrame(index=list("ba"))) >>> reindexer = gen_reindexer(a.var_names, b.var_names) >>> sparse.vstack([a.X, reindexer(b.X)]).toarray() array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [0., 1., 0.], [1., 0., 0.]], dtype=float32) """ return Reindexer(cur_var, new_var) def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): arrays = list(arrays) if fill_value is None: fill_value = default_fill_value(arrays) if any(isinstance(a, pd.DataFrame) for a in arrays): # TODO: This is hacky, 0 is a sentinel for outer_concat_aligned_mapping if not all( isinstance(a, pd.DataFrame) or a is MissingVal or 0 in a.shape for a in arrays ): raise NotImplementedError( "Cannot concatenate a dataframe with other array types." ) # TODO: behaviour here should be chosen through a merge strategy df = pd.concat( [f(x) for f, x in zip(reindexers, arrays)], ignore_index=True, axis=axis ) df.index = index return df elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( [ f(as_sparse(a), axis=1 - axis, fill_value=fill_value) for f, a in zip(reindexers, arrays) ], format="csr", ) else: return np.concatenate( [ f(x, fill_value=fill_value, axis=1 - axis) for f, x in zip(reindexers, arrays) ], axis=axis, ) def inner_concat_aligned_mapping(mappings, reindexers=None, index=None, axis=0): result = {} for k in intersect_keys(mappings): els = [m[k] for m in mappings] if reindexers is None: cur_reindexers = gen_inner_reindexers(els, new_index=index, axis=axis) else: cur_reindexers = reindexers result[k] = concat_arrays(els, cur_reindexers, index=index, axis=axis) return result def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): alt_axis = 1 - axis if axis == 0: df_indices = lambda x: x.columns elif axis == 1: df_indices = lambda x: x.indices if all(isinstance(el, pd.DataFrame) for el in els if not_missing(el)): common_ind = reduce( lambda x, y: x.intersection(y), (df_indices(el) for el in els) ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ gen_reindexer(pd.RangeIndex(min_ind), pd.RangeIndex(el.shape[alt_axis])) for el in els ] return reindexers def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): if all(isinstance(el, pd.DataFrame) for el in els if not_missing(el)): reindexers = [ (lambda x: x) if not_missing(el) else (lambda x: pd.DataFrame(index=range(shape))) for el, shape in zip(els, shapes) ] else: # if fill_value is None: # fill_value = default_fill_value(els) max_col = max(el.shape[1] for el in els if not_missing(el)) orig_cols = [el.shape[1] if not_missing(el) else 0 for el in els] reindexers = [ gen_reindexer(pd.RangeIndex(max_col), pd.RangeIndex(n)) for n in orig_cols ] return reindexers def outer_concat_aligned_mapping( mappings, reindexers=None, index=None, fill_value=None, axis=0 ): result = {} ns = [m.parent.shape[axis] for m in mappings] for k in union_keys(mappings): els = [m.get(k, MissingVal) for m in mappings] if reindexers is None: cur_reindexers = gen_outer_reindexers(els, ns, new_index=index, axis=axis) else: cur_reindexers = reindexers # Handling of missing values here is hacky for dataframes # We should probably just handle missing elements for all types result[k] = concat_arrays( [ el if not_missing(el) else np.zeros((n, 0), dtype=bool) for el, n in zip(els, ns) ], 
cur_reindexers, axis=axis, index=index, fill_value=fill_value, ) return result def concat_pairwise_mapping( mappings: Collection[Mapping], shapes: Collection[int], join_keys=intersect_keys ): result = {} for k in join_keys(mappings): els = [ m.get(k, sparse.csr_matrix((s, s), dtype=bool)) for m, s in zip(mappings, shapes) ] result[k] = sparse.block_diag(els, format="csr") return result def merge_dataframes( dfs: Iterable[pd.DataFrame], new_index, merge_strategy=merge_unique ) -> pd.DataFrame: dfs = [df.reindex(index=new_index) for df in dfs] # New dataframe with all shared data new_df = pd.DataFrame(merge_strategy(dfs), index=new_index) return new_df def merge_outer(mappings, batch_keys, *, join_index="-", merge=merge_unique): """ Combine elements of two mappings, such that non-overlapping entries are added with their batch-key appended. Note: this currently does NOT work for nested mappings. Additionally, values are not promised to be unique, and may be overwritten. """ all_keys = union_keys(mappings) out = merge(mappings) for key in all_keys.difference(out.keys()): for b, m in zip(batch_keys, mappings): val = m.get(key, None) if val is not None: out[f"{key}{join_index}{b}"] = val return out def _resolve_dim(*, dim: str = None, axis: int = None) -> Tuple[int, str]: _dims = ("obs", "var") if (dim is None and axis is None) or (dim is not None and axis is not None): raise ValueError( f"Must pass exactly one of `dim` or `axis`. Got: dim={dim}, axis={axis}." ) elif dim is not None and dim not in _dims: raise ValueError(f"`dim` must be one of ('obs', 'var'), was {dim}") elif axis is not None and axis not in (0, 1): raise ValueError(f"`axis` must be either 0 or 1, was {axis}") if dim is not None: return _dims.index(dim), dim else: return axis, _dims[axis] def dim_indices(adata, *, axis=None, dim=None) -> pd.Index: """Helper function to get adata.{dim}_names.""" _, dim = _resolve_dim(axis=axis, dim=dim) return getattr(adata, f"{dim}_names") def dim_size(adata, *, axis=None, dim=None) -> int: """Helper function to get adata.shape[dim].""" ax, _ = _resolve_dim(axis, dim) return adata.shape[ax] def concat( adatas: Union[Collection[AnnData], "typing.Mapping[str, AnnData]"], *, axis: Literal[0, 1] = 0, join: Literal["inner", "outer"] = "inner", merge: Union[StrategiesLiteral, Callable, None] = None, uns_merge: Union[StrategiesLiteral, Callable, None] = None, label: Optional[str] = None, keys: Optional[Collection] = None, index_unique: Optional[str] = None, fill_value: Optional[Any] = None, pairwise: bool = False, ) -> AnnData: """Concatenates AnnData objects along an axis. See the :doc:`concatenation <../concatenation>` section in the docs for a more in-depth description. .. warning:: This function is marked as experimental for the `0.7` release series, and will supercede the :meth:`AnnData.concatenate() ` method in future releases. Params ------ adatas The objects to be concatenated. If a Mapping is passed, keys are used for the `keys` argument and values are concatenated. axis Which axis to concatenate along. join How to align values when concatenating. If "outer", the union of the other axis is taken. If "inner", the intersection. See :doc:`concatenation <../concatenation>` for more. merge How elements not aligned to the axis being concatenated along are selected. Currently implemented strategies include: * `None`: No elements are kept. * `"same"`: Elements that are the same in each of the objects. * `"unique"`: Elements for which there is only one possible value. 
* `"first"`: The first element seen at each from each position. * `"only"`: Elements that show up in only one of the objects. uns_merge How the elements of `.uns` are selected. Uses the same set of strategies as the `merge` argument, except applied recursively. label Column in axis annotation (i.e. `.obs` or `.var`) to place batch information in. If it's None, no column is added. keys Names for each object being added. These values are used for column values for `label` or appended to the index if `index_unique` is not `None`. Defaults to incrementing integer labels. index_unique Whether to make the index unique by using the keys. If provided, this is the delimeter between "{orig_idx}{index_unique}{key}". When `None`, the original indices are kept. fill_value When `join="outer"`, this is the value that will be used to fill the introduced indices. By default, sparse arrays are padded with zeros, while dense arrays and DataFrames are padded with missing values. pairwise Whether pairwise elements along the concatenated dimension should be included. This is False by default, since the resulting arrays are often not meaningful. Notes ----- .. warning:: If you use `join='outer'` this fills 0s for sparse data when variables are absent in a batch. Use this with care. Dense data is filled with `NaN`. Examples -------- Preparing example objects >>> import anndata as ad, pandas as pd, numpy as np >>> from scipy import sparse >>> a = ad.AnnData( ... X=sparse.csr_matrix(np.array([[0, 1], [2, 3]])), ... obs=pd.DataFrame({"group": ["a", "b"]}, index=["s1", "s2"]), ... var=pd.DataFrame(index=["var1", "var2"]), ... varm={"ones": np.ones((2, 5)), "rand": np.random.randn(2, 3), "zeros": np.zeros((2, 5))}, ... uns={"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4}}, ... ) >>> b = ad.AnnData( ... X=sparse.csr_matrix(np.array([[4, 5, 6], [7, 8, 9]])), ... obs=pd.DataFrame({"group": ["b", "c"], "measure": [1.2, 4.3]}, index=["s3", "s4"]), ... var=pd.DataFrame(index=["var1", "var2", "var3"]), ... varm={"ones": np.ones((3, 5)), "rand": np.random.randn(3, 5)}, ... uns={"a": 1, "b": 3, "c": {"c.b": 4}}, ... ) >>> c = ad.AnnData( ... X=sparse.csr_matrix(np.array([[10, 11], [12, 13]])), ... obs=pd.DataFrame({"group": ["a", "b"]}, index=["s1", "s2"]), ... var=pd.DataFrame(index=["var3", "var4"]), ... uns={"a": 1, "b": 4, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}, ... 
) Concatenating along different axes >>> ad.concat([a, b]).to_df() var1 var2 s1 0.0 1.0 s2 2.0 3.0 s3 4.0 5.0 s4 7.0 8.0 >>> ad.concat([a, c], axis=1).to_df() var1 var2 var3 var4 s1 0.0 1.0 10.0 11.0 s2 2.0 3.0 12.0 13.0 Inner and outer joins >>> inner = ad.concat([a, b]) # Joining on intersection of variables >>> inner AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' >>> (inner.obs_names, inner.var_names) # doctest: +NORMALIZE_WHITESPACE (Index(['s1', 's2', 's3', 's4'], dtype='object'), Index(['var1', 'var2'], dtype='object')) >>> outer = ad.concat([a, b], join="outer") # Joining on union of variables >>> outer AnnData object with n_obs × n_vars = 4 × 3 obs: 'group', 'measure' >>> outer.var_names Index(['var1', 'var2', 'var3'], dtype='object') >>> outer.to_df() # Sparse arrays are padded with zeroes by default var1 var2 var3 s1 0.0 1.0 0.0 s2 2.0 3.0 0.0 s3 4.0 5.0 6.0 s4 7.0 8.0 9.0 Keeping track of source objects >>> ad.concat({"a": a, "b": b}, label="batch").obs group batch s1 a a s2 b a s3 b b s4 c b >>> ad.concat([a, b], label="batch", keys=["a", "b"]).obs # Equivalent to previous group batch s1 a a s2 b a s3 b b s4 c b >>> ad.concat({"a": a, "b": b}, index_unique="-").obs group s1-a a s2-a b s3-b b s4-b c Combining values not aligned to axis of concatenation >>> ad.concat([a, b], merge="same") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'ones' >>> ad.concat([a, b], merge="unique") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'ones', 'zeros' >>> ad.concat([a, b], merge="first") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'ones', 'rand', 'zeros' >>> ad.concat([a, b], merge="only") AnnData object with n_obs × n_vars = 4 × 2 obs: 'group' varm: 'zeros' The same merge strategies can be used for elements in `.uns` >>> dict(ad.concat([a, b, c], uns_merge="same").uns) {'a': 1, 'c': {'c.b': 4}} >>> dict(ad.concat([a, b, c], uns_merge="unique").uns) {'a': 1, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} >>> dict(ad.concat([a, b, c], uns_merge="only").uns) {'c': {'c.c': 5}} >>> dict(ad.concat([a, b, c], uns_merge="first").uns) {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} """ # Argument normalization merge = resolve_merge_strategy(merge) uns_merge = resolve_merge_strategy(uns_merge) if isinstance(adatas, Mapping): if keys is not None: raise TypeError( "Cannot specify categories in both mapping keys and using `keys`. " "Only specify this once." 
) keys, adatas = list(adatas.keys()), list(adatas.values()) else: adatas = list(adatas) if keys is None: keys = np.arange(len(adatas)).astype(str) axis, dim = _resolve_dim(axis=axis) alt_axis, alt_dim = _resolve_dim(axis=1 - axis) # Label column label_col = pd.Categorical.from_codes( np.repeat(np.arange(len(adatas)), [a.shape[axis] for a in adatas]), categories=keys, ) # Combining indexes concat_indices = pd.concat( [pd.Series(dim_indices(a, axis=axis)) for a in adatas], ignore_index=True ) if index_unique is not None: concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique) concat_indices = pd.Index(concat_indices) alt_indices = merge_indices( [dim_indices(a, axis=alt_axis) for a in adatas], join=join ) reindexers = [ gen_reindexer(alt_indices, dim_indices(a, axis=alt_axis)) for a in adatas ] # Annotation for concatenation axis concat_annot = pd.concat( [getattr(a, dim) for a in adatas], join=join, ignore_index=True ) concat_annot.index = concat_indices if label is not None: concat_annot[label] = label_col # Annotation for other axis alt_annot = merge_dataframes( [getattr(a, alt_dim) for a in adatas], alt_indices, merge ) X = concat_arrays( [a.X for a in adatas], reindexers, axis=axis, fill_value=fill_value ) if join == "inner": layers = inner_concat_aligned_mapping( [a.layers for a in adatas], axis=axis, reindexers=reindexers ) concat_mapping = inner_concat_aligned_mapping( [getattr(a, f"{dim}m") for a in adatas], index=concat_indices ) if pairwise: concat_pairwise = concat_pairwise_mapping( mappings=[getattr(a, f"{dim}p") for a in adatas], shapes=[a.shape[axis] for a in adatas], join_keys=intersect_keys, ) else: concat_pairwise = {} elif join == "outer": layers = outer_concat_aligned_mapping( [a.layers for a in adatas], reindexers, axis=axis, fill_value=fill_value ) concat_mapping = outer_concat_aligned_mapping( [getattr(a, f"{dim}m") for a in adatas], index=concat_indices, fill_value=fill_value, ) if pairwise: concat_pairwise = concat_pairwise_mapping( mappings=[getattr(a, f"{dim}p") for a in adatas], shapes=[a.shape[axis] for a in adatas], join_keys=union_keys, ) else: concat_pairwise = {} # TODO: Reindex lazily, so we don't have to make those copies until we're sure we need the element alt_mapping = merge( [ {k: r(v, axis=0) for k, v in getattr(a, f"{alt_dim}m").items()} for r, a in zip(reindexers, adatas) ], ) alt_pairwise = merge( [ {k: r(r(v, axis=0), axis=1) for k, v in getattr(a, f"{alt_dim}p").items()} for r, a in zip(reindexers, adatas) ] ) uns = uns_merge([a.uns for a in adatas]) raw = None has_raw = [a.raw is not None for a in adatas] if all(has_raw): raw = concat( [ AnnData( X=a.raw.X, obs=pd.DataFrame(index=a.obs_names), var=a.raw.var, varm=a.raw.varm, ) for a in adatas ], join=join, label=label, keys=keys, index_unique=index_unique, fill_value=fill_value, axis=axis, ) elif any(has_raw): warn( "Only some AnnData objects have `.raw` attribute, " "not concatenating `.raw` attributes.", UserWarning, ) return AnnData( **{ "X": X, "layers": layers, dim: concat_annot, alt_dim: alt_annot, f"{dim}m": concat_mapping, f"{alt_dim}m": alt_mapping, f"{dim}p": concat_pairwise, f"{alt_dim}p": alt_pairwise, "uns": uns, "raw": raw, } ) anndata-0.7.8/anndata/_core/raw.py000066400000000000000000000152121414255741200170020ustar00rootroot00000000000000from typing import Union, Mapping, Sequence, Tuple import h5py import numpy as np import pandas as pd from scipy import sparse from scipy.sparse import issparse from . 
import anndata from .index import _normalize_index, _subset, unpack_index, get_vector from .aligned_mapping import AxisArrays, AxisArraysView from .sparse_dataset import SparseDataset # TODO: Implement views for Raw class Raw: def __init__( self, adata: "anndata.AnnData", X: Union[np.ndarray, sparse.spmatrix, None] = None, var: Union[pd.DataFrame, Mapping[str, Sequence], None] = None, varm: Union[AxisArrays, Mapping[str, np.ndarray], None] = None, ): from .anndata import _gen_dataframe self._adata = adata self._n_obs = adata.n_obs # construct manually if adata.isbacked == (X is None): self._X = X self._var = _gen_dataframe(var, self.X.shape[1], ["var_names"]) self._varm = AxisArrays(self, 1, varm) elif X is None: # construct from adata self._X = adata.X.copy() self._var = adata.var.copy() self._varm = AxisArrays(self, 1, adata.varm.copy()) elif adata.isbacked: raise ValueError("Cannot specify X if adata is backed") def _get_X(self, layer=None): if layer is not None: raise ValueError() return self.X @property def X(self) -> Union[SparseDataset, np.ndarray, sparse.spmatrix]: # TODO: Handle unsorted array of integer indices for h5py.Datasets if not self._adata.isbacked: return self._X if not self._adata.file.is_open: self._adata.file.open() # Handle legacy file formats: if "raw/X" in self._adata.file: X = self._adata.file["raw/X"] elif "raw.X" in self._adata.file: X = self._adata.file["raw.X"] # Backwards compat else: raise AttributeError( f"Could not find dataset for raw X in file: " f"{self._adata.file.filename}." ) if isinstance(X, h5py.Group): X = SparseDataset(X) # Check if we need to subset if self._adata.is_view: # TODO: As noted above, implement views of raw # so we can know if we need to subset by var return _subset(X, (self._adata._oidx, slice(None))) else: return X @property def shape(self): return self.n_obs, self.n_vars @property def var(self): return self._var @property def n_vars(self): return self._var.shape[0] @property def n_obs(self): return self._n_obs @property def varm(self): return self._varm @property def var_names(self): return self.var.index @property def obs_names(self): return self._adata.obs_names def __getitem__(self, index): oidx, vidx = self._normalize_indices(index) # To preserve two dimensional shape if isinstance(vidx, (int, np.integer)): vidx = slice(vidx, vidx + 1, 1) if isinstance(oidx, (int, np.integer)): oidx = slice(oidx, oidx + 1, 1) if not self._adata.isbacked: X = _subset(self.X, (oidx, vidx)) else: X = None var = self._var.iloc[vidx] new = Raw(self._adata, X=X, var=var) if self._varm is not None: # Since there is no view of raws new._varm = self._varm._view(_RawViewHack(self, vidx), (vidx,)).copy() return new def __str__(self): descr = f"Raw AnnData with n_obs × n_vars = {self.n_obs} × {self.n_vars}" for attr in ["var", "varm"]: keys = getattr(self, attr).keys() if len(keys) > 0: descr += f"\n {attr}: {str(list(keys))[1:-1]}" return descr def copy(self): return Raw( self._adata, X=self.X.copy(), var=self.var.copy(), varm=None if self._varm is None else self._varm.copy(), ) def to_adata(self): """Create full AnnData object.""" return anndata.AnnData( X=self.X.copy(), var=self.var.copy(), varm=None if self._varm is None else self._varm.copy(), obs=self._adata.obs.copy(), obsm=self._adata.obsm.copy(), uns=self._adata.uns.copy(), ) def _normalize_indices(self, packed_index): # deal with slicing with pd.Series if isinstance(packed_index, pd.Series): packed_index = packed_index.values if isinstance(packed_index, tuple): if len(packed_index) != 2: raise 
IndexDimError(len(packed_index)) if isinstance(packed_index[1], pd.Series): packed_index = packed_index[0], packed_index[1].values if isinstance(packed_index[0], pd.Series): packed_index = packed_index[0].values, packed_index[1] obs, var = unpack_index(packed_index) obs = _normalize_index(obs, self._adata.obs_names) var = _normalize_index(var, self.var_names) return obs, var def var_vector(self, k: str) -> np.ndarray: # TODO decorator to copy AnnData.var_vector docstring return get_vector(self, k, "var", "obs") def obs_vector(self, k: str) -> np.ndarray: # TODO decorator to copy AnnData.obs_vector docstring idx = self._normalize_indices((slice(None), k)) a = self.X[idx] if issparse(a): a = a.toarray() return np.ravel(a) # This exists to accommodate AlignedMappings, # until we implement a proper RawView or get rid of Raw in favor of modes. class _RawViewHack: def __init__(self, raw: Raw, vidx: Union[slice, np.ndarray]): self.parent_raw = raw self.vidx = vidx @property def shape(self) -> Tuple[int, int]: return self.parent_raw.n_obs, len(self.var_names) @property def obs_names(self) -> pd.Index: return self.parent_raw.obs_names @property def var_names(self) -> pd.Index: return self.parent_raw.var_names[self.vidx] class IndexDimError(IndexError): MSG = ( "You tried to slice an AnnData(View) object with an" "{}-dimensional index, but only 2 dimensions exist in such an object." ) MSG_1D = ( "\nIf you tried to slice cells using adata[cells, ], " "note that Python (unlike R) uses adata[cells, :] as slicing syntax." ) def __init__(self, n_dims: int): msg = self.MSG.format(n_dims) if n_dims == 1: msg += self.MSG_1D super().__init__(msg) anndata-0.7.8/anndata/_core/sparse_dataset.py000066400000000000000000000312101414255741200212070ustar00rootroot00000000000000"""\ This module implements on disk sparse datasets. This code is based on and uses the conventions of h5sparse_ by `Appier Inc.`_. See the copyright and license note in this directory source code. .. _h5sparse: https://github.com/appier/h5sparse .. _Appier Inc.: https://www.appier.com/ """ # TODO: # - think about supporting the COO format import collections.abc as cabc from itertools import accumulate, chain from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type from warnings import warn import h5py import numpy as np import scipy.sparse as ss from scipy.sparse import _sparsetools try: # Not really important, just for IDEs to be more helpful from scipy.sparse.compressed import _cs_matrix except ImportError: _cs_matrix = ss.spmatrix from .index import unpack_index, Index, _subset class BackedFormat(NamedTuple): format_str: str backed_type: Type["BackedSparseMatrix"] memory_type: Type[ss.spmatrix] class BackedSparseMatrix(_cs_matrix): """\ Mixin class for backed sparse matrices. Largely needed for the case `backed_sparse_csr(...)[:]`, since that calls copy on `.data`, `.indices`, and `.indptr`. """ def copy(self) -> ss.spmatrix: if isinstance(self.data, h5py.Dataset): return SparseDataset(self.data.parent).to_memory() else: return super().copy() def _set_many(self, i: Iterable[int], j: Iterable[int], x): """\ Sets value at each (i, j) to x Here (i,j) index major and minor respectively, and must not contain duplicate entries. 
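        Because the matrix is backed by an on-disk dataset, only entries that
        are already part of the sparsity structure can be assigned; attempting
        to introduce a new non-zero value raises a ValueError (see below).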
""" # Scipy 1.3+ compat n_samples = 1 if np.isscalar(x) else len(x) offsets = self._offsets(i, j, n_samples) if -1 not in offsets: # make a list for interaction with h5py offsets = list(offsets) # only affects existing non-zero cells self.data[offsets] = x return else: raise ValueError( "You cannot change the sparsity structure of a SparseDataset." ) # replace where possible # mask = offsets > -1 # # offsets[mask] # bool_data_mask = np.zeros(len(self.data), dtype=bool) # bool_data_mask[offsets[mask]] = True # self.data[bool_data_mask] = x[mask] # # self.data[offsets[mask]] = x[mask] # # only insertions remain # mask = ~mask # i = i[mask] # i[i < 0] += M # j = j[mask] # j[j < 0] += N # self._insert_many(i, j, x[mask]) def _zero_many(self, i: Sequence[int], j: Sequence[int]): """\ Sets value at each (i, j) to zero, preserving sparsity structure. Here (i,j) index major and minor respectively. """ offsets = self._offsets(i, j, len(i)) # only assign zeros to the existing sparsity structure self.data[list(offsets[offsets > -1])] = 0 def _offsets( self, i: Iterable[int], j: Iterable[int], n_samples: int ) -> np.ndarray: i, j, M, N = self._prepare_indices(i, j) offsets = np.empty(n_samples, dtype=self.indices.dtype) ret = _sparsetools.csr_sample_offsets( M, N, self.indptr, self.indices, n_samples, i, j, offsets ) if ret == 1: # rinse and repeat self.sum_duplicates() _sparsetools.csr_sample_offsets( M, N, self.indptr, self.indices, n_samples, i, j, offsets ) return offsets class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix): def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix: return ss.csr_matrix( get_compressed_vector(self, row), shape=(1, self.shape[1]) )[:, col] def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: out_shape = ( slice_len(row, self.shape[0]), slice_len(col, self.shape[1]), ) if out_shape[0] == 1: return self._get_intXslice(slice_as_int(row, self.shape[0]), col) elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]: return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col) return super()._get_sliceXslice(row, col) def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: idxs = np.asarray(row) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csr_matrix( get_compressed_vectors(self, idxs), shape=(len(idxs), self.shape[1]) )[:, col] class backed_csc_matrix(BackedSparseMatrix, ss.csc_matrix): def _get_sliceXint(self, row: slice, col: int) -> ss.csc_matrix: return ss.csc_matrix( get_compressed_vector(self, col), shape=(self.shape[0], 1) )[row, :] def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: out_shape = ( slice_len(row, self.shape[0]), slice_len(col, self.shape[1]), ) if out_shape[1] == 1: return self._get_sliceXint(row, slice_as_int(col, self.shape[1])) elif out_shape[0] == self.shape[0] and out_shape[1] < self.shape[1]: return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1]))) return super()._get_sliceXslice(row, col) def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: idxs = np.asarray(col) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csc_matrix( get_compressed_vectors(self, idxs), shape=(self.shape[0], len(idxs)) )[row, :] FORMATS = [ BackedFormat("csr", backed_csr_matrix, ss.csr_matrix), BackedFormat("csc", backed_csc_matrix, ss.csc_matrix), ] def slice_len(s: slice, l: int) -> int: """Returns length of `a[s]` where `len(a) == l`.""" return len(range(*s.indices(l))) def slice_as_int(s: slice, l: int) -> int: 
"""Converts slices of length 1 to the integer index they’ll access.""" out = list(range(*s.indices(l))) assert len(out) == 1 return out[0] def get_compressed_vectors( x: BackedSparseMatrix, row_idxs: Iterable[int] ) -> Tuple[Sequence, Sequence, Sequence]: slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs] data = np.concatenate([x.data[s] for s in slices]) indices = np.concatenate([x.indices[s] for s in slices]) indptr = list(accumulate(chain((0,), (s.stop - s.start for s in slices)))) return data, indices, indptr def get_compressed_vector( x: BackedSparseMatrix, idx: int ) -> Tuple[Sequence, Sequence, Sequence]: s = slice(*(x.indptr[idx : idx + 2])) data = x.data[s] indices = x.indices[s] indptr = [0, len(data)] return data, indices, indptr def get_format_str(data: ss.spmatrix) -> str: for fmt, _, memory_class in FORMATS: if isinstance(data, memory_class): return fmt raise ValueError(f"Data type {type(data)} is not supported.") def get_memory_class(format_str: str) -> Type[ss.spmatrix]: for fmt, _, memory_class in FORMATS: if format_str == fmt: return memory_class raise ValueError(f"Format string {format_str} is not supported.") def get_backed_class(format_str: str) -> Type[BackedSparseMatrix]: for fmt, backed_class, _ in FORMATS: if format_str == fmt: return backed_class raise ValueError(f"Format string {format_str} is not supported.") class SparseDataset: """Analogous to :class:`h5py.Dataset `, but for sparse matrices.""" def __init__(self, group: h5py.Group): self.group = group @property def dtype(self) -> np.dtype: return self.group["data"].dtype @property def format_str(self) -> str: if "h5sparse_format" in self.group.attrs: return self.group.attrs["h5sparse_format"] else: # Should this be an extra field? return self.group.attrs["encoding-type"].replace("_matrix", "") @property def h5py_group(self) -> h5py.Group: warn( "Attribute `h5py_group` of SparseDatasets is deprecated. " "Use `group` instead.", DeprecationWarning, ) return self.group @property def name(self) -> str: return self.group.name @property def file(self) -> h5py.File: return self.group.file @property def shape(self) -> Tuple[int, int]: shape = self.group.attrs.get("h5sparse_shape") return tuple(self.group.attrs["shape"] if shape is None else shape) @property def value(self) -> ss.spmatrix: return self.to_memory() def __repr__(self) -> str: return ( f"' ) def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]: row, col = self._normalize_index(index) mtx = self.to_backed() return mtx[row, col] def __setitem__(self, index: Union[Index, Tuple[()]], value): row, col = self._normalize_index(index) mock_matrix = self.to_backed() mock_matrix[row, col] = value def _normalize_index( self, index: Union[Index, Tuple[()]] ) -> Tuple[np.ndarray, np.ndarray]: if index == (): index = slice(None) row, col = unpack_index(index) if all(isinstance(x, cabc.Iterable) for x in (row, col)): row, col = np.ix_(row, col) return row, col def append(self, sparse_matrix: ss.spmatrix): # Prep variables shape = self.shape if isinstance(sparse_matrix, SparseDataset): sparse_matrix = sparse_matrix.to_backed() # Check input if not ss.isspmatrix(sparse_matrix): raise NotImplementedError( "Currently, only sparse matrices of equivalent format can be " "appended to a SparseDataset." ) if self.format_str not in {"csr", "csc"}: raise NotImplementedError( f"The append method for format {self.format_str} " f"is not implemented." 
) if self.format_str != get_format_str(sparse_matrix): raise ValueError( f"Matrices must have same format. Currently are " f"{self.format_str!r} and {get_format_str(sparse_matrix)!r}" ) # shape if self.format_str == "csr": assert ( shape[1] == sparse_matrix.shape[1] ), "CSR matrices must have same size of dimension 1 to be appended." new_shape = (shape[0] + sparse_matrix.shape[0], shape[1]) elif self.format_str == "csc": assert ( shape[0] == sparse_matrix.shape[0] ), "CSC matrices must have same size of dimension 0 to be appended." new_shape = (shape[0], shape[1] + sparse_matrix.shape[1]) else: assert False, "We forgot to update this branching to a new format" if "h5sparse_shape" in self.group.attrs: del self.group.attrs["h5sparse_shape"] self.group.attrs["shape"] = new_shape # data data = self.group["data"] orig_data_size = data.shape[0] data.resize((orig_data_size + sparse_matrix.data.shape[0],)) data[orig_data_size:] = sparse_matrix.data # indptr indptr = self.group["indptr"] orig_data_size = indptr.shape[0] append_offset = indptr[-1] indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,)) indptr[orig_data_size:] = ( sparse_matrix.indptr[1:].astype(np.int64) + append_offset ) # indices indices = self.group["indices"] orig_data_size = indices.shape[0] indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) indices[orig_data_size:] = sparse_matrix.indices def to_backed(self) -> BackedSparseMatrix: format_class = get_backed_class(self.format_str) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"] mtx.indices = self.group["indices"] mtx.indptr = self.group["indptr"][:] return mtx def to_memory(self) -> ss.spmatrix: format_class = get_memory_class(self.format_str) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"][...] mtx.indices = self.group["indices"][...] mtx.indptr = self.group["indptr"][...] return mtx @_subset.register(SparseDataset) def subset_sparsedataset(d, subset_idx): return d[subset_idx] anndata-0.7.8/anndata/_core/views.py000066400000000000000000000120071414255741200173450ustar00rootroot00000000000000from contextlib import contextmanager from copy import deepcopy from functools import reduce, singledispatch, wraps from typing import Any, KeysView, Optional, Sequence, Tuple import numpy as np import pandas as pd from pandas.api.types import is_bool_dtype from scipy import sparse from .access import ElementRef from ..logging import anndata_logger as logger from ..compat import ZappyArray class _SetItemMixin: """\ Class which (when values are being set) lets their parent AnnData view know, so it can make a copy of itself. This implements copy-on-modify semantics for views of AnnData objects. """ def __setitem__(self, idx: Any, value: Any): if self._view_args is None: super().__setitem__(idx, value) else: logger.warning( f"Trying to set attribute `.{self._view_args.attrname}` of view, copying." 
) with self._update() as container: container[idx] = value @contextmanager def _update(self): adata_view, attr_name, keys = self._view_args new = adata_view.copy() attr = getattr(new, attr_name) container = reduce(lambda d, k: d[k], keys, attr) yield container adata_view._init_as_actual(new) class _ViewMixin(_SetItemMixin): def __init__( self, *args, view_args: Tuple["anndata.AnnData", str, Tuple[str, ...]] = None, **kwargs, ): if view_args is not None: view_args = ElementRef(*view_args) self._view_args = view_args super().__init__(*args, **kwargs) # TODO: This makes `deepcopy(obj)` return `obj._view_args.parent._adata_ref`, fix it def __deepcopy__(self, memo): parent, attrname, keys = self._view_args return deepcopy(getattr(parent._adata_ref, attrname)) class ArrayView(_SetItemMixin, np.ndarray): def __new__( cls, input_array: Sequence[Any], view_args: Tuple["anndata.AnnData", str, Tuple[str, ...]] = None, ): arr = np.asanyarray(input_array).view(cls) if view_args is not None: view_args = ElementRef(*view_args) arr._view_args = view_args return arr def __array_finalize__(self, obj: Optional[np.ndarray]): if obj is not None: self._view_args = getattr(obj, "_view_args", None) def keys(self) -> KeysView[str]: # it’s a structured array return self.dtype.names def copy(self, order: str = "C") -> np.ndarray: # we want a conventional array return np.array(self) def toarray(self) -> np.ndarray: return self.copy() # Unlike array views, SparseCSRView and SparseCSCView # do not propagate through subsetting class SparseCSRView(_ViewMixin, sparse.csr_matrix): pass class SparseCSCView(_ViewMixin, sparse.csc_matrix): pass class DictView(_ViewMixin, dict): pass class DataFrameView(_ViewMixin, pd.DataFrame): _metadata = ["_view_args"] @wraps(pd.DataFrame.drop) def drop(self, *args, inplace: bool = False, **kw): if not inplace: return self.copy().drop(*args, **kw) with self._update() as df: df.drop(*args, inplace=True, **kw) @singledispatch def as_view(obj, view_args): raise NotImplementedError(f"No view type has been registered for {type(obj)}") @as_view.register(np.ndarray) def as_view_array(array, view_args): return ArrayView(array, view_args=view_args) @as_view.register(pd.DataFrame) def as_view_df(df, view_args): return DataFrameView(df, view_args=view_args) @as_view.register(sparse.csr_matrix) def as_view_csr(mtx, view_args): return SparseCSRView(mtx, view_args=view_args) @as_view.register(sparse.csc_matrix) def as_view_csc(mtx, view_args): return SparseCSCView(mtx, view_args=view_args) @as_view.register(dict) def as_view_dict(d, view_args): return DictView(d, view_args=view_args) @as_view.register(ZappyArray) def as_view_zappy(z, view_args): # Previous code says ZappyArray works as view, # but as far as I can tell they’re immutable. 
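    # Returning the object unchanged means ZappyArray elements are never
    # wrapped, so the copy-on-modify semantics provided by _SetItemMixin do
    # not apply to them.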
return z def _resolve_idxs(old, new, adata): t = tuple(_resolve_idx(old[i], new[i], adata.shape[i]) for i in (0, 1)) return t @singledispatch def _resolve_idx(old, new, l): return old[new] @_resolve_idx.register(np.ndarray) def _resolve_idx_ndarray(old, new, l): if is_bool_dtype(old): old = np.where(old)[0] return old[new] @_resolve_idx.register(np.integer) @_resolve_idx.register(int) def _resolve_idx_scalar(old, new, l): return np.array([old])[new] @_resolve_idx.register(slice) def _resolve_idx_slice(old, new, l): if isinstance(new, slice): return _resolve_idx_slice_slice(old, new, l) else: return np.arange(*old.indices(l))[new] def _resolve_idx_slice_slice(old, new, l): r = range(*old.indices(l))[new] # Convert back to slice start, stop, step = r.start, r.stop, r.step if len(r) == 0: stop = start elif stop < 0: stop = None return slice(start, stop, step) anndata-0.7.8/anndata/_io/000077500000000000000000000000001414255741200153155ustar00rootroot00000000000000anndata-0.7.8/anndata/_io/__init__.py000066400000000000000000000004421414255741200174260ustar00rootroot00000000000000class WriteWarning(UserWarning): pass from .read import ( read_csv, read_excel, read_umi_tools, read_hdf, read_loom, read_mtx, read_text, read_zarr, read_h5ad, ) from .write import write_csvs, write_loom, _write_h5ad, write_zarr from . import h5ad anndata-0.7.8/anndata/_io/h5ad.py000066400000000000000000000512651414255741200165210ustar00rootroot00000000000000import re import collections.abc as cabc from functools import _find_impl, partial from warnings import warn from pathlib import Path from types import MappingProxyType from typing import Callable, Type, TypeVar, Union from typing import Collection, Sequence, Mapping import h5py import numpy as np import pandas as pd from pandas.api.types import is_categorical_dtype from scipy import sparse from .._core.sparse_dataset import SparseDataset from .._core.file_backing import AnnDataFileManager from .._core.anndata import AnnData from .._core.raw import Raw from ..compat import ( _from_fixed_length_strings, _decode_structured_array, _clean_uns, Literal, ) from .utils import ( H5PY_V3, check_key, report_read_key_on_error, report_write_key_on_error, idx_chunks_along_axis, write_attribute, read_attribute, _read_legacy_raw, EncodingVersions, ) H5Group = Union[h5py.Group, h5py.File] H5Dataset = Union[h5py.Dataset] T = TypeVar("T") def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: """This corrects compound dtypes to work with hdf5 files.""" new_dtype = [] for dt_name, (dt_type, _) in value.dtype.fields.items(): if dt_type.kind in ("U", "O"): new_dtype.append((dt_name, h5py.special_dtype(vlen=str))) else: new_dtype.append((dt_name, dt_type)) return value.astype(new_dtype) def write_h5ad( filepath: Union[Path, str], adata: AnnData, *, force_dense: bool = None, as_dense: Sequence[str] = (), dataset_kwargs: Mapping = MappingProxyType({}), **kwargs, ) -> None: if force_dense is not None: warn( "The `force_dense` argument is deprecated. 
Use `as_dense` instead.", FutureWarning, ) if force_dense is True: if adata.raw is not None: as_dense = ("X", "raw/X") else: as_dense = ("X",) if isinstance(as_dense, str): as_dense = [as_dense] if "raw.X" in as_dense: as_dense = list(as_dense) as_dense[as_dense.index("raw.X")] = "raw/X" if any(val not in {"X", "raw/X"} for val in as_dense): raise NotImplementedError( "Currently, only `X` and `raw/X` are supported values in `as_dense`" ) if "raw/X" in as_dense and adata.raw is None: raise ValueError("Cannot specify writing `raw/X` to dense if it doesn’t exist.") adata.strings_to_categoricals() if adata.raw is not None: adata.strings_to_categoricals(adata.raw.var) dataset_kwargs = {**dataset_kwargs, **kwargs} filepath = Path(filepath) mode = "a" if adata.isbacked else "w" if adata.isbacked: # close so that we can reopen below adata.file.close() with h5py.File(filepath, mode) as f: if "X" in as_dense and isinstance(adata.X, (sparse.spmatrix, SparseDataset)): write_sparse_as_dense(f, "X", adata.X, dataset_kwargs=dataset_kwargs) elif not (adata.isbacked and Path(adata.filename) == Path(filepath)): # If adata.isbacked, X should already be up to date write_attribute(f, "X", adata.X, dataset_kwargs=dataset_kwargs) if "raw/X" in as_dense and isinstance( adata.raw.X, (sparse.spmatrix, SparseDataset) ): write_sparse_as_dense( f, "raw/X", adata.raw.X, dataset_kwargs=dataset_kwargs ) write_attribute(f, "raw/var", adata.raw.var, dataset_kwargs=dataset_kwargs) write_attribute( f, "raw/varm", adata.raw.varm, dataset_kwargs=dataset_kwargs ) else: write_attribute(f, "raw", adata.raw, dataset_kwargs=dataset_kwargs) write_attribute(f, "obs", adata.obs, dataset_kwargs=dataset_kwargs) write_attribute(f, "var", adata.var, dataset_kwargs=dataset_kwargs) write_attribute(f, "obsm", adata.obsm, dataset_kwargs=dataset_kwargs) write_attribute(f, "varm", adata.varm, dataset_kwargs=dataset_kwargs) write_attribute(f, "obsp", adata.obsp, dataset_kwargs=dataset_kwargs) write_attribute(f, "varp", adata.varp, dataset_kwargs=dataset_kwargs) write_attribute(f, "layers", adata.layers, dataset_kwargs=dataset_kwargs) write_attribute(f, "uns", adata.uns, dataset_kwargs=dataset_kwargs) def _write_method(cls: Type[T]) -> Callable[[H5Group, str, T], None]: return _find_impl(cls, H5AD_WRITE_REGISTRY) @write_attribute.register(h5py.File) @write_attribute.register(h5py.Group) def write_attribute_h5ad(f: H5Group, key: str, value, *args, **kwargs): if key in f: del f[key] _write_method(type(value))(f, key, value, *args, **kwargs) def write_raw(f, key, value, dataset_kwargs=MappingProxyType({})): group = f.create_group(key) group.attrs["encoding-type"] = "raw" group.attrs["encoding-version"] = EncodingVersions.raw.value group.attrs["shape"] = value.shape write_attribute(f, "raw/X", value.X, dataset_kwargs=dataset_kwargs) write_attribute(f, "raw/var", value.var, dataset_kwargs=dataset_kwargs) write_attribute(f, "raw/varm", value.varm, dataset_kwargs=dataset_kwargs) @report_write_key_on_error def write_not_implemented(f, key, value, dataset_kwargs=MappingProxyType({})): # If it’s not an array, try and make it an array. If that fails, pickle it. # Maybe rethink that, maybe this should just pickle, # and have explicit implementations for everything else raise NotImplementedError( f"Failed to write value for {key}, " f"since a writer for type {type(value)} has not been implemented yet." 
) @report_write_key_on_error def write_basic(f, key, value, dataset_kwargs=MappingProxyType({})): f.create_dataset(key, data=value, **dataset_kwargs) @report_write_key_on_error def write_list(f, key, value, dataset_kwargs=MappingProxyType({})): write_array(f, key, np.array(value), dataset_kwargs=dataset_kwargs) @report_write_key_on_error def write_none(f, key, value, dataset_kwargs=MappingProxyType({})): pass @report_write_key_on_error def write_scalar(f, key, value, dataset_kwargs=MappingProxyType({})): # Can’t compress scalars, error is thrown # TODO: Add more terms to filter once they're supported by dataset_kwargs key_filter = {"compression", "compression_opts"} dataset_kwargs = {k: v for k, v in dataset_kwargs.items() if k not in key_filter} write_array(f, key, np.array(value), dataset_kwargs=dataset_kwargs) @report_write_key_on_error def write_array(f, key, value, dataset_kwargs=MappingProxyType({})): # Convert unicode to fixed length strings if value.dtype.kind in {"U", "O"}: value = value.astype(h5py.special_dtype(vlen=str)) elif value.dtype.names is not None: value = _to_hdf5_vlen_strings(value) f.create_dataset(key, data=value, **dataset_kwargs) @report_write_key_on_error def write_sparse_compressed( f, key, value, fmt: Literal["csr", "csc"], dataset_kwargs=MappingProxyType({}) ): g = f.create_group(key) g.attrs["encoding-type"] = f"{fmt}_matrix" g.attrs["encoding-version"] = EncodingVersions[f"{fmt}_matrix"].value g.attrs["shape"] = value.shape # Allow resizing if "maxshape" not in dataset_kwargs: dataset_kwargs = dict(maxshape=(None,), **dataset_kwargs) g.create_dataset("data", data=value.data, **dataset_kwargs) g.create_dataset("indices", data=value.indices, **dataset_kwargs) g.create_dataset("indptr", data=value.indptr, **dataset_kwargs) write_csr = partial(write_sparse_compressed, fmt="csr") write_csc = partial(write_sparse_compressed, fmt="csc") @report_write_key_on_error def write_sparse_dataset(f, key, value, dataset_kwargs=MappingProxyType({})): write_sparse_compressed( f, key, value.to_backed(), fmt=value.format_str, dataset_kwargs=dataset_kwargs ) @report_write_key_on_error def write_sparse_as_dense(f, key, value, dataset_kwargs=MappingProxyType({})): real_key = None # Flag for if temporary key was used if key in f: if ( isinstance(value, (h5py.Group, h5py.Dataset, SparseDataset)) and value.file.filename == f.filename ): # Write to temporary key before overwriting real_key = key # Transform key to temporary, e.g. 
raw/X -> raw/_X, or X -> _X key = re.sub(r"(.*)(\w(?!.*/))", r"\1_\2", key.rstrip("/")) else: del f[key] # Wipe before write dset = f.create_dataset(key, shape=value.shape, dtype=value.dtype, **dataset_kwargs) compressed_axis = int(isinstance(value, sparse.csc_matrix)) for idx in idx_chunks_along_axis(value.shape, compressed_axis, 1000): dset[idx] = value[idx].toarray() if real_key is not None: del f[real_key] f[real_key] = f[key] del f[key] @report_write_key_on_error def write_dataframe(f, key, df, dataset_kwargs=MappingProxyType({})): # Check arguments for reserved in ("__categories", "_index"): if reserved in df.columns: raise ValueError(f"{reserved!r} is a reserved name for dataframe columns.") col_names = [check_key(c) for c in df.columns] if df.index.name is not None: index_name = df.index.name else: index_name = "_index" index_name = check_key(index_name) group = f.create_group(key) group.attrs["encoding-type"] = "dataframe" group.attrs["encoding-version"] = EncodingVersions.dataframe.value group.attrs["column-order"] = col_names group.attrs["_index"] = index_name write_series(group, index_name, df.index, dataset_kwargs=dataset_kwargs) for col_name, (_, series) in zip(col_names, df.items()): write_series(group, col_name, series, dataset_kwargs=dataset_kwargs) @report_write_key_on_error def write_series(group, key, series, dataset_kwargs=MappingProxyType({})): # group here is an h5py type, otherwise categoricals won’t write if series.dtype == object: # Assuming it’s string group.create_dataset( key, data=series.values, dtype=h5py.special_dtype(vlen=str), **dataset_kwargs, ) elif is_categorical_dtype(series): # This should work for categorical Index and Series categorical: pd.Categorical = series.values categories: np.ndarray = categorical.categories.values codes: np.ndarray = categorical.codes category_key = f"__categories/{key}" write_array(group, category_key, categories, dataset_kwargs=dataset_kwargs) write_array(group, key, codes, dataset_kwargs=dataset_kwargs) group[key].attrs["categories"] = group[category_key].ref group[category_key].attrs["ordered"] = categorical.ordered else: write_array(group, key, series.values, dataset_kwargs=dataset_kwargs) def write_mapping(f, key, value, dataset_kwargs=MappingProxyType({})): for sub_key, sub_value in value.items(): write_attribute(f, f"{key}/{sub_key}", sub_value, dataset_kwargs=dataset_kwargs) H5AD_WRITE_REGISTRY = { Raw: write_raw, object: write_not_implemented, h5py.Dataset: write_basic, list: write_list, type(None): write_none, str: write_scalar, float: write_scalar, np.floating: write_scalar, bool: write_scalar, np.bool_: write_scalar, int: write_scalar, np.integer: write_scalar, np.ndarray: write_array, sparse.csr_matrix: write_csr, sparse.csc_matrix: write_csc, SparseDataset: write_sparse_dataset, pd.DataFrame: write_dataframe, cabc.Mapping: write_mapping, } def read_h5ad_backed(filename: Union[str, Path], mode: Literal["r", "r+"]) -> AnnData: d = dict(filename=filename, filemode=mode) f = h5py.File(filename, mode) attributes = ["obsm", "varm", "obsp", "varp", "uns", "layers"] df_attributes = ["obs", "var"] d.update({k: read_attribute(f[k]) for k in attributes if k in f}) for k in df_attributes: if k in f: # Backwards compat d[k] = read_dataframe(f[k]) d["raw"] = _read_raw(f, attrs={"var", "varm"}) X_dset = f.get("X", None) if X_dset is None: pass elif isinstance(X_dset, h5py.Group): d["dtype"] = X_dset["data"].dtype elif hasattr(X_dset, "dtype"): d["dtype"] = f["X"].dtype else: raise ValueError() _clean_uns(d) return 
AnnData(**d) def read_h5ad( filename: Union[str, Path], backed: Union[Literal["r", "r+"], bool, None] = None, *, as_sparse: Sequence[str] = (), as_sparse_fmt: Type[sparse.spmatrix] = sparse.csr_matrix, chunk_size: int = 6000, # TODO, probably make this 2d chunks ) -> AnnData: """\ Read `.h5ad`-formatted hdf5 file. Parameters ---------- filename File name of data file. backed If `'r'`, load :class:`~anndata.AnnData` in `backed` mode instead of fully loading it into memory (`memory` mode). If you want to modify backed attributes of the AnnData object, you need to choose `'r+'`. as_sparse If an array was saved as dense, passing its name here will read it as a sparse_matrix, by chunk of size `chunk_size`. as_sparse_fmt Sparse format class to read elements from `as_sparse` in as. chunk_size Used only when loading sparse dataset that is stored as dense. Loading iterates through chunks of the dataset of this row size until it reads the whole dataset. Higher size means higher memory consumption and higher (to a point) loading speed. """ if backed not in {None, False}: mode = backed if mode is True: mode = "r+" assert mode in {"r", "r+"} return read_h5ad_backed(filename, mode) if as_sparse_fmt not in (sparse.csr_matrix, sparse.csc_matrix): raise NotImplementedError( "Dense formats can only be read to CSR or CSC matrices at this time." ) if isinstance(as_sparse, str): as_sparse = [as_sparse] else: as_sparse = list(as_sparse) for i in range(len(as_sparse)): if as_sparse[i] in {("raw", "X"), "raw.X"}: as_sparse[i] = "raw/X" elif as_sparse[i] not in {"raw/X", "X"}: raise NotImplementedError( "Currently only `X` and `raw/X` can be read as sparse." ) rdasp = partial( read_dense_as_sparse, sparse_format=as_sparse_fmt, axis_chunk=chunk_size ) with h5py.File(filename, "r") as f: d = {} for k in f.keys(): # Backwards compat for old raw if k == "raw" or k.startswith("raw."): continue if k == "X" and "X" in as_sparse: d[k] = rdasp(f[k]) elif k == "raw": assert False, "unexpected raw format" elif k in {"obs", "var"}: d[k] = read_dataframe(f[k]) else: # Base case d[k] = read_attribute(f[k]) d["raw"] = _read_raw(f, as_sparse, rdasp) X_dset = f.get("X", None) if X_dset is None: pass elif isinstance(X_dset, h5py.Group): d["dtype"] = X_dset["data"].dtype elif hasattr(X_dset, "dtype"): d["dtype"] = f["X"].dtype else: raise ValueError() _clean_uns(d) # backwards compat return AnnData(**d) def _read_raw( f: Union[h5py.File, AnnDataFileManager], as_sparse: Collection[str] = (), rdasp: Callable[[h5py.Dataset], sparse.spmatrix] = None, *, attrs: Collection[str] = ("X", "var", "varm"), ): if as_sparse: assert rdasp is not None, "must supply rdasp if as_sparse is supplied" raw = {} if "X" in attrs and "raw/X" in f: read_x = rdasp if "raw/X" in as_sparse else read_attribute raw["X"] = read_x(f["raw/X"]) for v in ("var", "varm"): if v in attrs and f"raw/{v}" in f: raw[v] = read_attribute(f[f"raw/{v}"]) return _read_legacy_raw(f, raw, read_dataframe, read_attribute, attrs=attrs) @report_read_key_on_error def read_dataframe_legacy(dataset) -> pd.DataFrame: """Read pre-anndata 0.7 dataframes.""" if H5PY_V3: df = pd.DataFrame( _decode_structured_array( _from_fixed_length_strings(dataset[()]), dtype=dataset.dtype ) ) else: df = pd.DataFrame(_from_fixed_length_strings(dataset[()])) df.set_index(df.columns[0], inplace=True) return df @report_read_key_on_error def read_dataframe(group) -> pd.DataFrame: if not isinstance(group, h5py.Group): return read_dataframe_legacy(group) columns = list(group.attrs["column-order"]) idx_key = 
group.attrs["_index"] df = pd.DataFrame( {k: read_series(group[k]) for k in columns}, index=read_series(group[idx_key]), columns=list(columns), ) if idx_key != "_index": df.index.name = idx_key return df @report_read_key_on_error def read_series(dataset) -> Union[np.ndarray, pd.Categorical]: if "categories" in dataset.attrs: categories = dataset.attrs["categories"] if isinstance(categories, h5py.Reference): categories_dset = dataset.parent[dataset.attrs["categories"]] categories = read_dataset(categories_dset) ordered = bool(categories_dset.attrs.get("ordered", False)) else: # TODO: remove this code at some point post 0.7 # TODO: Add tests for this warn( f"Your file {str(dataset.file.name)!r} has invalid categorical " "encodings due to being written from a development version of " "AnnData. Rewrite the file ensure you can read it in the future.", FutureWarning, ) return pd.Categorical.from_codes( read_dataset(dataset), categories, ordered=ordered ) else: return read_dataset(dataset) # @report_read_key_on_error # def read_sparse_dataset_backed(group: h5py.Group) -> sparse.spmatrix: # return SparseDataset(group) @read_attribute.register(h5py.Group) @report_read_key_on_error def read_group(group: h5py.Group) -> Union[dict, pd.DataFrame, sparse.spmatrix]: if "h5sparse_format" in group.attrs: # Backwards compat return SparseDataset(group).to_memory() encoding_type = group.attrs.get("encoding-type") if encoding_type: EncodingVersions[encoding_type].check( group.name, group.attrs["encoding-version"] ) if encoding_type in {None, "raw"}: pass elif encoding_type == "dataframe": return read_dataframe(group) elif encoding_type in {"csr_matrix", "csc_matrix"}: return SparseDataset(group).to_memory() else: raise ValueError(f"Unfamiliar `encoding-type`: {encoding_type}.") d = dict() for sub_key, sub_value in group.items(): d[sub_key] = read_attribute(sub_value) return d @read_attribute.register(h5py.Dataset) @report_read_key_on_error def read_dataset(dataset: h5py.Dataset): if H5PY_V3: string_dtype = h5py.check_string_dtype(dataset.dtype) if (string_dtype is not None) and (string_dtype.encoding == "utf-8"): dataset = dataset.asstr() value = dataset[()] if not hasattr(value, "dtype"): return value elif isinstance(value.dtype, str): pass elif issubclass(value.dtype.type, np.string_): value = value.astype(str) # Backwards compat, old datasets have strings as one element 1d arrays if len(value) == 1: return value[0] elif len(value.dtype.descr) > 1: # Compound dtype # For backwards compat, now strings are written as variable length dtype = value.dtype value = _from_fixed_length_strings(value) if H5PY_V3: value = _decode_structured_array(value, dtype=dtype) if value.shape == (): value = value[()] return value @report_read_key_on_error def read_dense_as_sparse( dataset: h5py.Dataset, sparse_format: sparse.spmatrix, axis_chunk: int ): if sparse_format == sparse.csr_matrix: return read_dense_as_csr(dataset, axis_chunk) elif sparse_format == sparse.csc_matrix: return read_dense_as_csc(dataset, axis_chunk) else: raise ValueError(f"Cannot read dense array as type: {sparse_format}") def read_dense_as_csr(dataset, axis_chunk=6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 0, axis_chunk): dense_chunk = dataset[idx] sub_matrix = sparse.csr_matrix(dense_chunk) sub_matrices.append(sub_matrix) return sparse.vstack(sub_matrices, format="csr") def read_dense_as_csc(dataset, axis_chunk=6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 1, axis_chunk): sub_matrix = 
sparse.csc_matrix(dataset[idx]) sub_matrices.append(sub_matrix) return sparse.hstack(sub_matrices, format="csc") anndata-0.7.8/anndata/_io/read.py000066400000000000000000000373671414255741200166220ustar00rootroot00000000000000from pathlib import Path from os import PathLike, fspath from types import MappingProxyType from typing import Union, Optional, Mapping, Tuple from typing import Iterable, Iterator, Generator from collections import OrderedDict import gzip import bz2 from warnings import warn import h5py import numpy as np import pandas as pd from .. import AnnData from ..compat import _deprecate_positional_args from .utils import is_float from .h5ad import read_h5ad try: from .zarr import read_zarr except ImportError as e: # noqa: F841 def read_zarr(*_, **__): raise e def read_csv( filename: Union[PathLike, Iterator[str]], delimiter: Optional[str] = ",", first_column_names: Optional[bool] = None, dtype: str = "float32", ) -> AnnData: """\ Read `.csv` file. Same as :func:`~anndata.read_text` but with default delimiter `','`. Parameters ---------- filename Data file. delimiter Delimiter that separates data within text file. If `None`, will split at arbitrary number of white spaces, which is different from enforcing splitting at single white space `' '`. first_column_names Assume the first column stores row names. dtype Numpy data type. """ return read_text(filename, delimiter, first_column_names, dtype) def read_excel( filename: PathLike, sheet: Union[str, int], dtype: str = "float32" ) -> AnnData: """\ Read `.xlsx` (Excel) file. Assumes that the first columns stores the row names and the first row the column names. Parameters ---------- filename File name to read from. sheet Name of sheet in Excel file. """ # rely on pandas for reading an excel file from pandas import read_excel df = read_excel(fspath(filename), sheet) X = df.values[:, 1:] row = dict(row_names=df.iloc[:, 0].values.astype(str)) col = dict(col_names=np.array(df.columns[1:], dtype=str)) return AnnData(X, row, col, dtype=dtype) def read_umi_tools(filename: PathLike, dtype: str = "float32") -> AnnData: """\ Read a gzipped condensed count matrix from umi_tools. Parameters ---------- filename File name to read from. """ # import pandas for conversion of a dict of dicts into a matrix # import gzip to read a gzipped file :-) import gzip from pandas import DataFrame dod = {} # this will contain basically everything fh = gzip.open(fspath(filename)) _ = fh.readline() # read the first line for line in fh: # gzip read bytes, hence the decoding t = line.decode("ascii").split("\t") try: dod[t[1]].update({t[0]: int(t[2])}) except KeyError: dod[t[1]] = {t[0]: int(t[2])} df = DataFrame.from_dict(dod, orient="index") # build the matrix df.fillna(value=0.0, inplace=True) # many NaN, replace with zeros return AnnData( np.array(df), dict(obs_names=df.index), dict(var_names=df.columns), dtype=dtype, ) def read_hdf(filename: PathLike, key: str) -> AnnData: """\ Read `.h5` (hdf5) file. Note: Also looks for fields `row_names` and `col_names`. Parameters ---------- filename Filename of data file. key Name of dataset in the file. """ with h5py.File(filename, "r") as f: # the following is necessary in Python 3, because only # a view and not a list is returned keys = [k for k in f.keys()] if key == "": raise ValueError( f"The file {filename} stores the following sheets:\n{keys}\n" f"Call read/read_hdf5 with one of them." 
) # read array X = f[key][()] # try to find row and column names rows_cols = [{}, {}] for iname, name in enumerate(["row_names", "col_names"]): if name in keys: rows_cols[iname][name] = f[name][()] adata = AnnData(X, rows_cols[0], rows_cols[1], dtype=X.dtype.name) return adata def _fmt_loom_axis_attrs( input: Mapping, idx_name: str, dimm_mapping: Mapping[str, Iterable[str]] ) -> Tuple[pd.DataFrame, Mapping[str, np.ndarray]]: axis_df = pd.DataFrame() axis_mapping = {} for key, names in dimm_mapping.items(): axis_mapping[key] = np.array([input.pop(name) for name in names]).T for k, v in input.items(): if v.ndim > 1 and v.shape[1] > 1: axis_mapping[k] = v else: axis_df[k] = v if idx_name in axis_df: axis_df.set_index(idx_name, drop=True, inplace=True) return axis_df, axis_mapping @_deprecate_positional_args(version="0.9") def read_loom( filename: PathLike, *, sparse: bool = True, cleanup: bool = False, X_name: str = "spliced", obs_names: str = "CellID", obsm_names: Optional[Mapping[str, Iterable[str]]] = None, var_names: str = "Gene", varm_names: Optional[Mapping[str, Iterable[str]]] = None, dtype: str = "float32", obsm_mapping: Mapping[str, Iterable[str]] = MappingProxyType({}), varm_mapping: Mapping[str, Iterable[str]] = MappingProxyType({}), **kwargs, ) -> AnnData: """\ Read `.loom`-formatted hdf5 file. This reads the whole file into memory. Beware that you have to explicitly state when you want to read the file as sparse data. Parameters ---------- filename The filename. sparse Whether to read the data matrix as sparse. cleanup Whether to collapse all obs/var fields that only store one unique value into `.uns['loom-.']`. X_name Loompy key with which the data matrix :attr:`~anndata.AnnData.X` is initialized. obs_names Loompy key where the observation/cell names are stored. obsm_mapping Loompy keys which will be constructed into observation matrices var_names Loompy key where the variable/gene names are stored. varm_mapping Loompy keys which will be constructed into variable matrices **kwargs: Arguments to loompy.connect Example ------- .. code:: python pbmc = anndata.read_loom( "pbmc.loom", sparse=True, X_name="lognorm", obs_names="cell_names", var_names="gene_names", obsm_mapping={ "X_umap": ["umap_1", "umap_2"] } ) """ # Deprecations if obsm_names is not None: warn( "Argument obsm_names has been deprecated in favour of `obsm_mapping`. " "In 0.9 this will be an error.", FutureWarning, ) if obsm_mapping != {}: raise ValueError( "Recieved values for both `obsm_names` and `obsm_mapping`. This is " "ambiguous, only pass `obsm_mapping`." ) obsm_mapping = obsm_names if varm_names is not None: warn( "Argument varm_names has been deprecated in favour of `varm_mapping`. " "In 0.9 this will be an error.", FutureWarning, ) if varm_mapping != {}: raise ValueError( "Recieved values for both `varm_names` and `varm_mapping`. This is " "ambiguous, only pass `varm_mapping`." 
) varm_mapping = varm_names filename = fspath(filename) # allow passing pathlib.Path objects from loompy import connect with connect(filename, "r", **kwargs) as lc: if X_name not in lc.layers.keys(): X_name = "" X = lc.layers[X_name].sparse().T.tocsr() if sparse else lc.layers[X_name][()].T layers = OrderedDict() if X_name != "": layers["matrix"] = ( lc.layers[""].sparse().T.tocsr() if sparse else lc.layers[""][()].T ) for key in lc.layers.keys(): if key != "": layers[key] = ( lc.layers[key].sparse().T.tocsr() if sparse else lc.layers[key][()].T ) # TODO: Figure out the singleton obs elements obs, obsm = _fmt_loom_axis_attrs(dict(lc.col_attrs), obs_names, obsm_mapping) var, varm = _fmt_loom_axis_attrs(dict(lc.row_attrs), var_names, varm_mapping) uns = {} if cleanup: uns_obs = {} for key in list(obs.keys()): if len(set(obs[key])) == 1: uns_obs[f"{key}"] = obs[key][0] del obs[key] if uns_obs: uns["loom-obs"] = uns_obs uns_var = {} for key in list(var.keys()): if len(set(var[key])) == 1: uns_var[f"{key}"] = var[key][0] del var[key] if uns_var: uns["loom-var"] = uns_var adata = AnnData( X, obs=obs, var=var, layers=layers, obsm=obsm if obsm else None, varm=varm if varm else None, uns=uns, dtype=dtype, ) return adata def read_mtx(filename: PathLike, dtype: str = "float32") -> AnnData: """\ Read `.mtx` file. Parameters ---------- filename The filename. dtype Numpy data type. """ from scipy.io import mmread # could be rewritten accounting for dtype to be more performant X = mmread(fspath(filename)).astype(dtype) from scipy.sparse import csr_matrix X = csr_matrix(X) return AnnData(X, dtype=dtype) def read_text( filename: Union[PathLike, Iterator[str]], delimiter: Optional[str] = None, first_column_names: Optional[bool] = None, dtype: str = "float32", ) -> AnnData: """\ Read `.txt`, `.tab`, `.data` (text) file. Same as :func:`~anndata.read_csv` but with default delimiter `None`. Parameters ---------- filename Data file, filename or stream. delimiter Delimiter that separates data within text file. If `None`, will split at arbitrary number of white spaces, which is different from enforcing splitting at single white space `' '`. first_column_names Assume the first column stores row names. dtype Numpy data type. 
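    Example
    -------
    A minimal usage sketch; the file name below is hypothetical and assumes a
    whitespace-delimited text file whose first column stores row names.

    .. code:: python

        import anndata

        adata = anndata.read_text("counts.txt", first_column_names=True)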
""" if not isinstance(filename, (PathLike, str, bytes)): return _read_text(filename, delimiter, first_column_names, dtype) filename = Path(filename) if filename.suffix == ".gz": with gzip.open(str(filename), mode="rt") as f: return _read_text(f, delimiter, first_column_names, dtype) elif filename.suffix == ".bz2": with bz2.open(str(filename), mode="rt") as f: return _read_text(f, delimiter, first_column_names, dtype) else: with filename.open() as f: return _read_text(f, delimiter, first_column_names, dtype) def iter_lines(file_like: Iterable[str]) -> Generator[str, None, None]: """Helper for iterating only nonempty lines without line breaks""" for line in file_like: line = line.rstrip("\r\n") if line: yield line def _read_text( f: Iterator[str], delimiter: Optional[str], first_column_names: Optional[bool], dtype: str, ) -> AnnData: comments = [] data = [] lines = iter_lines(f) col_names = [] row_names = [] # read header and column names for line in lines: if line.startswith("#"): comment = line.lstrip("# ") if comment: comments.append(comment) else: if delimiter is not None and delimiter not in line: raise ValueError(f"Did not find delimiter {delimiter!r} in first line.") line_list = line.split(delimiter) # the first column might be row names, so check the last if not is_float(line_list[-1]): col_names = line_list # logg.msg(" assuming first line in file stores column names", v=4) else: if not is_float(line_list[0]) or first_column_names: first_column_names = True row_names.append(line_list[0]) data.append(np.array(line_list[1:], dtype=dtype)) else: data.append(np.array(line_list, dtype=dtype)) break if not col_names: # try reading col_names from the last comment line if len(comments) > 0: # logg.msg(" assuming last comment line stores variable names", v=4) col_names = np.array(comments[-1].split()) # just numbers as col_names else: # logg.msg(" did not find column names in file", v=4) col_names = np.arange(len(data[0])).astype(str) col_names = np.array(col_names, dtype=str) # read another line to check if first column contains row names or not if first_column_names is None: first_column_names = False for line in lines: line_list = line.split(delimiter) if first_column_names or not is_float(line_list[0]): # logg.msg(" assuming first column in file stores row names", v=4) first_column_names = True row_names.append(line_list[0]) data.append(np.array(line_list[1:], dtype=dtype)) else: data.append(np.array(line_list, dtype=dtype)) break # if row names are just integers if len(data) > 1 and data[0].size != data[1].size: # logg.msg( # " assuming first row stores column names and first column row names", # v=4, # ) first_column_names = True col_names = np.array(data[0]).astype(int).astype(str) row_names.append(data[1][0].astype(int).astype(str)) data = [data[1][1:]] # parse the file for line in lines: line_list = line.split(delimiter) if first_column_names: row_names.append(line_list[0]) data.append(np.array(line_list[1:], dtype=dtype)) else: data.append(np.array(line_list, dtype=dtype)) # logg.msg(" read data into list of lists", t=True, v=4) # transfrom to array, this takes a long time and a lot of memory # but it’s actually the same thing as np.genfromtxt does # - we don’t use the latter as it would involve another slicing step # in the end, to separate row_names from float data, slicing takes # a lot of memory and CPU time if data[0].size != data[-1].size: raise ValueError( f"Length of first line ({data[0].size}) is different " f"from length of last line ({data[-1].size})." 
) data = np.array(data, dtype=dtype) # logg.msg(" constructed array from list of list", t=True, v=4) # transform row_names if not row_names: row_names = np.arange(len(data)).astype(str) # logg.msg(" did not find row names in file", v=4) else: row_names = np.array(row_names) for iname, name in enumerate(row_names): row_names[iname] = name.strip('"') # adapt col_names if necessary if col_names.size > data.shape[1]: col_names = col_names[1:] for iname, name in enumerate(col_names): col_names[iname] = name.strip('"') return AnnData( data, obs=dict(obs_names=row_names), var=dict(var_names=col_names), dtype=dtype, ) def load_sparse_csr(d, key="X"): from scipy.sparse.csr import csr_matrix key_csr = f"{key}_csr" d[key] = csr_matrix( (d[f"{key_csr}_data"], d[f"{key_csr}_indices"], d[f"{key_csr}_indptr"]), shape=d[f"{key_csr}_shape"], ) del_sparse_matrix_keys(d, key_csr) return d def del_sparse_matrix_keys(mapping, key_csr): del mapping[f"{key_csr}_data"] del mapping[f"{key_csr}_indices"] del mapping[f"{key_csr}_indptr"] del mapping[f"{key_csr}_shape"] anndata-0.7.8/anndata/_io/utils.py000066400000000000000000000162111414255741200170300ustar00rootroot00000000000000from enum import Enum from functools import wraps, singledispatch from warnings import warn from packaging import version import h5py from .._core.sparse_dataset import SparseDataset # For allowing h5py v3 # https://github.com/theislab/anndata/issues/442 H5PY_V3 = version.parse(h5py.__version__).major >= 3 # ------------------------------------------------------------------------------- # Type conversion # ------------------------------------------------------------------------------- # Could be numba’d if it returned tuples instead of slices def idx_chunks_along_axis(shape: tuple, axis: int, chunk_size: int): """\ Gives indexer tuples chunked along an axis. Params ------ shape Shape of array to be chunked axis Axis to chunk along chunk_size Size of chunk along axis Returns ------- An iterator of tuples for indexing into an array of passed shape. """ total = shape[axis] cur = 0 mutable_idx = [slice(None) for i in range(len(shape))] while cur + chunk_size < total: mutable_idx[axis] = slice(cur, cur + chunk_size) yield tuple(mutable_idx) cur += chunk_size mutable_idx[axis] = slice(cur, None) yield tuple(mutable_idx) def is_float(string): """\ Check whether string is float. See also -------- http://stackoverflow.com/questions/736043/checking-if-a-string-can-be-converted-to-float-in-python """ try: float(string) return True except ValueError: return False def is_int(string): """Check whether string is integer.""" try: int(string) return True except ValueError: return False def convert_bool(string): """Check whether string is boolean.""" if string == "True": return True, True elif string == "False": return True, False else: return False, False def convert_string(string): """Convert string to int, float or bool.""" if is_int(string): return int(string) elif is_float(string): return float(string) elif convert_bool(string)[0]: return convert_bool(string)[1] elif string == "None": return None else: return string def check_key(key): """Checks that passed value is a valid h5py key. Should convert it if there is an obvious conversion path, error otherwise. """ typ = type(key) if issubclass(typ, str): return str(key) # TODO: Should I try to decode bytes? It's what h5py would do, # but it will be read out as a str. # elif issubclass(typ, bytes): # return key else: raise TypeError(f"{key} of type {typ} is an invalid key. 
Should be str.") # ------------------------------------------------------------------------------- # Generic functions # ------------------------------------------------------------------------------- @singledispatch def write_attribute(*args, **kwargs): raise NotImplementedError("Unrecognized argument types for `write_attribute`.") @singledispatch def read_attribute(*args, **kwargs): raise NotImplementedError("Unrecognized argument types for `read_attribute`.") @read_attribute.register(type(None)) def read_attribute_none(value) -> None: return None # ------------------------------------------------------------------------------- # Errors handling # ------------------------------------------------------------------------------- # TODO: Is there a consistent way to do this which just modifies the previously # thrown error? Could do a warning? class AnnDataReadError(OSError): """Error caused while trying to read in AnnData.""" pass def _get_parent(elem): try: import zarr except ImportError: zarr = None if zarr and isinstance(elem, (zarr.Group, zarr.Array)): parent = elem.store # Not sure how to always get a name out of this elif isinstance(elem, SparseDataset): parent = elem.group.file.name else: parent = elem.file.name return parent def report_read_key_on_error(func): """\ A decorator for zarr element reading which makes keys involved in errors get reported. Example ------- >>> import zarr >>> @report_read_key_on_error ... def read_arr(group): ... raise NotImplementedError() >>> z = zarr.open("tmp.zarr") >>> z["X"] = [1, 2, 3] >>> read_arr(z["X"]) # doctest: +SKIP """ @wraps(func) def func_wrapper(elem, *args, **kwargs): try: return func(elem, *args, **kwargs) except Exception as e: if isinstance(e, AnnDataReadError): raise e else: parent = _get_parent(elem) raise AnnDataReadError( f"Above error raised while reading key {elem.name!r} of " f"type {type(elem)} from {parent}." ) return func_wrapper def report_write_key_on_error(func): """\ A decorator for zarr element reading which makes keys involved in errors get reported. Example ------- >>> import zarr >>> @report_write_key_on_error ... def write_arr(group, key, val): ... raise NotImplementedError() >>> z = zarr.open("tmp.zarr") >>> X = [1, 2, 3] >>> write_arr(z, "X", X) # doctest: +SKIP """ @wraps(func) def func_wrapper(elem, key, val, *args, **kwargs): try: return func(elem, key, val, *args, **kwargs) except Exception as e: parent = _get_parent(elem) raise type(e)( f"{e}\n\n" f"Above error raised while writing key {key!r} of {type(elem)}" f" from {parent}." ) from e return func_wrapper # ------------------------------------------------------------------------------- # Common h5ad/zarr stuff # ------------------------------------------------------------------------------- def _read_legacy_raw(f, modern_raw, read_df, read_attr, *, attrs=("X", "var", "varm")): """\ Backwards compat for reading legacy raw. Makes sure that no modern raw group coexists with legacy raw.* groups. 
""" if modern_raw: if any(k.startswith("raw.") for k in f): what = f"File {f.filename}" if hasattr(f, "filename") else "Store" raise ValueError(f"{what} has both legacy and current raw formats.") return modern_raw raw = {} if "X" in attrs and "raw.X" in f: raw["X"] = read_attr(f["raw.X"]) if "var" in attrs and "raw.var" in f: raw["var"] = read_df(f["raw.var"]) # Backwards compat if "varm" in attrs and "raw.varm" in f: raw["varm"] = read_attr(f["raw.varm"]) return raw class EncodingVersions(Enum): raw = "0.1.0" csr_matrix = csc_matrix = "0.1.0" dataframe = "0.1.0" def check(self, key: str, encoded_version: str): if version.parse(encoded_version) > version.parse(self.value): warn( f"The supported version for decoding {self.name} is {self.value}, " f"but a {self.name} with version {encoded_version} " f"was encountered at {key}.", FutureWarning, ) anndata-0.7.8/anndata/_io/write.py000066400000000000000000000107521414255741200170260ustar00rootroot00000000000000import warnings from pathlib import Path from os import PathLike, fspath import pandas as pd import math import numpy as np from scipy.sparse import issparse from .. import AnnData from ..logging import get_logger from . import WriteWarning # Exports from .h5ad import write_h5ad as _write_h5ad from ..utils import import_function logger = get_logger(__name__) write_zarr = import_function("anndata._io.zarr", "write_zarr") def write_csvs( dirname: PathLike, adata: AnnData, skip_data: bool = True, sep: str = "," ): """See :meth:`~anndata.AnnData.write_csvs`.""" dirname = Path(dirname) if dirname.suffix == ".csv": dirname = dirname.with_suffix("") logger.info(f"writing .csv files to {dirname}") if not dirname.is_dir(): dirname.mkdir(parents=True, exist_ok=True) dir_uns = dirname / "uns" if not dir_uns.is_dir(): dir_uns.mkdir(parents=True, exist_ok=True) d = dict( obs=adata._obs, var=adata._var, obsm=adata._obsm.to_df(), varm=adata._varm.to_df(), ) if not skip_data: d["X"] = pd.DataFrame(adata.X.toarray() if issparse(adata.X) else adata.X) d_write = {**d, **adata._uns} not_yet_raised_sparse_warning = True for key, value in d_write.items(): if issparse(value): if not_yet_raised_sparse_warning: warnings.warn("Omitting to write sparse annotation.", WriteWarning) not_yet_raised_sparse_warning = False continue filename = dirname if key not in {"X", "var", "obs", "obsm", "varm"}: filename = dir_uns filename /= f"{key}.csv" df = value if not isinstance(value, pd.DataFrame): value = np.array(value) if np.ndim(value) == 0: value = value[None] try: df = pd.DataFrame(value) except Exception as e: warnings.warn( f"Omitting to write {key!r} of type {type(e)}.", WriteWarning, ) continue df.to_csv( filename, sep=sep, header=key in {"obs", "var", "obsm", "varm"}, index=key in {"obs", "var"}, ) def write_loom(filename: PathLike, adata: AnnData, write_obsm_varm: bool = False): filename = Path(filename) row_attrs = {k: np.array(v) for k, v in adata.var.to_dict("list").items()} row_names = adata.var_names row_dim = row_names.name if row_names.name is not None else "var_names" row_attrs[row_dim] = row_names.values col_attrs = {k: np.array(v) for k, v in adata.obs.to_dict("list").items()} col_names = adata.obs_names col_dim = col_names.name if col_names.name is not None else "obs_names" col_attrs[col_dim] = col_names.values if adata.X is None: raise ValueError("loompy does not accept empty matrices as data") if write_obsm_varm: for key in adata.obsm.keys(): col_attrs[key] = adata.obsm[key] for key in adata.varm.keys(): row_attrs[key] = adata.varm[key] elif 
len(adata.obsm.keys()) > 0 or len(adata.varm.keys()) > 0: logger.warning( f"The loom file will lack these fields:\n" f"{adata.obsm.keys() | adata.varm.keys()}\n" f"Use write_obsm_varm=True to export multi-dimensional annotations" ) layers = {"": adata.X.T} for key in adata.layers.keys(): layers[key] = adata.layers[key].T from loompy import create if filename.exists(): filename.unlink() create(fspath(filename), layers, row_attrs=row_attrs, col_attrs=col_attrs) def _get_chunk_indices(za): # TODO: does zarr provide code for this? """\ Return all the indices (coordinates) for the chunks in a zarr array, even empty ones. """ return [ (i, j) for i in range(int(math.ceil(float(za.shape[0]) / za.chunks[0]))) for j in range(int(math.ceil(float(za.shape[1]) / za.chunks[1]))) ] def _write_in_zarr_chunks(za, key, value): if key != "X": za[:] = value # don’t chunk metadata else: for ci in _get_chunk_indices(za): s0, e0 = za.chunks[0] * ci[0], za.chunks[0] * (ci[0] + 1) s1, e1 = za.chunks[1] * ci[1], za.chunks[1] * (ci[1] + 1) print(ci, s0, e1, s1, e1) if issparse(value): za[s0:e0, s1:e1] = value[s0:e0, s1:e1].todense() else: za[s0:e0, s1:e1] = value[s0:e0, s1:e1] anndata-0.7.8/anndata/_io/zarr.py000066400000000000000000000306701414255741200166530ustar00rootroot00000000000000from collections.abc import Mapping, MutableMapping from functools import _find_impl, singledispatch from pathlib import Path from types import MappingProxyType from typing import Callable, Type, TypeVar, Union from warnings import warn import numpy as np from scipy import sparse import pandas as pd from pandas.api.types import is_categorical_dtype import numcodecs import zarr from .._core.anndata import AnnData from .._core.raw import Raw from ..compat import ( _from_fixed_length_strings, _to_fixed_length_strings, _clean_uns, ) from .utils import ( report_read_key_on_error, report_write_key_on_error, write_attribute, _read_legacy_raw, EncodingVersions, check_key, ) from . 
import WriteWarning T = TypeVar("T") def write_zarr( store: Union[MutableMapping, str, Path], adata: AnnData, chunks=None, **dataset_kwargs, ) -> None: if isinstance(store, Path): store = str(store) adata.strings_to_categoricals() if adata.raw is not None: adata.strings_to_categoricals(adata.raw.var) f = zarr.open(store, mode="w") if chunks is not None and not isinstance(adata.X, sparse.spmatrix): write_attribute(f, "X", adata.X, dict(chunks=chunks, **dataset_kwargs)) else: write_attribute(f, "X", adata.X, dataset_kwargs) write_attribute(f, "obs", adata.obs, dataset_kwargs) write_attribute(f, "var", adata.var, dataset_kwargs) write_attribute(f, "obsm", adata.obsm, dataset_kwargs) write_attribute(f, "varm", adata.varm, dataset_kwargs) write_attribute(f, "obsp", adata.obsp, dataset_kwargs) write_attribute(f, "varp", adata.varp, dataset_kwargs) write_attribute(f, "layers", adata.layers, dataset_kwargs) write_attribute(f, "uns", adata.uns, dataset_kwargs) write_attribute(f, "raw", adata.raw, dataset_kwargs) def _write_method(cls: Type[T]) -> Callable[[zarr.Group, str, T], None]: return _find_impl(cls, ZARR_WRITE_REGISTRY) @write_attribute.register(zarr.Group) def write_attribute_zarr(f, key, value, dataset_kwargs=MappingProxyType({})): if key in f: del f[key] _write_method(type(value))(f, key, value, dataset_kwargs) def write_mapping(f, key, value: Mapping, dataset_kwargs=MappingProxyType({})): for sub_k, sub_v in value.items(): if not isinstance(key, str): warn( f"dict key {key} transformed to str upon writing to zarr, using " "string keys is recommended.", WriteWarning, ) write_attribute(f, f"{key}/{sub_k}", sub_v, dataset_kwargs) @report_write_key_on_error def write_dataframe(z, key, df, dataset_kwargs=MappingProxyType({})): # Check arguments for reserved in ("__categories", "_index"): if reserved in df.columns: raise ValueError(f"{reserved!r} is a reserved name for dataframe columns.") col_names = [check_key(c) for c in df.columns] if df.index.name is not None: index_name = df.index.name else: index_name = "_index" index_name = check_key(index_name) group = z.create_group(key) group.attrs["encoding-type"] = "dataframe" group.attrs["encoding-version"] = EncodingVersions.dataframe.value group.attrs["column-order"] = col_names group.attrs["_index"] = index_name write_series(group, index_name, df.index, dataset_kwargs) for col_name, (_, series) in zip(col_names, df.items()): write_series(group, col_name, series, dataset_kwargs) @report_write_key_on_error def write_series(group, key, series, dataset_kwargs=MappingProxyType({})): if series.dtype == object: group.create_dataset( key, shape=series.shape, dtype=object, object_codec=numcodecs.VLenUTF8(), **dataset_kwargs, ) group[key][:] = series.values elif is_categorical_dtype(series): # This should work for categorical Index and Series categorical: pd.Categorical = series.values categories: np.ndarray = categorical.categories.values codes: np.ndarray = categorical.codes category_key = f"__categories/{key}" write_array(group, category_key, categories, dataset_kwargs=dataset_kwargs) write_array(group, key, codes, dataset_kwargs=dataset_kwargs) group[key].attrs["categories"] = category_key # Must coerce np.bool_ to bool for json writing group[category_key].attrs["ordered"] = bool(categorical.ordered) else: write_array(group, key, series.values, dataset_kwargs=dataset_kwargs) @report_write_key_on_error def write_not_implemented(f, key, value, dataset_kwargs=MappingProxyType({})): # If it’s not an array, try and make it an array. 
If that fails, pickle it. # Maybe rethink that, maybe this should just pickle, # and have explicit implementations for everything else raise NotImplementedError( f"Failed to write value for {key}, since a writer for type {type(value)}" f" has not been implemented yet." ) @report_write_key_on_error def write_list(g, key, value, dataset_kwargs=MappingProxyType({})): write_array(g, key, np.array(value), dataset_kwargs) @report_write_key_on_error def write_array(g, key, value, dataset_kwargs=MappingProxyType({})): if value.dtype == object: g.create_dataset( key, shape=value.shape, dtype=object, object_codec=numcodecs.VLenUTF8(), **dataset_kwargs, ) g[key][:] = value elif value.dtype.kind == "V": # Structured dtype g.create_dataset(key, data=_to_fixed_length_strings(value), **dataset_kwargs) else: g.create_dataset(key, data=value, **dataset_kwargs) # TODO: Not working quite right @report_write_key_on_error def write_scalar(f, key, value, dataset_kwargs=MappingProxyType({})): f.create_dataset(key, data=np.array(value), **dataset_kwargs) @report_write_key_on_error def write_none(f, key, value, dataset_kwargs=MappingProxyType({})): pass # TODO: Figure out what to do with dataset_kwargs for these @report_write_key_on_error def write_csr(f, key, value: sparse.csr_matrix, dataset_kwargs=MappingProxyType({})): group = f.create_group(key) group.attrs["encoding-type"] = "csr_matrix" group.attrs["encoding-version"] = EncodingVersions.csr_matrix.value group.attrs["shape"] = value.shape write_array(group, "data", value.data, dataset_kwargs=dataset_kwargs) write_array(group, "indices", value.indices, dataset_kwargs=dataset_kwargs) write_array(group, "indptr", value.indptr, dataset_kwargs=dataset_kwargs) @report_write_key_on_error def write_csc(f, key, value: sparse.csc_matrix, dataset_kwargs=MappingProxyType({})): group = f.create_group(key) group.attrs["encoding-type"] = "csc_matrix" group.attrs["encoding-version"] = EncodingVersions.csc_matrix.value group.attrs["shape"] = value.shape write_array(group, "data", value.data, dataset_kwargs=dataset_kwargs) write_array(group, "indices", value.indices, dataset_kwargs=dataset_kwargs) write_array(group, "indptr", value.indptr, dataset_kwargs=dataset_kwargs) def write_raw(f, key, value, dataset_kwargs=MappingProxyType({})): group = f.create_group(key) group.attrs["encoding-type"] = "raw" group.attrs["encoding-version"] = EncodingVersions.raw.value group.attrs["shape"] = value.shape write_attribute(group, "X", value.X, dataset_kwargs) write_attribute(group, "var", value.var, dataset_kwargs) write_attribute(group, "varm", value.varm, dataset_kwargs) ZARR_WRITE_REGISTRY = { type(None): write_none, Mapping: write_mapping, object: write_not_implemented, np.ndarray: write_array, # Possibly merge with write_series list: write_list, pd.DataFrame: write_dataframe, Raw: write_raw, # object: write_not_implemented, # h5py.Dataset: write_basic, # type(None): write_none, str: write_scalar, float: write_scalar, np.floating: write_scalar, bool: write_scalar, np.bool_: write_scalar, int: write_scalar, np.integer: write_scalar, sparse.csr_matrix: write_csr, sparse.csc_matrix: write_csc, } def read_zarr(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: """\ Read from a hierarchical Zarr array store. Parameters ---------- store The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class. 
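    Examples
    --------
    A minimal round-trip sketch using the writer defined in this module
    (the store path is illustrative)::

        adata = AnnData(np.eye(3))
        write_zarr("example.zarr", adata)
        adata_back = read_zarr("example.zarr")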
""" if isinstance(store, Path): store = str(store) f = zarr.open(store, mode="r") d = {} for k in f.keys(): # Backwards compat if k.startswith("raw."): continue if k in {"obs", "var"}: d[k] = read_dataframe(f[k]) else: # Base case d[k] = read_attribute(f[k]) d["raw"] = _read_legacy_raw(f, d.get("raw"), read_dataframe, read_attribute) _clean_uns(d) return AnnData(**d) @singledispatch def read_attribute(value): raise NotImplementedError() @read_attribute.register(zarr.Array) @report_read_key_on_error def read_dataset(dataset: zarr.Array): value = dataset[...] if not hasattr(value, "dtype"): return value elif isinstance(value.dtype, str): pass elif issubclass(value.dtype.type, np.str_): value = value.astype(object) elif issubclass(value.dtype.type, np.string_): value = value.astype(str).astype(object) # bytestring -> unicode -> str elif len(value.dtype.descr) > 1: # Compound dtype # For backwards compat, now strings are written as variable length value = _from_fixed_length_strings(value) if value.shape == (): value = value[()] return value @read_attribute.register(zarr.Group) @report_read_key_on_error def read_group(group: zarr.Group): if "encoding-type" in group.attrs: enctype = group.attrs["encoding-type"] EncodingVersions[enctype].check(group.name, group.attrs["encoding-version"]) if enctype == "dataframe": return read_dataframe(group) elif enctype == "csr_matrix": return read_csr(group) elif enctype == "csc_matrix": return read_csc(group) # At the moment, just treat raw as normal group return {k: read_attribute(group[k]) for k in group.keys()} @report_read_key_on_error def read_csr(group: zarr.Group) -> sparse.csr_matrix: return sparse.csr_matrix( (group["data"], group["indices"], group["indptr"]), shape=group.attrs["shape"], ) @report_read_key_on_error def read_csc(group: zarr.Group) -> sparse.csc_matrix: return sparse.csc_matrix( (group["data"], group["indices"], group["indptr"]), shape=group.attrs["shape"], ) @report_read_key_on_error def read_dataframe_legacy(dataset: zarr.Array) -> pd.DataFrame: """Reads old format of dataframes""" # NOTE: Likely that categoricals need to be removed from uns df = pd.DataFrame(_from_fixed_length_strings(dataset[()])) df.set_index(df.columns[0], inplace=True) return df @report_read_key_on_error def read_dataframe(group) -> pd.DataFrame: if isinstance(group, zarr.Array): return read_dataframe_legacy(group) columns = list(group.attrs["column-order"]) idx_key = group.attrs["_index"] df = pd.DataFrame( {k: read_series(group[k]) for k in columns}, index=read_series(group[idx_key]), columns=list(columns), ) if idx_key != "_index": df.index.name = idx_key return df @report_read_key_on_error def read_series(dataset: zarr.Array) -> Union[np.ndarray, pd.Categorical]: if "categories" in dataset.attrs: categories = dataset.attrs["categories"] if isinstance(categories, str): categories_key = categories parent_name = dataset.name.rstrip(dataset.basename) parent = zarr.open(dataset.store)[parent_name] categories_dset = parent[categories_key] categories = categories_dset[...] ordered = categories_dset.attrs.get("ordered", False) else: # TODO: remove this code at some point post 0.7 # TODO: Add tests for this warn( f"Your file {str(dataset.file.name)!r} has invalid categorical " "encodings due to being written from a development version of " "AnnData. Rewrite the file ensure you can read it in the future.", FutureWarning, ) return pd.Categorical.from_codes(dataset[...], categories, ordered=ordered) else: return dataset[...] 
anndata-0.7.8/anndata/_metadata.py000066400000000000000000000025651414255741200170500ustar00rootroot00000000000000import traceback from pathlib import Path here = Path(__file__).parent def refresh_entry_points(): """\ Under some circumstances, (e.g. when installing a PEP 517 package via pip), pkg_resources.working_set.entries is stale. This tries to fix that. See https://github.com/pypa/setuptools_scm/issues/513 """ try: import sys import pkg_resources ws: pkg_resources.WorkingSet = pkg_resources.working_set for entry in sys.path: ws.add_entry(entry) except Exception: pass try: from setuptools_scm import get_version refresh_entry_points() __version__ = get_version(root="..", relative_to=__file__) except (ImportError, LookupError, FileNotFoundError): try: from importlib.metadata import metadata except ImportError: from importlib_metadata import metadata meta = metadata(here.name) __version__ = meta["Version"] def within_flit(): """\ Checks if we are being imported by flit. This is necessary so flit can import __version__ without all depedencies installed. There are a few options to make this hack unnecessary, see: https://github.com/takluyver/flit/issues/253#issuecomment-737870438 """ for frame in traceback.extract_stack(): if frame.name == "get_docstring_and_version_via_import": return True return False anndata-0.7.8/anndata/compat/000077500000000000000000000000001414255741200160325ustar00rootroot00000000000000anndata-0.7.8/anndata/compat/__init__.py000066400000000000000000000212761414255741200201530ustar00rootroot00000000000000from copy import deepcopy from functools import reduce, wraps from inspect import signature, Parameter from typing import Collection, Union, Mapping, MutableMapping, Optional from warnings import warn import h5py from scipy.sparse import spmatrix import numpy as np import pandas as pd from ._overloaded_dict import _overloaded_uns, OverloadedDict from .._core.index import _subset # try importing zarr, dask, and zappy from packaging import version try: from zarr.core import Array as ZarrArray except ImportError: class ZarrArray: @staticmethod def __repr__(): return "mock zarr.core.Array" try: from zappy.base import ZappyArray except ImportError: class ZappyArray: @staticmethod def __repr__(): return "mock zappy.base.ZappyArray" try: from dask.array import Array as DaskArray except ImportError: class DaskArray: @staticmethod def __repr__(): return "mock dask.array.core.Array" try: from typing import Literal except ImportError: try: from typing_extensions import Literal except ImportError: class LiteralMeta(type): def __getitem__(cls, values): if not isinstance(values, tuple): values = (values,) return type("Literal_", (Literal,), dict(__args__=values)) class Literal(metaclass=LiteralMeta): pass def _from_fixed_length_strings(value): """\ Convert from fixed length strings to unicode. For backwards compatability with older h5ad and zarr files. 
""" new_dtype = [] for dt in value.dtype.descr: dt_list = list(dt) dt_type = dt[1] # could probably match better is_annotated = isinstance(dt_type, tuple) if is_annotated: dt_type = dt_type[0] # Fixing issue introduced with h5py v2.10.0, see: # https://github.com/h5py/h5py/issues/1307 if issubclass(np.dtype(dt_type).type, np.string_): dt_list[1] = f"U{int(dt_type[2:])}" elif is_annotated or np.issubdtype(np.dtype(dt_type), np.str_): dt_list[1] = "O" # Assumption that it’s a vlen str new_dtype.append(tuple(dt_list)) return value.astype(new_dtype) def _decode_structured_array( arr: np.ndarray, dtype: Optional[np.dtype] = None, copy: bool = False ) -> np.ndarray: """ h5py 3.0 now reads all strings as bytes. There is a helper method which can convert these to strings, but there isn't anything for fields of structured dtypes. Params ------ arr An array with structured dtype dtype dtype of the array. This is checked for h5py string data types. Passing this is allowed for cases where array may have been processed by another function before hand. """ if copy: arr = arr.copy() if dtype is None: dtype = arr.dtype # codecs.decode is 2x slower than this lambda, go figure decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1) for k, (dt, _) in dtype.fields.items(): check = h5py.check_string_dtype(dt) if check is not None and check.encoding == "utf-8": decode(arr[k], out=arr[k]) return arr def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray: """\ Convert variable length strings to fixed length. Currently a workaround for https://github.com/zarr-developers/zarr-python/pull/422 """ new_dtype = [] for dt_name, (dt_type, dt_offset) in value.dtype.fields.items(): if dt_type.kind == "O": # Assuming the objects are str size = max(len(x.encode()) for x in value.getfield("O", dt_offset)) new_dtype.append((dt_name, ("U", size))) else: new_dtype.append((dt_name, dt_type)) return value.astype(new_dtype) ############################# # Dealing with uns ############################# def _clean_uns(d: Mapping[str, MutableMapping[str, Union[pd.Series, str, int]]]): """ Compat function for when categorical keys were stored in uns. This used to be buggy because when storing categorical columns in obs and var with the same column name, only one `_categories` is retained. 
""" k_to_delete = set() for cats_name, cats in d.get("uns", {}).items(): if not cats_name.endswith("_categories"): continue name = cats_name.replace("_categories", "") # fix categories with a single category if isinstance(cats, (str, int)): cats = [cats] for ann in ["obs", "var"]: if name not in d[ann]: continue codes: np.ndarray = d[ann][name].values # hack to maybe find the axis the categories were for if not np.all(codes < len(cats)): continue d[ann][name] = pd.Categorical.from_codes(codes, cats) k_to_delete.add(cats_name) for cats_name in k_to_delete: del d["uns"][cats_name] def _move_adj_mtx(d): """ Read-time fix for moving adjacency matrices from uns to obsp """ n = d.get("uns", {}).get("neighbors", {}) obsp = d.setdefault("obsp", {}) for k in ("distances", "connectivities"): if ( (k in n) and isinstance(n[k], (spmatrix, np.ndarray)) and len(n[k].shape) == 2 ): warn( f"Moving element from .uns['neighbors']['{k}'] to .obsp['{k}'].\n\n" "This is where adjacency matrices should go now.", FutureWarning, ) obsp[k] = n.pop(k) def _find_sparse_matrices(d: Mapping, n: int, keys: tuple, paths: list): """Find paths to sparse matrices with shape (n, n).""" for k, v in d.items(): if isinstance(v, Mapping): _find_sparse_matrices(v, n, (*keys, k), paths) elif isinstance(v, spmatrix) and v.shape == (n, n): paths.append((*keys, k)) return paths def _slice_uns_sparse_matrices(uns: MutableMapping, oidx: "Index1d", orig_n_obs: int): """slice sparse spatrices of n_obs × n_obs in self.uns""" if isinstance(oidx, slice) and len(range(*oidx.indices(orig_n_obs))) == orig_n_obs: return uns # slice of entire dimension is a no-op paths = _find_sparse_matrices(uns, orig_n_obs, (), []) if not paths: return uns uns = deepcopy(uns) for path in paths: str_path = "".join(f"['{key}']" for key in path) warn( f"During AnnData slicing, found matrix at .uns{str_path} that happens" f" to be dimensioned at n_obs×n_obs ({orig_n_obs}×{orig_n_obs}).\n\n" "These matrices should now be stored in the .obsp attribute.\n" "This slicing behavior will be removed in anndata 0.8.", FutureWarning, ) d = reduce(lambda d, k: d[k], path[:-1], uns) d[path[-1]] = _subset(d[path[-1]], (oidx, oidx)) return uns # This function was adapted from scikit-learn # github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/validation.py def _deprecate_positional_args(func=None, *, version: str = "1.0 (renaming of 0.25)"): """Decorator for methods that issues warnings for positional arguments. Using the keyword-only argument syntax in pep 3102, arguments after the * will issue a warning when passed as a positional argument. Parameters ---------- func Function to check arguments on. version The version when positional arguments will result in error. """ def _inner_deprecate_positional_args(f): sig = signature(f) kwonly_args = [] all_args = [] for name, param in sig.parameters.items(): if param.kind == Parameter.POSITIONAL_OR_KEYWORD: all_args.append(name) elif param.kind == Parameter.KEYWORD_ONLY: kwonly_args.append(name) @wraps(f) def inner_f(*args, **kwargs): extra_args = len(args) - len(all_args) if extra_args <= 0: return f(*args, **kwargs) # extra_args > 0 args_msg = [ "{}={}".format(name, arg) for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:]) ] args_msg = ", ".join(args_msg) warn( f"Pass {args_msg} as keyword args. 
From version {version} passing " "these as positional arguments will result in an error", FutureWarning, ) kwargs.update(zip(sig.parameters, args)) return f(**kwargs) return inner_f if func is not None: return _inner_deprecate_positional_args(func) return _inner_deprecate_positional_args anndata-0.7.8/anndata/compat/_overloaded_dict.py000066400000000000000000000137751414255741200217070ustar00rootroot00000000000000from collections.abc import MutableMapping from functools import partial from typing import Any, Callable, List, Mapping, Optional, Union from warnings import warn from weakref import proxy class KeyOverload: """ This class contains the information neccesary to overload a key of a dict. It's like a descriptor, but for a key of a dict instead of an attribute. Register getter, setter, and deleter methods by passing them at instantiation, or assigning them to the `._get`, `._set`, and `._delete` attributes respectivley. These functions will be passed the parent `OverloadedDict` and the key as their first two arguments. The get and delete methods will be called by the parent with no additional arguments, while the setter will be passed the value to set. Note that the parent is not set on instantiation. It's currently assumed that's added when the parent is constructed. Attrs ----- key Key in parent dict to overload parent The parent OverloadedDict this key is attached to. """ def __init__( self, key, get: Optional[Callable] = None, set: Optional[Callable] = None, delete: Optional[Callable] = None, ): self.key = key if get is not None: self._get = get if set is not None: self._set = set if delete is not None: self._delete = delete @staticmethod def _get(parent, key): """Default key getter.""" return parent.data[key] @staticmethod def _set(parent, key, value): parent.data[key] = value @staticmethod def _delete(parent, key): del parent.data[key] @property def get(self): return partial(self._get, self.parent, self.key) @property def set(self): return partial(self._set, self.parent, self.key) @property def delete(self): return partial(self._delete, self.parent, self.key) class OverloadedDict(MutableMapping): """A mapping where some of the keys have been overloaded. Each overloaded key should be defined as an KeyOverload instance, and can have specific getter, settter, and deleter methods. Additionally, overloaded keys don't show up in iteration or from `__contains__` calls unless they exist in `.data`. Attrs ----- data Wrapped mapping. overloaded Maps from keys to overloaded behaviours. """ data: Mapping overloaded: Mapping[Any, KeyOverload] def __init__(self, data: Mapping, *, overloaded: Mapping[Any, KeyOverload]): self.data = data self.overloaded = overloaded for v in overloaded.values(): v.parent = proxy(self) def __getitem__(self, key): if key in self.overloaded: return self.overloaded[key].get() else: return self.data[key] def __setitem__(self, key, value): if key in self.overloaded: self.overloaded[key].set(value) else: self.data[key] = value def __delitem__(self, key): if key in self.overloaded: self.overloaded[key].delete() else: del self.data[key] def __contains__(self, key): return key in self.data def __iter__(self): return iter(self.data) def __len__(self): return len(self.data) def __repr__(self): return ( f"OverloadedDict, wrapping:\n\t{self.data!r}\nWith overloaded keys:" f"\n\t{list(self.overloaded.keys())}." 
) def copy(self) -> dict: return self.data.copy() def keys(self): return self.data.keys() def _ipython_key_completions_(self) -> List[str]: return list(self.keys()) ####################################### # Handling .uns["neighbors"] ####################################### def _access_warn(key, cur_loc): warn( f"This location for '{key}' is deprecated. It has been moved to {cur_loc}, " "and will not be accesible here in a future version of anndata.", FutureWarning, stacklevel=4, ) def _adjacency_getter(ovld: OverloadedDict, key, adata: "AnnData"): """For overloading: >>> mtx = adata.uns["neighbors"]["connectivities"] # doctest: +SKIP >>> mtx = adata.uns["neighbors"]["distances"] # doctest: +SKIP """ _access_warn(key, f".obsp[{key}]") return adata.obsp[key] def _adjacency_setter(ovld: OverloadedDict, key, value, adata: "AnnData"): """For overloading: >>> adata.uns["neighbors"]["connectivities"] = mtx # doctest: +SKIP >>> adata.uns["neighbors"]["distances"] = mtx # doctest: +SKIP """ _access_warn(key, f".obsp[{key}]") adata.obsp[key] = value def _neighbors_setter(ovld: OverloadedDict, key, neighbors: Mapping, adata: "AnnData"): """For overloading: `adata.uns["neighbors"] = d`.""" for k in ("distances", "connectivities"): if k in neighbors: _access_warn(k, f".obsp[{k}]") adata.obsp[k] = neighbors.pop(k) ovld.data[key] = neighbors def _neighbors_getter(ovld: OverloadedDict, key, adata: "AnnData"): """For overloading: `adata.uns["neighbors"]`""" return OverloadedDict( ovld.data[key], overloaded={ "connectivities": KeyOverload( "connectivities", get=partial(_adjacency_getter, adata=adata), set=partial(_adjacency_setter, adata=adata), ), "distances": KeyOverload( "distances", get=partial(_adjacency_getter, adata=adata), set=partial(_adjacency_setter, adata=adata), ), }, ) def _overloaded_uns(adata: "AnnData", uns: Union[dict, "DictView"]) -> OverloadedDict: return OverloadedDict( uns, overloaded={ "neighbors": KeyOverload( "neighbors", get=partial(_neighbors_getter, adata=adata), set=partial(_neighbors_setter, adata=adata), ), }, ) anndata-0.7.8/anndata/core.py000066400000000000000000000002151414255741200160470ustar00rootroot00000000000000from warnings import warn warn("Please only import from anndata, not anndata.core", DeprecationWarning) from ._core import * # noqa: F403 anndata-0.7.8/anndata/experimental/000077500000000000000000000000001414255741200172445ustar00rootroot00000000000000anndata-0.7.8/anndata/experimental/__init__.py000066400000000000000000000001061414255741200213520ustar00rootroot00000000000000from .multi_files import AnnCollection from .pytorch import AnnLoader anndata-0.7.8/anndata/experimental/multi_files/000077500000000000000000000000001414255741200215605ustar00rootroot00000000000000anndata-0.7.8/anndata/experimental/multi_files/__init__.py000066400000000000000000000000521414255741200236660ustar00rootroot00000000000000from ._anncollection import AnnCollection anndata-0.7.8/anndata/experimental/multi_files/_anncollection.py000066400000000000000000001030711414255741200251230ustar00rootroot00000000000000from collections.abc import Mapping from functools import reduce from h5py import Dataset import numpy as np import pandas as pd from typing import Dict, Union, Optional, Sequence, Callable from ...compat import Literal from ..._core.anndata import AnnData from ..._core.index import _normalize_indices, _normalize_index, Index from ..._core.views import _resolve_idx from ..._core.merge import concat_arrays, inner_concat_aligned_mapping from ..._core.sparse_dataset import 
SparseDataset from ..._core.aligned_mapping import AxisArrays from ...logging import anndata_logger as logger ATTRS = ["obs", "obsm", "layers"] def _merge(arrs): rxers = [lambda x, fill_value, axis: x] * len(arrs) return concat_arrays(arrs, rxers) def _select_convert(key, convert, arr=None): key_convert = None if callable(convert): key_convert = convert elif isinstance(convert, dict) and key in convert: key_convert = convert[key] if arr is not None: return key_convert(arr) if key_convert is not None else arr else: return key_convert def _harmonize_types(attrs_keys, adatas): attrs_keys_types = {} def check_type(attr, key=None): arrs = [] for a in adatas: attr_arr = getattr(a, attr) if key is not None: attr_arr = attr_arr[key] arrs.append(attr_arr) # hacky but numpy find_common_type doesn't work with categoricals try: dtype = _merge([arr[:1] for arr in arrs]).dtype except ValueError: dtype = _merge([arr[:1, :1] for arr in arrs]).dtype return dtype for attr, keys in attrs_keys.items(): if len(keys) == 0: continue attrs_keys_types[attr] = {} for key in keys: attrs_keys_types[attr][key] = check_type(attr, key) attrs_keys_types["X"] = check_type("X") return attrs_keys_types class _ConcatViewMixin: def _resolve_idx(self, oidx, vidx): adatas_oidx = [] reverse = None old_oidx = getattr(self, "oidx", None) if old_oidx is not None: oidx = _resolve_idx(old_oidx, oidx, self.limits[-1]) if isinstance(oidx, slice): start, stop, step = oidx.indices(self.limits[-1]) oidx = np.arange(start, stop, step) else: oidx = np.array([oidx]) if isinstance(oidx, int) else oidx u_oidx = oidx if len(self.adatas) == 1: return [u_oidx], oidx, vidx, reverse iter_limits = list(zip([0] + self.limits, self.limits)) n_adatas_used = 0 for lower, upper in iter_limits: if np.any((u_oidx >= lower) & (u_oidx < upper)): n_adatas_used += 1 need_reverse = ( self.indices_strict and n_adatas_used > 1 and u_oidx.size > 1 and np.any(u_oidx[:-1] > u_oidx[1:]) ) if need_reverse: u_oidx, reverse = np.unique(u_oidx, return_inverse=True) for lower, upper in iter_limits: mask = (u_oidx >= lower) & (u_oidx < upper) adatas_oidx.append(u_oidx[mask] - lower if mask.any() else None) old_vidx = getattr(self, "vidx", None) if old_vidx is not None: vidx = _resolve_idx(old_vidx, vidx, self.adatas[0].n_vars) if isinstance(vidx, int): vidx = np.array([vidx]) return adatas_oidx, oidx, vidx, reverse class _IterateViewMixin: def iterate_axis( self, batch_size: int, axis: Literal[0, 1] = 0, shuffle: bool = False, drop_last: bool = False, ): """Iterate the lazy object over an axis. Parameters ---------- batch_size How many samples to put into a batch when iterating. axis The axis to iterate over. shuffle Set to `True` to have the indices reshuffled before iterating. drop_last Set to `True` to drop a batch with the length lower than `batch_size`. 
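        Examples
        --------
        A brief sketch, assuming ``dc`` is an
        :class:`~anndata.experimental.AnnCollection` (or a view of one)::

            for batch, idx in dc.iterate_axis(batch_size=256, shuffle=True):
                X_batch = batch.X  # lazily gathered block of the data matrix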
""" if axis not in (0, 1): raise ValueError("Axis should be either 0 or 1.") n = self.shape[axis] if shuffle: indices = np.random.permutation(n).tolist() else: indices = list(range(n)) for i in range(0, n, batch_size): idx = indices[i : min(i + batch_size, n)] if axis == 1: batch = self[:, idx] else: batch = self[idx] # only happens if the last batch is smaller then batch_size if len(batch) < batch_size and drop_last: continue yield batch, idx class MapObsView: def __init__( self, attr, adatas, keys, adatas_oidx, adatas_vidx=None, convert=None, reverse=None, dtypes=None, obs_names=None, ): self.adatas = adatas self._keys = keys self.adatas_oidx = adatas_oidx self.adatas_vidx = adatas_vidx self.attr = attr self.convert = convert self.reverse = reverse self.dtypes = dtypes self.obs_names = obs_names def __getitem__(self, key, use_convert=True): if self._keys is not None and key not in self._keys: raise KeyError(f"No {key} in {self.attr} view") arrs = [] for i, oidx in enumerate(self.adatas_oidx): if oidx is None: continue arr = getattr(self.adatas[i], self.attr)[key] if self.adatas_vidx is not None: vidx = self.adatas_vidx[i] else: vidx = None if vidx is not None: idx = oidx, vidx else: idx = oidx if isinstance(arr, pd.DataFrame): arrs.append(arr.iloc[idx]) else: if vidx is not None: idx = np.ix_(*idx) if not isinstance(idx[1], slice) else idx arrs.append(arr[idx]) if len(arrs) > 1: _arr = _merge(arrs) _arr = _arr if self.reverse is None else _arr[self.reverse] else: _arr = arrs[0] # what if it is a dataframe? if self.dtypes is not None: _arr = _arr.astype(self.dtypes[key], copy=False) if self.convert is not None and use_convert: _arr = _select_convert(key, self.convert, _arr) return _arr def keys(self): if self._keys is not None: return self._keys else: return list(getattr(self.adatas[0], self.attr).keys()) def to_dict(self, keys=None, use_convert=True): dct = {} keys = self.keys() if keys is None else keys for key in keys: dct[key] = self.__getitem__(key, use_convert) return dct @property def df(self): if self.attr != "obs": return None return pd.DataFrame(self.to_dict(use_convert=False), index=self.obs_names) def __repr__(self): descr = f"View of {self.attr} with keys: {str(self.keys())[1:-1]}" return descr class AnnCollectionView(_ConcatViewMixin, _IterateViewMixin): """\ An object to access the observation attributes of `adatas` in AnnCollection. Created as a result of subsetting an :class:`~anndata.experimental.AnnCollection` object. An object of this class can have `.obs`, `.obsm`, `.layers`, `.X` depending on the results of joins in the reference AnnCollection object. Notes ----- Nothing is copied until keys of the attributes or `.X` are accessed. 
""" def __init__(self, reference, convert, resolved_idx): self.reference = reference self.indices_strict = self.reference.indices_strict self.adatas = self.reference.adatas self.limits = self.reference.limits self.adatas_oidx, self.oidx, self.vidx, self.reverse = resolved_idx self.adatas_vidx = [] for i, vidx in enumerate(self.reference.adatas_vidx): if vidx is None: self.adatas_vidx.append(self.vidx) else: new_vidx = _resolve_idx(vidx, self.vidx, self.adatas[i].n_vars) self.adatas_vidx.append(new_vidx) self._view_attrs_keys = self.reference._view_attrs_keys self._attrs = self.reference._attrs self._dtypes = self.reference._dtypes self._layers_view, self._obsm_view, self._obs_view = None, None, None self._X = None self._convert = None self._convert_X = None self.convert = convert def _lazy_init_attr(self, attr, set_vidx=False): if getattr(self, f"_{attr}_view") is not None: return keys = None attr_dtypes = None if attr in self._view_attrs_keys: reverse = self.reverse keys = self._view_attrs_keys[attr] if len(keys) == 0: return adatas = self.adatas adatas_oidx = self.adatas_oidx if self._dtypes is not None: attr_dtypes = self._dtypes[attr] else: reverse = None adatas = [self.reference] adatas_oidx = [self.oidx] adatas_vidx = self.adatas_vidx if set_vidx else None attr_convert = None if self.convert is not None: attr_convert = _select_convert(attr, self.convert) if attr == "obs": obs_names = self.obs_names else: obs_names = None setattr( self, f"_{attr}_view", MapObsView( attr, adatas, keys, adatas_oidx, adatas_vidx, attr_convert, reverse, attr_dtypes, obs_names, ), ) def _gather_X(self): if self._X is not None: return self._X Xs = [] for i, oidx in enumerate(self.adatas_oidx): if oidx is None: continue adata = self.adatas[i] X = adata.X vidx = self.adatas_vidx[i] if isinstance(X, Dataset): reverse = None if oidx.size > 1 and np.any(oidx[:-1] >= oidx[1:]): oidx, reverse = np.unique(oidx, return_inverse=True) if isinstance(vidx, slice): arr = X[oidx, vidx] else: # this is a very memory inefficient approach # todo: fix arr = X[oidx][:, vidx] Xs.append(arr if reverse is None else arr[reverse]) elif isinstance(X, SparseDataset): # very slow indexing with two arrays if isinstance(vidx, slice) or len(vidx) <= 1000: Xs.append(X[oidx, vidx]) else: Xs.append(X[oidx][:, vidx]) else: # if vidx is present it is less memory efficient idx = oidx, vidx idx = np.ix_(*idx) if not isinstance(vidx, slice) else idx Xs.append(X[idx]) if len(Xs) > 1: _X = _merge(Xs) # todo: get rid of reverse for dense arrays _X = _X if self.reverse is None else _X[self.reverse] else: _X = Xs[0] if self._dtypes is not None: _X = _X.astype(self._dtypes["X"], copy=False) self._X = _X return _X @property def X(self): """Lazy subset of data matrix. The data matrix formed from the `.X` attributes of the underlying `adatas`, properly reindexed and lazily merged. Nothing is copied until `.X` is accessed, no real concatenation of the unerlying `.X` attributes is done. """ # inconsistent behavior here, _X can be changed, # but the other attributes can't be changed. # maybe do return ... _X.copy() or _X.setflags(write=False) _X = self._gather_X() return self._convert_X(_X) if self._convert_X is not None else _X @property def layers(self): """Lazy subset of layers. The layers attribute formed from lazy inner join and subsetting of the `.layers` of the underlying `adatas`. No copy is made until you access a key from `.layers`, only the subset of the accessed key is copied. To get `.layers` as a dictionary, use `.layers.to_dict()`. 
You can also specify keys to include in the dict `.layers.to_dict(keys=['key1', 'key2'])` and if you want converters to be turned off when copying to dict `.layers.to_dict(use_convert=False)`. """ self._lazy_init_attr("layers", set_vidx=True) return self._layers_view @property def obsm(self): """Lazy subset of multi-dimensional annotation of observations. Points to the `.obsm` attributes of the underlying adatas ot to `.obsm` of the parent AnnCollection object depending on the `join_obsm` option of the AnnCollection object. See the docs of :class:`~anndata.experimental.AnnCollection` for details. Copy rules are the same as for `.layers`, i.e. everything is lazy. To get `.obsm` as a dictionary, use `.obsm.to_dict()`. You can also specify keys to include in the dict `.obsm.to_dict(keys=['key1', 'key2'])` and if you want converters to be turned off when copying to dict `.obsm.to_dict(use_convert=False)`. """ self._lazy_init_attr("obsm") return self._obsm_view @property def obs(self): """Lazy suset of one-dimensional annotation of observations. Points to the `.obs` attributes of the underlying adatas ot to `.obs` of the parent AnnCollection object depending on the `join_obs` option of the AnnCollection object. See the docs of `~anndata.experimental.AnnCollection` for details. Copy rules are the same as for `.layers`, i.e. everything is lazy. To get `.obs` as a DataFrame, use `.obs.df`. To get `.obs` as a dictionary, use `.obs.to_dict()`. You can also specify keys to include in the dict `.obs.to_dict(keys=['key1', 'key2'])` and if you want converters to be truned off when copying to dict `.obs.to_dict(use_convert=False)`. """ self._lazy_init_attr("obs") return self._obs_view @property def obs_names(self): """Names of observations of this subset object.""" return self.reference.obs_names[self.oidx] @property def var_names(self): """Names of variables of this subset object.""" return self.reference.var_names[self.vidx] @property def shape(self): """Shape of the lazily concatenated subset of the data matrix.""" return len(self.obs_names), len(self.var_names) @property def convert(self): """On the fly converters for keys of attributes and data matrix. A function or a Mapping of functions which will be applied to the values of attributes (`.X`) or to specific keys of these attributes (`.obs`, `.obsm`, `.layers`). The keys of the the Mapping should correspond to the attributes or keys of the attributes (hierarchically) and the values should be functions used for conversion. 
Examples ---------- :: { 'X': lambda a: a.toarray() if issparse(a) else a, # densify .X 'obsm': lambda a: np.asarray(a, dtype='float32'), # change dtype for all keys of .obsm 'obs': dict(key1 = lambda c: c.astype(str)) # change type only for one key of .obs } """ return self._convert @convert.setter def convert(self, value): self._convert = value self._convert_X = _select_convert("X", self._convert) for attr in ATTRS: setattr(self, f"_{attr}_view", None) def __len__(self): return len(self.obs_names) def __getitem__(self, index: Index): oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names) resolved_idx = self._resolve_idx(oidx, vidx) return AnnCollectionView(self.reference, self.convert, resolved_idx) @property def has_backed(self): """`True` if the current subset of `adatas` has backed objects, `False` otherwise.""" for i, adata in enumerate(self.adatas): if adata.isbacked and self.adatas_oidx[i] is not None: return True return False def __repr__(self): n_obs, n_vars = self.shape descr = f"AnnCollectionView object with n_obs × n_vars = {n_obs} × {n_vars}" all_attrs_keys = self._view_attrs_keys.copy() for attr in self._attrs: all_attrs_keys[attr] = list(getattr(self.reference, attr).keys()) for attr, keys in all_attrs_keys.items(): if len(keys) > 0: descr += f"\n {attr}: {str(keys)[1:-1]}" return descr def to_adata(self, ignore_X: bool = False, ignore_layers: bool = False): """Convert this AnnCollectionView object to an AnnData object. Parameters ---------- ignore_X if `True`, adds `.X` to the AnnData object. ignore_layers if `True`, copies `.layers` to the AnnData object. """ if ignore_layers or self.layers is None: layers = None else: layers = self.layers.to_dict(use_convert=False) obsm = None if self.obsm is None else self.obsm.to_dict(use_convert=False) obs = ( None if self.obs is None else pd.DataFrame(self.obs.to_dict(use_convert=False)) ) if ignore_X: X = None shape = self.shape else: X = self._gather_X() shape = None adata = AnnData(X, obs=obs, obsm=obsm, layers=layers, shape=shape) adata.obs_names = self.obs_names adata.var_names = self.var_names return adata @property def attrs_keys(self): """Dict of all accessible attributes and their keys.""" return self.reference.attrs_keys DictCallable = Dict[str, Callable] ConvertType = Union[Callable, DictCallable, Dict[str, DictCallable]] class AnnCollection(_ConcatViewMixin, _IterateViewMixin): """\ Lazily concatenate AnnData objects along the `obs` axis. This class doesn't copy data from underlying AnnData objects, but lazily subsets using a joint index of observations and variables. It also allows on-the-fly application of prespecified converters to `.obs` attributes of the AnnData objects. Subsetting of this object returns an `AnnCollectionView`, which provides views of `.obs`, `.obsm`, `.layers`, `.X` from the underlying AnnData objects. Parameters ---------- adatas The objects to be lazily concatenated. If a Mapping is passed, keys are used for the `keys` argument and values are concatenated. join_obs If "inner" specified all `.obs` attributes from `adatas` will be inner joined and copied to this object. If "outer" specified all `.obsm` attributes from `adatas` will be outer joined and copied to this object. For "inner" and "outer" subset objects will access `.obs` of this object, not the original `.obs` attributes of `adatas`. If `None`, nothing is copied to this object's `.obs`, a subset object will directly access `.obs` attributes of `adatas` (with proper reindexing and dtype conversions). 
For `None`the inner join rule is used to select columns of `.obs` of `adatas`. join_obsm If "inner" specified all `.obsm` attributes from `adatas` will be inner joined and copied to this object. Subset objects will access `.obsm` of this object, not the original `.obsm` attributes of `adatas`. If `None`, nothing is copied to this object's `.obsm`, a subset object will directly access `.obsm` attributes of `adatas` (with proper reindexing and dtype conversions). For both options the inner join rule for the underlying `.obsm` attributes is used. join_vars Specify how to join `adatas` along the var axis. If `None`, assumes all `adatas` have the same variables. If "inner", the intersection of all variables in `adatas` will be used. label Column in `.obs` to place batch information in. If it's None, no column is added. keys Names for each object being added. These values are used for column values for `label` or appended to the index if `index_unique` is not `None`. Defaults to incrementing integer labels. index_unique Whether to make the index unique by using the keys. If provided, this is the delimeter between "{orig_idx}{index_unique}{key}". When `None`, the original indices are kept. convert You can pass a function or a Mapping of functions which will be applied to the values of attributes (`.obs`, `.obsm`, `.layers`, `.X`) or to specific keys of these attributes in the subset object. Specify an attribute and a key (if needed) as keys of the passed Mapping and a function to be applied as a value. harmonize_dtypes If `True`, all retrieved arrays from subset objects will have the same dtype. indices_strict If `True`, arrays from the subset objects will always have the same order of indices as in selection used to subset. This parameter can be set to `False` if the order in the returned arrays is not important, for example, when using them for stochastic gradient descent. In this case the performance of subsetting can be a bit better. Examples ---------- >>> from scanpy.datasets import pbmc68k_reduced, pbmc3k_processed >>> adata1, adata2 = pbmc68k_reduced(), pbmc3k_processed() >>> adata1.shape (700, 765) >>> adata2.shape (2638, 1838) >>> dc = AnnCollection([adata1, adata2], join_vars='inner') >>> dc AnnCollection object with n_obs × n_vars = 3338 × 208 constructed from 2 AnnData objects view of obsm: 'X_pca', 'X_umap' obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain' >>> batch = dc[100:200] # AnnCollectionView >>> batch AnnCollectionView object with n_obs × n_vars = 100 × 208 obsm: 'X_pca', 'X_umap' obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain' >>> batch.X.shape (100, 208) >>> len(batch.obs['louvain']) 100 """ def __init__( self, adatas: Union[Sequence[AnnData], Dict[str, AnnData]], join_obs: Optional[Literal["inner", "outer"]] = "inner", join_obsm: Optional[Literal["inner"]] = None, join_vars: Optional[Literal["inner"]] = None, label: Optional[str] = None, keys: Optional[Sequence[str]] = None, index_unique: Optional[str] = None, convert: Optional[ConvertType] = None, harmonize_dtypes: bool = True, indices_strict: bool = True, ): if isinstance(adatas, Mapping): if keys is not None: raise TypeError( "Cannot specify categories in both mapping keys and using `keys`. " "Only specify this once." 
) keys, adatas = list(adatas.keys()), list(adatas.values()) else: adatas = list(adatas) # check if the variables are the same in all adatas self.adatas_vidx = [None for adata in adatas] vars_names_list = [adata.var_names for adata in adatas] vars_eq = all([adatas[0].var_names.equals(vrs) for vrs in vars_names_list[1:]]) if vars_eq: self.var_names = adatas[0].var_names elif join_vars == "inner": var_names = reduce(pd.Index.intersection, vars_names_list) self.adatas_vidx = [] for adata in adatas: if var_names.equals(adata.var_names): self.adatas_vidx.append(None) else: adata_vidx = _normalize_index(var_names, adata.var_names) self.adatas_vidx.append(adata_vidx) self.var_names = var_names else: raise ValueError( "Adatas have different variables. " "Please specify join_vars='inner' for intersection." ) concat_indices = pd.concat( [pd.Series(a.obs_names) for a in adatas], ignore_index=True ) if keys is None: keys = np.arange(len(adatas)).astype(str) label_col = pd.Categorical.from_codes( np.repeat(np.arange(len(adatas)), [a.shape[0] for a in adatas]), categories=keys, ) if index_unique is not None: concat_indices = concat_indices.str.cat( label_col.map(str), sep=index_unique ) self.obs_names = pd.Index(concat_indices) if not self.obs_names.is_unique: logger.info("Observation names are not unique.") view_attrs = ATTRS.copy() self._attrs = [] # process obs joins if join_obs is not None: view_attrs.remove("obs") self._attrs.append("obs") concat_annot = pd.concat( [a.obs for a in adatas], join=join_obs, ignore_index=True ) concat_annot.index = self.obs_names self._obs = concat_annot else: self._obs = pd.DataFrame(index=self.obs_names) if label is not None: self._obs[label] = label_col # process obsm inner join self._obsm = None if join_obsm == "inner": view_attrs.remove("obsm") self._attrs.append("obsm") self._obsm = inner_concat_aligned_mapping( [a.obsm for a in adatas], index=self.obs_names ) self._obsm = AxisArrays(self, axis=0) if self._obsm == {} else self._obsm # process inner join of views self._view_attrs_keys = {} for attr in view_attrs: self._view_attrs_keys[attr] = list(getattr(adatas[0], attr).keys()) for a in adatas[1:]: for attr, keys in self._view_attrs_keys.items(): ai_attr = getattr(a, attr) a0_attr = getattr(adatas[0], attr) new_keys = [] for key in keys: if key in ai_attr.keys(): a0_ashape = a0_attr[key].shape ai_ashape = ai_attr[key].shape if ( len(a0_ashape) < 2 or a0_ashape[1] == ai_ashape[1] or attr == "layers" ): new_keys.append(key) self._view_attrs_keys[attr] = new_keys self.adatas = adatas self.limits = [adatas[0].n_obs] for i in range(len(adatas) - 1): self.limits.append(self.limits[i] + adatas[i + 1].n_obs) # init converter self._convert = convert self._dtypes = None if len(adatas) > 1 and harmonize_dtypes: self._dtypes = _harmonize_types(self._view_attrs_keys, self.adatas) self.indices_strict = indices_strict def __getitem__(self, index: Index): oidx, vidx = _normalize_indices(index, self.obs_names, self.var_names) resolved_idx = self._resolve_idx(oidx, vidx) return AnnCollectionView(self, self.convert, resolved_idx) @property def convert(self): """On the fly converters for keys of attributes and data matrix. A function or a Mapping of functions which will be applied to the values of attributes (`.X`) or to specific keys of these attributes (`.obs`, `.obsm`, `.layers`) of subset objects. The converters are not applied to `.obs` and `.obsm` (if present) of this object, only to the attributes of subset objects. 
The keys of the the Mapping should correspond to the attributes or keys of the attributes (hierarchically) and the values should be functions used for conversion. Examples -------- :: { 'X': lambda a: a.toarray() if issparse(a) else a, # densify .X 'obsm': lambda a: np.asarray(a, dtype='float32'), # change dtype for all keys of .obsm 'obs': dict(key1 = lambda c: c.astype(str)) # change type only for one key of .obs } """ return self._convert @convert.setter def convert(self, value): self._convert = value @property def obs(self): """One-dimensional annotation of observations. If `join_obs` was set to "inner" and "outer", subset objects' `.obs` will point to this `.obs`; otherwise, to `.obs` of the underlying objects (`adatas`). """ return self._obs @property def obsm(self): """Multi-dimensional annotation of observations. If `join_obsm` was set to "inner", subset objects' `.obsm` will point to this `.obsm`; otherwise, to `.obsm` of the underlying objects (`adatas`). In the latter case, `.obsm` of this object will be `None`. """ return self._obsm @property def shape(self): """Shape of the lazily concatenated data matrix""" return self.limits[-1], len(self.var_names) @property def n_obs(self): """Number of observations.""" return self.shape[0] @property def n_vars(self): """Number of variables/features.""" return self.shape[1] def __len__(self): return self.limits[-1] def to_adata(self): """Convert this AnnCollection object to an AnnData object. The AnnData object won't have `.X`, only `.obs` and `.obsm`. """ if "obs" in self._view_attrs_keys or "obsm" in self._view_attrs_keys: concat_view = self[self.obs_names] if "obsm" in self._view_attrs_keys: obsm = ( concat_view.obsm.to_dict(use_convert=False) if concat_view.obsm is not None else None ) else: obsm = self.obsm.copy() obs = self.obs.copy() if "obs" in self._view_attrs_keys and concat_view.obs is not None: for key, value in concat_view.obs.to_dict(use_convert=False).items(): obs[key] = value adata = AnnData(X=None, obs=obs, obsm=obsm, shape=self.shape) adata.obs_names = self.obs_names adata.var_names = self.var_names return adata def lazy_attr(self, attr, key=None): """Get a subsettable key from an attribute (array-like) or an attribute. Returns a LazyAttrData object which provides subsetting over the specified attribute (`.obs` or `.obsm`) or over a key from this attribute. In the latter case, it acts as a lazy array. 
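        Examples
        --------
        A minimal sketch (``dc`` is an AnnCollection; the ``'louvain'`` key is
        only an example and assumed to exist in ``.obs``)::

            X = dc.lazy_attr("X")                   # lazy 2d array over the data matrix
            labels = dc.lazy_attr("obs", "louvain")
            first_ten = labels[:10]                 # data is only read on subsetting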
""" return LazyAttrData(self, attr, key) @property def has_backed(self): """`True` if `adatas` have backed AnnData objects, `False` otherwise.""" return any([adata.isbacked for adata in self.adatas]) @property def attrs_keys(self): """Dict of all accessible attributes and their keys.""" _attrs_keys = {} for attr in self._attrs: keys = list(getattr(self, attr).keys()) _attrs_keys[attr] = keys _attrs_keys.update(self._view_attrs_keys) return _attrs_keys def __repr__(self): n_obs, n_vars = self.shape descr = f"AnnCollection object with n_obs × n_vars = {n_obs} × {n_vars}" descr += f"\n constructed from {len(self.adatas)} AnnData objects" for attr, keys in self._view_attrs_keys.items(): if len(keys) > 0: descr += f"\n view of {attr}: {str(keys)[1:-1]}" for attr in self._attrs: keys = list(getattr(self, attr).keys()) if len(keys) > 0: descr += f"\n {attr}: {str(keys)[1:-1]}" if "obs" in self._view_attrs_keys: keys = list(self.obs.keys()) if len(keys) > 0: descr += f"\n own obs: {str(keys)[1:-1]}" return descr class LazyAttrData(_IterateViewMixin): def __init__(self, adset: AnnCollection, attr: str, key: Optional[str] = None): self.adset = adset self.attr = attr self.key = key def __getitem__(self, index): oidx = None vidx = None if isinstance(index, tuple) and self.attr in ("obs", "obsm"): oidx = index[0] if len(index) > 1: vidx = index[1] if oidx is None: view = self.adset[index] else: view = self.adset[oidx] attr_arr = getattr(view, self.attr) if self.key is not None: attr_arr = attr_arr[self.key] return attr_arr if vidx is None else attr_arr[:, vidx] @property def shape(self): shape = self.adset.shape if self.attr in ["X", "layers"]: return shape elif self.attr == "obs": return (shape[0],) elif self.attr == "obsm" and self.key is not None: return shape[0], self[:1].shape[1] else: return None @property def ndim(self): return len(self.shape) if self.shape is not None else 0 @property def dtype(self): _dtypes = self.adset._dtypes if _dtypes is not None and self.attr in _dtypes: return _dtypes[self.attr][self.key] attr = self[:1] if hasattr(attr, dtype): return attr.dtype else: return None anndata-0.7.8/anndata/experimental/pytorch/000077500000000000000000000000001414255741200207345ustar00rootroot00000000000000anndata-0.7.8/anndata/experimental/pytorch/__init__.py000066400000000000000000000000421414255741200230410ustar00rootroot00000000000000from ._annloader import AnnLoader anndata-0.7.8/anndata/experimental/pytorch/_annloader.py000066400000000000000000000154451414255741200234210ustar00rootroot00000000000000from scipy.sparse import issparse from math import ceil from copy import copy from functools import partial from typing import Dict, Union, Sequence import numpy as np import warnings from ..._core.anndata import AnnData from ..multi_files._anncollection import AnnCollection, _ConcatViewMixin try: import torch from torch.utils.data import Sampler, Dataset, DataLoader except ImportError: warnings.warn("Сould not load pytorch.") Sampler, Dataset, DataLoader = object, object, object # Custom sampler to get proper batches instead of joined separate indices # maybe move to multi_files class BatchIndexSampler(Sampler): def __init__(self, n_obs, batch_size, shuffle=False, drop_last=False): self.n_obs = n_obs self.batch_size = batch_size if batch_size < n_obs else n_obs self.shuffle = shuffle self.drop_last = drop_last def __iter__(self): if self.shuffle: indices = np.random.permutation(self.n_obs).tolist() else: indices = list(range(self.n_obs)) for i in range(0, self.n_obs, self.batch_size): batch 
= indices[i : min(i + self.batch_size, self.n_obs)] # only happens if the last batch is smaller then batch_size if len(batch) < self.batch_size and self.drop_last: continue yield batch def __len__(self): if self.drop_last: length = self.n_obs // self.batch_size else: length = ceil(self.n_obs / self.batch_size) return length # maybe replace use_cuda with explicit device option def default_converter(arr, use_cuda, pin_memory): if isinstance(arr, torch.Tensor): if use_cuda: arr = arr.cuda() elif pin_memory: arr = arr.pin_memory() elif arr.dtype.name != "category" and np.issubdtype(arr.dtype, np.number): if issparse(arr): arr = arr.toarray() if use_cuda: arr = torch.tensor(arr, device="cuda") else: arr = torch.tensor(arr) arr = arr.pin_memory() if pin_memory else arr return arr def _convert_on_top(convert, top_convert, attrs_keys): if convert is None: new_convert = top_convert elif callable(convert): def compose_convert(arr): return top_convert(convert(arr)) new_convert = compose_convert else: new_convert = {} for attr in attrs_keys: if attr not in convert: new_convert[attr] = top_convert else: if isinstance(attrs_keys, list): as_ks = None else: as_ks = attrs_keys[attr] new_convert[attr] = _convert_on_top(convert[attr], top_convert, as_ks) return new_convert # AnnLoader has the same arguments as DataLoader, but uses BatchIndexSampler by default class AnnLoader(DataLoader): """\ PyTorch DataLoader for AnnData objects. Builds DataLoader from a sequence of AnnData objects, from an :class:`~anndata.experimental.AnnCollection` object or from an `AnnCollectionView` object. Takes care of the required conversions. Parameters ---------- adatas `AnnData` objects or an `AnnCollection` object from which to load the data. batch_size How many samples per batch to load. shuffle Set to `True` to have the data reshuffled at every epoch. use_default_converter Use the default converter to convert arrays to pytorch tensors, transfer to the default cuda device (if `use_cuda=True`), do memory pinning (if `pin_memory=True`). If you pass an AnnCollection object with prespecified converters, the default converter won't overwrite these converters but will be applied on top of them. use_cuda Transfer pytorch tensors to the default cuda device after conversion. Only works if `use_default_converter=True` **kwargs Arguments for PyTorch DataLoader. If `adatas` is not an `AnnCollection` object, then also arguments for `AnnCollection` initialization. 
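    Examples
    --------
    A minimal sketch of the intended use (variable names are illustrative)::

        dataloader = AnnLoader(adatas, batch_size=128, shuffle=True)
        for batch in dataloader:
            x = batch.X  # converted to a torch.Tensor by the default converter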
""" def __init__( self, adatas: Union[Sequence[AnnData], Dict[str, AnnData]], batch_size: int = 1, shuffle: bool = False, use_default_converter: bool = True, use_cuda: bool = False, **kwargs, ): if isinstance(adatas, AnnData): adatas = [adatas] if ( isinstance(adatas, list) or isinstance(adatas, tuple) or isinstance(adatas, dict) ): join_obs = kwargs.pop("join_obs", "inner") join_obsm = kwargs.pop("join_obsm", None) label = kwargs.pop("label", None) keys = kwargs.pop("keys", None) index_unique = kwargs.pop("index_unique", None) convert = kwargs.pop("convert", None) harmonize_dtypes = kwargs.pop("harmonize_dtypes", True) indices_strict = kwargs.pop("indices_strict", True) dataset = AnnCollection( adatas, join_obs=join_obs, join_obsm=join_obsm, label=label, keys=keys, index_unique=index_unique, convert=convert, harmonize_dtypes=harmonize_dtypes, indices_strict=indices_strict, ) elif isinstance(adatas, _ConcatViewMixin): dataset = copy(adatas) else: raise ValueError("adata should be of type AnnData or AnnCollection.") if use_default_converter: pin_memory = kwargs.pop("pin_memory", False) _converter = partial( default_converter, use_cuda=use_cuda, pin_memory=pin_memory ) dataset.convert = _convert_on_top( dataset.convert, _converter, dict(dataset.attrs_keys, X=[]) ) has_sampler = "sampler" in kwargs has_batch_sampler = "batch_sampler" in kwargs has_worker_init_fn = ( "worker_init_fn" in kwargs and kwargs["worker_init_fn"] is not None ) has_workers = "num_workers" in kwargs and kwargs["num_workers"] > 0 use_parallel = has_worker_init_fn or has_workers if ( batch_size is not None and batch_size > 1 and not has_sampler and not has_batch_sampler and not use_parallel ): drop_last = kwargs.pop("drop_last", False) default_sampler = BatchIndexSampler( len(dataset), batch_size, shuffle, drop_last ) super().__init__( dataset, batch_size=None, sampler=default_sampler, **kwargs ) else: super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs) anndata-0.7.8/anndata/logging.py000066400000000000000000000026671414255741200165620ustar00rootroot00000000000000import os import logging _previous_memory_usage = None anndata_logger = logging.getLogger("anndata") # Don’t pass log messages on to logging.root and its handler anndata_logger.propagate = False anndata_logger.setLevel("INFO") anndata_logger.addHandler(logging.StreamHandler()) # Logs go to stderr anndata_logger.handlers[-1].setFormatter(logging.Formatter("%(message)s")) anndata_logger.handlers[-1].setLevel("INFO") def get_logger(name): """\ Creates a child logger that delegates to anndata_logger instead to logging.root """ return anndata_logger.manager.getLogger(name) def get_memory_usage(): import psutil process = psutil.Process(os.getpid()) try: meminfo = process.memory_info() except AttributeError: meminfo = process.get_memory_info() mem = meminfo[0] / 2 ** 30 # output in GB mem_diff = mem global _previous_memory_usage if _previous_memory_usage is not None: mem_diff = mem - _previous_memory_usage _previous_memory_usage = mem return mem, mem_diff def format_memory_usage(mem_usage, msg="", newline=False): newline = "\n" if newline else "" more = " \n... 
" if msg != "" else "" mem, diff = mem_usage return ( f"{newline}{msg}{more}" f"Memory usage: current {mem:.2f} GB, difference {diff:+.2f} GB" ) def print_memory_usage(msg="", newline=False): print(format_memory_usage(get_memory_usage(), msg, newline)) anndata-0.7.8/anndata/readwrite.py000066400000000000000000000002201414255741200171010ustar00rootroot00000000000000from warnings import warn warn("Please only import from anndata, not anndata.readwrite", DeprecationWarning) from ._io import * # noqa: F403 anndata-0.7.8/anndata/tests/000077500000000000000000000000001414255741200157115ustar00rootroot00000000000000anndata-0.7.8/anndata/tests/adata-comments.tsv000066400000000000000000000001451414255741200213440ustar00rootroot00000000000000# A regular comment # The next comment is actually colnames # c1 c2 r1 1.0 0.0 r2 3.0 0.0 r3 5.0 6.0 anndata-0.7.8/anndata/tests/adata.csv000066400000000000000000000000471414255741200175010ustar00rootroot00000000000000,c1,c2 r1,1.0,0.0 r2,3.0,0.0 r3,5.0,6.0anndata-0.7.8/anndata/tests/conftest.py000066400000000000000000000001361414255741200201100ustar00rootroot00000000000000import pytest @pytest.fixture def backing_h5ad(tmp_path): return tmp_path / "test.h5ad" anndata-0.7.8/anndata/tests/data/000077500000000000000000000000001414255741200166225ustar00rootroot00000000000000anndata-0.7.8/anndata/tests/data/excel.xlsx000066400000000000000000000211601414255741200206420ustar00rootroot00000000000000PK!bh^[Content_Types].xml (N0EHC-Jܲ@5*Q>ēƪc[iiBj7{2hnmƻR U^7/%rZY@1__fqR4DAJh>Vƹ Z9NV8ʩji){^-I"{v^P!XS)bRrKs(3`c07M4ZƐk+|\|z(P6h_-[@!Pk2n}?L %ddN"m,ǞDO97*~ɸ8Oc|nEB!$};{[2PK!U0#L _rels/.rels (MO0 HݐBKwAH!T~I$ݿ'TG~xl/_rels/workbook.xml.rels (RMK0 0wvt/"Uɴ)&!3~*]XK/oyv5+zl;obG s>,8(%"D҆4j0u2jsMY˴S쭂 )fCy I< y!+EfMyk K5=|t G)s墙UtB),fPK! Uaxl/workbook.xmlUn8}_`A"R7[B²dl"H%*""ZҖ4"Mkؤ9C9CwsϤ_ rRT]wi_VlBqn+ĝZF!ƱQޣB*߂!ꚗ,ر^@$kლк-pwb[,(r2텤ޓK#: \x)>h@U{HuކzsS+U|Še4ҲZI!yDN|~\37Lo&`spSO1KgA$NKꐳ/8'E @1ܐYnohFumȚwQs>ͨMujz2ñN/z/MeO͍-lhM[PK!  xl/styles.xmlUmo0>iIJ$(Ti&$XKd6H>{^R=3k"ĈRW\m2fYGUEV,fUUjA-c:,Ij/tH`k66z')HS")WGXO@$5&(lk.;tXrqQеm4%jQkA:8F[] %yɞҝ9 _%$ޚ"Ma{ˇYTr`U#p+OW,1&yZj rP:P.E%o\S׆{cM%uI[<l.`]L A# zCB8fThXUA[跷789.`:<v᛭uvZ-O+N7ZQu;z HdB~Hd!max^=^c{忇Em}w'uFi7~W`7To sF@tzDžjrόNSPb5 w:yU|' >vDw`{g]/o77Mp9 &, r$jU8>\0A & +`!فٖѦ;smAqFdJglzEūdyɈ{('',LpuձBc+ H+AbPK!N xl/theme/theme1.xmlY͋7? sw5%l$dQV32%9R(Bo=@ $'#$lJZv G~ztҽzG ’_P=ؘ$Ӗk8(4|OHe n ,K۟~rmDlI9*f8&H#ޘ+R#^bP{}2!# J{O1B (W%òBR!a1;{(~h%/V&DYCn2L`|Xsj Z{_\Zҧh4:na PաWU_]נT E A)>\Çfgנ_[K^PkPDIr.jwd A)Q RSLX"7Z2>R$I O(9%o&`T) JU>#02]`XRxbL+7 /={=_*Kn%SSՏ__7'Ŀ˗:/}}O!c&a?0BĒ@v^[ uXsXa3W"`J+U`ek)r+emgoqx(ߤDJ]8TzM5)0IYgz|]p+~o`_=|j QkekZAj|&O3!ŻBw}ь0Q'j"5,ܔ#-q&?'2ڏ ZCeLTx3&cu+ЭNxNg x)\CJZ=ޭ~TwY(aLfQuQ_B^g^ٙXtXPꗡZFq 0mxEAAfc ΙFz3Pb/3 tSٺqyjuiE-#t00,;͖Yƺ2Obr3kE"'&&S;nj*#4kx#[SvInwaD:\N1{-_- 4m+W>Z@+qt;x2#iQNSp$½:7XX/+r1w`h׼9#:Pvd5O+Oٚ.<O7sig*t; CԲ*nN-rk.yJ}0-2MYNÊQ۴3, O6muF8='?ȝZu@,JܼfwTz}vLm'U16!H#HEw &rcv"Ҵi% (r|R%СQ1)nCVhBȚjʽZ 4Օ9N`ה7w-(8LC M$TT#*ybWSthgL-ZxKgJFHgקztWjΤPst{ڦlt&׷%W+mHr^o4 F3dxyL~nr,],]l.N<'$QMW"&f>'u64} s'ē>⒃G4?*&5&WWX+j.П 6s]bI|qr(_#}q5Mr%02>2iIq[ԼKn`1#M; {;&NdS<\+KZ]*Aa BIt1!)ޤl؛an3K:B7( ݄|s7tx7%rBww?Y/v{(d[gc-Zֻ%vBkQC̱B=LLBPK!a5ս(xl/sharedStrings.xmltj0 {a`t_PF]Js56r)c{e1$d%w8UrЬ (/'P,+Ode;"Nk":"-KJ%A8a9"Jɺ5fKH3-uO{[GjV.M}lМͯSPK!ύL{docProps/core.xml (]K0C}~8?Bہʮ n(ޅlD{v)x>9%|'Z(bb )zQ`UZA`Ѽ*~@e'7aiJ,{rċI&#w)PK!^YdocProps/app.xml (Mo0  9mQ bHWbvgNc$Ovڍ/^>+zLdrQ J<.?| .xIOjB*2ǕdZs i4}0ozWey+k/PL״fࣗ1f`ίO֤@ - :%29hޒ.jk: 8B%? 
aXl"z^h8쯼+Q=$ 3 1v8!RȤdL1k籽Qs`09βCl ?sap4s7>9O{wy^TN>cdrɺ]wc8vQ^_g5%?ZPK-!bh^[Content_Types].xmlPK-!U0#L _rels/.relsPK-!>xl/_rels/workbook.xml.relsPK-! Uaxl/workbook.xmlPK-!  } xl/styles.xmlPK-!N yxl/theme/theme1.xmlPK-!HSόt\xl/worksheets/sheet1.xmlPK-!a5ս(xl/sharedStrings.xmlPK-!ύL{docProps/core.xmlPK-!^YdocProps/app.xmlPK anndata-0.7.8/anndata/tests/helpers.py000066400000000000000000000307771414255741200177430ustar00rootroot00000000000000from functools import singledispatch, wraps from string import ascii_letters from typing import Tuple from collections.abc import Mapping import warnings import h5py import numpy as np import pandas as pd from pandas.api.types import is_numeric_dtype import pytest from scipy import sparse from anndata import AnnData, Raw from anndata._core.views import ArrayView from anndata._core.sparse_dataset import SparseDataset from anndata._core.aligned_mapping import AlignedMapping from anndata.utils import asarray def gen_vstr_recarray(m, n, dtype=None): size = m * n lengths = np.random.randint(3, 5, size) letters = np.array(list(ascii_letters)) gen_word = lambda l: "".join(np.random.choice(letters, l)) arr = np.array([gen_word(l) for l in lengths]).reshape(m, n) return pd.DataFrame(arr, columns=[gen_word(5) for i in range(n)]).to_records( index=False, column_dtypes=dtype ) def gen_typed_df(n, index=None): # TODO: Think about allowing index to be passed for n letters = np.fromiter(iter(ascii_letters), "U1") if n > len(letters): letters = letters[: n // 2] # Make sure categories are repeated return pd.DataFrame( dict( cat=pd.Categorical(np.random.choice(letters, n)), cat_ordered=pd.Categorical(np.random.choice(letters, n), ordered=True), int64=np.random.randint(-50, 50, n), float64=np.random.random(n), uint8=np.random.randint(255, size=n, dtype="uint8"), ), index=index, ) def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: s = 0 df = pd.DataFrame() new_vals = gen_typed_df(m) while s < (n / new_vals.shape[1]): new_vals = gen_typed_df(m, index=index) new_vals.columns = new_vals.columns + "_" + str(s) df[new_vals.columns] = new_vals s += 1 df = df.iloc[:m, :n].copy() if columns is not None: df.columns = columns return df # TODO: Use hypothesis for this? def gen_adata( shape: Tuple[int, int], X_type=sparse.csr_matrix, X_dtype=np.float32, # obs_dtypes, # var_dtypes, obsm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), varm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), ) -> AnnData: """\ Helper function to generate a random AnnData for testing purposes. Note: For `obsm_types`, `varm_types`, and `layers_types` these currently just filter already created objects. In future, these should choose which objects are created. Params ------ shape What shape you want the anndata to be. X_type What kind of container should `X` be? This will be called on a randomly generated 2d array. X_dtype What should the dtype of the `.X` container be? obsm_types What kinds of containers should be in `.obsm`? varm_types What kinds of containers should be in `.varm`? layers_types What kinds of containers should be in `.layers`? 
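    Example
    -------
    A minimal usage sketch; the shapes and argument values below are
    illustrative and not part of the original docstring:

    >>> adata = gen_adata((10, 5))  # sparse csr X with dtype float32 by default
    >>> adata = gen_adata((10, 5), X_type=np.asarray, X_dtype=np.float64)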
""" M, N = shape obs_names = pd.Index(f"cell{i}" for i in range(shape[0])) var_names = pd.Index(f"gene{i}" for i in range(shape[1])) obs = gen_typed_df(M, obs_names) var = gen_typed_df(N, var_names) # For #147 obs.rename(columns=dict(cat="obs_cat"), inplace=True) var.rename(columns=dict(cat="var_cat"), inplace=True) if X_type is None: X = None else: X = X_type(np.random.binomial(100, 0.005, (M, N)).astype(X_dtype)) obsm = dict( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} varm = dict( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} layers = dict( array=np.random.random((M, N)), sparse=sparse.random(M, N, format="csr") ) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict( array=np.random.random((M, M)), sparse=sparse.random(M, M, format="csr") ) varp = dict( array=np.random.random((N, N)), sparse=sparse.random(N, N, format="csr") ) uns = dict( O_recarray=gen_vstr_recarray(N, 5), nested=dict( scalar_str="str", scalar_int=42, scalar_float=3.0, nested_further=dict(array=np.arange(5)), ), # U_recarray=gen_vstr_recarray(N, 5, "U4") ) adata = AnnData( X=X, obs=obs, var=var, obsm=obsm, varm=varm, layers=layers, obsp=obsp, varp=varp, dtype=X_dtype, uns=uns, ) return adata def array_bool_subset(index, min_size=2): b = np.zeros(len(index), dtype=bool) selected = np.random.choice( range(len(index)), size=np.random.randint(min_size, len(index), ()), replace=False, ) b[selected] = True return b def matrix_bool_subset(index, min_size=2): with warnings.catch_warnings(): warnings.simplefilter("ignore", PendingDeprecationWarning) indexer = np.matrix( array_bool_subset(index, min_size=min_size).reshape(len(index), 1) ) return indexer def spmatrix_bool_subset(index, min_size=2): return sparse.csr_matrix( array_bool_subset(index, min_size=min_size).reshape(len(index), 1) ) def array_subset(index, min_size=2): if len(index) < min_size: raise ValueError( f"min_size (={min_size}) must be smaller than len(index) (={len(index)}" ) return np.random.choice( index, size=np.random.randint(min_size, len(index), ()), replace=False ) def array_int_subset(index, min_size=2): if len(index) < min_size: raise ValueError( f"min_size (={min_size}) must be smaller than len(index) (={len(index)}" ) return np.random.choice( np.arange(len(index)), size=np.random.randint(min_size, len(index), ()), replace=False, ) def slice_subset(index, min_size=2): while True: points = np.random.choice(np.arange(len(index) + 1), size=2, replace=False) s = slice(*sorted(points)) if len(range(*s.indices(len(index)))) >= min_size: break return s def single_subset(index): return index[np.random.randint(0, len(index), size=())] @pytest.fixture( params=[ array_subset, slice_subset, single_subset, array_int_subset, array_bool_subset, matrix_bool_subset, spmatrix_bool_subset, ] ) def subset_func(request): return request.param ################### # Checking equality ################### def format_msg(elem_name): if elem_name is not None: return f"Error raised from element {elem_name!r}." 
else: return "" # TODO: it would be better to modify the other exception def report_name(func): """Report name of element being tested if test fails.""" @wraps(func) def func_wrapper(*args, _elem_name=None, **kwargs): try: return func(*args, **kwargs) except Exception as e: if _elem_name is not None and not hasattr(e, "_name_attached"): msg = format_msg(_elem_name) args = list(e.args) if len(args) == 0: args = [msg] else: args[0] = f"{args[0]}\n\n{msg}" e.args = tuple(args) e._name_attached = True raise e return func_wrapper @report_name def _assert_equal(a, b): """Allows reporting elem name for simple assertion.""" assert a == b @singledispatch def assert_equal(a, b, exact=False, elem_name=None): _assert_equal(a, b, _elem_name=elem_name) @assert_equal.register(np.ndarray) def assert_equal_ndarray(a, b, exact=False, elem_name=None): b = asarray(b) if not exact and is_numeric_dtype(a) and is_numeric_dtype(b): assert a.shape == b.shape, format_msg(elem_name) assert np.allclose(a, b, equal_nan=True), format_msg(elem_name) elif ( # Structured dtype not exact and hasattr(a, "dtype") and hasattr(b, "dtype") and len(a.dtype) > 1 and len(b.dtype) > 0 ): assert_equal(pd.DataFrame(a), pd.DataFrame(b), exact, elem_name) else: assert np.all(a == b), format_msg(elem_name) @assert_equal.register(ArrayView) def assert_equal_arrayview(a, b, exact=False, elem_name=None): assert_equal(asarray(a), asarray(b), exact=exact, elem_name=elem_name) @assert_equal.register(SparseDataset) @assert_equal.register(sparse.spmatrix) def assert_equal_sparse(a, b, exact=False, elem_name=None): a = asarray(a) assert_equal(b, a, exact, elem_name=elem_name) @assert_equal.register(h5py.Dataset) def assert_equal_h5py_dataset(a, b, exact=False, elem_name=None): a = asarray(a) assert_equal(b, a, exact, elem_name=elem_name) @assert_equal.register(pd.DataFrame) def are_equal_dataframe(a, b, exact=False, elem_name=None): if not isinstance(b, pd.DataFrame): assert_equal(b, a, exact, elem_name) # , a.values maybe? 
report_name(pd.testing.assert_frame_equal)( a, b, check_index_type=exact, check_exact=exact, _elem_name=elem_name, check_frame_type=False, ) @assert_equal.register(Mapping) def assert_equal_mapping(a, b, exact=False, elem_name=None): assert set(a.keys()) == set(b.keys()), format_msg(elem_name) for k in a.keys(): if elem_name is None: elem_name = "" assert_equal(a[k], b[k], exact, f"{elem_name}/{k}") @assert_equal.register(AlignedMapping) def assert_equal_aligned_mapping(a, b, exact=False, elem_name=None): a_indices = (a.parent.obs_names, a.parent.var_names) b_indices = (b.parent.obs_names, b.parent.var_names) for axis_idx in a.axes: assert_equal( a_indices[axis_idx], b_indices[axis_idx], exact=exact, elem_name=axis_idx ) assert a.attrname == b.attrname, format_msg(elem_name) assert_equal_mapping(a, b, exact=exact, elem_name=elem_name) @assert_equal.register(pd.Index) def assert_equal_index(a, b, exact=False, elem_name=None): if not exact: report_name(pd.testing.assert_index_equal)( a, b, check_names=False, check_categorical=False, _elem_name=elem_name ) else: report_name(pd.testing.assert_index_equal)(a, b, _elem_name=elem_name) @assert_equal.register(Raw) def assert_equal_raw(a, b, exact=False, elem_name=None): def assert_is_not_none(x): # can't put an assert in a lambda assert x is not None report_name(assert_is_not_none)(b, _elem_name=elem_name) for attr in ["X", "var", "varm", "obs_names"]: assert_equal( getattr(a, attr), getattr(b, attr), exact=exact, elem_name=f"{elem_name}/{attr}", ) @assert_equal.register(AnnData) def assert_adata_equal(a: AnnData, b: AnnData, exact: bool = False): """\ Check whether two AnnData objects are equivalent, raising an AssertionError if they aren’t. Params ------ a b exact Whether comparisons should be exact or not. This has a somewhat flexible meaning and should probably get refined in the future. 
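    Example
    -------
    A minimal usage sketch; the generated object below is illustrative:

    >>> a = gen_adata((10, 10))
    >>> assert_adata_equal(a, a.copy())  # equivalent objects pass silently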
""" # There may be issues comparing views, since np.allclose # can modify ArrayViews if they contain `nan`s assert_equal(a.obs_names, b.obs_names, exact, elem_name="obs_names") assert_equal(a.var_names, b.var_names, exact, elem_name="var_names") if not exact: # Reorder all elements if neccesary idx = [slice(None), slice(None)] # Since it’s a pain to compare a list of pandas objects change_flag = False if not np.all(a.obs_names == b.obs_names): idx[0] = a.obs_names change_flag = True if not np.all(a.var_names == b.var_names): idx[1] = a.var_names change_flag = True if change_flag: b = b[tuple(idx)].copy() for attr in [ "X", "obs", "var", "obsm", "varm", "layers", "uns", "obsp", "varp", "raw", ]: assert_equal( getattr(a, attr), getattr(b, attr), exact, elem_name=attr, ) anndata-0.7.8/anndata/tests/test_anncollection.py000066400000000000000000000053311414255741200221540ustar00rootroot00000000000000import pytest import anndata as ad import numpy as np from scipy.sparse import csr_matrix, issparse from sklearn.preprocessing import LabelEncoder from anndata.experimental.multi_files import AnnCollection _dense = lambda a: a.toarray() if issparse(a) else a @pytest.fixture def adatas(request): adata1 = ad.AnnData( X=request.param([[1, 2, 0], [4, 5, 0], [7, 8, 0]]), dtype="float32" ) adata1.obs["a_test"] = ["a", "a", "b"] adata1.obsm["o_test"] = np.ones((adata1.n_obs, 2)) adata2 = ad.AnnData(X=request.param([[1, 3, 0], [9, 8, 0]]), dtype="float32") adata2.obs["a_test"] = ["c", "c"] adata2.obsm["o_test"] = np.zeros((adata2.n_obs, 2)) return adata1, adata2 @pytest.mark.parametrize("adatas", [np.array, csr_matrix], indirect=True) def test_full_selection(adatas): dat = AnnCollection(adatas, index_unique="_") adt_concat = ad.concat(adatas, index_unique="_") # sorted selection from one adata dat_view = dat[:2, :2] for adata in (adatas[0], adt_concat): adt_view = adata[:2, :2] np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X)) np.testing.assert_allclose(dat_view.obsm["o_test"], adt_view.obsm["o_test"]) np.testing.assert_array_equal(dat_view.obs["a_test"], adt_view.obs["a_test"]) # sorted and unsorted selection from 2 adatas rand_idxs = np.random.choice(dat.shape[0], 4, replace=False) for select in (slice(2, 5), [4, 2, 3], rand_idxs): dat_view = dat[select, :2] adt_view = adt_concat[select, :2] np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X)) np.testing.assert_allclose(dat_view.obsm["o_test"], adt_view.obsm["o_test"]) np.testing.assert_array_equal(dat_view.obs["a_test"], adt_view.obs["a_test"]) # test duplicate selection idxs = [1, 2, 4, 4] dat_view = dat[idxs, :2] np.testing.assert_allclose( _dense(dat_view.X), np.array([[4, 5], [7, 8], [9, 8], [9, 8]]) ) @pytest.mark.parametrize("adatas", [np.array, csr_matrix], indirect=True) def test_creation(adatas): adatas_inner = [adatas[0], adatas[1][:, :2].copy()] dat = AnnCollection(adatas_inner, join_vars="inner", index_unique="_") adt_concat = ad.concat(adatas_inner, index_unique="_") np.testing.assert_array_equal(dat.var_names, adt_concat.var_names) @pytest.mark.parametrize("adatas", [np.array], indirect=True) def test_convert(adatas): dat = AnnCollection(adatas, index_unique="_") le = LabelEncoder() le.fit(dat[:].obs["a_test"]) obs_no_convert = dat[:].obs["a_test"] convert = dict(obs={"a_test": lambda a: le.transform(a)}) dat.convert = convert np.testing.assert_array_equal(dat[:].obs["a_test"], le.transform(obs_no_convert)) 
anndata-0.7.8/anndata/tests/test_annot.py000066400000000000000000000030351414255741200204420ustar00rootroot00000000000000"""Test handling of values in `obs`/ `var`""" import numpy as np import pandas as pd import anndata as ad import pytest @pytest.mark.parametrize("dtype", [object, "string"]) def test_str_to_categorical(dtype): obs = pd.DataFrame( {"str": ["a", "a", None, "b", "b"]}, index=[f"cell-{i}" for i in range(5)] ) obs["str"] = obs["str"].astype(dtype) a = ad.AnnData(obs=obs.copy()) a.strings_to_categoricals() expected = obs["str"].astype("category") pd.testing.assert_series_equal(expected, a.obs["str"]) def test_non_str_to_not_categorical(): # Test case based on https://github.com/theislab/anndata/issues/141#issuecomment-802105259 obs = pd.DataFrame(index=[f"cell-{i}" for i in range(5)]).assign( str_with_nan=["foo", "bar", None, np.nan, "foo"], boolean_with_nan_and_none=[True, False, np.nan, None, True], boolean_with_nan=[True, False, np.nan, np.nan, True], boolean_with_none=[True, False, None, None, True], ) adata = ad.AnnData(obs=obs.copy()) orig_dtypes = {k: v.name for k, v in obs.dtypes.items()} expected_dtypes = orig_dtypes.copy() expected_dtypes["str_with_nan"] = "category" adata.strings_to_categoricals() result_dtypes = {k: v.name for k, v in adata.obs.dtypes.items()} assert expected_dtypes == result_dtypes expected_non_transformed = obs.drop(columns=["str_with_nan"]) result_non_transformed = adata.obs.drop(columns=["str_with_nan"]) pd.testing.assert_frame_equal(expected_non_transformed, result_non_transformed) anndata-0.7.8/anndata/tests/test_backed_sparse.py000066400000000000000000000105361414255741200221150ustar00rootroot00000000000000import h5py import numpy as np import pytest from scipy import sparse import anndata as ad from anndata._core.sparse_dataset import SparseDataset from anndata.tests.helpers import assert_equal, subset_func subset_func2 = subset_func @pytest.fixture(scope="function") def ondisk_equivalent_adata(tmp_path): csr_path = tmp_path / "csr.h5ad" csc_path = tmp_path / "csc.h5ad" dense_path = tmp_path / "dense.h5ad" csr_mem = ad.AnnData(X=sparse.random(50, 50, format="csr", density=0.1)) csc_mem = ad.AnnData(X=csr_mem.X.tocsc()) csr_mem.write_h5ad(csr_path) csc_mem.write_h5ad(csc_path) csr_mem.write_h5ad(dense_path, as_dense="X") csr_disk = ad.read_h5ad(csr_path, backed="r") csc_disk = ad.read_h5ad(csc_path, backed="r") dense_disk = ad.read_h5ad(dense_path, backed="r") return csr_mem, csr_disk, csc_disk, dense_disk def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2): csr_mem, csr_disk, csc_disk, dense_disk = ondisk_equivalent_adata obs_idx = subset_func(csr_mem.obs_names) var_idx = subset_func2(csr_mem.var_names) assert_equal(csr_mem[obs_idx, var_idx].X, csr_disk[obs_idx, var_idx].X) assert_equal(csr_mem[obs_idx, var_idx].X, csc_disk[obs_idx, var_idx].X) assert_equal(csr_mem[obs_idx, :].X, dense_disk[obs_idx, :].X) assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X) @pytest.mark.parametrize( ["sparse_format", "append_method"], [ pytest.param(sparse.csr_matrix, sparse.vstack), pytest.param(sparse.csc_matrix, sparse.hstack), ], ) def test_dataset_append_memory(tmp_path, sparse_format, append_method): h5_path = tmp_path / "test.h5" a = sparse_format(sparse.random(100, 100)) b = sparse_format(sparse.random(100, 100)) with h5py.File(h5_path, "a") as f: ad._io.h5ad.write_attribute(f, "mtx", a) diskmtx = SparseDataset(f["mtx"]) diskmtx.append(b) fromdisk = diskmtx.to_memory() frommem = append_method([a, b]) 
assert_equal(fromdisk, frommem) @pytest.mark.parametrize( ["sparse_format", "append_method"], [ pytest.param(sparse.csr_matrix, sparse.vstack), pytest.param(sparse.csc_matrix, sparse.hstack), ], ) def test_dataset_append_disk(tmp_path, sparse_format, append_method): h5_path = tmp_path / "test.h5" a = sparse_format(sparse.random(10, 10)) b = sparse_format(sparse.random(10, 10)) with h5py.File(h5_path, "a") as f: ad._io.h5ad.write_attribute(f, "a", a) ad._io.h5ad.write_attribute(f, "b", b) a_disk = SparseDataset(f["a"]) b_disk = SparseDataset(f["b"]) a_disk.append(b_disk) fromdisk = a_disk.to_memory() frommem = append_method([a, b]) assert_equal(fromdisk, frommem) @pytest.mark.parametrize( ["sparse_format", "a_shape", "b_shape"], [ pytest.param("csr", (100, 100), (100, 200)), pytest.param("csc", (100, 100), (200, 100)), ], ) def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape): h5_path = tmp_path / "base.h5" a_mem = sparse.random(*a_shape, format=sparse_format) b_mem = sparse.random(*b_shape, format=sparse_format) with h5py.File(h5_path, "a") as f: ad._io.h5ad.write_attribute(f, "a", a_mem) ad._io.h5ad.write_attribute(f, "b", b_mem) a_disk = SparseDataset(f["a"]) b_disk = SparseDataset(f["b"]) with pytest.raises(AssertionError): a_disk.append(b_disk) def test_wrong_formats(tmp_path): h5_path = tmp_path / "base.h5" base = sparse.random(100, 100, format="csr") with h5py.File(h5_path, "a") as f: ad._io.h5ad.write_attribute(f, "base", base) disk_mtx = SparseDataset(f["base"]) pre_checks = disk_mtx.to_memory() with pytest.raises(ValueError): disk_mtx.append(sparse.random(100, 100, format="csc")) with pytest.raises(ValueError): disk_mtx.append(sparse.random(100, 100, format="coo")) with pytest.raises(NotImplementedError): disk_mtx.append(np.random.random((100, 100))) disk_dense = f.create_dataset("dense", data=np.random.random((100, 100))) with pytest.raises(NotImplementedError): disk_mtx.append(disk_dense) post_checks = disk_mtx.to_memory() # Check nothing changed assert not np.any((pre_checks != post_checks).toarray()) anndata-0.7.8/anndata/tests/test_base.py000066400000000000000000000475551414255741200202540ustar00rootroot00000000000000from itertools import product import numpy as np from numpy import ma import pandas as pd import pytest from scipy import sparse as sp from scipy.sparse import csr_matrix, issparse from anndata import AnnData from anndata.tests.helpers import assert_equal, gen_adata # some test objects that we use below adata_dense = AnnData(np.array([[1, 2], [3, 4]])) adata_dense.layers["test"] = adata_dense.X adata_sparse = AnnData( csr_matrix([[0, 2, 3], [0, 5, 6]]), dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"]), ) def test_creation(): AnnData(np.array([[1, 2], [3, 4]])) AnnData(np.array([[1, 2], [3, 4]]), {}, {}) AnnData(ma.array([[1, 2], [3, 4]]), uns=dict(mask=[0, 1, 1, 0])) AnnData(sp.eye(2)) X = np.array([[1, 2, 3], [4, 5, 6]]) adata = AnnData( X=X, obs=dict(Obs=["A", "B"]), var=dict(Feat=["a", "b", "c"]), obsm=dict(X_pca=np.array([[1, 2], [3, 4]])), raw=dict(X=X, var=dict(var_names=["a", "b", "c"])), ) assert adata.raw.X.tolist() == X.tolist() assert adata.raw.var_names.tolist() == ["a", "b", "c"] with pytest.raises(ValueError): AnnData(np.array([[1, 2], [3, 4]]), dict(TooLong=[1, 2, 3, 4])) # init with empty data matrix shape = (3, 5) adata = AnnData(None, uns=dict(test=np.array((3, 3))), shape=shape) assert adata.X is None assert adata.shape == shape assert "test" in adata.uns def test_create_with_dfs(): X = np.ones((6, 
3)) obs = pd.DataFrame(dict(cat_anno=pd.Categorical(["a", "a", "a", "a", "b", "a"]))) obs_copy = obs.copy() adata = AnnData(X=X, obs=obs) assert obs.index.equals(obs_copy.index) assert obs.index.astype(str).equals(adata.obs.index) def test_create_from_df(): df = pd.DataFrame(np.ones((3, 2)), index=["a", "b", "c"], columns=["A", "B"]) ad = AnnData(df) assert df.values.tolist() == ad.X.tolist() assert df.columns.tolist() == ad.var_names.tolist() assert df.index.tolist() == ad.obs_names.tolist() def test_create_from_sparse_df(): s = sp.random(20, 30, density=0.2) obs_names = [f"obs{i}" for i in range(20)] var_names = [f"var{i}" for i in range(30)] df = pd.DataFrame.sparse.from_spmatrix(s, index=obs_names, columns=var_names) a = AnnData(df) b = AnnData(s, obs=pd.DataFrame(index=obs_names), var=pd.DataFrame(index=var_names)) assert_equal(a, b) assert issparse(a.X) def test_create_from_df_with_obs_and_var(): df = pd.DataFrame(np.ones((3, 2)), index=["a", "b", "c"], columns=["A", "B"]) obs = pd.DataFrame(np.ones((3, 1)), index=df.index, columns=["C"]) var = pd.DataFrame(np.ones((2, 1)), index=df.columns, columns=["D"]) ad = AnnData(df, obs=obs, var=var) assert df.values.tolist() == ad.X.tolist() assert df.columns.tolist() == ad.var_names.tolist() assert df.index.tolist() == ad.obs_names.tolist() assert obs.equals(ad.obs) assert var.equals(ad.var) with pytest.raises(ValueError, match=r"Index of obs must match index of X."): AnnData(df, obs=obs.reset_index()) with pytest.raises(ValueError, match=r"Index of var must match columns of X."): AnnData(df, var=var.reset_index()) def test_from_df_and_dict(): df = pd.DataFrame(dict(a=[0.1, 0.2, 0.3], b=[1.1, 1.2, 1.3])) adata = AnnData(df, dict(species=pd.Categorical(["a", "b", "a"]))) assert adata.obs["species"].values.tolist() == ["a", "b", "a"] def test_df_warnings(): df = pd.DataFrame(dict(A=[1, 2, 3], B=[1.0, 2.0, 3.0]), index=["a", "b", "c"]) with pytest.warns(UserWarning, match=r"X.*dtype float64"): adata = AnnData(df) with pytest.warns(UserWarning, match=r"X.*dtype float64"): adata.X = df def test_attr_deletion(): full = gen_adata((30, 30)) # Empty has just X, obs_names, var_names empty = AnnData(None, obs=full.obs[[]], var=full.var[[]]) for attr in ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"]: delattr(full, attr) assert_equal(getattr(full, attr), getattr(empty, attr)) assert_equal(full, empty, exact=True) def test_names(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(obs_names=["A", "B"]), dict(var_names=["a", "b", "c"]), ) assert adata.obs_names.tolist() == "A B".split() assert adata.var_names.tolist() == "a b c".split() adata = AnnData(np.array([[1, 2], [3, 4], [5, 6]]), var=dict(var_names=["a", "b"])) assert adata.var_names.tolist() == ["a", "b"] @pytest.mark.parametrize( "names,after", [ pytest.param(["a", "b"], None, id="list"), pytest.param( pd.Series(["AAD", "CCA"], name="barcodes"), "barcodes", id="Series-str" ), pytest.param(pd.Series(["x", "y"], name=0), None, id="Series-int"), ], ) @pytest.mark.parametrize("attr", ["obs_names", "var_names"]) def test_setting_index_names(names, after, attr): adata = adata_dense.copy() assert getattr(adata, attr).name is None setattr(adata, attr, names) assert getattr(adata, attr).name == after if hasattr(names, "name"): assert names.name is not None # Testing for views new = adata[:, :] assert new.is_view setattr(new, attr, names) assert_equal(new, adata, exact=True) assert not new.is_view @pytest.mark.parametrize("attr", ["obs_names", "var_names"]) def 
test_setting_index_names_error(attr): orig = adata_sparse[:2, :2] adata = adata_sparse[:2, :2] assert getattr(adata, attr).name is None with pytest.raises(ValueError, match=fr"AnnData expects \.{attr[:3]}\.index\.name"): setattr(adata, attr, pd.Index(["x", "y"], name=0)) assert adata.is_view assert getattr(adata, attr).tolist() != ["x", "y"] assert getattr(adata, attr).tolist() == getattr(orig, attr).tolist() assert_equal(orig, adata, exact=True) @pytest.mark.parametrize("dim", ["obs", "var"]) def test_setting_dim_index(dim): index_attr = f"{dim}_names" mapping_attr = f"{dim}m" orig = gen_adata((5, 5)) orig.raw = orig curr = orig.copy() view = orig[:, :] new_idx = pd.Index(list("abcde"), name="letters") setattr(curr, index_attr, new_idx) pd.testing.assert_index_equal(getattr(curr, index_attr), new_idx) pd.testing.assert_index_equal(getattr(curr, mapping_attr)["df"].index, new_idx) pd.testing.assert_index_equal(getattr(curr, mapping_attr).dim_names, new_idx) pd.testing.assert_index_equal(curr.obs_names, curr.raw.obs_names) # Testing view behaviour setattr(view, index_attr, new_idx) assert not view.is_view pd.testing.assert_index_equal(getattr(view, index_attr), new_idx) pd.testing.assert_index_equal(getattr(view, mapping_attr)["df"].index, new_idx) pd.testing.assert_index_equal(getattr(view, mapping_attr).dim_names, new_idx) with pytest.raises(AssertionError): pd.testing.assert_index_equal( getattr(view, index_attr), getattr(orig, index_attr) ) assert_equal(view, curr, exact=True) # test case in #459 fake_m = pd.DataFrame(curr.X.T, index=getattr(curr, index_attr)) getattr(curr, mapping_attr)["df2"] = fake_m def test_indices_dtypes(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(obs_names=["A", "B"]), dict(var_names=["a", "b", "c"]), ) adata.obs_names = ["ö", "a"] assert adata.obs_names.tolist() == ["ö", "a"] def test_slicing(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) # assert adata[:, 0].X.tolist() == adata.X[:, 0].tolist() # No longer the case assert adata[0, 0].X.tolist() == np.reshape(1, (1, 1)).tolist() assert adata[0, :].X.tolist() == np.reshape([1, 2, 3], (1, 3)).tolist() assert adata[:, 0].X.tolist() == np.reshape([1, 4], (2, 1)).tolist() assert adata[:, [0, 1]].X.tolist() == [[1, 2], [4, 5]] assert adata[:, np.array([0, 2])].X.tolist() == [[1, 3], [4, 6]] assert adata[:, np.array([False, True, True])].X.tolist() == [ [2, 3], [5, 6], ] assert adata[:, 1:3].X.tolist() == [[2, 3], [5, 6]] assert adata[0:2, :][:, 0:2].X.tolist() == [[1, 2], [4, 5]] assert adata[0:1, :][:, 0:2].X.tolist() == np.reshape([1, 2], (1, 2)).tolist() assert adata[0, :][:, 0].X.tolist() == np.reshape(1, (1, 1)).tolist() assert adata[:, 0:2][0:2, :].X.tolist() == [[1, 2], [4, 5]] assert adata[:, 0:2][0:1, :].X.tolist() == np.reshape([1, 2], (1, 2)).tolist() assert adata[:, 0][0, :].X.tolist() == np.reshape(1, (1, 1)).tolist() def test_boolean_slicing(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) obs_selector = np.array([True, False], dtype=bool) vars_selector = np.array([True, False, False], dtype=bool) assert adata[obs_selector, :][:, vars_selector].X.tolist() == [[1]] assert adata[:, vars_selector][obs_selector, :].X.tolist() == [[1]] assert adata[obs_selector, :][:, 0].X.tolist() == [[1]] assert adata[:, 0][obs_selector, :].X.tolist() == [[1]] assert adata[0, :][:, vars_selector].X.tolist() == [[1]] assert adata[:, vars_selector][0, :].X.tolist() == [[1]] obs_selector = np.array([True, False], dtype=bool) vars_selector = np.array([True, True, False], dtype=bool) assert 
adata[obs_selector, :][:, vars_selector].X.tolist() == [[1, 2]] assert adata[:, vars_selector][obs_selector, :].X.tolist() == [[1, 2]] assert adata[obs_selector, :][:, 0:2].X.tolist() == [[1, 2]] assert adata[:, 0:2][obs_selector, :].X.tolist() == [[1, 2]] assert adata[0, :][:, vars_selector].X.tolist() == [[1, 2]] assert adata[:, vars_selector][0, :].X.tolist() == [[1, 2]] obs_selector = np.array([True, True], dtype=bool) vars_selector = np.array([True, True, False], dtype=bool) assert adata[obs_selector, :][:, vars_selector].X.tolist() == [ [1, 2], [4, 5], ] assert adata[:, vars_selector][obs_selector, :].X.tolist() == [ [1, 2], [4, 5], ] assert adata[obs_selector, :][:, 0:2].X.tolist() == [[1, 2], [4, 5]] assert adata[:, 0:2][obs_selector, :].X.tolist() == [[1, 2], [4, 5]] assert adata[0:2, :][:, vars_selector].X.tolist() == [[1, 2], [4, 5]] assert adata[:, vars_selector][0:2, :].X.tolist() == [[1, 2], [4, 5]] def test_oob_boolean_slicing(): len1, len2 = np.random.choice(100, 2, replace=False) with pytest.raises(IndexError) as e: AnnData(np.empty((len1, 100)))[np.random.randint(0, 2, len2, dtype=bool), :] assert str(len1) in str(e.value) assert str(len2) in str(e.value) len1, len2 = np.random.choice(100, 2, replace=False) with pytest.raises(IndexError) as e: AnnData(np.empty((100, len1)))[:, np.random.randint(0, 2, len2, dtype=bool)] assert str(len1) in str(e.value) assert str(len2) in str(e.value) def test_slicing_strings(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(obs_names=["A", "B"]), dict(var_names=["a", "b", "c"]), ) assert adata["A", "a"].X.tolist() == [[1]] assert adata["A", :].X.tolist() == [[1, 2, 3]] assert adata[:, "a"].X.tolist() == [[1], [4]] assert adata[:, ["a", "b"]].X.tolist() == [[1, 2], [4, 5]] assert adata[:, np.array(["a", "c"])].X.tolist() == [[1, 3], [4, 6]] assert adata[:, "b":"c"].X.tolist() == [[2, 3], [5, 6]] with pytest.raises(KeyError): _ = adata[:, "X"] with pytest.raises(KeyError): _ = adata["X", :] with pytest.raises(KeyError): _ = adata["A":"X", :] with pytest.raises(KeyError): _ = adata[:, "a":"X"] # Test if errors are helpful with pytest.raises(KeyError, match=r"not_in_var"): adata[:, ["A", "B", "not_in_var"]] with pytest.raises(KeyError, match=r"not_in_obs"): adata[["A", "B", "not_in_obs"], :] def test_slicing_graphs(): # Testing for deprecated behaviour of connectivity matrices in .uns["neighbors"] with pytest.warns(FutureWarning, match=r".obsp\['connectivities'\]"): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6]]), uns=dict(neighbors=dict(connectivities=sp.csr_matrix(np.ones((3, 3))))), ) adata_sub = adata[[0, 1], :] with pytest.warns(FutureWarning): assert adata_sub.uns["neighbors"]["connectivities"].shape[0] == 2 assert adata.uns["neighbors"]["connectivities"].shape[0] == 3 assert adata_sub.copy().uns["neighbors"]["connectivities"].shape[0] == 2 def test_slicing_series(): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6]]), dict(obs_names=["A", "B", "C"]), dict(var_names=["a", "b"]), ) df = pd.DataFrame(dict(a=["1", "2", "2"])) df1 = pd.DataFrame(dict(b=["1", "2"])) assert adata[df["a"].values == "2"].X.tolist() == adata[df["a"] == "2"].X.tolist() assert ( adata[:, df1["b"].values == "2"].X.tolist() == adata[:, df1["b"] == "2"].X.tolist() ) def test_strings_to_categoricals(): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"]) ) adata.strings_to_categoricals() assert adata.obs["k"].cat.categories.tolist() == ["a", "b"] def test_slicing_remove_unused_categories(): adata = AnnData( 
np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"]) ) adata._sanitize() assert adata[2:4].obs["k"].cat.categories.tolist() == ["b"] def test_get_subset_annotation(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), dict(S=["A", "B"]), dict(F=["a", "b", "c"]), ) assert adata[0, 0].obs["S"].tolist() == ["A"] assert adata[0, 0].var["F"].tolist() == ["a"] def test_append_col(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) adata.obs["new"] = [1, 2] # this worked in the initial AnnData, but not with a dataframe # adata.obs[['new2', 'new3']] = [['A', 'B'], ['c', 'd']] with pytest.raises(ValueError): adata.obs["new4"] = "far too long".split() def test_delete_col(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2], o2=[3, 4])) assert ["o1", "o2"] == adata.obs_keys() del adata.obs["o1"] assert ["o2"] == adata.obs_keys() assert [3, 4] == adata.obs["o2"].tolist() def test_set_obs(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) adata.obs = pd.DataFrame(dict(a=[3, 4])) assert adata.obs_names.tolist() == [0, 1] with pytest.raises(ValueError): adata.obs = pd.DataFrame(dict(a=[3, 4, 5])) adata.obs = dict(a=[1, 2]) def test_multicol(): adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]])) # 'c' keeps the columns as should be adata.obsm["c"] = np.array([[0.0, 1.0], [2, 3]]) assert adata.obsm_keys() == ["c"] assert adata.obsm["c"].tolist() == [[0.0, 1.0], [2, 3]] def test_n_obs(): adata = AnnData(np.array([[1, 2], [3, 4], [5, 6]])) assert adata.n_obs == 3 adata1 = adata[:2] assert adata1.n_obs == 2 def test_equality_comparisons(): adata1 = AnnData(np.array([[1, 2], [3, 4], [5, 6]])) adata2 = AnnData(np.array([[1, 2], [3, 4], [5, 6]])) with pytest.raises(NotImplementedError): adata1 == adata1 with pytest.raises(NotImplementedError): adata1 == adata2 with pytest.raises(NotImplementedError): adata1 != adata2 with pytest.raises(NotImplementedError): adata1 == 1 with pytest.raises(NotImplementedError): adata1 != 1 def test_rename_categories(): X = np.ones((6, 3)) obs = pd.DataFrame(dict(cat_anno=pd.Categorical(["a", "a", "a", "a", "b", "a"]))) adata = AnnData(X=X, obs=obs) adata.uns["tool"] = {} adata.uns["tool"]["cat_array"] = np.rec.fromarrays( [np.ones(2) for cat in adata.obs["cat_anno"].cat.categories], dtype=[(cat, "float32") for cat in adata.obs["cat_anno"].cat.categories], ) adata.uns["tool"]["params"] = dict(groupby="cat_anno") new_categories = ["c", "d"] adata.rename_categories("cat_anno", new_categories) assert list(adata.obs["cat_anno"].cat.categories) == new_categories assert list(adata.uns["tool"]["cat_array"].dtype.names) == new_categories def test_pickle(): import pickle adata = AnnData() adata2 = pickle.loads(pickle.dumps(adata)) assert adata2.obsm.parent is adata2 def test_to_df_dense(): X_df = adata_dense.to_df() layer_df = adata_dense.to_df(layer="test") np.testing.assert_array_equal(adata_dense.layers["test"], layer_df.values) np.testing.assert_array_equal(adata_dense.X, X_df.values) pd.testing.assert_index_equal(X_df.columns, layer_df.columns) pd.testing.assert_index_equal(X_df.index, layer_df.index) def test_convenience(): adata = adata_sparse.copy() adata.layers["x2"] = adata.X * 2 adata.var["anno2"] = ["p1", "p2", "p3"] adata.raw = adata adata.X = adata.X / 2 adata_dense = adata.copy() adata_dense.X = adata_dense.X.toarray() def assert_same_op_result(a1, a2, op): r1 = op(a1) r2 = op(a2) assert np.all(r1 == r2) assert type(r1) is type(r2) assert np.allclose(adata.obs_vector("b"), np.array([1.0, 2.5])) assert np.allclose(adata.raw.obs_vector("c"), 
np.array([3, 6])) assert np.all(adata.obs_vector("anno1") == np.array(["c1", "c2"])) assert np.allclose(adata.var_vector("s1"), np.array([0, 1.0, 1.5])) assert np.allclose(adata.raw.var_vector("s2"), np.array([0, 5, 6])) for obs_k, layer in product(["a", "b", "c", "anno1"], [None, "x2"]): assert_same_op_result( adata, adata_dense, lambda x: x.obs_vector(obs_k, layer=layer) ) for obs_k in ["a", "b", "c"]: assert_same_op_result(adata, adata_dense, lambda x: x.raw.obs_vector(obs_k)) for var_k, layer in product(["s1", "s2", "anno2"], [None, "x2"]): assert_same_op_result( adata, adata_dense, lambda x: x.var_vector(var_k, layer=layer) ) for var_k in ["s1", "s2", "anno2"]: assert_same_op_result(adata, adata_dense, lambda x: x.raw.var_vector(var_k)) def test_1d_slice_dtypes(): N, M = 10, 20 obs_df = pd.DataFrame( dict( cat=pd.Categorical(np.arange(N, dtype=int)), int=np.arange(N, dtype=int), float=np.arange(N, dtype=float), obj=[str(i) for i in np.arange(N, dtype=int)], ), index=[f"cell{i}" for i in np.arange(N, dtype=int)], ) var_df = pd.DataFrame( dict( cat=pd.Categorical(np.arange(M, dtype=int)), int=np.arange(M, dtype=int), float=np.arange(M, dtype=float), obj=[str(i) for i in np.arange(M, dtype=int)], ), index=[f"gene{i}" for i in np.arange(M, dtype=int)], ) adata = AnnData(X=np.random.random((N, M)), obs=obs_df, var=var_df) new_obs_df = pd.DataFrame(index=adata.obs_names) for k in obs_df.columns: new_obs_df[k] = adata.obs_vector(k) assert new_obs_df[k].dtype == obs_df[k].dtype assert np.all(new_obs_df == obs_df) new_var_df = pd.DataFrame(index=adata.var_names) for k in var_df.columns: new_var_df[k] = adata.var_vector(k) assert new_var_df[k].dtype == var_df[k].dtype assert np.all(new_var_df == var_df) def test_to_df_sparse(): X = adata_sparse.X.toarray() df = adata_sparse.to_df() assert df.values.tolist() == X.tolist() def test_copy(): adata_copy = adata_sparse.copy() def assert_eq_not_id(a, b): assert a is not b assert issparse(a) == issparse(b) if issparse(a): assert np.all(a.data == b.data) assert np.all(a.indices == b.indices) assert np.all(a.indptr == b.indptr) else: assert np.all(a == b) assert adata_sparse is not adata_copy assert_eq_not_id(adata_sparse.X, adata_copy.X) for attr in "layers var obs obsm varm".split(): map_sprs = getattr(adata_sparse, attr) map_copy = getattr(adata_copy, attr) assert map_sprs is not map_copy assert_eq_not_id(map_sprs.keys(), map_copy.keys()) for key in map_sprs.keys(): assert_eq_not_id(map_sprs[key], map_copy[key]) anndata-0.7.8/anndata/tests/test_concatenate.py000066400000000000000000001067071414255741200216210ustar00rootroot00000000000000from collections.abc import Hashable from copy import deepcopy from itertools import chain, product from functools import partial, singledispatch import warnings import numpy as np from numpy import ma import pandas as pd from pandas.api.types import is_categorical_dtype import pytest from scipy import sparse from boltons.iterutils import research, remap, default_exit from anndata import AnnData, Raw, concat from anndata._core.index import _subset from anndata._core import merge from anndata.tests import helpers from anndata.tests.helpers import assert_equal, gen_adata from anndata.utils import asarray @singledispatch def filled_like(a, fill_value=None): raise NotImplementedError() @filled_like.register(np.ndarray) def _filled_array(a, fill_value=None): if fill_value is None: fill_value = np.nan return np.broadcast_to(fill_value, a.shape) @filled_like.register(sparse.spmatrix) def _filled_sparse(a, fill_value=None): if 
fill_value is None: return sparse.csr_matrix(a.shape) else: return sparse.csr_matrix(np.broadcast_to(fill_value, a.shape)) @filled_like.register(pd.DataFrame) def _filled_df(a, fill_value=np.nan): # dtype from pd.concat can be unintuitive, this returns something close enough return a.loc[[], :].reindex(index=a.index, fill_value=fill_value) def check_filled_like(x, fill_value=None, elem_name=None): if fill_value is None: assert_equal(x, filled_like(x), elem_name=elem_name) else: assert_equal(x, filled_like(x, fill_value=fill_value), elem_name=elem_name) def make_idx_tuple(idx, axis): tup = [slice(None), slice(None)] tup[axis] = idx return tuple(tup) @pytest.fixture( params=[asarray, sparse.csr_matrix, sparse.csc_matrix], ids=["np_array", "scipy_csr", "scipy_csc"], ) def array_type(request): return request.param @pytest.fixture(params=["inner", "outer"]) def join_type(request): return request.param @pytest.fixture(params=[0, np.nan, np.pi]) def fill_val(request): return request.param @pytest.fixture(params=[0, 1]) def axis(request): return request.param @pytest.fixture(params=list(merge.MERGE_STRATEGIES.keys())) def merge_strategy(request): return request.param def fix_known_differences(orig, result, backwards_compat=True): """ Helper function for reducing anndata's to only the elements we expect to be equivalent after concatenation. Only for the case where orig is the ground truth result of what concatenation should be. If backwards_compat, checks against what `AnnData.concatenate` could do. Otherwise checks for `concat`. """ orig = orig.copy() result = result.copy() result.strings_to_categoricals() # Should this be implicit in concatenation? # TODO # * merge varm, varp similar to uns # * merge obsp, but some information should be lost del orig.obsp # TODO if backwards_compat: del orig.varm del orig.varp result.obs.drop(columns=["batch"], inplace=True) # Possibly need to fix this, ordered categoricals lose orderedness for k, dtype in orig.obs.dtypes.items(): if is_categorical_dtype(dtype) and dtype.ordered: result.obs[k] = result.obs[k].astype(dtype) return orig, result def test_concat_interface_errors(): adatas = [gen_adata((5, 10)), gen_adata((5, 10))] with pytest.raises(ValueError): concat(adatas, axis=3) with pytest.raises(ValueError): concat(adatas, join="not implemented") with pytest.raises(ValueError): concat([]) @pytest.mark.parametrize( ["concat_func", "backwards_compat"], [ (partial(concat, merge="unique"), False), (lambda x, **kwargs: x[0].concatenate(x[1:], **kwargs), True), ], ) def test_concatenate_roundtrip(join_type, array_type, concat_func, backwards_compat): adata = gen_adata((100, 10), X_type=array_type) remaining = adata.obs_names subsets = [] while len(remaining) > 0: n = min(len(remaining), np.random.choice(50)) subset_idx = np.random.choice(remaining, n, replace=False) subsets.append(adata[subset_idx]) remaining = remaining.difference(subset_idx) result = concat_func(subsets, join=join_type, uns_merge="same", index_unique=None) # Correcting for known differences orig, result = fix_known_differences( adata, result, backwards_compat=backwards_compat ) assert_equal(result[orig.obs_names].copy(), orig) def test_concatenate_dense(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2, X_3=X3), layers=dict(Xs=X1), ) adata2 = AnnData( X2, 
dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2, X_3=X3), layers={"Xs": X2}, ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2), layers=dict(Xs=X3), ) # inner join adata = adata1.concatenate(adata2, adata3) X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]] assert adata.X.astype(int).tolist() == X_combined assert adata.layers["Xs"].astype(int).tolist() == X_combined assert adata.obs_keys() == ["anno1", "anno2", "batch"] assert adata.var_keys() == ["annoA-0", "annoA-1", "annoB-2"] assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]] assert adata.obsm_keys() == ["X_1", "X_2"] assert adata.obsm["X_1"].tolist() == np.concatenate([X1, X1, X1]).tolist() # with batch_key and batch_categories adata = adata1.concatenate(adata2, adata3, batch_key="batch1") assert adata.obs_keys() == ["anno1", "anno2", "batch1"] adata = adata1.concatenate(adata2, adata3, batch_categories=["a1", "a2", "a3"]) assert adata.obs["batch"].cat.categories.tolist() == ["a1", "a2", "a3"] assert adata.var_names.tolist() == ["b", "c"] # outer join adata = adata1.concatenate(adata2, adata3, join="outer") X_ref = np.array( [ [1.0, 2.0, 3.0, np.nan], [4.0, 5.0, 6.0, np.nan], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0], ] ) np.testing.assert_equal(adata.X, X_ref) var_ma = ma.masked_invalid(adata.var.values.tolist()) var_ma_ref = ma.masked_invalid( np.array( [ [0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [np.nan, 0.0, 0.0], ] ) ) assert np.array_equal(var_ma.mask, var_ma_ref.mask) assert np.allclose(var_ma.compressed(), var_ma_ref.compressed()) def test_concatenate_layers(array_type, join_type): adatas = [] for _ in range(5): a = array_type(sparse.random(100, 200, format="csr")) adatas.append(AnnData(X=a, layers={"a": a})) merged = adatas[0].concatenate(adatas[1:], join=join_type) assert_equal(merged.X, merged.layers["a"]) @pytest.fixture def obsm_adatas(): def gen_index(n): return [f"cell{i}" for i in range(n)] return [ AnnData( X=sparse.csr_matrix((3, 5)), obs=pd.DataFrame(index=gen_index(3)), obsm={ "dense": np.arange(6).reshape(3, 2), "sparse": sparse.csr_matrix(np.arange(6).reshape(3, 2)), "df": pd.DataFrame( { "a": np.arange(3), "b": list("abc"), "c": pd.Categorical(list("aab")), }, index=gen_index(3), ), }, ), AnnData( X=sparse.csr_matrix((4, 10)), obs=pd.DataFrame(index=gen_index(4)), obsm=dict( dense=np.arange(12).reshape(4, 3), df=pd.DataFrame(dict(a=np.arange(3, 7)), index=gen_index(4)), ), ), AnnData( X=sparse.csr_matrix((2, 100)), obs=pd.DataFrame(index=gen_index(2)), obsm={ "sparse": np.arange(8).reshape(2, 4), "dense": np.arange(4, 8).reshape(2, 2), "df": pd.DataFrame( { "a": np.arange(7, 9), "b": list("cd"), "c": pd.Categorical(list("ab")), }, index=gen_index(2), ), }, ), ] def test_concatenate_obsm_inner(obsm_adatas): adata = obsm_adatas[0].concatenate(obsm_adatas[1:], join="inner") assert set(adata.obsm.keys()) == {"dense", "df"} assert adata.obsm["dense"].shape == (9, 2) assert adata.obsm["dense"].tolist() == [ [0, 1], [2, 3], [4, 5], [0, 1], [3, 4], [6, 7], [9, 10], [4, 5], [6, 7], ] assert adata.obsm["df"].columns == ["a"] assert adata.obsm["df"]["a"].tolist() == list(range(9)) # fmt: off true_df = ( pd.concat([a.obsm["df"] for a in obsm_adatas], join="inner") .reset_index(drop=True) ) # fmt: on cur_df = adata.obsm["df"].reset_index(drop=True) 
pd.testing.assert_frame_equal(true_df, cur_df) def test_concatenate_obsm_outer(obsm_adatas, fill_val): outer = obsm_adatas[0].concatenate( obsm_adatas[1:], join="outer", fill_value=fill_val ) inner = obsm_adatas[0].concatenate(obsm_adatas[1:], join="inner") for k, inner_v in inner.obsm.items(): assert np.array_equal( _subset(outer.obsm[k], (slice(None), slice(None, inner_v.shape[1]))), inner_v, ) assert set(outer.obsm.keys()) == {"dense", "df", "sparse"} assert isinstance(outer.obsm["dense"], np.ndarray) np.testing.assert_equal( outer.obsm["dense"], np.array( [ [0, 1, fill_val], [2, 3, fill_val], [4, 5, fill_val], [0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [4, 5, fill_val], [6, 7, fill_val], ] ), ) assert isinstance(outer.obsm["sparse"], sparse.spmatrix) np.testing.assert_equal( outer.obsm["sparse"].toarray(), np.array( [ [0, 1, fill_val, fill_val], [2, 3, fill_val, fill_val], [4, 5, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [fill_val, fill_val, fill_val, fill_val], [0, 1, 2, 3], [4, 5, 6, 7], ] ), ) # fmt: off true_df = ( pd.concat([a.obsm["df"] for a in obsm_adatas], join="outer") .reset_index(drop=True) ) # fmt: on cur_df = outer.obsm["df"].reset_index(drop=True) pd.testing.assert_frame_equal(true_df, cur_df) def test_concat_annot_join(obsm_adatas, join_type): adatas = [ AnnData(sparse.csr_matrix(a.shape), obs=a.obsm["df"], var=a.var) for a in obsm_adatas ] pd.testing.assert_frame_equal( concat(adatas, join=join_type).obs, pd.concat([a.obs for a in adatas], join=join_type), ) def test_concatenate_layers_misaligned(array_type, join_type): adatas = [] for _ in range(5): a = array_type(sparse.random(100, 200, format="csr")) adata = AnnData(X=a, layers={"a": a}) adatas.append( adata[:, np.random.choice(adata.var_names, 150, replace=False)].copy() ) merged = adatas[0].concatenate(adatas[1:], join=join_type) assert_equal(merged.X, merged.layers["a"]) def test_concatenate_layers_outer(array_type, fill_val): # Testing that issue #368 is fixed a = AnnData( X=np.ones((10, 20)), layers={"a": array_type(sparse.random(10, 20, format="csr"))}, ) b = AnnData(X=np.ones((10, 20))) c = a.concatenate(b, join="outer", fill_value=fill_val, batch_categories=["a", "b"]) np.testing.assert_array_equal( asarray(c[c.obs["batch"] == "b"].layers["a"]), fill_val ) def test_concatenate_fill_value(fill_val): def get_obs_els(adata): return { "X": adata.X, **{f"layer_{k}": adata.layers[k] for k in adata.layers}, **{f"obsm_{k}": adata.obsm[k] for k in adata.obsm}, } adata1 = gen_adata((10, 10)) adata1.obsm = { k: v for k, v in adata1.obsm.items() if not isinstance(v, pd.DataFrame) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() if not isinstance(v, pd.DataFrame) } adata3 = gen_adata((7, 3)) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() if not isinstance(v, pd.DataFrame) } joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) ptr = 0 for orig in [adata1, adata2, adata3]: cur = joined[ptr : ptr + orig.n_obs] cur_els = get_obs_els(cur) orig_els = get_obs_els(orig) for k, cur_v in cur_els.items(): orig_v = orig_els.get(k, sparse.csr_matrix((orig.n_obs, 0))) assert_equal(cur_v[:, : orig_v.shape[1]], orig_v) np.testing.assert_equal(asarray(cur_v[:, orig_v.shape[1] :]), fill_val) ptr += orig.n_obs def test_concatenate_dense_duplicates(): X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 
6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) # inner join duplicates adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict( var_names=["a", "b", "c"], annoA=[0, 1, 2], annoB=[1.1, 1.0, 2.0], annoC=[1.1, 1.0, 2.0], annoD=[2.1, 2.0, 3.0], ), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict( var_names=["a", "b", "c"], annoA=[0, 1, 2], annoB=[1.1, 1.0, 2.0], annoC=[1.1, 1.0, 2.0], annoD=[2.1, 2.0, 3.0], ), ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), dict( var_names=["a", "b", "c"], annoA=[0, 1, 2], annoB=[1.1, 1.0, 2.0], annoD=[2.1, 2.0, 3.1], ), ) adata = adata1.concatenate(adata2, adata3) assert adata.var_keys() == [ "annoA", "annoB", "annoC-0", "annoD-0", "annoC-1", "annoD-1", "annoD-2", ] def test_concatenate_sparse(): # sparse data from scipy.sparse import csr_matrix X1 = csr_matrix([[0, 2, 3], [0, 5, 6]]) X2 = csr_matrix([[0, 2, 3], [0, 5, 6]]) X3 = csr_matrix([[1, 2, 0], [0, 5, 6]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"]), layers=dict(Xs=X1), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"]), layers=dict(Xs=X2), ) adata3 = AnnData( X3, dict(obs_names=["s5", "s6"], anno2=["d3", "d4"]), dict(var_names=["d", "c", "b"]), layers=dict(Xs=X3), ) # inner join adata = adata1.concatenate(adata2, adata3) X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]] assert adata.X.toarray().astype(int).tolist() == X_combined assert adata.layers["Xs"].toarray().astype(int).tolist() == X_combined # outer join adata = adata1.concatenate(adata2, adata3, join="outer") assert adata.X.toarray().tolist() == [ [0.0, 2.0, 3.0, 0.0], [0.0, 5.0, 6.0, 0.0], [0.0, 3.0, 2.0, 0.0], [0.0, 6.0, 5.0, 0.0], [0.0, 0.0, 2.0, 1.0], [0.0, 6.0, 5.0, 0.0], ] def test_concatenate_mixed(): X1 = sparse.csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]])) X2 = sparse.csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])) X3 = sparse.csr_matrix(np.array([[1, 0, 3], [0, 0, 6], [0, 8, 0]])) X4 = np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2", "s3"], anno1=["c1", "c2", "c3"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), layers=dict(counts=X1), ) adata2 = AnnData( X2, dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), layers=dict(counts=X4), # sic ) adata3 = AnnData( X3, dict(obs_names=["s7", "s8", "s9"], anno2=["d3", "d4", "d5"]), dict(var_names=["d", "c", "b"], annoA=[0, 2, 3], annoB=[0, 1, 2]), layers=dict(counts=X3), ) adata4 = AnnData( X4, dict(obs_names=["s4", "s5", "s6"], anno1=["c3", "c4", "c5"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), layers=dict(counts=X2), # sic ) adata_all = AnnData.concatenate(adata1, adata2, adata3, adata4) assert isinstance(adata_all.X, sparse.csr_matrix) assert isinstance(adata_all.layers["counts"], sparse.csr_matrix) def test_concatenate_with_raw(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) X4 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), layers=dict(Xs=X1), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), layers=dict(Xs=X2), ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), 
dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]), layers=dict(Xs=X3), ) adata4 = AnnData( X4, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c", "z"], annoA=[0, 1, 2, 3]), layers=dict(Xs=X4), ) adata1.raw = adata1 adata2.raw = adata2 adata3.raw = adata3 adata_all = AnnData.concatenate(adata1, adata2, adata3) assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == {"b", "c"} assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(adata_all.raw.X, adata_all.X) adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcd") assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X)) adata3.raw = adata4 adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcdz") assert set(adata_all.var_names) == set("abcd") assert not np.array_equal( np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X) ) del adata3.raw with pytest.warns( UserWarning, match=( "Only some AnnData objects have `.raw` attribute, " "not concatenating `.raw` attributes." ), ): adata_all = AnnData.concatenate(adata1, adata2, adata3) assert adata_all.raw is None del adata1.raw del adata2.raw assert all(_adata.raw is None for _adata in (adata1, adata2, adata3)) adata_all = AnnData.concatenate(adata1, adata2, adata3) assert adata_all.raw is None def test_pairwise_concat(axis, array_type): dim_sizes = [[100, 200, 50], [50, 50, 50]] if axis: dim_sizes.reverse() Ms, Ns = dim_sizes dim = ("obs", "var")[axis] alt = ("var", "obs")[axis] dim_attr = f"{dim}p" alt_attr = f"{alt}p" def gen_dim_array(m): return array_type(sparse.random(m, m, format="csr", density=0.1)) adatas = { k: AnnData( **{ "X": sparse.csr_matrix((m, n)), "obsp": {"arr": gen_dim_array(m)}, "varp": {"arr": gen_dim_array(n)}, } ) for k, m, n in zip("abc", Ms, Ns) } w_pairwise = concat(adatas, axis=axis, label="orig", pairwise=True) wo_pairwise = concat(adatas, axis=axis, label="orig", pairwise=False) # Check that argument controls whether elements are included assert getattr(wo_pairwise, dim_attr) == {} assert getattr(w_pairwise, dim_attr) != {} # Check values of included elements full_inds = np.arange(w_pairwise.shape[axis]) groups = getattr(w_pairwise, dim).groupby("orig").indices for k, inds in groups.items(): orig_arr = getattr(adatas[k], dim_attr)["arr"] full_arr = getattr(w_pairwise, dim_attr)["arr"] # Check original values are intact assert_equal(orig_arr, _subset(full_arr, (inds, inds))) # Check that entries are filled with zeroes assert_equal( sparse.csr_matrix((len(inds), len(full_inds) - len(inds))), _subset(full_arr, (inds, np.setdiff1d(full_inds, inds))), ) assert_equal( sparse.csr_matrix((len(full_inds) - len(inds), len(inds))), _subset(full_arr, (np.setdiff1d(full_inds, inds), inds)), ) # Check that argument does not affect alternative axis assert "arr" in getattr( concat(adatas, axis=axis, pairwise=False, merge="first"), alt_attr ) def test_nan_merge(axis, join_type, array_type): # concat_dim = ("obs", "var")[axis] alt_dim = ("var", "obs")[axis] mapping_attr = f"{alt_dim}m" adata_shape = (20, 10) arr = array_type( sparse.random(adata_shape[1 - axis], 10, density=0.1, format="csr") ) arr_nan = arr.copy() with warnings.catch_warnings(): warnings.simplefilter("ignore", category=sparse.SparseEfficiencyWarning) for _ 
in range(10): arr_nan[ np.random.choice(arr.shape[0]), np.random.choice(arr.shape[1]) ] = np.nan _data = {"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr_nan}} orig1 = AnnData(**_data) orig2 = AnnData(**_data) result = concat([orig1, orig2], axis=axis, merge="same") assert_equal(getattr(orig1, mapping_attr), getattr(result, mapping_attr)) orig_nonan = AnnData( **{"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr}} ) result_nonan = concat([orig1, orig_nonan], axis=axis, merge="same") assert len(getattr(result_nonan, mapping_attr)) == 0 def test_merge_unique(): from anndata._core.merge import merge_unique # Simple cases assert merge_unique([{"a": "b"}, {"a": "b"}]) == {"a": "b"} assert merge_unique([{"a": {"b": "c"}}, {"a": {"b": "c"}}]) == {"a": {"b": "c"}} assert merge_unique([{"a": {"b": "c"}}, {"a": {"b": "d"}}]) == {} assert merge_unique([{"a": {"b": "c", "d": "e"}}, {"a": {"b": "c", "d": "f"}}]) == { "a": {"b": "c"} } assert merge_unique( [{"a": {"b": {"c": {"d": "e"}}}}, {"a": {"b": {"c": {"d": "e"}}}}] ) == {"a": {"b": {"c": {"d": "e"}}}} assert ( merge_unique( [ {"a": {"b": {"c": {"d": "e"}}}}, {"a": {"b": {"c": {"d": "f"}}}}, {"a": {"b": {"c": {"d": "e"}}}}, ] ) == {} ) assert merge_unique([{"a": 1}, {"b": 2}]) == {"a": 1, "b": 2} assert merge_unique([{"a": 1}, {"b": 2}, {"a": 1, "b": {"c": 2, "d": 3}}]) == { "a": 1 } # Test equivalency between arrays and lists assert list( merge_unique([{"a": np.ones(5)}, {"a": list(np.ones(5))}])["a"] ) == list(np.ones(5)) assert merge_unique([{"a": np.ones(5)}, {"a": list(np.ones(4))}]) == {} def test_merge_same(): from anndata._core.merge import merge_same # Same as unique for a number of cases: assert merge_same([{"a": "b"}, {"a": "b"}]) == {"a": "b"} assert merge_same([{"a": {"b": "c"}}, {"a": {"b": "c"}}]) == {"a": {"b": "c"}} assert merge_same([{"a": {"b": "c"}}, {"a": {"b": "d"}}]) == {} assert merge_same([{"a": {"b": "c", "d": "e"}}, {"a": {"b": "c", "d": "f"}}]) == { "a": {"b": "c"} } assert merge_same([{"a": {"b": "c"}, "d": "e"}, {"a": {"b": "c"}, "d": 2}]) == { "a": {"b": "c"} } assert merge_same( [{"a": {"b": {"c": {"d": "e"}}}}, {"a": {"b": {"c": {"d": "e"}}}}] ) == {"a": {"b": {"c": {"d": "e"}}}} assert merge_same([{"a": 1}, {"b": 2}]) == {} assert merge_same([{"a": 1}, {"b": 2}, {"a": 1, "b": {"c": 2, "d": 3}}]) == {} # Test equivalency between arrays and lists assert list(merge_same([{"a": np.ones(5)}, {"a": list(np.ones(5))}])["a"]) == list( np.ones(5) ) def test_merge_first(): from anndata._core.merge import merge_first assert merge_first([{"a": "b"}, {"a": "b"}]) == {"a": "b"} assert merge_first([{"a": {"b": "c"}}, {"a": {"b": "c"}}]) == {"a": {"b": "c"}} assert merge_first([{"a": 1}, {"a": 2}]) == {"a": 1} assert merge_first([{"a": 1}, {"a": {"b": {"c": {"d": "e"}}}}]) == {"a": 1} assert merge_first([{"a": {"b": {"c": {"d": "e"}}}}, {"a": 1}]) == { "a": {"b": {"c": {"d": "e"}}} } # Helpers for test_concatenate_uns def uns_ad(uns): return AnnData(np.zeros((10, 10)), uns=uns) def map_values(mapping, path, key, old_parent, new_parent, new_items): ret = default_exit(path, key, old_parent, new_parent, new_items) for k, v in ret.items(): if isinstance(v, Hashable) and v in mapping: ret[k] = mapping[v] return ret def permute_nested_values(dicts: "List[dict]", gen_val: "Callable[[int], Any]"): """ This function permutes the values of a nested mapping, for testing that out merge method work regardless of the values types. Assumes the intial dictionary had integers for values. 
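    For example (a hypothetical case, not from the original docstring),
    ``permute_nested_values([{"a": 1}, {"b": 2}], str)`` would return
    ``[{"a": "1"}, {"b": "2"}]``.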
""" dicts = deepcopy(dicts) initial_values = [ x[1] for x in research(dicts, query=lambda p, k, v: isinstance(v, int)) ] mapping = {k: gen_val(k) for k in initial_values} return [remap(d, exit=partial(map_values, mapping)) for d in dicts] def gen_df(n): return helpers.gen_typed_df(n) def gen_array(n): return np.random.randn(n) def gen_list(n): return list(gen_array(n)) def gen_sparse(n): return sparse.random(np.random.randint(1, 100), np.random.randint(1, 100)) def gen_something(n): options = [gen_df, gen_array, gen_list, gen_sparse] return np.random.choice(options)(n) def gen_concat_params(unss, compat2result): value_generators = [ lambda x: x, gen_df, gen_array, gen_list, gen_sparse, gen_something, ] for gen, (mode, result) in product(value_generators, compat2result.items()): yield pytest.param(unss, mode, result, gen) @pytest.mark.parametrize( ["unss", "merge_strategy", "result", "value_gen"], chain( gen_concat_params( [{"a": 1}, {"a": 2}], {None: {}, "first": {"a": 1}, "unique": {}, "same": {}, "only": {}}, ), gen_concat_params( [{"a": 1}, {"b": 2}], { None: {}, "first": {"a": 1, "b": 2}, "unique": {"a": 1, "b": 2}, "same": {}, "only": {"a": 1, "b": 2}, }, ), gen_concat_params( [ {"a": {"b": 1, "c": {"d": 3}}}, {"a": {"b": 1, "c": {"e": 4}}}, ], { None: {}, "first": {"a": {"b": 1, "c": {"d": 3, "e": 4}}}, "unique": {"a": {"b": 1, "c": {"d": 3, "e": 4}}}, "same": {"a": {"b": 1}}, "only": {"a": {"c": {"d": 3, "e": 4}}}, }, ), gen_concat_params( [ {"a": 1}, {"a": 1, "b": 2}, {"a": 1, "b": {"b.a": 1}, "c": 3}, {"d": 4}, ], { None: {}, "first": {"a": 1, "b": 2, "c": 3, "d": 4}, "unique": {"a": 1, "c": 3, "d": 4}, "same": {}, "only": {"c": 3, "d": 4}, }, ), gen_concat_params( [{"a": i} for i in range(15)], {None: {}, "first": {"a": 0}, "unique": {}, "same": {}, "only": {}}, ), gen_concat_params( [{"a": 1} for i in range(10)] + [{"a": 2}], {None: {}, "first": {"a": 1}, "unique": {}, "same": {}, "only": {}}, ), ), ) def test_concatenate_uns(unss, merge_strategy, result, value_gen): """ Test that concatenation works out for different strategies and sets of values. Params ------ unss Set of patterns for values in uns. compat Strategy to use for merging uns. result Pattern we expect to see for the given input and strategy. value_gen Maps values in unss and results to another set of values. This is for checking that we're comparing values correctly. For example `[{"a": 1}, {"a": 1}]` may get mapped to `[{"a": [1, 2, 3]}, {"a": [1, 2, 3]}]`. 
""" # So we can see what the initial pattern was meant to be print(merge_strategy, "\n", unss, "\n", result) result, *unss = permute_nested_values([result] + unss, value_gen) adatas = [uns_ad(uns) for uns in unss] assert_equal( adatas[0].concatenate(adatas[1:], uns_merge=merge_strategy).uns, result, elem_name="uns", ) def test_transposed_concat(array_type, axis, join_type, merge_strategy, fill_val): lhs = gen_adata((10, 10), X_type=array_type) rhs = gen_adata((10, 12), X_type=array_type) a = concat([lhs, rhs], axis=axis, join=join_type, merge=merge_strategy) b = concat( [lhs.T, rhs.T], axis=abs(axis - 1), join=join_type, merge=merge_strategy ).T assert_equal(a, b) def test_batch_key(axis): """Test that concat only adds a label if the key is provided""" def get_annot(adata): return getattr(adata, ("obs", "var")[axis]) lhs = gen_adata((10, 10)) rhs = gen_adata((10, 12)) # There is probably a prettier way to do this annot = get_annot(concat([lhs, rhs], axis=axis)) assert ( list( annot.columns.difference( get_annot(lhs).columns.union(get_annot(rhs).columns) ) ) == [] ) batch_annot = get_annot(concat([lhs, rhs], axis=axis, label="batch")) assert list( batch_annot.columns.difference( get_annot(lhs).columns.union(get_annot(rhs).columns) ) ) == ["batch"] def test_concat_categories_from_mapping(): mapping = { "a": gen_adata((10, 10)), "b": gen_adata((10, 10)), } keys = list(mapping.keys()) adatas = list(mapping.values()) mapping_call = partial(concat, mapping) iter_call = partial(concat, adatas, keys=keys) assert_equal(mapping_call(), iter_call()) assert_equal(mapping_call(label="batch"), iter_call(label="batch")) assert_equal(mapping_call(index_unique="-"), iter_call(index_unique="-")) assert_equal( mapping_call(label="group", index_unique="+"), iter_call(label="group", index_unique="+"), ) def test_concat_names(axis): def get_annot(adata): return getattr(adata, ("obs", "var")[axis]) lhs = gen_adata((10, 10)) rhs = gen_adata((10, 10)) assert not get_annot(concat([lhs, rhs], axis=axis)).index.is_unique assert get_annot(concat([lhs, rhs], axis=axis, index_unique="-")).index.is_unique def axis_labels(adata, axis): return (adata.obs_names, adata.var_names)[axis] def expected_shape(a, b, axis, join): labels = partial(axis_labels, axis=abs(axis - 1)) shape = [None, None] shape[axis] = a.shape[axis] + b.shape[axis] if join == "inner": shape[abs(axis - 1)] = len(labels(a).intersection(labels(b))) elif join == "outer": shape[abs(axis - 1)] = len(labels(a).union(labels(b))) else: raise ValueError() return tuple(shape) @pytest.mark.parametrize( "shape", [pytest.param((8, 0), id="no_var"), pytest.param((0, 10), id="no_obs")] ) def test_concat_size_0_dim(axis, join_type, merge_strategy, shape): # https://github.com/theislab/anndata/issues/526 a = gen_adata((5, 7)) b = gen_adata(shape) alt_axis = 1 - axis dim = ("obs", "var")[axis] expected_size = expected_shape(a, b, axis=axis, join=join_type) result = concat( {"a": a, "b": b}, axis=axis, join=join_type, merge=merge_strategy, pairwise=True, index_unique="-", ) assert result.shape == expected_size if join_type == "outer": # Check new entries along axis of concatenation axis_new_inds = axis_labels(result, axis).str.endswith("-b") altaxis_new_inds = ~axis_labels(result, alt_axis).isin(axis_labels(a, alt_axis)) axis_idx = make_idx_tuple(axis_new_inds, axis) altaxis_idx = make_idx_tuple(altaxis_new_inds, 1 - axis) check_filled_like(result.X[axis_idx], elem_name="X") check_filled_like(result.X[altaxis_idx], elem_name="X") for k, elem in getattr(result, 
"layers").items(): check_filled_like(elem[axis_idx], elem_name=f"layers/{k}") check_filled_like(elem[altaxis_idx], elem_name=f"layers/{k}") if shape[axis] > 0: b_result = result[axis_idx].copy() mapping_elem = f"{dim}m" setattr(b_result, f"{dim}_names", getattr(b, f"{dim}_names")) for k, result_elem in getattr(b_result, mapping_elem).items(): elem_name = f"{mapping_elem}/{k}" # pd.concat can have unintuitive return types. is similar to numpy promotion if isinstance(result_elem, pd.DataFrame): assert_equal( getattr(b, mapping_elem)[k].astype(object), result_elem.astype(object), elem_name=elem_name, ) else: assert_equal( getattr(b, mapping_elem)[k], result_elem, elem_name=elem_name, ) @pytest.mark.parametrize("elem", ["sparse", "array", "df"]) def test_concat_outer_aligned_mapping(elem): a = gen_adata((5, 5)) b = gen_adata((3, 5)) del b.obsm[elem] concated = concat({"a": a, "b": b}, join="outer", label="group") result = concated.obsm[elem][concated.obs["group"] == "b"] check_filled_like(result, elem_name=f"obsm/{elem}") def test_concatenate_size_0_dim(): # https://github.com/theislab/anndata/issues/526 a = gen_adata((5, 10)) b = gen_adata((5, 0)) # Mostly testing that this doesn't error a.concatenate([b]).shape == (10, 0) b.concatenate([a]).shape == (10, 0) # Leaving out for now. See definition of these values for explanation # def test_concatenate_uns_types(): # from anndata._core.merge import UNS_STRATEGIES, UNS_STRATEGIES_TYPE # assert set(UNS_STRATEGIES.keys()) == set(UNS_STRATEGIES_TYPE.__args__) anndata-0.7.8/anndata/tests/test_deprecations.py000066400000000000000000000137661414255741200220170ustar00rootroot00000000000000"""\ This file contains tests for deprecated functions. This includes correct behaviour as well as throwing warnings. """ import h5py import numpy as np import pytest from scipy import sparse import anndata as ad from anndata import AnnData from anndata.tests.helpers import assert_equal @pytest.fixture def adata(): adata = AnnData( X=sparse.csr_matrix([[0, 2, 3], [0, 5, 6]]), obs=dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), var=dict(var_names=["a", "b", "c"]), ) adata.raw = adata adata.layers["x2"] = adata.X * 2 adata.var["anno2"] = ["p1", "p2", "p3"] adata.X = adata.X / 2 return adata def test_get_obsvar_array_warn(adata): with pytest.warns(DeprecationWarning): adata._get_obs_array("a") with pytest.warns(DeprecationWarning): adata._get_var_array("s1") # TODO: Why doesn’t this mark work? 
# @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_get_obsvar_array(adata): with pytest.warns(DeprecationWarning): # Just to hide warnings assert np.allclose(adata._get_obs_array("a"), adata.obs_vector("a")) assert np.allclose( adata._get_obs_array("a", layer="x2"), adata.obs_vector("a", layer="x2"), ) assert np.allclose( adata._get_obs_array("a", use_raw=True), adata.raw.obs_vector("a") ) assert np.allclose(adata._get_var_array("s1"), adata.var_vector("s1")) assert np.allclose( adata._get_var_array("s1", layer="x2"), adata.var_vector("s1", layer="x2"), ) assert np.allclose( adata._get_var_array("s1", use_raw=True), adata.raw.var_vector("s1") ) def test_obsvar_vector_Xlayer(adata): with pytest.warns(FutureWarning): adata.var_vector("s1", layer="X") with pytest.warns(FutureWarning): adata.obs_vector("a", layer="X") adata = adata.copy() adata.layers["X"] = adata.X * 3 with pytest.warns(None) as records: adata.var_vector("s1", layer="X") adata.obs_vector("a", layer="X") for r in records: # This time it shouldn’t throw a warning if "anndata" in r.filename: assert r.category is not FutureWarning def test_force_dense_deprecated(tmp_path): dense_pth = tmp_path / "dense.h5ad" adata = AnnData(X=sparse.random(10, 10, format="csr")) adata.raw = adata with pytest.warns(FutureWarning): adata.write_h5ad(dense_pth, force_dense=True) with h5py.File(dense_pth, "r") as f: assert isinstance(f["X"], h5py.Dataset) assert isinstance(f["raw/X"], h5py.Dataset) dense = ad.read_h5ad(dense_pth) assert isinstance(dense.X, np.ndarray) assert isinstance(dense.raw.X, np.ndarray) assert_equal(adata, dense) ####################################### # Dealing with uns adj matrices ####################################### def test_get_uns_neighbors_deprecated(adata): n = adata.shape[0] mtx = sparse.random(n, n, density=0.3, format="csr") adata.obsp["connectivities"] = mtx adata.uns["neighbors"] = {} with pytest.warns(FutureWarning): from_uns = adata.uns["neighbors"]["connectivities"] assert_equal(from_uns, mtx) with pytest.warns(None) as rec: v = adata[: n // 2] assert not rec with pytest.warns(FutureWarning): from_uns_v = v.uns["neighbors"]["connectivities"] assert_equal(from_uns_v, v.obsp["connectivities"]) def test_set_uns_neighbors_deprecated(adata): n = adata.shape[0] mtx = sparse.random(n, n, format="csr") adata.uns["neighbors"] = {} with pytest.warns(FutureWarning): adata.uns["neighbors"]["connectivities"] = sparse.random(n, n, format="csr") assert_equal(adata.obsp["connectivities"], mtx) with pytest.warns(FutureWarning): assert_equal(adata.uns["neighbors"]["connectivities"], mtx) # Make sure that we can write to uns normally: adata.uns["new_key"] = 100 assert adata.uns["new_key"] == 100 def test_slice_uns_sparse_deprecated(): adata = AnnData(sparse.csr_matrix((500, 10))) n = adata.shape[0] mtx = sparse.random(n, n, density=0.2, format="csr") adata.uns["sparse_mtx"] = mtx with pytest.warns(FutureWarning): v = adata[: n // 2] assert_equal(adata.uns["sparse_mtx"], mtx) assert_equal(v.uns["sparse_mtx"], mtx[: n // 2, : n // 2]) @pytest.fixture def adata_neighbors(): return ad.AnnData( X=sparse.random(100, 200, format="csr"), obsp=dict( distances=sparse.random(100, 100, format="csr"), connectivities=sparse.random(100, 100, format="csr"), ), uns={"neighbors": {"params": {"method": "umap", "n_neighbors": 10}}}, ) def test_deprecated_neighbors_get_mtx(adata_neighbors): """Test getting neighbor matrices from adata.uns""" adata = adata_neighbors with pytest.warns(FutureWarning): 
assert_equal(adata.obsp["distances"], adata.uns["neighbors"]["distances"]) with pytest.warns(FutureWarning): assert_equal( adata.obsp["connectivities"], adata.uns["neighbors"]["connectivities"] ) def test_deprecated_neighbors_get_other(adata_neighbors): """Test getting other fields from adata.uns""" adata = adata_neighbors # This shouldn't throw a warning with pytest.warns(None) as rec: assert adata.uns["neighbors"]["params"] == {"method": "umap", "n_neighbors": 10} assert not rec def test_deprecated_neighbors_set_other(adata_neighbors): adata = adata_neighbors # This shouldn't throw a warning with pytest.warns(None) as rec: adata.uns["neighbors"]["new_key"] = 10 assert adata.uns["neighbors"]["new_key"] == 10 # Test nested adata.uns["neighbors"]["params"]["new_param"] = 100 assert adata.uns["neighbors"]["params"]["new_param"] == 100 assert adata.uns["neighbors"]["params"] == { "method": "umap", "n_neighbors": 10, "new_param": 100, } assert not rec anndata-0.7.8/anndata/tests/test_get_vector.py000066400000000000000000000044151414255741200214670ustar00rootroot00000000000000import numpy as np import pandas as pd from scipy import sparse import pytest import anndata as ad def test_amgibuous_keys(): """Tests that an error is raised if obs_vector or var_vector is ambiguous.""" var_keys = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"] obs_keys = [ "Lorem", "ipsum", "dolor", "sit", "amet", "consectetur", "adipiscing", "elit", ] adata = ad.AnnData( X=sparse.random(len(obs_keys), len(var_keys), format="csr"), layers={"layer": sparse.random(len(obs_keys), len(var_keys), format="csr")}, obs=pd.DataFrame( np.random.randn(len(obs_keys), len(obs_keys) + len(var_keys)), index=obs_keys, columns=obs_keys + var_keys, ), var=pd.DataFrame( np.random.randn(len(var_keys), len(obs_keys) + len(var_keys)), index=var_keys, columns=var_keys + obs_keys, ), ) adata.raw = adata for k in var_keys: # These are mostly to check that the test is working assert k in adata.var_names assert k in adata.obs.columns # Now the actual checks: with pytest.raises(ValueError, match=r".*var_names.*obs\.columns.*"): adata.obs_vector(k) with pytest.raises(ValueError, match=r".*var_names.*obs\.columns.*"): adata.obs_vector(k, layer="layer") # Should uniquely select column from in adata.var assert list(adata.var[k]) == list(adata.var_vector(k)) assert list(adata.var[k]) == list(adata.var_vector(k, layer="layer")) assert list(adata.raw.var[k]) == list(adata.raw.var_vector(k)) for k in obs_keys: assert k in adata.obs_names assert k in adata.var.columns with pytest.raises(ValueError, match=r".*obs_names.*var\.columns"): adata.var_vector(k) with pytest.raises(ValueError, match=r".*obs_names.*var\.columns"): adata.var_vector(k, layer="layer") assert list(adata.obs[k]) == list(adata.obs_vector(k)) assert list(adata.obs[k]) == list(adata.obs_vector(k, layer="layer")) with pytest.raises(ValueError, match=r".*obs_names.*var\.columns*"): adata.raw.var_vector(k) anndata-0.7.8/anndata/tests/test_hdf5_backing.py000066400000000000000000000240161414255741200216310ustar00rootroot00000000000000from pathlib import Path import re import joblib import pytest import numpy as np from scipy import sparse import anndata as ad from anndata.tests.helpers import gen_adata, assert_equal, subset_func from anndata.utils import asarray subset_func2 = subset_func # ------------------------------------------------------------------------------- # Some test data # ------------------------------------------------------------------------------- 
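# --- Illustrative sketch (not part of the test suite) --------------------------
# The tests below exercise AnnData's backed (on-disk) mode. As a minimal,
# hedged sketch -- using only the public calls that appear elsewhere in this
# file, with placeholder file names instead of the tmp_path fixtures -- the
# basic backed round trip looks roughly like:
#
#     adata = ad.AnnData(np.eye(3))
#     adata.filename = "backed.h5ad"              # switch to backed mode; X lives on disk
#     assert adata.isbacked
#     adata.write()                               # flush and close the backing file
#     backed = ad.read("backed.h5ad", backed="r") # read-only backed object
#     mem = backed.to_memory()                    # pull everything back into memory
#
# --------------------------------------------------------------------------------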
@pytest.fixture def adata(): X_list = [ [1, 2, 3], [4, 5, 6], [7, 8, 9], ] # data matrix of shape n_obs x n_vars X = np.array(X_list) obs_dict = dict( # annotation of observations / rows row_names=["name1", "name2", "name3"], # row annotation oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation ) var_dict = dict(vanno1=[3.1, 3.2, 3.3]) # annotation of variables / columns uns_dict = dict( # unstructured annotation oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"] ) return ad.AnnData( X, obs=obs_dict, var=var_dict, uns=uns_dict, obsm=dict(o1=np.zeros((X.shape[0], 10))), varm=dict(v1=np.ones((X.shape[1], 20))), layers=dict(float=X.astype(float), sparse=sparse.csr_matrix(X)), dtype="int32", ) @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array], ids=["scipy-csr", "scipy-csc", "np-array"], ) def mtx_format(request): return request.param @pytest.fixture(params=[sparse.csr_matrix, sparse.csc_matrix]) def sparse_format(request): return request.param @pytest.fixture(params=["r+", "r", False]) def backed_mode(request): return request.param @pytest.fixture(params=[True, False]) def force_dense(request): return request.param # ------------------------------------------------------------------------------- # The test functions # ------------------------------------------------------------------------------- # TODO: Check to make sure obs, obsm, layers, ... are written and read correctly as well def test_read_write_X(tmp_path, mtx_format, backed_mode, force_dense): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.h5ad" backed_pth = base_pth / "backed.h5ad" orig = ad.AnnData(mtx_format(asarray(sparse.random(10, 10, format="csr")))) orig.write(orig_pth) backed = ad.read(orig_pth, backed=backed_mode) backed.write(backed_pth, as_dense=["X"]) backed.file.close() from_backed = ad.read(backed_pth) assert np.all(asarray(orig.X) == asarray(from_backed.X)) # this is very similar to the views test def test_backing(adata, tmp_path, backing_h5ad): assert not adata.isbacked adata.filename = backing_h5ad adata.write() assert not adata.file.is_open assert adata.isbacked assert adata[:, 0].is_view assert adata[:, 0].X.tolist() == np.reshape([1, 4, 7], (3, 1)).tolist() # this might give us a trouble as the user might not # know that the file is open again.... assert adata.file.is_open adata[:2, 0].X = [0, 0] assert adata[:, 0].X.tolist() == np.reshape([0, 0, 7], (3, 1)).tolist() adata_subset = adata[:2, [0, 1]] assert adata_subset.is_view subset_hash = joblib.hash(adata_subset) # cannot set view in backing mode... 
with pytest.raises(ValueError): adata_subset.obs["foo"] = range(2) with pytest.raises(ValueError): adata_subset.var["bar"] = -12 with pytest.raises(ValueError): adata_subset.obsm["o2"] = np.ones((2, 2)) with pytest.raises(ValueError): adata_subset.varm["v2"] = np.zeros((2, 2)) with pytest.raises(ValueError): adata_subset.layers["float2"] = adata_subset.layers["float"].copy() # Things should stay the same after failed operations assert subset_hash == joblib.hash(adata_subset) assert adata_subset.is_view # need to copy first adata_subset = adata_subset.copy(tmp_path / "test.subset.h5ad") # now transition to actual object assert not adata_subset.is_view adata_subset.obs["foo"] = range(2) assert not adata_subset.is_view assert adata_subset.isbacked assert adata_subset.obs["foo"].tolist() == list(range(2)) # save adata_subset.write() def test_backing_copy(adata, tmp_path, backing_h5ad): adata.filename = backing_h5ad adata.write() copypath = tmp_path / "test.copy.h5ad" copy = adata.copy(copypath) assert adata.filename == backing_h5ad assert copy.filename == copypath assert adata.isbacked assert copy.isbacked # TODO: Also test updating the backing file inplace def test_backed_raw(tmp_path): backed_pth = tmp_path / "backed.h5ad" final_pth = tmp_path / "final.h5ad" mem_adata = gen_adata((10, 10)) mem_adata.raw = mem_adata mem_adata.write(backed_pth) backed_adata = ad.read_h5ad(backed_pth, backed="r") assert_equal(backed_adata, mem_adata) backed_adata.write_h5ad(final_pth) final_adata = ad.read_h5ad(final_pth) assert_equal(final_adata, mem_adata) @pytest.mark.parametrize( "array_type", [ pytest.param(asarray, id="dense_array"), pytest.param(sparse.csr_matrix, id="csr_matrix"), ], ) def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): backed_pth = tmp_path / "backed.h5ad" final_pth = tmp_path / "final.h5ad" mem_adata = gen_adata((10, 10), X_type=array_type) mem_adata.raw = mem_adata obs_idx = subset_func(mem_adata.obs_names) var_idx = subset_func2(mem_adata.var_names) if ( array_type is asarray and isinstance(obs_idx, (np.ndarray, sparse.spmatrix)) and isinstance(var_idx, (np.ndarray, sparse.spmatrix)) ): pytest.xfail( "Fancy indexing does not work with multiple arrays on a h5py.Dataset" ) mem_adata.write(backed_pth) ### Backed view has same values as in memory view ### backed_adata = ad.read_h5ad(backed_pth, backed="r") backed_v = backed_adata[obs_idx, var_idx] assert backed_v.is_view mem_v = mem_adata[obs_idx, var_idx] # Value equivalent assert_equal(mem_v, backed_v) # Type and value equivalent assert_equal(mem_v.copy(), backed_v.to_memory(), exact=True) assert backed_v.is_view assert backed_v.isbacked ### Write from backed view ### backed_v.write_h5ad(final_pth) final_adata = ad.read_h5ad(final_pth) assert_equal(mem_v, final_adata) assert_equal(final_adata, backed_v.to_memory()) # assert loading into memory @pytest.mark.parametrize( "array_type", [ pytest.param(asarray, id="dense_array"), pytest.param(sparse.csr_matrix, id="csr_matrix"), ], ) def test_to_memory_full(tmp_path, array_type): backed_pth = tmp_path / "backed.h5ad" mem_adata = gen_adata((15, 10), X_type=array_type) mem_adata.raw = gen_adata((15, 12), X_type=array_type) mem_adata.write_h5ad(backed_pth, compression="lzf") backed_adata = ad.read_h5ad(backed_pth, backed="r") assert_equal(mem_adata, backed_adata.to_memory()) # Test that raw can be removed del backed_adata.raw del mem_adata.raw assert_equal(mem_adata, backed_adata.to_memory()) def test_to_memory_error(): adata = gen_adata((5, 3)) with 
pytest.raises(ValueError): adata.to_memory() def test_double_index(adata, backing_h5ad): adata.filename = backing_h5ad with pytest.raises(ValueError): # no view of view of backed object currently adata[:2][:, 0] # close backing file adata.write() def test_return_to_memory_mode(adata, backing_h5ad): bdata = adata.copy() adata.filename = backing_h5ad assert adata.isbacked adata.filename = None assert not adata.isbacked # make sure the previous file had been properly closed # when setting `adata.filename = None` # if it hadn’t the following line would throw an error bdata.filename = backing_h5ad # close the file bdata.filename = None def test_backed_modification(adata, backing_h5ad): adata.X[:, 1] = 0 # Make it a little sparse adata.X = sparse.csr_matrix(adata.X) assert not adata.isbacked # While this currently makes the file backed, it doesn’t write it as sparse adata.filename = backing_h5ad adata.write() assert not adata.file.is_open assert adata.isbacked adata.X[0, [0, 2]] = 10 adata.X[1, [0, 2]] = [11, 12] adata.X[2, 1] = 13 # If it were written as sparse, this should fail assert adata.isbacked assert np.all(adata.X[0, :] == np.array([10, 0, 10])) assert np.all(adata.X[1, :] == np.array([11, 0, 12])) assert np.all(adata.X[2, :] == np.array([7, 13, 9])) def test_backed_modification_sparse(adata, backing_h5ad, sparse_format): adata.X[:, 1] = 0 # Make it a little sparse adata.X = sparse_format(adata.X) assert not adata.isbacked adata.write(backing_h5ad) adata = ad.read_h5ad(backing_h5ad, backed="r+") assert adata.filename == backing_h5ad assert adata.isbacked adata.X[0, [0, 2]] = 10 adata.X[1, [0, 2]] = [11, 12] with pytest.raises(ValueError): adata.X[2, 1] = 13 assert adata.isbacked assert np.all(adata.X[0, :] == np.array([10, 0, 10])) assert np.all(adata.X[1, :] == np.array([11, 0, 12])) assert np.all(adata.X[2, :] == np.array([7, 0, 9])) # TODO: Work around h5py not supporting this # def test_backed_view_modification(adata, backing_h5ad): # adata.write(backing_h5ad) # backed_adata = ad.read_h5ad(backing_h5ad, backed=True) # backed_view = backed_adata[[1, 2], :] # backed_view.X = 0 # assert np.all(backed_adata.X[:3, :] == 0) # TODO: Implement # def test_backed_view_modification_sparse(adata, backing_h5ad, sparse_format): # adata[:, 1] = 0 # Make it a little sparse # adata.X = sparse_format(adata.X) # adata.write(backing_h5ad) # backed_adata = ad.read_h5ad(backing_h5ad, backed=True) # backed_view = backed_adata[[1,2], :] # backed_view.X = 0 # assert np.all(backed_adata.X[[1,2], :] == 0) anndata-0.7.8/anndata/tests/test_helpers.py000066400000000000000000000136241414255741200207720ustar00rootroot00000000000000from string import ascii_letters import pandas as pd import pytest import numpy as np from scipy import sparse import anndata as ad from anndata.tests.helpers import assert_equal, report_name, gen_adata # Testing to see if all error types can have the key name appended. # Currently fails for 22/118 since they have required arguments. Not sure what to do about that. 
# # @singledispatch # def iswarning(x): # return iswarning(type(x)) # @iswarning.register(type) # def _notwarning(x): # return False # @iswarning.register(Warning) # def _iswarning(x): # return True # @pytest.mark.parametrize("exception", list(filter(lambda t: not iswarning(t), Exception.__subclasses__()))) # def test_report_name_types(exception): # def throw(e): # raise e() # tag = "".join(np.random.permutation(list(ascii_letters))) # with pytest.raises(exception) as err: # report_name(throw)(exception, _elem_name=tag) # assert tag in str(err.value) @pytest.fixture(scope="function") def reusable_adata(): """Reusable anndata for when tests shouldn’t mutate it""" return gen_adata((10, 10)) # Does this work for every warning? def test_report_name(): def raise_error(): raise Exception("an error occured!") letters = np.array(list(ascii_letters)) tag = "".join(np.random.permutation(letters)) with pytest.raises(Exception) as e1: raise_error() with pytest.raises(Exception) as e2: report_name(raise_error)(_elem_name=tag) assert str(e2.value).startswith(str(e1.value)) assert tag in str(e2.value) def test_assert_equal(): # ndarrays assert_equal(np.ones((10, 10)), np.ones((10, 10))) assert_equal( # Should this require an exact test? np.ones((10, 10), dtype="i8"), np.ones((10, 10), dtype="f8") ) assert_equal( np.array(list(ascii_letters)), np.array(list(ascii_letters)), exact=True ) with pytest.raises(AssertionError): assert_equal(np.array(list(ascii_letters)), np.array(list(ascii_letters))[::-1]) adata = gen_adata((10, 10)) adata.raw = adata.copy() assert_equal(adata, adata.copy(), exact=True) # TODO: I’m not sure this is good behaviour, I’ve disabled in for now. # assert_equal( # adata, # adata[ # np.random.permutation(adata.obs_names), # np.random.permutation(adata.var_names), # ].copy(), # exact=False, # ) adata2 = adata.copy() to_modify = list(adata2.layers.keys())[0] del adata2.layers[to_modify] with pytest.raises(AssertionError) as missing_layer_error: assert_equal(adata, adata2) assert "layers" in str(missing_layer_error.value) # `to_modify` will be in pytest info adata2 = adata.copy() adata2.layers[to_modify][0, 0] = adata2.layers[to_modify][0, 0] + 1 with pytest.raises(AssertionError) as changed_layer_error: assert_equal(adata, adata2) assert "layers" in str(changed_layer_error.value) assert to_modify in str(changed_layer_error.value) assert_equal(adata.obs, adata.obs.copy(), exact=True) csr = sparse.random(100, 100, format="csr") csc = csr.tocsc() dense = csr.toarray() assert_equal(csr, csc) assert_equal(csc, dense) assert_equal(dense, csc) def test_assert_equal_raw(): base = gen_adata((10, 10)) orig = base.copy() orig.raw = base.copy() mod = base.copy() mod.X[0, 0] = mod.X[0, 0] + 1 to_compare = base.copy() to_compare.raw = mod.copy() with pytest.raises(AssertionError): assert_equal(orig, to_compare) mod = base.copy() mod.var["new_val"] = 1 to_compare = base.copy() to_compare.raw = mod.copy() with pytest.raises(AssertionError): assert_equal(orig, to_compare) def test_assert_equal_raw_presence(): # This was causing some testing issues during # https://github.com/theislab/anndata/pull/542 a = gen_adata((10, 20)) b = a.copy() a.raw = a.copy() assert b.raw is None with pytest.raises(AssertionError): assert_equal(a, b) with pytest.raises(AssertionError): assert_equal(b, a) # TODO: Should views be equal to actual? # Should they not be if an exact comparison is made? 
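# Illustrative sketch (not part of the test suite): as the tests above demonstrate,
# `assert_equal` compares AnnData objects and their components recursively and, on
# failure, raises an AssertionError that names the element which differs. Roughly:
#
#     a = gen_adata((10, 10))
#     b = a.copy()
#     key = next(iter(b.layers))
#     b.layers[key][0, 0] += 1
#     assert_equal(a, b)                     # AssertionError mentioning "layers" and `key`
#     assert_equal(a, a.copy(), exact=True)  # exact=True makes the comparison stricter
#
# The exact wording of the error message is an implementation detail; the tests here
# only check that the element name (and, where applicable, the key) appears in it.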
def test_assert_equal_aligned_mapping(): adata1 = gen_adata((10, 10)) adata2 = adata1.copy() for attr in ["obsm", "varm", "layers", "obsp", "varp"]: assert_equal(getattr(adata1, attr), getattr(adata2, attr)) # Checking that subsetting other axis only changes some attrs obs_subset = adata2[:5, :] for attr in ["obsm", "layers", "obsp"]: with pytest.raises(AssertionError): assert_equal(getattr(adata1, attr), getattr(obs_subset, attr)) for attr in ["varm", "varp"]: assert_equal(getattr(adata1, attr), getattr(obs_subset, attr)) var_subset = adata2[:, 5:] for attr in ["varm", "layers", "varp"]: with pytest.raises(AssertionError): assert_equal(getattr(adata1, attr), getattr(var_subset, attr)) for attr in ["obsm", "obsp"]: assert_equal(getattr(adata1, attr), getattr(var_subset, attr)) def test_assert_equal_aligned_mapping_empty(): chars = np.array(list(ascii_letters)) adata = ad.AnnData( X=np.zeros((10, 10)), obs=pd.DataFrame([], index=np.random.choice(chars[:20], 10, replace=False)), var=pd.DataFrame([], index=np.random.choice(chars[:20], 10, replace=False)), ) diff_idx = ad.AnnData( X=np.zeros((10, 10)), obs=pd.DataFrame([], index=np.random.choice(chars[20:], 10, replace=False)), var=pd.DataFrame([], index=np.random.choice(chars[20:], 10, replace=False)), ) same_idx = ad.AnnData(adata.X, obs=adata.obs.copy(), var=adata.var.copy()) for attr in ["obsm", "varm", "layers", "obsp", "varp"]: with pytest.raises(AssertionError): assert_equal(getattr(adata, attr), getattr(diff_idx, attr)) assert_equal(getattr(adata, attr), getattr(same_idx, attr)) anndata-0.7.8/anndata/tests/test_inplace_subset.py000066400000000000000000000042451414255741200223270ustar00rootroot00000000000000import numpy as np import pytest from scipy import sparse from anndata.tests.helpers import assert_equal, gen_adata, subset_func from anndata.utils import asarray @pytest.fixture( params=[np.array, sparse.csr_matrix, sparse.csc_matrix], ids=["np_array", "scipy_csr", "scipy_csc"], ) def matrix_type(request): return request.param # TODO: Test values of .uns def test_inplace_subset_var(matrix_type, subset_func): orig = gen_adata((30, 30), X_type=matrix_type) subset_idx = subset_func(orig.var_names) modified = orig.copy() from_view = orig[:, subset_idx].copy() modified._inplace_subset_var(subset_idx) assert_equal(asarray(from_view.X), asarray(modified.X), exact=True) assert_equal(from_view.obs, modified.obs, exact=True) assert_equal(from_view.var, modified.var, exact=True) for k in from_view.obsm: assert_equal(asarray(from_view.obsm[k]), asarray(modified.obsm[k]), exact=True) assert_equal(asarray(orig.obsm[k]), asarray(modified.obsm[k]), exact=True) for k in from_view.varm: assert_equal(asarray(from_view.varm[k]), asarray(modified.varm[k]), exact=True) for k in from_view.layers: assert_equal( asarray(from_view.layers[k]), asarray(modified.layers[k]), exact=True ) def test_inplace_subset_obs(matrix_type, subset_func): orig = gen_adata((30, 30), X_type=matrix_type) subset_idx = subset_func(orig.obs_names) modified = orig.copy() from_view = orig[subset_idx, :].copy() modified._inplace_subset_obs(subset_idx) assert_equal(asarray(from_view.X), asarray(modified.X), exact=True) assert_equal(from_view.obs, modified.obs, exact=True) assert_equal(from_view.var, modified.var, exact=True) for k in from_view.obsm: assert_equal(asarray(from_view.obsm[k]), asarray(modified.obsm[k]), exact=True) for k in from_view.varm: assert_equal(asarray(from_view.varm[k]), asarray(modified.varm[k]), exact=True) assert_equal(asarray(orig.varm[k]), 
asarray(modified.varm[k]), exact=True) for k in from_view.layers: assert_equal( asarray(from_view.layers[k]), asarray(modified.layers[k]), exact=True ) anndata-0.7.8/anndata/tests/test_io_conversion.py000066400000000000000000000073541414255741200222070ustar00rootroot00000000000000"""\ This file contains tests for conversion made during io. """ import h5py import numpy as np import pytest from scipy import sparse import anndata as ad from anndata.tests.helpers import gen_adata, assert_equal @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array], ids=["scipy-csr", "scipy-csc", "np-array"], ) def mtx_format(request): return request.param @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix], ids=["scipy-csr", "scipy-csc"], ) def spmtx_format(request): return request.param @pytest.fixture(params=[("raw/X",), ("X",), ("X", "raw/X")]) def to_convert(request): return request.param def test_sparse_to_dense_disk(tmp_path, mtx_format, to_convert): mem_pth = tmp_path / "orig.h5ad" dense_from_mem_pth = tmp_path / "dense_mem.h5ad" dense_from_disk_pth = tmp_path / "dense_disk.h5ad" mem = gen_adata((50, 50), mtx_format) mem.raw = mem mem.write_h5ad(mem_pth) disk = ad.read_h5ad(mem_pth, backed="r") mem.write_h5ad(dense_from_mem_pth, as_dense=to_convert) disk.write_h5ad(dense_from_disk_pth, as_dense=to_convert) with h5py.File(dense_from_mem_pth, "r") as f: for k in to_convert: assert isinstance(f[k], h5py.Dataset) with h5py.File(dense_from_disk_pth, "r") as f: for k in to_convert: assert isinstance(f[k], h5py.Dataset) for backed in [None, "r"]: from_mem = ad.read_h5ad(dense_from_mem_pth, backed=backed) from_disk = ad.read_h5ad(dense_from_disk_pth, backed=backed) assert_equal(mem, from_mem) assert_equal(mem, from_disk) assert_equal(disk, from_mem) assert_equal(disk, from_disk) def test_sparse_to_dense_inplace(tmp_path, spmtx_format): pth = tmp_path / "adata.h5ad" orig = gen_adata((50, 50), spmtx_format) orig.raw = orig orig.write(pth) backed = ad.read_h5ad(pth, backed="r+") backed.write(as_dense=("X", "raw/X")) new = ad.read_h5ad(pth) assert_equal(orig, new) assert_equal(backed, new) assert isinstance(new.X, np.ndarray) assert isinstance(new.raw.X, np.ndarray) assert isinstance(orig.X, spmtx_format) assert isinstance(orig.raw.X, spmtx_format) assert isinstance(backed.X, h5py.Dataset) assert isinstance(backed.raw.X, h5py.Dataset) def test_sparse_to_dense_errors(tmp_path): adata = ad.AnnData(X=sparse.random(50, 50, format="csr")) adata.layers["like_X"] = adata.X.copy() with pytest.raises(ValueError): adata.write_h5ad(tmp_path / "failure.h5ad", as_dense=("raw/X")) adata.write_h5ad(tmp_path / "failure.h5ad", as_dense=("raw", "X")) with pytest.raises(NotImplementedError): adata.write_h5ad(tmp_path / "failure.h5ad", as_dense=("layers/like_X")) def test_dense_to_sparse_memory(tmp_path, spmtx_format, to_convert): dense_path = tmp_path / "dense.h5ad" orig = gen_adata((50, 50), np.array) orig.raw = orig orig.write_h5ad(dense_path) assert not isinstance(orig.X, sparse.spmatrix) assert not isinstance(orig.raw.X, sparse.spmatrix) curr = ad.read_h5ad(dense_path, as_sparse=to_convert, as_sparse_fmt=spmtx_format) if "X" in to_convert: assert isinstance(curr.X, spmtx_format) if "raw/X" in to_convert: assert isinstance(curr.raw.X, spmtx_format) assert_equal(orig, curr) def test_dense_to_sparse_errors(tmp_path): dense_pth = tmp_path / "dense.h5ad" adata = ad.AnnData(X=np.ones((50, 50))) adata.layers["like_X"] = adata.X.copy() adata.write(dense_pth) with pytest.raises(NotImplementedError): 
ad.read_h5ad(dense_pth, as_sparse=("X",), as_sparse_fmt=sparse.coo_matrix) with pytest.raises(NotImplementedError): ad.read_h5ad(dense_pth, as_sparse=("layers/like_X",)) anndata-0.7.8/anndata/tests/test_io_utils.py000066400000000000000000000027201414255741200211520ustar00rootroot00000000000000from contextlib import suppress import pytest import zarr import h5py import pandas as pd from anndata.compat import _clean_uns from anndata._io.utils import report_read_key_on_error, AnnDataReadError @pytest.mark.parametrize( "group_fn", [ pytest.param(lambda _: zarr.group(), id="zarr"), pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"), ], ) def test_key_error(tmp_path, group_fn): @report_read_key_on_error def read_attr(_): raise NotImplementedError() group = group_fn(tmp_path) with group if hasattr(group, "__enter__") else suppress(): group["X"] = [1, 2, 3] group.create_group("group") with pytest.raises(AnnDataReadError) as e: read_attr(group["X"]) assert "'/X'" in str(e.value) with pytest.raises(AnnDataReadError) as e: read_attr(group["group"]) assert "'/group'" in str(e.value) def test_clean_uns(): d = dict( uns=dict(species_categories=["a", "b"]), obs=dict(species=pd.Series([0, 1, 0])), var=dict(species=pd.Series([0, 1, 0, 2])), ) _clean_uns(d) assert "species_categories" not in d["uns"] assert isinstance(d["obs"]["species"], pd.Categorical) assert d["obs"]["species"].tolist() == ["a", "b", "a"] # var’s categories were overwritten by obs’s, # which we can detect here because var has too high codes assert isinstance(d["var"]["species"], pd.Series) anndata-0.7.8/anndata/tests/test_layers.py000066400000000000000000000053621414255741200206270ustar00rootroot00000000000000from importlib.util import find_spec import pytest import numpy as np import pandas as pd from anndata import AnnData, read_loom, read_h5ad from anndata.tests.helpers import gen_typed_df_t2_size X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) L = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) def test_creation(): adata = AnnData(X=X, layers=dict(L=L.copy())) assert list(adata.layers.keys()) == ["L"] assert "L" in adata.layers assert "X" not in adata.layers assert "some_other_thing" not in adata.layers assert (adata.layers["L"] == L).all() def test_views(): adata = AnnData(X=X, layers=dict(L=L.copy())) adata_view = adata[1:, 1:] assert adata_view.layers.is_view assert adata_view.layers.parent_mapping == adata.layers assert adata_view.layers.keys() == adata.layers.keys() assert (adata_view.layers["L"] == adata.layers["L"][1:, 1:]).all() adata.layers["S"] = X assert adata_view.layers.keys() == adata.layers.keys() assert (adata_view.layers["S"] == adata.layers["S"][1:, 1:]).all() adata_view.layers["T"] = X[1:, 1:] assert not adata_view.layers.is_view assert not adata_view.is_view @pytest.mark.parametrize( "df,homogenous,dtype", [ (lambda: gen_typed_df_t2_size(*X.shape), True, np.object_), (lambda: pd.DataFrame(X ** 2), False, np.int_), ], ) def test_set_dataframe(homogenous, df, dtype): adata = AnnData(X) if homogenous: with pytest.warns(UserWarning, match=r"Layer 'df'.*dtype object"): adata.layers["df"] = df() else: with pytest.warns(None) as warnings: adata.layers["df"] = df() assert not len(warnings) assert isinstance(adata.layers["df"], np.ndarray) assert np.issubdtype(adata.layers["df"].dtype, dtype) def test_readwrite(backing_h5ad): adata = AnnData(X=X, layers=dict(L=L.copy())) adata.write(backing_h5ad) adata_read = read_h5ad(backing_h5ad) assert adata.layers.keys() == adata_read.layers.keys() assert 
(adata.layers["L"] == adata_read.layers["L"]).all() @pytest.mark.skipif(find_spec("loompy") is None, reason="loompy not installed") def test_readwrite_loom(tmp_path): loom_path = tmp_path / "test.loom" adata = AnnData(X=X, layers=dict(L=L.copy())) adata.write_loom(loom_path) adata_read = read_loom(loom_path, X_name="") assert adata.layers.keys() == adata_read.layers.keys() assert (adata.layers["L"] == adata_read.layers["L"]).all() def test_backed(): # backed mode for layers isn’t implemented, layers stay in memory pass def test_copy(): adata = AnnData(X=X, layers=dict(L=L.copy())) bdata = adata.copy() adata.layers["L"] += 10 assert np.all(adata.layers["L"] != bdata.layers["L"]) # 201 anndata-0.7.8/anndata/tests/test_obsmvarm.py000066400000000000000000000056361414255741200211620ustar00rootroot00000000000000import joblib import numpy as np import pandas as pd import pytest from scipy import sparse import anndata M, N = (100, 100) @pytest.fixture def adata(): X = np.zeros((M, N)) obs = pd.DataFrame( dict(batch=np.array(["a", "b"])[np.random.randint(0, 2, M)]), index=[f"cell{i:03d}" for i in range(N)], ) var = pd.DataFrame(index=[f"gene{i:03d}" for i in range(N)]) return anndata.AnnData(X, obs=obs, var=var) def test_assigmnent_dict(adata): d_obsm = dict( a=pd.DataFrame( dict(a1=np.ones(M), a2=[f"a{i}" for i in range(M)]), index=adata.obs_names, ), b=np.zeros((M, 2)), ) d_varm = dict( a=pd.DataFrame( dict(a1=np.ones(N), a2=[f"a{i}" for i in range(N)]), index=adata.var_names, ), b=np.zeros((N, 2)), ) adata.obsm = d_obsm for k, v in d_obsm.items(): assert np.all(adata.obsm[k] == v) adata.varm = d_varm for k, v in d_varm.items(): assert np.all(adata.varm[k] == v) def test_setting_ndarray(adata): adata.obsm["a"] = np.ones((M, 10)) adata.varm["a"] = np.ones((N, 10)) assert np.all(adata.obsm["a"] == np.ones((M, 10))) assert np.all(adata.varm["a"] == np.ones((N, 10))) h = joblib.hash(adata) with pytest.raises(ValueError): adata.obsm["b"] = np.ones((int(M / 2), 10)) with pytest.raises(ValueError): adata.obsm["b"] = np.ones((int(M * 2), 10)) with pytest.raises(ValueError): adata.varm["b"] = np.ones((int(N / 2), 10)) with pytest.raises(ValueError): adata.varm["b"] = np.ones((int(N * 2), 10)) assert h == joblib.hash(adata) def test_setting_dataframe(adata): obsm_df = pd.DataFrame(dict(b_1=np.ones(M), b_2=["a"] * M), index=adata.obs_names) varm_df = pd.DataFrame(dict(b_1=np.ones(N), b_2=["a"] * N), index=adata.var_names) adata.obsm["b"] = obsm_df assert np.all(adata.obsm["b"] == obsm_df) adata.varm["b"] = varm_df assert np.all(adata.varm["b"] == varm_df) bad_obsm_df = obsm_df.copy() bad_obsm_df.reset_index(inplace=True) with pytest.raises(ValueError): adata.obsm["c"] = bad_obsm_df bad_varm_df = varm_df.copy() bad_varm_df.reset_index(inplace=True) with pytest.raises(ValueError): adata.varm["c"] = bad_varm_df def test_setting_sparse(adata): obsm_sparse = sparse.random(M, 100) adata.obsm["a"] = obsm_sparse assert not np.any((adata.obsm["a"] != obsm_sparse).data) varm_sparse = sparse.random(N, 100) adata.varm["a"] = varm_sparse assert not np.any((adata.varm["a"] != varm_sparse).data) h = joblib.hash(adata) bad_obsm_sparse = sparse.random(M * 2, M) with pytest.raises(ValueError): adata.obsm["b"] = bad_obsm_sparse bad_varm_sparse = sparse.random(N * 2, N) with pytest.raises(ValueError): adata.varm["b"] = bad_varm_sparse assert h == joblib.hash(adata) anndata-0.7.8/anndata/tests/test_obspvarp.py000066400000000000000000000063741414255741200211700ustar00rootroot00000000000000# TODO: These tests should share code 
with test_layers, and test_obsmvarm import joblib import numpy as np import pandas as pd import pytest from scipy import sparse import anndata from anndata.tests.helpers import gen_typed_df_t2_size from anndata.utils import asarray M, N = (200, 100) @pytest.fixture def adata(): X = np.zeros((M, N)) obs = pd.DataFrame( dict(batch=np.array(["a", "b"])[np.random.randint(0, 2, M)]), index=[f"cell{i:03d}" for i in range(M)], ) var = pd.DataFrame(index=[f"gene{i:03d}" for i in range(N)]) return anndata.AnnData(X, obs=obs, var=var) def test_assigmnent_dict(adata): d_obsp = dict( a=pd.DataFrame(np.ones((M, M)), columns=adata.obs_names, index=adata.obs_names), b=np.zeros((M, M)), c=sparse.random(M, M, format="csr"), ) d_varp = dict( a=pd.DataFrame(np.ones((N, N)), columns=adata.var_names, index=adata.var_names), b=np.zeros((N, N)), c=sparse.random(N, N, format="csr"), ) adata.obsp = d_obsp for k, v in d_obsp.items(): assert np.all(asarray(adata.obsp[k]) == asarray(v)) adata.varp = d_varp for k, v in d_varp.items(): assert np.all(asarray(adata.varp[k]) == asarray(v)) def test_setting_ndarray(adata): adata.obsp["a"] = np.ones((M, M)) adata.varp["a"] = np.ones((N, N)) assert np.all(adata.obsp["a"] == np.ones((M, M))) assert np.all(adata.varp["a"] == np.ones((N, N))) h = joblib.hash(adata) with pytest.raises(ValueError): adata.obsp["b"] = np.ones((int(M / 2), M)) with pytest.raises(ValueError): adata.obsp["b"] = np.ones((M, int(M * 2))) with pytest.raises(ValueError): adata.varp["b"] = np.ones((int(N / 2), 10)) with pytest.raises(ValueError): adata.varp["b"] = np.ones((N, int(N * 2))) assert h == joblib.hash(adata) def test_setting_sparse(adata): obsp_sparse = sparse.random(M, M) adata.obsp["a"] = obsp_sparse assert not np.any((adata.obsp["a"] != obsp_sparse).data) varp_sparse = sparse.random(N, N) adata.varp["a"] = varp_sparse assert not np.any((adata.varp["a"] != varp_sparse).data) h = joblib.hash(adata) bad_obsp_sparse = sparse.random(M * 2, M) with pytest.raises(ValueError): adata.obsp["b"] = bad_obsp_sparse bad_varp_sparse = sparse.random(N * 2, N) with pytest.raises(ValueError): adata.varp["b"] = bad_varp_sparse assert h == joblib.hash(adata) @pytest.mark.parametrize("field,dim", [("obsp", M), ("varp", N)]) @pytest.mark.parametrize( "df,homogenous,dtype", [ (lambda dim: gen_typed_df_t2_size(dim, dim), True, np.object_), (lambda dim: pd.DataFrame(np.random.randn(dim, dim)), False, np.float_), ], ids=["heterogeneous", "homogeneous"], ) def test_setting_dataframe(adata, field, dim, homogenous, df, dtype): if homogenous: with pytest.warns(UserWarning, match=rf"{field.title()} 'df'.*dtype object"): getattr(adata, field)["df"] = df(dim) else: with pytest.warns(None) as warnings: getattr(adata, field)["df"] = df(dim) assert not len(warnings) assert isinstance(getattr(adata, field)["df"], np.ndarray) assert np.issubdtype(getattr(adata, field)["df"].dtype, dtype) anndata-0.7.8/anndata/tests/test_raw.py000066400000000000000000000100731414255741200201140ustar00rootroot00000000000000import numpy as np import pytest import anndata as ad from anndata._core.anndata import ImplicitModificationWarning from anndata.tests.helpers import assert_equal # ------------------------------------------------------------------------------- # Some test data # ------------------------------------------------------------------------------- data = [ [1, 2, 3], [4, 5, 6], [7, 8, 9], ] # data matrix of shape n_obs × n_vars obs_dict = dict( # annotation of observations / rows row_names=["name1", "name2", "name3"], # row annotation 
oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation ) var_dict = dict( # annotation of variables / columns col_names=["var1", "var2", "var3"], vanno1=[3.1, 3.2, 3.3] ) uns_dict = dict( # unstructured annotation oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"] ) @pytest.fixture def adata_raw(): adata = ad.AnnData( np.array(data), obs=obs_dict, var=var_dict, uns=uns_dict, dtype="int32" ) adata.raw = adata # Make them different shapes adata = adata[:, [0, 1]].copy() return adata # ------------------------------------------------------------------------------- # The test functions # ------------------------------------------------------------------------------- def test_raw_init(adata_raw): assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] assert adata_raw.raw[:, 0].X.tolist() == [[1], [4], [7]] def test_raw_del(adata_raw): del adata_raw.raw assert adata_raw.raw is None def test_raw_set_as_none(adata_raw): # Test for theislab/anndata#445 a = adata_raw b = adata_raw.copy() del a.raw b.raw = None assert_equal(a, b) def test_raw_of_view(adata_raw): adata_view = adata_raw[adata_raw.obs["oanno1"] == "cat2"] assert adata_view.raw.X.tolist() == [ [4, 5, 6], [7, 8, 9], ] def test_raw_rw(adata_raw, backing_h5ad): adata_raw.write(backing_h5ad) adata_read = ad.read(backing_h5ad) assert_equal(adata_read, adata_raw, exact=True) assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] assert adata_raw.raw[:, 0].X.tolist() == [[1], [4], [7]] def test_raw_view_rw(adata_raw, backing_h5ad): # Make sure it still writes correctly if the object is a view adata_raw_view = adata_raw[:, adata_raw.var_names] assert_equal(adata_raw_view, adata_raw) with pytest.warns(ImplicitModificationWarning, match="Initializing view as actual"): adata_raw_view.write(backing_h5ad) adata_read = ad.read(backing_h5ad) assert_equal(adata_read, adata_raw_view, exact=True) assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] assert adata_raw.raw[:, 0].X.tolist() == [[1], [4], [7]] def test_raw_backed(adata_raw, backing_h5ad): adata_raw.filename = backing_h5ad assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] if adata_raw.raw[:, 0].X.shape[1] != 1: pytest.xfail("Raw is broken for backed slices") assert adata_raw.raw[:, 0].X[:].tolist() == [[1], [4], [7]] def test_raw_view_backed(adata_raw, backing_h5ad): adata_raw.filename = backing_h5ad assert adata_raw.var_names.tolist() == ["var1", "var2"] assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"] if adata_raw.raw[:, 0].X.shape[1] != 1: pytest.xfail("Raw is broken for backed slices") assert adata_raw.raw[:, 0].X[:].tolist() == [[1], [4], [7]] def test_raw_as_parent_view(): # https://github.com/theislab/anndata/issues/288 a = ad.AnnData(np.ones((4, 3))) a.varm["PCs"] = np.ones((3, 3)) a.raw = a # create a Raw containing views. This used to trigger #288. 
b = a.raw[:, "0"] # actualize b.varm["PCs"] = np.array([[1, 2, 3]]) anndata-0.7.8/anndata/tests/test_readwrite.py000066400000000000000000000577131414255741200213250ustar00rootroot00000000000000from importlib.util import find_spec from os import PathLike from pathlib import Path from string import ascii_letters import tempfile import h5py import numpy as np import pandas as pd from pandas.api.types import is_categorical_dtype import pytest from scipy.sparse import csr_matrix, csc_matrix import zarr import anndata as ad from anndata.utils import asarray from anndata.tests.helpers import gen_adata, assert_equal HERE = Path(__file__).parent # ------------------------------------------------------------------------------ # Some test data # ------------------------------------------------------------------------------ X_sp = csr_matrix([[1, 0, 0], [3, 0, 0], [5, 6, 0], [0, 0, 0], [0, 0, 0]]) X_list = [[1, 0], [3, 0], [5, 6]] # data matrix of shape n_obs x n_vars obs_dict = dict( # annotation of observations / rows row_names=["name1", "name2", "name3"], # row annotation oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno1b=["cat1", "cat1", "cat1"], # categorical annotation with one category oanno1c=["cat1", "cat1", np.nan], # categorical annotation with a missing value oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation oanno4=[3.3, 1.1, 2.2], # float annotation ) var_dict = dict( # annotation of variables / columns vanno1=[3.1, 3.2], vanno2=["cat1", "cat1"], # categorical annotation vanno3=[2.1, 2.2], # float annotation vanno4=[3.3, 1.1], # float annotation ) uns_dict = dict( # unstructured annotation oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"], uns3="another annotation", uns4=dict( a=1, b=[2, 3], c="4", d=["some", "strings"], e=np.ones(5), f=np.int32(7), g=[1, np.float32(2.5)], ), ) @pytest.fixture(params=[{}, dict(compression="gzip")]) def dataset_kwargs(request): return request.param @pytest.fixture(params=["h5ad", "zarr"]) def diskfmt(request): return request.param @pytest.fixture def rw(backing_h5ad): M, N = 100, 101 orig = gen_adata((M, N)) orig.write(backing_h5ad) curr = ad.read(backing_h5ad) return curr, orig diskfmt2 = diskfmt # ------------------------------------------------------------------------------ # The test functions # ------------------------------------------------------------------------------ @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_readwrite_roundtrip(typ, tmp_path, diskfmt, diskfmt2): tmpdir = Path(tmp_path) pth1 = tmpdir / f"first.{diskfmt}" write1 = lambda x: getattr(x, f"write_{diskfmt}")(pth1) read1 = lambda: getattr(ad, f"read_{diskfmt}")(pth1) pth2 = tmpdir / f"second.{diskfmt2}" write2 = lambda x: getattr(x, f"write_{diskfmt2}")(pth2) read2 = lambda: getattr(ad, f"read_{diskfmt2}")(pth2) adata1 = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict) write1(adata1) adata2 = read1() write2(adata2) adata3 = read2() assert_equal(adata2, adata1) assert_equal(adata3, adata1) assert_equal(adata2, adata1) @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_readwrite_h5ad(typ, dataset_kwargs, backing_h5ad): tmpdir = tempfile.TemporaryDirectory() tmpdirpth = Path(tmpdir.name) mid_pth = tmpdirpth / "mid.h5ad" X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) assert not is_categorical_dtype(adata_src.obs["oanno1"]) adata_src.raw = adata_src adata_src.write(backing_h5ad, **dataset_kwargs) adata_mid = ad.read(backing_h5ad) 
adata_mid.write(mid_pth, **dataset_kwargs) adata = ad.read_h5ad(mid_pth) assert is_categorical_dtype(adata.obs["oanno1"]) assert not is_categorical_dtype(adata.obs["oanno2"]) assert adata.obs.index.tolist() == ["name1", "name2", "name3"] assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"] assert adata.obs["oanno1c"].cat.categories.tolist() == ["cat1"] assert is_categorical_dtype(adata.raw.var["vanno2"]) pd.testing.assert_frame_equal(adata.obs, adata_src.obs) pd.testing.assert_frame_equal(adata.var, adata_src.var) assert np.all(adata.var.index == adata_src.var.index) assert adata.var.index.dtype == adata_src.var.index.dtype assert type(adata.raw.X) is type(adata_src.raw.X) assert type(adata.raw.varm) is type(adata_src.raw.varm) assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X)) pd.testing.assert_frame_equal(adata.raw.var, adata_src.raw.var) assert isinstance(adata.uns["uns4"]["a"], (int, np.integer)) assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer)) assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"]) assert_equal(adata, adata_src) @pytest.mark.skipif(not find_spec("zarr"), reason="Zarr is not installed") @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_readwrite_zarr(typ, tmp_path): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_src.raw = adata_src assert not is_categorical_dtype(adata_src.obs["oanno1"]) adata_src.write_zarr(tmp_path / "test_zarr_dir", chunks=True) adata = ad.read_zarr(tmp_path / "test_zarr_dir") assert is_categorical_dtype(adata.obs["oanno1"]) assert not is_categorical_dtype(adata.obs["oanno2"]) assert adata.obs.index.tolist() == ["name1", "name2", "name3"] assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"] assert adata.obs["oanno1c"].cat.categories.tolist() == ["cat1"] assert is_categorical_dtype(adata.raw.var["vanno2"]) pd.testing.assert_frame_equal(adata.obs, adata_src.obs) pd.testing.assert_frame_equal(adata.var, adata_src.var) assert np.all(adata.var.index == adata_src.var.index) assert adata.var.index.dtype == adata_src.var.index.dtype assert type(adata.raw.X) is type(adata_src.raw.X) assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X)) assert np.all(adata.raw.var == adata_src.raw.var) assert isinstance(adata.uns["uns4"]["a"], (int, np.integer)) assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer)) assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"]) assert_equal(adata, adata_src) @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_readwrite_maintain_X_dtype(typ, backing_h5ad): X = typ(X_list) adata_src = ad.AnnData(X, dtype="int8") adata_src.write(backing_h5ad) adata = ad.read(backing_h5ad) assert adata.X.dtype == adata_src.X.dtype def test_read_write_maintain_obsmvarm_dtypes(rw): curr, orig = rw assert type(orig.obsm["array"]) is type(curr.obsm["array"]) assert np.all(orig.obsm["array"] == curr.obsm["array"]) assert np.all(orig.varm["array"] == curr.varm["array"]) assert type(orig.obsm["sparse"]) is type(curr.obsm["sparse"]) assert not np.any((orig.obsm["sparse"] != curr.obsm["sparse"]).toarray()) assert not np.any((orig.varm["sparse"] != curr.varm["sparse"]).toarray()) assert type(orig.obsm["df"]) is type(curr.obsm["df"]) assert np.all(orig.obsm["df"] == curr.obsm["df"]) assert np.all(orig.varm["df"] == curr.varm["df"]) def test_maintain_layers(rw): curr, orig = rw assert type(orig.layers["array"]) is type(curr.layers["array"]) assert np.all(orig.layers["array"] 
== curr.layers["array"]) assert type(orig.layers["sparse"]) is type(curr.layers["sparse"]) assert not np.any((orig.layers["sparse"] != curr.layers["sparse"]).toarray()) @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_readwrite_h5ad_one_dimension(typ, backing_h5ad): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_one = adata_src[:, 0].copy() adata_one.write(backing_h5ad) adata = ad.read(backing_h5ad) assert adata.shape == (3, 1) assert_equal(adata, adata_one) @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_readwrite_backed(typ, backing_h5ad): X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_src.filename = backing_h5ad # change to backed mode adata_src.write() adata = ad.read(backing_h5ad) assert is_categorical_dtype(adata.obs["oanno1"]) assert not is_categorical_dtype(adata.obs["oanno2"]) assert adata.obs.index.tolist() == ["name1", "name2", "name3"] assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"] assert_equal(adata, adata_src) @pytest.mark.parametrize("typ", [np.array, csr_matrix, csc_matrix]) def test_readwrite_equivalent_h5ad_zarr(typ): tmpdir = tempfile.TemporaryDirectory() tmpdirpth = Path(tmpdir.name) h5ad_pth = tmpdirpth / "adata.h5ad" zarr_pth = tmpdirpth / "adata.zarr" M, N = 100, 101 adata = gen_adata((M, N), X_type=typ) adata.raw = adata adata.write_h5ad(h5ad_pth) adata.write_zarr(zarr_pth) from_h5ad = ad.read_h5ad(h5ad_pth) from_zarr = ad.read_zarr(zarr_pth) assert_equal(from_h5ad, from_zarr, exact=True) @pytest.mark.parametrize( "compression,compression_opts", [ (None, None), ("lzf", None), ("gzip", None), ("gzip", 8), ], ) def test_hdf5_compression_opts(tmp_path, compression, compression_opts): # https://github.com/theislab/anndata/issues/497 pth = Path(tmp_path) / "adata.h5ad" adata = gen_adata((10, 8)) kwargs = {} if compression is not None: kwargs["compression"] = compression if compression_opts is not None: kwargs["compression_opts"] = compression_opts not_compressed = [] adata.write_h5ad(pth, **kwargs) def check_compressed(key, value): if isinstance(value, h5py.Dataset) and value.shape != (): if compression is not None and value.compression != compression: not_compressed.append(key) elif ( compression_opts is not None and value.compression_opts != compression_opts ): not_compressed.append(key) with h5py.File(pth) as f: f.visititems(check_compressed) if not_compressed: msg = "\n\t".join(not_compressed) raise AssertionError(f"These elements were not compressed correctly:\n\t{msg}") assert_equal(adata, ad.read_h5ad(pth)) def test_zarr_compression(tmp_path): from numcodecs import Blosc pth = str(Path(tmp_path) / "adata.zarr") adata = gen_adata((10, 8)) compressor = Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE) not_compressed = [] ad._io.write_zarr(pth, adata, compressor=compressor) def check_compressed(key, value): if isinstance(value, zarr.Array) and value.shape != (): if value.compressor != compressor: not_compressed.append(key) with zarr.open(str(pth), "r") as f: f.visititems(check_compressed) if not_compressed: msg = "\n\t".join(not_compressed) raise AssertionError(f"These elements were not compressed correctly:\n\t{msg}") assert_equal(adata, ad.read_zarr(pth)) def test_changed_obs_var_names(tmp_path, diskfmt): filepth = tmp_path / f"test.{diskfmt}" orig = gen_adata((10, 10)) orig.obs_names.name = "obs" orig.var_names.name = "var" modified = orig.copy() modified.obs_names.name = "cells" modified.var_names.name = "genes" 
getattr(orig, f"write_{diskfmt}")(filepth) read = getattr(ad, f"read_{diskfmt}")(filepth) assert_equal(orig, read, exact=True) assert orig.var.index.name == "var" assert read.obs.index.name == "obs" with pytest.raises(AssertionError): assert_equal(orig, modified, exact=True) with pytest.raises(AssertionError): assert_equal(read, modified, exact=True) @pytest.mark.skipif(not find_spec("loompy"), reason="Loompy is not installed") @pytest.mark.parametrize("typ", [np.array, csr_matrix]) @pytest.mark.parametrize("obsm_mapping", [{}, dict(X_composed=["oanno3", "oanno4"])]) @pytest.mark.parametrize("varm_mapping", [{}, dict(X_composed2=["vanno3", "vanno4"])]) def test_readwrite_loom(typ, obsm_mapping, varm_mapping, tmp_path): X = typ(X_list) obs_dim = "meaningful_obs_dim_name" var_dim = "meaningful_var_dim_name" adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_src.obs_names.name = obs_dim adata_src.var_names.name = var_dim adata_src.obsm["X_a"] = np.zeros((adata_src.n_obs, 2)) adata_src.varm["X_b"] = np.zeros((adata_src.n_vars, 3)) adata_src.write_loom(tmp_path / "test.loom", write_obsm_varm=True) adata = ad.read_loom( tmp_path / "test.loom", sparse=typ is csr_matrix, obsm_mapping=obsm_mapping, obs_names=obs_dim, varm_mapping=varm_mapping, var_names=var_dim, cleanup=True, ) if isinstance(X, np.ndarray): assert np.allclose(adata.X, X) else: # TODO: this should not be necessary assert np.allclose(adata.X.toarray(), X.toarray()) assert "X_a" in adata.obsm_keys() and adata.obsm["X_a"].shape[1] == 2 assert "X_b" in adata.varm_keys() and adata.varm["X_b"].shape[1] == 3 # as we called with `cleanup=True` assert "oanno1b" in adata.uns["loom-obs"] assert "vanno2" in adata.uns["loom-var"] for k, v in obsm_mapping.items(): assert k in adata.obsm_keys() and adata.obsm[k].shape[1] == len(v) for k, v in varm_mapping.items(): assert k in adata.varm_keys() and adata.varm[k].shape[1] == len(v) assert adata.obs_names.name == obs_dim assert adata.var_names.name == var_dim @pytest.mark.skipif(not find_spec("loompy"), reason="Loompy is not installed") def test_readloom_deprecations(tmp_path): loom_pth = tmp_path / "test.loom" adata_src = gen_adata((5, 10), obsm_types=[np.ndarray], varm_types=[np.ndarray]) adata_src.write_loom(loom_pth, write_obsm_varm=True) # obsm_names -> obsm_mapping obsm_mapping = {"df": adata_src.obs.columns} with pytest.warns(FutureWarning): depr_result = ad.read_loom(loom_pth, obsm_names=obsm_mapping) actual_result = ad.read_loom(loom_pth, obsm_mapping=obsm_mapping) assert_equal(actual_result, depr_result) with pytest.raises(ValueError, match="ambiguous"): ad.read_loom(loom_pth, obsm_mapping=obsm_mapping, obsm_names=obsm_mapping) # varm_names -> varm_mapping varm_mapping = {"df": adata_src.var.columns} with pytest.warns(FutureWarning): depr_result = ad.read_loom(loom_pth, varm_names=varm_mapping) actual_result = ad.read_loom(loom_pth, varm_mapping=varm_mapping) assert_equal(actual_result, depr_result) with pytest.raises(ValueError, match="ambiguous"): ad.read_loom(loom_pth, varm_mapping=varm_mapping, varm_names=varm_mapping) # positional -> keyword with pytest.warns(FutureWarning, match="sparse"): depr_result = ad.read_loom(loom_pth, True) actual_result = ad.read_loom(loom_pth, sparse=True) assert type(depr_result.X) == type(actual_result.X) def test_read_csv(): adata = ad.read_csv(HERE / "adata.csv") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list def test_read_tsv_strpath(): adata 
= ad.read_text(str(HERE / "adata-comments.tsv"), "\t") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list def test_read_tsv_iter(): with (HERE / "adata-comments.tsv").open() as f: adata = ad.read_text(f, "\t") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_write_csv(typ, tmp_path): X = typ(X_list) adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata.write_csvs(tmp_path / "test_csv_dir", skip_data=False) @pytest.mark.parametrize("typ", [np.array, csr_matrix]) def test_write_csv_view(typ, tmp_path): # https://github.com/theislab/anndata/issues/401 import hashlib def md5_path(pth: PathLike) -> bytes: checksum = hashlib.md5() with open(pth, "rb") as f: while True: buf = f.read(checksum.block_size * 100) if not buf: break checksum.update(buf) return checksum.digest() def hash_dir_contents(dir: Path) -> "dict[str, bytes]": root_pth = str(dir) return { str(k)[len(root_pth) :]: md5_path(k) for k in dir.rglob("*") if k.is_file() } adata = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict) # Test writing a view view_pth = tmp_path / "test_view_csv_dir" copy_pth = tmp_path / "test_copy_csv_dir" adata[::2].write_csvs(view_pth, skip_data=False) adata[::2].copy().write_csvs(copy_pth, skip_data=False) assert hash_dir_contents(view_pth) == hash_dir_contents(copy_pth) @pytest.mark.parametrize( ["read", "write", "name"], [ pytest.param(ad.read_h5ad, ad._io.write._write_h5ad, "test_empty.h5ad"), pytest.param( ad.read_loom, ad._io.write_loom, "test_empty.loom", marks=pytest.mark.xfail(reason="Loom can’t handle 0×0 matrices"), ), pytest.param(ad.read_zarr, ad._io.write_zarr, "test_empty.zarr"), pytest.param( ad.read_zarr, ad._io.write_zarr, "test_empty.zip", marks=pytest.mark.xfail(reason="Zarr zip storage doesn’t seem to work…"), ), ], ) def test_readwrite_hdf5_empty(read, write, name, tmp_path): if read is ad.read_zarr: pytest.importorskip("zarr") adata = ad.AnnData(uns=dict(empty=np.array([], dtype=float))) write(tmp_path / name, adata) ad_read = read(tmp_path / name) assert ad_read.uns["empty"].shape == (0,) def test_read_excel(): adata = ad.read_excel(HERE / "data/excel.xlsx", "Sheet1", dtype=int) assert adata.X.tolist() == X_list def test_write_categorical(tmp_path, diskfmt): adata_pth = tmp_path / f"adata.{diskfmt}" orig = ad.AnnData( X=np.ones((5, 5)), obs=pd.DataFrame( dict( cat1=["a", "a", "b", np.nan, np.nan], cat2=pd.Categorical(["a", "a", "b", np.nan, np.nan]), ) ), ) getattr(orig, f"write_{diskfmt}")(adata_pth) curr = getattr(ad, f"read_{diskfmt}")(adata_pth) assert np.all(orig.obs.notna() == curr.obs.notna()) assert np.all(orig.obs.stack().dropna() == curr.obs.stack().dropna()) def test_write_categorical_index(tmp_path, diskfmt): adata_pth = tmp_path / f"adata.{diskfmt}" orig = ad.AnnData( X=np.ones((5, 5)), uns={"df": pd.DataFrame(index=pd.Categorical(list("aabcd")))}, ) getattr(orig, f"write_{diskfmt}")(adata_pth) curr = getattr(ad, f"read_{diskfmt}")(adata_pth) # Also covered by next assertion, but checking this value specifically pd.testing.assert_index_equal( orig.uns["df"].index, curr.uns["df"].index, exact=True ) assert_equal(orig, curr, exact=True) def test_dataframe_reserved_columns(tmp_path, diskfmt): reserved = ("_index", "__categories") adata_pth = tmp_path / f"adata.{diskfmt}" orig = ad.AnnData(X=np.ones((5, 5))) for 
colname in reserved: to_write = orig.copy() to_write.obs[colname] = np.ones(5) with pytest.raises(ValueError) as e: getattr(to_write, f"write_{diskfmt}")(adata_pth) assert colname in str(e.value) for colname in reserved: to_write = orig.copy() to_write.varm["df"] = pd.DataFrame( {colname: list("aabcd")}, index=to_write.var_names ) with pytest.raises(ValueError) as e: getattr(to_write, f"write_{diskfmt}")(adata_pth) assert colname in str(e.value) def test_write_large_categorical(tmp_path, diskfmt): M = 30_000 N = 1000 ls = np.array(list(ascii_letters)) def random_cats(n): cats = { "".join(np.random.choice(ls, np.random.choice(range(5, 30)))) for _ in range(n) } while len(cats) < n: # For the rare case that there’s duplicates cats |= random_cats(n - len(cats)) return cats cats = np.array(sorted(random_cats(10_000))) adata_pth = tmp_path / f"adata.{diskfmt}" n_cats = len(np.unique(cats)) orig = ad.AnnData( csr_matrix(([1], ([0], [0])), shape=(M, N)), obs=dict( cat1=cats[np.random.choice(n_cats, M)], cat2=pd.Categorical.from_codes(np.random.choice(n_cats, M), cats), ), ) getattr(orig, f"write_{diskfmt}")(adata_pth) curr = getattr(ad, f"read_{diskfmt}")(adata_pth) assert_equal(orig, curr) def test_write_string_types(tmp_path, diskfmt): # https://github.com/theislab/anndata/issues/456 adata_pth = tmp_path / f"adata.{diskfmt}" adata = ad.AnnData( np.ones((3, 3)), obs=pd.DataFrame( np.ones((3, 2)), columns=["a", np.str_("b")], index=["a", "b", "c"], ), ) write = getattr(adata, f"write_{diskfmt}") read = getattr(ad, f"read_{diskfmt}") write(adata_pth) from_disk = read(adata_pth) assert_equal(adata, from_disk) adata.obs[b"c"] = np.zeros(3) # This should error, and tell you which key is at fault with pytest.raises(TypeError, match=str(b"c")): write(adata_pth) def test_zarr_chunk_X(tmp_path): import zarr zarr_pth = Path(tmp_path) / "test.zarr" adata = gen_adata((100, 100), X_type=np.array) adata.write_zarr(zarr_pth, chunks=(10, 10)) z = zarr.open(str(zarr_pth)) # As of v2.3.2 zarr won’t take a Path assert z["X"].chunks == (10, 10) from_zarr = ad.read_zarr(zarr_pth) assert_equal(from_zarr, adata) ################################ # Round-tripping scanpy datasets ################################ diskfmt2 = diskfmt @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") def test_scanpy_pbmc68k(tmp_path, diskfmt, diskfmt2): read1 = lambda pth: getattr(ad, f"read_{diskfmt}")(pth) write1 = lambda adata, pth: getattr(adata, f"write_{diskfmt}")(pth) read2 = lambda pth: getattr(ad, f"read_{diskfmt2}")(pth) write2 = lambda adata, pth: getattr(adata, f"write_{diskfmt2}")(pth) filepth1 = tmp_path / f"test1.{diskfmt}" filepth2 = tmp_path / f"test2.{diskfmt2}" import scanpy as sc pbmc = sc.datasets.pbmc68k_reduced() write1(pbmc, filepth1) from_disk1 = read1(filepth1) # Do we read okay write2(from_disk1, filepth2) # Can we round trip from_disk2 = read2(filepth2) assert_equal(pbmc, from_disk1) # Not expected to be exact due to `nan`s assert_equal(pbmc, from_disk2) @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") def test_scanpy_krumsiek11(tmp_path, diskfmt): filepth = tmp_path / f"test.{diskfmt}" import scanpy as sc orig = sc.datasets.krumsiek11() del orig.uns["highlights"] # Can’t write int keys getattr(orig, f"write_{diskfmt}")(filepth) read = getattr(ad, f"read_{diskfmt}")(filepth) assert_equal(orig, read, exact=True) # Checking if we can read legacy zarr files # TODO: Check how I should add this file to the repo @pytest.mark.skipif(not find_spec("scanpy"), 
reason="Scanpy is not installed") @pytest.mark.skipif( not Path(HERE / "data/pbmc68k_reduced_legacy.zarr.zip").is_file(), reason="File not present.", ) def test_backwards_compat_zarr(): import scanpy as sc import zarr pbmc_orig = sc.datasets.pbmc68k_reduced() # Old zarr writer couldn’t do sparse arrays pbmc_orig.raw._X = pbmc_orig.raw.X.toarray() del pbmc_orig.uns["neighbors"] # Since these have moved, see PR #337 del pbmc_orig.obsp["distances"] del pbmc_orig.obsp["connectivities"] # This was written out with anndata=0.6.22.post1 zarrpth = HERE / "data/pbmc68k_reduced_legacy.zarr.zip" with zarr.ZipStore(zarrpth, mode="r") as z: pbmc_zarr = ad.read_zarr(z) assert_equal(pbmc_zarr, pbmc_orig) anndata-0.7.8/anndata/tests/test_repr.py000066400000000000000000000032061414255741200202730ustar00rootroot00000000000000import re from string import ascii_letters import numpy as np import pandas as pd import pytest import anndata as ad ADATA_ATTRS = ("obs", "var", "varm", "obsm", "layers", "obsp", "varp", "uns") @pytest.fixture def adata(): return ad.AnnData( np.zeros((20, 10)), obs=pd.DataFrame( dict(obs_key=list(ascii_letters[:20])), index=[f"cell{i}" for i in range(20)], ), var=pd.DataFrame( dict(var_key=np.arange(10)), index=[f"gene{i}" for i in range(10)] ), varm=dict(varm_key=np.zeros((10, 20))), obsm=dict(obsm_key=np.zeros((20, 20))), layers=dict(layers_key=np.zeros((20, 10))), obsp=dict(obsp_key=np.zeros((20, 20))), varp=dict(varp_key=np.zeros((10, 10))), uns=dict(uns_key=dict(zip("abc", range(3)))), ) @pytest.fixture(params=ADATA_ATTRS) def adata_attr(request): return request.param def test_anndata_repr(adata): assert f"{adata.n_obs} × {adata.n_vars}" in repr(adata) for idxr in [ (slice(10, 20), 9), (12, 9), (["cell1", "cell2"], slice(10, 15)), ]: v = adata[idxr] v_repr = repr(v) assert f"{v.n_obs} × {v.n_vars}" in v_repr assert "View of" in v_repr for attr in ADATA_ATTRS: assert re.search( rf"^\s+{attr}:[^$]*{attr}_key.*$", v_repr, flags=re.MULTILINE ) def test_removal(adata, adata_attr): attr = adata_attr assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) delattr(adata, attr) assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) is None anndata-0.7.8/anndata/tests/test_transpose.py000066400000000000000000000040021414255741200213340ustar00rootroot00000000000000from scipy import sparse import pytest from anndata.tests.helpers import gen_adata, assert_equal def test_transpose_orig(): """ Original test for transpose, should be covered by more thorough tests below, but keeping around just in case. """ adata = gen_adata((5, 3)) adata.varp = {f"varp_{k}": v for k, v in adata.varp.items()} adata1 = adata.T adata1.uns["test123"] = 1 assert "test123" in adata.uns assert_equal(adata1.X.shape, (3, 5)) assert_equal(adata1.obsp.keys(), adata.varp.keys()) def _add_raw(adata, *, var_subset=slice(None)): new = adata[:, var_subset].copy() new.raw = adata return new # TODO: Cases to add: # * Views # * X is None should have the xfail marker removed # * Backed @pytest.fixture( params=[ pytest.param(gen_adata((50, 20)), id="csr_X"), pytest.param(gen_adata((50, 20), sparse.csc_matrix), id="csc_X"), pytest.param(_add_raw(gen_adata((50, 20))), id="with_raw"), pytest.param(gen_adata((20, 10), X_type=None), id="None_X"), ] ) def adata(request): return request.param def test_transpose_removes_raw(adata): """ Since Raw must have the same `obs_names` as AnnData, but does not have the same `var_names`, transpose doesn't really make sense for Raw. So it should just get deleted. 
""" assert adata.T.raw is None def test_transposed_contents(adata): t = adata.T if adata.X is not None: assert_equal(adata.X.T, t.X) else: assert adata.X is t.X is None assert_equal( {k: v.T for k, v in adata.layers.items()}, {k: v for k, v in t.layers.items()} ) assert_equal(adata.obs, t.var) assert_equal(adata.var, t.obs) assert_equal(dict(adata.obsm), dict(t.varm)) assert_equal(dict(adata.varm), dict(t.obsm)) assert_equal(dict(adata.obsp), dict(t.varp)) assert_equal(dict(adata.varp), dict(t.obsp)) assert_equal(adata.uns, t.uns) def test_transpose_roundtrip(adata): del adata.raw assert_equal(adata, adata.T.T) anndata-0.7.8/anndata/tests/test_uns.py000066400000000000000000000030421414255741200201260ustar00rootroot00000000000000import numpy as np import pandas as pd import pytest from anndata import AnnData from anndata.tests.helpers import assert_equal def test_uns_color_subset(): # Tests for https://github.com/theislab/anndata/issues/257 obs = pd.DataFrame( { "cat1": pd.Categorical(list("aabcd")), "cat2": pd.Categorical(list("aabbb")), }, index=[f"cell{i}" for i in range(5)], ) # If number of categories does not match number of colors, they should be reset wrong_color_length_adata = AnnData( np.ones((5, 5)), obs=obs, uns={ "cat1_colors": ["red", "green", "blue"], "cat2_colors": ["red", "green", "blue"], }, ) v = wrong_color_length_adata[:, [0, 1]] assert "cat1_colors" not in v.uns assert "cat2_colors" not in v.uns # Otherwise the colors should still match after reseting cat1_colors = np.array(["red", "green", "blue", "yellow"], dtype=object) adata = AnnData(np.ones((5, 5)), obs=obs, uns={"cat1_colors": cat1_colors.copy()}) for color, idx in [("red", [0, 1]), ("green", [2]), ("blue", [3]), ("yellow", [4])]: v = adata[idx, :] assert len(v.uns["cat1_colors"]) == 1 assert v.uns["cat1_colors"][0] == color c = v.copy() assert_equal(v.uns, c.uns, elem_name="uns") with pytest.raises(AssertionError): assert_equal(adata.uns, c.uns, elem_name="uns") # But original object should not change assert list(adata.uns["cat1_colors"]) == list(cat1_colors) anndata-0.7.8/anndata/tests/test_utils.py000066400000000000000000000052001414255741200204570ustar00rootroot00000000000000import pandas as pd from scipy import sparse from itertools import repeat import pytest import anndata as ad from anndata.utils import import_function, make_index_unique from anndata.tests.helpers import gen_typed_df def test_make_index_unique(): index = pd.Index(["val", "val", "val-1", "val-1"]) with pytest.warns(UserWarning): result = make_index_unique(index) expected = pd.Index(["val", "val-2", "val-1", "val-1-1"]) assert list(expected) == list(result) assert result.is_unique def test_adata_unique_indices(): m, n = (10, 20) obs_index = pd.Index(repeat("a", m), name="obs") var_index = pd.Index(repeat("b", n), name="var") adata = ad.AnnData( X=sparse.random(m, n, format="csr"), obs=gen_typed_df(m, index=obs_index), var=gen_typed_df(n, index=var_index), obsm={"df": gen_typed_df(m, index=obs_index)}, varm={"df": gen_typed_df(n, index=var_index)}, ) pd.testing.assert_index_equal(adata.obsm["df"].index, adata.obs_names) pd.testing.assert_index_equal(adata.varm["df"].index, adata.var_names) adata.var_names_make_unique() adata.obs_names_make_unique() assert adata.obs_names.name == "obs" assert adata.var_names.name == "var" assert len(pd.unique(adata.obs_names)) == m assert len(pd.unique(adata.var_names)) == n pd.testing.assert_index_equal(adata.obsm["df"].index, adata.obs_names) pd.testing.assert_index_equal(adata.varm["df"].index, 
adata.var_names) v = adata[:5, :5] assert v.obs_names.name == "obs" assert v.var_names.name == "var" pd.testing.assert_index_equal(v.obsm["df"].index, v.obs_names) pd.testing.assert_index_equal(v.varm["df"].index, v.var_names) def test_import_function_no_import_error(): """/ A TypeError is expected if the `write_zarr` function is imported correctly because `write_zarr` requires two arguments. """ with pytest.raises(TypeError): write_zarr = import_function("anndata._io.zarr", "write_zarr") write_zarr() def test_import_function_missing_module(): """/ A ModuleNotFoundError is expected because there is no module called `should_not_exist`. """ with pytest.raises(ModuleNotFoundError): some_function = import_function("should_not_exist", "some_function") some_function() def test_import_function_missing_function(): """/ An AttributeError is expected because the `anndata` module exists but it does not export a function called `some_function`. """ with pytest.raises(AttributeError): some_function = import_function("anndata", "some_function") some_function() anndata-0.7.8/anndata/tests/test_views.py000066400000000000000000000374601414255741200204710ustar00rootroot00000000000000from operator import mul import joblib import numpy as np from scipy import sparse import pandas as pd import pytest import anndata as ad from anndata._core.index import _normalize_index from anndata.utils import asarray from anndata.tests.helpers import ( gen_adata, subset_func, slice_subset, single_subset, assert_equal, ) # ------------------------------------------------------------------------------ # Some test data # ------------------------------------------------------------------------------ # data matrix of shape n_obs x n_vars X_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] # annotation of observations / rows obs_dict = dict( row_names=["name1", "name2", "name3"], # row annotation oanno1=["cat1", "cat2", "cat2"], # categorical annotation oanno2=["o1", "o2", "o3"], # string annotation oanno3=[2.1, 2.2, 2.3], # float annotation ) # annotation of variables / columns var_dict = dict(vanno1=[3.1, 3.2, 3.3]) # unstructured annotation uns_dict = dict(oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"]) subset_func2 = subset_func class NDArraySubclass(np.ndarray): def view(self, dtype=None, typ=None): return self @pytest.fixture def adata(): adata = ad.AnnData(np.zeros((100, 100))) adata.obsm["o"] = np.zeros((100, 50)) adata.varm["o"] = np.zeros((100, 50)) return adata @pytest.fixture(params=[asarray, sparse.csr_matrix, sparse.csc_matrix]) def adata_parameterized(request): return gen_adata(shape=(200, 300), X_type=request.param) @pytest.fixture( params=[np.array, sparse.csr_matrix, sparse.csc_matrix], ids=["np_array", "scipy_csr", "scipy_csc"], ) def matrix_type(request): return request.param @pytest.fixture(params=["layers", "obsm", "varm"]) def mapping_name(request): return request.param # ------------------------------------------------------------------------------ # The test functions # ------------------------------------------------------------------------------ def test_views(): X = np.array(X_list) adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict, dtype="int32") assert adata[:, 0].is_view assert adata[:, 0].X.tolist() == np.reshape([1, 4, 7], (3, 1)).tolist() adata[:2, 0].X = [0, 0] assert adata[:, 0].X.tolist() == np.reshape([0, 0, 7], (3, 1)).tolist() adata_subset = adata[:2, [0, 1]] assert adata_subset.is_view # now transition to actual object adata_subset.obs["foo"] = range(2) assert not 
adata_subset.is_view assert adata_subset.obs["foo"].tolist() == list(range(2)) def test_modify_view_component(matrix_type, mapping_name): adata = ad.AnnData( np.zeros((10, 10)), **{mapping_name: dict(m=matrix_type(asarray(sparse.random(10, 10))))}, ) init_hash = joblib.hash(adata) subset = adata[:5, :][:, :5] assert subset.is_view m = getattr(subset, mapping_name)["m"] m[0, 0] = 100 assert not subset.is_view assert getattr(subset, mapping_name)["m"][0, 0] == 100 assert init_hash == joblib.hash(adata) # TODO: These tests could probably be condensed into a fixture # based test for obsm and varm def test_set_obsm_key(adata): init_hash = joblib.hash(adata) orig_obsm_val = adata.obsm["o"].copy() subset_obsm = adata[:50] assert subset_obsm.is_view subset_obsm.obsm["o"] = np.ones((50, 20)) assert not subset_obsm.is_view assert np.all(adata.obsm["o"] == orig_obsm_val) assert init_hash == joblib.hash(adata) def test_set_varm_key(adata): init_hash = joblib.hash(adata) orig_varm_val = adata.varm["o"].copy() subset_varm = adata[:, :50] assert subset_varm.is_view subset_varm.varm["o"] = np.ones((50, 20)) assert not subset_varm.is_view assert np.all(adata.varm["o"] == orig_varm_val) assert init_hash == joblib.hash(adata) def test_set_obs(adata, subset_func): init_hash = joblib.hash(adata) subset = adata[subset_func(adata.obs_names), :] new_obs = pd.DataFrame( dict(a=np.ones(subset.n_obs), b=np.ones(subset.n_obs)), index=subset.obs_names, ) assert subset.is_view subset.obs = new_obs assert not subset.is_view assert np.all(subset.obs == new_obs) assert joblib.hash(adata) == init_hash def test_set_var(adata, subset_func): init_hash = joblib.hash(adata) subset = adata[:, subset_func(adata.var_names)] new_var = pd.DataFrame( dict(a=np.ones(subset.n_vars), b=np.ones(subset.n_vars)), index=subset.var_names, ) assert subset.is_view subset.var = new_var assert not subset.is_view assert np.all(subset.var == new_var) assert joblib.hash(adata) == init_hash def test_drop_obs_column(): adata = ad.AnnData(np.array(X_list), obs=obs_dict, dtype="int32") subset = adata[:2] assert subset.is_view # returns a copy of obs assert subset.obs.drop(columns=["oanno1"]).columns.tolist() == ["oanno2", "oanno3"] assert subset.is_view # would modify obs, so it should actualize subset and not modify adata subset.obs.drop(columns=["oanno1"], inplace=True) assert not subset.is_view assert subset.obs.columns.tolist() == ["oanno2", "oanno3"] assert adata.obs.columns.tolist() == ["oanno1", "oanno2", "oanno3"] def test_set_obsm(adata): init_hash = joblib.hash(adata) dim0_size = np.random.randint(2, adata.shape[0] - 1) dim1_size = np.random.randint(1, 99) orig_obsm_val = adata.obsm["o"].copy() subset_idx = np.random.choice(adata.obs_names, dim0_size, replace=False) subset = adata[subset_idx, :] assert subset.is_view subset.obsm = dict(o=np.ones((dim0_size, dim1_size))) assert not subset.is_view assert np.all(orig_obsm_val == adata.obsm["o"]) # Checking for mutation assert np.all(subset.obsm["o"] == np.ones((dim0_size, dim1_size))) subset = adata[subset_idx, :] subset_hash = joblib.hash(subset) with pytest.raises(ValueError): subset.obsm = dict(o=np.ones((dim0_size + 1, dim1_size))) with pytest.raises(ValueError): subset.varm = dict(o=np.ones((dim0_size - 1, dim1_size))) assert subset_hash == joblib.hash(subset) # Only modification have been made to a view assert init_hash == joblib.hash(adata) def test_set_varm(adata): init_hash = joblib.hash(adata) dim0_size = np.random.randint(2, adata.shape[1] - 1) dim1_size = np.random.randint(1, 99) 
orig_varm_val = adata.varm["o"].copy() subset_idx = np.random.choice(adata.var_names, dim0_size, replace=False) subset = adata[:, subset_idx] assert subset.is_view subset.varm = dict(o=np.ones((dim0_size, dim1_size))) assert not subset.is_view assert np.all(orig_varm_val == adata.varm["o"]) # Checking for mutation assert np.all(subset.varm["o"] == np.ones((dim0_size, dim1_size))) subset = adata[:, subset_idx] subset_hash = joblib.hash(subset) with pytest.raises(ValueError): subset.varm = dict(o=np.ones((dim0_size + 1, dim1_size))) with pytest.raises(ValueError): subset.varm = dict(o=np.ones((dim0_size - 1, dim1_size))) # subset should not be changed by failed setting assert subset_hash == joblib.hash(subset) assert init_hash == joblib.hash(adata) # TODO: Determine if this is the intended behavior, # or just the behaviour we’ve had for a while def test_not_set_subset_X(matrix_type, subset_func): adata = ad.AnnData(matrix_type(asarray(sparse.random(20, 20)))) init_hash = joblib.hash(adata) orig_X_val = adata.X.copy() while True: subset_idx = slice_subset(adata.obs_names) if len(adata[subset_idx, :]) > 2: break subset = adata[subset_idx, :] subset = adata[:, subset_idx] internal_idx = _normalize_index( subset_func(np.arange(subset.X.shape[1])), subset.var_names ) assert subset.is_view subset.X[:, internal_idx] = 1 assert not subset.is_view assert not np.any(asarray(adata.X != orig_X_val)) assert init_hash == joblib.hash(adata) def test_set_scalar_subset_X(matrix_type, subset_func): adata = ad.AnnData(matrix_type(np.zeros((10, 10)))) orig_X_val = adata.X.copy() subset_idx = slice_subset(adata.obs_names) adata_subset = adata[subset_idx, :] adata_subset.X = 1 assert adata_subset.is_view assert np.all(asarray(adata[subset_idx, :].X) == 1) assert asarray((orig_X_val != adata.X)).sum() == mul(*adata_subset.shape) # TODO: Use different kind of subsetting for adata and view def test_set_subset_obsm(adata, subset_func): init_hash = joblib.hash(adata) orig_obsm_val = adata.obsm["o"].copy() while True: subset_idx = slice_subset(adata.obs_names) if len(adata[subset_idx, :]) > 2: break subset = adata[subset_idx, :] internal_idx = _normalize_index( subset_func(np.arange(subset.obsm["o"].shape[0])), subset.obs_names ) assert subset.is_view subset.obsm["o"][internal_idx] = 1 assert not subset.is_view assert np.all(adata.obsm["o"] == orig_obsm_val) assert init_hash == joblib.hash(adata) def test_set_subset_varm(adata, subset_func): init_hash = joblib.hash(adata) orig_varm_val = adata.varm["o"].copy() while True: subset_idx = slice_subset(adata.var_names) if (adata[:, subset_idx]).shape[1] > 2: break subset = adata[:, subset_idx] internal_idx = _normalize_index( subset_func(np.arange(subset.varm["o"].shape[0])), subset.var_names ) assert subset.is_view subset.varm["o"][internal_idx] = 1 assert not subset.is_view assert np.all(adata.varm["o"] == orig_varm_val) assert init_hash == joblib.hash(adata) @pytest.mark.parametrize("attr", ["obsm", "varm", "obsp", "varp", "layers"]) def test_view_failed_delitem(attr): adata = gen_adata((10, 10)) view = adata[5:7, :][:, :5] adata_hash = joblib.hash(adata) view_hash = joblib.hash(view) with pytest.raises(KeyError): getattr(view, attr).__delitem__("not a key") assert view.is_view assert adata_hash == joblib.hash(adata) assert view_hash == joblib.hash(view) @pytest.mark.parametrize("attr", ["obsm", "varm", "obsp", "varp", "layers"]) def test_view_delitem(attr): adata = gen_adata((10, 10)) getattr(adata, attr)["to_delete"] = np.ones((10, 10)) # Shouldn’t be a subclass, 
should be an ndarray assert type(getattr(adata, attr)["to_delete"]) is np.ndarray view = adata[5:7, :][:, :5] adata_hash = joblib.hash(adata) view_hash = joblib.hash(view) getattr(view, attr).__delitem__("to_delete") assert not view.is_view assert "to_delete" not in getattr(view, attr) assert "to_delete" in getattr(adata, attr) assert adata_hash == joblib.hash(adata) assert view_hash != joblib.hash(view) @pytest.mark.parametrize( "attr", ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"] ) def test_view_delattr(attr, subset_func): base = gen_adata((10, 10)) orig_hash = joblib.hash(base) subset = base[subset_func(base.obs_names), subset_func(base.var_names)] empty = ad.AnnData(obs=subset.obs[[]], var=subset.var[[]]) delattr(subset, attr) assert not subset.is_view # Should now have same value as default assert_equal(getattr(subset, attr), getattr(empty, attr)) assert orig_hash == joblib.hash(base) # Original should not be modified @pytest.mark.parametrize( "attr", ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"] ) def test_view_setattr_machinery(attr, subset_func, subset_func2): # Tests that setting attributes on a view doesn't mess anything up too bad adata = gen_adata((10, 10)) view = adata[subset_func(adata.obs_names), subset_func2(adata.var_names)] actual = view.copy() setattr(view, attr, getattr(actual, attr)) assert_equal(actual, view, exact=True) def test_layers_view(): X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) L = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) real_adata = ad.AnnData(X) real_adata.layers["L"] = L view_adata = real_adata[1:, 1:] real_hash = joblib.hash(real_adata) view_hash = joblib.hash(view_adata) assert view_adata.is_view with pytest.raises(ValueError): view_adata.layers["L2"] = L + 2 assert view_adata.is_view # Failing to set layer item makes adata not view assert real_hash == joblib.hash(real_adata) assert view_hash == joblib.hash(view_adata) view_adata.layers["L2"] = L[1:, 1:] + 2 assert not view_adata.is_view assert real_hash == joblib.hash(real_adata) assert view_hash != joblib.hash(view_adata) # TODO: This can be flaky. 
Make that stop def test_view_of_view(matrix_type, subset_func, subset_func2): adata = gen_adata((30, 15), X_type=matrix_type) adata.raw = adata if subset_func is single_subset: pytest.xfail("Other subset generating functions have trouble with this") var_s1 = subset_func(adata.var_names, min_size=4) var_view1 = adata[:, var_s1] var_s2 = subset_func2(var_view1.var_names) var_view2 = var_view1[:, var_s2] assert var_view2._adata_ref is adata obs_s1 = subset_func(adata.obs_names, min_size=4) obs_view1 = adata[obs_s1, :] obs_s2 = subset_func2(obs_view1.obs_names) assert adata[obs_s1, :][:, var_s1][obs_s2, :]._adata_ref is adata view_of_actual_copy = adata[:, var_s1].copy()[obs_s1, :].copy()[:, var_s2].copy() view_of_view_copy = adata[:, var_s1][obs_s1, :][:, var_s2].copy() assert_equal(view_of_actual_copy, view_of_view_copy, exact=True) def test_view_of_view_modification(): adata = ad.AnnData(np.zeros((10, 10))) adata[0, :][:, 5:].X = np.ones(5) assert np.all(adata.X[0, 5:] == np.ones(5)) adata[[1, 2], :][:, [1, 2]].X = np.ones((2, 2)) assert np.all(adata.X[1:3, 1:3] == np.ones((2, 2))) adata.X = sparse.csr_matrix(adata.X) adata[0, :][:, 5:].X = np.ones(5) * 2 assert np.all(asarray(adata.X)[0, 5:] == np.ones(5) * 2) adata[[1, 2], :][:, [1, 2]].X = np.ones((2, 2)) * 2 assert np.all(asarray(adata.X)[1:3, 1:3] == np.ones((2, 2)) * 2) def test_double_index(subset_func, subset_func2): adata = gen_adata((10, 10)) obs_subset = subset_func(adata.obs_names) var_subset = subset_func2(adata.var_names) v1 = adata[obs_subset, var_subset] v2 = adata[obs_subset, :][:, var_subset] assert np.all(asarray(v1.X) == asarray(v2.X)) assert np.all(v1.obs == v2.obs) assert np.all(v1.var == v2.var) def test_view_retains_ndarray_subclass(): adata = ad.AnnData(np.zeros((10, 10))) adata.obsm["foo"] = np.zeros((10, 5)).view(NDArraySubclass) view = adata[:5, :] assert isinstance(view.obsm["foo"], NDArraySubclass) assert view.obsm["foo"].shape == (5, 5) def test_modify_uns_in_copy(): # https://github.com/theislab/anndata/issues/571 adata = ad.AnnData(np.ones((5, 5)), uns={"parent": {"key": "value"}}) adata_copy = adata[:3].copy() adata_copy.uns["parent"]["key"] = "new_value" assert adata.uns["parent"]["key"] != adata_copy.uns["parent"]["key"] @pytest.mark.parametrize("index", [-101, 100, (slice(None), -101), (slice(None), 100)]) def test_invalid_scalar_index(adata, index): # https://github.com/theislab/anndata/issues/619 with pytest.raises(IndexError, match=r".*index.* out of range\."): _ = adata[index] @pytest.mark.parametrize("obs", [False, True]) @pytest.mark.parametrize("index", [-100, -50, -1]) def test_negative_scalar_index(adata, index: int, obs: bool): pos_index = index + (adata.n_obs if obs else adata.n_vars) if obs: adata_pos_subset = adata[pos_index] adata_neg_subset = adata[index] else: adata_pos_subset = adata[:, pos_index] adata_neg_subset = adata[:, index] np.testing.assert_array_equal( adata_pos_subset.obs_names, adata_neg_subset.obs_names ) np.testing.assert_array_equal( adata_pos_subset.var_names, adata_neg_subset.var_names ) anndata-0.7.8/anndata/tests/test_x.py000066400000000000000000000041261414255741200175740ustar00rootroot00000000000000"""Tests for the attribute .X""" import numpy as np from scipy import sparse from anndata import AnnData from anndata.utils import asarray import pytest from anndata.tests.helpers import gen_adata, assert_equal UNLABELLED_ARRAY_TYPES = [ pytest.param(sparse.csr_matrix, id="csr"), pytest.param(sparse.csc_matrix, id="csc"), pytest.param(asarray, id="ndarray"), ] 
SINGULAR_SHAPES = [ pytest.param(shape, id=str(shape)) for shape in [(1, 10), (10, 1), (1, 1)] ] @pytest.mark.parametrize("shape", SINGULAR_SHAPES) @pytest.mark.parametrize("orig_array_type", UNLABELLED_ARRAY_TYPES) @pytest.mark.parametrize("new_array_type", UNLABELLED_ARRAY_TYPES) def test_setter_singular_dim(shape, orig_array_type, new_array_type): # https://github.com/theislab/anndata/issues/500 adata = gen_adata(shape, X_type=orig_array_type) adata.X = new_array_type(np.ones(shape)) np.testing.assert_equal(asarray(adata.X), 1) ############################### # Tests for `adata.X is None` # ############################### def test_set_x_is_none(): # test setter and getter adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2], o2=[3, 4])) adata.X = None assert adata.X is None def test_del_set_equiv_X(): """Tests that `del adata.X` is equivalent to `adata.X = None`""" # test setter and deleter orig = gen_adata((10, 10)) copy = orig.copy() del orig.X copy.X = None assert orig.X is None assert_equal(orig, copy) # Check that deleting again is still fine del orig.X assert orig.X is None def test_init_X_as_none(): # test initialiser shape = (3, 5) adata = AnnData(None, uns=dict(test=np.array((3, 3))), shape=shape) assert adata.X is None assert adata.shape == shape @pytest.mark.parametrize("shape", SINGULAR_SHAPES + [pytest.param((5, 3), id="(5, 3)")]) def test_transpose_with_X_as_none(shape): adata = gen_adata(shape, X_type=lambda x: None) adataT = adata.transpose() assert_equal(adataT.shape, shape[::-1]) assert_equal(adataT.obsp.keys(), adata.varp.keys()) assert_equal(adataT.T, adata) anndata-0.7.8/anndata/utils.py000066400000000000000000000170621414255741200162670ustar00rootroot00000000000000import warnings from functools import wraps, singledispatch from typing import Mapping, Any, Sequence, Union, Callable import h5py import pandas as pd import numpy as np from scipy import sparse from .logging import get_logger from ._core.sparse_dataset import SparseDataset logger = get_logger(__name__) @singledispatch def asarray(x): """Convert x to a numpy array""" return np.asarray(x) @asarray.register(sparse.spmatrix) def asarray_sparse(x): return x.toarray() @asarray.register(SparseDataset) def asarray_sparse_dataset(x): return asarray(x.value) @asarray.register(h5py.Dataset) def asarray_h5py_dataset(x): return x[...] @singledispatch def convert_to_dict(obj) -> dict: return dict(obj) @convert_to_dict.register(dict) def convert_to_dict_dict(obj: dict): return obj @convert_to_dict.register(np.ndarray) def convert_to_dict_ndarray(obj: np.ndarray): if obj.dtype.fields is None: raise TypeError( "Can only convert np.ndarray with compound dtypes to dict, " f"passed array had “{obj.dtype}”." ) return {k: obj[k] for k in obj.dtype.fields.keys()} @convert_to_dict.register(type(None)) def convert_to_dict_nonetype(obj: None): return dict() def make_index_unique(index: pd.Index, join: str = "-"): """ Makes the index unique by appending a number string to each duplicate index element: '1', '2', etc. If a tentative name created by the algorithm already exists in the index, it tries the next integer in the sequence. The first occurrence of a non-unique value is ignored. Parameters ---------- join The connecting string between name and integer. 
Examples -------- >>> from anndata import AnnData >>> adata = AnnData(np.ones((2, 3)), var=pd.DataFrame(index=["a", "a", "b"])) >>> adata.var_names Index(['a', 'a', 'b'], dtype='object') >>> adata.var_names_make_unique() >>> adata.var_names Index(['a', 'a-1', 'b'], dtype='object') """ if index.is_unique: return index from collections import Counter values = index.values.copy() indices_dup = index.duplicated(keep="first") values_dup = values[indices_dup] values_set = set(values) counter = Counter() issue_interpretation_warning = False example_colliding_values = [] for i, v in enumerate(values_dup): while True: counter[v] += 1 tentative_new_name = v + join + str(counter[v]) if tentative_new_name not in values_set: values_set.add(tentative_new_name) values_dup[i] = tentative_new_name break issue_interpretation_warning = True if len(example_colliding_values) < 5: example_colliding_values.append(tentative_new_name) if issue_interpretation_warning: warnings.warn( f"Suffix used ({join}[0-9]+) to deduplicate index values may make index " + "values difficult to interpret. There values with a similar suffixes in " + "the index. Consider using a different delimiter by passing " + "`join={delimiter}`" + "Example key collisions generated by the make_index_unique algorithm: " + str(example_colliding_values) ) values[indices_dup] = values_dup index = pd.Index(values, name=index.name) return index def warn_names_duplicates(attr: str): names = "Observation" if attr == "obs" else "Variable" logger.info( f"{names} names are not unique. " f"To make them unique, call `.{attr}_names_make_unique`." ) def ensure_df_homogeneous( df: pd.DataFrame, name: str ) -> Union[np.ndarray, sparse.csr_matrix]: # TODO: rename this function, I would not expect this to return a non-dataframe if all(isinstance(dt, pd.SparseDtype) for dt in df.dtypes): arr = df.sparse.to_coo().tocsr() else: arr = df.to_numpy() if df.dtypes.nunique() != 1: warnings.warn(f"{name} converted to numpy array with dtype {arr.dtype}") return arr def convert_dictionary_to_structured_array(source: Mapping[str, Sequence[Any]]): names = list(source.keys()) try: # transform to byte-strings cols = [ np.asarray(col) if np.array(col[0]).dtype.char not in {"U", "S"} else np.asarray(col).astype("U") for col in source.values() ] except UnicodeEncodeError: raise ValueError( "Currently only support ascii strings. " "Don’t use “ö” etc. for sample annotation." ) # if old_index_key not in source: # names.append(new_index_key) # cols.append(np.arange(len(cols[0]) if cols else n_row).astype("U")) # else: # names[names.index(old_index_key)] = new_index_key # cols[names.index(old_index_key)] = cols[names.index(old_index_key)].astype("U") dtype_list = list( zip(names, [str(c.dtype) for c in cols], [(c.shape[1],) for c in cols]) ) # might be unnecessary dtype = np.dtype(dtype_list) arr = np.zeros((len(cols[0]),), dtype) # here, we do not want to call BoundStructArray.__getitem__ # but np.ndarray.__getitem__, therefore we avoid the following line # arr = np.ndarray.__new__(cls, (len(cols[0]),), dtype) for i, name in enumerate(dtype.names): arr[name] = np.array(cols[i], dtype=dtype_list[i][1]) return arr def deprecated(new_name: str): """\ This is a decorator which can be used to mark functions as deprecated. It will result in a warning being emitted when the function is used. 
""" def decorator(func): @wraps(func) def new_func(*args, **kwargs): # turn off filter warnings.simplefilter("always", DeprecationWarning) warnings.warn( f"Use {new_name} instead of {func.__name__}, " f"{func.__name__} will be removed in the future.", category=DeprecationWarning, stacklevel=2, ) warnings.simplefilter("default", DeprecationWarning) # reset filter return func(*args, **kwargs) setattr(new_func, "__deprecated", True) return new_func return decorator class DeprecationMixinMeta(type): """\ Use this as superclass so deprecated methods and properties do not appear in vars(MyClass)/dir(MyClass) """ def __dir__(cls): def is_deprecated(attr): if isinstance(attr, property): attr = attr.fget return getattr(attr, "__deprecated", False) return [ item for item in type.__dir__(cls) if not is_deprecated(getattr(cls, item, None)) ] def import_function(module: str, name: str) -> Callable: """\ Try to import function from module. If the module is not installed or function is not part of the module, it returns a dummy function that raises the respective import error once the function is called. This could be a ModuleNotFoundError if the module is missing or an AttributeError if the module is installed but the function is not exported by it. Params ------- module Module to import from. Can be nested, e.g. "sklearn.utils". name Name of function to import from module. """ from importlib import import_module try: module = import_module(module) func = getattr(module, name) except (ImportError, AttributeError) as e: error = e def func(*_, **__): raise error return func anndata-0.7.8/docs/000077500000000000000000000000001414255741200140715ustar00rootroot00000000000000anndata-0.7.8/docs/Makefile000066400000000000000000000012631414255741200155330ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python3 -msphinx SPHINXPROJ = Scanpy SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile clean: rm -r "$(BUILDDIR)" rm -r "generated" find . -name scanpy.*.rst -delete # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) anndata-0.7.8/docs/_key_contributors.rst000066400000000000000000000006571414255741200203770ustar00rootroot00000000000000.. sidebar:: Key Contributors * `Isaac Virshup`_: anndata >= 0.7, diverse contributions * Sergei Rybakov: diverse contributions * `Alex Wolf`_: initial conception/development * Philipp Angerer: initial conception/development, software quality .. _contributions graph: https://github.com/theislab/anndata/graphs/contributors .. _Isaac Virshup: https://twitter.com/ivirshup .. _Alex Wolf: https://twitter.com/falexwolf anndata-0.7.8/docs/_templates/000077500000000000000000000000001414255741200162265ustar00rootroot00000000000000anndata-0.7.8/docs/_templates/autosummary/000077500000000000000000000000001414255741200206145ustar00rootroot00000000000000anndata-0.7.8/docs/_templates/autosummary/base.rst000066400000000000000000000002431414255741200222570ustar00rootroot00000000000000:github_url: {{ fullname | github_url }} {% extends "!autosummary/base.rst" %} .. 
http://www.sphinx-doc.org/en/stable/ext/autosummary.html#customizing-templates anndata-0.7.8/docs/_templates/autosummary/class.rst000066400000000000000000000012751414255741200224600ustar00rootroot00000000000000:github_url: {{ fullname | github_url }} {{ fullname | escape | underline}} .. currentmodule:: {{ module }} .. add toctree option to make autodoc generate the pages .. autoclass:: {{ objname }} {% block attributes %} {% if attributes %} .. rubric:: Attributes .. autosummary:: :toctree: . {% for item in attributes %} ~{{ fullname }}.{{ item }} {%- endfor %} {% endif %} {% endblock %} {% block methods %} {% if methods %} .. rubric:: Methods .. autosummary:: :toctree: . {% for item in methods %} {%- if item != '__init__' %} ~{{ fullname }}.{{ item }} {%- endif -%} {%- endfor %} {% endif %} {% endblock %} anndata-0.7.8/docs/api.rst000066400000000000000000000016371414255741200154030ustar00rootroot00000000000000API === .. module:: anndata The central class: .. autosummary:: :toctree: generated/ AnnData Combining --------- Combining AnnData objects. See also the section on concatenation. .. autosummary:: :toctree: generated/ concat Reading ------- Reading anndata’s native file format `.h5ad`. .. autosummary:: :toctree: generated/ read_h5ad Reading other file formats. .. autosummary:: :toctree: generated/ read_csv read_excel read_hdf read_loom read_mtx read_text read_umi_tools read_zarr Writing ------- Writing to anndata’s native file format `.h5ad`. .. autosummary:: :toctree: generated/ AnnData.write Writing to other formats. .. autosummary:: :toctree: generated/ AnnData.write_csvs AnnData.write_loom AnnData.write_zarr Errors and warnings ------------------- .. autosummary:: :toctree: generated/ ImplicitModificationWarning anndata-0.7.8/docs/benchmarks.rst000066400000000000000000000011561414255741200167430ustar00rootroot00000000000000Benchmarks ========== Reading and writing h5ad files ------------------------------ Of course, we want the associated notebook to be called "reading-writing.h5ad". https://github.com/Koncopd/anndata-scanpy-benchmarks/blob/master/Benchmarks-Loading.ipynb The notebook should also cover reading chunks. https://github.com/Koncopd/anndata-scanpy-benchmarks/blob/master/chunk_X.ipynb It should compare the results with loom. We don’t need images here, at this stage. File sizes ---------- Another notebook should cover file sizes. https://github.com/Koncopd/anndata-scanpy-benchmarks/blob/master/File_sizes.ipynb anndata-0.7.8/docs/concatenation.rst000066400000000000000000000316141414255741200174550ustar00rootroot00000000000000Concatenation ============= .. warning:: The :func:`~anndata.concat` function is marked as experimental for the `0.7` release series, and will supercede the :meth:`AnnData.concatenate() ` method in future releases. While the current API is not likely to change much, this gives us a bit of freedom to make sure we've got the arguments and feature set right. With :func:`~anndata.concat`, :class:`~anndata.AnnData` objects can be combined via a composition of two operations: concatenation and merging. * Concatenation is when we keep all sub elements of each object, and stack these elements in an ordered way. * Merging is combining a set of collections into one resulting collection which contains elements from the objects. .. note:: This function borrows from similar functions in pandas_ and xarray_. 
Argument which are used to control concatenation are modeled after :func:`pandas.concat` while strategies for merging are inspired by :func:`xarray.merge`'s `compat` argument. .. _pandas: https://pandas.pydata.org .. _xarray: http://xarray.pydata.org Concatenation ------------- Let's start off with an example: >>> import scanpy as sc, anndata as ad, numpy as np, pandas as pd >>> from scipy import sparse >>> from anndata import AnnData >>> pbmc = sc.datasets.pbmc68k_reduced() >>> pbmc AnnData object with n_obs × n_vars = 700 × 765 obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain' var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable' uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups' obsm: 'X_pca', 'X_umap' varm: 'PCs' obsp: 'distances', 'connectivities' If we split this object up by clusters of observations, then stack those subsets we'll obtain the same values – just ordered differently. >>> groups = pbmc.obs.groupby("louvain").indices >>> pbmc_concat = ad.concat([pbmc[inds] for inds in groups.values()], merge="same") >>> assert np.array_equal(pbmc.X, pbmc_concat[pbmc.obs_names].X) >>> pbmc_concat AnnData object with n_obs × n_vars = 700 × 765 obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain' var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable' obsm: 'X_pca', 'X_umap' varm: 'PCs' Note that we concatenated along the observations by default, and that most elements aligned to the observations were concatenated as well. A notable exception is :attr:`~anndata.AnnData.obsp`, which can be re-enabled with the `pairwise` keyword argument. This is because it's not obvious that combining graphs or distance matrices padded with 0s is particularly useful, and may be unintuitive. Inner and outer joins ~~~~~~~~~~~~~~~~~~~~~ When the variables present in the objects to be concatenated aren't exactly the same, you can choose to take either the intersection or union of these variables. This is otherwise called taking the `"inner"` (intersection) or `"outer"` (union) join. For example, given two anndata objects with differing variables: >>> a = AnnData(sparse.eye(3), var=pd.DataFrame(index=list("abc"))) >>> b = AnnData(sparse.eye(2), var=pd.DataFrame(index=list("ba"))) >>> ad.concat([a, b], join="inner").X.toarray() array([[1., 0.], [0., 1.], [0., 0.], [0., 1.], [1., 0.]], dtype=float32) >>> ad.concat([a, b], join="outer").X.toarray() array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [0., 1., 0.], [1., 0., 0.]], dtype=float32) The join argument is used for any element which has both (1) an axis being concatenated and (2) has an axis not being concatenated. When concatenating along the `obs` dimension, this means elements of `.X`, `obs`, `.layers`, and `.obsm` will be affected by the choice of `join`. To demonstrate this, let's say we're trying to combine a droplet based experiment with a spatial one. When building a joint anndata object, we would still like to store the coordinates for the spatial samples. >>> coords = np.hstack([np.repeat(np.arange(10), 10), np.tile(np.arange(10), 10)]).T >>> spatial = AnnData( ... sparse.random(5000, 10000, format="csr"), ... obsm={"coords": np.random.randn(5000, 2)} ... ) >>> droplet = AnnData(sparse.random(5000, 10000, format="csr")) >>> combined = ad.concat([spatial, droplet], join="outer") >>> sc.pl.embedding(combined, "coords") # doctest: +SKIP .. 
TODO: Get the above plot to show up Annotating data source (`label`, `keys`, and `index_unique`) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Often, you'd like to be able to tell which values came from which object. This can be accomplished with the `label`, `keys`, and `index_unique` keyword arguments. For an example, we'll show how you can keep track of the original dataset by passing a `Mapping` of dataset names to `AnnData` objects to `concat`: >>> adatas = { ... "a": ad.AnnData( ... sparse.random(3, 50, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"a-{i}" for i in range(3)]) ... ), ... "b": ad.AnnData( ... sparse.random(5, 50, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"b-{i}" for i in range(5)]) ... ), ... } >>> ad.concat(adatas, label="dataset").obs dataset a-0 a a-1 a a-2 a b-0 b b-1 b b-2 b b-3 b b-4 b Here, a categorical column (with the name specified by `label`) was added to the result. As an alternative to passing a `Mapping`, you can also specify dataset names with the `keys` argument. In some cases, your objects may share names along the axes being concatenated. These values can be made unique by appending the relevant key using the `index_unique` argument: .. TODO: skipping example since doctest does not capture stderr, but it's relevant to show the unique message >>> adatas = { ... "a": ad.AnnData( ... sparse.random(3, 10, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"cell-{i}" for i in range(3)]) ... ), ... "b": ad.AnnData( ... sparse.random(5, 10, format="csr", density=0.1), ... obs=pd.DataFrame(index=[f"cell-{i}" for i in range(5)]) ... ), ... } >>> ad.concat(adatas).obs # doctest: +SKIP Observation names are not unique. To make them unique, call `.obs_names_make_unique`. Empty DataFrame Columns: [] Index: [cell-0, cell-1, cell-2, cell-0, cell-1, cell-2, cell-3, cell-4] >>> ad.concat(adatas, index_unique="_").obs Empty DataFrame Columns: [] Index: [cell-0_a, cell-1_a, cell-2_a, cell-0_b, cell-1_b, cell-2_b, cell-3_b, cell-4_b] Merging ------- Combining elements not aligned to the axis of concatenation is controlled through the `merge` arguments. We provide a few strategies for merging elements aligned to the alternative axes: * `None`: No elements aligned to alternative axes are present in the result object. * `"same"`: Elements that are the same in each of the objects. * `"unique"`: Elements for which there is only one possible value. * `"first"`: The first element seen at each from each position. * `"only"`: Elements that show up in only one of the objects. We'll show how this works with elements aligned to the alternative axis, and then how merging works with `.uns`. First, our example case: >>> import scanpy as sc >>> blobs = sc.datasets.blobs(n_variables=30, n_centers=5) >>> sc.pp.pca(blobs) >>> blobs AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' uns: 'pca' obsm: 'X_pca' varm: 'PCs' Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies. >>> adatas = [] >>> for group, idx in blobs.obs.groupby("blobs").indices.items(): ... sub_adata = blobs[idx].copy() ... sub_adata.obsm["qc"], sub_adata.varm[f"{group}_qc"] = sc.pp.calculate_qc_metrics( ... sub_adata, percent_top=(), inplace=False, log1p=False ... ) ... 
adatas.append(sub_adata) >>> adatas[0] AnnData object with n_obs × n_vars = 128 × 30 obs: 'blobs' uns: 'pca' obsm: 'X_pca', 'qc' varm: 'PCs', '0_qc' `adatas` is now a list of datasets with disjoint sets of observations and a common set of variables. Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset. Taking a look at how this effects concatenation: >>> ad.concat(adatas) AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' >>> ad.concat(adatas, merge="same") AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' varm: 'PCs' >>> ad.concat(adatas, merge="unique") AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' varm: 'PCs', '0_qc', '1_qc', '2_qc', '3_qc', '4_qc' Note that comparisons are made after indices are aligned. That is, if the objects only share a subset of indices on the alternative axis, it's only required that values for those indices match when using a strategy like `"same"`. >>> a = AnnData( ... sparse.eye(3), ... var=pd.DataFrame({"nums": [1, 2, 3]}, index=list("abc")) ... ) >>> b = AnnData( ... sparse.eye(2), ... var=pd.DataFrame({"nums": [2, 1]}, index=list("ba")) ... ) >>> ad.concat([a, b], merge="same").var nums a 1 b 2 Merging `.uns` ~~~~~~~~~~~~~~ We use the same set of strategies for merging `uns` as we do for entries aligned to an axis, but these strategies are applied recursively. This is a little abstract, so we'll look at some examples of this. Here's our setup: >>> from anndata import AnnData >>> import numpy as np >>> a = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4}}) >>> b = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 3, "c": {"c.b": 4}}) >>> c = AnnData(np.zeros((10, 10)), uns={"a": 1, "b": 4, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}) For quick reference, these are the results from each of the merge strategies. These are discussed in more depth below: =========== ======================================================= `uns_merge` Result =========== ======================================================= `None` `{}` `"same"` `{"a": 1, "c": {"c.b": 4}}` `"unique"` `{"a": 1, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}` `"only"` `{"c": {"c.c": 5}}` `"first"` `{"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4, "c.c": 5}}` =========== ======================================================= The default returns a fairly obvious result: >>> ad.concat([a, b, c]).uns == {} True But let's take a look at the others in a bit more depth. Here, we'll be wrapping the output data in a `dict` for simplicity of the return value. >>> dict(ad.concat([a, b, c], uns_merge="same").uns) {'a': 1, 'c': {'c.b': 4}} Here only the values for `uns["a"]` and `uns["c"]["c.b"]` were exactly the same, so only they were kept. `uns["b"]` has a number of values and neither `uns["c"]["c.a"]` or `uns["c"]["c.b"]` appears in each `uns`. A key feature to note is that comparisons are aware of the nested structure of `uns` and will be applied at any depth. This is why `uns["c"]["c.b"]` was kept. Merging `uns` in this way can be useful when there is some shared data between the objects being concatenated. For example, if each was put through the same pipeline with the same parameters, those parameters used would still be present in the resulting object. 
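For instance (a minimal sketch — the ``params`` and ``run_id`` keys below are made-up stand-ins for real pipeline settings, not keys used elsewhere in this documentation), parameters recorded identically in every object survive the merge, while a value that differs between runs is dropped:

>>> p1 = AnnData(np.zeros((2, 2)), uns={"params": {"n_pcs": 50}, "run_id": 1})
>>> p2 = AnnData(np.zeros((2, 2)), uns={"params": {"n_pcs": 50}, "run_id": 2})
>>> dict(ad.concat([p1, p2], uns_merge="same").uns)
{'params': {'n_pcs': 50}}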
Now let's look at the behaviour of `unique`: >>> dict(ad.concat([a, b, c], uns_merge="unique").uns) {'a': 1, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} The results here are a superset of those from `"same"`. Note that there was only one possible value at each position in the resulting mapping. That is, there were no alternative values present for `uns["c"]["c.c"]` even though it appeared only once. This can be useful when the objects were all run through the same pipeline but contain metadata specific to each object. An example of this would be a spatial dataset, where the images are stored in `uns`. >>> dict(ad.concat([a, b, c], uns_merge="only").uns) {'c': {'c.c': 5}} `uns["c"]["c.c"]` is the only value that is kept, since it is the only one which was specified in only one `uns`. >>> dict(ad.concat([a, b, c], uns_merge="first").uns) {'a': 1, 'b': 2, 'c': {'c.a': 3, 'c.b': 4, 'c.c': 5}} In this case, the result has the union of the keys from all the starting dictionaries. The value is taken from the first object to have a value at this key. anndata-0.7.8/docs/conf.py000066400000000000000000000075151414255741200154000ustar00rootroot00000000000000import sys import logging from pathlib import Path from datetime import datetime from sphinx.application import Sphinx HERE = Path(__file__).parent sys.path[:0] = [str(HERE.parent), str(HERE / "extensions")] import anndata # noqa logger = logging.getLogger(__name__) for generated in HERE.glob("anndata.*.rst"): generated.unlink() # -- General configuration ------------------------------------------------ needs_sphinx = "1.7" # autosummary bugfix # General information project = "anndata" author = "AnnData development team" copyright = f"{datetime.now():%Y}, the AnnData development team." version = anndata.__version__.replace(".dirty", "") release = version # default settings templates_path = ["_templates"] source_suffix = ".rst" master_doc = "index" default_role = "literal" exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] pygments_style = "sphinx" extensions = [ "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest", "sphinx.ext.coverage", "sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.autosummary", "sphinx_autodoc_typehints", # needs to be after napoleon "scanpydoc", *[p.stem for p in (HERE / "extensions").glob("*.py")], ] # Generate the API documentation when building autosummary_generate = True autodoc_member_order = "bysource" # autodoc_default_flags = ['members'] napoleon_google_docstring = False napoleon_numpy_docstring = True napoleon_include_init_with_doc = False napoleon_use_rtype = True # having a separate entry generally helps readability napoleon_use_param = True napoleon_custom_sections = [("Params", "Parameters")] todo_include_todos = False nitpicky = True # Report broken links nitpick_ignore = [ ("py:meth", "pandas.DataFrame.iloc"), ("py:meth", "pandas.DataFrame.loc"), ] suppress_warnings = ["ref.citation"] def setup(app: Sphinx): # Don’t allow broken links. DO NOT CHANGE THIS LINE, fix problems instead.
app.warningiserror = True intersphinx_mapping = dict( h5py=("https://docs.h5py.org/en/latest/", None), loompy=("https://linnarssonlab.org/loompy/", None), numpy=("https://numpy.org/doc/stable/", None), pandas=("https://pandas.pydata.org/pandas-docs/stable/", None), python=("https://docs.python.org/3", None), scipy=("https://docs.scipy.org/doc/scipy/reference/", None), sklearn=("https://scikit-learn.org/stable/", None), zarr=("https://zarr.readthedocs.io/en/stable/", None), xarray=("http://xarray.pydata.org/en/stable/", None), ) qualname_overrides = { "anndata._core.anndata.AnnData": "anndata.AnnData", # Temporarily "anndata._core.raw.Raw": "anndata.AnnData", "anndata._core.views.ArrayView": "numpy.ndarray", **{ f"anndata._core.aligned_mapping.{cls}{kind}": "typing.Mapping" for cls in "Layers AxisArrays PairwiseArrays".split() for kind in ["", "View"] }, } # -- Options for HTML output ---------------------------------------------- html_theme = "scanpydoc" html_theme_options = dict(navigation_depth=4) html_context = dict( display_github=True, # Integrate GitHub github_user="theislab", # Username github_repo="anndata", # Repo name github_version="master", # Version conf_py_path="/docs/", # Path in the checkout to the docs root ) issues_github_path = "{github_user}/{github_repo}".format_map(html_context) html_show_sphinx = False # -- Options for other output formats ------------------------------------------ htmlhelp_basename = f"{project}doc" doc_title = f"{project} Documentation" latex_documents = [(master_doc, f"{project}.tex", doc_title, author, "manual")] man_pages = [(master_doc, project, doc_title, [author], 1)] texinfo_documents = [ ( master_doc, project, doc_title, author, project, "One line description of project.", "Miscellaneous", ) ] anndata-0.7.8/docs/extensions/000077500000000000000000000000001414255741200162705ustar00rootroot00000000000000anndata-0.7.8/docs/extensions/github_links.py000066400000000000000000000032131414255741200213230ustar00rootroot00000000000000from types import MappingProxyType from typing import Any, Mapping, Sequence, NamedTuple from docutils import nodes from docutils.parsers.rst.directives import class_option from docutils.parsers.rst.states import Inliner from sphinx.application import Sphinx from sphinx.config import Config class AutoLink(NamedTuple): class_name: str url_template: str title_template: str = "{}" options: Mapping[str, Any] = MappingProxyType({"class": class_option}) def __call__( self, name: str, rawtext: str, text: str, lineno: int, inliner: Inliner, options: Mapping[str, Any] = MappingProxyType({}), content: Sequence[str] = (), ): url = self.url_template.format(text) title = self.title_template.format(text) options = {**dict(classes=[self.class_name]), **options} node = nodes.reference(rawtext, title, refuri=url, **options) return [node], [] def register_links(app: Sphinx, config: Config): gh_url = "https://github.com/{github_user}/{github_repo}".format_map( config.html_context ) app.add_role("pr", AutoLink("pr", f"{gh_url}/pull/{{}}", "PR {}")) app.add_role("issue", AutoLink("issue", f"{gh_url}/issues/{{}}", "issue {}")) app.add_role("noteversion", AutoLink("noteversion", f"{gh_url}/releases/tag/{{}}")) # tutorial links scanpy_tutorials_url = "https://scanpy-tutorials.readthedocs.io/en/latest/" app.add_role( "tutorial", AutoLink("tutorial", f"{scanpy_tutorials_url}{{}}.html", "→ tutorial: {}"), ) def setup(app: Sphinx): app.connect("config-inited", register_links) 
anndata-0.7.8/docs/fileformat-prose.rst000066400000000000000000000143561414255741200201100ustar00rootroot00000000000000On-disk format -------------- .. note:: These docs are written for anndata 0.7. Files written before this version may differ in some conventions, but will still be read by newer versions of the library. AnnData objects are saved on disk to hierarchical array stores like HDF5_ (via :doc:`H5py `) and :doc:`zarr:index`. This allows us to have very similar structures on disk and in memory. In general, `AnnData` objects can hold three kinds of values: (1) dense arrays, (2) sparse arrays, and (3) DataFrames. As an example we’ll look into a typical `.h5ad` object that’s been through an analysis. This structure should be largely equivalent to the Zarr structure, though there are a few minor differences. .. _HDF5: https://en.wikipedia.org/wiki/Hierarchical_Data_Format .. I’ve started using h5py since I couldn’t figure out a nice way to print attributes from bash. >>> import h5py >>> f = h5py.File("02_processed.h5ad", "r") >>> list(f.keys()) ['X', 'layers', 'obs', 'obsm', 'uns', 'var', 'varm'] .. .. code:: bash .. $ h5ls 02_processed.h5ad .. X Group .. layers Group .. obs Group .. obsm Group .. uns Group .. var Group .. varm Group Dense arrays ~~~~~~~~~~~~ Dense arrays have the simplest representation on disk, as they have native equivalents in H5py :doc:`h5py:high/dataset` and Zarr :ref:`Arrays `. We can see an example of this with dimensionality reductions stored in the `obsm` group: >>> f["obsm"].visititems(print) X_pca X_umap .. .. code:: bash .. $ h5ls 02_processed.h5ad/obsm .. X_pca Dataset {38410, 50} .. X_umap Dataset {38410, 2} Sparse arrays ~~~~~~~~~~~~~ Sparse arrays don’t have a native representation in HDF5 or Zarr, so we've defined our own based on their in-memory structure. Currently two sparse data formats are supported by `AnnData` objects, CSC and CSR (corresponding to :class:`scipy.sparse.csc_matrix` and :class:`scipy.sparse.csr_matrix` respectively). These formats represent a two-dimensional sparse array with three one-dimensional arrays, `indptr`, `indices`, and `data`. .. note:: A full description of these formats is out of scope for this document, but they are `easy to find`_. .. _easy to find: https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format) We represent a sparse array as a `Group` on-disk, where the kind and shape of the sparse array are defined in the `Group`'s attributes: >>> dict(f["X"].attrs) {'encoding-type': 'csr_matrix', 'encoding-version': '0.1.0', 'shape': array([38410, 27899])} Inside the group are the three constituent arrays: >>> f["X"].visititems(print) data indices indptr .. .. code:: bash .. $ h5ls 02_processed.h5ad/X .. data Dataset {41459314/Inf} .. indices Dataset {41459314/Inf} .. indptr Dataset {38411/Inf} DataFrames ~~~~~~~~~~ DataFrames are saved in a columnar format in a group, so each column of a DataFrame gets its own dataset. To maintain efficiency with categorical values, only the numeric codes are stored for each row, while the category values are saved in a reserved subgroup `__categories`.
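Purely for illustration (in practice :func:`anndata.read_h5ad` reassembles DataFrames for you), a categorical column could be rebuilt by hand from the stored codes and the values in `__categories`. A minimal sketch, assuming the file `f` opened above and its categorical `cell_type` column:

.. code:: python

    import pandas as pd

    codes = f["obs/cell_type"][:]               # integer codes, one per observation
    cats = f["obs/__categories/cell_type"][:]   # the category values
    # depending on the h5py version, string data may come back as bytes
    cats = [c.decode() if isinstance(c, bytes) else c for c in cats]
    cell_type = pd.Categorical.from_codes(codes, categories=cats)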
Dataframes can be identified from other groups by their attributes: >>> dict(f["obs"].attrs) {'_index': 'Cell', 'column-order': array(['sample', 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'label_by_score'], dtype=object), 'encoding-type': 'dataframe', 'encoding-version': '0.1.0'} These attributes identify the column used as an index, the order of the original columns, and some type information. >>> f["obs"].visititems(print) Cell __categories __categories/cell_type __categories/label_by_score __categories/sample cell_type label_by_score log1p_n_genes_by_counts [...] Categorical Series can be identified by the presence of the attribute `"categories"`, which contains a pointer to the categories' values: >>> dict(f["obs/cell_type"].attrs) {'categories': } .. note:: In `zarr`, as there are no reference objects, the `categories` attribute is an absolute path to the category values. Other values ~~~~~~~~~~~~ Mappings ^^^^^^^^ Mappings are simply stored as `Group` s on disk. These are distinct from DataFrames and sparse arrays since they don’t have any special attributes. A `Group` is created for any `Mapping` in the AnnData object, including the standard `obsm`, `varm`, `layers`, and `uns`. Notably, this definition is used recursively within `uns`: >>> f["uns"].visititems(print) [...] pca pca/variance pca/variance_ratio [...] Scalars ^^^^^^^ Zero dimensional arrays are used for scalar values (i.e. single values like strings, numbers or booleans). These should only occur inside of `uns`, and are common inside of saved parameters: >>> f["uns/neighbors/params"].visititems(print) method metric n_neighbors >>> f["uns/neighbors/params/metric"][()] 'euclidean' anndata-0.7.8/docs/index.rst000066400000000000000000000014751414255741200157410ustar00rootroot00000000000000.. include:: ../README.rst :end-line: 22 .. figure:: https://falexwolf.de/img/scanpy/anndata.svg :width: 350px :class:`~anndata.AnnData` provides a scalable way of keeping track of data and learned annotations. * See `Scanpy's documentation `__ for usage related to single cell data. * Discuss development on `GitHub `_. * Install via `pip install anndata` or `conda install anndata -c conda-forge`. * anndata was initially built for Scanpy `(Genome Biology, 2018) `__. News ---- .. include:: news.rst Latest additions ---------------- .. include:: release-latest.rst .. toctree:: :maxdepth: 1 :hidden: api concatenation fileformat-prose benchmarks release-notes references anndata-0.7.8/docs/news.rst000066400000000000000000000012511414255741200155760ustar00rootroot00000000000000.. role:: small COVID-19 datasets distributed as `h5ad` :small:`2020-04-01` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In a joint initiative, the Wellcome Sanger Institute, the Human Cell Atlas, and the CZI distribute datasets related to COVID-19 via anndata's `h5ad` files: `covid19cellatlas.org `__. It wasn't anticipated that the `initial idea `__ of sharing and backing an on-disk representation of `AnnData` would become so widely adopted. Curious? Read up more on the `format `__. anndata-0.7.8/docs/references.rst000066400000000000000000000012311414255741200167410ustar00rootroot00000000000000References ---------- .. [Hastie09] Hastie *et al.* (2009), *The Elements of Statistical Learning*, `Springer `_. .. 
[Huber15] Huber *et al.* (2015), *Orchestrating high-throughput genomic analysis with Bioconductor*, `Nature Methods `_. .. [Murphy12] Murphy (2012), *Machine Learning: A Probabilistic Perspective*, `MIT Press `_. .. [Wolf18] Wolf *et al.* (2018), *Scanpy: large-scale single-cell gene expression data analysis*, `Genome Biology `_. anndata-0.7.8/docs/release-latest.rst000066400000000000000000000127031414255741200175400ustar00rootroot00000000000000.. role:: small .. role:: smaller 0.7.8 :small:`9 November, 2021` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. rubric:: Bug fixes - Re-include test helpers :pr:`641` :smaller:`I Virshup` 0.7.7 :small:`9 November, 2021` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. rubric:: Bug fixes - Fixed propagation of import error when importing `write_zarr` but not all dependencies are installed :pr:`579` :smaller:`R Hillje` - Fixed issue with `.uns` sub-dictionaries being referenced by copies :pr:`576` :smaller:`I Virshup` - Fixed out-of-bounds integer indices not raising :class:`IndexError` :pr:`630` :smaller:`M Klein` - Fixed backed `SparseDataset` indexing with scipy 1.7.2 :pr:`638` :smaller:`I Virshup` .. rubric:: Development processes - Use PEPs 621 (standardized project metadata), 631 (standardized dependencies), and 660 (standardized editable installs) :pr:`639` :smaller:`I Virshup` 0.7.6 :small:`11 April, 2021` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. rubric:: New features - Added :meth:`anndata.AnnData.to_memory` for returning an in-memory object from a backed one :pr:`470` :pr:`542` :smaller:`V Bergen` :smaller:`I Virshup` - :meth:`anndata.AnnData.write_loom` now writes `obs_names` and `var_names` using the `Index`'s `.name` attribute, if set :pr:`538` :smaller:`I Virshup` .. rubric:: Bug fixes - Fixed bug where `np.str_` column names errored at write time :pr:`457` :smaller:`I Virshup` - Fixed "value.index does not match parent’s axis 0/1 names" error triggered when a data frame is stored in obsm/varm after obs_names/var_names is updated :pr:`461` :smaller:`G Eraslan` - Fixed `adata.write_csvs` when `adata` is a view :pr:`462` :smaller:`I Virshup` - Fixed null values being converted to strings when strings are converted to categorical :pr:`529` :smaller:`I Virshup` - Fixed handling of compression keyword arguments :pr:`536` :smaller:`I Virshup` - Fixed copying a backed `AnnData` from changing which file the original object points at :pr:`533` :smaller:`ilia-kats` - Fixed a bug where calling `AnnData.concatenate` on an `AnnData` with no variables would error :pr:`537` :smaller:`I Virshup` .. rubric:: Deprecations - Passing positional arguments to :func:`anndata.read_loom` besides the path is now deprecated :pr:`538` :smaller:`I Virshup` - :func:`anndata.read_loom` arguments `obsm_names` and `varm_names` are now deprecated in favour of `obsm_mapping` and `varm_mapping` :pr:`538` :smaller:`I Virshup` 0.7.5 :small:`12 November, 2020` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. rubric:: Functionality - Added ipython tab completion and a useful return from `.keys` to `adata.uns` :pr:`415` :smaller:`I Virshup` .. rubric:: Bug fixes - Compatibility with `h5py>=3` strings :pr:`444` :smaller:`I Virshup` - Allow `adata.raw = None`, as is documented :pr:`447` :smaller:`I Virshup` - Fix warnings from pandas 1.1 :pr:`425` :smaller:`I Virshup` 0.7.4 :small:`10 July, 2020` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ..
rubric:: Concatenation overhaul :pr:`378` :smaller:`I Virshup` - New function :func:`anndata.concat` for concatenating `AnnData` objects along either observations or variables - New documentation section: :doc:`concatenation` .. rubric:: Functionality - AnnData objects created from dataframes with sparse values will have sparse `.X` :pr:`395` :smaller:`I Virshup` .. rubric:: Bug fixes - Fixed error from `AnnData.concatenate` by bumping minimum versions of numpy and pandas :issue:`385` - Fixed colors being incorrectly changed when `AnnData` object was subset :pr:`388` 0.7.3 :small:`20 May, 2020` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. rubric:: Bug fixes - Fixed bug where graphs used too much memory when copying :pr:`381` :smaller:`I Virshup` 0.7.2 :small:`15 May, 2020` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. rubric:: Concatenation overhaul :smaller:`I Virshup` - Elements of `uns` can now be merged, see :pr:`350` - Outer joins now work for `layers` and `obsm`, see :pr:`352` - Fill value for outer joins can now be specified - Expect improvements in performance, see :issue:`303` .. rubric:: Functionality - :attr:`~anndata.AnnData.obsp` and :attr:`~anndata.AnnData.varp` can now be transposed :pr:`370` :smaller:`A Wolf` - :meth:`~anndata.AnnData.obs_names_make_unique` is now better at making values unique, and will warn if ambiguities arise :pr:`345` :smaller:`M Weiden` - :attr:`~anndata.AnnData.obsp` is now preferred for storing pairwise relationships between observations. In practice, this means there will be deprecation warnings and reformatting applied to objects which stored connectivities under `uns["neighbors"]`. Square matrices in :attr:`~anndata.AnnData.uns` will no longer be sliced (use `.{obs,var}p` instead). :pr:`337` :smaller:`I Virshup` - :class:`~anndata.ImplicitModificationWarning` is now exported :pr:`315` :smaller:`P Angerer` - Better support for :class:`~numpy.ndarray` subclasses stored in `AnnData` objects :pr:`335` :smaller:`michalk8` .. rubric:: Bug fixes - Fixed inplace modification of :class:`~pandas.Index` objects by the make unique function :pr:`348` :smaller:`I Virshup` - Passing ambiguous keys to :meth:`~anndata.AnnData.obs_vector` and :meth:`~anndata.AnnData.var_vector` now throws errors :pr:`340` :smaller:`I Virshup` - Fix instantiating :class:`~anndata.AnnData` objects from :class:`~pandas.DataFrame` :pr:`316` :smaller:`P Angerer` - Fixed indexing into `AnnData` objects with arrays like `adata[adata[:, gene].X > 0]` :pr:`332` :smaller:`I Virshup` - Fixed type of version :pr:`315` :smaller:`P Angerer` - Fixed deprecated import from :mod:`pandas` :pr:`319` :smaller:`P Angerer` anndata-0.7.8/docs/release-notes.rst000066400000000000000000000137521414255741200174010ustar00rootroot00000000000000Release Notes ============= .. role:: small .. role:: smaller .. role:: noteversion .. include:: _key_contributors.rst .. rubric:: Upcoming changes - :attr:`~anndata.AnnData.layers` and :attr:`~anndata.AnnData.X` will be unified. - :attr:`~anndata.AnnData.filename` and :attr:`~anndata.AnnData.isbacked` will be unified under a new name. - The types of :attr:`~anndata.AnnData.raw`, :attr:`~anndata.AnnData.layers`, :attr:`~anndata.AnnData.obsm`, :attr:`~anndata.AnnData.varm`, :attr:`~anndata.AnnData.obsp` and :attr:`~anndata.AnnData.varp` will be exported. Version 0.7 ----------- .. include:: release-latest.rst 0.7.0 :small:`22 January, 2020` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ..
warning:: Breaking changes introduced between `0.6.22.post1` and `0.7`: - Elements of :class:`~anndata.AnnData`\ s don’t have their dimensionality reduced when the main object is subset. This is to maintain consistency when subsetting. See discussion in :issue:`145`. - Internal modules like `anndata.core` are private and their contents are not stable: See :issue:`174`. - The old deprecated attributes `.smp*`, `.add` and `.data` have been removed. .. rubric:: View overhaul :pr:`164` - Indexing into a view no longer keeps a reference to the intermediate view, see :issue:`62`. - Views are now lazy. Elements of a view of an AnnData are not indexed until they’re accessed. - Indexing with scalars no longer reduces dimensionality of contained arrays, see :issue:`145`. - All elements of AnnData should now follow the same rules about how they’re subset, see :issue:`145`. - Can now index by observations and variables at the same time. .. rubric:: IO overhaul :pr:`167` - Reading and writing have been overhauled for simplification and speed. - Time and memory usage can be half of what they previously were in typical use cases - Zarr backend now supports sparse arrays, and generally is closer to having the same features as HDF5. - Backed mode should see significant speed and memory improvements for access along compressed dimensions and IO. PR :pr:`241`. - :class:`~pandas.Categorical`\ s can now be ordered (PR :pr:`230`) and written to disk with a large number of categories (PR :pr:`217`). .. rubric:: Mapping attributes overhaul :smaller:`(obsm, varm, layers, ...)` - New attributes :attr:`~anndata.AnnData.obsp` and :attr:`~anndata.AnnData.varp` have been added for two dimensional arrays where each axis corresponds to a single axis of the AnnData object. PR :pr:`207`. - These are intended to store values like cell-by-cell graphs, which are currently stored in :attr:`~anndata.AnnData.uns`. - Sparse arrays are now allowed as values in all mapping attributes. - DataFrames are now allowed as values in :attr:`~anndata.AnnData.obsm` and :attr:`~anndata.AnnData.varm`. - All mapping attributes now share an implementation and will have the same behaviour. PR :pr:`164`. .. rubric:: Miscellaneous improvements - Mapping attributes now have ipython tab completion (e.g. `adata.obsm["\\t` can provide suggestions) PR :pr:`183`. - :class:`~anndata.AnnData` attributes are now delete-able (e.g. `del adata.raw`) PR :pr:`242`. - Many many bug fixes Version 0.6 ----------- 0.6.* :small:`2019-*-*` ~~~~~~~~~~~~~~~~~~~~~~~~~ - better support for aligned mappings (obsm, varm, layers) :noteversion:`0.6.22` :pr:`155` :smaller:`I Virshup` - convenience accessors :func:`~anndata.AnnData.obs_vector`, :func:`~anndata.AnnData.var_vector` for 1d arrays. :noteversion:`0.6.21` :pr:`144` :smaller:`I Virshup` - compatibility with Scipy >=1.3 by removing `IndexMixin` dependency. :noteversion:`0.6.20` :pr:`151` :smaller:`P Angerer` - bug fix for second-indexing into views. :noteversion:`0.6.19` :smaller:`P Angerer` - bug fix for reading excel files. :noteversion:`0.6.19` :smaller:`A Wolf` - changed default compression to `None` in :func:`~anndata.AnnData.write_h5ad` to speed up read and write, disk space use is usually less critical. :noteversion:`0.6.16` :smaller:`A Wolf` - maintain dtype upon copy. :noteversion:`0.6.13` :smaller:`A Wolf` - :attr:`~anndata.AnnData.layers` inspired by `.loom`_ files allows lossless reading of their information via :func:`~anndata.read_loom`.
:noteversion:`0.6.7`–:noteversion:`0.6.9` :pr:`46` & :pr:`48` :smaller:`S Rybakov` - support for reading zarr files: :func:`~anndata.read_zarr` :noteversion:`0.6.7` :pr:`38` :smaller:`T White` - initialization from pandas DataFrames :noteversion:`0.6.` :smaller:`A Wolf` - iteration over chunks :func:`~anndata.AnnData.chunked_X` and :func:`~anndata.AnnData.chunk_X` :noteversion:`0.6.1` :pr:`20` :smaller:`S Rybakov` 0.6.0 :small:`1 May, 2018` ~~~~~~~~~~~~~~~~~~~~~~~~~~ - compatibility with Seurat converter - tremendous speedup for :func:`~anndata.AnnData.concatenate` - bug fix for deep copy of unstructured annotation after slicing - bug fix for reading HDF5 stored single-category annotations - `'outer join'` concatenation: adds zeros for concatenation of sparse data and nans for dense data - better memory efficiency in loom exports Version 0.5 ----------- 0.5.0 :small:`9 February, 2018` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - inform about duplicates in :class:`~anndata.AnnData.var_names` and resolve them using :func:`~anndata.AnnData.var_names_make_unique` - automatically remove unused categories after slicing - read/write `.loom`_ files using loompy 2 - fixed read/write for a few text file formats - read `UMI tools`_ files: :func:`~anndata.read_umi_tools` .. _UMI tools: https://github.com/CGATOxford/UMI-tools Version 0.4 ----------- 0.4.0 :small:`23 December, 2017` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - read/write `.loom`_ files - scalability beyond dataset sizes that fit into memory: see this `blog post`_ - :class:`~anndata.AnnData` has a :class:`~anndata.AnnData.raw` attribute, which simplifies storing the data matrix when you consider it *raw*: see the `clustering tutorial`_ .. _.loom: http://loompy.org .. _blog post: http://falexwolf.de/blog/171223_AnnData_indexing_views_HDF5-backing/ .. _clustering tutorial: https://github.com/theislab/scanpy_usage/tree/master/170505_seurat anndata-0.7.8/pyproject.toml000066400000000000000000000055661414255741200160710ustar00rootroot00000000000000[build-system] build-backend = "flit_core.buildapi" requires = [ "flit_core >=3.4,<4", "setuptools_scm", "importlib_metadata>=0.7; python_version < '3.8'", ] [project] name = "anndata" description = "Annotated data." 
requires-python = ">=3.6" license = {file = "LICENSE"} authors = [ {name = "Philipp Angerer"}, {name = "Alex Wolf"}, {name = "Isaac Virshup"}, {name = "Sergei Rybakov"}, ] maintainers = [ {name = "Isaac Virshup", email = "ivirshup@gmail.com"}, {name = "Philipp Angerer", email = "philipp.angerer@helmholtz-muenchen.de"}, {name = "Alex Wolf", email = "f.alex.wolf@gmx.de"}, ] readme = {file = "README.rst", content-type="text/x-rst"} classifiers = [ "License :: OSI Approved :: BSD License", "Environment :: Console", "Framework :: Jupyter", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "Natural Language :: English", "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Visualization", ] dependencies = [ "pandas>=1.1.1", # pandas <1.1.1 has pandas/issues/35446 "numpy>=1.16.5", # required by pandas 1.x "scipy>1.4", "h5py", "natsort", "packaging>=20", "xlrd<2.0", # xlsx format not supported anymore from v2.0, see pandas/issues/38524 # for getting the stable version "importlib_metadata>=0.7; python_version < '3.8'", ] dynamic = ["version"] [project.urls] Documentation = "https://anndata.readthedocs.io/" Source = "https://github.com/theislab/anndata" Home-page = "https://github.com/theislab/anndata" [project.optional-dependencies] dev = [ # dev version generation "setuptools_scm", # static checking "black>=20.8b1", "docutils", ] doc = [ "sphinx>=4.1,<4.2", "sphinx-rtd-theme", "sphinx-autodoc-typehints>=1.11.0", "sphinx_issues", "scanpydoc>=0.7.3", "typing_extensions; python_version < '3.8'", ] test = [ "loompy>=3.0.5", "pytest>=6.0", "pytest-cov>=2.10", "zarr", "matplotlib", "sklearn", "openpyxl", "joblib", "boltons", "scanpy", ] [tool.flit.sdist] exclude = [ 'anndata/tests/test_*.py', 'anndata/tests/data', ] [tool.coverage.run] source = ["anndata"] omit = [ "setup.py", "versioneer.py", "anndata/_version.py", "**/test_*.py", ] [tool.pytest.ini_options] addopts = "--doctest-modules" python_files = "test_*.py" testpaths = ["anndata", "docs/concatenation.rst"] xfail_strict = true [tool.black] line-length = 88 target-version = ["py36"] exclude = "^/build/.*$"