pax_global_header00006660000000000000000000000064144215420630014513gustar00rootroot0000000000000052 comment=6a9cca80725100e33b09511459a9053b195e3535 unifrac-1.3/000077500000000000000000000000001442154206300130055ustar00rootroot00000000000000unifrac-1.3/.github/000077500000000000000000000000001442154206300143455ustar00rootroot00000000000000unifrac-1.3/.github/workflows/000077500000000000000000000000001442154206300164025ustar00rootroot00000000000000unifrac-1.3/.github/workflows/main.yml000066400000000000000000000157011442154206300200550ustar00rootroot00000000000000name: UniFrac CI on: push: branches: [ master ] pull_request: branches: [ master ] # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: flake8 uses: actions/setup-python@v3 with: python-version: 3.9 - name: install dependencies run: python -m pip install --upgrade pip - name: lint run: | pip install -q flake8 flake8 unifrac setup.py build-and-test: needs: lint strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11'] os: [ubuntu-latest, macos-latest, linux-gpu-cuda] exclude: - os: macos-latest python-version: '3.9' - os: macos-latest python-version: '3.10' - os: linux-gpu-cuda python-version: '3.8' - os: linux-gpu-cuda python-version: '3.9' runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 - uses: conda-incubator/setup-miniconda@v2 with: miniconda-version: "latest" auto-update-conda: true python-version: ${{ matrix.python-version }} - name: Install shell: bash -l {0} run: | conda create --yes -n unifrac -c conda-forge -c bioconda python=${{ matrix.python-version }} conda activate unifrac conda config --add channels conda-forge conda config --add channels bioconda if [[ "$(uname -s)" == "Linux" ]]; then conda install --yes -c conda-forge -c bioconda gxx_linux-64 else conda install --yes -c conda-forge -c bioconda clangxx_osx-64 fi conda install --yes -c conda-forge -c bioconda "unifrac-binaries>=1.2" conda install --yes -c conda-forge -c bioconda cython biom-format numpy "h5py>3.3.0" "scikit-bio>=0.5.8" nose echo "$(uname -s)" if [[ "$(uname -s)" == "Linux" ]]; then which x86_64-conda-linux-gnu-gcc x86_64-conda-linux-gnu-gcc -v x86_64-conda-linux-gnu-g++ -v else which clang clang -v fi which h5c++ pip install iow pip install -e . - name: Tests shell: bash -l {0} run: | conda activate unifrac # keep it low for runs in containers # and a weird number to potentially catch potential bugs export OMP_NUM_THREADS=3 # diagnostic messages for debugging, if needed export UNIFRAC_GPU_INFO=Y ls -lrt $CONDA_PREFIX/lib/libhdf5_cpp* nosetests - name: Sanity checks shell: bash -l {0} run: | conda activate unifrac # keep it low for runs in containers # and a weird number to potentially catch potential bugs export OMP_NUM_THREADS=3 # diagnostic messages for debugging, if needed export UNIFRAC_GPU_INFO=Y set -e ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm -m unweighted python -c "import skbio; dm = skbio.DistanceMatrix.read('ci/test.dm')" pushd unifrac/tests export UNIFRAC_TIMING_INFO=Y python -c "import unifrac; unifrac.unweighted_to_file('data/crawford.biom','data/crawford.tre','../../ci/test.dm.h5')" python -c "import unifrac,skbio; dm_u=unifrac.unweighted('data/crawford.biom','data/crawford.tre'); dm = skbio.DistanceMatrix.read('../../ci/test.dm'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import unifrac; unifrac.unweighted_to_file('data/crawford.biom','data/crawford.tre','../../ci/test2.dm.h5',permanova_perms=99,grouping_filename='data/crawford.group.tsv',grouping_columns='Treatment')" python -c "import unifrac; unifrac.weighted_normalized_to_file('data/crawford.biom','data/crawford.tre','../../ci/test3.dm.h5',subsample_depth=2,pcoa_dims=2)" export UNIFRAC_TIMING_INFO=N popd python -c "import h5py,skbio; f_u=h5py.File('ci/test.dm.h5','r'); dm_u=skbio.stats.distance.DistanceMatrix(f_u['matrix'][:,:],f_u['order'][:])" python -c "import h5py,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); f_u=h5py.File('ci/test.dm.h5','r'); dm_u=skbio.stats.distance.DistanceMatrix(f_u['matrix'][:,:],f_u['order'][:]); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import h5py,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); f_u=h5py.File('ci/test2.dm.h5','r'); dm_u=skbio.stats.distance.DistanceMatrix(f_u['matrix'][:,:],f_u['order'][:]); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import h5py; f_u=h5py.File('ci/test2.dm.h5','r'); print(f_u.keys()); assert len(f_u['stat_methods'][:]) == 1" python -c "import h5py; f_u=h5py.File('ci/test3.dm.h5','r'); print(f_u.keys()); assert len(f_u['pcoa_eigvals'][:]) == 2" # repeat using unifrac's h5 interfaces python -c "import unifrac; dm_u=unifrac.h5unifrac('ci/test.dm.h5'); dm_l=unifrac.h5unifrac_all('ci/test.dm.h5')" python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac_all('ci/test.dm.h5')[0]; t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test2.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import unifrac; st_l=unifrac.h5permanova_dict('ci/test2.dm.h5'); assert len(st_l) == 1" python -c "import unifrac; pc=unifrac.h5pcoa('ci/test3.dm.h5'); print(pc); assert len(pc.eigvals) == 2" if [[ "$(uname -s)" == "Linux" ]]; then MD5=md5sum else MD5='md5 -r' fi ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm.start0.stop3 -m unweighted --mode partial --start 0 --stop 3 ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm.start3.stop5 -m unweighted --mode partial --start 3 --stop 5 ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm.partial --mode merge-partial --partial-pattern "ci/test.dm.start*" exp=$($MD5 ci/test.dm | awk '{ print $1 }') obs=$($MD5 ci/test.dm.partial | awk '{ print $1 }') python -c "assert '${obs}' == '${exp}'" faithpd -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.faith.obs tail -n +2 ci/test.faith.obs > ci/test.faith.header-removed.obs exp1=$($MD5 unifrac/tests/data/test.faith.exp | awk '{ print $1 }') obs1=$($MD5 ci/test.faith.header-removed.obs | awk '{ print $1 }') python -c "assert '${obs1}' == '${exp1}'" unifrac-1.3/.gitignore000066400000000000000000000020741442154206300150000ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject # vim *.swp *.swo *~ # ssu sucpp/ssu unifrac-1.3/.gitmodules000066400000000000000000000000001442154206300151500ustar00rootroot00000000000000unifrac-1.3/LICENSE000066400000000000000000000027721442154206300140220ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2016-2021, UniFrac development team. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. unifrac-1.3/MANIFEST.in000066400000000000000000000001671442154206300145470ustar00rootroot00000000000000graft unifrac global-exclude *.pyc global-exclude *.pyo global-exclude .git global-exclude *.so global-exclude .*.swp unifrac-1.3/README.md000066400000000000000000000546061442154206300142770ustar00rootroot00000000000000# UniFrac ##### Canonically pronounced *yew-nih-frak* [![Build Status](https://travis-ci.com/biocore/unifrac.svg?branch=master)](https://travis-ci.com/biocore/unifrac) The *de facto* repository for high-performance phylogenetic diversity calculations. The methods in this repository are based on an implementation of the [Strided State UniFrac](https://www.nature.com/articles/s41592-018-0187-8) algorithm which is faster, and uses less memory than [Fast UniFrac](http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html). Strided State UniFrac supports [Unweighted UniFrac](http://aem.asm.org/content/71/12/8228.abstract), [Weighted UniFrac](http://aem.asm.org/content/73/5/1576), [Generalized UniFrac](https://academic.oup.com/bioinformatics/article/28/16/2106/324465/Associating-microbiome-composition-with), [Variance Adjusted UniFrac](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-118) and [meta UniFrac](http://www.pnas.org/content/105/39/15076.short), in both double and single precision (fp32). This repository also includes Stacked Faith (manuscript in preparation), a method for calculating Faith's PD that is faster and uses less memory than the Fast UniFrac-based [reference implementation](http://scikit-bio.org/). This repository produces the Python interface against the C API exposed via a shared library provided by the dependent [unifrac-binaries](https://github.com/biocore/unifrac-binaries) repository. # Citation A original description of the Strided State UniFrac algorithm can be found in [McDonald et al. 2018 Nature Methods](https://www.nature.com/articles/s41592-018-0187-8) with further improvements available in [Sfiligoi et al. mSystems 2022](https://www.doi.org/10.1128/msystems.00028-22). Please note that this package implements multiple UniFrac variants, which may have their own citation. Details can be found in the help output from the command line interface in the citations section, and is included immediately below: ssu For UniFrac, please see: Sfiligoi et al. mSystems 2022; DOI: 10.1128/msystems.00028-22 McDonald et al. Nature Methods 2018; DOI: 10.1038/s41592-018-0187-8 Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005 Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06 Hamady et al. ISME 2010; DOI: 10.1038/ismej.2009.97 Lozupone et al. ISME 2011; DOI: 10.1038/ismej.2010.133 For Generalized UniFrac, please see: Chen et al. Bioinformatics 2012; DOI: 10.1093/bioinformatics/bts342 For Variance Adjusted UniFrac, please see: Chang et al. BMC Bioinformatics 2011; DOI: 10.1186/1471-2105-12-118 faithpd For Faith's PD, please see: Faith Biological Conservation 1992; DOI: 10.1016/0006-3207(92)91201-3 # Install At this time, there are three primary ways to install the library. The first is through QIIME2, the second is through `bioconda`, and the third is via `pip`. It is also possible to clone the repository and install the python bindings with `setup.py`. Compilation has been performed on both clang 16.0 (OS X) or gcc 12.2 (Ubuntu) and HDF5 >= 1.8.17. Python installation requires Python >= 3.8, NumPy >= 1.12.1, scikit-bio >= 0.5.8, and Cython >= 0.28.3. Installation time should be a few minutes at most. ## Install (example) An example of installing UniFrac, and using it with CPUs as well as GPUs, can be be found on [Google Colabs](https://colab.research.google.com/drive/1yL0MdF1zNAkPg1_yESI1iABUH4ZHNGwj?usp=sharing). ## Install (QIIME2) The easiest way to use this library is through [QIIME2](https://docs.qiime2.org/2019.7/install/). This library is installed by default with the QIIME 2 Core Distribution. Currently, this module is used for phylogenetic diversity calculations in `qiime diversity beta-phylogenetic` for UniFrac and `qiime diversity alpha-phylogenetic-alt` for Faith's PD. If installing a newer version of UniFrac into an existing QIIME 2 environment, it is necessary to construct a "throwaway" conda environment, and force the install. An example is below, based on the observations [here](https://github.com/caporaso-lab/pretrained-feature-classifiers/pull/6#issuecomment-586023587): ``` conda create -n throwaway -c bioconda -c conda-forge conda-forge::python=3.8 unifrac unifrac-binaries conda list -n throwaway --explicit | grep 'EXPLICIT\|unifrac\|hdf5\|h5py\|lapack' > packages.txt conda install -n qiime2-2022.2 --file packages.txt ``` ## Install (bioconda) This library can also be installed via a combination of `conda-forge` and `bioconda`: ``` conda create --name unifrac -c conda-forge -c bioconda unifrac pip install iow ``` ## Install (pip) ``` pip install unifrac iow ``` ## Install (native) To install, first the cython wrappers must be compiled. It also needs the libssu library to be present. Assuming the compiler is in your path, the following should work: pip install -e . **Note**: if you are using `conda` we recommend installing the compiler and libssu using the `biooconda` channel, for example: conda install -c conda-forge -c bioconda gxx_linux-64 unifrac-binaries # Environment considerations ## Multi-core support Unifrac uses OpenMP to make use of multiple CPU cores. By default, Unifrac will use all the cores that are available on the system. To restrict the number of cores used, set: export OMP_NUM_THREADS=nthreads ## GPU support On Linux platforms, Unifrac will run on a GPU, if one is found. To disable GPU offload, and thus force CPU-only execution, one can set: export UNIFRAC_USE_GPU=N To check which code path is used (Unifrac will print it to standard output at runtime), set: export UNIFRAC_GPU_INFO=Y Finally, Unifrac will only use one GPU at a time. If more than one GPU is present, one can select the one to use by setting: export ACC_DEVICE_NUM=gpunum Note that there is no GPU support for MacOS. # Examples of use Below are a few light examples of different ways to use this library. ## QIIME2 To use Strided State UniFrac through QIIME2, you need to provide a `FeatureTable[Frequency]` and a `Phylogeny[Rooted]` artifacts. An example of use is: qiime diversity beta-phylogenetic --i-table table-evenly-sampled.qza \ --i-phylogeny a-tree.qza \ --o-distance-matrix resulting-distance-matrix.qza \ --p-metric unweighted_unifrac To use Stacked Faith through QIIME2, given similar artifacts, you can use: qiime diversity alpha-phylogenetic-alt --i-table table-evenly-sampled.qza \ --i-phylogeny a-tree.qza \ --o-alpha-diversity resulting-diversity-series.qza \ --p-metric faith_Pd ## Python The library can be accessed directly from within Python. If operating in this mode, the API methods are expecting a filepath to a BIOM-Format V2.1.0 table, and a filepath to a Newick formatted phylogeny. $ python Python 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:23:14) [GCC 10.4.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import unifrac >>> dir(unifrac) ['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods', 'faith_pd', 'generalized', 'generalized_fp32', 'generalized_fp32_to_file', 'generalized_fp64', 'generalized_fp64_to_file', 'generalized_to_file', 'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'h5unifrac', 'h5unifrac_all', 'meta', 'pkg_resources', 'set_random_seed', 'ssu', 'ssu_fast', 'ssu_inmem', 'ssu_to_file', 'ssu_to_file_v2', 'unweighted', 'unweighted_fp32', 'unweighted_fp32_to_file', 'unweighted_fp64', 'unweighted_fp64_to_file', 'unweighted_to_file', 'weighted_normalized', 'weighted_normalized_fp32', 'weighted_normalized_fp32_to_file', 'weighted_normalized_fp64', 'weighted_normalized_fp64_to_file', 'weighted_normalized_to_file', 'weighted_unnormalized', 'weighted_unnormalized_fp32', 'weighted_unnormalized_fp32_to_file', 'weighted_unnormalized_fp64', 'weighted_unnormalized_fp64_to_file', 'weighted_unnormalized_to_file'] >>> print(unifrac.unweighted.__doc__) Compute Unweighted UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). >>> print(unifrac.unweighted_to_file.__doc__) Compute Unweighted UniFrac and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5" if n_subsamples<=1 else "hdf5_nodist" buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). >>> print(unifrac.faith_pd.__doc__) Execute a call to the Stacked Faith API in the UniFrac package Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree Returns ------- pd.Series Series of Faith's PD for each sample in `biom_filename` Raises ------ IOError If the tree file is not found If the table is not found If the table is empty >>> print(unifrac.h5unifrac.__doc__) Read UniFrac from a hdf5 file Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- skbio.DistanceMatrix The distance matrix. Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). >>> print(unifrac.h5pcoa.__doc__) Read PCoA from a hdf5 file Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- skbio.OrdinationResults The PCoA of the distance matrix Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields >>> print(unifrac.h5permanova_dict.__doc__) Read PERMANOVA statistical tests from a hdf5 file As describe in scikit-bio skbio.stats.distance.permanova.py, Permutational Multivariate Analysis of Variance (PERMANOVA) is a non-parametric method that tests whether two or more groups of objects are significantly different based on a categorical factor. Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- dict[str]=pandas.Series Results of the statistical test, including ``test statistic`` and ``p-value``. Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields References ---------- .. [1] Anderson, Marti J. "A new method for non-parametric multivariate analysis of variance." Austral Ecology 26.1 (2001): 32-46. ## Command line The methods can also be used directly through the command line after install of the dependent [unifrac-binaries](https://github.com/biocore/unifrac-binaries) package: $ which ssu /Users//miniconda3/envs/qiime2-20xx.x/bin/ssu $ ssu --help usage: ssu -i -o -m [METHOD] -t [-a alpha] [-f] [--vaw] [--mode MODE] [--start starting-stripe] [--stop stopping-stripe] [--partial-pattern ] [--n-partials number_of_partitions] [--report-bare] [--format|-r out-mode] [--n-substeps n] [--pcoa dims] [--diskbuf path] -i The input BIOM table. -t The input phylogeny in newick. -m The method, [unweighted | weighted_normalized | weighted_unnormalized | generalized | unweighted_fp32 | weighted_normalized_fp32 | weighted_unnormalized_fp32 | generalized_fp32]. -o The output distance matrix. -a [OPTIONAL] Generalized UniFrac alpha, default is 1. -f [OPTIONAL] Bypass tips, reduces compute by about 50%. --vaw [OPTIONAL] Variance adjusted, default is to not adjust for variance. --mode [OPTIONAL] Mode of operation: one-off : [DEFAULT] compute UniFrac. partial : Compute UniFrac over a subset of stripes. partial-report : Start and stop suggestions for partial compute. merge-partial : Merge partial UniFrac results. --start [OPTIONAL] If mode==partial, the starting stripe. --stop [OPTIONAL] If mode==partial, the stopping stripe. --partial-pattern [OPTIONAL] If mode==merge-partial, a glob pattern for partial outputs to merge. --n-partials [OPTIONAL] If mode==partial-report, the number of partitions to compute. --report-bare [OPTIONAL] If mode==partial-report, produce barebones output. --n-substeps [OPTIONAL] Internally split the problem in n substeps for reduced memory footprint, default is 1. --format|-r [OPTIONAL] Output format: ascii : [DEFAULT] Original ASCII format. hfd5 : HFD5 format. May be fp32 or fp64, depending on method. hdf5_fp32 : HFD5 format, using fp32 precision. hdf5_fp64 : HFD5 format, using fp64 precision. --pcoa [OPTIONAL] Number of PCoA dimensions to compute (default: 10, do not compute if 0) --diskbuf [OPTIONAL] Use a disk buffer to reduce memory footprint. Provide path to a fast partition (ideally NVMe). -n [OPTIONAL] DEPRECATED, no-op. Environment variables: CPU parallelism is controlled by OMP_NUM_THREADS. If not defined, all detected core will be used. GPU offload can be disabled with UNIFRAC_USE_GPU=N. By default, if a NVIDIA GPU is detected, it will be used. A specific GPU can be selected with ACC_DEVICE_NUM. If not defined, the first GPU will be used. Citations: For UniFrac, please see: Sfiligoi et al. mSystems 2022; DOI: 10.1128/msystems.00028-22 McDonald et al. Nature Methods 2018; DOI: 10.1038/s41592-018-0187-8 Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005 Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06 Hamady et al. ISME 2010; DOI: 10.1038/ismej.2009.97 Lozupone et al. ISME 2011; DOI: 10.1038/ismej.2010.133 For Generalized UniFrac, please see: Chen et al. Bioinformatics 2012; DOI: 10.1093/bioinformatics/bts342 For Variance Adjusted UniFrac, please see: Chang et al. BMC Bioinformatics 2011; DOI: 10.1186/1471-2105-12-118 $ which faithpd /Users//miniconda3/envs/qiime2-20xx.x/bin/faithpd $ faithpd --help usage: faithpd -i -t -o -i The input BIOM table. -t The input phylogeny in newick. -o The output series. Citations: For Faith's PD, please see: Faith Biological Conservation 1992; DOI: 10.1016/0006-3207(92)91201-3 ## Minor test dataset A small test `.biom` and `.tre` can be found in `unifrac/tests/data/`. An example with expected output is below, and should execute in 10s of milliseconds: $ python Python 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:23:14) [GCC 10.4.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import unifrac >>> d=unifrac.unweighted('unifrac/tests/data/crawford.biom','unifrac/tests/data/crawford.tre') >>> d.data array([[0. , 0.71836066, 0.7131736 , 0.6974604 , 0.6258721 , 0.7282667 , 0.72065896, 0.7264058 , 0.7360605 ], [0.71836066, 0. , 0.7030297 , 0.734073 , 0.6548042 , 0.71547383, 0.7839781 , 0.723184 , 0.7613893 ], [0.7131736 , 0.7030297 , 0. , 0.6104128 , 0.623313 , 0.71848303, 0.7041634 , 0.75258476, 0.7924903 ], [0.6974604 , 0.734073 , 0.6104128 , 0. , 0.6439278 , 0.7005273 , 0.6983272 , 0.77818936, 0.72959894], [0.6258721 , 0.6548042 , 0.623313 , 0.6439278 , 0. , 0.75782686, 0.7100514 , 0.75065047, 0.7894437 ], [0.7282667 , 0.71547383, 0.71848303, 0.7005273 , 0.75782686, 0. , 0.63593644, 0.71283615, 0.5831464 ], [0.72065896, 0.7839781 , 0.7041634 , 0.6983272 , 0.7100514 , 0.63593644, 0. , 0.6920076 , 0.6897206 ], [0.7264058 , 0.723184 , 0.75258476, 0.77818936, 0.75065047, 0.71283615, 0.6920076 , 0. , 0.7151408 ], [0.7360605 , 0.7613893 , 0.7924903 , 0.72959894, 0.7894437 , 0.5831464 , 0.6897206 , 0.7151408 , 0. ]], dtype=float32) unifrac-1.3/ci/000077500000000000000000000000001442154206300134005ustar00rootroot00000000000000unifrac-1.3/ci/linux-64.txt000066400000000000000000000000571442154206300155310ustar00rootroot00000000000000cython flake8 nose scikit-bio biom-format h5py unifrac-1.3/setup.py000066400000000000000000000063201442154206300145200ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, QIIME 2 development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- from setuptools import setup, find_packages from setuptools.extension import Extension from setuptools.command.build_ext import build_ext as build_ext_orig import numpy as np import subprocess import os import sys PREFIX = os.environ.get('PREFIX', "") base = ["cython >= 0.26", "biom-format", "numpy", "h5py >= 3.3.0", "scikit-bio >= 0.5.8", "iow"] test = ["nose", "flake8"] all_deps = base + test # https://stackoverflow.com/a/33308902/379593 if sys.platform == 'darwin': os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.12' def compile_ssu(): """Clean and compile the SSU binary""" to_link = ["unifrac/task_parameters.hpp", "unifrac/api.hpp", "unifrac/status_enum.hpp"] # clean the target cmd = ["rm", "-f"] + to_link ret = subprocess.call(cmd) if ret != 0: raise Exception('Error removing temp unifrac files!') for f in to_link: # link to files from conda cmd = ["ln", "-s", os.environ.get('CONDA_PREFIX') + '/include/' + f, "unifrac/"] ret = subprocess.call(cmd) if ret != 0: raise Exception('Error removing linking unifrac files!') class build_ext(build_ext_orig): """Pre-installation for any time an Extension is built""" def run(self): self.run_compile_ssu() super().run() def run_compile_ssu(self): self.execute(compile_ssu, [], 'Compiling SSU') if sys.platform == "darwin": LINK_ARGS = ['-Wl,' + os.environ.get('CONDA_PREFIX') + '/lib/libssu.so'] else: LINK_ARGS = [] COMPILE_ARGS = [] if 'CONDA_PREFIX' in os.environ: CONDA_INCLUDES = [os.environ.get('CONDA_PREFIX') + '/include'] else: CONDA_INCLUDES = [] USE_CYTHON = os.environ.get('USE_CYTHON', True) ext = '.pyx' if USE_CYTHON else '.cpp' extensions = [Extension("unifrac._api", sources=["unifrac/_api" + ext], extra_link_args=LINK_ARGS, extra_compile_args=COMPILE_ARGS, include_dirs=([np.get_include()] + CONDA_INCLUDES), libraries=['ssu'])] if USE_CYTHON: from Cython.Build import cythonize extensions = cythonize(extensions) with open('README.md') as f: long_description = f.read() setup( name="unifrac", version="1.3.0", packages=find_packages(), author="Daniel McDonald", license='BSD-3-Clause', author_email="wasade@gmail.com", url="https://github.com/biocore/unifrac", description="High performance phylogenetic diversity calculations", long_description=long_description, long_description_content_type='text/markdown', ext_modules=extensions, install_requires=base, extras_require={'test': test, 'all': all_deps}, cmdclass={'build_ext': build_ext}, package_data={ 'unifrac.tests': ['data/*', ]} ) unifrac-1.3/unifrac/000077500000000000000000000000001442154206300144345ustar00rootroot00000000000000unifrac-1.3/unifrac/__init__.py000066400000000000000000000061071442154206300165510ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2020, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import pkg_resources from unifrac._methods import (unweighted, weighted_normalized, weighted_unnormalized, generalized, unweighted_fp64, weighted_normalized_fp64, weighted_unnormalized_fp64, generalized_fp64, unweighted_fp32, weighted_normalized_fp32, weighted_unnormalized_fp32, generalized_fp32, unweighted_to_file, weighted_normalized_to_file, weighted_unnormalized_to_file, generalized_to_file, unweighted_fp64_to_file, weighted_normalized_fp64_to_file, weighted_unnormalized_fp64_to_file, generalized_fp64_to_file, unweighted_fp32_to_file, weighted_normalized_fp32_to_file, weighted_unnormalized_fp32_to_file, generalized_fp32_to_file, meta, h5unifrac, h5unifrac_all, h5pcoa, h5pcoa_all, h5permanova, h5permanova_dict) from unifrac._api import ssu, ssu_fast, faith_pd, set_random_seed from unifrac._api import ssu_to_file, ssu_to_file_v2, ssu_inmem __version__ = pkg_resources.get_distribution('unifrac').version __all__ = ['unweighted', 'weighted_normalized', 'weighted_unnormalized', 'generalized', 'unweighted_fp64', 'weighted_normalized_fp64', 'weighted_unnormalized_fp64', 'generalized_fp64', 'unweighted_fp32', 'weighted_normalized_fp32', 'weighted_unnormalized_fp32', 'generalized_fp32', 'meta', 'set_random_seed', 'unweighted_to_file', 'weighted_normalized_to_file', 'weighted_unnormalized_to_file', 'generalized_to_file', 'unweighted_fp64_to_file', 'weighted_normalized_fp64_to_file', 'weighted_unnormalized_fp64_to_file', 'generalized_fp64_to_file', 'unweighted_fp32_to_file', 'weighted_normalized_fp32_to_file', 'weighted_unnormalized_fp32_to_file', 'generalized_fp32_to_file', 'h5unifrac', 'h5unifrac_all', 'h5pcoa', 'h5pcoa_all', 'h5permanova', 'h5permanova_dict', 'ssu', 'ssu_fast', 'faith_pd', 'ssu_to_file', 'ssu_to_file_v2', 'ssu_inmem'] unifrac-1.3/unifrac/_api.pxd000066400000000000000000000116461442154206300160710ustar00rootroot00000000000000#cython: language_level=3 from libc.stdint cimport uint32_t, uint8_t ctypedef uint8_t bool cdef extern from "status_enum.hpp": enum compute_status: okay, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap, output_error, invalid_method, grouping_missing cdef extern from "api.hpp": struct mat: double* condensed_form unsigned int n_samples unsigned int cf_size char** sample_ids struct mat_full_fp64: uint32_t n_samples uint32_t flags double* matrix char** sample_ids struct mat_full_fp32: uint32_t n_samples uint32_t flags float* matrix char** sample_ids struct results_vec: unsigned int n_samples double* values char** sample_ids struct support_biom: char** obs_ids char** sample_ids uint32_t* indices uint32_t* indptr double* data int n_obs int n_samples int nnz struct support_bptree: bool* structure double* lengths char** names int n_parens compute_status one_off(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, mat** result) compute_status one_off_matrix(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, const char *mmap_dir, mat_full_fp64** result) compute_status one_off_matrix_fp32(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, const char *mmap_dir, mat_full_fp32** result) compute_status one_off_inmem(const support_biom *table, const support_bptree *tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, mat_full_fp64** result) compute_status one_off_inmem_fp32(const support_biom *table, const support_bptree *tree, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, mat_full_fp32** result) compute_status faith_pd_one_off(const char* biom_filename, const char* tree_filename, results_vec** result) void destroy_mat(mat** result) void destroy_mat_full_fp32(mat_full_fp32** result) void destroy_mat_full_fp64(mat_full_fp64** result) void destroy_results_vec(results_vec** result) void ssu_set_random_seed(unsigned int new_seed) compute_status unifrac_to_file_v2(const char* biom_filename, const char* tree_filename, const char* out_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, const char* format, unsigned int subsample_depth, bool subsample_with_replacement, unsigned int pcoa_dims, unsigned int permanova_perms, const char *grouping_filename, const char *grouping_columns, const char *mmap_dir) # obsolete, will be removed in the future compute_status unifrac_to_file(const char* biom_filename, const char* tree_filename, const char* out_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, const char* format, unsigned int pcoa_dims, const char *mmap_dir) compute_status unifrac_multi_to_file_v2(const char* biom_filename, const char* tree_filename, const char* out_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, const char* format, unsigned int n_subsamples, unsigned int subsample_depth, bool subsample_with_replacement, unsigned int pcoa_dims, unsigned int permanova_perms, const char *grouping_filename, const char *grouping_columns, const char *mmap_dir) unifrac-1.3/unifrac/_api.pyx000066400000000000000000000706431442154206300161200ustar00rootroot00000000000000# cython: boundscheck=False import skbio import numpy as np cimport numpy as np import bp import pandas as pd from libc.stdlib cimport malloc, free from libc.string cimport strcpy def check_status(compute_status status): if status != okay: if status == tree_missing: raise IOError("Tree file not found.") elif status == table_missing: raise IOError("Table file not found.") elif status == table_empty: raise ValueError("Table file is empty.") elif status == table_and_tree_do_not_overlap: raise ValueError("The table does not appear to be completely " "represented by the phylogeny.") elif status == unknown_method: raise ValueError("Unknown method.") elif status == output_error: raise IOError("Could not write output file.") elif status == invalid_method: raise ValueError("Invalid method.") elif status == grouping_missing: raise IOError("PERMANOVA groupping not found.") else: raise Exception("Unknown Error: {}".format(status)) def set_random_seed(unsigned int new_seed): """Set random seed used by this library""" ssu_set_random_seed(new_seed) # # Functions that compute Unifrac and return a memory object # def ssu_inmem(object table, object tree, str unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps): """Execute a call to Strided State UniFrac via the direct API Parameters ---------- table : biom.Table An instance of a biom.Table object tree : bp.BP or skbio.TreeNode A phylogeny corresponding to the table unifrac_method : str The requested UniFrac method, one of {unweighted, weighted_normalized, weighted_unnormalized, generalized, unweighted_fp64, weighted_normalized_fp64, weighted_unnormalized_fp64, generalized_fp64, unweighted_fp32, weighted_normalized_fp32, weighted_unnormalized_fp32, generalized_fp32} variance_adjust : bool Whether to perform Variance Adjusted UniFrac alpha : float The value of alpha for Generalized UniFrac; only applies to Generalized UniFraca bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int The number of substeps to use. Returns ------- skbio.DistanceMatrix The resulting distance matrix Raises ------ ValueError If the table is empty If the table is not completely represented by the phylogeny If an unknown method is requested. Exception If an unkown error is experienced """ cdef: bytes met_py_bytes char* met_c_string support_biom* inmem_biom support_bptree* inmem_tree object result_dm np.ndarray[np.float32_t, ndim=2] numpy_arr_fp32 np.ndarray[np.double_t, ndim=2] numpy_arr_fp64 inmem_biom = construct_support_biom(table) inmem_tree = construct_support_bptree(tree) met_py_bytes = unifrac_method.encode() met_c_string = met_py_bytes if '_fp64' not in unifrac_method: numpy_arr_fp32 = _ssu_inmem_fp32(inmem_biom, inmem_tree, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps) result_dm = skbio.DistanceMatrix(numpy_arr_fp32, table.ids(), validate=False) else: numpy_arr_fp64 = _ssu_inmem_fp64(inmem_biom, inmem_tree, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps) result_dm = skbio.DistanceMatrix(numpy_arr_fp64, table.ids(), validate=False) destroy_support_biom(inmem_biom) destroy_support_bptree(inmem_tree) return result_dm cdef np.ndarray _ssu_inmem_fp64(support_biom *inmem_biom, support_bptree *inmem_tree, char* met_c_string, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps): cdef: compute_status status mat_full_fp64 *result np.ndarray[np.double_t, ndim=2] numpy_arr # allocate our array, and steal the pointer so we may write into it numpy_arr = np.empty((inmem_biom.n_samples, inmem_biom.n_samples), dtype=np.double) result = malloc(sizeof(mat_full_fp64)) result.n_samples = inmem_biom.n_samples result.flags = 0 result.matrix = &numpy_arr[0, 0] result.sample_ids = inmem_biom.sample_ids status = one_off_inmem(inmem_biom, inmem_tree, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps, &result) check_status(status) # both matrix and sample_ids are borrowed pointers -- we do not need to # worry about freeing them result.matrix = NULL result.sample_ids = NULL free(result) return numpy_arr cdef np.ndarray _ssu_inmem_fp32(support_biom *inmem_biom, support_bptree *inmem_tree, char* met_c_string, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps): cdef: compute_status status mat_full_fp32 *result np.ndarray[np.float32_t, ndim=2] numpy_arr # allocate our array, and steal the pointer so we may write into it numpy_arr = np.empty((inmem_biom.n_samples, inmem_biom.n_samples), dtype=np.float32) result = malloc(sizeof(mat_full_fp32)) result.n_samples = inmem_biom.n_samples result.flags = 0 result.matrix = &numpy_arr[0, 0] result.sample_ids = inmem_biom.sample_ids status = one_off_inmem_fp32(inmem_biom, inmem_tree, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps, &result) check_status(status) # both matrix and sample_ids are borrowed pointers -- we do not need to # worry about freeing them result.matrix = NULL result.sample_ids = NULL free(result) return numpy_arr def ssu_fast(str biom_filename, str tree_filename, object ids, str unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps): """Execute a call to Strided State UniFrac via the direct API Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree ids : tuple or list Ids as present in biom_filename file unifrac_method : str The requested UniFrac method, one of {unweighted, weighted_normalized, weighted_unnormalized, generalized, unweighted_fp64, weighted_normalized_fp64, weighted_unnormalized_fp64, generalized_fp64, unweighted_fp32, weighted_normalized_fp32, weighted_unnormalized_fp32, generalized_fp32} variance_adjust : bool Whether to perform Variance Adjusted UniFrac alpha : float The value of alpha for Generalized UniFrac; only applies to Generalized UniFraca bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int The number of substeps to use. Returns ------- skbio.DistanceMatrix The resulting distance matrix Raises ------ ValueError If the table is empty If the table is not completely represented by the phylogeny If an unknown method is requested. Exception If an unkown error is experienced """ cdef: bytes biom_py_bytes bytes tree_py_bytes bytes met_py_bytes char* biom_c_string char* tree_c_string char* met_c_string object result_dm np.ndarray[np.float32_t, ndim=2] numpy_arr_fp32 np.ndarray[np.double_t, ndim=2] numpy_arr_fp64 biom_py_bytes = biom_filename.encode() tree_py_bytes = tree_filename.encode() met_py_bytes = unifrac_method.encode() biom_c_string = biom_py_bytes tree_c_string = tree_py_bytes met_c_string = met_py_bytes if '_fp64' not in unifrac_method: numpy_arr_fp32 = _ssu_fast_fp32(biom_c_string, tree_c_string, ids.__len__(), met_c_string, variance_adjust, alpha, bypass_tips, n_substeps) result_dm = skbio.DistanceMatrix(numpy_arr_fp32, ids, validate=False) else: numpy_arr_fp64 = _ssu_fast_fp64(biom_c_string, tree_c_string, ids.__len__(), met_c_string, variance_adjust, alpha, bypass_tips, n_substeps) result_dm = skbio.DistanceMatrix(numpy_arr_fp64, ids, validate=False) return result_dm cdef np.ndarray _ssu_fast_fp64(char* biom_c_string, char* tree_c_string, unsigned int n_ids, char* met_c_string, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps): cdef: unsigned int i compute_status status mat_full_fp64 *result np.ndarray[np.double_t, ndim=2] numpy_arr # allocate our array, and steal the pointer so we may write into it numpy_arr = np.empty((n_ids, n_ids), dtype=np.double) result = malloc(sizeof(mat_full_fp64)) result.n_samples = n_ids result.flags = 0 result.matrix = &numpy_arr[0, 0] result.sample_ids = malloc(n_ids*sizeof(char*)) # the values are not really used, so just point to a dummy string for i in range(n_ids): result.sample_ids[i] = biom_c_string status = one_off_matrix(biom_c_string, tree_c_string, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps, NULL, &result) check_status(status) # matrix is a borrowed pointer -- we do not need to # worry about freeing it result.matrix = NULL free(result.sample_ids) free(result) return numpy_arr cdef np.ndarray _ssu_fast_fp32(char* biom_c_string, char* tree_c_string, unsigned int n_ids, char* met_c_string, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps): cdef: unsigned int i compute_status status mat_full_fp32 *result np.ndarray[np.float32_t, ndim=2] numpy_arr # allocate our array, and steal the pointer so we may write into it numpy_arr = np.empty((n_ids, n_ids), dtype=np.float32) result = malloc(sizeof(mat_full_fp32)) result.n_samples = n_ids result.flags = 0 result.matrix = &numpy_arr[0, 0] result.sample_ids = malloc(n_ids*sizeof(char*)) # the values are not really used, so just point to a dummy string for i in range(n_ids): result.sample_ids[i] = biom_c_string status = one_off_matrix_fp32(biom_c_string, tree_c_string, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps, NULL, &result) check_status(status) # matrix is a borrowed pointer -- we do not need to # worry about freeing it result.matrix = NULL free(result.sample_ids) free(result) return numpy_arr def ssu(str biom_filename, str tree_filename, str unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps): """Execute a call to Strided State UniFrac via the direct API Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree unifrac_method : str The requested UniFrac method, one of {unweighted, weighted_normalized, weighted_unnormalized, generalized, unweighted_fp64, weighted_normalized_fp64, weighted_unnormalized_fp64, generalized_fp64, unweighted_fp32, weighted_normalized_fp32, weighted_unnormalized_fp32, generalized_fp32} variance_adjust : bool Whether to perform Variance Adjusted UniFrac alpha : float The value of alpha for Generalized UniFrac; only applies to Generalized UniFraca bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int The number of substeps to use. Returns ------- skbio.DistanceMatrix The resulting distance matrix Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table is empty If the table is not completely represented by the phylogeny If an unknown method is requested. Exception If an unkown error is experienced Note ---- This version makes several memory conversions and is thus much slower than ssu_fast. Retaining only for backward compatibility reasons. """ cdef: mat *result; compute_status status; bytes biom_py_bytes bytes tree_py_bytes bytes met_py_bytes char* biom_c_string char* tree_c_string char* met_c_string biom_py_bytes = biom_filename.encode() tree_py_bytes = tree_filename.encode() met_py_bytes = unifrac_method.encode() biom_c_string = biom_py_bytes tree_c_string = tree_py_bytes met_c_string = met_py_bytes status = one_off(biom_c_string, tree_c_string, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps, &result) check_status(status) return result_to_skbio_distance_matrix(result) cdef object result_to_skbio_distance_matrix(mat *result): cdef: np.ndarray[np.double_t, ndim=1] numpy_arr list ids int i ids = [] numpy_arr = np.zeros(result.cf_size, dtype=np.double) numpy_arr[:] = result.condensed_form for i in range(result.n_samples): ids.append(result.sample_ids[i].decode('utf-8')) destroy_mat(&result) return skbio.DistanceMatrix(numpy_arr, ids) def faith_pd(str biom_filename, str tree_filename): """Execute a call to the Stacked Faith API in the UniFrac package Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree Returns ------- pd.Series Series of Faith's PD for each sample in `biom_filename` Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table is empty If the table is not completely represented by the phylogeny Exception If an unkown error is experienced """ cdef: results_vec *result; compute_status status; np.ndarray[np.double_t, ndim=1] numpy_arr bytes biom_py_bytes bytes tree_py_bytes char* biom_c_string char* tree_c_string list ids biom_py_bytes = biom_filename.encode() tree_py_bytes = tree_filename.encode() biom_c_string = biom_py_bytes tree_c_string = tree_py_bytes status = faith_pd_one_off(biom_c_string, tree_c_string, &result) check_status(status) numpy_arr = np.zeros(result.n_samples, dtype=np.double) numpy_arr[:] = result.values ids = [] for i in range(result.n_samples): ids.append(result.sample_ids[i].decode('utf-8')) faith_pd_series = pd.Series(numpy_arr, index=ids) faith_pd_series.rename("faith_pd", inplace=True) destroy_results_vec(&result) return faith_pd_series # # Functions that compute Unifrac and write into a file # def ssu_to_file_v2(str biom_filename, str tree_filename, str out_filename, str unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, str format, unsigned int n_subsamples, unsigned int subsample_depth, bool subsample_with_replacement, unsigned int pcoa_dims, unsigned int permanova_perms, str grouping_filename, str grouping_columns, str buf_dirname): """Execute a call to UniFrac to file via the direct API Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree out_filename : str A filepath to the output file. unifrac_method : str The requested UniFrac method, one of {unweighted, weighted_normalized, weighted_unnormalized, generalized, unweighted_fp64, weighted_normalized_fp64, weighted_unnormalized_fp64, generalized_fp64, unweighted_fp32, weighted_normalized_fp32, weighted_unnormalized_fp32, generalized_fp32} variance_adjust : bool Whether to perform Variance Adjusted UniFrac alpha : float The value of alpha for Generalized UniFrac; only applies to Generalized UniFraca bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int The number of substeps to use. format : str Output format to use; one of {hdf5, hdf5_fp32, hdf5_fp64, hdf5_nodist} If an empty string, use "hdf5" if n_subsamples<=1 else "hdf5_nodist" n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) pcoa_dims : int Number of dimension for PCoA, if 0, no PCoA is computed. permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping out_filename : str If using a disk buffer for saving memory is desired, a dirpath. Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table is empty If the table is not completely represented by the phylogeny If an unknown method is requested. Exception If an unkown error is experienced """ cdef: compute_status status; int i bytes biom_py_bytes bytes tree_py_bytes bytes out_py_bytes bytes met_py_bytes bytes format_py_bytes bytes grouping_filename_py_bytes bytes grouping_columns_py_bytes bytes dirbuf_py_bytes char* biom_c_string char* tree_c_string char* out_c_string char* met_c_string char* format_c_string char* grouping_filename_c_string char* grouping_columns_c_string char* dirbuf_c_string list ids if format=="": real_format = "hdf5" if n_subsamples<=1 else "hdf5_nodist" else: real_format = format biom_py_bytes = biom_filename.encode() tree_py_bytes = tree_filename.encode() out_py_bytes = out_filename.encode() met_py_bytes = unifrac_method.encode() format_py_bytes = real_format.encode() grouping_filename_py_bytes = grouping_filename.encode() grouping_columns_py_bytes = grouping_columns.encode() dirbuf_py_bytes = buf_dirname.encode() biom_c_string = biom_py_bytes tree_c_string = tree_py_bytes out_c_string = out_py_bytes met_c_string = met_py_bytes format_c_string = format_py_bytes grouping_filename_c_string = grouping_filename_py_bytes grouping_columns_c_string = grouping_columns_py_bytes dirbuf_c_string = dirbuf_py_bytes if n_subsamples>1: if subsample_depth==0: raise ValueError("subsample_depth cannot be 0 if n_subsamples>1") status = unifrac_multi_to_file_v2(biom_c_string, tree_c_string, out_c_string, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps, format_c_string, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename_c_string, grouping_columns_c_string, dirbuf_c_string) else: status = unifrac_to_file_v2(biom_c_string, tree_c_string, out_c_string, met_c_string, variance_adjust, alpha, bypass_tips, n_substeps, format_c_string, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename_c_string, grouping_columns_c_string, dirbuf_c_string) check_status(status) return out_filename # obsolete, just for backward compatibility def ssu_to_file(str biom_filename, str tree_filename, str out_filename, str unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int n_substeps, str format, unsigned int pcoa_dims, str buf_dirname): return ssu_to_file_v2(biom_filename, tree_filename, out_filename, unifrac_method, variance_adjust, alpha, bypass_tips, n_substeps, format, 0, True, pcoa_dims, 0, "" , "", buf_dirname) cdef support_biom* construct_support_biom(object table): cdef: char** obs_ids char** sample_ids uint32_t* indices uint32_t* indptr double* data support_biom* sp_biom object matrix_data = table.matrix_data.tocsr() int i, nsamples, nobs, nnz, nindptr, n np.ndarray[object, ndim=1] table_obs_ids np.ndarray[object, ndim=1] table_samp_ids np.ndarray[np.int32_t, ndim=1] table_indices np.ndarray[np.int32_t, ndim=1] table_indptr np.ndarray[np.double_t, ndim=1] table_data # it seems that even if we use fixed width upstream, we cannot readily # utilize it within cython. Casting to 'object' now. It likely would be # beneficial to operate directly on fixed width. # https://stackoverflow.com/questions/42543485/cython-specify-numpy-array-of-fixed-length-strings#comment72258848_ table_obs_ids = table.ids(axis='observation').astype(object) table_samp_ids = table.ids(axis='sample').astype(object) table_indices = table.matrix_data.indices table_indptr = table.matrix_data.indptr table_data = table.matrix_data.data indices = &table_indices[0] indptr = &table_indptr[0] data = &table_data[0] nsamples = table_samp_ids.size nobs = table_obs_ids.size nnz = table_data.size nindptr = table_indptr.size obs_ids = malloc(nobs * sizeof(char*)) if not obs_ids: return NULL sample_ids = malloc(nsamples * sizeof(char*)) if not sample_ids: return NULL # cannot use prange for strings as the GIL is required for indexing # arrays of object dtype for i in range(nsamples): n = len(table_samp_ids[i]) + 1 # for \0 sample_ids[i] = malloc(n * sizeof(char)) if not sample_ids[i]: return NULL strcpy(sample_ids[i], table_samp_ids[i].encode('ascii')) for i in range(nobs): n = len(table_obs_ids[i]) + 1 # for \0 obs_ids[i] = malloc(n * sizeof(char)) if not obs_ids[i]: return NULL strcpy(obs_ids[i], table_obs_ids[i].encode('ascii')) sp_biom = malloc(sizeof(support_biom)) if not sp_biom: return NULL sp_biom.obs_ids = obs_ids sp_biom.sample_ids = sample_ids sp_biom.indices = indices sp_biom.indptr = indptr sp_biom.data = data sp_biom.n_obs = nobs sp_biom.n_samples = nsamples sp_biom.nnz = nnz return sp_biom cdef void destroy_support_biom(support_biom *sp_biom): cdef: int i for i in range(sp_biom.n_obs): free(sp_biom.obs_ids[i]) free(sp_biom.obs_ids) for i in range(sp_biom.n_samples): free(sp_biom.sample_ids[i]) free(sp_biom.sample_ids) # these are borrowed pointers so do not free sp_biom.indices = NULL sp_biom.indptr = NULL sp_biom.data = NULL free(sp_biom) cdef support_bptree* construct_support_bptree(object tree): cdef: bool* structure double* lengths char** names object tree_as_bp int length, i, n support_bptree* sp_bptree if isinstance(tree, skbio.TreeNode): tree_as_bp = bp.from_skbio_treenode(tree) else: tree_as_bp = tree length = tree_as_bp.B.size # malloc these things structure = malloc(length * sizeof(bool)) if not structure: return NULL lengths = malloc(length * sizeof(double)) if not lengths: return NULL names = malloc(length * sizeof(char*)) if not names: return NULL # cannot prange as we need to do attribute access # TODO: expose these structures directly from BP for i in range(length): structure[i] = tree_as_bp.B[i] lengths[i] = tree_as_bp.length(i) name = tree_as_bp.name(i) if name is None: names[i] = malloc(sizeof(char)) if not names[i]: return NULL names[i][0] = b'\0' else: n = len(name) + 1 # for \0 names[i] = malloc(n * sizeof(char)) if not names[i]: return NULL strcpy(names[i], name.encode('ascii')) sp_bptree = malloc(sizeof(support_bptree)) if not sp_bptree: return NULL sp_bptree.structure = structure sp_bptree.lengths = lengths sp_bptree.names = names sp_bptree.n_parens = length return sp_bptree cdef void destroy_support_bptree(support_bptree* sp_bptree): cdef: int i for i in range(sp_bptree.n_parens): free(sp_bptree.names[i]) free(sp_bptree.names) free(sp_bptree.lengths) free(sp_bptree.structure) free(sp_bptree) unifrac-1.3/unifrac/_meta.py000066400000000000000000000073601442154206300161010ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- # Code pulled from cogent.maths.unifrac.fast_unifrac; the authors have # previously indicated approval for converstion from GPL -> BSD # https://github.com/biocore/scikit-bio#the-pre-history-of-scikit-bio # These methods did not have unit tests in cogent import numpy as np def consolidate_skipping_missing_matrices(matrices, env_names, weights, all_env_names): """Consolidates matrices, skipping any that are missing envs""" weight_sum = 0 result = np.zeros((len(all_env_names), len(all_env_names)), float) for m, e, w in zip(matrices, env_names, weights): if e == all_env_names: # note -- assumes sorted result += m * w weight_sum += w # readjust weights for missing matrices result /= weight_sum return result def consolidate_missing_zero(matrices, env_names, weights, all_env_names): """Consolidates matrices, setting missing values to 0 distance""" result = np.zeros((len(all_env_names), len(all_env_names)), float) for m, e, w in zip(matrices, env_names, weights): result += reshape_by_name(m, e, all_env_names, 0) * w return result def consolidate_missing_one(matrices, env_names, weights, all_env_names): """Consolidates matrices, setting missing values to 1 distance""" result = np.zeros((len(all_env_names), len(all_env_names)), float) for m, e, w in zip(matrices, env_names, weights): result += reshape_by_name(m, e, all_env_names, 1) * w return result def consolidate_skipping_missing_values(matrices, env_names, weights, all_env_names): """Consolidates matrices, skipping only values from missing envs""" result = [] for m, e, w in zip(matrices, env_names, weights): reshaped = reshape_by_name(m, e, all_env_names, masked=True) reshaped *= w result.append(reshaped) data = np.array([i.data for i in result], float) masks = np.array([i.mask for i in result], bool) masked_result = np.ma.array(data, mask=masks) # figure out mask of weights so we can figure out per-element weighting masked_weights = np.ma.array(np.zeros(data.shape), mask=masks) + \ np.array(weights, float).reshape((len(weights), 1, 1)) return masked_result.sum(0) / masked_weights.sum(0) def reshape_by_name(m, old_names, new_names, default_off_diag=0, default_diag=0, masked=False): """Reshape matrix m mapping slots from old names to new names. """ num_names = len(new_names) result = np.zeros((num_names, num_names), float) + default_off_diag for i in range(num_names): result[i, i] = default_diag pairs = {} for i, n in enumerate(old_names): if n in new_names: pairs[i] = new_names.index(n) for i, row in enumerate(m): new_i = pairs[i] for j, val in enumerate(row): new_j = pairs[j] result[new_i, new_j] = val if masked: mask = np.ones((num_names, num_names), float) for i in pairs.values(): for j in pairs.values(): mask[i, j] = 0 result = np.ma.array(result, mask=mask) return result CONSOLIDATIONS = \ {'skipping_missing_matrices': consolidate_skipping_missing_matrices, 'missing_zero': consolidate_missing_zero, 'missing_one': consolidate_missing_one, 'skipping_missing_values': consolidate_skipping_missing_values} unifrac-1.3/unifrac/_methods.py000066400000000000000000003261321442154206300166170ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2020, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- from warnings import warn from functools import reduce from operator import or_ from typing import Union import numpy as np import pandas as pd import skbio import h5py from bp import BP from skbio import TreeNode from skbio.stats.distance._base import _build_results as _build_stat from biom import Table import unifrac as qsu from unifrac._meta import CONSOLIDATIONS def is_biom_v210(f, ids=None): import h5py if not h5py.is_hdf5(f): return False with h5py.File(f, 'r') as fp: if 'format-version' not in fp.attrs: return False version = fp.attrs.get('format-version', None) if version is None: return False if tuple(version) != (2, 1): return False if ids is not None: for idel in fp['sample/ids']: if isinstance(idel, bytes): ids.append(idel.decode('ascii')) else: ids.append(idel) return True def is_newick(f): sniffer = skbio.io.format.newick.newick.sniffer_function return sniffer(f)[0] def _validate(table, phylogeny, ids=None): if not is_biom_v210(table, ids): raise ValueError("Table does not appear to be a BIOM-Format v2.1") if not is_newick(phylogeny): raise ValueError("The phylogeny does not appear to be newick") def _call_ssu(table, phylogeny, *args): if isinstance(table, Table) and isinstance(phylogeny, (TreeNode, BP)): return qsu.ssu_inmem(table, phylogeny, *args) elif isinstance(table, str) and isinstance(phylogeny, str): ids = [] _validate(table, phylogeny, ids) return qsu.ssu_fast(table, phylogeny, ids, *args) else: table_type = type(table) tree_type = type(phylogeny) raise ValueError(f"table ('{table_type}') and tree ('{tree_type}') " f"are incompatible with the library call") def _call_ssu_to_file(table, phylogeny, *args): if isinstance(table, Table) and isinstance(phylogeny, (TreeNode, BP)): raise NotImplementedError("Direct to file support from in memory " "objects has not been implemented yet") elif isinstance(table, str) and isinstance(phylogeny, str): _validate(table, phylogeny) return qsu.ssu_to_file_v2(table, phylogeny, *args) else: table_type = type(table) tree_type = type(phylogeny) raise ValueError(f"table ('{table_type}') and tree ('{tree_type}') " f"are incompatible with the library call") # # Functions that compute Unifrac and return a memory object # def unweighted(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: """Compute Unweighted UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(table, phylogeny, 'unweighted', variance_adjusted, 1.0, bypass_tips, n_substeps) def unweighted_fp64(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: """Compute Unweighted UniFrac using fp64 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(table, phylogeny, 'unweighted_fp64', variance_adjusted, 1.0, bypass_tips, n_substeps) def unweighted_fp32(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: """Compute Unweighted UniFrac using fp32 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(table, phylogeny, 'unweighted_fp32', variance_adjusted, 1.0, bypass_tips, n_substeps) def weighted_normalized(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: """Compute weighted normalized UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(str(table), str(phylogeny), 'weighted_normalized', variance_adjusted, 1.0, bypass_tips, n_substeps) def weighted_normalized_fp64(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1 ) -> skbio.DistanceMatrix: """Compute weighted normalized UniFrac using fp64 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(str(table), str(phylogeny), 'weighted_normalized_fp64', variance_adjusted, 1.0, bypass_tips, n_substeps) def weighted_normalized_fp32(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1 ) -> skbio.DistanceMatrix: """Compute weighted normalized UniFrac using fp32 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(str(table), str(phylogeny), 'weighted_normalized_fp32', variance_adjusted, 1.0, bypass_tips, n_substeps) def weighted_unnormalized(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: # noqa """Compute weighted unnormalized UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op.. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(str(table), str(phylogeny), 'weighted_unnormalized', variance_adjusted, 1.0, bypass_tips, n_substeps) def weighted_unnormalized_fp64(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1 ) -> skbio.DistanceMatrix: # noqa """Compute weighted unnormalized UniFrac using fp64 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional TDeprecated, no-op.. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(str(table), str(phylogeny), 'weighted_unnormalized_fp64', variance_adjusted, 1.0, bypass_tips, n_substeps) def weighted_unnormalized_fp32(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1 ) -> skbio.DistanceMatrix: # noqa """Compute weighted unnormalized UniFrac using fp32 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional TDeprecated, no-op.. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu(str(table), str(phylogeny), 'weighted_unnormalized_fp32', variance_adjusted, 1.0, bypass_tips, n_substeps) def generalized(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: """Compute Generalized UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return weighted_normalized(table, phylogeny, threads, variance_adjusted, bypass_tips, n_substeps) else: return _call_ssu(str(table), str(phylogeny), 'generalized', variance_adjusted, alpha, bypass_tips, n_substeps) def generalized_fp64(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: """Compute Generalized UniFrac using fp64 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return weighted_normalized_fp64(table, phylogeny, threads, variance_adjusted, bypass_tips, n_substeps) else: return _call_ssu(str(table), str(phylogeny), 'generalized_fp64', variance_adjusted, alpha, bypass_tips, n_substeps) def generalized_fp32(table: Union[str, Table], phylogeny: Union[str, TreeNode, BP], threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False, n_substeps: int = 1) -> skbio.DistanceMatrix: """Compute Generalized UniFrac using fp32 math Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional Deprecated, no-op. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return weighted_normalized_fp32(table, phylogeny, threads, variance_adjusted, bypass_tips, n_substeps) else: return _call_ssu(str(table), str(phylogeny), 'generalized_fp32', variance_adjusted, alpha, bypass_tips, n_substeps) METHODS = {'unweighted': unweighted, 'weighted_normalized': weighted_normalized, 'weighted_unnormalized': weighted_unnormalized, 'generalized': generalized, 'unweighted_fp64': unweighted_fp64, 'weighted_normalized_fp64': weighted_normalized_fp64, 'weighted_unnormalized_fp64': weighted_unnormalized_fp64, 'generalized_fp64': generalized_fp64, 'unweighted_fp32': unweighted_fp32, 'weighted_normalized_fp32': weighted_normalized_fp32, 'weighted_unnormalized_fp32': weighted_unnormalized_fp32, 'generalized_fp32': generalized_fp32} def meta(tables: tuple, phylogenies: tuple, weights: tuple = None, consolidation: str = None, method: str = None, threads: int = 1, variance_adjusted: bool = False, alpha: float = None, bypass_tips: bool = False, n_substeps: int = 1) -> \ skbio.DistanceMatrix: """Compute meta UniFrac Parameters ---------- tables : tuple of str Filepaths to BIOM-Format 2.1 files. This tuple is expected to be in index order with phylogenies. phylogenies : tuple of str Filepaths to Newick formatted trees. This tuple is expected to be in index order with tables. weights : tuple of float, optional The weight applied to each tree/table pair. This tuple is expected to be in index order with tables and phylogenies. Default is to weight each tree/table pair evenly. consolidation : str, optional The matrix consolidation method. The available choices are: 'skipping_missing_matrices', 'missing_zero', 'missing_one', 'skipping_missing_values'. The default is 'skipping_missing_values'. method : str The UniFrac method to use. The available choices are: 'unweighted', 'unweighted_fp64', 'unweighted_fp32', 'weighted_unnormalized', 'weighted_unnormalized_fp64', 'weighted_unnormalized_fp32', 'weighted_normalized', 'weighted_normalized_fp64', 'weighted_normalized_fp32', 'generalized', 'generalized_fp64' and 'generalized_fp32'. threads : int, optional TDeprecated, no-op. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0 variance_adjusted : bool, optional Adjust for varianace or not. Default is False. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- UniFrac can be adapted to account for multiple genes, as originally done in [1]_. Generalized UniFrac was originally described in [2]_. Variance Adjusted UniFrac was originally described in [3]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. References ---------- .. [1] Lozupone C. A., Hamady M., Cantarel B. L., Coutinho P. M., Henrissat B., Gordon J. I. & Knight R. The convergence of carbohydrate active gene repertoires in human gut microbes. PNAS 105(39):15076-81 (2008). .. [2] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [3] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if not len(tables): raise ValueError("No tables specified.") if not len(phylogenies): raise ValueError("No trees specified.") if len(tables) != len(phylogenies): raise ValueError("Number of trees and tables must be the same.") if weights is None: weights = tuple(1 for _ in phylogenies) else: if len(weights) != len(phylogenies): raise ValueError("Number of weights does not match number of " "trees and tables.") if method is None: raise ValueError("No method specified.") method_ = METHODS.get(method.replace('-', '_')) if method_ is None: raise ValueError("Method (%s) unrecognized. Available methods are: %s" % (method, ', '.join(METHODS.keys()))) if consolidation is None: consolidation = 'skipping_missing_values' consolidation_ = CONSOLIDATIONS.get(consolidation.replace('-', '_')) if consolidation_ is None: raise ValueError("Consolidation (%s) unrecognized. Available " "consolidations are: %s" % (consolidation, ', '.join(CONSOLIDATIONS.keys()))) if alpha is not None and method_ is not generalized: raise ValueError("The alpha parameter can only be set when the method " "is set as 'generalized', the selected method is " "'%s'." % method) kwargs = {'n_substeps': n_substeps, 'bypass_tips': bypass_tips, 'variance_adjusted': variance_adjusted} if alpha is not None: kwargs['alpha'] = alpha weights = np.array(weights, float)/sum(weights) dms = [method_(table, tree, **kwargs) for table, tree in zip(tables, phylogenies)] all_ids = sorted(reduce(or_, [set(dm.ids) for dm in dms])) dm = consolidation_(dms, [dm.ids for dm in dms], weights, all_ids) return skbio.DistanceMatrix(dm, ids=all_ids) # # Functions that compute Unifrac and write into a file # def unweighted_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute Unweighted UniFrac and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5" if n_subsamples<=1 else "hdf5_nodist" buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'unweighted', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def unweighted_fp64_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute Unweighted UniFrac using fp64 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'unweighted_fp64', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def unweighted_fp32_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute Unweighted UniFrac using fp32 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'unweighted_fp32', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def weighted_normalized_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute weighted normalized UniFrac and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_normalized', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def weighted_normalized_fp64_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute weighted normalized UniFrac using fp64 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_normalized_fp64', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def weighted_normalized_fp32_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute weighted normalized UniFrac using fp32 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional Deprecated, no-op. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_normalized_fp32', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def weighted_unnormalized_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute weighted unnormalized UniFrac and write it to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional TDeprecated, no-op.. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_unnormalized', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def weighted_unnormalized_fp64_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute weighted unnormalized UniFrac using fp64 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional TDeprecated, no-op.. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_unnormalized_fp64', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def weighted_unnormalized_fp32_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute weighted unnormalized UniFrac using fp32 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional TDeprecated, no-op.. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_unnormalized_fp32', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def generalized_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute Generalized UniFrac and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional TDeprecated, no-op. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_normalized', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) else: return _call_ssu_to_file(table, phylogeny, out_filename, 'generalized', variance_adjusted, alpha, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def generalized_fp64_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute Generalized UniFrac using fp64 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional TDeprecated, no-op. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_normalized_fp64', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) else: return _call_ssu_to_file(table, phylogeny, out_filename, 'generalized_fp64', variance_adjusted, alpha, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) def generalized_fp32_to_file(table: str, phylogeny: str, out_filename: str, pcoa_dims: int = 10, threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False, format: str = "hdf5", buf_dirname: str = "", n_substeps: int = 1, n_subsamples: int = 1, subsample_depth: int = 0, subsample_with_replacement: bool = True, permanova_perms: int = 0, grouping_filename: str = "", grouping_columns: str = "") -> str: """Compute Generalized UniFrac using fp32 math and write to file Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. out_filename : str A filepath to the output file. pcoa_dims : int, optional Number of dimensions to use for PCoA compute. if set to 0, no PCoA is computed. Defaults of 10. threads : int, optional TDeprecated, no-op. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool, optional Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. format : str, optional Output format to use. Defaults to "hdf5". buf_dirname : str, optional If set, the directory where the disk buffer is hosted, can be used to reduce the amount of memory needed. n_substeps : int, optional Internally split the problem in substeps for reduced memory footprint. n_subsamples : int If >1, perform multiple subsamples. subsample_depth : int Depth of subsampling, if >0 subsample_with_replacement : bool Use subsampling with replacement? (only True supported in 1.3) permanova_perms : int If not 0, compute PERMANOVA using that many permutations grouping_filename : str The TSV filename containing grouping information grouping_columns : str The columns to use for grouping Returns ------- str A filepath to the output file. Raises ------ IOError If the tree file is not found If the table is not found If the output file cannot be created ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Environment variables --------------------- OMP_NUM_THREADS Number of CPU cores to use. If not defined, use all detected cores. UNIFRAC_USE_GPU Enable or disable GPU offload. If not defined, autodetect. ACC_DEVICE_NUM The GPU to use. If not defined, the first GPU will be used. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return _call_ssu_to_file(table, phylogeny, out_filename, 'weighted_normalized_fp32', variance_adjusted, 1.0, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) else: return _call_ssu_to_file(table, phylogeny, out_filename, 'generalized_fp32', variance_adjusted, alpha, bypass_tips, n_substeps, format, n_subsamples, subsample_depth, subsample_with_replacement, pcoa_dims, permanova_perms, grouping_filename, grouping_columns, buf_dirname) # # Functions that read Unifrac from hdf5 files # def h5unifrac(h5file: str) -> skbio.DistanceMatrix: """Read UniFrac distance matrix from a hdf5 file Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- skbio.DistanceMatrix The distance matrix. Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ with h5py.File(h5file, "r") as f_u: if 'matrix:0' in f_u.keys(): # multi format dm = skbio.DistanceMatrix( f_u['matrix:0'][:, :], [c.decode('ascii') for c in f_u['order'][:]]) else: # single format dm = skbio.DistanceMatrix( f_u['matrix'][:, :], [c.decode('ascii') for c in f_u['order'][:]]) return dm def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix: """Read all UniFrac distance matrices from a hdf5 file Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- tuple(skbio.DistanceMatrix) The distance matrices. Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ with h5py.File(h5file, "r") as f_u: order = [c.decode('ascii') for c in f_u['order'][:]] if 'matrix' in f_u.keys(): # single format dms = [skbio.DistanceMatrix( f_u['matrix'][:, :], order)] else: # multi format dms = [] i = 0 while 'matrix:%i' % i in f_u.keys(): dms.append(skbio.DistanceMatrix( f_u['matrix:%i' % i][:, :], order)) i = i + 1 return dms def _build_pcoa(f_u, long_method_name, order_index, eigval_key, samples_key, prop_key): axis_labels = ["PC%d" % i for i in range(1, len(f_u[eigval_key][:]) + 1)] pc = skbio.OrdinationResults( short_method_name="PCoA", long_method_name=long_method_name, eigvals=pd.Series(f_u[eigval_key][:], index=axis_labels), samples=pd.DataFrame(f_u[samples_key][:, :], index=order_index, columns=axis_labels), proportion_explained=pd.Series( f_u[prop_key][:], index=axis_labels)) return pc def h5pcoa(h5file: str) -> skbio.OrdinationResults: """Read PCoA from a hdf5 file Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- skbio.OrdinationResults The PCoA of the distance matrix Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields """ with h5py.File(h5file, "r") as f_u: pcoa_method = f_u['pcoa_method'][0].decode('ascii') if 'FSVD' == pcoa_method: long_method_name = "Approximate Principal Coordinate Analysis" + \ " using FSVD" else: long_method_name = "Possibly Approximate Principal " + \ "Coordinate Analysis " + \ "using " + pcoa_method order_index = [c.decode('ascii') for c in f_u['order'][:]] if 'pcoa_eigvals:0' in f_u.keys(): # multi interface pc = _build_pcoa(f_u, long_method_name, order_index, 'pcoa_eigvals:0', 'pcoa_samples:0', 'pcoa_proportion_explained:0') else: # single interface pc = _build_pcoa(f_u, long_method_name, order_index, 'pcoa_eigvals', 'pcoa_samples', 'pcoa_proportion_explained') return pc def h5pcoa_all(h5file: str) -> tuple: """Read all PCoAs from a hdf5 file Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- tuple(skbio.OrdinationResults) The PCoAs of the distance matrix Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ with h5py.File(h5file, "r") as f_u: pcoa_method = f_u['pcoa_method'][0].decode('ascii') if 'FSVD' == pcoa_method: long_method_name = "Approximate Principal Coordinate Analysis" + \ " using FSVD" else: long_method_name = "Possibly Approximate Principal " + \ "Coordinate Analysis " + \ "using " + pcoa_method order_index = [c.decode('ascii') for c in f_u['order'][:]] if 'pcoa_eigvals' in f_u.keys(): # single matrix single PCoA version pcs = [_build_pcoa(f_u, long_method_name, order_index, 'pcoa_eigvals', 'pcoa_samples', 'pcoa_proportion_explained')] else: # multi-matrix version pcs = [] i = 0 while 'pcoa_eigvals:%i' % i in f_u.keys(): pcs.append(_build_pcoa(f_u, long_method_name, order_index, 'pcoa_eigvals:%i' % i, 'pcoa_samples:%i' % i, 'pcoa_proportion_explained:%i' % i)) i = i + 1 return pcs def h5permanova(h5file: str) -> pd.Series: """Read first PERMANOVA statistical test from a hdf5 file As describe in scikit-bio skbio.stats.distance.permanova.py, Permutational Multivariate Analysis of Variance (PERMANOVA) is a non-parametric method that tests whether two or more groups of objects are significantly different based on a categorical factor. Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- pandas.Series Results of the statistical test, including ``test statistic`` and ``p-value``. Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields References ---------- .. [1] Anderson, Marti J. "A new method for non-parametric multivariate analysis of variance." Austral Ecology 26.1 (2001): 32-46. """ found = False with h5py.File(h5file, "r") as f_u: methods = f_u['stat_methods'][:] test_names = f_u['stat_test_names'][:] values = f_u['stat_values'][:] pvalues = f_u['stat_pvalues'][:] n_permutations = f_u['stat_n_permutations'][:] num_groups = f_u['stat_n_groups'][:] sample_size = len(f_u['order'][:]) n_stats = len(methods) for i in range(n_stats): if (methods[i] == b'PERMANOVA') and (test_names[i] == b'pseudo-F'): found = True pmn = _build_stat('PERMANOVA', 'pseudo-F', sample_size, num_groups[i], values[i], pvalues[i], n_permutations[i]) break if (not found): raise KeyError("PERMANOVA not found") return pmn def h5permanova_dict(h5file: str) -> dict: """Read PERMANOVA statistical tests from a hdf5 file As describe in scikit-bio skbio.stats.distance.permanova.py, Permutational Multivariate Analysis of Variance (PERMANOVA) is a non-parametric method that tests whether two or more groups of objects are significantly different based on a categorical factor. Parameters ---------- h5file : str A filepath to a hdf5 file. Returns ------- dict[str]=pandas.Series Results of the statistical test, including ``test statistic`` and ``p-value``. Raises ------ OSError If the hdf5 file is not found KeyError If the hdf5 does not have the necessary fields References ---------- .. [1] Anderson, Marti J. "A new method for non-parametric multivariate analysis of variance." Austral Ecology 26.1 (2001): 32-46. """ pmns = {} with h5py.File(h5file, "r") as f_u: methods = f_u['stat_methods'][:] test_names = f_u['stat_test_names'][:] grouping_names = f_u['stat_grouping_names'][:] values = f_u['stat_values'][:] pvalues = f_u['stat_pvalues'][:] n_permutations = f_u['stat_n_permutations'][:] num_groups = f_u['stat_n_groups'][:] sample_size = len(f_u['order'][:]) n_stats = len(methods) for i in range(n_stats): if (methods[i] == b'PERMANOVA') and (test_names[i] == b'pseudo-F'): kname = grouping_names[i].decode('ascii') pmns[kname] = _build_stat('PERMANOVA', 'pseudo-F', sample_size, num_groups[i], values[i], pvalues[i], n_permutations[i]) return pmns unifrac-1.3/unifrac/tests/000077500000000000000000000000001442154206300155765ustar00rootroot00000000000000unifrac-1.3/unifrac/tests/__init__.py000066400000000000000000000005351442154206300177120ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- unifrac-1.3/unifrac/tests/data/000077500000000000000000000000001442154206300165075ustar00rootroot00000000000000unifrac-1.3/unifrac/tests/data/crawford.biom000066400000000000000000004137411442154206300212000ustar00rootroot00000000000000HDF  `  `TREE`HEAPX observationsample8  @id GCOL No Table IDhttp://biom-format.orgQIIME 1.9.0-rc1, master@2f448392015-01-06T00:37:05.179285199181271766166099 187644 233817 229459 199698 259434260756260205260753336145275819261334194787178735170950260058179063266595312476179069196138275869194978276044 4396877!829401"179719#276260$275563%3392842&259212'270984(262409)4484382*185743+302407,258969-184151.215193/4468234097294125926622592633184567418114152278866174791726901982606539166911:260655;443945<732128=318563>272454?192971@195385A261177B229386C204144D267041E301870F267123G239571H349142I199403J208571K334365L260397M403497N267388O274106P274844Q288931R275136S267411T4338733U4403349V314810W3621189X182033Y1105328Z180105[195005\275627]187233^343581_291750`301012a269902b276985c274438d259228e303479f174754g263908h380534i4397402j335952k1108453l258725m351881n320490o264496p194662q191772r187790s174056t180972u340189v183390w422727x263546y187989z259910{303652|261419}4449524~11518632206226957626066313595626768916299127402144144201571092234121267457267452199534175573259593193463276172197318181419441494259859275707187078270519447113526394627640429358018780726336218321143463742591751816032703851885364407703191398176118167078833390 @type H format-url H format-version@ H generated-by Hcreation-date @ shape@m 0 nnz@-TREEhHEAPX8matrixidsmetadatagroup-metadata SNOD/TREE&HEAPX dataindicesindptr8SNOD(/B-- ?@4 4 deflate-TXTREEx8(-SNOD80x9x^͕=n0 =f!CSir ]^>G4IQҲ<7) 9GdI/J_'G' L$}ʣz('k]u|c?ԓڧGT%靽Jx⹹yVs֩8W}NNM3~OAw޷*S\_DN?ŵiG2'S{}k^ZgY2.O:w-*?߼]NOW''x^m j1 DΞ8I?(ݲJ3u]o_qcz֟wgK߷{._v>jN37b;Y!.;'F?٪OKN>7: ۯIygsXlRwf.]-'io;7}"f}'0._j[j˃Ͼs"8fG ѳ<8Lnjv-0ӉowݗE87w9]O͘mg'7/s=CYOz3ٖ/; |jM|mF8˶n;C< ; X_r;bhr`S[]~7/_1|^Sc7!N<})>jx^svjj61LOL9y&N8q><ANp3u=޿@ %HC:@F2B(ad%;DHE~ P"%┥H%PԢ.OCbhD҂6H':Ӆt;q}G0A e#(F3c<d2d\1,b1KYN<+X*Vc=6"}9ap9YΑy.q\#6w='<9/x+^󆷼go|'RiIO0H&B(ad%D\&|;?/~O4)IEIKz2B( 'M$yK>(@Q)FqJPc-sTREE)I-  deflateH1-T`TREE)-n  deflate:nT`TREE6W+nm deflateCmTXTREEkmGCOL 17380726681616990118299526316544424593104901684221 697874 321484 2120775 268923 17543217033526038719721618652119046044185861865264374042436341176039177427191483343906351794553395178779268755271378 231169!259335"1107945#450047$269359%4331760&182016'275470(270391)187703*191816+353782,163862-319909.307595/2751500272953126999221768863195445434609852750786828435719188783330539167204:259056;233313<164308=263705>178659?174272@310748A132114B336214C550807D262869E178031F173417G316842H376397I190242J182621K174959L259372M175416N847228O461524P350381Q259012R327236S318370T265106U214471V4372578W4127460X274597Y314963Z262166[4417539\100344]170555^261511_273515`177205a839215b4462541c261409d274521e180919f191958g216403h264373i185754j262766k176850l176858m268121n4364243o272812p185222q262399r178926s199307t265641u45363v351859w179181x179188y265828z292745{269378|837473}130335~2598881967774528233434202575787210950194822266483260828348398320635523102126267725924919777518120519619426345227039626310644801763445272753396871851810911871332704912137001268416177802190273191077331965180362258522169398273084326588927066227144943651092766631864972640212647872112006263044311755623341119682533733135495727401818310626160619584026594018124921491926578617466318577717270527442225960931117426858111364434329571330296181344276531180206258250827195206494276580197790f__Lachnospiraceaeg__s__ k__Bacteriap__Deferribacteresc__Deferribactereso__Deferribacteralesf__Deferribacteraceaeg__Mucispirillum s__schaedleri k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Bacteroidaceaeg__Bacteroidess__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes  c__Clostridia o__Clostridiales f__ g__ s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridiales f__Lachnospiraceae!g__"s__# k__Bacteria$ p__Firmicutes% c__Clostridia&o__Clostridiales'f__(g__)s__* k__Bacteria+p__Bacteroidetes,c__Bacteroidia-o__Bacteroidales.f__Bacteroidaceae/g__Bacteroides0s__1 k__Bacteria2 p__Firmicutes3 c__Clostridia4o__Clostridiales5f__Lachnospiraceae6g__7s__x^]eGJE"Ive˚b)[(K%% ٦E5#.ȏ9s9{.3z)kϥ[|EqkSyeO|J>-/|IF-KF彩~AP*_?h~D1|B~G^ +/>nˋ%Ry\^!Wɕ|yZ^#ץkAX7ʟțT~4*o?_;R~_jS%%ky3Bn)[mS2}ѧ '\!'Lp)So1} 'L0} 'L0} 'L0} 'L>a 'afr'L0} 'L0}ѧS)w)>aO>SnO>aѧ ϐOxLp)>s>O}b_˘}/ؗs/ce̾ٗ12f_8þٗ12c_;deryRy=2f_˸Zf_˘}/cerne̾ٗ12}9ٗq˹ٗ12f_˘}s s k4oob&9f9Z,,sĜ7q79obΛ&漉9obΛ&f7q7*hx^=1 F܂BTH`}Ldn.so\;2_F<ź^~srqKα0e~";gt}}.   deflate) T`x^c``` > , @\ {210NE4  deflateY  TXx^a``Qj@WA+rh|y4_HG HEAPXttaxonomy@m deflateuTPTREEG GGCOL k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Bacteroidetes c__Bacteroidia o__Bacteroidales f__Rikenellaceae g__Alistipess__indistinctus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidia o__Bacteroidales!f__Rikenellaceae"g__#s__$ k__Bacteria% p__Firmicutes& c__Clostridia'o__Clostridiales(f__Lachnospiraceae)g__[Ruminococcus]* s__gnavus+ k__Bacteria, p__Firmicutes- c__Clostridia.o__Clostridiales/f__Ruminococcaceae0g__Oscillospira1s__2 k__Bacteria3p__Bacteroidetes4c__Bacteroidia5o__Bacteroidales6f__Bacteroidaceae7g__Bacteroides8s__9 k__Bacteria:p__Bacteroidetes;c__Bacteroidia<o__Bacteroidales=f__S24-7>g__?s__@ k__BacteriaA p__FirmicutesB c__ClostridiaCo__ClostridialesDf__Eg__Fs__G k__BacteriaH p__FirmicutesI c__ClostridiaJo__ClostridialesKf__Lg__Ms__N k__BacteriaO p__FirmicutesP c__ClostridiaQo__ClostridialesRf__Sg__Ts__U k__BacteriaV p__FirmicutesW c__ClostridiaXo__ClostridialesYf__RuminococcaceaeZg__Oscillospira[s__\ k__Bacteria] p__Firmicutes^ c__Clostridia_o__Clostridiales`f__ag__bs__c k__Bacteriadp__Bacteroidetesec__Bacteroidiafo__Bacteroidalesgf__S24-7hg__is__j k__Bacteriak p__Firmicutesl c__Clostridiamo__Clostridialesnf__og__ps__q k__Bacteriar p__Firmicutess c__Clostridiato__Clostridialesuf__Lachnospiraceaevg__ws__x k__Bacteriay p__Firmicutesz c__Clostridia{o__Clostridiales|f__}g__~s__ k__Bacteria p__Firmicutesc__Erysipelotrichio__Erysipelotrichalesf__Erysipelotrichaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Peptococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Clostridiaceaeg__Clostridiums__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Rikenellaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Peptococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceae g__Ruminococcus s__  k__Bacteria  p__Firmicutes  c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__  k__Bacteria! p__Firmicutes" c__Clostridia#o__Clostridiales$f__%g__&s__' k__Bacteria(p__Bacteroidetes)c__Bacteroidia*o__Bacteroidales+f__Porphyromonadaceae,g__Parabacteroides-s__. k__Bacteria/p__Actinobacteria0c__Coriobacteriia1o__Coriobacteriales2f__Coriobacteriaceae3g__4s__5 k__Bacteria6 p__Firmicutes7 c__Clostridia8o__Clostridiales9f__:g__;s__< k__Bacteria=p__Bacteroidetes>c__Bacteroidia?o__Bacteroidales@f__S24-7Ag__Bs__C k__BacteriaD p__FirmicutesE c__ClostridiaFo__ClostridialesGf__Hg__Is__J k__BacteriaK p__FirmicutesL c__ClostridiaMo__ClostridialesNf__RuminococcaceaeOg__Ps__Q k__BacteriaRp__ActinobacteriaSc__CoriobacteriiaTo__CoriobacterialesUf__CoriobacteriaceaeVg__AdlercreutziaWs__X k__BacteriaY p__FirmicutesZ c__Clostridia[o__Clostridiales\f__]g__^s___ k__Bacteria`p__Bacteroidetesac__Bacteroidiabo__Bacteroidalescf__Bacteroidaceaedg__Bacteroidese s__fragilisf k__Bacteriagp__Bacteroideteshc__Bacteroidiaio__Bacteroidalesjf__Rikenellaceaekg__ls__m k__Bacterian p__Firmicuteso c__Clostridiapo__Clostridialesqf__rg__ss__t k__Bacteriau p__Firmicutesv c__Clostridiawo__Clostridialesxf__yg__zs__{ k__Bacteria|p__Bacteroidetes}c__Bacteroidia~o__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteriap__Proteobacteriac__Epsilonproteobacteriao__Campylobacteralesf__Helicobacteraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes  c__Clostridia o__Clostridiales f__Lachnospiraceae g__ s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Clostridiaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidia o__Bacteroidales!f__S24-7"g__#s__$ k__Bacteria% p__Firmicutes& c__Clostridia'o__Clostridiales(f__)g__*s__+ k__Bacteria, p__Firmicutes- c__Clostridia.o__Clostridiales/f__Ruminococcaceae0g__1s__2 k__Bacteria3 p__Firmicutes4 c__Clostridia5o__Clostridiales6f__7g__8s__9 k__Bacteria: p__Firmicutes;c__Erysipelotrichi<o__Erysipelotrichales=f__Erysipelotrichaceae>g__Clostridium? s__cocleatum@ k__BacteriaA p__FirmicutesB c__ClostridiaCo__ClostridialesDf__Eg__Fs__G k__BacteriaH p__FirmicutesI c__ClostridiaJo__ClostridialesKf__Lg__Ms__N k__BacteriaO p__FirmicutesP c__ClostridiaQo__ClostridialesRf__Sg__Ts__U k__BacteriaV p__FirmicutesW c__ClostridiaXo__ClostridialesYf__Zg__[s__\ k__Bacteria] p__Firmicutes^ c__Clostridia_o__Clostridiales`f__Lachnospiraceaeag__bs__c k__Bacteriad p__Firmicutese c__Clostridiafo__Clostridialesgf__Ruminococcaceaehg__Ruminococcusis__j k__Bacteriak p__Firmicutesl c__Clostridiamo__Clostridialesnf__og__ps__q k__Bacteriar p__Firmicutess c__Clostridiato__Clostridialesuf__vg__ws__x k__Bacteriay p__Firmicutesz c__Clostridia{o__Clostridiales|f__}g__~s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Epulopisciums__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Peptococcaceaeg__rc4-4s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__ g__ s__  k__Bacteria  p__Firmicutes  c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Bacilli o__Bacillalesf__Staphylococcaceaeg__Staphylococcuss__  k__Bacteria! p__Firmicutes" c__Clostridia#o__Clostridiales$f__%g__&s__' k__Bacteria( p__Firmicutes) c__Clostridia*o__Clostridiales+f__Ruminococcaceae,g__Ruminococcus-s__. k__Bacteria/ p__Firmicutes0 c__Clostridia1o__Clostridiales2f__3g__4s__5 k__Bacteria6p__Bacteroidetes7c__Bacteroidia8o__Bacteroidales9f__Bacteroidaceae:g__Bacteroides;s__< k__Bacteria= p__Firmicutes> c__Clostridia?o__Clostridiales@f__LachnospiraceaeAg__[Ruminococcus]B s__gnavusC k__BacteriaD p__FirmicutesE c__ClostridiaFo__ClostridialesGf__Hg__Is__J k__BacteriaK p__FirmicutesL c__ClostridiaMo__ClostridialesNf__LachnospiraceaeOg__Ps__Q k__BacteriaR p__FirmicutesS c__ClostridiaTo__ClostridialesU k__BacteriaV p__FirmicutesW c__ClostridiaXo__ClostridialesYf__LachnospiraceaeZg__[s__\ k__Bacteria] p__Firmicutes^ c__Clostridia_o__Clostridiales`f__ag__bs__c k__Bacteriad p__Firmicutese c__Clostridiafo__Clostridialesgf__Lachnospiraceaehg__is__j k__Bacteriakp__Bacteroideteslc__Bacteroidiamo__Bacteroidalesnf__Rikenellaceaeog__ps__q k__Bacteriar p__Firmicutess c__Clostridiato__Clostridialesuf__vg__ws__x k__Bacteriay p__Firmicutesz c__Clostridia{o__Clostridiales|f__Lachnospiraceae}g__~s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__[Odoribacteraceae]g__Odoribacters__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__[Odoribacteraceae]g__Butyricimonass__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__f__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteria p__Firmicutes c__Clostridia o__Clostridiales f__ g__ s__  k__Bacteria p__Firmicutes c__Bacillio__Turicibacteralesf__Turicibacteraceaeg__Turicibacters__ k__Bacteria p__Firmicutesc__Erysipelotrichio__Erysipelotrichalesf__Erysipelotrichaceaeg__Allobaculums__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7 g__!s__" k__Bacteria# p__Firmicutes$ c__Clostridia%o__Clostridiales&f__Lachnospiraceae'g__(s__) k__Bacteria* p__Firmicutes+ c__Clostridia,o__Clostridiales-f__.g__/s__0 k__Bacteria1 p__Firmicutes2 c__Clostridia3o__Clostridiales4f__Peptostreptococcaceae5g__6s__7 k__Bacteria8p__Bacteroidetes9c__Bacteroidia:o__Bacteroidales;f__S24-7<g__=s__> k__Bacteria? p__Firmicutes@ c__ClostridiaAo__ClostridialesBf__Cg__Ds__Es__F k__BacteriaG p__FirmicutesH c__ClostridiaIo__ClostridialesJf__Kg__Ls__M k__BacteriaNp__DeferribacteresOc__DeferribacteresPo__DeferribacteralesQf__DeferribacteraceaeRg__MucispirillumS s__schaedleriT k__BacteriaUp__BacteroidetesVc__BacteroidiaWo__BacteroidalesXf__PorphyromonadaceaeYg__ParabacteroidesZs__[ k__Bacteria\ p__Firmicutes] c__Clostridia^o__Clostridiales_f__`g__as__b k__Bacteriac p__Firmicutesd c__Clostridiaeo__Clostridialesff__Lachnospiraceaegg__hs__i k__Bacteriaj p__Firmicutesk c__Clostridialo__Clostridialesm k__Bacterian p__Firmicuteso c__Clostridiapo__Clostridialesqf__Lachnospiraceaerg__ss__t k__Bacteriau p__Firmicutesv c__Clostridiawo__Clostridialesxf__yg__zs__{ k__Bacteria| p__Firmicutes} c__Clostridia~o__Clostridialesf__g__f__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialeso__Clostridialesf__g__ c__Clostridia p__Firmicutes k__Bacteria k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridia o__Clostridiales f__Lachnospiraceae g__[Ruminococcus]  s__gnavus  k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__ g__!s__" k__Bacteria# p__Firmicutes$ c__Clostridia%o__Clostridiales&f__Lachnospiraceae'g__(s__) k__Bacteria* p__Firmicutes+ c__Clostridia,o__Clostridiales-f__.g__/s__0 k__Bacteria1 p__Firmicutes2 c__Clostridia3o__Clostridiales4f__5g__6s__7 k__Bacteria8 p__Firmicutes9 c__Clostridia:o__Clostridiales;f__Ruminococcaceae<g__Oscillospira=s__> k__Bacteria? p__Firmicutes@ c__ClostridiaAo__ClostridialesBf__Cg__Ds__E k__BacteriaF p__FirmicutesG c__ClostridiaHo__ClostridialesIf__Jg__Ks__L k__BacteriaM p__FirmicutesN c__BacilliOo__LactobacillalesPf__StreptococcaceaeQg__StreptococcusRs__S k__BacteriaT p__FirmicutesU c__ClostridiaVo__ClostridialesWf__LachnospiraceaeXg__Ys__Z k__Bacteria[p__Bacteroidetes\c__Bacteroidia]o__Bacteroidales^f__Prevotellaceae_ g__Prevotella`s__a k__Bacteriab p__Firmicutesc c__Bacillid o__Bacillalesef__Staphylococcaceaefg__Staphylococcusg s__sciurih k__Bacteriai p__Firmicutesj c__Clostridiako__Clostridialeslf__Lachnospiraceaemg__ns__o k__Bacteriap p__Firmicutesq c__Clostridiaro__Clostridialessf__Lachnospiraceaetg__Coprococcusus__v k__Bacteriaw p__Firmicutesxc__Erysipelotrichiyo__Erysipelotrichaleszf__Erysipelotrichaceae{g__Allobaculum|s__} k__Bacteria~ p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Bacteroidaceaeg__Bacteroides s__eggerthii k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Rikenellaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Rikenellaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__  k__Bacteria  p__Firmicutes  c__Clostridia o__Clostridiales f__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes  c__Clostridia!o__Clostridiales"f__Ruminococcaceae#g__Oscillospira$s__% k__Bacteria& p__Firmicutes' c__Clostridia(o__Clostridiales)f__Lachnospiraceae*g__+s__, k__Bacteria- p__Firmicutes. c__Clostridia/o__Clostridiales0f__Ruminococcaceae1g__Oscillospira2s__3 k__Bacteria4 p__Firmicutes5 c__Clostridia6o__Clostridiales7f__Lachnospiraceae8g__[Ruminococcus]9 s__gnavus: k__Bacteria; p__Firmicutes< c__Clostridia=o__Clostridiales>f__?g__@s__A k__BacteriaBp__BacteroidetesCc__BacteroidiaDo__BacteroidalesEf__BacteroidaceaeFg__BacteroidesGs__H k__BacteriaI p__FirmicutesJ c__ClostridiaKo__ClostridialesLf__Mg__Ns__O k__BacteriaP p__FirmicutesQ c__ClostridiaRo__ClostridialesSf__RuminococcaceaeTg__Us__V k__BacteriaW p__FirmicutesX c__ClostridiaYo__ClostridialesZf__Ruminococcaceae[g__Oscillospira\s__] k__Bacteria^ p__Firmicutes_ c__Clostridia`o__Clostridialesaf__bg__cs__d k__Bacteriae p__Firmicutesfc__Erysipelotrichigo__Erysipelotrichaleshf__Erysipelotrichaceaeig__Coprobacillusjs__k k__Bacterial p__Firmicutesm c__Clostridiano__Clostridialesof__pg__qs__r k__Bacterias p__Firmicutest c__Clostridiauo__Clostridialesvf__Ruminococcaceaewg__Oscillospiraxs__y k__Bacteriazp__Bacteroidetes{c__Bacteroidia|o__Bacteroidales}f__S24-7~g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__[Paraprevotellaceae]g__[Prevotella]s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__TM7c__TM7-3o__CW040f__F16g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Porphyromonadaceaeg__Parabacteroidess__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Proteobacteriac__Deltaproteobacteriao__Desulfovibrionalesf__Desulfovibrionaceaeg__Desulfovibrio s__C21_c20 k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__[Odoribacteraceae]g__Odoribacters__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridiales f__ g__ s__  k__Bacteria  p__Firmicutes c__Clostridiao__Clostridialesf__g__s__f__Bacteroidaceaeg__Bacteroides s__caccae k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridia o__Clostridiales!f__"g__#s__$ k__Bacteria%p__Bacteroidetes&c__Bacteroidia'o__Bacteroidales(f__S24-7)g__*s__+ k__Bacteria, p__Firmicutes- c__Clostridia.o__Clostridiales/f__Ruminococcaceae0g__Oscillospira1s__2 k__Bacteria3 p__Firmicutes4 c__Clostridia5o__Clostridiales6f__7g__8s__9 k__Bacteria: p__Firmicutes; c__Bacilli<o__Lactobacillales=f__Lactobacillaceae>g__Lactobacillus?s__@ k__BacteriaA p__FirmicutesB c__ClostridiaCo__ClostridialesDf__LachnospiraceaeEg__[Ruminococcus]F s__gnavusG k__BacteriaH p__FirmicutesI c__ClostridiaJo__ClostridialesKf__Lg__Ms__N k__BacteriaO p__FirmicutesP c__ClostridiaQo__ClostridialesRf__Sg__Ts__U k__BacteriaV p__FirmicutesW c__ClostridiaXo__ClostridialesYf__LachnospiraceaeZg__[s__\ k__Bacteria] p__Firmicutes^ c__Clostridia_o__Clostridiales`f__Lachnospiraceaeag__bs__c k__Bacteriad p__Firmicutese c__Clostridiafo__Clostridialesgf__hg__is__j k__Bacteriak p__Firmicutesl c__Clostridiamo__Clostridialesnf__Lachnospiraceaeog__[Ruminococcus]p s__gnavusq k__Bacteriarp__Bacteroidetessc__Bacteroidiato__Bacteroidalesuf__S24-7vg__ws__x k__Bacteriay p__Firmicutesz c__Clostridia{o__Clostridiales|f__}g__~s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Clostridiaceaeg__s__ k__Bacteriap__Actinobacteriac__Coriobacteriiao__Coriobacterialesf__Coriobacteriaceaeg__Adlercreutzias__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteriap__Actinobacteriac__Coriobacteriiao__Coriobacterialesf__Coriobacteriaceaeg__Adlercreutzias__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutesc__Erysipelotrichio__Erysipelotrichalesf__Erysipelotrichaceaeg__Allobaculums__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacillus s__reuteri k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__ g__ s__  k__Bacteria p__Bacteroidetes c__Bacteroidiao__Bacteroidalesf__Bacteroidaceaeg__Bacteroides s__fragilis k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__  k__Bacteria! p__Firmicutes" c__Clostridia#o__Clostridiales$f__%g__&s__' k__Bacteria(p__Bacteroidetes)c__Bacteroidia*o__Bacteroidales+f__S24-7,g__-s__. k__Bacteria/ p__Firmicutes0c__Erysipelotrichi1o__Erysipelotrichales2f__Erysipelotrichaceae3g__Allobaculum4s__5 k__Bacteria6 p__Firmicutes7 c__Bacilli8o__Lactobacillales9f__Lactobacillaceae:g__Lactobacillus;s__< k__Bacteria= p__Firmicutes> c__Clostridia?o__Clostridiales@f__Ag__Bs__C k__BacteriaD p__FirmicutesE c__ClostridiaFo__ClostridialesGf__Hg__Is__J k__BacteriaK p__FirmicutesL c__ClostridiaMo__ClostridialesNf__Og__Ps__Qs__R k__BacteriaS p__FirmicutesT c__ClostridiaUo__ClostridialesVf__Wg__Xs__Y k__BacteriaZp__Bacteroidetes[c__Bacteroidia\o__Bacteroidales]f__S24-7^g___s__` k__Bacteriaa p__Firmicutesb c__Clostridiaco__Clostridialesdf__Ruminococcaceaeeg__fs__g k__Bacteriah p__Firmicutesic__Erysipelotrichijo__Erysipelotrichaleskf__Erysipelotrichaceaelg__[Eubacterium]m s__dolichumn k__Bacteriao p__Firmicutesp c__Clostridiaqo__Clostridialesrf__GCOL@g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__ s__ k__Bacteria p__Firmicutes c__Clostridia o__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes! c__Clostridia"o__Clostridiales#f__Clostridiaceae$g__%s__& k__Bacteria'p__Bacteroidetes(c__Bacteroidia)o__Bacteroidales*f__S24-7+g__,s__- k__Bacteria. p__Firmicutes/ c__Clostridia0o__Clostridiales1f__2g__3s__4 k__Bacteria5 p__Firmicutes6 c__Clostridia7o__Clostridiales8f__Ruminococcaceae9g__Oscillospira:s__; k__Bacteria< p__Firmicutes= c__Clostridia>o__Clostridiales?f__@g__As__B k__BacteriaC p__FirmicutesD c__ClostridiaEo__ClostridialesFf__LachnospiraceaeGg__Hs__I k__BacteriaJ p__FirmicutesK c__ClostridiaLo__ClostridialesMf__LachnospiraceaeNg__[Ruminococcus]O s__gnavusP k__BacteriaQ p__FirmicutesR c__ClostridiaSo__ClostridialesTf__LachnospiraceaeUg__Vs__W k__BacteriaX p__FirmicutesY c__ClostridiaZo__Clostridiales[f__Ruminococcaceae\g__]s__^ k__Bacteria_p__Bacteroidetes`c__Bacteroidiaao__Bacteroidalesbf__Bacteroidaceaecg__Bacteroidesds__e k__Bacteriaf p__Firmicutesg c__Clostridiaho__Clostridialesif__Lachnospiraceaejg__[Ruminococcus]k s__gnavusl k__Bacteriam p__Firmicutesnc__Erysipelotrichioo__Erysipelotrichalespf__Erysipelotrichaceaeqg__rs__s k__Bacteriat p__Firmicutesu c__Clostridiavo__Clostridialeswf__Lachnospiraceaexg__[Ruminococcus]y s__gnavusz k__Bacteria{p__Bacteroidetes|c__Bacteroidia}o__Bacteroidales~o__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Actinobacteriac__Coriobacteriiao__Coriobacterialesf__Coriobacteriaceaeg__Adlercreutzias__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridias__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridiales f__ o__Clostridiales f__ g__ s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__ c__Clostridia p__Firmicutes k__Bacteria 10084.PC.636 10084.PC.635  10084.PC.607! 10084.PC.634" 10084.PC.355# 10084.PC.354$ 10084.PC.356% 10084.PC.593& 10084.PC.481(SNODptx^]uUUGsݍ(aw "vR**w7Ys}gLaf'g 1NGq W¿ 5/@o![R2rF?<1N8ǃ'7IMo*xx:z3 l&59%zzqF8pßd%D/c41q8}aˀ^F2[g8k>ˎ^8=_9;I=^p>c+(z/.^IJKˠWrUqV+_ \*UW Հ&z:Ճ>!z"ww5Oz >Go?< ~QEo\D&'7i͈|΄x6zs |%-t#+[*Wנu mx3z0x x+zh1oo@ogލ= ދS=a};A!wX&3돡w{>_}Eױzo6ޅ=S}}/_Bkw}>>_D|?/]> |" Mnw|)^PYYCeq:Ή^.rz>`=o3+F|E+Ip)J e+pE*W\\jWsP?O z׎qJWz_Cp#pc B5mmk_{ppG:uw@'~  xz|i(zHF7<<qO@o"|SЛ 4tf7< <9Co>| [ PKC?:\YJVo@o#{}8a z[{GC|ށNgx7zO`O¿B0GG{g;Is= /2|B_^ 轅 ~wC_ yxS}>F,zi3/_5 [{9y."zg~2 zWC~zwޭP?zAwO]o?{݇_?A3Eڋԟ8^T襎tiO N^z2DgzHggE/[XosEzF/zy W\\"CxׯDgIJW2蕃<"zRU#^ѫ^HS \:Ϻ׋q2F5777C9--kZvw@-C')i%G? |? OjǛ>/M}_oߚ3>g;o M{L}?Z?agy/~5]u7ӯaڻiڻezn~7=5eiiᄅ_Ӟa(^\Dā B/5iiK_zppF2eg@/' >p~ _\C8|%}~KbT{i|}ʢW)^@g%+Kx= ^8818i}ǽ~~й[\>kݹ&}v>f~7l\ܦ3 ?,n+a/ .e+m+cz5_9^yS_YLSߴWմWWk`Z:i!i)i%i i=i3inaimc o`hz`s(xio~FGFFǀǚƙǃ'&^I)i馽Y٦=/?}|ӿ/X`_^d[l[^jeU5_g_`hl{0x xiSL{Lv]ݦ'L{O2? goz1ϳc㦽L{ϛ^~ɴ𫦽L}i{6޻?0y?>ms61}r6wִ0oK`7ou7o|m~ynݼܼ7]0yM{nܼ?n?nܼynܼ]3]7yiڻezܼg7o~7os6m>ys6m>?0s6m>ys6m>ys6nu6m~|m~ϫ=7o{nݼg7o{nܼg7o~7oa?nm~=7o|>7o|v6mmGݼ߿=7o{nn泛ߧyw6ݼyܼg7oWjՄV6y.k47B1|MMk pKZz -z?ڃ;zt ~^b?_Op/po7c |P~FG7*7<O@o"ћT4tf739 }?<B-} zK[<|EJ*WנY^m6¿ =~zF ܃ޟO4xz0(z,z;> ~"z/z +W{ o&|o޻~| |?OzKk7o;]GO /+]W5z~&z л@_? t= ~CpǁKRp2B/5iiK_zppIx^$iFΈȶm۶m۶axm۶m۶ml|9=_ս~Qzcي/UXa"S0N)j'''#ßB/6qqы_|p3sSߋȏa"?aִt`^? 8*='B.C_4>So;`;h~t?s|Ҵw*#~653_0]4]__1]5__706137qiiﱩ i+S@oL{ߚo7<i'S_L{o4exioxG0 GE~$ Ѧ3ƴ7ִ7t 9lڛbڛjnڛa mڛc gڛo[^^d[l_^j[f[^^i[e_ ^c[k[^`hd<ڌ-ϣm]Xmc?? r}h_}MMk_sh zKh 5 zm1O;p{:C w3z] _HN#9q7z}}?^/ (g`-;ק kӴX _Ih8׆p<{gQx/pn pNbKjONnKaKiONmKcO NgKoLL{M}YLgY~ԟӴt<|B¦"篨L{M{%L%L{M}eeL{MM{Lljګfaګiګez6}1575054i)ii%i-ۙ_{|}^G^'y:ʹ0426}, r_y>h:G|i3S i!h <*99'IIP/ɼ~9988%zO N^ZҁӃ3LOfgϬggG/ ΅^nO^ㅞA΋^>_\BEŽl!.yJz\ S2E<|Я룒U*zq55Ы p]WF77A 资5zmk ;p{: uwwC;==z{z}^?{z?<}y>'Sg}/_717X}yD'OzC<>'/yz5#p<#h3XOoћD&7<Ot?{L|,l3㙇棷,/o1[g)z[ JVuo mFo z[ގvxv¿ =Co?|; Qw||SA,|л%exz |B6|wwлCg?~{5 zo{~_ޏGQvT|x~yы_Lޟba=_qqyȯد%E/)K^*̗4襅?8=zΌ^xa=_Np.rc<~ݯ~}/?z+^a^_ԯb~8z%+^)piW\\ WWB2|UUЫ pMjW\\a> >>G?<Ƨ>gx̳ߧOp<;Q􎁏wF g;E%.|kO/7л-ow{?@!{S3s^ҧWX߂ߡ $#Eq;\=h{b 8E/N^B9zc=/)KyRS4t襇?8#zggE/9 _.pnpŸFE+~pYR蕆 ,z+^%+WjՄڎ_Guq<Ы^&77C9|-z+Gq|mk^;|}{G;:~y';;z^WcgO >C?zO}SG/ /7}=|E;G #?FP/0G7QXm,~GodT4?<YEo ["b¿ ^/+}Bo5zk[_ ooBo3 6ww =ut?9;Awc蝀$O| ]%G_ U97л -m|^/~:ϰ|K+^c7|ppU_TWaqu>~G/D@/r8Ņ?8>z KNN^py՟S4C/eD/Yˊ^6s\A//ˇ^~ _\quC8%%+/ ..^9ʻz2zUЫyWZՁ.z]= on^c ^-k^+[۠vuvqۮ8nuGz= _p_p??zb\= z/ <17o}A Gv&3Ŀzc?C ߟ7! (hǺџD$&SSћ27͂o6曃o?-<[RJW0juu<ۄ ފ6w=x?z;>>wS蝆 ,z\__D|]}]UWk@&|~]{~cr -z>"QQ>vz~cXŁ/.88?l^TREEHEAPXiPTREEHEAPX8matrixidsmetadatagroup-metadata TREEHEAPX dataindicesindptr8SNOD(!prYs- ?@4 4 deflateq-TXTREE{o-SNODa:q-  deflate-T`TREE)-x^eee 1Cwwwww7؝ݭ`+` &vw5v(a󭗵q9<$BICyTBMBmC4,2Cq*q62\57v܍f<&(' _'l? J>(b(C>Zz/acc-y>Qh>AX}qĹrp-n:܌;^Ż?w9GOP 1:9ǒ{$]eQ}Ǟ -=C0#1Kq1rYNYxiH7oyeQQՒX7eh腁Q ss)."=r3No`>șcɽ?49r߆p<徿WP}+ wTz<1݈9 Ÿܑw$b9V&Ϡ(JN4BSy]{?LdL{q>x^c``Bq=x^cd```bf(D x^c````b fb` x^cd``H` fDs%L]  deflate1vYXTREE(  deflate:vYXTREE( deflateDvYPTREE(TREE@'HEAPXNPO(QTREE@'HEAPXHQPQSTREE WHEAPX8Tmatrixidsmetadatagroup-metadata TVTREEaHEAPX Vdataindicesindptr8SNOD(ppu`TTV~~ ?@4 4 deflatexYvYPTREE(SNODhXb0l  deflatedvYXTREE(  deflate@mvYXTREE ) deflatevvYPTREE )~TREEaHEAPXPTREEaHEAPX؃Punifrac-1.3/unifrac/tests/data/t1.newick000066400000000000000000000000431442154206300202320ustar00rootroot00000000000000((a:1,b:2):4,(c:3,(d:1,e:1):2):3); unifrac-1.3/unifrac/tests/data/t2.newick000066400000000000000000000000331442154206300202320ustar00rootroot00000000000000(((a:1,b:1):1,c:5):2,d:4); unifrac-1.3/unifrac/tests/data/test.faith.exp000066400000000000000000000004241442154206300212760ustar00rootroot0000000000000010084.PC.481 8.268050017119094 10084.PC.593 6.256320004780719 10084.PC.356 7.2537400212168 10084.PC.355 6.688100009057962 10084.PC.354 7.827880010576337 10084.PC.636 6.08902999899874 10084.PC.635 8.128100004927546 10084.PC.607 7.725159989975509 10084.PC.634 7.508669988412294 unifrac-1.3/unifrac/tests/test_api.py000066400000000000000000001076131442154206300177700ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import unittest import os from io import StringIO from tempfile import gettempdir import pkg_resources import numpy as np import numpy.testing as npt from biom import Table, load_table from biom.util import biom_open from skbio import TreeNode import skbio.diversity from unifrac import ssu, faith_pd, ssu_inmem from unifrac import unweighted, unweighted_to_file, h5unifrac from unifrac import unweighted_fp32, unweighted_fp64 class UnifracAPITests(unittest.TestCase): package = 'unifrac.tests' def test_unweighted_inmem(self): tree_fp = self.get_data_path('crawford.tre') table_fp = self.get_data_path('crawford.biom') table = load_table(table_fp) tree = skbio.TreeNode.read(tree_fp) ids = table.ids() otu_ids = table.ids(axis='observation') cnts = table.matrix_data.astype(int).toarray().T exp = skbio.diversity.beta_diversity('unweighted_unifrac', cnts, ids=ids, otu_ids=otu_ids, tree=tree) obs = ssu_inmem(table, tree, 'unweighted', False, 1.0, False, 1) npt.assert_almost_equal(obs.data, exp.data, decimal=6) obs2 = unweighted(table_fp, tree_fp) npt.assert_almost_equal(obs2.data, exp.data, decimal=6) def test_unweighted_fp32_inmem(self): tree_fp = self.get_data_path('crawford.tre') table_fp = self.get_data_path('crawford.biom') table = load_table(table_fp) tree = skbio.TreeNode.read(tree_fp) ids = table.ids() otu_ids = table.ids(axis='observation') cnts = table.matrix_data.astype(int).toarray().T exp = skbio.diversity.beta_diversity('unweighted_unifrac', cnts, ids=ids, otu_ids=otu_ids, tree=tree) obs = ssu_inmem(table, tree, 'unweighted_fp32', False, 1.0, False, 1) npt.assert_almost_equal(obs.data, exp.data, decimal=6) obs2 = unweighted_fp32(table_fp, tree_fp) npt.assert_almost_equal(obs2.data, exp.data, decimal=6) def test_unweighted_fp64_inmem(self): tree_fp = self.get_data_path('crawford.tre') table_fp = self.get_data_path('crawford.biom') table = load_table(table_fp) tree = skbio.TreeNode.read(tree_fp) ids = table.ids() otu_ids = table.ids(axis='observation') cnts = table.matrix_data.astype(int).toarray().T exp = skbio.diversity.beta_diversity('unweighted_unifrac', cnts, ids=ids, otu_ids=otu_ids, tree=tree) obs = ssu_inmem(table, tree, 'unweighted_fp64', False, 1.0, False, 1) npt.assert_almost_equal(obs.data, exp.data) obs2 = unweighted_fp64(table_fp, tree_fp) npt.assert_almost_equal(obs2.data, exp.data) def get_data_path(self, filename): # adapted from qiime2.plugin.testing.TestPluginBase return pkg_resources.resource_filename(self.package, 'data/%s' % filename) def test_unweighted_root_eval_issue_46(self): tree = self.get_data_path('crawford.tre') table = self.get_data_path('crawford.biom') table_inmem = load_table(table) tree_inmem = skbio.TreeNode.read(tree) ids = table_inmem.ids() otu_ids = table_inmem.ids(axis='observation') cnts = table_inmem.matrix_data.astype(int).toarray().T exp = skbio.diversity.beta_diversity('unweighted_unifrac', cnts, ids=ids, otu_ids=otu_ids, tree=tree_inmem) obs = ssu(table, tree, 'unweighted', False, 1.0, False, 1) npt.assert_almost_equal(obs.data, exp.data, decimal=6) obs2 = unweighted(table, tree) npt.assert_almost_equal(obs2.data, exp.data, decimal=6) tmpfile = '/tmp/uf_ta_1.md5' unweighted_to_file(table, tree, tmpfile, pcoa_dims=0) try: obs3 = h5unifrac(tmpfile) npt.assert_almost_equal(obs3.data, exp.data, decimal=6) finally: os.unlink(tmpfile) def test_meta_unifrac(self): t1 = self.get_data_path('t1.newick') e1 = self.get_data_path('e1.biom') result = ssu(e1, t1, 'unweighted', False, 1.0, False, 1) u1_distances = np.array([[0, 10 / 16., 8 / 13.], [10 / 16., 0, 8 / 17.], [8 / 13., 8 / 17., 0]]) npt.assert_almost_equal(u1_distances, result.data, decimal=6) self.assertEqual(tuple('ABC'), result.ids) def test_ssu_bad_tree(self): e1 = self.get_data_path('e1.biom') with self.assertRaisesRegex(IOError, "Tree file not found."): ssu(e1, 'bad-file', 'unweighted', False, 1.0, False, 1) def test_ssu_bad_table(self): t1 = self.get_data_path('t1.newick') with self.assertRaisesRegex(IOError, "Table file not found."): ssu('bad-file', t1, 'unweighted', False, 1.0, False, 1) def test_ssu_bad_method(self): t1 = self.get_data_path('t1.newick') e1 = self.get_data_path('e1.biom') with self.assertRaisesRegex(ValueError, "Unknown method."): ssu(e1, t1, 'unweightedfoo', False, 1.0, False, 1) class EdgeCasesTests(unittest.TestCase): # These tests were mostly ported from skbio's # skbio/diversity/beta/tests/test_unifrac.py at SHA-256 ea901b3b6b0b # note that not all tests were kept since the APIs are different. # # The test cases below only exercise unweighted, weighted and weighted # normalized UniFrac. The C++ test suite verifies (against reference # implementations) the variance adjusted and generalized variants of the # algorithm. package = 'unifrac.tests' def _work(self, u_counts, v_counts, otu_ids, tree, method): data = np.array([u_counts, v_counts]).T bt = Table(data, otu_ids, ['u', 'v']) ta = os.path.join(gettempdir(), 'table.biom') tr = os.path.join(gettempdir(), 'tree.biom') self.files_to_delete.append(ta) self.files_to_delete.append(tr) with biom_open(ta, 'w') as fhdf5: bt.to_hdf5(fhdf5, 'Table for unit testing') tree.write(tr) # return value is a distance matrix, get the distance from u->v return ssu(ta, tr, method, False, 1.0, False, 1)['u', 'v'] def weighted_unifrac(self, u_counts, v_counts, otu_ids, tree, normalized=False): if normalized: method = 'weighted_normalized' else: method = 'weighted_unnormalized' return self._work(u_counts, v_counts, otu_ids, tree, method) def unweighted_unifrac(self, u_counts, v_counts, otu_ids, tree, normalized=False): return self._work(u_counts, v_counts, otu_ids, tree, 'unweighted') def setUp(self): self.b1 = np.array( [[1, 3, 0, 1, 0], [0, 2, 0, 4, 4], [0, 0, 6, 2, 1], [0, 0, 1, 1, 1], [5, 3, 5, 0, 0], [0, 0, 0, 3, 5]]) self.sids1 = list('ABCDEF') self.oids1 = ['OTU%d' % i for i in range(1, 6)] self.t1 = TreeNode.read( StringIO('(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,(OTU4:' '0.75,OTU5:0.75):1.25):0.0)root;')) self.t1_w_extra_tips = TreeNode.read( StringIO('(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,(OTU4:' '0.75,(OTU5:0.25,(OTU6:0.5,OTU7:0.5):0.5):0.5):1.25):0.0' ')root;')) self.t2 = TreeNode.read( StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)' 'root;')) self.oids2 = ['OTU%d' % i for i in range(1, 5)] self.files_to_delete = [] def tearDown(self): for f in self.files_to_delete: try: os.remove(f) except OSError: pass def test_ssu_table_not_subset_tree(self): tree = TreeNode.read(StringIO('((OTU1:0.5,OTU3:1.0):1.0)root;')) expected_message = "The table does not appear to be completely "\ "represented by the phylogeny." with self.assertRaisesRegex(ValueError, expected_message): self.unweighted_unifrac(self.b1[0], self.b1[1], self.oids1, tree) def test_unweighted_otus_out_of_order(self): # UniFrac API does not assert the observations are in tip order of the # input tree shuffled_ids = self.oids1[:] shuffled_b1 = self.b1.copy() shuffled_ids[0], shuffled_ids[-1] = shuffled_ids[-1], shuffled_ids[0] shuffled_b1[:, [0, -1]] = shuffled_b1[:, [-1, 0]] for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.unweighted_unifrac( shuffled_b1[i], shuffled_b1[j], shuffled_ids, self.t1) self.assertAlmostEqual(actual, expected, places=6) def test_weighted_otus_out_of_order(self): # UniFrac API does not assert the observations are in tip order of the # input tree shuffled_ids = self.oids1[:] shuffled_b1 = self.b1.copy() shuffled_ids[0], shuffled_ids[-1] = shuffled_ids[-1], shuffled_ids[0] shuffled_b1[:, [0, -1]] = shuffled_b1[:, [-1, 0]] for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.weighted_unifrac( shuffled_b1[i], shuffled_b1[j], shuffled_ids, self.t1) self.assertAlmostEqual(actual, expected, places=6) def test_unweighted_extra_tips(self): # UniFrac values are the same despite unobserved tips in the tree for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1_w_extra_tips) expected = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) self.assertAlmostEqual(actual, expected, places=6) def test_weighted_extra_tips(self): # UniFrac values are the same despite unobserved tips in the tree for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1_w_extra_tips) expected = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) self.assertAlmostEqual(actual, expected, places=6) def test_unweighted_minimal_trees(self): # two tips tree = TreeNode.read(StringIO('(OTU1:0.25, OTU2:0.25)root;')) actual = self.unweighted_unifrac([1, 0], [0, 0], ['OTU1', 'OTU2'], tree) expected = 1.0 self.assertEqual(actual, expected) def test_unweighted_root_not_observed(self): # expected values computed with QIIME 1.9.1 and by hand # root node not observed, but branch between (OTU1, OTU2) and root # is considered shared actual = self.unweighted_unifrac([1, 1, 0, 0], [1, 0, 0, 0], self.oids2, self.t2) # for clarity of what I'm testing, compute expected as it would # based on the branch lengths. the values that compose shared was # a point of confusion for me here, so leaving these in for # future reference expected = 0.2 / (0.1 + 0.2 + 0.3) # 0.3333333333 self.assertAlmostEqual(actual, expected, places=6) # root node not observed, but branch between (OTU3, OTU4) and root # is considered shared actual = self.unweighted_unifrac([0, 0, 1, 1], [0, 0, 1, 0], self.oids2, self.t2) # for clarity of what I'm testing, compute expected as it would # based on the branch lengths. the values that compose shared was # a point of confusion for me here, so leaving these in for # future reference expected = 0.7 / (1.1 + 0.5 + 0.7) # 0.3043478261 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_root_not_observed(self): # expected values computed by hand, these disagree with QIIME 1.9.1 # root node not observed, but branch between (OTU1, OTU2) and root # is considered shared actual = self.weighted_unifrac([1, 0, 0, 0], [1, 1, 0, 0], self.oids2, self.t2) expected = 0.15 self.assertAlmostEqual(actual, expected, places=6) # root node not observed, but branch between (OTU3, OTU4) and root # is considered shared actual = self.weighted_unifrac([0, 0, 1, 1], [0, 0, 1, 0], self.oids2, self.t2) expected = 0.6 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_normalized_root_not_observed(self): # expected values computed by hand, these disagree with QIIME 1.9.1 # root node not observed, but branch between (OTU1, OTU2) and root # is considered shared actual = self.weighted_unifrac([1, 0, 0, 0], [1, 1, 0, 0], self.oids2, self.t2, normalized=True) expected = 0.1764705882 self.assertAlmostEqual(actual, expected, places=6) # root node not observed, but branch between (OTU3, OTU4) and root # is considered shared actual = self.weighted_unifrac([0, 0, 1, 1], [0, 0, 1, 0], self.oids2, self.t2, normalized=True) expected = 0.1818181818 self.assertAlmostEqual(actual, expected, places=6) def test_unweighted_unifrac_identity(self): for i in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[i], self.oids1, self.t1) expected = 0.0 self.assertAlmostEqual(actual, expected, places=6) def test_unweighted_unifrac_symmetry(self): for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.unweighted_unifrac( self.b1[j], self.b1[i], self.oids1, self.t1) self.assertAlmostEqual(actual, expected, places=6) def test_unweighted_unifrac_non_overlapping(self): # these communities only share the root node actual = self.unweighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 1.0 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( [1, 1, 1, 0, 0], [0, 0, 0, 1, 1], self.oids1, self.t1) expected = 1.0 self.assertAlmostEqual(actual, expected, places=6) def test_unweighted_unifrac(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # unweighted unifrac implementation # sample A versus all actual = self.unweighted_unifrac( self.b1[0], self.b1[1], self.oids1, self.t1) expected = 0.238095238095 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[0], self.b1[2], self.oids1, self.t1) expected = 0.52 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[0], self.b1[3], self.oids1, self.t1) expected = 0.52 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[0], self.b1[4], self.oids1, self.t1) expected = 0.545454545455 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[0], self.b1[5], self.oids1, self.t1) expected = 0.619047619048 self.assertAlmostEqual(actual, expected, places=6) # sample B versus remaining actual = self.unweighted_unifrac( self.b1[1], self.b1[2], self.oids1, self.t1) expected = 0.347826086957 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[1], self.b1[3], self.oids1, self.t1) expected = 0.347826086957 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[1], self.b1[4], self.oids1, self.t1) expected = 0.68 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[1], self.b1[5], self.oids1, self.t1) expected = 0.421052631579 self.assertAlmostEqual(actual, expected, places=6) # sample C versus remaining actual = self.unweighted_unifrac( self.b1[2], self.b1[3], self.oids1, self.t1) expected = 0.0 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[2], self.b1[4], self.oids1, self.t1) expected = 0.68 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[2], self.b1[5], self.oids1, self.t1) expected = 0.421052631579 self.assertAlmostEqual(actual, expected, places=6) # sample D versus remaining actual = self.unweighted_unifrac( self.b1[3], self.b1[4], self.oids1, self.t1) expected = 0.68 self.assertAlmostEqual(actual, expected, places=6) actual = self.unweighted_unifrac( self.b1[3], self.b1[5], self.oids1, self.t1) expected = 0.421052631579 self.assertAlmostEqual(actual, expected, places=6) # sample E versus remaining actual = self.unweighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 1.0 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac_identity(self): for i in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[i], self.oids1, self.t1) expected = 0.0 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac_symmetry(self): for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.weighted_unifrac( self.b1[j], self.b1[i], self.oids1, self.t1) self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac_non_overlapping(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # weighted unifrac implementation # these communities only share the root node actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 4.0 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # weighted unifrac implementation actual = self.weighted_unifrac( self.b1[0], self.b1[1], self.oids1, self.t1) expected = 2.4 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[2], self.oids1, self.t1) expected = 1.86666666667 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[3], self.oids1, self.t1) expected = 2.53333333333 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[4], self.oids1, self.t1) expected = 1.35384615385 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[5], self.oids1, self.t1) expected = 3.2 self.assertAlmostEqual(actual, expected, places=6) # sample B versus remaining actual = self.weighted_unifrac( self.b1[1], self.b1[2], self.oids1, self.t1) expected = 2.26666666667 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[1], self.b1[3], self.oids1, self.t1) expected = 0.933333333333 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[1], self.b1[4], self.oids1, self.t1) expected = 3.2 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[1], self.b1[5], self.oids1, self.t1) expected = 0.8375 self.assertAlmostEqual(actual, expected, places=6) # sample C versus remaining actual = self.weighted_unifrac( self.b1[2], self.b1[3], self.oids1, self.t1) expected = 1.33333333333 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[2], self.b1[4], self.oids1, self.t1) expected = 1.89743589744 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[2], self.b1[5], self.oids1, self.t1) expected = 2.66666666667 self.assertAlmostEqual(actual, expected, places=6) # sample D versus remaining actual = self.weighted_unifrac( self.b1[3], self.b1[4], self.oids1, self.t1) expected = 2.66666666667 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[3], self.b1[5], self.oids1, self.t1) expected = 1.33333333333 self.assertAlmostEqual(actual, expected, places=6) # sample E versus remaining actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 4.0 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac_identity_normalized(self): for i in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[i], self.oids1, self.t1, normalized=True) expected = 0.0 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac_symmetry_normalized(self): for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1, normalized=True) expected = self.weighted_unifrac( self.b1[j], self.b1[i], self.oids1, self.t1, normalized=True) self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac_non_overlapping_normalized(self): # these communities only share the root node actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1, normalized=True) expected = 1.0 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( [1, 1, 1, 0, 0], [0, 0, 0, 1, 1], self.oids1, self.t1, normalized=True) expected = 1.0 self.assertAlmostEqual(actual, expected, places=6) def test_weighted_unifrac_normalized(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # weighted unifrac implementation actual = self.weighted_unifrac( self.b1[0], self.b1[1], self.oids1, self.t1, normalized=True) expected = 0.6 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[2], self.oids1, self.t1, normalized=True) expected = 0.466666666667 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[3], self.oids1, self.t1, normalized=True) expected = 0.633333333333 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.338461538462 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[0], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.8 self.assertAlmostEqual(actual, expected, places=6) # sample B versus remaining actual = self.weighted_unifrac( self.b1[1], self.b1[2], self.oids1, self.t1, normalized=True) expected = 0.566666666667 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[1], self.b1[3], self.oids1, self.t1, normalized=True) expected = 0.233333333333 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[1], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.8 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[1], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.209375 self.assertAlmostEqual(actual, expected, places=6) # sample C versus remaining actual = self.weighted_unifrac( self.b1[2], self.b1[3], self.oids1, self.t1, normalized=True) expected = 0.333333333333 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[2], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.474358974359 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[2], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.666666666667 self.assertAlmostEqual(actual, expected, places=6) # sample D versus remaining actual = self.weighted_unifrac( self.b1[3], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.666666666667 self.assertAlmostEqual(actual, expected, places=6) actual = self.weighted_unifrac( self.b1[3], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.333333333333 self.assertAlmostEqual(actual, expected, places=6) # sample E versus remaining actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1, normalized=True) expected = 1.0 self.assertAlmostEqual(actual, expected, places=6) class FaithPDEdgeCasesTests(unittest.TestCase): # These tests were mostly ported from skbio's # skbio/diversity/alpha/tests/test_fatih_pd.py at SHA-256 a8c086b # note that not all tests were kept since the APIs are different. package = 'unifrac.tests' def write_table_tree(self, u_counts, otu_ids, sample_ids, tree): data = np.array([u_counts]).T bt = Table(data, otu_ids, sample_ids) ta = os.path.join(gettempdir(), 'table.biom') tr = os.path.join(gettempdir(), 'tree.biom') self.files_to_delete.append(ta) self.files_to_delete.append(tr) with biom_open(ta, 'w') as fhdf5: bt.to_hdf5(fhdf5, 'Table for unit testing') tree.write(tr) return ta, tr def faith_pd_work(self, u_counts, otu_ids, sample_ids, tree): ta, tr = self.write_table_tree(u_counts, otu_ids, sample_ids, tree) return faith_pd(ta, tr) def setUp(self): self.counts = np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2]) self.b1 = np.array([[1, 3, 0, 1, 0], [0, 2, 0, 4, 4], [0, 0, 6, 2, 1], [0, 0, 1, 1, 1]]) self.sids1 = list('ABCD') self.oids1 = ['OTU%d' % i for i in range(1, 6)] self.t1 = TreeNode.read(StringIO( '(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):' '0.0,(OTU4:0.75,OTU5:0.75):1.25):0.0)root;')) self.t1_w_extra_tips = TreeNode.read( StringIO('(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,(OTU4:' '0.75,(OTU5:0.25,(OTU6:0.5,OTU7:0.5):0.5):0.5):1.25):0.0' ')root;')) self.files_to_delete = [] def tearDown(self): for f in self.files_to_delete: try: os.remove(f) except OSError: pass def test_faith_pd_zero_branches_omitted(self): # also deleted branch length fo t2 = TreeNode.read(StringIO( '((OTU1:0.5,OTU2:0.5),(OTU3:1.0,(OTU4:0.5,' 'OTU5:0.75):1.0):1.0)root;' )) actual = self.faith_pd_work([1, 1, 0, 0, 0], self.oids1, ['foo'], t2) expected = 1.0 self.assertAlmostEqual(actual[0], expected) def test_faith_pd_none_observed(self): actual = self.faith_pd_work([0, 0, 0, 0, 0], self.oids1, ['foo'], self.t1) expected = 0.0 self.assertAlmostEqual(actual.values, expected) def test_faith_pd_biom_table_empty(self): table, tree = self.write_table_tree([], [], [], self.t1) self.assertRaises(ValueError, faith_pd, table, tree) def test_faith_pd_table_not_subset_tree(self): tree = TreeNode.read(StringIO('((OTU1:0.5,OTU3:1.0):1.0)root;')) table_ids = ['OTU1', 'OTU2'] table, tree = self.write_table_tree([1, 0], table_ids, ['foo'], tree) expected_message = "The table does not appear to be completely "\ "represented by the phylogeny." with self.assertRaisesRegex(ValueError, expected_message): faith_pd(table, tree) def test_faith_pd_all_observed(self): actual = self.faith_pd_work([1, 1, 1, 1, 1], self.oids1, ['foo'], self.t1) expected = sum(n.length for n in self.t1.traverse() if n.length is not None) self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work([1, 2, 3, 4, 5], self.oids1, ['foo'], self.t1) expected = sum(n.length for n in self.t1.traverse() if n.length is not None) self.assertAlmostEqual(actual.values, expected) def test_faith_pd(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation unifrac's initial # phylogenetic diversity implementation actual = self.faith_pd_work(self.b1[0], self.oids1, [self.sids1[0]], self.t1) expected = 4.5 self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work(self.b1[1], self.oids1, [self.sids1[1]], self.t1) expected = 4.75 self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work(self.b1[2], self.oids1, [self.sids1[2]], self.t1) expected = 4.75 self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work(self.b1[3], self.oids1, [self.sids1[3]], self.t1) expected = 4.75 self.assertAlmostEqual(actual.values, expected) def test_faith_pd_extra_tips(self): # results are the same despite presences of unobserved tips in tree actual = self.faith_pd_work(self.b1[0], self.oids1, [self.sids1[0]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[0], self.oids1, [self.sids1[0]], self.t1) self.assertAlmostEqual(actual.values, expected.values) actual = self.faith_pd_work(self.b1[1], self.oids1, [self.sids1[1]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[1], self.oids1, [self.sids1[1]], self.t1) self.assertAlmostEqual(actual.values, expected.values) actual = self.faith_pd_work(self.b1[2], self.oids1, [self.sids1[2]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[2], self.oids1, [self.sids1[2]], self.t1) self.assertAlmostEqual(actual.values, expected.values) actual = self.faith_pd_work(self.b1[3], self.oids1, [self.sids1[3]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[3], self.oids1, [self.sids1[3]], self.t1) self.assertAlmostEqual(actual.values, expected.values) def test_faith_pd_minimal(self): # two tips tree = TreeNode.read(StringIO('(OTU1:0.25, OTU2:0.25)root;')) actual = self.faith_pd_work([1, 0], ['OTU1', 'OTU2'], ['foo'], tree) expected = 0.25 self.assertEqual(actual.values, expected) def test_faith_pd_series_name(self): tree = TreeNode.read(StringIO('(OTU1:0.25, OTU2:0.25)root;')) actual = self.faith_pd_work([1, 0], ['OTU1', 'OTU2'], ['foo'], tree) self.assertEqual("faith_pd", actual.name) def test_faith_pd_root_not_observed(self): # expected values computed by hand tree = TreeNode.read( StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)' 'root;')) otu_ids = ['OTU%d' % i for i in range(1, 5)] # root node not observed, but branch between (OTU1, OTU2) and root # is considered observed actual = self.faith_pd_work([1, 1, 0, 0], otu_ids, ['foo'], tree) expected = 0.6 self.assertAlmostEqual(actual[0], expected) # root node not observed, but branch between (OTU3, OTU4) and root # is considered observed actual = self.faith_pd_work([0, 0, 1, 1], otu_ids, ['foo'], tree) expected = 2.3 self.assertAlmostEqual(actual[0], expected) def test_faith_pd_invalid_input(self): # tests are based of skbio tests, checking for duplicate ids, # negative counts are not included but should be incorporated # tree has duplicated tip ids tree = TreeNode.read( StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)' 'root;')) otu_ids = ['OTU%d' % i for i in range(1, 5)] u_counts = [1, 1, 0, 0] data = np.array([u_counts]).T bt = Table(data, otu_ids, ['u']) ta = os.path.join(gettempdir(), 'table.biom') tr = os.path.join(gettempdir(), 'tree.biom') self.files_to_delete.append(ta) self.files_to_delete.append(tr) with biom_open(ta, 'w') as fhdf5: bt.to_hdf5(fhdf5, 'Table for unit testing') tree.write(tr) self.assertRaises(IOError, faith_pd, 'dne.biom', tr) self.assertRaises(IOError, faith_pd, ta, 'dne.tre') if __name__ == "__main__": unittest.main() unifrac-1.3/unifrac/tests/test_methods.py000066400000000000000000000112651442154206300206570ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, QIIME 2 development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import unittest import pkg_resources import numpy as np import numpy.testing as npt from unifrac import meta class StateUnifracTests(unittest.TestCase): package = 'unifrac.tests' def setUp(self): super().setUp() self.table1 = self.get_data_path('e1.biom') self.table2 = self.get_data_path('e2.biom') self.tree1 = self.get_data_path('t1.newick') self.tree2 = self.get_data_path('t2.newick') self.not_a_table = self.tree1 self.not_a_tree = self.table1 def get_data_path(self, filename): # adapted from qiime2.plugin.testing.TestPluginBase return pkg_resources.resource_filename(self.package, 'data/%s' % filename) def test_meta_unifrac(self): """meta_unifrac should give correct result on sample trees""" result = meta([self.table1, self.table2], [self.tree1, self.tree2], weights=[1, 1], consolidation='skipping-missing-values', method='unweighted') u1_distances = np.array([[0, 10/16., 8/13.], [10/16., 0, 8/17.], [8/13., 8/17., 0]]) u2_distances = np.array([[0, 11/14., 6/13.], [11/14., 0, 7/13.], [6/13., 7/13., 0]]) exp = (u1_distances + u2_distances) / 2 npt.assert_almost_equal(exp, result.data) self.assertEqual(tuple('ABC'), result.ids) def test_meta_unifrac_unbalanced(self): with self.assertRaisesRegex(ValueError, ("Number of trees and tables " "must be the same.")): meta((self.table1, ), (self.tree1, self.tree2), method='unweighted') with self.assertRaisesRegex(ValueError, ("Number of trees and tables " "must be the same.")): meta((self.table1, self.table2), (self.tree1, ), method='unweighted') def test_meta_unifrac_unbalanced_weights(self): with self.assertRaisesRegex(ValueError, "Number of weights does not " "match number of trees and " "tables."): meta((self.table1, self.table2), (self.tree1, self.tree2), weights=(1, 2, 3), ) def test_meta_unifrac_missing(self): with self.assertRaisesRegex(ValueError, "No trees specified."): meta((self.table1, ), tuple(), method='unweighted') with self.assertRaisesRegex(ValueError, "No tables specified."): meta(tuple(), (self.tree1, ), method='unweighted') def test_meta_validation(self): with self.assertRaisesRegex(ValueError, "Table does not appear to be a " "BIOM-Format v2.1"): meta((self.table1, self.not_a_table), (self.tree1, self.tree2), method='unweighted') with self.assertRaisesRegex(ValueError, "The phylogeny does not " "appear to be newick"): meta((self.table1, self.table2), (self.tree1, self.not_a_tree), method='unweighted') def test_meta_unifrac_no_method(self): with self.assertRaisesRegex(ValueError, "No method specified."): meta((self.table1, ), (self.tree1, )) def test_meta_unifrac_bad_method(self): with self.assertRaisesRegex(ValueError, r"Method \(bar\) " "unrecognized."): meta((self.table1, ), (self.tree1, ), method='bar') def test_meta_unifrac_bad_consolidation(self): with self.assertRaisesRegex(ValueError, r"Consolidation \(foo\) unrecognized."): meta((self.table1, ), (self.tree1, ), method='unweighted', consolidation='foo') def test_meta_unifrac_alpha_not_generalized(self): with self.assertRaisesRegex(ValueError, "The alpha parameter can"): meta((self.table1, ), (self.tree1, ), method='unweighted', alpha=1, consolidation='skipping_missing_matrices') if __name__ == "__main__": unittest.main()