pax_global_header00006660000000000000000000000064135107230100014502gustar00rootroot0000000000000052 comment=2a0b8175b814f79d9823d68215e8b3021d10151b unifrac-0.10.0/000077500000000000000000000000001351072301000132075ustar00rootroot00000000000000unifrac-0.10.0/.gitignore000066400000000000000000000020741351072301000152020ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject # vim *.swp *.swo *~ # ssu sucpp/ssu unifrac-0.10.0/.gitmodules000066400000000000000000000000001351072301000153520ustar00rootroot00000000000000unifrac-0.10.0/.travis.yml000066400000000000000000000113771351072301000153310ustar00rootroot00000000000000sudo: required dist: trusty os: - linux - osx addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-4.9 - g++-4.9 - libhdf5-serial-dev language: - cpp env: - PYTHON_VERSION=3.6 EDITABLE_PIP=1 - PYTHON_VERSION=3.6 EDITABLE_PIP=0 before_install: - sudo ln -s /usr/bin/gcc-4.9 /usr/local/bin/gcc - sudo ln -s /usr/bin/g++-4.9 /usr/local/bin/g++ - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CC_FOR_BUILD=/usr/local/bin/gcc ; else export CC_FOR_BUILD=clang ; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXX_FOR_BUILD=/usr/local/bin/g++ ; else export CXX_FOR_BUILD=clang++ ; fi - export PERFORMING_CONDA_BUILD=True # verify compilter versions - $CC_FOR_BUILD -v - $CXX_FOR_BUILD -v # https://github.com/springmeyer/travis-coredump/blob/master/.travis.yml - ulimit -c - ulimit -a -S - ulimit -a -H - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; fi # osx build has only one core - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; fi - chmod +x miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - export PATH="$HOME/miniconda/bin:$PATH" - hash -r - conda update --yes conda install: - conda create --yes -n test-env python=$PYTHON_VERSION - source activate test-env - conda config --add channels conda-forge - conda install --yes cython "hdf5>=1.8.17" biom-format numpy "h5py>=2.7.0" "scikit-bio>=0.5.1" flake8 nose # needed for the hdf5 dev tools - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then SED='sed -i'; else SED='sed -i '"'"''"'"' '; fi # make sure hdf5 is using the compiler we want to use for this build - $SED "s|^CXXBASE=.*|CXXBASE=$CXX_FOR_BUILD|" `which h5c++` - $SED "s|^CXXLINKERBASE=.*|CXXLINKERBASE=$CXX_FOR_BUILD|" `which h5c++` # remove these flags for compiling with gcc on linux - if [[ "$TRAVIS_OS_NAME" == 
"linux" ]]; then $SED 's/\(H5BLD_CXXFLAGS="\)\(.*\)\("\)/\1 \3/' `which h5c++` ; $SED 's/\(H5BLD_CPPFLAGS="\)\(.*\)\("\)/\1 \3/' `which h5c++` ; $SED 's/\(H5BLD_LDFLAGS="\)\(.*\)\("\)/\1 \3/' `which h5c++` ; fi # make sure dynamic linking to shared libraries is enabled - $SED 's/^STATIC_AVAILABLE=.*/STATIC_AVAILABLE="no"/' `which h5c++` - export CC=`which h5c++` - pushd sucpp - make test - make main - make api - popd # verify both installation methods - 'if [ ${EDITABLE_PIP} ]; then pip install -e . || travis_terminate 1; else pip install . || travis_terminate 1; fi' # install R for api example test - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get update && sudo apt-get install r-base ; else brew install r ; fi - sudo R -e 'install.packages("Rcpp", repos="http://cran.us.r-project.org")' - pushd sucpp # make c api for testing - make capi_test - popd script: - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then MD5=md5sum; else MD5='md5 -r'; fi - ulimit -c unlimited -S - pushd sucpp - ./test_su - ./test_api - popd - nosetests - flake8 unifrac setup.py - ./sucpp/ssu # santiy test io - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./sucpp/ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm -m unweighted -n 2; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./sucpp/ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm -m unweighted; fi - python -c "import skbio; dm = skbio.DistanceMatrix.read('ci/test.dm')" # merge test - ./sucpp/ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm.start0.stop3 -m unweighted --mode partial --start 0 --stop 3 - ./sucpp/ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm.start3.stop5 -m unweighted --mode partial --start 3 --stop 5 - ./sucpp/ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.dm.partial --mode merge-partial --partial-pattern "ci/test.dm.start*" - exp=$($MD5 ci/test.dm | awk '{ print $1 }') - obs=$($MD5 ci/test.dm.partial | awk '{ print $1 }') - python -c "assert '${obs}' == '${exp}'" - ./sucpp/faithpd -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre -o ci/test.faith.obs - tail -n +2 ci/test.faith.obs > ci/test.faith.header-removed.obs - exp1=$($MD5 unifrac/tests/data/test.faith.exp | awk '{ print $1 }') - obs1=$($MD5 ci/test.faith.header-removed.obs | awk '{ print $1 }') - python -c "assert '${obs1}' == '${exp1}'" # test example API's - pushd sucpp - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./capi_test 2 ; else ./capi_test 1 ; fi - make rapi_test - popd unifrac-0.10.0/LICENSE000066400000000000000000000027721351072301000142240ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2016-2017, UniFrac development team. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. unifrac-0.10.0/MANIFEST.in000066400000000000000000000003641351072301000147500ustar00rootroot00000000000000graft unifrac include sucpp/*.biom include sucpp/*.tre include sucpp/*.hpp include sucpp/*.cpp include sucpp/*.sh include sucpp/Makefile global-exclude *.pyc global-exclude *.pyo global-exclude .git global-exclude *.so global-exclude .*.swp unifrac-0.10.0/README.md000066400000000000000000000235161351072301000144750ustar00rootroot00000000000000# UniFrac ##### Canonically pronounced *yew-nih-frak* [![Build Status](https://travis-ci.org/biocore/unifrac.svg?branch=master)](https://travis-ci.org/biocore/unifrac) The *de facto* repository for high-performance phylogenetic diversity calculations. The methods in this repository are based on an implementation of the [Strided State UniFrac](https://www.nature.com/articles/s41592-018-0187-8) algorithm which is faster, and uses less memory than [Fast UniFrac](http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html). Strided State UniFrac supports [Unweighted UniFrac](http://aem.asm.org/content/71/12/8228.abstract), [Weighted UniFrac](http://aem.asm.org/content/73/5/1576), [Generalized UniFrac](https://academic.oup.com/bioinformatics/article/28/16/2106/324465/Associating-microbiome-composition-with), [Variance Adjusted UniFrac](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-118) and [meta UniFrac](http://www.pnas.org/content/105/39/15076.short). This repository also includes Stacked Faith (manuscript in preparation), a method for calculating Faith's PD that is faster and uses less memory than the Fast UniFrac-based [reference implementation](http://scikit-bio.org/). This repository produces a C API exposed via a shared library which can be linked against by any programming language. # Citation A detailed description of the Strided State UniFrac algorithm can be found in [McDonald et al. 2018 Nature Methods](https://www.nature.com/articles/s41592-018-0187-8). Please note that this package implements multiple UniFrac variants, which may have their own citation. Details can be found in the help output from the command line interface in the citations section, and is included immediately below: ssu For UniFrac, please see: McDonald et al. Nature Methods 2018; DOI: 10.1038/s41592-018-0187-8 Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005 Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06 Hamady et al. ISME 2010; DOI: 10.1038/ismej.2009.97 Lozupone et al. ISME 2011; DOI: 10.1038/ismej.2010.133 For Generalized UniFrac, please see: Chen et al. Bioinformatics 2012; DOI: 10.1093/bioinformatics/bts342 For Variance Adjusted UniFrac, please see: Chang et al. 
BMC Bioinformatics 2011; DOI: 10.1186/1471-2105-12-118 faithpd For Faith's PD, please see: Faith Biological Conservation 1992; DOI: 10.1016/0006-3207(92)91201-3 # Install At this time, there are two primary ways to install the library. The first is through QIIME 2, and the second is via `pip`. It is also possible to clone the repository and install using either the `sucpp/Makefile` or `setup.py`. Compilation has been performed on both LLVM 9.0.0 (OS X >= 10.12) or GCC 4.9.2 (Centos >= 6) and HDF5 >= 1.8.17. Python installation requires Python >= 3.5, NumPy >= 1.12.1, scikit-bio >= 0.5.1, and Cython >= 0.28.3. Installation time should be a few minutes at most. ## Install (QIIME2) The easiest way to use this library is through [QIIME2](https://docs.qiime2.org/2019.4/install/). The implementation of this algorithm is installed by default and is available under `qiime diversity beta-phylogenetic-alt`. ## Install (native) To install, first the binary needs to be compiled. This assumes that the HDF5 toolchain and libraries are available. More information about how to setup the stack can be found [here](https://support.hdfgroup.org/HDF5/Tutor/compile.html). Assuming `h5c++` is in your path, the following should work: pip install -e . **Note**: if you are using `conda` we recommend installing HDF5 using the `conda-forge` channel, for example: conda install -c conda-forge hdf5 # Examples of use Below are a few light examples of different ways to use this library. ## QIIME2 To use Strided State UniFrac through QIIME2, you need to provide a `FeatureTable[Frequency]` and a `Phylogeny[Rooted]` artifacts. An example of use is: qiime diversity beta-phylogenetic --i-table table-evenly-samples.qza \ --i-phylogeny a-tree.qza \ --o-distance-matrix resulting-distance-matrix.qza \ --p-metric unweighted_unifrac ## Python The library can be accessed directly from within Python. If operating in this mode, the API methods are expecting a filepath to a BIOM-Format V2.1.0 table, and a filepath to a Newick formatted phylogeny. $ python Python 3.5.4 | packaged by conda-forge | (default, Aug 10 2017, 01:41:15) [GCC 4.2.1 Compatible Apple LLVM 6.1.0 (clang-602.0.53)] on darwin Type "help", "copyright", "credits" or "license" for more information. >>> import unifrac >>> dir(unifrac) ['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_api', '_meta', '_methods', 'generalized', 'meta', 'pkg_resources', 'ssu', 'stacked_faith', 'unweighted', 'weighted_normalized', 'weighted_unnormalized'] >>> print(unifrac.unweighted.__doc__) Compute Unweighted UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default of 1. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Unweighted UniFrac was originally described in [1]_. 
Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). >>> print(unifrac.faith_pd.__doc__) Execute a call to the Stacked Faith API in the UniFrac package Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree Returns ------- pd.Series Series of Faith's PD for each sample in `biom_filename` Raises ------ IOError If the tree file is not found If the table is not found If the table is empty ## Command line The methods can also be used directly through the command line after install: $ which ssu /Users//miniconda3/envs/qiime2-20xx.x/bin/ssu $ ssu --help usage: ssu -i -o -m [METHOD] -t [-n threads] [-a alpha] [--vaw] -i The input BIOM table. -t The input phylogeny in newick. -m The method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]. -o The output distance matrix. -n [OPTIONAL] The number of threads, default is 1. -a [OPTIONAL] Generalized UniFrac alpha, default is 1. -f [OPTIONAL] Bypass tips, reduces compute by about 50%. --vaw [OPTIONAL] Variance adjusted, default is to not adjust for variance. Citations: For UniFrac, please see: Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005 Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06 Hamady et al. ISME 2010; DOI: 10.1038/ismej.2009.97 Lozupone et al. ISME 2011; DOI: 10.1038/ismej.2010.133 For Generalized UniFrac, please see: Chen et al. Bioinformatics 2012; DOI: 10.1093/bioinformatics/bts342 For Variance Adjusted UniFrac, please see: Chang et al. BMC Bioinformatics 2011; DOI: 10.1186/1471-2105-12-118 $ which faithpd /Users//miniconda3/envs/qiime2-20xx.x/bin/faithpd $ faithpd --help usage: faithpd -i -t -o -i The input BIOM table. -t The input phylogeny in newick. -o The output series. Citations: For Faith's PD, please see: Faith Biological Conservation 1992; DOI: 10.1016/0006-3207(92)91201-3 ## Shared library access In addition to the above methods to access UniFrac, it is also possible to link against the shared library. The C API is described in `sucpp/api.hpp`, and examples of linking against this API can be found in `examples/`. ## Minor test dataset A small test `.biom` and `.tre` can be found in `sucpp/`. 
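These files are also convenient for a quick smoke test of the Python API described above. A minimal sketch, assuming the package is installed and commands are run from the repository root:

    import unifrac

    # both arguments are filepaths: a BIOM-Format 2.1 table and a Newick tree
    dm = unifrac.unweighted('sucpp/test.biom', 'sucpp/test.tre')
    print(dm.shape)  # (6, 6); unifrac.unweighted returns an skbio.DistanceMatrix

The command line interface can be exercised against the same files.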
An example with expected output is below, and should execute in 10s of milliseconds: $ ssu -i sucpp/test.biom -t sucpp/test.tre -m unweighted -o test.out $ cat test.out Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 Sample1 0 0.2 0.5714285714285714 0.6 0.5 0.2 Sample2 0.2 0 0.4285714285714285 0.6666666666666666 0.6 0.3333333333333333 Sample3 0.5714285714285714 0.4285714285714285 0 0.7142857142857143 0.8571428571428571 0.4285714285714285 Sample4 0.6 0.6666666666666666 0.7142857142857143 0 0.3333333333333333 0.4 Sample5 0.5 0.6 0.8571428571428571 0.3333333333333333 0 0.6 Sample6 0.2 0.3333333333333333 0.4285714285714285 0.4 0.6 0 unifrac-0.10.0/ci/000077500000000000000000000000001351072301000136025ustar00rootroot00000000000000unifrac-0.10.0/ci/linux-64.txt000066400000000000000000000000661351072301000157330ustar00rootroot00000000000000cython flake8 nose scikit-bio biom-format h5py==2.7.0 unifrac-0.10.0/setup.py000066400000000000000000000063101351072301000147210ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, QIIME 2 development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- from setuptools import setup, find_packages from setuptools.extension import Extension from setuptools.command.build_ext import build_ext as build_ext_orig import numpy as np import subprocess import os import sys SUCPP = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sucpp/') PREFIX = os.environ.get('PREFIX', "") base = ["cython >= 0.26", "biom-format", "numpy", "h5py >= 2.7.0", "scikit-bio >= 0.5.1"] test = ["nose", "flake8"] all_deps = base + test # https://stackoverflow.com/a/33308902/379593 if sys.platform == 'darwin': os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.12' def compile_ssu(): """Clean and compile the SSU binary""" # clean the target subprocess.call(['make', 'clean'], cwd=SUCPP) cmd = ['make', 'test'] ret = subprocess.call(cmd, cwd=SUCPP) if ret != 0: raise Exception('Error compiling ssu!') cmd = ['make', 'main'] ret = subprocess.call(cmd, cwd=SUCPP) if ret != 0: raise Exception('Error compiling ssu!') cmd = ['make', 'api'] ret = subprocess.call(cmd, cwd=SUCPP) if ret != 0: raise Exception('Error compiling ssu!') class build_ext(build_ext_orig): """Pre-installation for any time an Extension is built""" def run(self): self.run_compile_ssu() super().run() def run_compile_ssu(self): self.execute(compile_ssu, [], 'Compiling SSU') if PREFIX: self.copy_file(os.path.join(SUCPP, 'libssu.so'), os.path.join(PREFIX, 'lib/')) if sys.platform == "darwin": LINK_ARGS = ['-Wl,sucpp/libssu.so'] else: LINK_ARGS = [] USE_CYTHON = os.environ.get('USE_CYTHON', True) ext = '.pyx' if USE_CYTHON else '.cpp' extensions = [Extension("unifrac._api", sources=["unifrac/_api" + ext, "sucpp/api.cpp"], language="c++", extra_compile_args=["-std=c++11"], extra_link_args=["-std=c++11"] + LINK_ARGS, include_dirs=[np.get_include()] + ['sucpp/'], libraries=['ssu'])] if USE_CYTHON: from Cython.Build import cythonize extensions = cythonize(extensions) with open('README.md') as f: long_description = f.read() setup( name="unifrac", version="0.10.0", packages=find_packages(), author="Daniel McDonald", license='BSD-3-Clause', author_email="wasade@gmail.com", url="https://github.com/biocore/unifrac", description="High performance phylogenetic diversity calculations", 
long_description=long_description, long_description_content_type='text/markdown', ext_modules=extensions, install_requires=base, extras_require={'test': test, 'all': all_deps}, cmdclass={'build_ext': build_ext}, package_data={ 'unifrac.tests': ['data/*', ]} ) unifrac-0.10.0/sucpp/000077500000000000000000000000001351072301000143415ustar00rootroot00000000000000unifrac-0.10.0/sucpp/Makefile000066400000000000000000000042211351072301000160000ustar00rootroot00000000000000CXX := h5c++ PLATFORM := $(shell uname -s) COMPILER := $(shell ($(CXX) -v 2>&1) | tr A-Z a-z ) ifdef DEBUG OPT = -O0 -DDEBUG=1 --debug -g -ggdb else ifneq (,$(findstring gcc,$(COMPILER))) OPT = -O4 TGTFLAGS = -fwhole-program else OPT = -O3 endif endif ifeq ($(PREFIX),) PREFIX := $(CONDA_PREFIX) endif ifeq ($(PLATFORM),Darwin) AVX2 := $(shell sysctl -a | grep -c AVX2) LDDFLAGS = -dynamiclib -install_name @rpath/libssu.so else AVX2 := $(shell grep "^flags" /proc/cpuinfo | head -n 1 | grep -c avx2) LDDFLAGS = -shared endif ifeq ($(PERFORMING_CONDA_BUILD),True) CPPFLAGS += -mtune=generic else CPPFLAGS += -mfma -march=native endif CPPFLAGS += -Wall -Wextra -std=c++11 -pedantic -I. $(OPT) -fPIC test: tree.o test_su.cpp biom.o unifrac.o unifrac_task.o api.o $(CXX) $(CPPFLAGS) -Wno-unused-parameter test_su.cpp -o test_su tree.o biom.o unifrac.o unifrac_task.o api.o -pthread $(CXX) $(CPPFLAGS) -Wno-unused-parameter test_api.cpp -o test_api tree.o biom.o unifrac.o unifrac_task.o api.o -pthread main: tree.o biom.o unifrac.o cmd.o unifrac_task.o api.o $(CXX) $(CPPFLAGS) su.cpp -o ssu tree.o biom.o unifrac.o cmd.o unifrac_task.o api.o -lhdf5_cpp -pthread $(CXX) $(CPPFLAGS) faithpd.cpp -o faithpd tree.o biom.o unifrac.o cmd.o unifrac_task.o api.o -lhdf5_cpp -pthread cp ssu ${PREFIX}/bin/ cp faithpd ${PREFIX}/bin/ rapi_test: main mkdir -p ~/.R if [ -a ~/.R/Makevars ] ; \ then \ echo "WARNING: OVERWRITING ~/.R/Makevars" ; \ echo "The original Makevars file has been copied to ~/.R/Makevars" ;\ cp ~/.R/Makevars Makevars-original ; \ fi; echo CXX1X=h5c++ > ~/.R/Makevars echo CXX=h5c++ >> ~/.R/Makevars echo CC=h5c++ >> ~/.R/Makevars Rscript R_interface/rapi_test.R api: tree.o biom.o unifrac.o cmd.o unifrac_task.o $(CXX) $(CPPFLAGS) api.cpp -c -o api.o -fPIC $(CXX) $(LDDFLAGS) -o libssu.so tree.o biom.o unifrac.o cmd.o unifrac_task.o api.o -lc -lhdf5_cpp -L$(PREFIX)/lib cp libssu.so ${PREFIX}/lib/ capi_test: api gcc -std=c99 capi_test.c -lssu -L${PREFIX}/lib -Wl,-rpath,${PREFIX}/lib -o capi_test export LD_LIBRARY_PATH="${PREFIX}/lib":"./capi_test" %.o: %.cpp %.hpp $(CXX) $(CPPFLAGS) -c $< -o $@ clean: -rm -f *.o ssu unifrac-0.10.0/sucpp/R_interface/000077500000000000000000000000001351072301000165625ustar00rootroot00000000000000unifrac-0.10.0/sucpp/R_interface/README.md000066400000000000000000000024421351072301000200430ustar00rootroot00000000000000# R interface for Strided State Unifrac This provides an R interface for Unweighted Unifrac. This interface works using R's Rcpp library. To load this, in R use `library(Rcpp)` and `sourceCpp("su_R.cpp")`. The Unifrac method takes in three arguments: a file path to an HDF5 formatted BIOM table, a filepath to a newick formatted tree file, and the number of threads to be used It is expected that the observations described in the BIOM table correspond to a subset of the tips of the input tree. 
The method returns a list containing an `int` `n_samples`, denoting the number of samples in the table, a `boolean` `is_upper_triangle`, denoting whether Unifrac generated a square matrix and if it has returned the upper triangle , an `int` `cf_size`, denoting the size of the condensed form of the matrix, and `c_form`, an array representation of the condensed form of the matrix, obtained by taking the upper triangle. ```R > library(Rcpp) > sourceCpp("su_R.cpp") > table = "../test.biom" > tree = "../test.tre" > nthreads = 2 > unif = unifrac(table, tree, nthreads) > unif $n_samples [1] 6 $is_sqaure [1] TRUE $cf_size [1] 15 $c_form [1] 0.2000000 0.5714286 0.6000000 0.5000000 0.2000000 0.4285714 0.6666667 [8] 0.6000000 0.3333333 0.7142857 0.8571429 0.4285714 0.3333333 0.4000000 [15] 0.6000000 ``` unifrac-0.10.0/sucpp/R_interface/rapi_test.R000066400000000000000000000020431351072301000206760ustar00rootroot00000000000000library(Rcpp) equals <- function(x, y, msg){ if (x!=y) stop(msg) } aboutEquals <- function(x, y, msg){ if((x-y)>0.005) stop(msg) } source = "su_R.cpp" sourceCpp(source) table = "test.biom" tree = "test.tre" nthreads = 1 print('Testing UniFrac..') unif = unifrac(table, tree, nthreads) exp = c(0.2000000, 0.5714286, 0.6000000, 0.5000000, 0.2000000, 0.4285714, 0.6666667, 0.6000000, 0.3333333, 0.7142857, 0.8571429, 0.4285714, 0.3333333, 0.4000000, 0.6000000) equals(unif["n_samples"][[1]], 6, "n_samples != 6") equals(unif["cf_size"][[1]], 15, "cf_size != 15") equals(unif["is_upper_triangle"][[1]], TRUE, "is_upper_triagnle != TRUE") for ( i in 1:15){ aboutEquals(unif["c_form"][[1]][i], exp[i], "Output not as expected") } print('Success.') print('Testing Faith PD..') faith = faith_pd(table, tree) exp = c(4, 5, 6, 3, 2, 5) equals(faith["n_samples"][[1]], 6, "n_samples != 6") for ( i in 1:6){ aboutEquals(faith["faith_pd"][[1]][i], exp[i], "Output not as expected") } print('Success.') print('All tests pass') unifrac-0.10.0/sucpp/R_interface/test.biom000066400000000000000000001020101351072301000204030ustar00rootroot00000000000000HDF  `  xTREE`HEAPX observationsample8 @id ` @type ` H format-url` P format-version@ H generated-by` Hcreation-date` H shape@ 0 nnz@TREEhHEAPX8matrixidsmetadatagroup-metadata GCOL No Table ID otu tablehttp://biom-format.orgexample2014-07-29T16:15:43.318377GG_OTU_4GG_OTU_5GG_OTU_3 GG_OTU_2 GG_OTU_1 Sample4 Sample5 Sample6Sample3Sample2Sample18SNODxQQSTREE&HEAPX dataindicesindptr8SNOD(NNQBK L@N ?@4 4 deflateSPTREE'8(SNOD80x9x^c``AB P>8htu4} Tx^cbF fb fb&(ĆɃ@|%x^c````b6 b^ b-x^```H`. 
1a|N4>B泡فg8Yx^c`0?ؓG @i(.} Tx^cd```bF(LH|al$ x^c````bV bn b~ `7x^cg``H` fGh|4>/  deflateH1SXTREE!_(  deflate:SXTREE( deflateCSPTREE"( L@NTREE&HEAPX`NPNQTREE&HEAPX QPQSTREEVHEAPX8Smatrixidsmetadatagroup-metadata `TVTREEaHEAPX Vdataindicesindptr8SNOD(HpHu8T`TV~~Ѐ ?@4 4 deflatePYSPTREE"(SNOD@Xbl  deflatecSXTREE (  deflatemSXTREE( deflateXvSPTREE$)~ЀTREEaHEAPXPpTREEaHEAPXPunifrac-0.10.0/sucpp/R_interface/test.tre000066400000000000000000000001021351072301000202460ustar00rootroot00000000000000(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1); unifrac-0.10.0/sucpp/affinity.hpp000066400000000000000000000063661351072301000166760ustar00rootroot00000000000000#define _GNU_SOURCE #include #include #include #include #include #include #ifdef __LINUX__ #include #endif #ifdef __APPLE__ #include #include #include #include #include // OSX code adapted from // http://yyshen.github.io/2015/01/18/binding_threads_to_cores_osx.html // these macros and methods don't exist on OSX #define SYSCTL_CORE_COUNT "machdep.cpu.core_count" typedef struct cpu_set { uint32_t count; } cpu_set_t; static inline void CPU_ZERO(cpu_set_t *cs) { cs->count = 0; } static inline void CPU_SET(int num, cpu_set_t *cs) { cs->count |= (1 << num); } static inline int CPU_ISSET(int num, cpu_set_t *cs) { return (cs->count & (1 << num)); } static inline int CPU_COUNT(cpu_set_t *cs) { return __builtin_popcount(cs->count); } #define CPU_SETSIZE 32 int sched_getaffinity(pid_t pid, size_t cpu_size, cpu_set_t *cpu_set) { int32_t core_count = 0; size_t len = sizeof(core_count); int ret = sysctlbyname(SYSCTL_CORE_COUNT, &core_count, &len, 0, 0); if (ret) { return -1; } cpu_set->count = 0; for (int i = 0; i < core_count; i++) { cpu_set->count |= (1 << i); } return 0; } int pthread_setaffinity_np(pthread_t thread, size_t cpu_size, cpu_set_t *cpu_set) { thread_port_t mach_thread; int core = 0; for (core = 0; core < 8 * cpu_size; core++) { if (CPU_ISSET(core, cpu_set)) break; } thread_affinity_policy_data_t policy = { core }; mach_thread = pthread_mach_thread_np(thread); thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1); return 0; } #endif #define handle_error_en(en, msg) \ do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0) int bind_to_core(int core) { /* bind the calling thread to the requested core * * The use of this method is for better NUMA utilization. The * default NUMA policy is local, where memory is allocated on the NUMA node * relative to the core if possible. The intention with this method is to * bind to a core first, and then allocate memory. A beneficial side effect * is that threads should not hop between cores either. * * This method is cgroup safe. */ // https://stackoverflow.com/a/11583550/19741 // http://blog.saliya.org/2015/07/get-and-set-process-affinity-in-c.html pthread_t thread = pthread_self(); pid_t pid = getpid(); cpu_set_t current_set, new_set; int j, ret; CPU_ZERO(¤t_set); CPU_ZERO(&new_set); ret = sched_getaffinity(pid, sizeof(current_set), ¤t_set); // find which core in our cpu_set corresponds to the callers // request int target = -1; for(j = 0; j < CPU_SETSIZE; j++) { if(CPU_ISSET(j, ¤t_set)) { target++; } if(target == core) break; } if(target != core) { fprintf(stderr, "Unable to bind this thread to core %d. 
Are sufficient processors available?", thread); return -1; } CPU_SET(j, &new_set); int serr = pthread_setaffinity_np(thread, sizeof(new_set), &new_set); return serr; } unifrac-0.10.0/sucpp/api.cpp000066400000000000000000000573711351072301000156330ustar00rootroot00000000000000#include "api.hpp" #include "biom.hpp" #include "tree.hpp" #include "unifrac.hpp" #include #include #include #include #define CHECK_FILE(filename, err) if(!is_file_exists(filename)) { \ return err; \ } #define SET_METHOD(requested_method, err) Method method; \ if(std::strcmp(requested_method, "unweighted") == 0) \ method = unweighted; \ else if(std::strcmp(requested_method, "weighted_normalized") == 0) \ method = weighted_normalized; \ else if(std::strcmp(requested_method, "weighted_unnormalized") == 0) \ method = weighted_unnormalized; \ else if(std::strcmp(requested_method, "generalized") == 0) \ method = generalized; \ else { \ return err; \ } #define PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) std::ifstream ifs(tree_filename); \ std::string content = std::string(std::istreambuf_iterator(ifs), \ std::istreambuf_iterator()); \ su::BPTree tree = su::BPTree(content); \ su::biom table = su::biom(biom_filename); \ if(table.n_samples <= 0 | table.n_obs <= 0) { \ return table_empty; \ } \ std::string bad_id = su::test_table_ids_are_subset_of_tree(table, tree); \ if(bad_id != "") { \ return table_and_tree_do_not_overlap; \ } \ std::unordered_set to_keep(table.obs_ids.begin(), \ table.obs_ids.end()); \ su::BPTree tree_sheared = tree.shear(to_keep).collapse(); using namespace su; using namespace std; // https://stackoverflow.com/a/19841704/19741 bool is_file_exists(const char *fileName) { std::ifstream infile(fileName); return infile.good(); } void destroy_stripes(vector &dm_stripes, vector &dm_stripes_total, unsigned int n_samples, unsigned int stripe_start, unsigned int stripe_stop) { unsigned int n_rotations = (n_samples + 1) / 2; if(stripe_stop == 0) { for(unsigned int i = 0; i < n_rotations; i++) { free(dm_stripes[i]); if(dm_stripes_total[i] != NULL) free(dm_stripes_total[i]); } } else { // if a stripe_stop is specified, and if we're in the stripe window, do not free // dm_stripes. this is done as the pointers in dm_stripes are assigned to the partial_mat_t // and subsequently freed in destroy_partial_mat. 
but, we do need to free dm_stripes_total // if appropriate for(unsigned int i = stripe_start; i < stripe_stop; i++) { if(dm_stripes_total[i] != NULL) free(dm_stripes_total[i]); } } } void initialize_mat(mat_t* &result, biom &table, bool is_upper_triangle) { result = (mat_t*)malloc(sizeof(mat)); result->n_samples = table.n_samples; result->cf_size = su::comb_2(table.n_samples); result->is_upper_triangle = is_upper_triangle; result->sample_ids = (char**)malloc(sizeof(char*) * result->n_samples); result->condensed_form = (double*)malloc(sizeof(double) * su::comb_2(table.n_samples)); for(unsigned int i = 0; i < result->n_samples; i++) { size_t len = table.sample_ids[i].length(); result->sample_ids[i] = (char*)malloc(sizeof(char) * len + 1); table.sample_ids[i].copy(result->sample_ids[i], len); result->sample_ids[i][len] = '\0'; } } void initialize_results_vec(r_vec* &result, biom& table){ // Stores results for Faith PD result = (r_vec*)malloc(sizeof(results_vec)); result->n_samples = table.n_samples; result->values = (double*)malloc(sizeof(double) * result->n_samples); result->sample_ids = (char**)malloc(sizeof(char*) * result->n_samples); for(unsigned int i = 0; i < result->n_samples; i++) { size_t len = table.sample_ids[i].length(); result->sample_ids[i] = (char*)malloc(sizeof(char) * len + 1); table.sample_ids[i].copy(result->sample_ids[i], len); result->sample_ids[i][len] = '\0'; result->values[i] = 0; } } void initialize_mat_no_biom(mat_t* &result, char** sample_ids, unsigned int n_samples, bool is_upper_triangle) { result = (mat_t*)malloc(sizeof(mat)); result->n_samples = n_samples; result->cf_size = su::comb_2(n_samples); result->is_upper_triangle = is_upper_triangle; result->sample_ids = (char**)malloc(sizeof(char*) * result->n_samples); result->condensed_form = (double*)malloc(sizeof(double) * su::comb_2(n_samples)); for(unsigned int i = 0; i < n_samples; i++) { result->sample_ids[i] = strdup(sample_ids[i]); } } void initialize_partial_mat(partial_mat_t* &result, biom &table, std::vector &dm_stripes, unsigned int stripe_start, unsigned int stripe_stop, bool is_upper_triangle) { result = (partial_mat_t*)malloc(sizeof(partial_mat)); result->n_samples = table.n_samples; result->sample_ids = (char**)malloc(sizeof(char*) * result->n_samples); for(unsigned int i = 0; i < result->n_samples; i++) { size_t len = table.sample_ids[i].length(); result->sample_ids[i] = (char*)malloc(sizeof(char) * len + 1); table.sample_ids[i].copy(result->sample_ids[i], len); result->sample_ids[i][len] = '\0'; } result->stripes = (double**)malloc(sizeof(double*) * (stripe_stop - stripe_start)); result->stripe_start = stripe_start; result->stripe_stop = stripe_stop; result->is_upper_triangle = is_upper_triangle; result->stripe_total = dm_stripes.size(); for(unsigned int i = stripe_start; i < stripe_stop; i++) { result->stripes[i - stripe_start] = dm_stripes[i]; } } void destroy_results_vec(r_vec** result) { // for Faith PD for(unsigned int i = 0; i < (*result)->n_samples; i++) { free((*result)->sample_ids[i]); }; free((*result)->sample_ids); free((*result)->values); free(*result); } void destroy_mat(mat_t** result) { for(unsigned int i = 0; i < (*result)->n_samples; i++) { free((*result)->sample_ids[i]); }; free((*result)->sample_ids); free((*result)->condensed_form); free(*result); } void destroy_partial_mat(partial_mat_t** result) { for(unsigned int i = 0; i < (*result)->n_samples; i++) { if((*result)->sample_ids[i] != NULL) free((*result)->sample_ids[i]); }; if((*result)->sample_ids != NULL) 
free((*result)->sample_ids); unsigned int n_stripes = (*result)->stripe_stop - (*result)->stripe_start; for(unsigned int i = 0; i < n_stripes; i++) if((*result)->stripes[i] != NULL) free((*result)->stripes[i]); if((*result)->stripes != NULL) free((*result)->stripes); free(*result); } void set_tasks(std::vector &tasks, double alpha, unsigned int n_samples, unsigned int stripe_start, unsigned int stripe_stop, bool bypass_tips, unsigned int nthreads) { // compute from start to the max possible stripe if stop doesn't make sense if(stripe_stop <= stripe_start) stripe_stop = (n_samples + 1) / 2; /* chunking strategy is to balance as much as possible. eg if there are 15 stripes * and 4 threads, our goal is to assign 4 stripes to 3 threads, and 3 stripes to one thread. * * we use the remaining the chunksize for bins which cannot be full maximally */ unsigned int fullchunk = ((stripe_stop - stripe_start) + nthreads - 1) / nthreads; // this computes the ceiling unsigned int smallchunk = (stripe_stop - stripe_start) / nthreads; unsigned int n_fullbins = (stripe_stop - stripe_start) % nthreads; if(n_fullbins == 0) n_fullbins = nthreads; unsigned int start = stripe_start; for(unsigned int tid = 0; tid < nthreads; tid++) { tasks[tid].tid = tid; tasks[tid].start = start; // stripe start tasks[tid].bypass_tips = bypass_tips; if(tid < n_fullbins) { tasks[tid].stop = start + fullchunk; // stripe end start = start + fullchunk; } else { tasks[tid].stop = start + smallchunk; // stripe end start = start + smallchunk; } tasks[tid].n_samples = n_samples; tasks[tid].g_unifrac_alpha = alpha; } } compute_status partial(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, unsigned int stripe_start, unsigned int stripe_stop, partial_mat_t** result) { CHECK_FILE(biom_filename, table_missing) CHECK_FILE(tree_filename, tree_missing) SET_METHOD(unifrac_method, unknown_method) PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) // we resize to the largest number of possible stripes even if only computing // partial, however we do not allocate arrays for non-computed stripes so // there is a little memory waste here but should be on the order of // 8 bytes * N samples per vector. std::vector dm_stripes((table.n_samples + 1) / 2); std::vector dm_stripes_total((table.n_samples + 1) / 2); if(nthreads > dm_stripes.size()) { fprintf(stderr, "More threads were requested than stripes. 
Using %d threads.\n"); nthreads = dm_stripes.size(); } std::vector tasks(nthreads); std::vector threads(nthreads); if(((table.n_samples + 1) / 2) < stripe_stop) { fprintf(stderr, "Stopping stripe is out-of-bounds, max %d\n", (table.n_samples + 1) / 2); exit(EXIT_FAILURE); } set_tasks(tasks, alpha, table.n_samples, stripe_start, stripe_stop, bypass_tips, nthreads); su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); initialize_partial_mat(*result, table, dm_stripes, stripe_start, stripe_stop, true); // true -> is_upper_triangle destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, stripe_start, stripe_stop); return okay; } compute_status faith_pd_one_off(const char* biom_filename, const char* tree_filename, r_vec** result){ CHECK_FILE(biom_filename, table_missing) CHECK_FILE(tree_filename, tree_missing) PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) initialize_results_vec(*result, table); // compute faithpd su::faith_pd(table, tree_sheared, std::ref((*result)->values)); return okay; } compute_status one_off(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int nthreads, mat_t** result) { CHECK_FILE(biom_filename, table_missing) CHECK_FILE(tree_filename, tree_missing) SET_METHOD(unifrac_method, unknown_method) PARSE_SYNC_TREE_TABLE(tree_filename, table_filename) // we resize to the largest number of possible stripes even if only computing // partial, however we do not allocate arrays for non-computed stripes so // there is a little memory waste here but should be on the order of // 8 bytes * N samples per vector. std::vector dm_stripes((table.n_samples + 1) / 2); std::vector dm_stripes_total((table.n_samples + 1) / 2); if(nthreads > dm_stripes.size()) { fprintf(stderr, "More threads were requested than stripes. 
Using %d threads.\n"); nthreads = dm_stripes.size(); } std::vector tasks(nthreads); std::vector threads(nthreads); set_tasks(tasks, alpha, table.n_samples, 0, 0, bypass_tips, nthreads); su::process_stripes(table, tree_sheared, method, variance_adjust, dm_stripes, dm_stripes_total, threads, tasks); initialize_mat(*result, table, true); // true -> is_upper_triangle for(unsigned int tid = 0; tid < threads.size(); tid++) { threads[tid] = std::thread(su::stripes_to_condensed_form, std::ref(dm_stripes), table.n_samples, std::ref((*result)->condensed_form), tasks[tid].start, tasks[tid].stop); } for(unsigned int tid = 0; tid < threads.size(); tid++) { threads[tid].join(); } destroy_stripes(dm_stripes, dm_stripes_total, table.n_samples, 0, 0); return okay; } IOStatus write_mat(const char* output_filename, mat_t* result) { std::ofstream output; output.open(output_filename); uint64_t comb_N = su::comb_2(result->n_samples); uint64_t comb_N_minus = 0; double v; for(unsigned int i = 0; i < result->n_samples; i++) output << "\t" << result->sample_ids[i]; output << std::endl; for(unsigned int i = 0; i < result->n_samples; i++) { output << result->sample_ids[i]; for(unsigned int j = 0; j < result->n_samples; j++) { if(i < j) { // upper triangle comb_N_minus = su::comb_2(result->n_samples - i); v = result->condensed_form[comb_N - comb_N_minus + (j - i - 1)]; } else if (i > j) { // lower triangle comb_N_minus = su::comb_2(result->n_samples - j); v = result->condensed_form[comb_N - comb_N_minus + (i - j - 1)]; } else { v = 0.0; } output << std::setprecision(16) << "\t" << v; } output << std::endl; } output.close(); return write_okay; } IOStatus write_vec(const char* output_filename, r_vec* result) { std::ofstream output; output.open(output_filename); // write sample ids in first column of file and faith's pd in second column output << "#SampleID\tfaith_pd" << std::endl; for(unsigned int i = 0; i < result->n_samples; i++) { output << result->sample_ids[i]; output << std::setprecision(16) << "\t" << result->values[i]; output << std::endl; } output.close(); return write_okay; } IOStatus write_partial(const char* output_filename, partial_mat_t* result) { std::ofstream output; output.open(output_filename, std::ios::binary); if(!output.is_open()) return open_error; uint32_t n_stripes = result->stripe_stop - result->stripe_start; std::string magic(PARTIAL_MAGIC); uint32_t magic_len = magic.length(); /* header information */ output.write(reinterpret_cast(&magic_len), sizeof(uint16_t)); output << magic; output.write(reinterpret_cast(&result->n_samples), sizeof(uint32_t)); output.write(reinterpret_cast(&n_stripes), sizeof(uint32_t)); output.write(reinterpret_cast(&result->stripe_start), sizeof(uint32_t)); output.write(reinterpret_cast(&result->stripe_total), sizeof(uint32_t)); output.write(reinterpret_cast(&result->is_upper_triangle), sizeof(uint8_t)); /* sample IDs */ for(unsigned int i = 0; i < result->n_samples; i++) { uint16_t length = strlen(result->sample_ids[i]); output.write(reinterpret_cast(&length), sizeof(uint16_t)); output << result->sample_ids[i]; } /* stripe information */ for(unsigned int i = 0; i < n_stripes; i++) { /// :( streamsize didn't seem to work. 
probably a fancy way to do this, but the regular loop is fast too //output.write(reinterpret_cast(&result->stripes[i]), std::streamsize(sizeof(double) * result->n_samples)); for(unsigned int j = 0; j < result->n_samples; j++) output.write(reinterpret_cast(&result->stripes[i][j]), sizeof(double)); } /* footer */ output << magic; output.close(); return write_okay; } IOStatus _is_partial_file(const char* input_filename) { std::ifstream input; input.open(input_filename, std::ios::in | std::ios::binary); if(!input.is_open()) return open_error; char magic[32]; uint16_t magic_len; input.read((char*)&magic_len, 2); // if the length of the magic is unexpected then bail if(magic_len <= 0 || magic_len > 32) { return magic_incompatible; } input.read(magic, magic_len); if(strncmp(magic, PARTIAL_MAGIC, magic_len) != 0) { return magic_incompatible; } input.close(); return read_okay; } IOStatus read_partial(const char* input_filename, partial_mat_t** result_out) { IOStatus err = _is_partial_file(input_filename); if(err != read_okay) return err; std::ifstream input; input.open(input_filename, std::ios::binary); /* load header */ uint16_t magic_len; input.read((char*)&magic_len, 2); // magic length char header_magic[32]; input.read(header_magic, magic_len); // magic header_magic[magic_len] = '\0'; uint32_t n_samples; input.read((char*)&n_samples, 4); // number of samples uint32_t n_stripes; input.read((char*)&n_stripes, 4); // number of stripes uint32_t stripe_start; input.read((char*)&stripe_start, 4); // stripe start uint32_t stripe_total; input.read((char*)&stripe_total, 4); // stripe total bool is_upper_triangle; input.read((char*)&is_upper_triangle, 1); // is_upper_triangle /* sanity check header */ if(n_samples <= 0 || n_stripes <= 0 || stripe_start < 0 || stripe_total <= 0 || is_upper_triangle < 0) return bad_header; if(stripe_total >= n_samples || n_stripes > stripe_total || stripe_start >= stripe_total || stripe_start + n_stripes > stripe_total) return bad_header; /* initialize the partial result structure */ partial_mat_t* result = (partial_mat_t*)malloc(sizeof(partial_mat)); result->n_samples = n_samples; result->sample_ids = (char**)malloc(sizeof(char*) * result->n_samples); result->stripes = (double**)malloc(sizeof(double*) * (n_stripes)); result->stripe_start = stripe_start; result->stripe_stop = stripe_start + n_stripes; result->is_upper_triangle = is_upper_triangle; result->stripe_total = stripe_total; /* load samples */ for(int i = 0; i < n_samples; i++) { uint16_t sample_length; input.read((char*)&sample_length, 2); result->sample_ids[i] = (char*)malloc(sizeof(char) * (sample_length + 1)); input.read(result->sample_ids[i], sample_length); result->sample_ids[i][sample_length] = '\0'; } /* load stripes */ int current_to_load; void *ptr; for(int i = 0; i < n_stripes; i++) { ptr = malloc(sizeof(double) * n_samples); if(ptr == NULL) { fprintf(stderr, "failed\n"); exit(1); } result->stripes[i] = (double*)ptr; input.read(reinterpret_cast(result->stripes[i]), sizeof(double) * n_samples); } /* sanity check the footer */ char footer_magic[32]; input.read(footer_magic, magic_len); footer_magic[magic_len] = '\0'; if(strcmp(header_magic, footer_magic) != 0) { return magic_incompatible; } (*result_out) = result; return read_okay; } MergeStatus merge_partial(partial_mat_t** partial_mats, int n_partials, unsigned int nthreads, mat_t** result) { if(n_partials <= 0) { fprintf(stderr, "Zero or less partials.\n"); exit(EXIT_FAILURE); } // sanity check int n_samples = partial_mats[0]->n_samples; bool 
*stripe_map = (bool*)calloc(sizeof(bool), partial_mats[0]->stripe_total); int stripe_count = 0; for(int i = 0; i < n_partials; i++) { if(partial_mats[i]->n_samples != n_samples) { free(stripe_map); return partials_mismatch; } if(partial_mats[0]->stripe_total != partial_mats[i]->stripe_total) { free(stripe_map); return partials_mismatch; } if(partial_mats[0]->is_upper_triangle != partial_mats[i]->is_upper_triangle) { free(stripe_map); return square_mismatch; } for(int j = 0; j < n_samples; j++) { if(strcmp(partial_mats[0]->sample_ids[j], partial_mats[i]->sample_ids[j]) != 0) { free(stripe_map); return sample_id_consistency; } } for(int j = partial_mats[i]->stripe_start; j < partial_mats[i]->stripe_stop; j++) { if(stripe_map[j]) { free(stripe_map); return stripes_overlap; } stripe_map[j] = true; stripe_count += 1; } } free(stripe_map); if(stripe_count != partial_mats[0]->stripe_total) { return incomplete_stripe_set; } std::vector stripes(partial_mats[0]->stripe_total); std::vector stripes_totals(partial_mats[0]->stripe_total); // not actually used but destroy_stripes needs this to "exist" for(int i = 0; i < n_partials; i++) { int n_stripes = partial_mats[i]->stripe_stop - partial_mats[i]->stripe_start; for(int j = 0; j < n_stripes; j++) { // as this is potentially a large amount of memory, don't copy, just adopt *&(stripes[j + partial_mats[i]->stripe_start]) = partial_mats[i]->stripes[j]; } } if(nthreads > stripes.size()) { fprintf(stderr, "More threads were requested than stripes. Using %d threads.\n"); nthreads = stripes.size(); } std::vector tasks(nthreads); std::vector threads(nthreads); initialize_mat_no_biom(*result, partial_mats[0]->sample_ids, n_samples, partial_mats[0]->is_upper_triangle); su::stripes_to_condensed_form(stripes, n_samples, (*result)->condensed_form, 0, partial_mats[0]->stripe_total); destroy_stripes(stripes, stripes_totals, n_samples, 0, n_partials); return merge_okay; } unifrac-0.10.0/sucpp/api.hpp000066400000000000000000000244031351072301000156260ustar00rootroot00000000000000#include "task_parameters.hpp" #ifdef __cplusplus #include #define EXTERN extern "C" #else #include #define EXTERN #endif #define PARTIAL_MAGIC "SSU-PARTIAL-01" typedef enum compute_status {okay=0, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap} ComputeStatus; typedef enum io_status {read_okay=0, write_okay, open_error, read_error, magic_incompatible, bad_header, unexpected_end} IOStatus; typedef enum merge_status {merge_okay=0, incomplete_stripe_set, sample_id_consistency, square_mismatch, partials_mismatch, stripes_overlap} MergeStatus; /* a result matrix * * n_samples the number of samples. * cf_size the size of the condensed form. * is_upper_triangle if true, indicates condensed_form represents a square * matrix, and only the upper triangle is contained. if false, * condensed_form represents the lower triangle of a matrix. * condensed_form the matrix values of length cf_size. * sample_ids the sample IDs of length n_samples. */ typedef struct mat { unsigned int n_samples; unsigned int cf_size; bool is_upper_triangle; double* condensed_form; char** sample_ids; } mat_t; /* a result vector * * n_samples the number of samples. * values the score values of length n_samples. * sample_ids the sample IDs of length n_samples. */ typedef struct results_vec{ unsigned int n_samples; double* values; char** sample_ids; } r_vec; /* a partial result containing stripe data * * n_samples the number of samples. * sample_ids the sample IDs of length n_samples. 
* stripes the stripe data of dimension (stripe_stop - stripe_start, n_samples) * stripe_start the logical starting stripe in the final matrix. * stripe_stop the logical stopping stripe in the final matrix. * stripe_total the total number of stripes present in the final matrix. * is_upper_triangle whether the stripes correspond to the upper triangle of the resulting matrix. * This is useful for asymmetric unifrac metrics. */ typedef struct partial_mat { uint32_t n_samples; char** sample_ids; double** stripes; uint32_t stripe_start; uint32_t stripe_stop; uint32_t stripe_total; bool is_upper_triangle; } partial_mat_t; void destroy_mat(mat_t** result); void destroy_partial_mat(partial_mat_t** result); void destroy_results_vec(r_vec** result); /* Compute UniFrac * * biom_filename the filename to the biom table. * tree_filename the filename to the correspodning tree. * unifrac_method the requested unifrac method. * variance_adjust whether to apply variance adjustment. * alpha GUniFrac alpha, only relevant if method == generalized. * bypass_tips disregard tips, reduces compute by about 50% * threads the number of threads to use. * result the resulting distance matrix in condensed form, this is initialized within the method so using ** * * one_off returns the following error codes: * * okay : no problems encountered * table_missing : the filename for the table does not exist * tree_missing : the filename for the tree does not exist * unknown_method : the requested method is unknown. * table_empty : the table does not have any entries */ EXTERN ComputeStatus one_off(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat_t** result); /* compute Faith PD * biom_filename the filename to the biom table. * tree_filename the filename to the correspodning tree. * result the resulting vector of computed Faith PD values * * faith_pd_one_off returns the following error codes: * * okay : no problems encountered * table_missing : the filename for the table does not exist * tree_missing : the filename for the tree does not exist * table_empty : the table does not have any entries */ EXTERN ComputeStatus faith_pd_one_off(const char* biom_filename, const char* tree_filename, r_vec** result); /* Write a matrix object * * filename the file to write into * result the results object * * The following error codes are returned: * * write_okay : no problems */ EXTERN IOStatus write_mat(const char* filename, mat_t* result); /* Write a series * * filename the file to write into * result the results object * * The following error codes are returned: * * write_okay : no problems */ EXTERN IOStatus write_vec(const char* filename, r_vec* result); /* Read a matrix object * * filename the file to write into * result the results object * * The following error codes are returned: * * read_okay : no problems * open_error : could not open the file * magic_incompatible : format magic not found or incompatible * unexpected_end : format end not found in expected location */ //EXTERN IOStatus read_mat(const char* filename, mat_t** result); /* Compute a subset of a UniFrac distance matrix * * biom_filename the filename to the biom table. * tree_filename the filename to the correspodning tree. * unifrac_method the requested unifrac method. * variance_adjust whether to apply variance adjustment. * alpha GUniFrac alpha, only relevant if method == generalized. 
* bypass_tips disregard tips, reduces compute by about 50% * threads the number of threads to use. * stripe_start the starting stripe to compute * stripe_stop the last stripe to compute * dm_stripes the unique branch length stripes. This is expected to be * uninitialized, and is an output parameter. * dm_stripes_total the total branch length stripes. This is expected to be * uninitialized, and is an output parameter. * result the resulting distance matrix in condensed form, this is initialized within the method so using ** * * partial returns the following error codes: * * okay : no problems encountered * table_missing : the filename for the table does not exist * tree_missing : the filename for the tree does not exist * unknown_method : the requested method is unknown. */ EXTERN ComputeStatus partial(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, unsigned int stripe_start, unsigned int stripe_stop, partial_mat_t** result); /* Write a partial matrix object * * filename the file to write into * result the partial results object * * The following error codes are returned: * * write_okay : no problems * open_error : could not open the file * * The structure of the binary output file is as follows. Newlines added for clarity, but are not stored. * The file has logical blocks, but are not explicitly denoted in the format. These logical blocks are * just used to improve readability here, and are denoted by ### marks. * * ### HEADER ### * : uint16_t, the length of the magic * : char, e.g., SSU-PARTIAL-01 * : uint32_t, the number of samples * : uint32_t, the number of stripes represented in this file * : uint32_t, the starting stripe number * : uint32_t, the total number of stripes in the full matrix * : uint8_t, zero is false, nonzero is true * * ### SAMPLE IDS ### * : uint16_t, the length of the next sample ID * : LEN bytes, char * ... : ... repeated * : uint16_t, the length of the next sample ID * : LEN bytes, char * * ### STRIPE VALUES; SS -> STRIPE_START, NS -> N_STRIPES * : double, the first value in the 0th stripe * ... : ... repeated for N_SAMPLES values * : double, the last value in the 0th stripe * : double, the first value in the Kth stripe * ... : ... 
repeated for N_SAMPLES values * : double, the last value in the Kth stripe * * ### FOOTER ### * : char, e.g., SSU-PARTIAL-01, same as starting magic */ EXTERN IOStatus write_partial(const char* filename, partial_mat_t* result); /* Read a partial matrix object * * filename the file to write into * result the partial results object, output parameter * * The following error codes are returned: * * read_okay : no problems * open_error : could not open the file * magic_incompatible : format magic not found or incompatible * bad_header : header seems malformed * unexpected_end : format end not found in expected location */ EXTERN IOStatus read_partial(const char* filename, partial_mat_t** result); /* Merge partial results * * results an array of partial_mat_t* * n_partials number of partial mats * merged the full matrix, output parameters, this is initialized in the method so using ** * * The following error codes are returned: * * merge_okay : no problems * incomplete_stripe_set : not all stripes needed to create a full matrix were foun * sample_id_consistency : samples described by stripes are inconsistent * square_mismatch : inconsistency on denotation of square matrix */ EXTERN MergeStatus merge_partial(partial_mat_t** partial_mats, int n_partials, unsigned int nthreads, mat_t** result); #ifdef __cplusplus // TODO: only needed for testing, should be encased in a macro void set_tasks(std::vector &tasks, double alpha, unsigned int n_samples, unsigned int stripe_start, unsigned int stripe_stop, bool bypass_tips, unsigned int nthreads); #endif unifrac-0.10.0/sucpp/benchtest.sh000066400000000000000000000011301351072301000166470ustar00rootroot00000000000000set -e set -x basedir=bench_tables_trees resdir=$basedir/results mkdir -p $resdir for f in $basedir/*.biom do bench=${basedir}/$(basename $f .biom) res=${resdir}/$(basename $f .biom) for method in {unweighted,weighted_normalized,weighted_unnormalized} do /usr/bin/time -l ./su ${bench}.tre ${bench}.biom $method > ${res}.${method}.su.dm 2> ${res}.${method}.su.stats /usr/bin/time -l ./sk ${bench}.tre ${bench}.biom $method > ${res}.${method}.sk.dm 2> ${res}.${method}.sk.stats python compare_dms.py ${res}.${method}.sk.dm ${res}.${method}.su.dm done done unifrac-0.10.0/sucpp/biom.cpp000066400000000000000000000225111351072301000157740ustar00rootroot00000000000000#include #include #include #include "biom.hpp" using namespace H5; using namespace su; /* datasets defined by the BIOM 2.x spec */ const std::string OBS_INDPTR = std::string("/observation/matrix/indptr"); const std::string OBS_INDICES = std::string("/observation/matrix/indices"); const std::string OBS_DATA = std::string("/observation/matrix/data"); const std::string OBS_IDS = std::string("/observation/ids"); const std::string SAMPLE_INDPTR = std::string("/sample/matrix/indptr"); const std::string SAMPLE_INDICES = std::string("/sample/matrix/indices"); const std::string SAMPLE_DATA = std::string("/sample/matrix/data"); const std::string SAMPLE_IDS = std::string("/sample/ids"); biom::biom(std::string filename) { file = H5File(filename.c_str(), H5F_ACC_RDONLY); /* establish the datasets */ obs_indices = file.openDataSet(OBS_INDICES.c_str()); obs_data = file.openDataSet(OBS_DATA.c_str()); sample_indices = file.openDataSet(SAMPLE_INDICES.c_str()); sample_data = file.openDataSet(SAMPLE_DATA.c_str()); /* cache IDs and indptr */ sample_ids = std::vector(); obs_ids = std::vector(); sample_indptr = std::vector(); obs_indptr = std::vector(); load_ids(OBS_IDS.c_str(), obs_ids); load_ids(SAMPLE_IDS.c_str(), 
sample_ids); load_indptr(OBS_INDPTR.c_str(), obs_indptr); load_indptr(SAMPLE_INDPTR.c_str(), sample_indptr); /* cache shape and nnz info */ n_samples = sample_ids.size(); n_obs = obs_ids.size(); set_nnz(); /* define a mapping between an ID and its corresponding offset */ obs_id_index = std::unordered_map(); sample_id_index = std::unordered_map(); create_id_index(obs_ids, obs_id_index); create_id_index(sample_ids, sample_id_index); /* load obs sparse data */ obs_indices_resident = (uint32_t**)malloc(sizeof(uint32_t**) * n_obs); if(obs_indices_resident == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(uint32_t**) * n_obs, __FILE__, __LINE__); exit(EXIT_FAILURE); } obs_data_resident = (double**)malloc(sizeof(double**) * n_obs); if(obs_data_resident == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(double**) * n_obs, __FILE__, __LINE__); exit(EXIT_FAILURE); } obs_counts_resident = (unsigned int*)malloc(sizeof(unsigned int) * n_obs); if(obs_counts_resident == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(unsigned int) * n_obs, __FILE__, __LINE__); exit(EXIT_FAILURE); } uint32_t *current_indices = NULL; double *current_data = NULL; for(unsigned int i = 0; i < obs_ids.size(); i++) { std::string id_ = obs_ids[i]; unsigned int n = get_obs_data_direct(id_, current_indices, current_data); obs_counts_resident[i] = n; obs_indices_resident[i] = current_indices; obs_data_resident[i] = current_data; } sample_counts = get_sample_counts(); } biom::~biom() { for(unsigned int i = 0; i < n_obs; i++) { free(obs_indices_resident[i]); free(obs_data_resident[i]); } free(obs_indices_resident); free(obs_data_resident); free(obs_counts_resident); } void biom::set_nnz() { // should these be cached? 
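// nnz is taken from the extent of the /observation/matrix/data dataset
// (OBS_DATA above); in the CSR layout stored by BIOM 2.x, the length of the
// data array equals the number of nonzero entries in the table.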
DataType dtype = obs_data.getDataType(); DataSpace dataspace = obs_data.getSpace(); hsize_t dims[1]; dataspace.getSimpleExtentDims(dims, NULL); nnz = dims[0]; } void biom::load_ids(const char *path, std::vector &ids) { DataSet ds_ids = file.openDataSet(path); DataType dtype = ds_ids.getDataType(); DataSpace dataspace = ds_ids.getSpace(); hsize_t dims[1]; dataspace.getSimpleExtentDims(dims, NULL); /* the IDs are a dataset of variable length strings */ char **dataout = (char**)malloc(sizeof(char*) * dims[0]); if(dataout == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(char*) * dims[0], __FILE__, __LINE__); exit(EXIT_FAILURE); } ds_ids.read((void*)dataout, dtype); ids.reserve(dims[0]); for(unsigned int i = 0; i < dims[0]; i++) { ids.push_back(dataout[i]); } for(unsigned int i = 0; i < dims[0]; i++) free(dataout[i]); free(dataout); } void biom::load_indptr(const char *path, std::vector &indptr) { DataSet ds = file.openDataSet(path); DataType dtype = ds.getDataType(); DataSpace dataspace = ds.getSpace(); hsize_t dims[1]; dataspace.getSimpleExtentDims(dims, NULL); uint32_t *dataout = (uint32_t*)malloc(sizeof(uint32_t) * dims[0]); if(dataout == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(uint32_t) * dims[0], __FILE__, __LINE__); exit(EXIT_FAILURE); } ds.read((void*)dataout, dtype); indptr.reserve(dims[0]); for(unsigned int i = 0; i < dims[0]; i++) indptr.push_back(dataout[i]); free(dataout); } void biom::create_id_index(std::vector &ids, std::unordered_map &map) { uint32_t count = 0; map.reserve(ids.size()); for(auto i = ids.begin(); i != ids.end(); i++, count++) { map[*i] = count; } } unsigned int biom::get_obs_data_direct(std::string id, uint32_t *& current_indices_out, double *& current_data_out) { uint32_t idx = obs_id_index.at(id); uint32_t start = obs_indptr[idx]; uint32_t end = obs_indptr[idx + 1]; hsize_t count[1] = {end - start}; hsize_t offset[1] = {start}; DataType indices_dtype = obs_indices.getDataType(); DataType data_dtype = obs_data.getDataType(); DataSpace indices_dataspace = obs_indices.getSpace(); DataSpace data_dataspace = obs_data.getSpace(); DataSpace indices_memspace(1, count, NULL); DataSpace data_memspace(1, count, NULL); indices_dataspace.selectHyperslab(H5S_SELECT_SET, count, offset); data_dataspace.selectHyperslab(H5S_SELECT_SET, count, offset); current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count[0]); if(current_indices_out == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(uint32_t) * count[0], __FILE__, __LINE__); exit(EXIT_FAILURE); } current_data_out = (double*)malloc(sizeof(double) * count[0]); if(current_data_out == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(double) * count[0], __FILE__, __LINE__); exit(EXIT_FAILURE); } obs_indices.read((void*)current_indices_out, indices_dtype, indices_memspace, indices_dataspace); obs_data.read((void*)current_data_out, data_dtype, data_memspace, data_dataspace); return count[0]; } void biom::get_obs_data(std::string id, double* out) { uint32_t idx = obs_id_index.at(id); unsigned int count = obs_counts_resident[idx]; uint32_t *indices = obs_indices_resident[idx]; double *data = obs_data_resident[idx]; // reset our output buffer for(unsigned int i = 0; i < n_samples; i++) out[i] = 0.0; for(unsigned int i = 0; i < count; i++) { out[indices[i]] = data[i]; } } unsigned int biom::get_sample_data_direct(std::string id, uint32_t *& current_indices_out, double *& current_data_out) { uint32_t idx = 
sample_id_index.at(id); uint32_t start = sample_indptr[idx]; uint32_t end = sample_indptr[idx + 1]; hsize_t count[1] = {end - start}; hsize_t offset[1] = {start}; DataType indices_dtype = sample_indices.getDataType(); DataType data_dtype = sample_data.getDataType(); DataSpace indices_dataspace = sample_indices.getSpace(); DataSpace data_dataspace = sample_data.getSpace(); DataSpace indices_memspace(1, count, NULL); DataSpace data_memspace(1, count, NULL); indices_dataspace.selectHyperslab(H5S_SELECT_SET, count, offset); data_dataspace.selectHyperslab(H5S_SELECT_SET, count, offset); current_indices_out = (uint32_t*)malloc(sizeof(uint32_t) * count[0]); if(current_indices_out == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(uint32_t) * count[0], __FILE__, __LINE__); exit(EXIT_FAILURE); } current_data_out = (double*)malloc(sizeof(double) * count[0]); if(current_data_out == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(double) * count[0], __FILE__, __LINE__); exit(EXIT_FAILURE); } sample_indices.read((void*)current_indices_out, indices_dtype, indices_memspace, indices_dataspace); sample_data.read((void*)current_data_out, data_dtype, data_memspace, data_dataspace); return count[0]; } double* biom::get_sample_counts() { double *sample_counts = (double*)calloc(sizeof(double), n_samples); for(unsigned int i = 0; i < n_obs; i++) { unsigned int count = obs_counts_resident[i]; uint32_t *indices = obs_indices_resident[i]; double *data = obs_data_resident[i]; for(unsigned int j = 0; j < count; j++) { uint32_t index = indices[j]; double datum = data[j]; sample_counts[index] += datum; } } return sample_counts; } unifrac-0.10.0/sucpp/biom.hpp000066400000000000000000000065221351072301000160050ustar00rootroot00000000000000#include #include #include #include namespace su { class biom { public: // cache the IDs contained within the table std::vector sample_ids; std::vector obs_ids; // cache both index pointers into both CSC and CSR representations std::vector sample_indptr; std::vector obs_indptr; uint32_t n_samples; // the number of samples uint32_t n_obs; // the number of observations uint32_t nnz; // the total number of nonzero entries double *sample_counts; /* default constructor * * @param filename The path to the BIOM table to read */ biom(std::string filename); /* default destructor * * Temporary arrays are freed */ ~biom(); /* get a dense vector of observation data * * @param id The observation ID to fetch * @param out An allocated array of at least size n_samples. * Values of an index position [0, n_samples) which do not * have data will be zero'd. 
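 *
 * A minimal usage sketch (illustrative only; "table.biom" is a placeholder
 * path, and the caller owns the output buffer):
 *
 *   su::biom table("table.biom");
 *   double *buf = (double*)malloc(sizeof(double) * table.n_samples);
 *   table.get_obs_data(table.obs_ids[0], buf);  // dense row for one observation
 *   free(buf);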
*/ void get_obs_data(std::string id, double* out); private: /* retain DataSet handles within the HDF5 file */ H5::DataSet obs_indices; H5::DataSet sample_indices; H5::DataSet obs_data; H5::DataSet sample_data; H5::H5File file; uint32_t **obs_indices_resident; double **obs_data_resident; unsigned int *obs_counts_resident; unsigned int get_obs_data_direct(std::string id, uint32_t *& current_indices_out, double *& current_data_out); unsigned int get_sample_data_direct(std::string id, uint32_t *& current_indices_out, double *& current_data_out); double* get_sample_counts(); /* At construction, lookups mapping IDs -> index position within an * axis are defined */ std::unordered_map obs_id_index; std::unordered_map sample_id_index; /* load ids from an axis * * @param path The dataset path to the ID dataset to load * @param ids The variable representing the IDs to load into */ void load_ids(const char *path, std::vector &ids); /* load the index pointer for an axis * * @param path The dataset path to the index pointer to load * @param indptr The vector to load the data into */ void load_indptr(const char *path, std::vector &indptr); /* count the number of nonzero values and set nnz */ void set_nnz(); /* create an index mapping an ID to its corresponding index * position. * * @param ids A vector of IDs to index * @param map A hash table to populate */ void create_id_index(std::vector &ids, std::unordered_map &map); }; } unifrac-0.10.0/sucpp/capi_test.c000066400000000000000000000036631351072301000164700ustar00rootroot00000000000000#include #include #include #include #include "api.hpp" #ifndef bool #define bool char #define true 1 #define false 0 #endif void err(bool condition, const char* msg) { if(condition) { fprintf(stderr, "%s\n", msg); exit(1); } } void test_su(int num_cores){ mat_t* result = NULL; const char* table = "test.biom"; const char* tree = "test.tre"; const char* method = "unweighted"; double exp[] = {0.2, 0.57142857, 0.6, 0.5, 0.2, 0.42857143, 0.66666667, 0.6, 0.33333333, 0.71428571, 0.85714286, 0.42857143, 0.33333333, 0.4, 0.6}; ComputeStatus status; status = one_off(table, tree, method, false, 1.0, false, num_cores, &result); err(status != okay, "Compute failed"); err(result == NULL, "Empty result"); err(result->n_samples != 6, "Wrong number of samples"); err(result->cf_size != 15, "Wrong condensed form size"); err(!result->is_upper_triangle, "Result is not squaure"); for(unsigned int i = 0; i < result->cf_size; i++) err(fabs(exp[i] - result->condensed_form[i]) > 0.00001, "Result is wrong"); } void test_faith_pd(){ r_vec* result = NULL; const char* table = "test.biom"; const char* tree = "test.tre"; double exp[] = {4, 5, 6, 3, 2, 5}; ComputeStatus status; status = faith_pd_one_off(table, tree, &result); err(status != okay, "Compute failed"); err(result == NULL, "Empty result"); err(result->n_samples != 6, "Wrong number of samples"); for(unsigned int i = 0; i < result->n_samples; i++) err(fabs(exp[i] - result->values[i]) > 0.00001, "Result is wrong"); } int main(int argc, char** argv) { int num_cores = strtol(argv[1], NULL, 10); printf("Testing Striped UniFrac...\n"); test_su(num_cores); printf("Tests passed.\n"); printf("Testing Faith's PD...\n"); test_faith_pd(); printf("Tests passed.\n"); return 0; } unifrac-0.10.0/sucpp/cmd.cpp000066400000000000000000000000231351072301000156030ustar00rootroot00000000000000#include "cmd.hpp" unifrac-0.10.0/sucpp/cmd.hpp000066400000000000000000000020401351072301000156110ustar00rootroot00000000000000#include #include #include #include class 
InputParser{ /* this object was shamelessly adapted from http://stackoverflow.com/a/868894 */ public: InputParser (int &argc, char **argv){ for (int i=1; i < argc; ++i) this->tokens.push_back(std::string(argv[i])); } /// @author iain const std::string& getCmdOption(const std::string &option) const{ std::vector::const_iterator itr; itr = std::find(this->tokens.begin(), this->tokens.end(), option); if (itr != this->tokens.end() && ++itr != this->tokens.end()){ return *itr; } return empty; } /// @author iain bool cmdOptionExists(const std::string &option) const{ return std::find(this->tokens.begin(), this->tokens.end(), option) != this->tokens.end(); } private: std::vector tokens; const std::string empty; }; unifrac-0.10.0/sucpp/faithpd.cpp000066400000000000000000000054331351072301000164710ustar00rootroot00000000000000#include #include #include #include #include "api.hpp" #include "cmd.hpp" #include "tree.hpp" #include "biom.hpp" #include "unifrac.hpp" void usage() { std::cout << "usage: faithpd -i -t -o " << std::endl; std::cout << std::endl; std::cout << " -i\t\tThe input BIOM table." << std::endl; std::cout << " -t\t\tThe input phylogeny in newick." << std::endl; std::cout << " -o\t\tThe output series." << std::endl; std::cout << std::endl; std::cout << "Citations: " << std::endl; std::cout << " For Faith's PD, please see:" << std::endl; std::cout << " Faith Biological Conservation 1992; DOI: 10.1016/0006-3207(92)91201-3" << std::endl; std::cout << std::endl; } const char* compute_status_messages[6] = {"No error.", "The tree file cannot be found.", "The table file cannot be found.", "The table file contains an empty table." "An unknown method was requested.", "Table observation IDs are not a subset of the tree tips. This error can also be triggered if a node name contains a single quote (this is unlikely)."}; void err(std::string msg) { std::cerr << "ERROR: " << msg << std::endl << std::endl; usage(); } int faith_cli_one_off(std::string table_filename, std::string tree_filename, std::string output_filename) { if(output_filename.empty()) { err("output filename missing"); return EXIT_FAILURE; } if(table_filename.empty()) { err("table filename missing"); return EXIT_FAILURE; } if(tree_filename.empty()) { err("tree filename missing"); return EXIT_FAILURE; } r_vec *result = NULL; compute_status status; status = faith_pd_one_off(table_filename.c_str(), tree_filename.c_str(), &result); if(status != okay || result == NULL) { fprintf(stderr, "Compute failed in faith_pd_one_off: %s\n", compute_status_messages[status]); exit(EXIT_FAILURE); } write_vec(output_filename.c_str(), result); destroy_results_vec(&result); return EXIT_SUCCESS; } int main(int argc, char **argv){ InputParser input(argc, argv); if(input.cmdOptionExists("-h") || input.cmdOptionExists("--help") || argc == 1) { usage(); return EXIT_SUCCESS; } const std::string &table_filename = input.getCmdOption("-i"); const std::string &tree_filename = input.getCmdOption("-t"); const std::string &output_filename = input.getCmdOption("-o"); faith_cli_one_off(table_filename, tree_filename, output_filename); return EXIT_SUCCESS; } unifrac-0.10.0/sucpp/su.cpp000066400000000000000000000340231351072301000154760ustar00rootroot00000000000000#include #include #include #include #include #include #include "api.hpp" #include "cmd.hpp" #include "tree.hpp" #include "biom.hpp" #include "unifrac.hpp" void usage() { std::cout << "usage: ssu -i -o -m [METHOD] -t [-n threads] [-a alpha] [--vaw]" << std::endl; std::cout << " [--mode [MODE]] [--start starting-stripe] 
[--stop stopping-stripe] [--partial-pattern ]" << std::endl; std::cout << " [--n-partials number_of_partitions] [--report-bare]" << std::endl; std::cout << std::endl; std::cout << " -i\t\tThe input BIOM table." << std::endl; std::cout << " -t\t\tThe input phylogeny in newick." << std::endl; std::cout << " -m\t\tThe method, [unweighted | weighted_normalized | weighted_unnormalized | generalized]." << std::endl; std::cout << " -o\t\tThe output distance matrix." << std::endl; std::cout << " -n\t\t[OPTIONAL] The number of threads, default is 1." << std::endl; std::cout << " -a\t\t[OPTIONAL] Generalized UniFrac alpha, default is 1." << std::endl; std::cout << " -f\t\t[OPTIONAL] Bypass tips, reduces compute by about 50%." << std::endl; std::cout << " --vaw\t[OPTIONAL] Variance adjusted, default is to not adjust for variance." << std::endl; std::cout << " --mode\t[OPTIONAL] Mode of operation:" << std::endl; std::cout << " \t\t one-off : [DEFAULT] compute UniFrac." << std::endl; std::cout << " \t\t partial : Compute UniFrac over a subset of stripes." << std::endl; std::cout << " \t\t partial-report : Start and stop suggestions for partial compute." << std::endl; std::cout << " \t\t merge-partial : Merge partial UniFrac results." << std::endl; std::cout << " --start\t[OPTIONAL] If mode==partial, the starting stripe." << std::endl; std::cout << " --stop\t[OPTIONAL] If mode==partial, the stopping stripe." << std::endl; std::cout << " --partial-pattern\t[OPTIONAL] If mode==merge-partial, a glob pattern for partial outputs to merge." << std::endl; std::cout << " --n-partials\t[OPTIONAL] If mode==partial-report, the number of partitions to compute." << std::endl; std::cout << " --report-bare\t[OPTIONAL] If mode==partial-report, produce barebones output." << std::endl; std::cout << std::endl; std::cout << "Citations: " << std::endl; std::cout << " For UniFrac, please see:" << std::endl; std::cout << " McDonald et al. Nature Methods 2018; DOI: 10.1038/s41592-018-0187-8" << std::endl; std::cout << " Lozupone and Knight Appl Environ Microbiol 2005; DOI: 10.1128/AEM.71.12.8228-8235.2005" << std::endl; std::cout << " Lozupone et al. Appl Environ Microbiol 2007; DOI: 10.1128/AEM.01996-06" << std::endl; std::cout << " Hamady et al. ISME 2010; DOI: 10.1038/ismej.2009.97" << std::endl; std::cout << " Lozupone et al. ISME 2011; DOI: 10.1038/ismej.2010.133" << std::endl; std::cout << " For Generalized UniFrac, please see: " << std::endl; std::cout << " Chen et al. Bioinformatics 2012; DOI: 10.1093/bioinformatics/bts342" << std::endl; std::cout << " For Variance Adjusted UniFrac, please see: " << std::endl; std::cout << " Chang et al. BMC Bioinformatics 2011; DOI: 10.1186/1471-2105-12-118" << std::endl; std::cout << std::endl; std::cout << "Runtime progress can be obtained by issuing a SIGUSR1 signal. If running with " << std::endl; std::cout << "multiple threads, this signal will only be honored if issued to the master PID. " << std::endl; std::cout << "The report will yield the following information: " << std::endl; std::cout << std::endl; std::cout << "tid: start: stop: k: total:" << std::endl; std::cout << std::endl; std::cout << "The proportion of the tree that has been evaluated can be determined from (k / total)." << std::endl; std::cout << std::endl; } const char* compute_status_messages[6] = {"No error.", "The tree file cannot be found.", "The table file cannot be found.", "The table file contains an empty table." "An unknown method was requested.", "Table observation IDs are not a subset of the tree tips. 
This error can also be triggered if a node name contains a single quote (this is unlikely)."}; // https://stackoverflow.com/questions/8401777/simple-glob-in-c-on-unix-system inline std::vector glob(const std::string& pat){ using namespace std; glob_t glob_result; glob(pat.c_str(),GLOB_TILDE,NULL,&glob_result); vector ret; for(unsigned int i=0;i partials = glob(partial_pattern); partial_mat_t** partial_mats = (partial_mat_t**)malloc(sizeof(partial_mat_t*) * partials.size()); for(size_t i = 0; i < partials.size(); i++) { IOStatus io_err = read_partial(partials[i].c_str(), &partial_mats[i]); if(io_err != read_okay) { std::ostringstream msg; msg << "Unable to parse file (" << partials[i] << "); err " << io_err; err(msg.str()); return EXIT_FAILURE; } } MergeStatus status = merge_partial(partial_mats, partials.size(), nthreads, &result); if(status != merge_okay) { std::ostringstream msg; msg << "Unable to complete merge; err " << status; err(msg.str()); return EXIT_FAILURE; } IOStatus io_err = write_mat(output_filename.c_str(), result); if(io_err != write_okay) { std::ostringstream msg; msg << "Unable to write; err " << io_err; err(msg.str()); return EXIT_FAILURE; } destroy_mat(&result); return EXIT_SUCCESS; } int mode_partial(std::string table_filename, std::string tree_filename, std::string output_filename, std::string method_string, bool vaw, double g_unifrac_alpha, bool bypass_tips, unsigned int nthreads, int start_stripe, int stop_stripe) { if(output_filename.empty()) { err("output filename missing"); return EXIT_FAILURE; } if(table_filename.empty()) { err("table filename missing"); return EXIT_FAILURE; } if(tree_filename.empty()) { err("tree filename missing"); return EXIT_FAILURE; } if(method_string.empty()) { err("method missing"); return EXIT_FAILURE; } if(start_stripe < 0) { err("Starting stripe must be >= 0"); return EXIT_FAILURE; } if(stop_stripe <= start_stripe) { err("In '--mode partial', the stop and start stripes must be specified, and the stop stripe must be > start stripe"); return EXIT_FAILURE; } partial_mat_t *result = NULL; compute_status status; status = partial(table_filename.c_str(), tree_filename.c_str(), method_string.c_str(), vaw, g_unifrac_alpha, bypass_tips, nthreads, start_stripe, stop_stripe, &result); if(status != okay || result == NULL) { fprintf(stderr, "Compute failed in partial: %s\n", compute_status_messages[status]); exit(EXIT_FAILURE); } io_status err = write_partial(output_filename.c_str(), result); destroy_partial_mat(&result); if(err != write_okay){ fprintf(stderr, "Write failed: %s\n", err == open_error ? 
"could not open output" : "unknown error"); return EXIT_FAILURE; } return EXIT_SUCCESS; } int mode_one_off(std::string table_filename, std::string tree_filename, std::string output_filename, std::string method_string, bool vaw, double g_unifrac_alpha, bool bypass_tips, unsigned int nthreads) { if(output_filename.empty()) { err("output filename missing"); return EXIT_FAILURE; } if(table_filename.empty()) { err("table filename missing"); return EXIT_FAILURE; } if(tree_filename.empty()) { err("tree filename missing"); return EXIT_FAILURE; } if(method_string.empty()) { err("method missing"); return EXIT_FAILURE; } mat_t *result = NULL; compute_status status; status = one_off(table_filename.c_str(), tree_filename.c_str(), method_string.c_str(), vaw, g_unifrac_alpha, bypass_tips, nthreads, &result); if(status != okay || result == NULL) { fprintf(stderr, "Compute failed in one_off: %s\n", compute_status_messages[status]); exit(EXIT_FAILURE); } write_mat(output_filename.c_str(), result); destroy_mat(&result); return EXIT_SUCCESS; } void ssu_sig_handler(int signo) { if (signo == SIGUSR1) { printf("Status cannot be reported.\n"); } } int main(int argc, char **argv){ signal(SIGUSR1, ssu_sig_handler); InputParser input(argc, argv); if(input.cmdOptionExists("-h") || input.cmdOptionExists("--help") || argc == 1) { usage(); return EXIT_SUCCESS; } unsigned int nthreads; const std::string &table_filename = input.getCmdOption("-i"); const std::string &tree_filename = input.getCmdOption("-t"); const std::string &output_filename = input.getCmdOption("-o"); const std::string &method_string = input.getCmdOption("-m"); const std::string &nthreads_arg = input.getCmdOption("-n"); const std::string &gunifrac_arg = input.getCmdOption("-a"); const std::string &mode_arg = input.getCmdOption("--mode"); const std::string &start_arg = input.getCmdOption("--start"); const std::string &stop_arg = input.getCmdOption("--stop"); const std::string &partial_pattern = input.getCmdOption("--partial-pattern"); const std::string &npartials = input.getCmdOption("--n-partials"); const std::string &report_bare = input.getCmdOption("--report-bare"); if(nthreads_arg.empty()) { nthreads = 1; } else { nthreads = atoi(nthreads_arg.c_str()); } bool vaw = input.cmdOptionExists("--vaw"); bool bare = input.cmdOptionExists("--report-bare"); bool bypass_tips = input.cmdOptionExists("-f"); double g_unifrac_alpha; if(gunifrac_arg.empty()) { g_unifrac_alpha = 1.0; } else { g_unifrac_alpha = atof(gunifrac_arg.c_str()); } int start_stripe; if(start_arg.empty()) start_stripe = 0; else start_stripe = atoi(start_arg.c_str()); int stop_stripe; if(stop_arg.empty()) stop_stripe = 0; else stop_stripe = atoi(stop_arg.c_str()); int n_partials; if(npartials.empty()) n_partials = 1; else n_partials = atoi(npartials.c_str()); if(mode_arg.empty() || mode_arg == "one-off") return mode_one_off(table_filename, tree_filename, output_filename, method_string, vaw, g_unifrac_alpha, bypass_tips, nthreads); else if(mode_arg == "partial") return mode_partial(table_filename, tree_filename, output_filename, method_string, vaw, g_unifrac_alpha, bypass_tips, nthreads, start_stripe, stop_stripe); else if(mode_arg == "merge-partial") return mode_merge_partial(output_filename, partial_pattern, nthreads); else if(mode_arg == "partial-report") return mode_partial_report(table_filename, n_partials, bare); else err("Unknown mode. 
Valid options are: one-off, partial, merge-partial"); return EXIT_SUCCESS; } unifrac-0.10.0/sucpp/su_R.cpp000066400000000000000000000024771351072301000157670ustar00rootroot00000000000000#include #include #include #include "api.hpp" using namespace std; using namespace Rcpp; // [[Rcpp::export]] Rcpp::List unifrac(const char* table, const char* tree, int nthreads){ mat_t* result = NULL; const char* method = "unweighted"; ComputeStatus status; status = one_off(table, tree, method, false, 1.0, false, nthreads, &result); vector cf; //push result->condensed_form into a vector becuase R doesn't like double* for(int i=0; icf_size; i++){ cf.push_back(result->condensed_form[i]); } return Rcpp::List::create(Rcpp::Named("n_samples") = result->n_samples, Rcpp::Named("is_upper_triangle") = result->is_upper_triangle, Rcpp::Named("cf_size") = result->cf_size, Rcpp::Named("c_form") = cf); } // [[Rcpp::export]] Rcpp::List faith_pd(const char* table, const char* tree){ r_vec* result = NULL; ComputeStatus status; status = faith_pd_one_off(table, tree, &result); vector values; for(int i = 0; i < result->n_samples; i++){ values.push_back(result->values[i]); } return Rcpp::List::create(Rcpp::Named("n_samples") = result->n_samples, Rcpp::Named("faith_pd") = values); } unifrac-0.10.0/sucpp/task_parameters.hpp000066400000000000000000000021121351072301000202330ustar00rootroot00000000000000#include #include #ifndef __su_task_parameters #ifdef __cplusplus namespace su { #endif /* task specific compute parameters * * n_samples the number of samples being processed * start the first stride to process * stop the last stride to process * tid the thread identifier * bypass_tips ignore tips on compute, reduces compute by ~50% * g_unifrac_alpha an alpha value for generalized unifrac */ struct task_parameters { uint32_t n_samples; // number of samples unsigned int start; // starting stripe unsigned int stop; // stopping stripe unsigned int tid; // thread ID bool bypass_tips; // avoid compute at tips // task specific arguments below double g_unifrac_alpha; // generalized unifrac alpha }; #ifdef __cplusplus } #endif #define __su_task_parameters #endif unifrac-0.10.0/sucpp/test.biom000066400000000000000000001020101351072301000161620ustar00rootroot00000000000000HDF  `  xTREE`HEAPX observationsample8 @id ` @type ` H format-url` P format-version@ H generated-by` Hcreation-date` H shape@ 0 nnz@TREEhHEAPX8matrixidsmetadatagroup-metadata GCOL No Table ID otu tablehttp://biom-format.orgexample2014-07-29T16:15:43.318377GG_OTU_4GG_OTU_5GG_OTU_3 GG_OTU_2 GG_OTU_1 Sample4 Sample5 Sample6Sample3Sample2Sample18SNODxQQSTREE&HEAPX dataindicesindptr8SNOD(NNQBK L@N ?@4 4 deflateSPTREE'8(SNOD80x9x^c``AB P>8htu4} Tx^cbF fb fb&(ĆɃ@|%x^c````b6 b^ b-x^```H`. 
1a|N4>B泡فg8Yx^c`0?ؓG @i(.} Tx^cd```bF(LH|al$ x^c````bV bn b~ `7x^cg``H` fGh|4>/  deflateH1SXTREE!_(  deflate:SXTREE( deflateCSPTREE"( L@NTREE&HEAPX`NPNQTREE&HEAPX QPQSTREEVHEAPX8Smatrixidsmetadatagroup-metadata `TVTREEaHEAPX Vdataindicesindptr8SNOD(HpHu8T`TV~~Ѐ ?@4 4 deflatePYSPTREE"(SNOD@Xbl  deflatecSXTREE (  deflatemSXTREE( deflateXvSPTREE$)~ЀTREEaHEAPXPpTREEaHEAPXPunifrac-0.10.0/sucpp/test.tre000066400000000000000000000001021351072301000160250ustar00rootroot00000000000000(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1); unifrac-0.10.0/sucpp/test_api.cpp000066400000000000000000000270631351072301000166650ustar00rootroot00000000000000#include #include "api.hpp" #include #include #include /* * test harness adapted from * https://github.com/noporpoise/BitArray/blob/master/dev/bit_array_test.c */ const char *suite_name; char suite_pass; int suites_run = 0, suites_failed = 0, suites_empty = 0; int tests_in_suite = 0, tests_run = 0, tests_failed = 0; #define QUOTE(str) #str #define ASSERT(x) {tests_run++; tests_in_suite++; if(!(x)) \ { fprintf(stderr, "failed assert [%s:%i] %s\n", __FILE__, __LINE__, QUOTE(x)); \ suite_pass = 0; tests_failed++; }} void SUITE_START(const char *name) { suite_pass = 1; suite_name = name; suites_run++; tests_in_suite = 0; } void SUITE_END() { printf("Testing %s ", suite_name); size_t suite_i; for(suite_i = strlen(suite_name); suite_i < 80-8-5; suite_i++) printf("."); printf("%s\n", suite_pass ? " pass" : " fail"); if(!suite_pass) suites_failed++; if(!tests_in_suite) suites_empty++; } /* * End adapted code */ //void test_write_mat() { // SUITE_START("test write mat_t"); // SUITE_END(); //} // //void test_read_mat() { // SUITE_START("test read mat_t"); // SUITE_END(); //} // partial_mat_t* make_test_pm() { partial_mat_t* pm = (partial_mat_t*)malloc(sizeof(partial_mat_t)); pm->n_samples = 6; pm->sample_ids = (char**)malloc(sizeof(char*) * 6); pm->sample_ids[0] = (char*)malloc(sizeof(char) * 2); pm->sample_ids[0][0] = 'A'; pm->sample_ids[0][1] = '\0'; pm->sample_ids[1] = (char*)malloc(sizeof(char) * 2); pm->sample_ids[1][0] = 'B'; pm->sample_ids[1][1] = '\0'; pm->sample_ids[2] = (char*)malloc(sizeof(char) * 3); pm->sample_ids[2][0] = 'C'; pm->sample_ids[2][1] = 'x'; pm->sample_ids[2][2] = '\0'; pm->sample_ids[3] = (char*)malloc(sizeof(char) * 2); pm->sample_ids[3][0] = 'D'; pm->sample_ids[3][1] = '\0'; pm->sample_ids[4] = (char*)malloc(sizeof(char) * 2); pm->sample_ids[4][0] = 'E'; pm->sample_ids[4][1] = '\0'; pm->sample_ids[5] = (char*)malloc(sizeof(char) * 2); pm->sample_ids[5][0] = 'F'; pm->sample_ids[5][1] = '\0'; pm->stripes = (double**)malloc(sizeof(double*) * 3); pm->stripes[0] = (double*)malloc(sizeof(double) * 6); pm->stripes[0][0] = 1; pm->stripes[0][1] = 2; pm->stripes[0][2] = 3; pm->stripes[0][3] = 4; pm->stripes[0][4] = 5; pm->stripes[0][5] = 6; pm->stripes[1] = (double*)malloc(sizeof(double) * 6); pm->stripes[1][0] = 7; pm->stripes[1][1] = 8; pm->stripes[1][2] = 9; pm->stripes[1][3] = 10; pm->stripes[1][4] = 11; pm->stripes[1][5] = 12; pm->stripes[2] = (double*)malloc(sizeof(double) * 6); pm->stripes[2][0] = 13; pm->stripes[2][1] = 14; pm->stripes[2][2] = 15; pm->stripes[2][3] = 16; pm->stripes[2][4] = 17; pm->stripes[2][5] = 18; return pm; } mat_t* mat_three_rep() { mat_t* res = (mat_t*)malloc(sizeof(mat_t)); res->n_samples = 6; res->cf_size = 15; res->is_upper_triangle = true; res->condensed_form = (double*)malloc(sizeof(double) * 15); // using second half of third stripe. 
the last stripe when operating on even numbers of samples is normally redundant with the first half, // but that was more annoying in to write up in the tests. res->condensed_form[0] = 1; res->condensed_form[1] = 7; res->condensed_form[2] = 16; res->condensed_form[3] = 11; res->condensed_form[4] = 6; res->condensed_form[5] = 2; res->condensed_form[6] = 8; res->condensed_form[7] = 17; res->condensed_form[8] = 12; res->condensed_form[9] = 3; res->condensed_form[10] = 9; res->condensed_form[11] = 18; res->condensed_form[12] = 4; res->condensed_form[13] = 10; res->condensed_form[14] = 5; res->sample_ids = (char**)malloc(sizeof(char*) * 6); res->sample_ids[0] = (char*)malloc(sizeof(char) * 2); res->sample_ids[0][0] = 'A'; res->sample_ids[0][1] = '\0'; res->sample_ids[1] = (char*)malloc(sizeof(char) * 2); res->sample_ids[1][0] = 'B'; res->sample_ids[1][1] = '\0'; res->sample_ids[2] = (char*)malloc(sizeof(char) * 3); res->sample_ids[2][0] = 'C'; res->sample_ids[2][1] = 'x'; res->sample_ids[2][2] = '\0'; res->sample_ids[3] = (char*)malloc(sizeof(char) * 2); res->sample_ids[3][0] = 'D'; res->sample_ids[3][1] = '\0'; res->sample_ids[4] = (char*)malloc(sizeof(char) * 2); res->sample_ids[4][0] = 'E'; res->sample_ids[4][1] = '\0'; res->sample_ids[5] = (char*)malloc(sizeof(char) * 2); res->sample_ids[5][0] = 'F'; res->sample_ids[5][1] = '\0'; return res; } void test_read_write_partial_mat() { SUITE_START("test read/write partial_mat_t"); partial_mat_t* pm = make_test_pm(); pm->stripe_start = 0; pm->stripe_stop = 3; pm->stripe_total = 3; pm->is_upper_triangle = true; io_status err = write_partial("/tmp/ssu_io.dat", pm); ASSERT(err == write_okay); partial_mat_t *obs = NULL; err = read_partial("/tmp/ssu_io.dat", &obs); ASSERT(err == read_okay); ASSERT(obs->n_samples == 6); ASSERT(obs->stripe_start == 0); ASSERT(obs->stripe_stop == 3); ASSERT(obs->stripe_total == 3); ASSERT(strcmp(obs->sample_ids[0], "A") == 0); ASSERT(strcmp(obs->sample_ids[1], "B") == 0); ASSERT(strcmp(obs->sample_ids[2], "Cx") == 0); ASSERT(strcmp(obs->sample_ids[3], "D") == 0); ASSERT(strcmp(obs->sample_ids[4], "E") == 0); ASSERT(strcmp(obs->sample_ids[5], "F") == 0); for(int i = 0; i < 3; i++) { for(int j = 0; j < 6; j++) { ASSERT(obs->stripes[i][j] == ((i * 6) + j + 1)); } } SUITE_END(); } void test_merge_partial_mat() { SUITE_START("test merge partial_mat_t"); // the easy test partial_mat_t* pm1 = (partial_mat_t*)malloc(sizeof(partial_mat_t)); pm1->n_samples = 6; pm1->sample_ids = (char**)malloc(sizeof(char*) * 6); pm1->sample_ids[0] = (char*)malloc(sizeof(char) * 2); pm1->sample_ids[0][0] = 'A'; pm1->sample_ids[0][1] = '\0'; pm1->sample_ids[1] = (char*)malloc(sizeof(char) * 2); pm1->sample_ids[1][0] = 'B'; pm1->sample_ids[1][1] = '\0'; pm1->sample_ids[2] = (char*)malloc(sizeof(char) * 3); pm1->sample_ids[2][0] = 'C'; pm1->sample_ids[2][1] = 'x'; pm1->sample_ids[2][2] = '\0'; pm1->sample_ids[3] = (char*)malloc(sizeof(char) * 2); pm1->sample_ids[3][0] = 'D'; pm1->sample_ids[3][1] = '\0'; pm1->sample_ids[4] = (char*)malloc(sizeof(char) * 2); pm1->sample_ids[4][0] = 'E'; pm1->sample_ids[4][1] = '\0'; pm1->sample_ids[5] = (char*)malloc(sizeof(char) * 2); pm1->sample_ids[5][0] = 'F'; pm1->sample_ids[5][1] = '\0'; pm1->stripes = (double**)malloc(sizeof(double*) * 2); pm1->stripes[0] = (double*)malloc(sizeof(double) * 6); pm1->stripes[0][0] = 1; pm1->stripes[0][1] = 2; pm1->stripes[0][2] = 3; pm1->stripes[0][3] = 4; pm1->stripes[0][4] = 5; pm1->stripes[0][5] = 6; pm1->stripes[1] = (double*)malloc(sizeof(double) * 6); pm1->stripes[1][0] = 7; 
pm1->stripes[1][1] = 8; pm1->stripes[1][2] = 9; pm1->stripes[1][3] = 10; pm1->stripes[1][4] = 11; pm1->stripes[1][5] = 12; pm1->stripe_start = 0; pm1->stripe_stop = 2; pm1->stripe_total = 3; pm1->is_upper_triangle = true; partial_mat_t* pm2 = (partial_mat_t*)malloc(sizeof(partial_mat_t)); pm2->n_samples = 6; pm2->sample_ids = (char**)malloc(sizeof(char*) * 6); pm2->sample_ids[0] = (char*)malloc(sizeof(char) * 2); pm2->sample_ids[0][0] = 'A'; pm2->sample_ids[0][1] = '\0'; pm2->sample_ids[1] = (char*)malloc(sizeof(char) * 2); pm2->sample_ids[1][0] = 'B'; pm2->sample_ids[1][1] = '\0'; pm2->sample_ids[2] = (char*)malloc(sizeof(char) * 3); pm2->sample_ids[2][0] = 'C'; pm2->sample_ids[2][1] = 'x'; pm2->sample_ids[2][2] = '\0'; pm2->sample_ids[3] = (char*)malloc(sizeof(char) * 2); pm2->sample_ids[3][0] = 'D'; pm2->sample_ids[3][1] = '\0'; pm2->sample_ids[4] = (char*)malloc(sizeof(char) * 2); pm2->sample_ids[4][0] = 'E'; pm2->sample_ids[4][1] = '\0'; pm2->sample_ids[5] = (char*)malloc(sizeof(char) * 2); pm2->sample_ids[5][0] = 'F'; pm2->sample_ids[5][1] = '\0'; pm2->stripes = (double**)malloc(sizeof(double*) * 1); pm2->stripes[0] = (double*)malloc(sizeof(double) * 6); pm2->stripes[0][0] = 13; pm2->stripes[0][1] = 14; pm2->stripes[0][2] = 15; pm2->stripes[0][3] = 16; pm2->stripes[0][4] = 17; pm2->stripes[0][5] = 18; pm2->stripe_start = 2; pm2->stripe_stop = 3; pm2->stripe_total = 3; pm2->is_upper_triangle = true; mat_t* exp = mat_three_rep(); partial_mat_t* pms[2]; pms[0] = pm1; pms[1] = pm2; mat_t* obs = NULL; merge_status err = merge_partial(pms, 2, 1, &obs); ASSERT(err == merge_okay); ASSERT(obs->cf_size == exp->cf_size); ASSERT(obs->n_samples == exp->n_samples); ASSERT(obs->is_upper_triangle == exp->is_upper_triangle); for(int i = 0; i < obs->cf_size; i++) { ASSERT(obs->condensed_form[i] == exp->condensed_form[i]); } for(int i = 0; i < obs->n_samples; i++) ASSERT(strcmp(obs->sample_ids[i], exp->sample_ids[i]) == 0); // out of order test pms[0] = pm2; pms[1] = pm1; obs = NULL; err = merge_partial(pms, 2, 1, &obs); ASSERT(err == merge_okay); ASSERT(obs->cf_size == exp->cf_size); ASSERT(obs->n_samples == exp->n_samples); ASSERT(obs->is_upper_triangle == exp->is_upper_triangle); for(int i = 0; i < obs->cf_size; i++) { ASSERT(obs->condensed_form[i] == exp->condensed_form[i]); } for(int i = 0; i < obs->n_samples; i++) ASSERT(strcmp(obs->sample_ids[i], exp->sample_ids[i]) == 0); // error checking pm1->stripe_start = 0; pm1->stripe_stop = 3; pm1->stripe_total = 9; pm1->is_upper_triangle = true; pm2->stripe_start = 3; pm2->stripe_stop = 5; pm2->stripe_total = 9; pm2->is_upper_triangle = true; partial_mat_t* pm3 = (partial_mat_t*)malloc(sizeof(partial_mat_t)); pm3 = make_test_pm(); pm3->stripe_start = 6; pm3->stripe_stop = 9; pm3->stripe_total = 9; pm3->is_upper_triangle = true; exp = mat_three_rep(); partial_mat_t* pms_err[3]; pms_err[2] = pm1; pms_err[0] = pm2; pms_err[1] = pm3; err = merge_partial(pms_err, 3, 1, &obs); ASSERT(err == incomplete_stripe_set); pm2->stripe_start = 2; pm2->stripe_stop = 6; err = merge_partial(pms_err, 3, 1, &obs); ASSERT(err == stripes_overlap); pm2->stripe_start = 3; pm2->sample_ids[2][0] = 'X'; err = merge_partial(pms_err, 3, 1, &obs); ASSERT(err == sample_id_consistency); pm2->sample_ids[2][0] = 'C'; pm3->n_samples = 2; err = merge_partial(pms_err, 3, 1, &obs); ASSERT(err == partials_mismatch); pm3->n_samples = 6; pm3->stripe_total = 12; err = merge_partial(pms_err, 3, 1, &obs); ASSERT(err == partials_mismatch); pm3->is_upper_triangle = false; pm3->stripe_total = 9; err 
= merge_partial(pms_err, 3, 1, &obs); ASSERT(err == square_mismatch); SUITE_END(); } int main(int argc, char** argv) { /* one_off and partial are executed as integration tests */ //test_write_mat(); //test_read_mat(); test_read_write_partial_mat(); test_merge_partial_mat(); printf("\n"); printf(" %i / %i suites failed\n", suites_failed, suites_run); printf(" %i / %i suites empty\n", suites_empty, suites_run); printf(" %i / %i tests failed\n", tests_failed, tests_run); printf("\n THE END.\n"); return tests_failed ? EXIT_FAILURE : EXIT_SUCCESS; } unifrac-0.10.0/sucpp/test_su.cpp000066400000000000000000001420541351072301000165410ustar00rootroot00000000000000#include #include "api.hpp" #include "tree.hpp" #include "biom.hpp" #include "unifrac.hpp" #include #include #include /* * test harness adapted from * https://github.com/noporpoise/BitArray/blob/master/dev/bit_array_test.c */ const char *suite_name; char suite_pass; int suites_run = 0, suites_failed = 0, suites_empty = 0; int tests_in_suite = 0, tests_run = 0, tests_failed = 0; #define QUOTE(str) #str #define ASSERT(x) {tests_run++; tests_in_suite++; if(!(x)) \ { fprintf(stderr, "failed assert [%s:%i] %s\n", __FILE__, __LINE__, QUOTE(x)); \ suite_pass = 0; tests_failed++; }} void SUITE_START(const char *name) { suite_pass = 1; suite_name = name; suites_run++; tests_in_suite = 0; } void SUITE_END() { printf("Testing %s ", suite_name); size_t suite_i; for(suite_i = strlen(suite_name); suite_i < 80-8-5; suite_i++) printf("."); printf("%s\n", suite_pass ? " pass" : " fail"); if(!suite_pass) suites_failed++; if(!tests_in_suite) suites_empty++; } /* * End adapted code */ std::vector _bool_array_to_vector(bool *arr, unsigned int n) { std::vector vec; for(unsigned int i = 0; i < n; i++) vec.push_back(arr[i]); return vec; } std::vector _uint32_array_to_vector(uint32_t *arr, unsigned int n) { std::vector vec; for(unsigned int i = 0; i < n; i++) vec.push_back(arr[i]); return vec; } std::vector _double_array_to_vector(double *arr, unsigned int n) { std::vector vec; for(unsigned int i = 0; i < n; i++) vec.push_back(arr[i]); return vec; } std::vector _string_array_to_vector(std::string *arr, unsigned int n) { std::vector vec; for(unsigned int i = 0; i < n; i++) vec.push_back(arr[i]); return vec; } bool vec_almost_equal(std::vector a, std::vector b) { if(a.size() != b.size()) { return false; } for(unsigned int i = 0; i < a.size(); i++) { if(!(fabs(a[i] - b[i]) < 0.000001)) { // sufficient given the tests return false; } } return true; } void test_bptree_constructor_simple() { SUITE_START("bptree constructor simple"); //01234567 //11101000 su::BPTree tree = su::BPTree("(('123:foo; bar':1,b:2)c);"); unsigned int exp_nparens = 8; std::vector exp_structure; exp_structure.push_back(true); exp_structure.push_back(true); exp_structure.push_back(true); exp_structure.push_back(false); exp_structure.push_back(true); exp_structure.push_back(false); exp_structure.push_back(false); exp_structure.push_back(false); std::vector exp_openclose; exp_openclose.push_back(7); exp_openclose.push_back(6); exp_openclose.push_back(3); exp_openclose.push_back(2); exp_openclose.push_back(5); exp_openclose.push_back(4); exp_openclose.push_back(1); exp_openclose.push_back(0); std::vector exp_names; exp_names.push_back(std::string()); exp_names.push_back(std::string("c")); exp_names.push_back(std::string("123:foo; bar")); exp_names.push_back(std::string()); exp_names.push_back(std::string("b")); exp_names.push_back(std::string()); exp_names.push_back(std::string()); 
exp_names.push_back(std::string()); std::vector exp_lengths; exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); exp_lengths.push_back(1.0); exp_lengths.push_back(0.0); exp_lengths.push_back(2.0); exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); ASSERT(tree.nparens == exp_nparens); ASSERT(tree.get_structure() == exp_structure); ASSERT(tree.get_openclose() == exp_openclose); ASSERT(tree.lengths == exp_lengths); ASSERT(tree.names == exp_names); SUITE_END(); } void test_bptree_constructor_from_existing() { SUITE_START("bptree constructor from_existing"); //01234567 //11101000 su::BPTree existing = su::BPTree("(('123:foo; bar':1,b:2)c);"); su::BPTree tree = su::BPTree(existing.get_structure(), existing.lengths, existing.names); unsigned int exp_nparens = 8; std::vector exp_structure; exp_structure.push_back(true); exp_structure.push_back(true); exp_structure.push_back(true); exp_structure.push_back(false); exp_structure.push_back(true); exp_structure.push_back(false); exp_structure.push_back(false); exp_structure.push_back(false); std::vector exp_openclose; exp_openclose.push_back(7); exp_openclose.push_back(6); exp_openclose.push_back(3); exp_openclose.push_back(2); exp_openclose.push_back(5); exp_openclose.push_back(4); exp_openclose.push_back(1); exp_openclose.push_back(0); std::vector exp_names; exp_names.push_back(std::string()); exp_names.push_back(std::string("c")); exp_names.push_back(std::string("123:foo; bar")); exp_names.push_back(std::string()); exp_names.push_back(std::string("b")); exp_names.push_back(std::string()); exp_names.push_back(std::string()); exp_names.push_back(std::string()); std::vector exp_lengths; exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); exp_lengths.push_back(1.0); exp_lengths.push_back(0.0); exp_lengths.push_back(2.0); exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); ASSERT(tree.nparens == exp_nparens); ASSERT(tree.get_structure() == exp_structure); ASSERT(tree.get_openclose() == exp_openclose); ASSERT(tree.lengths == exp_lengths); ASSERT(tree.names == exp_names); SUITE_END(); } void test_bptree_mask() { SUITE_START("bptree mask"); //01234567 //11101000 //111000 std::vector mask = {true, true, true, true, false, false, true, true}; su::BPTree base = su::BPTree("(('123:foo; bar':1,b:2)c);"); su::BPTree tree = base.mask(mask, base.lengths); unsigned int exp_nparens = 6; std::vector exp_structure; exp_structure.push_back(true); exp_structure.push_back(true); exp_structure.push_back(true); exp_structure.push_back(false); exp_structure.push_back(false); exp_structure.push_back(false); std::vector exp_openclose; exp_openclose.push_back(5); exp_openclose.push_back(4); exp_openclose.push_back(3); exp_openclose.push_back(2); exp_openclose.push_back(1); exp_openclose.push_back(0); std::vector exp_names; exp_names.push_back(std::string()); exp_names.push_back(std::string("c")); exp_names.push_back(std::string("123:foo; bar")); exp_names.push_back(std::string()); exp_names.push_back(std::string()); exp_names.push_back(std::string()); std::vector exp_lengths; exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); exp_lengths.push_back(1.0); exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); exp_lengths.push_back(0.0); ASSERT(tree.nparens == exp_nparens); ASSERT(tree.get_structure() == exp_structure); ASSERT(tree.get_openclose() == exp_openclose); ASSERT(tree.lengths == exp_lengths); ASSERT(tree.names == exp_names); SUITE_END(); } void test_bptree_constructor_single_descendent() { 
SUITE_START("bptree constructor single descendent"); su::BPTree tree = su::BPTree("(((a)b)c,((d)e)f,g)r;"); unsigned int exp_nparens = 16; bool structure_arr[] = {1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0}; std::vector exp_structure = _bool_array_to_vector(structure_arr, exp_nparens); double length_arr[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; std::vector exp_lengths = _double_array_to_vector(length_arr, exp_nparens); std::string names_arr[] = {"r", "c", "b", "a", "", "", "", "f", "e", "d", "", "", "", "g", "", ""}; std::vector exp_names = _string_array_to_vector(names_arr, exp_nparens); ASSERT(tree.nparens == exp_nparens); ASSERT(tree.get_structure() == exp_structure); ASSERT(vec_almost_equal(tree.lengths, exp_lengths)); ASSERT(tree.names == exp_names); SUITE_END(); } void test_bptree_constructor_complex() { SUITE_START("bp tree constructor complex"); su::BPTree tree = su::BPTree("(((a:1,b:2.5)c:6,d:8,(e),(f,g,(h:1,i:2)j:1)k:1.2)l,m:2)r;"); unsigned int exp_nparens = 30; bool structure_arr[] = {1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0}; std::vector exp_structure = _bool_array_to_vector(structure_arr, exp_nparens); double length_arr[] = {0, 0, 6, 1, 0, 2.5, 0, 0, 8, 0, 0, 0, 0, 0, 1.2, 0, 0, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 2, 0, 0}; std::vector exp_lengths = _double_array_to_vector(length_arr, exp_nparens); std::string names_arr[] = {"r", "l", "c", "a", "", "b", "", "", "d", "", "", "e", "", "", "k", "f", "", "g", "", "j", "h", "", "i", "", "", "", "", "m", "", ""}; std::vector exp_names = _string_array_to_vector(names_arr, exp_nparens); ASSERT(tree.nparens == exp_nparens); ASSERT(tree.get_structure() == exp_structure); ASSERT(vec_almost_equal(tree.lengths, exp_lengths)); ASSERT(tree.names == exp_names); SUITE_END(); } void test_bptree_constructor_semicolon() { SUITE_START("bp tree constructor semicolon"); su::BPTree tree = su::BPTree("((a,(b,c):5)'d','e; foo':10,((f))g)r;"); unsigned int exp_nparens = 20; bool structure_arr[] = {1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0}; std::vector exp_structure = _bool_array_to_vector(structure_arr, exp_nparens); double length_arr[] = {0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0}; std::vector exp_lengths = _double_array_to_vector(length_arr, exp_nparens); std::string names_arr[] = {"r", "d", "a", "", "", "b", "", "c", "", "", "", "e; foo", "", "g", "", "f", "", "", "", ""}; std::vector exp_names = _string_array_to_vector(names_arr, exp_nparens); ASSERT(tree.nparens == exp_nparens); ASSERT(tree.get_structure() == exp_structure); ASSERT(vec_almost_equal(tree.lengths, exp_lengths)); ASSERT(tree.names == exp_names); SUITE_END(); } void test_bptree_constructor_edgecases() { SUITE_START("bp tree constructor edgecases"); su::BPTree tree1 = su::BPTree("((a,b));"); bool structure_arr1[] = {1, 1, 1, 0, 1, 0, 0, 0}; std::vector exp_structure1 = _bool_array_to_vector(structure_arr1, 8); su::BPTree tree2 = su::BPTree("(a);"); bool structure_arr2[] = {1, 1, 0, 0}; std::vector exp_structure2 = _bool_array_to_vector(structure_arr2, 4); su::BPTree tree3 = su::BPTree("();"); bool structure_arr3[] = {1, 1, 0, 0}; std::vector exp_structure3 = _bool_array_to_vector(structure_arr3, 4); su::BPTree tree4 = su::BPTree("((a,b),c);"); bool structure_arr4[] = {1, 1, 1, 0, 1, 0, 0, 1, 0, 0}; std::vector exp_structure4 = _bool_array_to_vector(structure_arr4, 10); su::BPTree tree5 = su::BPTree("(a,(b,c));"); bool structure_arr5[] = {1, 1, 0, 1, 1, 0, 1, 0, 
0, 0}; std::vector exp_structure5 = _bool_array_to_vector(structure_arr5, 10); ASSERT(tree1.get_structure() == exp_structure1); ASSERT(tree2.get_structure() == exp_structure2); ASSERT(tree3.get_structure() == exp_structure3); ASSERT(tree4.get_structure() == exp_structure4); ASSERT(tree5.get_structure() == exp_structure5); SUITE_END(); } void test_bptree_constructor_quoted_comma() { SUITE_START("quoted comma bug"); su::BPTree tree = su::BPTree("((3,'foo,bar')x,c)r;"); std::vector exp_names = {"r", "x", "3", "", "foo,bar", "", "", "c", "", ""}; ASSERT(exp_names.size() == tree.names.size()); for(unsigned int i = 0; i < tree.names.size(); i++) { ASSERT(exp_names[i] == tree.names[i]); } SUITE_END(); } void test_bptree_constructor_quoted_parens() { SUITE_START("quoted parens"); su::BPTree tree = su::BPTree("((3,'foo(b)ar')x,c)r;"); std::vector exp_names = {"r", "x", "3", "", "foo(b)ar", "", "", "c", "", ""}; ASSERT(exp_names.size() == tree.names.size()); for(unsigned int i = 0; i < tree.names.size(); i++) { ASSERT(exp_names[i] == tree.names[i]); } SUITE_END(); } void test_bptree_postorder() { SUITE_START("postorderselect"); // fig1 from https://www.dcc.uchile.cl/~gnavarro/ps/tcs16.2.pdf su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); uint32_t exp[] = {2, 4, 7, 6, 1, 11, 15, 17, 14, 13, 0}; uint32_t obs[tree.nparens / 2]; for(int i = 0; i < (tree.nparens / 2); i++) obs[i] = tree.postorderselect(i); std::vector exp_v = _uint32_array_to_vector(exp, tree.nparens / 2); std::vector obs_v = _uint32_array_to_vector(obs, tree.nparens / 2); ASSERT(obs_v == exp_v); SUITE_END(); } void test_bptree_preorder() { SUITE_START("preorderselect"); // fig1 from https://www.dcc.uchile.cl/~gnavarro/ps/tcs16.2.pdf su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); uint32_t exp[] = {0, 1, 2, 4, 6, 7, 11, 13, 14, 15, 17}; uint32_t obs[tree.nparens / 2]; for(int i = 0; i < (tree.nparens / 2); i++) obs[i] = tree.preorderselect(i); std::vector exp_v = _uint32_array_to_vector(exp, tree.nparens / 2); std::vector obs_v = _uint32_array_to_vector(obs, tree.nparens / 2); ASSERT(obs_v == exp_v); SUITE_END(); } void test_bptree_parent() { SUITE_START("parent"); // fig1 from https://www.dcc.uchile.cl/~gnavarro/ps/tcs16.2.pdf su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); uint32_t exp[] = {0, 1, 1, 1, 1, 1, 6, 6, 1, 0, 0, 0, 0, 13, 14, 14, 14, 14, 13, 0}; // all the -2 and +1 garbage is to avoid testing the root. 
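    // i.e., parent() is queried for paren positions 1..nparens-2, skipping the
    // root's opening and closing parentheses; each expected value is the index
    // of the parent's opening parenthesis.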
uint32_t obs[tree.nparens - 2]; for(int i = 0; i < (tree.nparens) - 2; i++) obs[i] = tree.parent(i+1); std::vector exp_v = _uint32_array_to_vector(exp, tree.nparens - 2); std::vector obs_v = _uint32_array_to_vector(obs, tree.nparens - 2); ASSERT(obs_v == exp_v); SUITE_END(); } void test_biom_constructor() { SUITE_START("biom constructor"); su::biom table = su::biom("test.biom"); uint32_t exp_n_samples = 6; uint32_t exp_n_obs = 5; std::string sids[] = {"Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6"}; std::vector exp_sids = _string_array_to_vector(sids, exp_n_samples); std::string oids[] = {"GG_OTU_1", "GG_OTU_2","GG_OTU_3", "GG_OTU_4", "GG_OTU_5"}; std::vector exp_oids = _string_array_to_vector(oids, exp_n_obs); uint32_t s_indptr[] = {0, 2, 5, 9, 11, 12, 15}; std::vector exp_s_indptr = _uint32_array_to_vector(s_indptr, exp_n_samples + 1); uint32_t o_indptr[] = {0, 1, 6, 9, 13, 15}; std::vector exp_o_indptr = _uint32_array_to_vector(o_indptr, exp_n_obs + 1); uint32_t exp_nnz = 15; ASSERT(table.n_samples == exp_n_samples); ASSERT(table.n_obs == exp_n_obs); ASSERT(table.nnz == exp_nnz); ASSERT(table.sample_ids == exp_sids); ASSERT(table.obs_ids == exp_oids); ASSERT(table.sample_indptr == exp_s_indptr); ASSERT(table.obs_indptr == exp_o_indptr); SUITE_END(); } void test_biom_get_obs_data() { SUITE_START("biom get obs data"); su::biom table = su::biom("test.biom"); double exp0[] = {0.0, 0.0, 1.0, 0.0, 0.0, 0.0}; std::vector exp0_vec = _double_array_to_vector(exp0, 6); double exp1[] = {5.0, 1.0, 0.0, 2.0, 3.0, 1.0}; std::vector exp1_vec = _double_array_to_vector(exp1, 6); double exp2[] = {0.0, 0.0, 1.0, 4.0, 0.0, 2.0}; std::vector exp2_vec = _double_array_to_vector(exp2, 6); double exp3[] = {2.0, 1.0, 1.0, 0.0, 0.0, 1.0}; std::vector exp3_vec = _double_array_to_vector(exp3, 6); double exp4[] = {0.0, 1.0, 1.0, 0.0, 0.0, 0.0}; std::vector exp4_vec = _double_array_to_vector(exp4, 6); double *out = (double*)malloc(sizeof(double) * 6); std::vector obs_vec; table.get_obs_data(std::string("GG_OTU_1").c_str(), out); obs_vec = _double_array_to_vector(out, 6); ASSERT(vec_almost_equal(obs_vec, exp0_vec)); table.get_obs_data(std::string("GG_OTU_2").c_str(), out); obs_vec = _double_array_to_vector(out, 6); ASSERT(vec_almost_equal(obs_vec, exp1_vec)); table.get_obs_data(std::string("GG_OTU_3").c_str(), out); obs_vec = _double_array_to_vector(out, 6); ASSERT(vec_almost_equal(obs_vec, exp2_vec)); table.get_obs_data(std::string("GG_OTU_4").c_str(), out); obs_vec = _double_array_to_vector(out, 6); ASSERT(vec_almost_equal(obs_vec, exp3_vec)); table.get_obs_data(std::string("GG_OTU_5").c_str(), out); obs_vec = _double_array_to_vector(out, 6); ASSERT(vec_almost_equal(obs_vec, exp4_vec)); free(out); SUITE_END(); } void test_bptree_leftchild() { SUITE_START("test bptree left child"); su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); uint32_t exp[] = {1, 2, 0, 0, 7, 0, 0, 14, 15, 0, 0}; std::vector structure = tree.get_structure(); uint32_t exp_pos = 0; for(int i = 0; i < tree.nparens; i++) { if(structure[i]) ASSERT(tree.leftchild(i) == exp[exp_pos++]); } SUITE_END(); } void test_bptree_rightchild() { SUITE_START("test bptree right child"); su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); uint32_t exp[] = {13, 6, 0, 0, 7, 0, 0, 14, 17, 0, 0}; std::vector structure = tree.get_structure(); uint32_t exp_pos = 0; for(int i = 0; i < tree.nparens; i++) { if(structure[i]) ASSERT(tree.rightchild(i) == exp[exp_pos++]); } SUITE_END(); } void test_bptree_rightsibling() { 
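    // As exercised here, rightsibling() returns the opening-paren index of the
    // next sibling, with 0 when no right sibling exists (e.g., the root and the
    // last child of a node); the tree is the same fig. 1 example used above.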
SUITE_START("test bptree rightsibling"); su::BPTree tree = su::BPTree("((3,4,(6)5)2,7,((10,100)9)8)1;"); uint32_t exp[] = {0, 11, 4, 6, 0, 0, 13, 0, 0, 17, 0}; std::vector structure = tree.get_structure(); uint32_t exp_pos = 0; for(int i = 0; i < tree.nparens; i++) { if(structure[i]) ASSERT(tree.rightsibling(i) == exp[exp_pos++]); } SUITE_END(); } void test_propstack_constructor() { SUITE_START("test propstack constructor"); su::PropStack ps = su::PropStack(10); // nothing to test directly... SUITE_END(); } void test_propstack_push_and_pop() { SUITE_START("test propstack push and pop"); su::PropStack ps = su::PropStack(10); double *vec1 = ps.pop(1); double *vec2 = ps.pop(2); double *vec3 = ps.pop(3); double *vec1_obs; double *vec2_obs; double *vec3_obs; ps.push(1); ps.push(2); ps.push(3); vec3_obs = ps.pop(4); vec2_obs = ps.pop(5); vec1_obs = ps.pop(6); ASSERT(vec1 == vec1_obs); ASSERT(vec2 == vec2_obs); ASSERT(vec3 == vec3_obs); SUITE_END(); } void test_propstack_get() { SUITE_START("test propstack get"); su::PropStack ps = su::PropStack(10); double *vec1 = ps.pop(1); double *vec2 = ps.pop(2); double *vec3 = ps.pop(3); double *vec1_obs = ps.get(1); double *vec2_obs = ps.get(2); double *vec3_obs = ps.get(3); ASSERT(vec1 == vec1_obs); ASSERT(vec2 == vec2_obs); ASSERT(vec3 == vec3_obs); SUITE_END(); } void test_unifrac_set_proportions() { SUITE_START("test unifrac set proportions"); // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 // ( ( ) ( ( ) ( ) ) ( ( ) ( ) ) ) su::BPTree tree = su::BPTree("(GG_OTU_1,(GG_OTU_2,GG_OTU_3),(GG_OTU_5,GG_OTU_4));"); su::biom table = su::biom("test.biom"); su::PropStack ps = su::PropStack(table.n_samples); double sample_counts[] = {7, 3, 4, 6, 3, 4}; double *obs = ps.pop(4); // GG_OTU_2 double exp4[] = {0.714285714286, 0.333333333333, 0.0, 0.333333333333, 1.0, 0.25}; set_proportions(obs, tree, 4, table, ps); for(unsigned int i = 0; i < table.n_samples; i++) ASSERT(fabs(obs[i] - exp4[i]) < 0.000001); obs = ps.pop(6); // GG_OTU_3 double exp6[] = {0.0, 0.0, 0.25, 0.666666666667, 0.0, 0.5}; set_proportions(obs, tree, 6, table, ps); for(unsigned int i = 0; i < table.n_samples; i++) ASSERT(fabs(obs[i] - exp6[i]) < 0.000001); obs = ps.pop(3); // node containing GG_OTU_2 and GG_OTU_3 double exp3[] = {0.71428571, 0.33333333, 0.25, 1.0, 1.0, 0.75}; set_proportions(obs, tree, 3, table, ps); for(unsigned int i = 0; i < table.n_samples; i++) ASSERT(fabs(obs[i] - exp3[i]) < 0.000001); SUITE_END(); } void test_unifrac_deconvolute_stripes() { SUITE_START("test deconvolute stripes"); std::vector stripes; double s1[] = {1, 1, 1, 1, 1, 1}; double s2[] = {2, 2, 2, 2, 2, 2}; double s3[] = {3, 3, 3, 3, 3, 3}; stripes.push_back(s1); stripes.push_back(s2); stripes.push_back(s3); double exp[6][6] = { {0, 1, 2, 3, 2, 1}, {1, 0, 1, 2, 3, 2}, {2, 1, 0, 1, 2, 3}, {3, 2, 1, 0, 1, 2}, {2, 3, 2, 1, 0, 1}, {1, 2, 3, 2, 1, 0} }; double **obs = su::deconvolute_stripes(stripes, 6); for(unsigned int i = 0; i < 6; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(exp[i][j] == obs[i][j]); } } free(obs); SUITE_END(); } void test_unifrac_stripes_to_condensed_form_even() { SUITE_START("test stripes_to_condensed_form even samples"); std::vector stripes; double s1[] = {0, 5, 9, 12, 14, 4}; double s2[] = {1, 6, 10, 13, 3, 8}; double s3[] = {2, 7, 11, 2, 7, 11}; // {0, 0, 1, 2, 3, 4}, // {x, 0, 5, 6, 7, 8}, // {x, x, 0, 9, 10, 11}, // {x, x, x, 0, 12, 13}, // {x, x, x, x, 0, 14}, // {x, x, x, x, x, 0} stripes.push_back(s1); stripes.push_back(s2); stripes.push_back(s3); double exp[15] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 12, 13, 14}; double *obs = (double*)malloc(sizeof(double) * 15); su::stripes_to_condensed_form(stripes, 6, obs, 0, 3); for(unsigned int i = 0; i < 15; i++) { ASSERT(exp[i] == obs[i]); } free(obs); SUITE_END(); } void test_unifrac_stripes_to_condensed_form_odd() { SUITE_START("test stripes_to_condensed_form odd samples"); std::vector stripes; double s1[] = {1, 2, 3, 4, 5, 6, 0}; double s2[] = {12, 11, 10, 9, 8, 7, 1}; double s3[] = {13, 14, 15, 16, 17, 18, 2}; // {0, 1, 12, 13, 17, 7, 0}, // {x, 0, 2, 11, 14, 18, 1}, // {x, x, 0, 3, 10, 15, 2}, // {x, x, x, 0, 4, 9, 16}, // {x, x, x, x, 0, 5, 8}, // {x, x, x, x, x, 0, 6} // {x, x, x, x, x, x, 0} stripes.push_back(s1); stripes.push_back(s2); stripes.push_back(s3); double exp[21] = {1, 12, 13, 17, 7, 0, 2, 11, 14, 18, 1, 3, 10, 15, 2, 4, 9, 16, 5, 8, 6}; double *obs = (double*)malloc(sizeof(double) * 21); su::stripes_to_condensed_form(stripes, 7, obs, 0, 3); for(unsigned int i = 0; i < 21; i++) { ASSERT(exp[i] == obs[i]); } free(obs); SUITE_END(); } void test_unnormalized_weighted_unifrac() { SUITE_START("test unnormalized weighted unifrac"); std::vector threads(1); su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);"); su::biom table = su::biom("test.biom"); std::vector exp; double stride1[] = {1.52380952, 1.25, 2.75, 1.33333333, 2., 1.07142857}; double stride2[] = {2.17857143, 2.66666667, 3.25, 1.0, 1.14285714, 1.83333333}; double stride3[] = {1.9047619, 2.66666667, 1.75, 1.9047619, 2.66666667, 1.75}; exp.push_back(stride1); exp.push_back(stride2); exp.push_back(stride3); std::vector strides = su::make_strides(6); std::vector strides_total = su::make_strides(6); su::task_parameters task_p; task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6; task_p.bypass_tips = false; std::vector tasks; tasks.push_back(task_p); su::process_stripes(std::ref(table), std::ref(tree), su::weighted_unnormalized, false, std::ref(strides), std::ref(strides_total), std::ref(threads), std::ref(tasks)); for(unsigned int i = 0; i < 3; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001); } free(strides[i]); } SUITE_END(); } void test_generalized_unifrac() { SUITE_START("test generalized unifrac"); std::vector threads(1); su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);"); su::biom table = su::biom("test.biom"); // weighted normalized unifrac as computed above std::vector w_exp; double w_stride1[] = {0.38095238, 0.33333333, 0.73333333, 0.33333333, 0.5, 0.26785714}; double w_stride2[] = {0.58095238, 0.66666667, 0.86666667, 0.25, 0.28571429, 0.45833333}; double w_stride3[] = {0.47619048, 0.66666667, 0.46666667, 0.47619048, 0.66666667, 0.46666667}; w_exp.push_back(w_stride1); w_exp.push_back(w_stride2); w_exp.push_back(w_stride3); std::vector w_strides = su::make_strides(6); std::vector w_strides_total = su::make_strides(6); su::task_parameters w_task_p; w_task_p.start = 0; w_task_p.stop = 3; w_task_p.tid = 0; w_task_p.n_samples = 6; w_task_p.bypass_tips = false; w_task_p.g_unifrac_alpha = 1.0; std::vector tasks; tasks.push_back(w_task_p); su::process_stripes(std::ref(table), std::ref(tree), su::generalized, false, std::ref(w_strides), std::ref(w_strides_total), std::ref(threads), std::ref(tasks)); // as computed by GUniFrac v1.0 // Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 //Sample1 0.0000000 0.4408392 0.6886965 0.7060606 0.5833333 0.3278410 //Sample2 0.4408392 0.0000000 0.5102041 0.7500000 0.8000000 
0.5208125 //Sample3 0.6886965 0.5102041 0.0000000 0.8649351 0.9428571 0.5952381 //Sample4 0.7060606 0.7500000 0.8649351 0.0000000 0.5000000 0.4857143 //Sample5 0.5833333 0.8000000 0.9428571 0.5000000 0.0000000 0.7485714 //Sample6 0.3278410 0.5208125 0.5952381 0.4857143 0.7485714 0.0000000 std::vector d0_exp; double d0_stride1[] = {0.4408392, 0.5102041, 0.8649351, 0.5000000, 0.7485714, 0.3278410}; double d0_stride2[] = {0.6886965, 0.7500000, 0.9428571, 0.4857143, 0.5833333, 0.5208125}; double d0_stride3[] = {0.7060606, 0.8000000, 0.5952381, 0.7060606, 0.8000000, 0.5952381}; d0_exp.push_back(d0_stride1); d0_exp.push_back(d0_stride2); d0_exp.push_back(d0_stride3); std::vector d0_strides = su::make_strides(6); std::vector d0_strides_total = su::make_strides(6); su::task_parameters d0_task_p; d0_task_p.start = 0; d0_task_p.stop = 3; d0_task_p.tid = 0; d0_task_p.n_samples = 6; d0_task_p.bypass_tips = false; d0_task_p.g_unifrac_alpha = 0.0; tasks.clear(); tasks.push_back(d0_task_p); su::process_stripes(std::ref(table), std::ref(tree), su::generalized, false, std::ref(d0_strides), std::ref(d0_strides_total), std::ref(threads), std::ref(tasks)); // as computed by GUniFrac v1.0 // Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 //Sample1 0.0000000 0.4040518 0.6285560 0.5869439 0.4082483 0.2995673 //Sample2 0.4040518 0.0000000 0.4160597 0.7071068 0.7302479 0.4860856 //Sample3 0.6285560 0.4160597 0.0000000 0.8005220 0.9073159 0.5218198 //Sample4 0.5869439 0.7071068 0.8005220 0.0000000 0.4117216 0.3485667 //Sample5 0.4082483 0.7302479 0.9073159 0.4117216 0.0000000 0.6188282 //Sample6 0.2995673 0.4860856 0.5218198 0.3485667 0.6188282 0.0000000 std::vector d05_exp; double d05_stride1[] = {0.4040518, 0.4160597, 0.8005220, 0.4117216, 0.6188282, 0.2995673}; double d05_stride2[] = {0.6285560, 0.7071068, 0.9073159, 0.3485667, 0.4082483, 0.4860856}; double d05_stride3[] = {0.5869439, 0.7302479, 0.5218198, 0.5869439, 0.7302479, 0.5218198}; d05_exp.push_back(d05_stride1); d05_exp.push_back(d05_stride2); d05_exp.push_back(d05_stride3); std::vector d05_strides = su::make_strides(6); std::vector d05_strides_total = su::make_strides(6); su::task_parameters d05_task_p; d05_task_p.start = 0; d05_task_p.stop = 3; d05_task_p.tid = 0; d05_task_p.n_samples = 6; d05_task_p.bypass_tips = false; d05_task_p.g_unifrac_alpha = 0.5; tasks.clear(); tasks.push_back(d05_task_p); su::process_stripes(std::ref(table), std::ref(tree), su::generalized, false, std::ref(d05_strides), std::ref(d05_strides_total), std::ref(threads), std::ref(tasks)); for(unsigned int i = 0; i < 3; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(fabs(w_strides[i][j] - w_exp[i][j]) < 0.000001); ASSERT(fabs(d0_strides[i][j] - d0_exp[i][j]) < 0.000001); ASSERT(fabs(d05_strides[i][j] - d05_exp[i][j]) < 0.000001); } free(w_strides[i]); free(d0_strides[i]); free(d05_strides[i]); } SUITE_END(); } void test_vaw_unifrac_weighted_normalized() { SUITE_START("test vaw weighted normalized unifrac"); std::vector threads(1); su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);"); su::biom table = su::biom("test.biom"); // as computed by GUniFrac, the original implementation of VAW-UniFrac // could not be found. 
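    // For reference, the variance adjustment applied by the _vaw_* tasks (see
    // unifrac_task.cpp below) divides each per-branch term by
    // sqrt(mi * (m - mi)), where m is the combined total count of the two
    // samples being compared and mi is their combined count under the current
    // node. As an illustrative example with sample totals 7 and 3 and, say,
    // node counts of 5 and 1, m = 10 and mi = 6, giving a weight of
    // 1 / sqrt(6 * 4) ~= 0.204.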
// Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 //Sample1 0.0000000 0.4086040 0.6240185 0.4639481 0.2857143 0.2766318 //Sample2 0.4086040 0.0000000 0.3798594 0.6884992 0.6807616 0.4735781 //Sample3 0.6240185 0.3798594 0.0000000 0.7713254 0.8812897 0.5047114 //Sample4 0.4639481 0.6884992 0.7713254 0.0000000 0.6666667 0.2709298 //Sample5 0.2857143 0.6807616 0.8812897 0.6666667 0.0000000 0.4735991 //Sample6 0.2766318 0.4735781 0.5047114 0.2709298 0.4735991 0.0000000 // weighted normalized unifrac as computed above std::vector w_exp; double w_stride1[] = {0.4086040, 0.3798594, 0.7713254, 0.6666667, 0.4735991, 0.2766318}; double w_stride2[] = {0.6240185, 0.6884992, 0.8812897, 0.2709298, 0.2857143, 0.4735781}; double w_stride3[] = {0.4639481, 0.6807616, 0.5047114, 0.4639481, 0.6807616, 0.5047114}; w_exp.push_back(w_stride1); w_exp.push_back(w_stride2); w_exp.push_back(w_stride3); std::vector w_strides = su::make_strides(6); std::vector w_strides_total = su::make_strides(6); su::task_parameters w_task_p; w_task_p.start = 0; w_task_p.stop = 3; w_task_p.tid = 0; w_task_p.n_samples = 6; w_task_p.bypass_tips = false; w_task_p.g_unifrac_alpha = 1.0; std::vector tasks; tasks.push_back(w_task_p); su::process_stripes(std::ref(table), std::ref(tree), su::weighted_normalized, true, std::ref(w_strides), std::ref(w_strides_total), std::ref(threads), std::ref(tasks)); for(unsigned int i = 0; i < 3; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(fabs(w_strides[i][j] - w_exp[i][j]) < 0.000001); } free(w_strides[i]); } SUITE_END(); } void test_make_strides() { SUITE_START("test make stripes"); std::vector exp; double stride[] = {0., 0., 0.}; exp.push_back(stride); exp.push_back(stride); exp.push_back(stride); std::vector obs = su::make_strides(3); for(unsigned int i = 0; i < 3; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(fabs(obs[i][j] - exp[i][j]) < 0.000001); } free(obs[i]); } } void test_faith_pd() { SUITE_START("test faith PD"); // Note this tree is binary (opposed to example below) su::BPTree tree = su::BPTree("((GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1):2,(GG_OTU_5:1,GG_OTU_4:1):1);"); su::biom table = su::biom("test.biom"); // make vector of expectations from faith PD double exp[6] = {6., 7., 8., 5., 4., 7.}; // run faith PD to get obs double obs[6] = {0, 0, 0, 0, 0, 0}; su::faith_pd(table, tree, obs); // ASSERT that results = expectation for (unsigned int i = 0; i < 6; i++){ ASSERT(fabs(exp[i]-obs[i]) < 0.000001) } SUITE_END(); } void test_faith_pd_shear(){ SUITE_START("test faith PD extra OTUs in tree"); su::BPTree tree = su::BPTree("((GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1,GG_OTU_ex:9):1):2,(GG_OTU_5:1,GG_OTU_4:1,GG_OTU_ex2:12):1);"); su::biom table = su::biom("test.biom"); // make vector of expectations from faith PD double exp[6] = {6., 7., 8., 5., 4., 7.}; // run faith PD to get obs double obs[6] = {0, 0, 0, 0, 0, 0}; std::unordered_set to_keep(table.obs_ids.begin(), \ table.obs_ids.end()); \ su::BPTree tree_sheared = tree.shear(to_keep).collapse(); su::faith_pd(table, tree_sheared, obs); // ASSERT that results = expectation for (unsigned int i = 0; i < 6; i++){ ASSERT(fabs(exp[i]-obs[i]) < 0.000001) } SUITE_END(); } void test_unweighted_unifrac() { SUITE_START("test unweighted unifrac"); double **obs; std::vector threads(1); su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);"); su::biom table = su::biom("test.biom"); std::vector exp; double stride1[] = {0.2, 0.42857143, 0.71428571, 0.33333333, 0.6, 0.2}; double stride2[] = {0.57142857, 0.66666667, 
0.85714286, 0.4, 0.5, 0.33333333}; double stride3[] = {0.6, 0.6, 0.42857143, 0.6, 0.6, 0.42857143}; exp.push_back(stride1); exp.push_back(stride2); exp.push_back(stride3); std::vector strides = su::make_strides(6); std::vector strides_total = su::make_strides(6); su::task_parameters task_p; task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6; task_p.bypass_tips = false; std::vector tasks; tasks.push_back(task_p); su::process_stripes(std::ref(table), std::ref(tree), su::unweighted, false, std::ref(strides), std::ref(strides_total), std::ref(threads), std::ref(tasks)); for(unsigned int i = 0; i < 3; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001); } free(strides[i]); } SUITE_END(); } void test_unweighted_unifrac_fast() { SUITE_START("test unweighted unifrac no tips"); double **obs; std::vector threads(1); su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);"); su::biom table = su::biom("test.biom"); std::vector exp; double stride1[] = {0., 0., 0.5, 0., 0.5, 0.}; double stride2[] = {0., 0.5, 0.5, 0.5, 0.5, 0.}; double stride3[] = {0.5, 0.5, 0., 0.5, 0.5, 0.}; exp.push_back(stride1); exp.push_back(stride2); exp.push_back(stride3); std::vector strides = su::make_strides(6); std::vector strides_total = su::make_strides(6); su::task_parameters task_p; task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6; task_p.bypass_tips = true; std::vector tasks; tasks.push_back(task_p); su::process_stripes(std::ref(table), std::ref(tree), su::unweighted, false, std::ref(strides), std::ref(strides_total), std::ref(threads), std::ref(tasks)); for(unsigned int i = 0; i < 3; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001); } free(strides[i]); } SUITE_END(); } void test_normalized_weighted_unifrac() { SUITE_START("test normalized weighted unifrac"); double **obs; std::vector threads(1); su::BPTree tree = su::BPTree("(GG_OTU_1:1,(GG_OTU_2:1,GG_OTU_3:1):1,(GG_OTU_5:1,GG_OTU_4:1):1);"); su::biom table = su::biom("test.biom"); std::vector exp; double stride1[] = {0.38095238, 0.33333333, 0.73333333, 0.33333333, 0.5, 0.26785714}; double stride2[] = {0.58095238, 0.66666667, 0.86666667, 0.25, 0.28571429, 0.45833333}; double stride3[] = {0.47619048, 0.66666667, 0.46666667, 0.47619048, 0.66666667, 0.46666667}; exp.push_back(stride1); exp.push_back(stride2); exp.push_back(stride3); std::vector strides = su::make_strides(6); std::vector strides_total = su::make_strides(6); su::task_parameters task_p; task_p.start = 0; task_p.stop = 3; task_p.tid = 0; task_p.n_samples = 6; task_p.bypass_tips = false; std::vector tasks; tasks.push_back(task_p); su::process_stripes(std::ref(table), std::ref(tree), su::weighted_normalized, false, std::ref(strides), std::ref(strides_total), std::ref(threads), std::ref(tasks)); for(unsigned int i = 0; i < 3; i++) { for(unsigned int j = 0; j < 6; j++) { ASSERT(fabs(strides[i][j] - exp[i][j]) < 0.000001); } free(strides[i]); } SUITE_END(); } void test_bptree_shear_simple() { SUITE_START("test bptree shear simple"); su::BPTree tree = su::BPTree("((3:2,4:3,(6:5)5:4)2:1,7:6,((10:9,11:10)9:8)8:7)r"); // simple std::unordered_set to_keep = {"4", "6", "7", "10", "11"}; uint32_t exp_nparens = 20; std::vector exp_structure = {true, true, true, false, true, true, false, false, false, true, false, true, true, true, false, true, false, false, false, false}; std::vector exp_names = {"r", "2", "4", "", "5", "6", "", "", "", "7", "", "8", 
"9", "10", "", "11", "", "", "", ""}; std::vector exp_lengths = {0, 1, 3, 0, 4, 5, 0, 0, 0, 6, 0, 7, 8, 9, 0, 10, 0, 0, 0, 0}; su::BPTree obs = tree.shear(to_keep); ASSERT(obs.get_structure() == exp_structure); ASSERT(exp_nparens == obs.nparens); ASSERT(vec_almost_equal(exp_lengths, obs.lengths)); ASSERT(obs.names == exp_names); SUITE_END(); } void test_bptree_shear_deep() { SUITE_START("test bptree shear deep"); su::BPTree tree = su::BPTree("((3:2,4:3,(6:5)5:4)2:1,7:6,((10:9,11:10)9:8)8:7)r"); // deep std::unordered_set to_keep = {"10", "11"}; uint32_t exp_nparens = 10; std::vector exp_structure = {true, true, true, true, false, true, false, false, false, false}; std::vector exp_names = {"r", "8", "9", "10", "", "11", "", "", "", ""}; std::vector exp_lengths = {0, 7, 8, 9, 0, 10, 0, 0, 0, 0}; su::BPTree obs = tree.shear(to_keep); ASSERT(exp_nparens == obs.nparens); ASSERT(obs.get_structure() == exp_structure); ASSERT(vec_almost_equal(exp_lengths, obs.lengths)); ASSERT(obs.names == exp_names); SUITE_END(); } void test_test_table_ids_are_subset_of_tree() { SUITE_START("test test_table_ids_are_subset_of_tree"); su::BPTree tree = su::BPTree("(a:1,b:2)r;"); su::biom table = su::biom("test.biom"); std::string expected = "GG_OTU_1"; std::string observed = su::test_table_ids_are_subset_of_tree(table, tree); ASSERT(observed == expected); su::BPTree tree2 = su::BPTree("(GG_OTU_1,GG_OTU_5,GG_OTU_6,GG_OTU_2,GG_OTU_3,GG_OTU_4);"); su::biom table2 = su::biom("test.biom"); expected = ""; observed = su::test_table_ids_are_subset_of_tree(table2, tree2); ASSERT(observed == expected); SUITE_END(); } void test_bptree_get_tip_names() { SUITE_START("test bptree get_tip_names"); su::BPTree tree = su::BPTree("((a:2,b:3,(c:5)d:4)e:1,f:6,((g:9,h:10)i:8)j:7)r"); std::unordered_set expected = {"a", "b", "c", "f", "g", "h"}; std::unordered_set observed = tree.get_tip_names(); ASSERT(observed == expected); SUITE_END(); } void test_bptree_collapse_simple() { SUITE_START("test bptree collapse simple"); su::BPTree tree = su::BPTree("((3:2,4:3,(6:5)5:4)2:1,7:6,((10:9,11:10)9:8)8:7)r"); uint32_t exp_nparens = 18; std::vector exp_structure = {true, true, true, false, true, false, true, false, false, true, false, true, true, false, true, false, false, false}; std::vector exp_names = {"r", "2", "3", "", "4", "", "6", "", "", "7", "", "9", "10", "", "11", "", "", ""}; std::vector exp_lengths = {0, 1, 2, 0, 3, 0, 9, 0, 0, 6, 0, 15, 9, 0, 10, 0, 0, 0}; su::BPTree obs = tree.collapse(); ASSERT(obs.get_structure() == exp_structure); ASSERT(exp_nparens == obs.nparens); ASSERT(vec_almost_equal(exp_lengths, obs.lengths)); ASSERT(obs.names == exp_names); SUITE_END(); } void test_bptree_collapse_edge() { SUITE_START("test bptree collapse edge case against root"); su::BPTree tree = su::BPTree("((a),b)r;"); su::BPTree exp = su::BPTree("(a,b)r;"); su::BPTree obs = tree.collapse(); ASSERT(obs.get_structure() == exp.get_structure()); ASSERT(obs.names == exp.names); ASSERT(vec_almost_equal(obs.lengths, exp.lengths)); SUITE_END(); } void test_unifrac_sample_counts() { SUITE_START("test unifrac sample counts"); su::biom table = su::biom("test.biom"); double* obs = table.sample_counts; double exp[] = {7, 3, 4, 6, 3, 4}; for(unsigned int i = 0; i < 6; i++) ASSERT(obs[i] == exp[i]); SUITE_END(); } void test_set_tasks() { SUITE_START("test set tasks"); std::vector obs(1); std::vector exp(1); exp[0].g_unifrac_alpha = 1.0; exp[0].n_samples = 100; exp[0].bypass_tips = false; exp[0].start = 0; exp[0].stop = 100; exp[0].tid = 0; set_tasks(obs, 1.0, 
100, 0, 100, false, 1); ASSERT(obs[0].g_unifrac_alpha == exp[0].g_unifrac_alpha); ASSERT(obs[0].n_samples == exp[0].n_samples); ASSERT(obs[0].start == exp[0].start); ASSERT(obs[0].stop == exp[0].stop); ASSERT(obs[0].tid == exp[0].tid); std::vector obs2(2); std::vector exp2(2); exp2[0].g_unifrac_alpha = 1.0; exp2[0].n_samples = 100; exp2[0].bypass_tips = false; exp2[0].start = 0; exp2[0].stop = 50; exp2[0].tid = 0; exp2[1].g_unifrac_alpha = 1.0; exp2[1].n_samples = 100; exp2[1].bypass_tips = false; exp2[1].start = 50; exp2[1].stop = 100; exp2[1].tid = 1; set_tasks(obs2, 1.0, 100, 0, 100, false, 2); for(unsigned int i=0; i < 2; i++) { ASSERT(obs2[i].g_unifrac_alpha == exp2[i].g_unifrac_alpha); ASSERT(obs2[i].n_samples == exp2[i].n_samples); ASSERT(obs2[i].start == exp2[i].start); ASSERT(obs2[i].stop == exp2[i].stop); ASSERT(obs2[i].tid == exp2[i].tid); } std::vector obs3(3); std::vector exp3(3); exp3[0].g_unifrac_alpha = 1.0; exp3[0].n_samples = 100; exp3[0].bypass_tips = false; exp3[0].start = 25; exp3[0].stop = 50; exp3[0].tid = 0; exp3[1].g_unifrac_alpha = 1.0; exp3[1].n_samples = 100; exp3[1].bypass_tips = false; exp3[1].start = 50; exp3[1].stop = 75; exp3[1].tid = 1; exp3[2].g_unifrac_alpha = 1.0; exp3[2].n_samples = 100; exp3[2].bypass_tips = false; exp3[2].start = 75; exp3[2].stop = 100; exp3[2].tid = 2; set_tasks(obs3, 1.0, 100, 25, 100, false, 3); for(unsigned int i=0; i < 3; i++) { ASSERT(obs3[i].g_unifrac_alpha == exp3[i].g_unifrac_alpha); ASSERT(obs3[i].n_samples == exp3[i].n_samples); ASSERT(obs3[i].start == exp3[i].start); ASSERT(obs3[i].stop == exp3[i].stop); ASSERT(obs3[i].tid == exp3[i].tid); } std::vector obs4(3); std::vector exp4(3); exp4[0].g_unifrac_alpha = 1.0; exp4[0].n_samples = 100; exp4[0].bypass_tips = false; exp4[0].start = 26; exp4[0].stop = 51; exp4[0].tid = 0; exp4[1].g_unifrac_alpha = 1.0; exp4[1].n_samples = 100; exp4[1].bypass_tips = false; exp4[1].start = 51; exp4[1].stop = 76; exp4[1].tid = 1; exp4[2].g_unifrac_alpha = 1.0; exp4[2].n_samples = 100; exp4[2].bypass_tips = false; exp4[2].start = 76; exp4[2].stop = 100; exp4[2].tid = 2; set_tasks(obs4, 1.0, 100, 26, 100, false, 3); for(unsigned int i=0; i < 3; i++) { ASSERT(obs4[i].g_unifrac_alpha == exp4[i].g_unifrac_alpha); ASSERT(obs4[i].n_samples == exp4[i].n_samples); ASSERT(obs4[i].start == exp4[i].start); ASSERT(obs4[i].stop == exp4[i].stop); ASSERT(obs4[i].tid == exp4[i].tid); } // set_tasks boundary bug std::vector obs16(16); std::vector exp16(16); set_tasks(obs16, 1.0, 9511, 0, 0, false, 16); exp16[15].start = 4459; exp16[15].stop = 4756; ASSERT(obs16[15].start == exp16[15].start); ASSERT(obs16[15].stop == exp16[15].stop); SUITE_END(); } void test_bptree_constructor_newline_bug() { SUITE_START("test bptree constructor newline bug"); su::BPTree tree = 
su::BPTree("((362be41f31fd26be95ae43a8769b91c0:0.116350803,(a16679d5a10caa9753f171977552d920:0.105836235,((a7acc2abb505c3ee177a12e514d3b994:0.008268754,(4e22aa3508b98813f52e1a12ffdb74ad:0.03144211,8139c4ac825dae48454fb4800fb87896:0.043622957)0.923:0.046588301)0.997:0.120902074,((2d3df7387323e2edcbbfcb6e56a02710:0.031543994,3f6752aabcc291b67a063fb6492fd107:0.091571442)0.759:0.016335166,((d599ebe277afb0dfd4ad3c2176afc50e:5e-09,84d0affc7243c7d6261f3a7d680b873f:0.010245188)0.883:0.048993011,51121722488d0c3da1388d1b117cd239:0.119447926)0.763:0.035660204)0.921:0.058191474)0.776:0.02854575)0.657:0.052060833)0.658:0.032547569,(99647b51f775c8ddde8ed36a7d60dbcd:0.173334268,(f18a9c8112372e2916a66a9778f3741b:0.194813398,(5833416522de0cca717a1abf720079ac:5e-09,(2bf1067d2cd4f09671e3ebe5500205ca:0.031692682,(b32621bcd86cb99e846d8f6fee7c9ab8:0.031330707,1016319c25196d73bdb3096d86a9df2f:5e-09)0.058:0.01028612)0.849:0.010284866)0.791:0.041353384)0.922:0.109470534):0.022169824000000005)root;\n\n"); SUITE_END(); } int main(int argc, char** argv) { test_bptree_constructor_simple(); test_bptree_constructor_newline_bug(); test_bptree_constructor_from_existing(); test_bptree_constructor_single_descendent(); test_bptree_constructor_complex(); test_bptree_constructor_semicolon(); test_bptree_constructor_edgecases(); test_bptree_constructor_quoted_comma(); test_bptree_constructor_quoted_parens(); test_bptree_postorder(); test_bptree_preorder(); test_bptree_parent(); test_bptree_leftchild(); test_bptree_rightchild(); test_bptree_rightsibling(); test_bptree_get_tip_names(); test_bptree_mask(); test_bptree_shear_simple(); test_bptree_shear_deep(); test_bptree_collapse_simple(); test_bptree_collapse_edge(); test_biom_constructor(); test_biom_get_obs_data(); test_propstack_constructor(); test_propstack_push_and_pop(); test_propstack_get(); test_unifrac_set_proportions(); test_unifrac_deconvolute_stripes(); test_unifrac_stripes_to_condensed_form_even(); test_unifrac_stripes_to_condensed_form_odd(); test_unweighted_unifrac(); test_unweighted_unifrac_fast(); test_unnormalized_weighted_unifrac(); test_normalized_weighted_unifrac(); test_generalized_unifrac(); test_vaw_unifrac_weighted_normalized(); test_unifrac_sample_counts(); test_set_tasks(); test_test_table_ids_are_subset_of_tree(); test_faith_pd(); test_faith_pd_shear(); printf("\n"); printf(" %i / %i suites failed\n", suites_failed, suites_run); printf(" %i / %i suites empty\n", suites_empty, suites_run); printf(" %i / %i tests failed\n", tests_failed, tests_run); printf("\n THE END.\n"); return tests_failed ? EXIT_FAILURE : EXIT_SUCCESS; } unifrac-0.10.0/sucpp/tree.cpp000066400000000000000000000320031351072301000160020ustar00rootroot00000000000000#include "tree.hpp" #include #include using namespace su; BPTree::BPTree(std::string newick) { openclose = std::vector(); lengths = std::vector(); names = std::vector(); excess = std::vector(); select_0_index = std::vector(); select_1_index = std::vector(); structure = std::vector(); structure.reserve(500000); // a fair sized tree... avoid reallocs, and its not _that_ much waste if this is wrong // three pass for parse. 
not ideal, but easier to map from IOW code newick_to_bp(newick); // resize is correct here as we are not performing a push_back openclose.resize(nparens); lengths.resize(nparens); names.resize(nparens); select_0_index.resize(nparens / 2); select_1_index.resize(nparens / 2); excess.resize(nparens); structure_to_openclose(); newick_to_metadata(newick); index_and_cache(); } BPTree::BPTree(std::vector input_structure, std::vector input_lengths, std::vector input_names) { structure = input_structure; lengths = input_lengths; names = input_names; nparens = structure.size(); openclose = std::vector(); select_0_index = std::vector(); select_1_index = std::vector(); openclose.resize(nparens); select_0_index.resize(nparens / 2); select_1_index.resize(nparens / 2); excess.resize(nparens); structure_to_openclose(); index_and_cache(); } BPTree BPTree::mask(std::vector topology_mask, std::vector in_lengths) { std::vector new_structure = std::vector(); std::vector new_lengths = std::vector(); std::vector new_names = std::vector(); uint32_t count = 0; for(auto i = topology_mask.begin(); i != topology_mask.end(); i++) { if(*i) count++; } new_structure.resize(count); new_lengths.resize(count); new_names.resize(count); auto mask_it = topology_mask.begin(); auto base_it = this->structure.begin(); uint32_t new_idx = 0; uint32_t old_idx = 0; for(; mask_it != topology_mask.end(); mask_it++, base_it++, old_idx++) { if(*mask_it) { new_structure[new_idx] = this->structure[old_idx]; new_lengths[new_idx] = in_lengths[old_idx]; new_names[new_idx] = this->names[old_idx]; new_idx++; } } return BPTree(new_structure, new_lengths, new_names); } std::unordered_set BPTree::get_tip_names() { std::unordered_set observed; for(unsigned int i = 0; i < this->nparens; i++) { if(this->isleaf(i)) { observed.insert(this->names[i]); } } return observed; } BPTree BPTree::shear(std::unordered_set to_keep) { std::vector shearmask = std::vector(this->nparens); int32_t p; for(unsigned int i = 0; i < this->nparens; i++) { if(this->isleaf(i) && to_keep.count(this->names[i]) > 0) { shearmask[i] = true; shearmask[i+1] = true; p = this->parent(i); while(p != -1 && !shearmask[p]) { shearmask[p] = true; shearmask[this->close(p)] = true; p = this->parent(p); } } } return this->mask(shearmask, this->lengths); } BPTree BPTree::collapse() { std::vector collapsemask = std::vector(this->nparens); std::vector new_lengths = std::vector(this->lengths); uint32_t current, first, last; for(uint32_t i = 0; i < this->nparens / 2; i++) { current = this->preorderselect(i); if(this->isleaf(current) or (current == 0)) { // 0 == root collapsemask[current] = true; collapsemask[this->close(current)] = true; } else { first = this->leftchild(current); last = this->rightchild(current); if(first == last) { new_lengths[first] = new_lengths[first] + new_lengths[current]; } else { collapsemask[current] = true; collapsemask[this->close(current)] = true; } } } return this->mask(collapsemask, new_lengths); } /* mask = bit_array_create(self.B.size) bit_array_set_bit(mask, self.root()) bit_array_set_bit(mask, self.close(self.root())) new_lengths = self._lengths.copy() new_lengths_ptr = new_lengths.data with nogil: for i in range(n): current = self.preorderselect(i) if self.isleaf(current): bit_array_set_bit(mask, current) bit_array_set_bit(mask, self.close(current)) else: first = self.fchild(current) last = self.lchild(current) if first == last: new_lengths_ptr[first] = new_lengths_ptr[first] + \ new_lengths_ptr[current] else: bit_array_set_bit(mask, current) 
bit_array_set_bit(mask, self.close(current)) new_bp = self._mask_from_self(mask, new_lengths) bit_array_free(mask) return new_bp */ BPTree::~BPTree() { } void BPTree::index_and_cache() { // should probably do the open/close in here too unsigned int idx = 0; auto i = structure.begin(); auto k0 = select_0_index.begin(); auto k1 = select_1_index.begin(); auto e_it = excess.begin(); unsigned int e = 0; for(; i != structure.end(); i++, idx++ ) { if(*i) { *(k1++) = idx; *(e_it++) = ++e; } else { *(k0++) = idx; *(e_it++) = --e; } } } uint32_t BPTree::postorderselect(uint32_t k) { return open(select_0_index[k]); } uint32_t BPTree::preorderselect(uint32_t k) { return select_1_index[k]; } inline uint32_t BPTree::open(uint32_t i) { return structure[i] ? i : openclose[i]; } inline uint32_t BPTree::close(uint32_t i) { return structure[i] ? openclose[i] : i; } bool BPTree::isleaf(unsigned int idx) { return (structure[idx] && !structure[idx + 1]); } uint32_t BPTree::leftchild(uint32_t i) { // aka fchild if(isleaf(i)) return 0; // this is awkward, using 0 which is root, but a root cannot be a child. edge case else return i + 1; } uint32_t BPTree::rightchild(uint32_t i) { // aka lchild if(isleaf(i)) return 0; // this is awkward, using 0 which is root, but a root cannot be a child. edge case else return open(close(i) - 1); } uint32_t BPTree::rightsibling(uint32_t i) { // aka nsibling uint32_t position = close(i) + 1; if(position >= nparens) return 0; // will return 0 if no sibling as root cannot have a sibling else if(structure[position]) return position; else return 0; } int32_t BPTree::parent(uint32_t i) { return enclose(i); } int32_t BPTree::enclose(uint32_t i) { if(structure[i]) return bwd(i, -2) + 1; else return bwd(i - 1, -2) + 1; } int32_t BPTree::bwd(uint32_t i, int d) { uint32_t target_excess = excess[i] + d; for(int current_idx = i - 1; current_idx >= 0; current_idx--) { if(excess[current_idx] == target_excess) return current_idx; } return -1; } void BPTree::newick_to_bp(std::string newick) { char last_structure; bool potential_single_descendent = false; int count = 0; bool in_quote = false; for(auto c = newick.begin(); c != newick.end(); c++) { if(*c == '\'') in_quote = !in_quote; if(in_quote) continue; switch(*c) { case '(': // opening of a node count++; structure.push_back(true); last_structure = *c; potential_single_descendent = true; break; case ')': // closing of a node if(potential_single_descendent || (last_structure == ',')) { // we have a single descendent or a last child (i.e. ",)" scenario) count += 3; structure.push_back(true); structure.push_back(false); structure.push_back(false); potential_single_descendent = false; } else { // it is possible still to have a single descendent in the case of // multiple single descendents (e.g., (...()...) 
) count += 1; structure.push_back(false); } last_structure = *c; break; case ',': if(last_structure != ')') { // we have a new tip count += 2; structure.push_back(true); structure.push_back(false); } potential_single_descendent = false; last_structure = *c; break; default: break; } } nparens = structure.size(); } void BPTree::structure_to_openclose() { std::stack oc; unsigned int open_idx; unsigned int i = 0; for(auto it = structure.begin(); it != structure.end(); it++, i++) { if(*it) { oc.push(i); } else { open_idx = oc.top(); oc.pop(); openclose[i] = open_idx; openclose[open_idx] = i; } } } // trim from end // from http://stackoverflow.com/a/217605 static inline std::string &rtrim(std::string &s) { s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun(std::isspace))).base(), s.end()); return s; } //// WEIRDNESS. THIS SOLVES IT WITH THE RTRIM. ISOLATE, MOVE TO CONSTRUCTOR. void BPTree::newick_to_metadata(std::string newick) { newick = rtrim(newick); std::string::iterator start = newick.begin(); std::string::iterator end = newick.end(); std::string token; char last_structure = '\0'; unsigned int structure_idx = 0; unsigned int lag = 0; unsigned int open_idx; while(start != end) { token = tokenize(start, end); // this sucks. if(token.length() == 1 && is_structure_character(token[0])) { switch(token[0]) { case '(': structure_idx++; break; case ')': case ',': structure_idx++; if(last_structure == ')') lag++; break; } } else { // puts us on the corresponding closing parenthesis structure_idx += lag; lag = 0; open_idx = open(structure_idx); set_node_metadata(open_idx, token); // std::cout << structure_idx << " <-> " << open_idx << " " << token << std::endl; // make sure to advance an extra position if we are a leaf as the // as a leaf is by definition a 10, and doing a single advancement // would put the structure to token mapping out of sync if(isleaf(open_idx)) structure_idx += 2; else structure_idx += 1; } last_structure = token[0]; } } void BPTree::set_node_metadata(unsigned int open_idx, std::string &token) { double length = 0.0; std::string name = std::string(); unsigned int colon_idx = token.find_last_of(':'); if(colon_idx == 0) length = std::stof(token.substr(1)); else if(colon_idx < token.length()) { name = token.substr(0, colon_idx); length = std::stof(token.substr(colon_idx + 1)); } else name = token; names[open_idx] = name; lengths[open_idx] = length; } inline bool BPTree::is_structure_character(char c) { return (c == '(' || c == ')' || c == ',' || c == ';'); } std::string BPTree::tokenize(std::string::iterator &start, const std::string::iterator &end) { bool inquote = false; bool isquote = false; char c; std::string token; do { c = *start; start++; if(c == '\n') { continue; } isquote = c == '\''; if(inquote && isquote) { inquote = false; continue; } else if(!inquote && isquote) { inquote = true; continue; } if(is_structure_character(c) && !inquote) { if(token.length() == 0) token.push_back(c); break; } token.push_back(c); } while(start != end); return token; } std::vector BPTree::get_structure() { return structure; } std::vector BPTree::get_openclose() { return openclose; } unifrac-0.10.0/sucpp/tree.hpp000066400000000000000000000110641351072301000160130ustar00rootroot00000000000000#include #include #include #include #include namespace su { class BPTree { public: /* tracked attributes */ std::vector lengths; std::vector names; /* total number of parentheses */ uint32_t nparens; /* default constructor * * @param newick A newick string */ BPTree(std::string newick); /* 
constructor from a defined topology * * @param input_structure A boolean vector defining the topology * @param input_lengths A vector of double of the branch lengths * @param input_names A vector of str of the vertex names */ BPTree(std::vector input_structure, std::vector input_lengths, std::vector input_names); ~BPTree(); /* postorder tree traversal * * Get the index position of the ith node in a postorder tree * traversal. * * @param i The ith node in a postorder traversal */ uint32_t postorderselect(uint32_t i); /* preorder tree traversal * * Get the index position of the ith node in a preorder tree * traversal. * * @param i The ith node in a preorder traversal */ uint32_t preorderselect(uint32_t i); /* Test if the node at an index position is a leaf * * @param i The node to evaluate */ bool isleaf(uint32_t i); /* Get the left child of a node * * @param i The node to obtain the left child from */ uint32_t leftchild(uint32_t i); /* Get the right child of a node * * @param i The node to obtain the right child from */ uint32_t rightchild(uint32_t i); /* Get the right sibling of a node * * @param i The node to obtain the right sibling from */ uint32_t rightsibling(uint32_t i); /* Get the parent of a node * * @param i The node to obtain the parent of */ int32_t parent(uint32_t i); /* get the names at the tips of the tree */ std::unordered_set get_tip_names(); /* public getters */ std::vector get_structure(); std::vector get_openclose(); /* serialize the structure as a sequence of 1s and 0s */ void print() { for(auto c = structure.begin(); c != structure.end(); c++) { if(*c) std::cout << "1"; else std::cout << "0"; } std::cout << std::endl; } BPTree mask(std::vector topology_mask, std::vector in_lengths); // mask self BPTree shear(std::unordered_set to_keep); BPTree collapse(); private: std::vector structure; // the topology std::vector openclose; // cache'd mapping between parentheses std::vector select_0_index; // cache of select 0 std::vector select_1_index; // cache of select 1 std::vector excess; void index_and_cache(); // construct the select caches void newick_to_bp(std::string newick); // convert a newick string to parentheses void newick_to_metadata(std::string newick); // convert newick to attributes void structure_to_openclose(); // set the cache mapping between parentheses pairs void set_node_metadata(unsigned int open_idx, std::string &token); // set attributes for a node bool is_structure_character(char c); // test if a character is a newick structure inline uint32_t open(uint32_t i); // obtain the index of the opening for a given parenthesis inline uint32_t close(uint32_t i); // obtain the index of the closing for a given parenthesis std::string tokenize(std::string::iterator &start, const std::string::iterator &end); // newick -> tokens int32_t bwd(uint32_t i, int32_t d); int32_t enclose(uint32_t i); }; } unifrac-0.10.0/sucpp/unifrac.cpp000066400000000000000000000501731351072301000165020ustar00rootroot00000000000000#include "tree.hpp" #include "biom.hpp" #include "unifrac.hpp" #include "affinity.hpp" #include #include #include #include #include #include #include static pthread_mutex_t printf_mutex; static bool* report_status; std::string su::test_table_ids_are_subset_of_tree(su::biom &table, su::BPTree &tree) { std::unordered_set tip_names = tree.get_tip_names(); std::unordered_set::const_iterator hit; std::string a_missing_name = ""; for(auto i : table.obs_ids) { hit = tip_names.find(i); if(hit == tip_names.end()) { a_missing_name = i; break; } } return a_missing_name; } int 
sync_printf(const char *format, ...) { // https://stackoverflow.com/a/23587285/19741 va_list args; va_start(args, format); pthread_mutex_lock(&printf_mutex); vprintf(format, args); pthread_mutex_unlock(&printf_mutex); va_end(args); } void sig_handler(int signo) { // http://www.thegeekstuff.com/2012/03/catch-signals-sample-c-code if (signo == SIGUSR1) { if(report_status == NULL) fprintf(stderr, "Cannot report status.\n"); else { for(int i = 0; i < CPU_SETSIZE; i++) { report_status[i] = true; } } } } using namespace su; PropStack::PropStack(uint32_t vecsize) { defaultsize = vecsize; prop_stack = std::stack(); prop_map = std::unordered_map(); prop_map.reserve(1000); } PropStack::~PropStack() { double *vec; // drain stack for(unsigned int i = 0; i < prop_stack.size(); i++) { vec = prop_stack.top(); prop_stack.pop(); free(vec); } // drain the map for(auto it = prop_map.begin(); it != prop_map.end(); it++) { vec = it->second; free(vec); } prop_map.clear(); } double* PropStack::get(uint32_t i) { return prop_map[i]; } void PropStack::push(uint32_t node) { double* vec = prop_map[node]; prop_map.erase(node); prop_stack.push(vec); } double* PropStack::pop(uint32_t node) { /* * if we don't have any available vectors, create one * add it to our record of known vectors so we can track our mallocs */ double *vec; int err = 0; if(prop_stack.empty()) { err = posix_memalign((void **)&vec, 32, sizeof(double) * defaultsize); if(vec == NULL || err != 0) { fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", sizeof(double) * defaultsize, err, __FILE__, __LINE__); exit(EXIT_FAILURE); } } else { vec = prop_stack.top(); prop_stack.pop(); } prop_map[node] = vec; return vec; } double** su::deconvolute_stripes(std::vector &stripes, uint32_t n) { // would be better to just do striped_to_condensed_form double **dm; dm = (double**)malloc(sizeof(double*) * n); if(dm == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(double*) * n, __FILE__, __LINE__); exit(EXIT_FAILURE); } for(unsigned int i = 0; i < n; i++) { dm[i] = (double*)malloc(sizeof(double) * n); if(dm[i] == NULL) { fprintf(stderr, "Failed to allocate %zd bytes; [%s]:%d\n", sizeof(double) * n, __FILE__, __LINE__); exit(EXIT_FAILURE); } dm[i][i] = 0; } for(unsigned int i = 0; i < stripes.size(); i++) { double *vec = stripes[i]; unsigned int k = 0; for(unsigned int row = 0, col = i + 1; row < n; row++, col++) { if(col < n) { dm[row][col] = vec[k]; dm[col][row] = vec[k]; } else { dm[col % n][row] = vec[k]; dm[row][col % n] = vec[k]; } k++; } } return dm; } void su::stripes_to_condensed_form(std::vector &stripes, uint32_t n, double* &cf, unsigned int start, unsigned int stop) { // n must be >= 2, but that should be enforced upstream as that would imply // computing unifrac on a single sample. 
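    // Worked example of the condensed-form index math below: for n = 4
    // samples, comb_2(4) = 6 and the condensed form holds the pairs
    // (0,1),(0,2),(0,3),(1,2),(1,3),(2,3) in that order. An element (i, j)
    // with i < j lands at comb_2(n) - comb_2(n - i) + (j - i - 1); e.g.
    // (1, 3) maps to 6 - comb_2(3) + 1 = 6 - 3 + 1 = 4, the fifth entry.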
uint64_t comb_N = comb_2(n); for(unsigned int stripe = start; stripe < stop; stripe++) { // compute the (i, j) position of each element in each stripe uint64_t i = 0; uint64_t j = stripe + 1; for(uint64_t k = 0; k < n; k++, i++, j++) { if(j == n) { i = 0; j = n - (stripe + 1); } // determine the position in the condensed form vector for a given (i, j) // based off of // https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html uint64_t comb_N_minus_i = comb_2(n - i); cf[comb_N - comb_N_minus_i + (j - i - 1)] = stripes[stripe][k]; } } } void progressbar(float progress) { // from http://stackoverflow.com/a/14539953 // // could encapsulate into a classs for displaying time elapsed etc int barWidth = 70; std::cout << "["; int pos = barWidth * progress; for (int i = 0; i < barWidth; ++i) { if (i < pos) std::cout << "="; else if (i == pos) std::cout << ">"; else std::cout << " "; } std::cout << "] " << int(progress * 100.0) << " %\r"; std::cout.flush(); } void initialize_embedded(double*& prop, const su::task_parameters* task_p) { int err = 0; err = posix_memalign((void **)&prop, 32, sizeof(double) * task_p->n_samples * 2); if(prop == NULL || err != 0) { fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", sizeof(double) * task_p->n_samples * 2, err, __FILE__, __LINE__); exit(EXIT_FAILURE); } } void initialize_sample_counts(double*& counts, const su::task_parameters* task_p, biom &table) { int err = 0; err = posix_memalign((void **)&counts, 32, sizeof(double) * task_p->n_samples * 2); if(counts == NULL || err != 0) { fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", sizeof(double) * task_p->n_samples, err, __FILE__, __LINE__); exit(EXIT_FAILURE); } for(unsigned int i = 0; i < table.n_samples; i++) { counts[i] = table.sample_counts[i]; counts[i + table.n_samples] = table.sample_counts[i]; } } void initialize_stripes(std::vector &dm_stripes, std::vector &dm_stripes_total, Method unifrac_method, const su::task_parameters* task_p) { int err = 0; for(unsigned int i = task_p->start; i < task_p->stop; i++){ err = posix_memalign((void **)&dm_stripes[i], 32, sizeof(double) * task_p->n_samples); if(dm_stripes[i] == NULL || err != 0) { fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", sizeof(double) * task_p->n_samples, err, __FILE__, __LINE__); exit(EXIT_FAILURE); } for(unsigned int j = 0; j < task_p->n_samples; j++) dm_stripes[i][j] = 0.; if(unifrac_method == unweighted || unifrac_method == weighted_normalized || unifrac_method == generalized) { err = posix_memalign((void **)&dm_stripes_total[i], 32, sizeof(double) * task_p->n_samples); if(dm_stripes_total[i] == NULL || err != 0) { fprintf(stderr, "Failed to allocate %zd bytes err %d; [%s]:%d\n", sizeof(double) * task_p->n_samples, err, __FILE__, __LINE__); exit(EXIT_FAILURE); } for(unsigned int j = 0; j < task_p->n_samples; j++) dm_stripes_total[i][j] = 0.; } } } // Computes Faith's PD for the samples in `table` over the phylogenetic // tree given by `tree`. 
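// Faith's PD for a sample is the sum of the branch lengths of every branch
// that leads to at least one feature observed in that sample; the loop below
// adds `length` whenever the node's proportion for the sample is non-zero.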
// Assure that tree does not contain ids that are not in table void su::faith_pd(biom &table, BPTree &tree, double* result) { PropStack propstack(table.n_samples); uint32_t node; double *node_proportions; double length; // for node in postorderselect for(unsigned int k = 0; k < (tree.nparens / 2) - 1; k++) { node = tree.postorderselect(k); // get branch length length = tree.lengths[node]; // get node proportions and set intermediate scores node_proportions = propstack.pop(node); set_proportions(node_proportions, tree, node, table, propstack); for (unsigned int sample = 0; sample < table.n_samples; sample++){ // calculate contribution of node to score result[sample] += (node_proportions[sample] > 0) * length; } } } void su::unifrac(biom &table, BPTree &tree, Method unifrac_method, std::vector &dm_stripes, std::vector &dm_stripes_total, const su::task_parameters* task_p) { // processor affinity int err = bind_to_core(task_p->tid); if(err != 0) { fprintf(stderr, "Unable to bind thread %d to core: %d\n", task_p->tid, err); exit(EXIT_FAILURE); } if(table.n_samples != task_p->n_samples) { fprintf(stderr, "Task and table n_samples not equal\n"); exit(EXIT_FAILURE); } void (*func)(std::vector&, // dm_stripes std::vector&, // dm_stripes_total double*, // embedded_proportions double, // length const su::task_parameters*); switch(unifrac_method) { case unweighted: func = &su::_unweighted_unifrac_task; break; case weighted_normalized: func = &su::_normalized_weighted_unifrac_task; break; case weighted_unnormalized: func = &su::_unnormalized_weighted_unifrac_task; break; case generalized: func = &su::_generalized_unifrac_task; break; default: func = NULL; break; } if(func == NULL) { fprintf(stderr, "Unknown unifrac task\n"); exit(1); } PropStack propstack(table.n_samples); uint32_t node; double *node_proportions; double *embedded_proportions; double length; initialize_embedded(embedded_proportions, task_p); initialize_stripes(std::ref(dm_stripes), std::ref(dm_stripes_total), unifrac_method, task_p); for(unsigned int k = 0; k < (tree.nparens / 2) - 1; k++) { node = tree.postorderselect(k); length = tree.lengths[node]; node_proportions = propstack.pop(node); set_proportions(node_proportions, tree, node, table, propstack); if(task_p->bypass_tips && tree.isleaf(node)) continue; embed_proportions(embedded_proportions, node_proportions, task_p->n_samples); /* * The values in the example vectors correspond to index positions of an * element in the resulting distance matrix. So, in the example below, * the following can be interpreted: * * [0 1 2] * [1 2 3] * * As comparing the sample for row 0 against the sample for col 1, the * sample for row 1 against the sample for col 2, the sample for row 2 * against the sample for col 3. * * In other words, we're computing stripes of a distance matrix. In the * following example, we're computing over 6 samples requiring 3 * stripes. * * A; stripe == 0 * [0 1 2 3 4 5] * [1 2 3 4 5 0] * * B; stripe == 1 * [0 1 2 3 4 5] * [2 3 4 5 0 1] * * C; stripe == 2 * [0 1 2 3 4 5] * [3 4 5 0 1 2] * * The stripes end up computing the following positions in the distance * matrix. * * x A B C x x * x x A B C x * x x x A B C * C x x x A B * B C x x x A * A B C x x x * * However, we store those stripes as vectors, ie * [ A A A A A A ] * * We end up performing N / 2 redundant calculations on the last stripe * (see C) but that is small over large N. 
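         * As a concrete example, with 6 samples and stripe == 1 the task
         * accumulates |u - v| for the pairs (0,2), (1,3), (2,4), (3,5),
         * (4,0) and (5,1); duplicating the proportions into the embedded
         * vector is what lets v = embedded_proportions[k + stripe + 1] run
         * past the last sample without an explicit modulo.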
*/ func(dm_stripes, dm_stripes_total, embedded_proportions, length, task_p); if(__builtin_expect(report_status[task_p->tid], false)) { sync_printf("tid:%d\tstart:%d\tstop:%d\tk:%d\ttotal:%d\n", task_p->tid, task_p->start, task_p->stop, k, (tree.nparens / 2) - 1); report_status[task_p->tid] = false; } } if(unifrac_method == weighted_normalized || unifrac_method == unweighted || unifrac_method == generalized) { for(unsigned int i = task_p->start; i < task_p->stop; i++) { for(unsigned int j = 0; j < task_p->n_samples; j++) { dm_stripes[i][j] = dm_stripes[i][j] / dm_stripes_total[i][j]; } } } free(embedded_proportions); } void su::unifrac_vaw(biom &table, BPTree &tree, Method unifrac_method, std::vector &dm_stripes, std::vector &dm_stripes_total, const su::task_parameters* task_p) { // processor affinity int err = bind_to_core(task_p->tid); if(err != 0) { fprintf(stderr, "Unable to bind thread %d to core: %d\n", task_p->tid, err); exit(EXIT_FAILURE); } if(table.n_samples != task_p->n_samples) { fprintf(stderr, "Task and table n_samples not equal\n"); exit(EXIT_FAILURE); } void (*func)(std::vector&, // dm_stripes std::vector&, // dm_stripes_total double*, // embedded_proportions double*, // embedded_counts double*, // sample total counts double, // length const su::task_parameters*); switch(unifrac_method) { case unweighted: func = &su::_vaw_unweighted_unifrac_task; break; case weighted_normalized: func = &su::_vaw_normalized_weighted_unifrac_task; break; case weighted_unnormalized: func = &su::_vaw_unnormalized_weighted_unifrac_task; break; case generalized: func = &su::_vaw_generalized_unifrac_task; break; default: func = NULL; break; } if(func == NULL) { fprintf(stderr, "Unknown unifrac task\n"); exit(1); } PropStack propstack(table.n_samples); PropStack countstack(table.n_samples); uint32_t node; double *node_proportions; double *node_counts; double *embedded_proportions; double *embedded_counts; double *sample_total_counts; double length; initialize_embedded(embedded_proportions, task_p); initialize_embedded(embedded_counts, task_p); initialize_sample_counts(sample_total_counts, task_p, table); initialize_stripes(std::ref(dm_stripes), std::ref(dm_stripes_total), unifrac_method, task_p); for(unsigned int k = 0; k < (tree.nparens / 2) - 1; k++) { node = tree.postorderselect(k); length = tree.lengths[node]; node_proportions = propstack.pop(node); node_counts = countstack.pop(node); set_proportions(node_proportions, tree, node, table, propstack); set_proportions(node_counts, tree, node, table, countstack, false); if(task_p->bypass_tips && tree.isleaf(node)) continue; embed_proportions(embedded_proportions, node_proportions, task_p->n_samples); embed_proportions(embedded_counts, node_counts, task_p->n_samples); func(dm_stripes, dm_stripes_total, embedded_proportions, embedded_counts, sample_total_counts, length, task_p); if(__builtin_expect(report_status[task_p->tid], false)) { sync_printf("tid:%d\tstart:%d\tstop:%d\tk:%d\ttotal:%d\n", task_p->tid, task_p->start, task_p->stop, k, (tree.nparens / 2) - 1); report_status[task_p->tid] = false; } } if(unifrac_method == weighted_normalized || unifrac_method == unweighted || unifrac_method == generalized) { for(unsigned int i = task_p->start; i < task_p->stop; i++) { for(unsigned int j = 0; j < task_p->n_samples; j++) { dm_stripes[i][j] = dm_stripes[i][j] / dm_stripes_total[i][j]; } } } free(embedded_proportions); free(embedded_counts); free(sample_total_counts); } void su::set_proportions(double* props, BPTree &tree, uint32_t node, biom &table, 
PropStack &ps, bool normalize) { if(tree.isleaf(node)) { table.get_obs_data(tree.names[node], props); for(unsigned int i = 0; i < table.n_samples; i++) { props[i] = props[i]; if(normalize) props[i] /= table.sample_counts[i]; } } else { unsigned int current = tree.leftchild(node); unsigned int right = tree.rightchild(node); double *vec; for(unsigned int i = 0; i < table.n_samples; i++) props[i] = 0; while(current <= right && current != 0) { vec = ps.get(current); // pull from prop map ps.push(current); // remove from prop map, place back on stack for(unsigned int i = 0; i < table.n_samples; i++) props[i] = props[i] + vec[i]; current = tree.rightsibling(current); } } } std::vector su::make_strides(unsigned int n_samples) { uint32_t n_rotations = (n_samples + 1) / 2; std::vector dm_stripes(n_rotations); int err = 0; for(unsigned int i = 0; i < n_rotations; i++) { double* tmp; err = posix_memalign((void **)&tmp, 32, sizeof(double) * n_samples); if(tmp == NULL || err != 0) { fprintf(stderr, "Failed to allocate %zd bytes, err %d; [%s]:%d\n", sizeof(double) * n_samples, err, __FILE__, __LINE__); exit(EXIT_FAILURE); } for(unsigned int j = 0; j < n_samples; j++) tmp[j] = 0.0; dm_stripes[i] = tmp; } return dm_stripes; } void su::process_stripes(biom &table, BPTree &tree_sheared, Method method, bool variance_adjust, std::vector &dm_stripes, std::vector &dm_stripes_total, std::vector &threads, std::vector &tasks) { // register a signal handler so we can ask the master thread for its // progress if (signal(SIGUSR1, sig_handler) == SIG_ERR) fprintf(stderr, "Can't catch SIGUSR1\n"); report_status = (bool*)calloc(sizeof(bool), CPU_SETSIZE); pthread_mutex_init(&printf_mutex, NULL); for(unsigned int tid = 0; tid < threads.size(); tid++) { if(variance_adjust) threads[tid] = std::thread(su::unifrac_vaw, std::ref(table), std::ref(tree_sheared), method, std::ref(dm_stripes), std::ref(dm_stripes_total), &tasks[tid]); else threads[tid] = std::thread(su::unifrac, std::ref(table), std::ref(tree_sheared), method, std::ref(dm_stripes), std::ref(dm_stripes_total), &tasks[tid]); } for(unsigned int tid = 0; tid < threads.size(); tid++) { threads[tid].join(); } if(report_status != NULL) { pthread_mutex_destroy(&printf_mutex); free(report_status); } } unifrac-0.10.0/sucpp/unifrac.hpp000066400000000000000000000067031351072301000165070ustar00rootroot00000000000000#include #include #include #include #include "unifrac_task.hpp" #include #ifndef __UNIFRAC namespace su { enum Method {unweighted, weighted_normalized, weighted_unnormalized, generalized}; class PropStack { private: std::stack prop_stack; std::unordered_map prop_map; uint32_t defaultsize; public: PropStack(uint32_t vecsize); ~PropStack(); double* pop(uint32_t i); void push(uint32_t i); double* get(uint32_t i); }; void faith_pd(biom &table, BPTree &tree, double* result); std::string test_table_ids_are_subset_of_tree(biom &table, BPTree &tree); void unifrac(biom &table, BPTree &tree, Method unifrac_method, std::vector &dm_stripes, std::vector &dm_stripes_total, const task_parameters* task_p); void unifrac_vaw(biom &table, BPTree &tree, Method unifrac_method, std::vector &dm_stripes, std::vector &dm_stripes_total, const task_parameters* task_p); double** deconvolute_stripes(std::vector &stripes, uint32_t n); void stripes_to_condensed_form(std::vector &stripes, uint32_t n, double* &cf, unsigned int start, unsigned int stop); void set_proportions(double* props, BPTree &tree, uint32_t node, biom &table, PropStack &ps, bool normalize = true); std::vector 
make_strides(unsigned int n_samples); inline void embed_proportions(double* out, double* in, uint32_t n) { double val; for(unsigned int i = 0; i < n; i++) { val = in[i]; out[i] = val; out[i + n] = val; } } inline uint64_t comb_2(uint64_t N) { // based off of _comb_int_long // https://github.com/scipy/scipy/blob/v0.19.1/scipy/special/_comb.pyx // Compute binom(N, k) for integers. // // we're disregarding overflow as that practically should not // happen unless the number of samples processed is in excess // of 4 billion uint64_t val, j, M, nterms; uint64_t k = 2; M = N + 1; nterms = k < (N - k) ? k : N - k; val = 1; for(j = 1; j < nterms + 1; j++) { val *= M - j; val /= j; } return val; } // process the stripes described by tasks void process_stripes(biom &table, BPTree &tree_sheared, Method method, bool variance_adjust, std::vector &dm_stripes, std::vector &dm_stripes_total, std::vector &threads, std::vector &tasks); } #define __UNIFRAC 1 #endif unifrac-0.10.0/sucpp/unifrac_task.cpp000066400000000000000000000346721351072301000175320ustar00rootroot00000000000000#include "unifrac_task.hpp" #include void su::_unnormalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const su::task_parameters* task_p) { double *dm_stripe; for(unsigned int stripe=task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; /* intrinsics yield about a 2x reduction in runtime on llvm. they * were not effective on linux gcc 4.9.1 or 4.9.2. it is unclear * if they would be effective on other versions of gcc. * * one reason they help is that these for loops are not easily * autovectorizable. using the intrinsics effectively gets around * this. ...although, it also appears that loop unrolling works. * * it may make sense to revisit the inclusion of intriniscs, however * support must be tested at compile time, so it's rather annoying * at the moment. basically, we can't assume the presence of avx2. 
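         * The manual unrolling that follows processes samples four at a time
         * and then handles the n_samples % 4 remainder in a scalar tail loop;
         * with, say, n_samples == 10 the unrolled body covers k = 0..7 and
         * the tail loop covers k = 8 and 9.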
*/ for(unsigned int j = 0; j < task_p->n_samples / 4; j++) { int k = j * 4; double u1 = embedded_proportions[k]; double u2 = embedded_proportions[k + 1]; double u3 = embedded_proportions[k + 2]; double u4 = embedded_proportions[k + 3]; double v1 = embedded_proportions[k + stripe + 1]; double v2 = embedded_proportions[k + stripe + 2]; double v3 = embedded_proportions[k + stripe + 3]; double v4 = embedded_proportions[k + stripe + 4]; dm_stripe[k] += fabs(u1 - v1) * length; dm_stripe[k + 1] += fabs(u2 - v2) * length; dm_stripe[k + 2] += fabs(u3 - v3) * length; dm_stripe[k + 3] += fabs(u4 - v4) * length; } if((task_p->n_samples % 4) != 0) { for(unsigned int k = task_p->n_samples - (task_p->n_samples % 4); k < task_p->n_samples; k++) { double u = embedded_proportions[k]; double v = embedded_proportions[k + stripe + 1]; dm_stripe[k] += fabs(u - v) * length; } } } } void su::_vaw_unnormalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p) { double *dm_stripe; for(unsigned int stripe=task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; for(unsigned int j = 0; j < task_p->n_samples; j++) { double u = embedded_proportions[j]; double v = embedded_proportions[j + stripe + 1]; double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1]; double mi = embedded_counts[j] + embedded_counts[j + stripe + 1]; double vaw = sqrt(mi * (m - mi)); if(vaw > 0) dm_stripe[j] += (fabs(u - v) * length) / vaw; } } } void su::_normalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const su::task_parameters* task_p) { double *dm_stripe; double *dm_stripe_total; unsigned int trailing = task_p->n_samples - (task_p->n_samples % 4); // point of thread for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; dm_stripe_total = dm_stripes_total[stripe]; for(unsigned int j = 0; j < task_p->n_samples / 4; j++) { int k = j * 4; int l = k + stripe; double u1 = embedded_proportions[k]; double u2 = embedded_proportions[k + 1]; double u3 = embedded_proportions[k + 2]; double u4 = embedded_proportions[k + 3]; double v1 = embedded_proportions[l + 1]; double v2 = embedded_proportions[l + 2]; double v3 = embedded_proportions[l + 3]; double v4 = embedded_proportions[l + 4]; double diff1 = u1 - v1; double diff2 = u2 - v2; double diff3 = u3 - v3; double diff4 = u4 - v4; double sum1 = u1 + v1; double sum2 = u2 + v2; double sum3 = u3 + v3; double sum4 = u4 + v4; dm_stripe[k] += fabs(diff1) * length; dm_stripe[k + 1] += fabs(diff2) * length; dm_stripe[k + 2] += fabs(diff3) * length; dm_stripe[k + 3] += fabs(diff4) * length; dm_stripe_total[k] += sum1 * length; dm_stripe_total[k + 1] += sum2 * length; dm_stripe_total[k + 2] += sum3 * length; dm_stripe_total[k + 3] += sum4 * length; } for(unsigned int k = trailing; k < task_p->n_samples; k++) { double u = embedded_proportions[k]; double v = embedded_proportions[k + stripe + 1]; double diff = u - v; double sum = u + v; dm_stripe[k] += fabs(diff) * length; dm_stripe_total[k] += sum * length; } } } void su::_vaw_normalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, 
double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p) { double *dm_stripe; double *dm_stripe_total; // point of thread for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; dm_stripe_total = dm_stripes_total[stripe]; for(unsigned int j = 0; j < task_p->n_samples; j++) { double u = embedded_proportions[j]; double v = embedded_proportions[j + stripe + 1]; double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1]; double mi = embedded_counts[j] + embedded_counts[j + stripe + 1]; double vaw = sqrt(mi * (m - mi)); if(vaw > 0) { dm_stripe[j] += (fabs(u - v) * length) / vaw; dm_stripe_total[j] += ((u + v) * length) / vaw; } } } } #define GUNIFRAC(u, v, s, j) if(s != 0.0) { \ double sub1 = fabs(u - v); \ double sum_pow1 = pow(s, task_p->g_unifrac_alpha) * length; \ dm_stripe[j] += sum_pow1 * (sub1 / s); \ dm_stripe_total[j] += sum_pow1; \ } void su::_generalized_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const su::task_parameters* task_p) { double *dm_stripe; double *dm_stripe_total; unsigned int trailing = task_p->n_samples - (task_p->n_samples % 4); // point of thread for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; dm_stripe_total = dm_stripes_total[stripe]; for(unsigned int j = 0; j < task_p->n_samples / 4; j++) { int k = j * 4; int l = k + stripe; double u1 = embedded_proportions[k]; double u2 = embedded_proportions[k + 1]; double u3 = embedded_proportions[k + 2]; double u4 = embedded_proportions[k + 3]; double v1 = embedded_proportions[l + 1]; double v2 = embedded_proportions[l + 2]; double v3 = embedded_proportions[l + 3]; double v4 = embedded_proportions[l + 4]; double sum1 = u1 + v1; double sum2 = u2 + v2; double sum3 = u3 + v3; double sum4 = u4 + v4; GUNIFRAC(u1, v1, sum1, k) GUNIFRAC(u2, v2, sum2, k + 1) GUNIFRAC(u3, v3, sum3, k + 2) GUNIFRAC(u4, v4, sum4, k + 3) } for(unsigned int k = trailing; k < task_p->n_samples; k++) { double u = embedded_proportions[k]; double v = embedded_proportions[k + stripe + 1]; double s = u + v; GUNIFRAC(u, v, s, k) } } } void su::_vaw_generalized_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p) { double *dm_stripe; double *dm_stripe_total; // point of thread for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; dm_stripe_total = dm_stripes_total[stripe]; for(unsigned int j = 0; j < task_p->n_samples; j++) { double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1]; double mi = embedded_counts[j] + embedded_counts[j + stripe + 1]; double vaw = sqrt(mi * (m - mi)); double u1 = embedded_proportions[j]; double v1 = embedded_proportions[j + stripe + 1]; if(vaw > 0.0) { double sum1 = (u1 + v1) / vaw; double sub1 = fabs(u1 - v1) / vaw; double sum_pow1 = pow(sum1, task_p->g_unifrac_alpha) * length; dm_stripe[j] += sum_pow1 * (sub1 / sum1); dm_stripe_total[j] += sum_pow1; } } } } void su::_unweighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const 
su::task_parameters* task_p) { double *dm_stripe; double *dm_stripe_total; for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; dm_stripe_total = dm_stripes_total[stripe]; for(unsigned int j = 0; j < task_p->n_samples / 4; j++) { int k = j * 4; int32_t u1 = embedded_proportions[k] > 0; int32_t u2 = embedded_proportions[k + 1] > 0; int32_t u3 = embedded_proportions[k + 2] > 0; int32_t u4 = embedded_proportions[k + 3] > 0; int32_t v1 = embedded_proportions[k + stripe + 1] > 0; int32_t v2 = embedded_proportions[k + stripe + 2] > 0; int32_t v3 = embedded_proportions[k + stripe + 3] > 0; int32_t v4 = embedded_proportions[k + stripe + 4] > 0; dm_stripe[k] += (u1 ^ v1) * length; dm_stripe[k + 1] += (u2 ^ v2) * length; dm_stripe[k + 2] += (u3 ^ v3) * length; dm_stripe[k + 3] += (u4 ^ v4) * length; dm_stripe_total[k] += (u1 | v1) * length; dm_stripe_total[k + 1] += (u2 | v2) * length; dm_stripe_total[k + 2] += (u3 | v3) * length; dm_stripe_total[k + 3] += (u4 | v4) * length; } if((task_p->n_samples % 4) != 0) { for(unsigned int k = task_p->n_samples - (task_p->n_samples % 4); k < task_p->n_samples; k++) { int32_t u = embedded_proportions[k] > 0; int32_t v = embedded_proportions[k + stripe + 1] > 0; dm_stripe[k] += (u ^ v) * length; dm_stripe_total[k] += (u | v) * length; } } } } void su::_vaw_unweighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p) { double *dm_stripe; double *dm_stripe_total; for(unsigned int stripe = task_p->start; stripe < task_p->stop; stripe++) { dm_stripe = dm_stripes[stripe]; dm_stripe_total = dm_stripes_total[stripe]; for(unsigned int j = 0; j < task_p->n_samples; j++) { int32_t u = embedded_proportions[j] > 0; int32_t v = embedded_proportions[j + stripe + 1] > 0; double m = sample_total_counts[j] + sample_total_counts[j + stripe + 1]; double mi = embedded_counts[j] + embedded_counts[j + stripe + 1]; double vaw = sqrt(mi * (m - mi)); if(vaw > 0) { dm_stripe[j] += ((u ^ v) * length) / vaw; dm_stripe_total[j] += ((u | v) * length) / vaw; } } } } unifrac-0.10.0/sucpp/unifrac_task.hpp000066400000000000000000000137251351072301000175330ustar00rootroot00000000000000#include "task_parameters.hpp" #include #include #include namespace su { /* void su::unifrac tasks * * all methods utilize the same function signature. that signature is as follows: * * dm_stripes vector the stripes of the distance matrix being accumulated * into for unique branch length * dm_stripes vector the stripes of the distance matrix being accumulated * into for total branch length (e.g., to normalize unweighted unifrac) * embedded_proportions the proportions vector for a sample, or rather * the counts vector normalized to 1. this vector is embedded as it is * duplicated: if A, B and C are proportions for features A, B, and C, the * vector will look like [A B C A B C]. * length the branch length of the current node to its parent. * task_p task specific parameters. 
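 *
 * note on how the stripes are consumed: each task accumulates the
 * numerator of its metric into dm_stripes and, for the normalized,
 * unweighted and generalized variants, the matching denominator into
 * dm_stripes_total; the unnormalized weighted task writes dm_stripes
 * only. element k of stripe s corresponds to the sample pair
 * (k, (k + s + 1) mod n_samples), which is why the proportions (and
 * counts) vectors are embedded twice. once every branch has been
 * processed, the pairwise distances are formed from these two
 * accumulators (a ratio for the normalized variants).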
*/ void _unnormalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const su::task_parameters* task_p); void _normalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const su::task_parameters* task_p); void _unweighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const su::task_parameters* task_p); void _generalized_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double length, const su::task_parameters* task_p); /* void su::unifrac_vaw tasks * * all methods utilize the same function signature. that signature is as follows: * * dm_stripes vector the stripes of the distance matrix being accumulated * into for unique branch length * dm_stripes vector the stripes of the distance matrix being accumulated * into for total branch length (e.g., to normalize unweighted unifrac) * embedded_proportions the proportions vector for a sample, or rather * the counts vector normalized to 1. this vector is embedded as it is * duplicated: if A, B and C are proportions for features A, B, and C, the * vector will look like [A B C A B C]. * embedded_counts the counts vector embedded in the same way and order as * embedded_proportions. the values of this array are unnormalized feature * counts for the subtree. * sample_total_counts the total unnormalized feature counts for all samples * embedded in the same way and order as embedded_proportions. * length the branch length of the current node to its parent. * task_p task specific parameters. */ void _vaw_unnormalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p); void _vaw_normalized_weighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p); void _vaw_unweighted_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p); void _vaw_generalized_unifrac_task(std::vector &__restrict__ dm_stripes, std::vector &__restrict__ dm_stripes_total, double* __restrict__ embedded_proportions, double* __restrict__ embedded_counts, double* __restrict__ sample_total_counts, double length, const su::task_parameters* task_p); } unifrac-0.10.0/unifrac/000077500000000000000000000000001351072301000146365ustar00rootroot00000000000000unifrac-0.10.0/unifrac/__init__.py000066400000000000000000000014361351072301000167530ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- import pkg_resources from unifrac._methods import (unweighted, weighted_normalized, weighted_unnormalized, generalized, meta) from unifrac._api import ssu, faith_pd __version__ = pkg_resources.get_distribution('unifrac').version __all__ = ['unweighted', 'weighted_normalized', 'weighted_unnormalized', 'generalized', 'meta', 'ssu', 'faith_pd'] unifrac-0.10.0/unifrac/_api.pxd000066400000000000000000000017451351072301000162720ustar00rootroot00000000000000#distutils: language = c++ from libcpp cimport bool cdef extern from "../sucpp/api.hpp": struct mat: double* condensed_form unsigned int n_samples unsigned int cf_size char** sample_ids struct results_vec: unsigned int n_samples double* values char** sample_ids enum compute_status: okay, tree_missing, table_missing, table_empty, unknown_method, table_and_tree_do_not_overlap compute_status one_off(const char* biom_filename, const char* tree_filename, const char* unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads, mat** result) compute_status faith_pd_one_off(const char* biom_filename, const char* tree_filename, results_vec** result) void destroy_mat(mat** result) void destroy_results_vec(results_vec** result) unifrac-0.10.0/unifrac/_api.pyx000066400000000000000000000123501351072301000163110ustar00rootroot00000000000000import skbio import numpy as np cimport numpy as np import pandas as pd def ssu(str biom_filename, str tree_filename, str unifrac_method, bool variance_adjust, double alpha, bool bypass_tips, unsigned int threads): """Execute a call to Strided State UniFrac via the direct API Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree unifrac_method : str The requested UniFrac method, one of {unweighted, weighted_normalized, weighted_unnormalized, generalized} variance_adjust : bool Whether to perform Variance Adjusted UniFrac alpha : float The value of alpha for Generalized UniFrac; only applies to Generalized UniFraca bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. threads : int The number of threads to use. Returns ------- skbio.DistanceMatrix The resulting distance matrix Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table is empty If the table is not completely represented by the phylogeny If an unknown method is requested. 
Exception If an unkown error is experienced """ cdef: mat *result; compute_status status; np.ndarray[np.double_t, ndim=1] numpy_arr double *cf int i bytes biom_py_bytes bytes tree_py_bytes bytes met_py_bytes char* biom_c_string char* tree_c_string char* met_c_string list ids biom_py_bytes = biom_filename.encode() tree_py_bytes = tree_filename.encode() met_py_bytes = unifrac_method.encode() biom_c_string = biom_py_bytes tree_c_string = tree_py_bytes met_c_string = met_py_bytes status = one_off(biom_c_string, tree_c_string, met_c_string, variance_adjust, alpha, bypass_tips, threads, &result) if status != okay: if status == tree_missing: raise IOError("Tree file not found.") elif status == table_missing: raise IOError("Table file not found.") elif status == table_empty: raise ValueError("Table file is empty.") elif status == table_and_tree_do_not_overlap: raise ValueError("The table does not appear to be completely " "represented by the phylogeny.") elif status == unknown_method: raise ValueError("Unknown method.") else: raise Exception("Unknown Error: {}".format(status)) ids = [] numpy_arr = np.zeros(result.cf_size, dtype=np.double) numpy_arr[:] = result.condensed_form for i in range(result.n_samples): ids.append(result.sample_ids[i].decode('utf-8')) destroy_mat(&result) return skbio.DistanceMatrix(numpy_arr, ids) def faith_pd(str biom_filename, str tree_filename): """Execute a call to the Stacked Faith API in the UniFrac package Parameters ---------- biom_filename : str A filepath to a BIOM 2.1 formatted table (HDF5) tree_filename : str A filepath to a Newick formatted tree Returns ------- pd.Series Series of Faith's PD for each sample in `biom_filename` Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table is empty If the table is not completely represented by the phylogeny Exception If an unkown error is experienced """ cdef: results_vec *result; compute_status status; np.ndarray[np.double_t, ndim=1] numpy_arr bytes biom_py_bytes bytes tree_py_bytes char* biom_c_string char* tree_c_string list ids biom_py_bytes = biom_filename.encode() tree_py_bytes = tree_filename.encode() biom_c_string = biom_py_bytes tree_c_string = tree_py_bytes status = faith_pd_one_off(biom_c_string, tree_c_string, &result) if status != okay: if status == tree_missing: raise IOError("Tree file not found.") elif status == table_missing: raise IOError("Table file not found.") elif status == table_empty: raise ValueError("Table file is empty.") elif status == table_and_tree_do_not_overlap: raise ValueError("The table does not appear to be completely " "represented by the phylogeny.") else: raise Exception("Unknown Error: {}".format(status)) numpy_arr = np.zeros(result.n_samples, dtype=np.double) numpy_arr[:] = result.values ids = [] for i in range(result.n_samples): ids.append(result.sample_ids[i].decode('utf-8')) faith_pd_series = pd.Series(numpy_arr, index=ids) faith_pd_series.rename("faith_pd", inplace=True) destroy_results_vec(&result) return faith_pd_series unifrac-0.10.0/unifrac/_meta.py000066400000000000000000000073601351072301000163030ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- # Code pulled from cogent.maths.unifrac.fast_unifrac; the authors have # previously indicated approval for converstion from GPL -> BSD # https://github.com/biocore/scikit-bio#the-pre-history-of-scikit-bio # These methods did not have unit tests in cogent import numpy as np def consolidate_skipping_missing_matrices(matrices, env_names, weights, all_env_names): """Consolidates matrices, skipping any that are missing envs""" weight_sum = 0 result = np.zeros((len(all_env_names), len(all_env_names)), float) for m, e, w in zip(matrices, env_names, weights): if e == all_env_names: # note -- assumes sorted result += m * w weight_sum += w # readjust weights for missing matrices result /= weight_sum return result def consolidate_missing_zero(matrices, env_names, weights, all_env_names): """Consolidates matrices, setting missing values to 0 distance""" result = np.zeros((len(all_env_names), len(all_env_names)), float) for m, e, w in zip(matrices, env_names, weights): result += reshape_by_name(m, e, all_env_names, 0) * w return result def consolidate_missing_one(matrices, env_names, weights, all_env_names): """Consolidates matrices, setting missing values to 1 distance""" result = np.zeros((len(all_env_names), len(all_env_names)), float) for m, e, w in zip(matrices, env_names, weights): result += reshape_by_name(m, e, all_env_names, 1) * w return result def consolidate_skipping_missing_values(matrices, env_names, weights, all_env_names): """Consolidates matrices, skipping only values from missing envs""" result = [] for m, e, w in zip(matrices, env_names, weights): reshaped = reshape_by_name(m, e, all_env_names, masked=True) reshaped *= w result.append(reshaped) data = np.array([i.data for i in result], float) masks = np.array([i.mask for i in result], bool) masked_result = np.ma.array(data, mask=masks) # figure out mask of weights so we can figure out per-element weighting masked_weights = np.ma.array(np.zeros(data.shape), mask=masks) + \ np.array(weights, float).reshape((len(weights), 1, 1)) return masked_result.sum(0) / masked_weights.sum(0) def reshape_by_name(m, old_names, new_names, default_off_diag=0, default_diag=0, masked=False): """Reshape matrix m mapping slots from old names to new names. """ num_names = len(new_names) result = np.zeros((num_names, num_names), float) + default_off_diag for i in range(num_names): result[i, i] = default_diag pairs = {} for i, n in enumerate(old_names): if n in new_names: pairs[i] = new_names.index(n) for i, row in enumerate(m): new_i = pairs[i] for j, val in enumerate(row): new_j = pairs[j] result[new_i, new_j] = val if masked: mask = np.ones((num_names, num_names), float) for i in pairs.values(): for j in pairs.values(): mask[i, j] = 0 result = np.ma.array(result, mask=mask) return result CONSOLIDATIONS = \ {'skipping_missing_matrices': consolidate_skipping_missing_matrices, 'missing_zero': consolidate_missing_zero, 'missing_one': consolidate_missing_one, 'skipping_missing_values': consolidate_skipping_missing_values} unifrac-0.10.0/unifrac/_methods.py000066400000000000000000000360701351072301000170200ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- from warnings import warn from functools import reduce from operator import or_ import numpy as np import skbio import unifrac as qsu from unifrac._meta import CONSOLIDATIONS def is_biom_v210(f): import h5py if not h5py.is_hdf5(f): return False with h5py.File(f, 'r') as fp: if 'format-version' not in fp.attrs: return False version = fp.attrs.get('format-version', None) if version is None: return False if tuple(version) != (2, 1): return False return True def is_newick(f): sniffer = skbio.io.format.newick.newick.sniffer_function return sniffer(f)[0] def _validate(table, phylogeny): if not is_biom_v210(table): raise ValueError("Table does not appear to be a BIOM-Format v2.1") if not is_newick(phylogeny): raise ValueError("The phylogeny does not appear to be newick") def unweighted(table: str, phylogeny: str, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False) -> skbio.DistanceMatrix: """Compute Unweighted UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default of 1. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Unweighted UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, and while its application to Unweighted UniFrac was not described, factoring in the variance adjustment is still feasible and so it is exposed. References ---------- .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 (2005). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ _validate(table, phylogeny) return qsu.ssu(table, phylogeny, 'unweighted', variance_adjusted, 1.0, bypass_tips, threads) def weighted_normalized(table: str, phylogeny: str, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False) -> skbio.DistanceMatrix: """Compute weighted normalized UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default of 1. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. 
Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return qsu.ssu(str(table), str(phylogeny), 'weighted_normalized', variance_adjusted, 1.0, bypass_tips, threads) def weighted_unnormalized(table: str, phylogeny: str, threads: int = 1, variance_adjusted: bool = False, bypass_tips: bool = False) -> skbio.DistanceMatrix: # noqa """Compute weighted unnormalized UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default is 1. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Weighted UniFrac was originally described in [1]_. Variance Adjusted Weighted UniFrac was originally described in [2]_. References ---------- .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative and qualitative beta diversity measures lead to different insights into factors that structure microbial communities. Appl. Environ. Microbiol. 73, 1576-1585 (2007). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ return qsu.ssu(str(table), str(phylogeny), 'weighted_unnormalized', variance_adjusted, 1.0, bypass_tips, threads) def generalized(table: str, phylogeny: str, threads: int = 1, alpha: float = 1.0, variance_adjusted: bool = False, bypass_tips: bool = False) -> skbio.DistanceMatrix: """Compute Generalized UniFrac Parameters ---------- table : str A filepath to a BIOM-Format 2.1 file. phylogeny : str A filepath to a Newick formatted tree. threads : int, optional The number of threads to use. Default is 1 alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0. variance_adjusted : bool, optional Adjust for varianace or not. Default is False. bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- Generalized UniFrac was originally described in [1]_. Variance Adjusted UniFrac was originally described in [2]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. An alpha of 1.0 is Weighted normalized UniFrac. 
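    Concretely, in the notation of [1]_, the quantity computed for a pair of
    samples A and B is sum_i(b_i * (p_iA + p_iB)**alpha * |p_iA - p_iB| /
    (p_iA + p_iB)) divided by sum_i(b_i * (p_iA + p_iB)**alpha), where b_i is
    the length of branch i and p_iA, p_iB are the proportions of samples A
    and B descending from that branch.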
An alpha of 0.0 is approximately Unweighted UniFrac, and is if the proportions are dichotomized. References ---------- .. [1] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [2] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if alpha == 1.0: warn("alpha of 1.0 is weighted-normalized UniFrac. " "Weighted-normalized is being used instead as it is more " "optimized.", Warning) return weighted_normalized(table, phylogeny, threads, variance_adjusted) else: return qsu.ssu(str(table), str(phylogeny), 'generalized', variance_adjusted, alpha, bypass_tips, threads) METHODS = {'unweighted': unweighted, 'weighted_normalized': weighted_normalized, 'weighted_unnormalized': weighted_unnormalized, 'generalized': generalized} def meta(tables: tuple, phylogenies: tuple, weights: tuple = None, consolidation: str = None, method: str = None, threads: int = 1, variance_adjusted: bool = False, alpha: float = None, bypass_tips: bool = False) -> \ skbio.DistanceMatrix: """Compute meta UniFrac Parameters ---------- tables : tuple of str Filepaths to a BIOM-Format 2.1 files. This tuple is expected to be in index order with phylogenies. phylogenies : tuple of str Filepaths to a Newick formatted trees. This tuple is expected to be in index order with tables. weights : tuple of float, optional The weight applied to each tree/table pair. This tuple is expected to be in index order with tables and phylogenies. Default is to weight each tree/table pair evenly. consolidation : str, optional The matrix consolidation method. The available choices are: 'skipping_missing_matrices', 'missing_zero', 'missing_one', 'skipping_missing_values'. The default is 'skipping_missing_values'. method : str The UniFrac method to use. The available choices are: 'unweighted', 'weighted_unnormalized', 'weighted_normalized', and 'generalized'. threads : int, optional The number of threads to use. Default is 1 bypass_tips : bool Bypass the tips of the tree in the computation. This reduces compute by about 50%, but is an approximation. alpha : float, optional The level of contribution of high abundance branches. Higher alpha increases the contribution of from high abundance branches while lower alpha reduces the contribution. Alpha was originally defined over the range [0, 1]. Default is 1.0 variance_adjusted : bool, optional Adjust for varianace or not. Default is False. Returns ------- skbio.DistanceMatrix The resulting distance matrix. Raises ------ IOError If the tree file is not found If the table is not found ValueError If the table does not appear to be BIOM-Format v2.1. If the phylogeny does not appear to be in Newick format. Notes ----- UniFrac can be adapted to account for multiple genes, as originally done in [1]_. Generalized UniFrac was originally described in [2]_. Variance Adjusted UniFrac was originally described in [3]_, but was not described in as applied to Generalized UniFrac. It is feasible to do, so it is exposed here. References ---------- .. [1] Lozupone C. A., Hamady M., Cantarel B. L., Coutinho P. M., Henrissat B., Gordon J. I. & Knight R. The convergence of carbohydrate active gene repertoires in human gut microbes. PNAS 105(39):15076-81 (2008). .. 
[2] Chen, J., Bittinger, K., Charlson, E. S., Hoffmann C., Lewis, J., Wu, G. D., Collman R. G., Bushman, F. D. & Hongzhe L. Associating microbiome composition with environmental covariates using generalized UniFrac distances. Bioinformatics 28(16), 2106–2113 (2012). .. [3] Chang, Q., Luan, Y. & Sun, F. Variance adjusted weighted UniFrac: a powerful beta diversity measure for comparing communities based on phylogeny. BMC Bioinformatics 12:118 (2011). """ if not len(tables): raise ValueError("No tables specified.") if not len(phylogenies): raise ValueError("No trees specified.") if len(tables) != len(phylogenies): raise ValueError("Number of trees and tables must be the same.") if weights is None: weights = tuple(1 for _ in phylogenies) else: if len(weights) != len(phylogenies): raise ValueError("Number of weights does not match number of " "trees and tables.") if method is None: raise ValueError("No method specified.") method_ = METHODS.get(method.replace('-', '_')) if method_ is None: raise ValueError("Method (%s) unrecognized. Available methods are: %s" % (method, ', '.join(METHODS.keys()))) if consolidation is None: consolidation = 'skipping_missing_values' consolidation_ = CONSOLIDATIONS.get(consolidation.replace('-', '_')) if consolidation_ is None: raise ValueError("Consolidation (%s) unrecognized. Available " "consolidations are: %s" % (consolidation, ', '.join(CONSOLIDATIONS.keys()))) if alpha is not None and method is not generalized: raise ValueError("The alpha parameter can only be set when the method " "is set as 'generalized', the selected method is " "'%s'." % method) kwargs = {'threads': threads, 'bypass_tips': bypass_tips, 'variance_adjusted': variance_adjusted} if alpha is not None: kwargs['alpha'] = alpha weights = np.array(weights, float)/sum(weights) dms = [method_(table, tree, **kwargs) for table, tree in zip(tables, phylogenies)] all_ids = sorted(reduce(or_, [set(dm.ids) for dm in dms])) dm = consolidation_(dms, [dm.ids for dm in dms], weights, all_ids) return skbio.DistanceMatrix(dm, ids=all_ids) unifrac-0.10.0/unifrac/tests/000077500000000000000000000000001351072301000160005ustar00rootroot00000000000000unifrac-0.10.0/unifrac/tests/__init__.py000066400000000000000000000005351351072301000201140ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, UniFrac development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- unifrac-0.10.0/unifrac/tests/data/000077500000000000000000000000001351072301000167115ustar00rootroot00000000000000unifrac-0.10.0/unifrac/tests/data/crawford.biom000066400000000000000000004137411351072301000214020ustar00rootroot00000000000000HDF  `  `TREE`HEAPX observationsample8  @id GCOL No Table IDhttp://biom-format.orgQIIME 1.9.0-rc1, master@2f448392015-01-06T00:37:05.179285199181271766166099 187644 233817 229459 199698 259434260756260205260753336145275819261334194787178735170950260058179063266595312476179069196138275869194978276044 4396877!829401"179719#276260$275563%3392842&259212'270984(262409)4484382*185743+302407,258969-184151.215193/4468234097294125926622592633184567418114152278866174791726901982606539166911:260655;443945<732128=318563>272454?192971@195385A261177B229386C204144D267041E301870F267123G239571H349142I199403J208571K334365L260397M403497N267388O274106P274844Q288931R275136S267411T4338733U4403349V314810W3621189X182033Y1105328Z180105[195005\275627]187233^343581_291750`301012a269902b276985c274438d259228e303479f174754g263908h380534i4397402j335952k1108453l258725m351881n320490o264496p194662q191772r187790s174056t180972u340189v183390w422727x263546y187989z259910{303652|261419}4449524~11518632206226957626066313595626768916299127402144144201571092234121267457267452199534175573259593193463276172197318181419441494259859275707187078270519447113526394627640429358018780726336218321143463742591751816032703851885364407703191398176118167078833390 @type H format-url H format-version@ H generated-by Hcreation-date @ shape@m 0 nnz@-TREEhHEAPX8matrixidsmetadatagroup-metadata SNOD/TREE&HEAPX dataindicesindptr8SNOD(/B-- ?@4 4 deflate-TXTREEx8(-SNOD80x9x^͕=n0 =f!CSir ]^>G4IQҲ<7) 9GdI/J_'G' L$}ʣz('k]u|c?ԓڧGT%靽Jx⹹yVs֩8W}NNM3~OAw޷*S\_DN?ŵiG2'S{}k^ZgY2.O:w-*?߼]NOW''x^m j1 DΞ8I?(ݲJ3u]o_qcz֟wgK߷{._v>jN37b;Y!.;'F?٪OKN>7: ۯIygsXlRwf.]-'io;7}"f}'0._j[j˃Ͼs"8fG ѳ<8Lnjv-0ӉowݗE87w9]O͘mg'7/s=CYOz3ٖ/; |jM|mF8˶n;C< ; X_r;bhr`S[]~7/_1|^Sc7!N<})>jx^svjj61LOL9y&N8q><ANp3u=޿@ %HC:@F2B(ad%;DHE~ P"%┥H%PԢ.OCbhD҂6H':Ӆt;q}G0A e#(F3c<d2d\1,b1KYN<+X*Vc=6"}9ap9YΑy.q\#6w='<9/x+^󆷼go|'RiIO0H&B(ad%D\&|;?/~O4)IEIKz2B( 'M$yK>(@Q)FqJPc-sTREE)I-  deflateH1-T`TREE)-n  deflate:nT`TREE6W+nm deflateCmTXTREEkmGCOL 17380726681616990118299526316544424593104901684221 697874 321484 2120775 268923 17543217033526038719721618652119046044185861865264374042436341176039177427191483343906351794553395178779268755271378 
231169!259335"1107945#450047$269359%4331760&182016'275470(270391)187703*191816+353782,163862-319909.307595/2751500272953126999221768863195445434609852750786828435719188783330539167204:259056;233313<164308=263705>178659?174272@310748A132114B336214C550807D262869E178031F173417G316842H376397I190242J182621K174959L259372M175416N847228O461524P350381Q259012R327236S318370T265106U214471V4372578W4127460X274597Y314963Z262166[4417539\100344]170555^261511_273515`177205a839215b4462541c261409d274521e180919f191958g216403h264373i185754j262766k176850l176858m268121n4364243o272812p185222q262399r178926s199307t265641u45363v351859w179181x179188y265828z292745{269378|837473}130335~2598881967774528233434202575787210950194822266483260828348398320635523102126267725924919777518120519619426345227039626310644801763445272753396871851810911871332704912137001268416177802190273191077331965180362258522169398273084326588927066227144943651092766631864972640212647872112006263044311755623341119682533733135495727401818310626160619584026594018124921491926578617466318577717270527442225960931117426858111364434329571330296181344276531180206258250827195206494276580197790f__Lachnospiraceaeg__s__ k__Bacteriap__Deferribacteresc__Deferribactereso__Deferribacteralesf__Deferribacteraceaeg__Mucispirillum s__schaedleri k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Bacteroidaceaeg__Bacteroidess__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes  c__Clostridia o__Clostridiales f__ g__ s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridiales f__Lachnospiraceae!g__"s__# k__Bacteria$ p__Firmicutes% c__Clostridia&o__Clostridiales'f__(g__)s__* k__Bacteria+p__Bacteroidetes,c__Bacteroidia-o__Bacteroidales.f__Bacteroidaceae/g__Bacteroides0s__1 k__Bacteria2 p__Firmicutes3 c__Clostridia4o__Clostridiales5f__Lachnospiraceae6g__7s__x^]eGJE"Ive˚b)[(K%% ٦E5#.ȏ9s9{.3z)kϥ[|EqkSyeO|J>-/|IF-KF彩~AP*_?h~D1|B~G^ +/>nˋ%Ry\^!Wɕ|yZ^#ץkAX7ʟțT~4*o?_;R~_jS%%ky3Bn)[mS2}ѧ '\!'Lp)So1} 'L0} 'L0} 'L0} 'L>a 'afr'L0} 'L0}ѧS)w)>aO>SnO>aѧ ϐOxLp)>s>O}b_˘}/ؗs/ce̾ٗ12f_8þٗ12c_;deryRy=2f_˸Zf_˘}/cerne̾ٗ12}9ٗq˹ٗ12f_˘}s s k4oob&9f9Z,,sĜ7q79obΛ&漉9obΛ&f7q7*hx^=1 F܂BTH`}Ldn.so\;2_F<ź^~srqKα0e~";gt}}.   
deflate) T`x^c``` > , @\ {210NE4  deflateY  TXx^a``Qj@WA+rh|y4_HG HEAPXttaxonomy@m deflateuTPTREEG GGCOL k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Bacteroidetes c__Bacteroidia o__Bacteroidales f__Rikenellaceae g__Alistipess__indistinctus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidia o__Bacteroidales!f__Rikenellaceae"g__#s__$ k__Bacteria% p__Firmicutes& c__Clostridia'o__Clostridiales(f__Lachnospiraceae)g__[Ruminococcus]* s__gnavus+ k__Bacteria, p__Firmicutes- c__Clostridia.o__Clostridiales/f__Ruminococcaceae0g__Oscillospira1s__2 k__Bacteria3p__Bacteroidetes4c__Bacteroidia5o__Bacteroidales6f__Bacteroidaceae7g__Bacteroides8s__9 k__Bacteria:p__Bacteroidetes;c__Bacteroidia<o__Bacteroidales=f__S24-7>g__?s__@ k__BacteriaA p__FirmicutesB c__ClostridiaCo__ClostridialesDf__Eg__Fs__G k__BacteriaH p__FirmicutesI c__ClostridiaJo__ClostridialesKf__Lg__Ms__N k__BacteriaO p__FirmicutesP c__ClostridiaQo__ClostridialesRf__Sg__Ts__U k__BacteriaV p__FirmicutesW c__ClostridiaXo__ClostridialesYf__RuminococcaceaeZg__Oscillospira[s__\ k__Bacteria] p__Firmicutes^ c__Clostridia_o__Clostridiales`f__ag__bs__c k__Bacteriadp__Bacteroidetesec__Bacteroidiafo__Bacteroidalesgf__S24-7hg__is__j k__Bacteriak p__Firmicutesl c__Clostridiamo__Clostridialesnf__og__ps__q k__Bacteriar p__Firmicutess c__Clostridiato__Clostridialesuf__Lachnospiraceaevg__ws__x k__Bacteriay p__Firmicutesz c__Clostridia{o__Clostridiales|f__}g__~s__ k__Bacteria p__Firmicutesc__Erysipelotrichio__Erysipelotrichalesf__Erysipelotrichaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Peptococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Clostridiaceaeg__Clostridiums__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Rikenellaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Peptococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceae g__Ruminococcus s__  k__Bacteria  p__Firmicutes  c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__  k__Bacteria! 
p__Firmicutes" c__Clostridia#o__Clostridiales$f__%g__&s__' k__Bacteria(p__Bacteroidetes)c__Bacteroidia*o__Bacteroidales+f__Porphyromonadaceae,g__Parabacteroides-s__. k__Bacteria/p__Actinobacteria0c__Coriobacteriia1o__Coriobacteriales2f__Coriobacteriaceae3g__4s__5 k__Bacteria6 p__Firmicutes7 c__Clostridia8o__Clostridiales9f__:g__;s__< k__Bacteria=p__Bacteroidetes>c__Bacteroidia?o__Bacteroidales@f__S24-7Ag__Bs__C k__BacteriaD p__FirmicutesE c__ClostridiaFo__ClostridialesGf__Hg__Is__J k__BacteriaK p__FirmicutesL c__ClostridiaMo__ClostridialesNf__RuminococcaceaeOg__Ps__Q k__BacteriaRp__ActinobacteriaSc__CoriobacteriiaTo__CoriobacterialesUf__CoriobacteriaceaeVg__AdlercreutziaWs__X k__BacteriaY p__FirmicutesZ c__Clostridia[o__Clostridiales\f__]g__^s___ k__Bacteria`p__Bacteroidetesac__Bacteroidiabo__Bacteroidalescf__Bacteroidaceaedg__Bacteroidese s__fragilisf k__Bacteriagp__Bacteroideteshc__Bacteroidiaio__Bacteroidalesjf__Rikenellaceaekg__ls__m k__Bacterian p__Firmicuteso c__Clostridiapo__Clostridialesqf__rg__ss__t k__Bacteriau p__Firmicutesv c__Clostridiawo__Clostridialesxf__yg__zs__{ k__Bacteria|p__Bacteroidetes}c__Bacteroidia~o__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteriap__Proteobacteriac__Epsilonproteobacteriao__Campylobacteralesf__Helicobacteraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes  c__Clostridia o__Clostridiales f__Lachnospiraceae g__ s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Clostridiaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidia o__Bacteroidales!f__S24-7"g__#s__$ k__Bacteria% p__Firmicutes& c__Clostridia'o__Clostridiales(f__)g__*s__+ k__Bacteria, p__Firmicutes- c__Clostridia.o__Clostridiales/f__Ruminococcaceae0g__1s__2 k__Bacteria3 p__Firmicutes4 c__Clostridia5o__Clostridiales6f__7g__8s__9 k__Bacteria: p__Firmicutes;c__Erysipelotrichi<o__Erysipelotrichales=f__Erysipelotrichaceae>g__Clostridium? 
s__cocleatum@ k__BacteriaA p__FirmicutesB c__ClostridiaCo__ClostridialesDf__Eg__Fs__G k__BacteriaH p__FirmicutesI c__ClostridiaJo__ClostridialesKf__Lg__Ms__N k__BacteriaO p__FirmicutesP c__ClostridiaQo__ClostridialesRf__Sg__Ts__U k__BacteriaV p__FirmicutesW c__ClostridiaXo__ClostridialesYf__Zg__[s__\ k__Bacteria] p__Firmicutes^ c__Clostridia_o__Clostridiales`f__Lachnospiraceaeag__bs__c k__Bacteriad p__Firmicutese c__Clostridiafo__Clostridialesgf__Ruminococcaceaehg__Ruminococcusis__j k__Bacteriak p__Firmicutesl c__Clostridiamo__Clostridialesnf__og__ps__q k__Bacteriar p__Firmicutess c__Clostridiato__Clostridialesuf__vg__ws__x k__Bacteriay p__Firmicutesz c__Clostridia{o__Clostridiales|f__}g__~s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Epulopisciums__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Peptococcaceaeg__rc4-4s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__ g__ s__  k__Bacteria  p__Firmicutes  c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Bacilli o__Bacillalesf__Staphylococcaceaeg__Staphylococcuss__  k__Bacteria! p__Firmicutes" c__Clostridia#o__Clostridiales$f__%g__&s__' k__Bacteria( p__Firmicutes) c__Clostridia*o__Clostridiales+f__Ruminococcaceae,g__Ruminococcus-s__. 
k__Bacteria/ p__Firmicutes0 c__Clostridia1o__Clostridiales2f__3g__4s__5 k__Bacteria6p__Bacteroidetes7c__Bacteroidia8o__Bacteroidales9f__Bacteroidaceae:g__Bacteroides;s__< k__Bacteria= p__Firmicutes> c__Clostridia?o__Clostridiales@f__LachnospiraceaeAg__[Ruminococcus]B s__gnavusC k__BacteriaD p__FirmicutesE c__ClostridiaFo__ClostridialesGf__Hg__Is__J k__BacteriaK p__FirmicutesL c__ClostridiaMo__ClostridialesNf__LachnospiraceaeOg__Ps__Q k__BacteriaR p__FirmicutesS c__ClostridiaTo__ClostridialesU k__BacteriaV p__FirmicutesW c__ClostridiaXo__ClostridialesYf__LachnospiraceaeZg__[s__\ k__Bacteria] p__Firmicutes^ c__Clostridia_o__Clostridiales`f__ag__bs__c k__Bacteriad p__Firmicutese c__Clostridiafo__Clostridialesgf__Lachnospiraceaehg__is__j k__Bacteriakp__Bacteroideteslc__Bacteroidiamo__Bacteroidalesnf__Rikenellaceaeog__ps__q k__Bacteriar p__Firmicutess c__Clostridiato__Clostridialesuf__vg__ws__x k__Bacteriay p__Firmicutesz c__Clostridia{o__Clostridiales|f__Lachnospiraceae}g__~s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__[Odoribacteraceae]g__Odoribacters__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__[Odoribacteraceae]g__Butyricimonass__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__f__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteria p__Firmicutes c__Clostridia o__Clostridiales f__ g__ s__  k__Bacteria p__Firmicutes c__Bacillio__Turicibacteralesf__Turicibacteraceaeg__Turicibacters__ k__Bacteria p__Firmicutesc__Erysipelotrichio__Erysipelotrichalesf__Erysipelotrichaceaeg__Allobaculums__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7 g__!s__" k__Bacteria# p__Firmicutes$ c__Clostridia%o__Clostridiales&f__Lachnospiraceae'g__(s__) k__Bacteria* p__Firmicutes+ c__Clostridia,o__Clostridiales-f__.g__/s__0 k__Bacteria1 p__Firmicutes2 c__Clostridia3o__Clostridiales4f__Peptostreptococcaceae5g__6s__7 k__Bacteria8p__Bacteroidetes9c__Bacteroidia:o__Bacteroidales;f__S24-7<g__=s__> k__Bacteria? 
p__Firmicutes@ c__ClostridiaAo__ClostridialesBf__Cg__Ds__Es__F k__BacteriaG p__FirmicutesH c__ClostridiaIo__ClostridialesJf__Kg__Ls__M k__BacteriaNp__DeferribacteresOc__DeferribacteresPo__DeferribacteralesQf__DeferribacteraceaeRg__MucispirillumS s__schaedleriT k__BacteriaUp__BacteroidetesVc__BacteroidiaWo__BacteroidalesXf__PorphyromonadaceaeYg__ParabacteroidesZs__[ k__Bacteria\ p__Firmicutes] c__Clostridia^o__Clostridiales_f__`g__as__b k__Bacteriac p__Firmicutesd c__Clostridiaeo__Clostridialesff__Lachnospiraceaegg__hs__i k__Bacteriaj p__Firmicutesk c__Clostridialo__Clostridialesm k__Bacterian p__Firmicuteso c__Clostridiapo__Clostridialesqf__Lachnospiraceaerg__ss__t k__Bacteriau p__Firmicutesv c__Clostridiawo__Clostridialesxf__yg__zs__{ k__Bacteria| p__Firmicutes} c__Clostridia~o__Clostridialesf__g__f__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialeso__Clostridialesf__g__ c__Clostridia p__Firmicutes k__Bacteria k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridia o__Clostridiales f__Lachnospiraceae g__[Ruminococcus]  s__gnavus  k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__ g__!s__" k__Bacteria# p__Firmicutes$ c__Clostridia%o__Clostridiales&f__Lachnospiraceae'g__(s__) k__Bacteria* p__Firmicutes+ c__Clostridia,o__Clostridiales-f__.g__/s__0 k__Bacteria1 p__Firmicutes2 c__Clostridia3o__Clostridiales4f__5g__6s__7 k__Bacteria8 p__Firmicutes9 c__Clostridia:o__Clostridiales;f__Ruminococcaceae<g__Oscillospira=s__> k__Bacteria? 
p__Firmicutes@ c__ClostridiaAo__ClostridialesBf__Cg__Ds__E k__BacteriaF p__FirmicutesG c__ClostridiaHo__ClostridialesIf__Jg__Ks__L k__BacteriaM p__FirmicutesN c__BacilliOo__LactobacillalesPf__StreptococcaceaeQg__StreptococcusRs__S k__BacteriaT p__FirmicutesU c__ClostridiaVo__ClostridialesWf__LachnospiraceaeXg__Ys__Z k__Bacteria[p__Bacteroidetes\c__Bacteroidia]o__Bacteroidales^f__Prevotellaceae_ g__Prevotella`s__a k__Bacteriab p__Firmicutesc c__Bacillid o__Bacillalesef__Staphylococcaceaefg__Staphylococcusg s__sciurih k__Bacteriai p__Firmicutesj c__Clostridiako__Clostridialeslf__Lachnospiraceaemg__ns__o k__Bacteriap p__Firmicutesq c__Clostridiaro__Clostridialessf__Lachnospiraceaetg__Coprococcusus__v k__Bacteriaw p__Firmicutesxc__Erysipelotrichiyo__Erysipelotrichaleszf__Erysipelotrichaceae{g__Allobaculum|s__} k__Bacteria~ p__Firmicutes c__Bacillio__Lactobacillalesf__Lactobacillaceaeg__Lactobacilluss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Bacteroidaceaeg__Bacteroides s__eggerthii k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__Ruminococcuss__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Rikenellaceaeg__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__Coprococcuss__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__Rikenellaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Ruminococcaceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__s__ k__Bacteriap__Bacteroidetesc__Bacteroidiao__Bacteroidalesf__S24-7g__s__  k__Bacteria  p__Firmicutes  c__Clostridia o__Clostridiales f__Ruminococcaceaeg__Oscillospiras__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__g__s__ k__Bacteria p__Firmicutes c__Clostridiao__Clostridialesf__Lachnospiraceaeg__[Ruminococcus] s__gnavus k__Bacteria p__Firmicutes  c__Clostridia!o__Clostridiales"f__Ruminococcaceae#g__Oscillospira$s__% k__Bacteria& p__Firmicutes' c__Clostridia(o__Clostridiales)f__Lachnospiraceae*g__+s__, k__Bacteria- p__Firmicutes. 
[binary HDF5 BIOM archive members omitted: compressed test tables whose observation metadata are Greengenes taxonomy strings (k__Bacteria; p__Firmicutes; c__Clostridia; ...) and whose sample identifiers are 10084.PC.354 through 10084.PC.636]
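The omitted members above are ordinary HDF5 BIOM artifacts rather than hand-editable text. As a point of orientation only, the following is a minimal, hedged sketch of how comparable fixtures could be regenerated; it relies solely on calls the test modules below already use (biom.Table, biom.util.biom_open, Table.to_hdf5, skbio.TreeNode), and every file name and value in it is hypothetical.

import numpy as np
from io import StringIO
from biom import Table
from biom.util import biom_open
from skbio import TreeNode

# Observations (rows) by samples (columns), as in the test helpers below.
data = np.array([[1, 0],
                 [3, 2],
                 [0, 4]])
table = Table(data, ['OTU1', 'OTU2', 'OTU3'], ['S1', 'S2'])
with biom_open('example.biom', 'w') as fh:   # hypothetical output path
    table.to_hdf5(fh, 'example table for illustration')

# A matching Newick tree written alongside the table.
tree = TreeNode.read(StringIO('((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0)root;'))
tree.write('example.tre')                    # hypothetical output path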
unifrac-0.10.0/unifrac/tests/data/t1.newick000066400000000000000000000000431351072301000204340ustar00rootroot00000000000000
((a:1,b:2):4,(c:3,(d:1,e:1):2):3);
unifrac-0.10.0/unifrac/tests/data/t2.newick000066400000000000000000000000331351072301000204340ustar00rootroot00000000000000
(((a:1,b:1):1,c:5):2,d:4);
unifrac-0.10.0/unifrac/tests/data/test.faith.exp000066400000000000000000000004241351072301000215000ustar00rootroot00000000000000
10084.PC.481 8.268050017119094
10084.PC.593 6.256320004780719
10084.PC.356 7.2537400212168
10084.PC.355 6.688100009057962
10084.PC.354 7.827880010576337
10084.PC.636 6.08902999899874
10084.PC.635 8.128100004927546
10084.PC.607 7.725159989975509
10084.PC.634 7.508669988412294
unifrac-0.10.0/unifrac/tests/test_api.py000066400000000000000000001004551351072301000201670ustar00rootroot00000000000000
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2017, UniFrac development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import unittest
import os
from io import StringIO
from tempfile import gettempdir

import pkg_resources
import numpy as np
import numpy.testing as npt

from biom import Table, load_table
from biom.util import biom_open
from skbio import TreeNode
import skbio.diversity

from unifrac import ssu, faith_pd


class UnifracAPITests(unittest.TestCase):
    package = 'unifrac.tests'

    def get_data_path(self, filename):
        # adapted from qiime2.plugin.testing.TestPluginBase
        return pkg_resources.resource_filename(self.package,
                                               'data/%s' % filename)

    def test_unweighted_root_eval_issue_46(self):
        tree = self.get_data_path('crawford.tre')
        table = self.get_data_path('crawford.biom')

        table_inmem = load_table(table)
        tree_inmem = skbio.TreeNode.read(tree)

        ids = table_inmem.ids()
        otu_ids = table_inmem.ids(axis='observation')
        cnts = table_inmem.matrix_data.astype(int).toarray().T
        exp = skbio.diversity.beta_diversity('unweighted_unifrac', cnts,
                                             ids=ids, otu_ids=otu_ids,
                                             tree=tree_inmem)
        obs = ssu(table, tree, 'unweighted', False, 1.0, False, 1)
        npt.assert_almost_equal(obs.data, exp.data)

    def test_meta_unifrac(self):
        t1 = self.get_data_path('t1.newick')
        e1 = self.get_data_path('e1.biom')

        result = ssu(e1, t1, 'unweighted', False, 1.0, False, 1)

        u1_distances = np.array([[0, 10 / 16., 8 / 13.],
                                 [10 / 16., 0, 8 / 17.],
                                 [8 / 13., 8 / 17., 0]])

        npt.assert_almost_equal(u1_distances, result.data)
        self.assertEqual(tuple('ABC'), result.ids)

    def test_ssu_bad_tree(self):
        e1 = self.get_data_path('e1.biom')
        with self.assertRaisesRegex(IOError, "Tree file not found."):
            ssu(e1, 'bad-file', 'unweighted', False, 1.0, False, 1)

    def test_ssu_bad_table(self):
        t1 = self.get_data_path('t1.newick')
        with self.assertRaisesRegex(IOError, "Table file not found."):
            ssu('bad-file', t1, 'unweighted', False, 1.0, False, 1)

    def test_ssu_bad_method(self):
        t1 = self.get_data_path('t1.newick')
        e1 = self.get_data_path('e1.biom')
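        # NOTE (assumption inferred from the calls in this repository, not
        # from documented API): ssu() appears to accept the method strings
        # 'unweighted', 'weighted_normalized', 'weighted_unnormalized' and
        # 'generalized'; the deliberately misspelled 'unweightedfoo' below
        # should therefore raise ValueError.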
with self.assertRaisesRegex(ValueError, "Unknown method."): ssu(e1, t1, 'unweightedfoo', False, 1.0, False, 1) class EdgeCasesTests(unittest.TestCase): # These tests were mostly ported from skbio's # skbio/diversity/beta/tests/test_unifrac.py at SHA-256 ea901b3b6b0b # note that not all tests were kept since the APIs are different. # # The test cases below only exercise unweighted, weighted and weighted # normalized UniFrac. The C++ test suite verifies (against reference # implementations) the variance adjusted and generalized variants of the # algorithm. package = 'unifrac.tests' def _work(self, u_counts, v_counts, otu_ids, tree, method): data = np.array([u_counts, v_counts]).T bt = Table(data, otu_ids, ['u', 'v']) ta = os.path.join(gettempdir(), 'table.biom') tr = os.path.join(gettempdir(), 'tree.biom') self.files_to_delete.append(ta) self.files_to_delete.append(tr) with biom_open(ta, 'w') as fhdf5: bt.to_hdf5(fhdf5, 'Table for unit testing') tree.write(tr) # return value is a distance matrix, get the distance from u->v return ssu(ta, tr, method, False, 1.0, False, 1)['u', 'v'] def weighted_unifrac(self, u_counts, v_counts, otu_ids, tree, normalized=False): if normalized: method = 'weighted_normalized' else: method = 'weighted_unnormalized' return self._work(u_counts, v_counts, otu_ids, tree, method) def unweighted_unifrac(self, u_counts, v_counts, otu_ids, tree, normalized=False): return self._work(u_counts, v_counts, otu_ids, tree, 'unweighted') def setUp(self): self.b1 = np.array( [[1, 3, 0, 1, 0], [0, 2, 0, 4, 4], [0, 0, 6, 2, 1], [0, 0, 1, 1, 1], [5, 3, 5, 0, 0], [0, 0, 0, 3, 5]]) self.sids1 = list('ABCDEF') self.oids1 = ['OTU%d' % i for i in range(1, 6)] self.t1 = TreeNode.read( StringIO('(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,(OTU4:' '0.75,OTU5:0.75):1.25):0.0)root;')) self.t1_w_extra_tips = TreeNode.read( StringIO('(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,(OTU4:' '0.75,(OTU5:0.25,(OTU6:0.5,OTU7:0.5):0.5):0.5):1.25):0.0' ')root;')) self.t2 = TreeNode.read( StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)' 'root;')) self.oids2 = ['OTU%d' % i for i in range(1, 5)] self.files_to_delete = [] def tearDown(self): for f in self.files_to_delete: try: os.remove(f) except OSError: pass def test_ssu_table_not_subset_tree(self): tree = TreeNode.read(StringIO('((OTU1:0.5,OTU3:1.0):1.0)root;')) expected_message = "The table does not appear to be completely "\ "represented by the phylogeny." 
with self.assertRaisesRegex(ValueError, expected_message): self.unweighted_unifrac(self.b1[0], self.b1[1], self.oids1, tree) def test_unweighted_otus_out_of_order(self): # UniFrac API does not assert the observations are in tip order of the # input tree shuffled_ids = self.oids1[:] shuffled_b1 = self.b1.copy() shuffled_ids[0], shuffled_ids[-1] = shuffled_ids[-1], shuffled_ids[0] shuffled_b1[:, [0, -1]] = shuffled_b1[:, [-1, 0]] for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.unweighted_unifrac( shuffled_b1[i], shuffled_b1[j], shuffled_ids, self.t1) self.assertAlmostEqual(actual, expected) def test_weighted_otus_out_of_order(self): # UniFrac API does not assert the observations are in tip order of the # input tree shuffled_ids = self.oids1[:] shuffled_b1 = self.b1.copy() shuffled_ids[0], shuffled_ids[-1] = shuffled_ids[-1], shuffled_ids[0] shuffled_b1[:, [0, -1]] = shuffled_b1[:, [-1, 0]] for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.weighted_unifrac( shuffled_b1[i], shuffled_b1[j], shuffled_ids, self.t1) self.assertAlmostEqual(actual, expected) def test_unweighted_extra_tips(self): # UniFrac values are the same despite unobserved tips in the tree for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1_w_extra_tips) expected = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) self.assertAlmostEqual(actual, expected) def test_weighted_extra_tips(self): # UniFrac values are the same despite unobserved tips in the tree for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1_w_extra_tips) expected = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) self.assertAlmostEqual(actual, expected) def test_unweighted_minimal_trees(self): # two tips tree = TreeNode.read(StringIO('(OTU1:0.25, OTU2:0.25)root;')) actual = self.unweighted_unifrac([1, 0], [0, 0], ['OTU1', 'OTU2'], tree) expected = 1.0 self.assertEqual(actual, expected) def test_unweighted_root_not_observed(self): # expected values computed with QIIME 1.9.1 and by hand # root node not observed, but branch between (OTU1, OTU2) and root # is considered shared actual = self.unweighted_unifrac([1, 1, 0, 0], [1, 0, 0, 0], self.oids2, self.t2) # for clarity of what I'm testing, compute expected as it would # based on the branch lengths. the values that compose shared was # a point of confusion for me here, so leaving these in for # future reference expected = 0.2 / (0.1 + 0.2 + 0.3) # 0.3333333333 self.assertAlmostEqual(actual, expected) # root node not observed, but branch between (OTU3, OTU4) and root # is considered shared actual = self.unweighted_unifrac([0, 0, 1, 1], [0, 0, 1, 0], self.oids2, self.t2) # for clarity of what I'm testing, compute expected as it would # based on the branch lengths. 
the values that compose shared was # a point of confusion for me here, so leaving these in for # future reference expected = 0.7 / (1.1 + 0.5 + 0.7) # 0.3043478261 self.assertAlmostEqual(actual, expected) def test_weighted_root_not_observed(self): # expected values computed by hand, these disagree with QIIME 1.9.1 # root node not observed, but branch between (OTU1, OTU2) and root # is considered shared actual = self.weighted_unifrac([1, 0, 0, 0], [1, 1, 0, 0], self.oids2, self.t2) expected = 0.15 self.assertAlmostEqual(actual, expected) # root node not observed, but branch between (OTU3, OTU4) and root # is considered shared actual = self.weighted_unifrac([0, 0, 1, 1], [0, 0, 1, 0], self.oids2, self.t2) expected = 0.6 self.assertAlmostEqual(actual, expected) def test_weighted_normalized_root_not_observed(self): # expected values computed by hand, these disagree with QIIME 1.9.1 # root node not observed, but branch between (OTU1, OTU2) and root # is considered shared actual = self.weighted_unifrac([1, 0, 0, 0], [1, 1, 0, 0], self.oids2, self.t2, normalized=True) expected = 0.1764705882 self.assertAlmostEqual(actual, expected) # root node not observed, but branch between (OTU3, OTU4) and root # is considered shared actual = self.weighted_unifrac([0, 0, 1, 1], [0, 0, 1, 0], self.oids2, self.t2, normalized=True) expected = 0.1818181818 self.assertAlmostEqual(actual, expected) def test_unweighted_unifrac_identity(self): for i in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[i], self.oids1, self.t1) expected = 0.0 self.assertAlmostEqual(actual, expected) def test_unweighted_unifrac_symmetry(self): for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.unweighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.unweighted_unifrac( self.b1[j], self.b1[i], self.oids1, self.t1) self.assertAlmostEqual(actual, expected) def test_unweighted_unifrac_non_overlapping(self): # these communities only share the root node actual = self.unweighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 1.0 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( [1, 1, 1, 0, 0], [0, 0, 0, 1, 1], self.oids1, self.t1) expected = 1.0 self.assertAlmostEqual(actual, expected) def test_unweighted_unifrac(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # unweighted unifrac implementation # sample A versus all actual = self.unweighted_unifrac( self.b1[0], self.b1[1], self.oids1, self.t1) expected = 0.238095238095 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[0], self.b1[2], self.oids1, self.t1) expected = 0.52 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[0], self.b1[3], self.oids1, self.t1) expected = 0.52 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[0], self.b1[4], self.oids1, self.t1) expected = 0.545454545455 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[0], self.b1[5], self.oids1, self.t1) expected = 0.619047619048 self.assertAlmostEqual(actual, expected) # sample B versus remaining actual = self.unweighted_unifrac( self.b1[1], self.b1[2], self.oids1, self.t1) expected = 0.347826086957 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[1], self.b1[3], self.oids1, self.t1) expected = 0.347826086957 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( 
self.b1[1], self.b1[4], self.oids1, self.t1) expected = 0.68 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[1], self.b1[5], self.oids1, self.t1) expected = 0.421052631579 self.assertAlmostEqual(actual, expected) # sample C versus remaining actual = self.unweighted_unifrac( self.b1[2], self.b1[3], self.oids1, self.t1) expected = 0.0 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[2], self.b1[4], self.oids1, self.t1) expected = 0.68 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[2], self.b1[5], self.oids1, self.t1) expected = 0.421052631579 self.assertAlmostEqual(actual, expected) # sample D versus remaining actual = self.unweighted_unifrac( self.b1[3], self.b1[4], self.oids1, self.t1) expected = 0.68 self.assertAlmostEqual(actual, expected) actual = self.unweighted_unifrac( self.b1[3], self.b1[5], self.oids1, self.t1) expected = 0.421052631579 self.assertAlmostEqual(actual, expected) # sample E versus remaining actual = self.unweighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 1.0 self.assertAlmostEqual(actual, expected) def test_weighted_unifrac_identity(self): for i in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[i], self.oids1, self.t1) expected = 0.0 self.assertAlmostEqual(actual, expected) def test_weighted_unifrac_symmetry(self): for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1) expected = self.weighted_unifrac( self.b1[j], self.b1[i], self.oids1, self.t1) self.assertAlmostEqual(actual, expected) def test_weighted_unifrac_non_overlapping(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # weighted unifrac implementation # these communities only share the root node actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 4.0 self.assertAlmostEqual(actual, expected) def test_weighted_unifrac(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # weighted unifrac implementation actual = self.weighted_unifrac( self.b1[0], self.b1[1], self.oids1, self.t1) expected = 2.4 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[2], self.oids1, self.t1) expected = 1.86666666667 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[3], self.oids1, self.t1) expected = 2.53333333333 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[4], self.oids1, self.t1) expected = 1.35384615385 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[5], self.oids1, self.t1) expected = 3.2 self.assertAlmostEqual(actual, expected) # sample B versus remaining actual = self.weighted_unifrac( self.b1[1], self.b1[2], self.oids1, self.t1) expected = 2.26666666667 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[1], self.b1[3], self.oids1, self.t1) expected = 0.933333333333 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[1], self.b1[4], self.oids1, self.t1) expected = 3.2 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[1], self.b1[5], self.oids1, self.t1) expected = 0.8375 self.assertAlmostEqual(actual, expected) # sample C versus remaining actual = 
self.weighted_unifrac( self.b1[2], self.b1[3], self.oids1, self.t1) expected = 1.33333333333 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[2], self.b1[4], self.oids1, self.t1) expected = 1.89743589744 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[2], self.b1[5], self.oids1, self.t1) expected = 2.66666666667 self.assertAlmostEqual(actual, expected) # sample D versus remaining actual = self.weighted_unifrac( self.b1[3], self.b1[4], self.oids1, self.t1) expected = 2.66666666667 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[3], self.b1[5], self.oids1, self.t1) expected = 1.33333333333 self.assertAlmostEqual(actual, expected) # sample E versus remaining actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1) expected = 4.0 self.assertAlmostEqual(actual, expected) def test_weighted_unifrac_identity_normalized(self): for i in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[i], self.oids1, self.t1, normalized=True) expected = 0.0 self.assertAlmostEqual(actual, expected) def test_weighted_unifrac_symmetry_normalized(self): for i in range(len(self.b1)): for j in range(len(self.b1)): actual = self.weighted_unifrac( self.b1[i], self.b1[j], self.oids1, self.t1, normalized=True) expected = self.weighted_unifrac( self.b1[j], self.b1[i], self.oids1, self.t1, normalized=True) self.assertAlmostEqual(actual, expected) def test_weighted_unifrac_non_overlapping_normalized(self): # these communities only share the root node actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1, normalized=True) expected = 1.0 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( [1, 1, 1, 0, 0], [0, 0, 0, 1, 1], self.oids1, self.t1, normalized=True) expected = 1.0 self.assertAlmostEqual(actual, expected) def test_weighted_unifrac_normalized(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation skbio's initial # weighted unifrac implementation actual = self.weighted_unifrac( self.b1[0], self.b1[1], self.oids1, self.t1, normalized=True) expected = 0.6 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[2], self.oids1, self.t1, normalized=True) expected = 0.466666666667 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[3], self.oids1, self.t1, normalized=True) expected = 0.633333333333 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.338461538462 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[0], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.8 self.assertAlmostEqual(actual, expected) # sample B versus remaining actual = self.weighted_unifrac( self.b1[1], self.b1[2], self.oids1, self.t1, normalized=True) expected = 0.566666666667 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[1], self.b1[3], self.oids1, self.t1, normalized=True) expected = 0.233333333333 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[1], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.8 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[1], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.209375 self.assertAlmostEqual(actual, expected) # sample C versus remaining actual = 
self.weighted_unifrac( self.b1[2], self.b1[3], self.oids1, self.t1, normalized=True) expected = 0.333333333333 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[2], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.474358974359 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[2], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.666666666667 self.assertAlmostEqual(actual, expected) # sample D versus remaining actual = self.weighted_unifrac( self.b1[3], self.b1[4], self.oids1, self.t1, normalized=True) expected = 0.666666666667 self.assertAlmostEqual(actual, expected) actual = self.weighted_unifrac( self.b1[3], self.b1[5], self.oids1, self.t1, normalized=True) expected = 0.333333333333 self.assertAlmostEqual(actual, expected) # sample E versus remaining actual = self.weighted_unifrac( self.b1[4], self.b1[5], self.oids1, self.t1, normalized=True) expected = 1.0 self.assertAlmostEqual(actual, expected) class FaithPDEdgeCasesTests(unittest.TestCase): # These tests were mostly ported from skbio's # skbio/diversity/alpha/tests/test_fatih_pd.py at SHA-256 a8c086b # note that not all tests were kept since the APIs are different. package = 'unifrac.tests' def write_table_tree(self, u_counts, otu_ids, sample_ids, tree): data = np.array([u_counts]).T bt = Table(data, otu_ids, sample_ids) ta = os.path.join(gettempdir(), 'table.biom') tr = os.path.join(gettempdir(), 'tree.biom') self.files_to_delete.append(ta) self.files_to_delete.append(tr) with biom_open(ta, 'w') as fhdf5: bt.to_hdf5(fhdf5, 'Table for unit testing') tree.write(tr) return ta, tr def faith_pd_work(self, u_counts, otu_ids, sample_ids, tree): ta, tr = self.write_table_tree(u_counts, otu_ids, sample_ids, tree) return faith_pd(ta, tr) def setUp(self): self.counts = np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2]) self.b1 = np.array([[1, 3, 0, 1, 0], [0, 2, 0, 4, 4], [0, 0, 6, 2, 1], [0, 0, 1, 1, 1]]) self.sids1 = list('ABCD') self.oids1 = ['OTU%d' % i for i in range(1, 6)] self.t1 = TreeNode.read(StringIO( '(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):' '0.0,(OTU4:0.75,OTU5:0.75):1.25):0.0)root;')) self.t1_w_extra_tips = TreeNode.read( StringIO('(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,(OTU4:' '0.75,(OTU5:0.25,(OTU6:0.5,OTU7:0.5):0.5):0.5):1.25):0.0' ')root;')) self.files_to_delete = [] def tearDown(self): for f in self.files_to_delete: try: os.remove(f) except OSError: pass def test_faith_pd_zero_branches_omitted(self): # also deleted branch length fo t2 = TreeNode.read(StringIO( '((OTU1:0.5,OTU2:0.5),(OTU3:1.0,(OTU4:0.5,' 'OTU5:0.75):1.0):1.0)root;' )) actual = self.faith_pd_work([1, 1, 0, 0, 0], self.oids1, ['foo'], t2) expected = 1.0 self.assertAlmostEqual(actual[0], expected) def test_faith_pd_none_observed(self): actual = self.faith_pd_work([0, 0, 0, 0, 0], self.oids1, ['foo'], self.t1) expected = 0.0 self.assertAlmostEqual(actual.values, expected) def test_faith_pd_biom_table_empty(self): table, tree = self.write_table_tree([], [], [], self.t1) self.assertRaises(ValueError, faith_pd, table, tree) def test_faith_pd_table_not_subset_tree(self): tree = TreeNode.read(StringIO('((OTU1:0.5,OTU3:1.0):1.0)root;')) table_ids = ['OTU1', 'OTU2'] table, tree = self.write_table_tree([1, 0], table_ids, ['foo'], tree) expected_message = "The table does not appear to be completely "\ "represented by the phylogeny." 
with self.assertRaisesRegex(ValueError, expected_message): faith_pd(table, tree) def test_faith_pd_all_observed(self): actual = self.faith_pd_work([1, 1, 1, 1, 1], self.oids1, ['foo'], self.t1) expected = sum(n.length for n in self.t1.traverse() if n.length is not None) self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work([1, 2, 3, 4, 5], self.oids1, ['foo'], self.t1) expected = sum(n.length for n in self.t1.traverse() if n.length is not None) self.assertAlmostEqual(actual.values, expected) def test_faith_pd(self): # expected results derived from QIIME 1.9.1, which # is a completely different implementation unifrac's initial # phylogenetic diversity implementation actual = self.faith_pd_work(self.b1[0], self.oids1, [self.sids1[0]], self.t1) expected = 4.5 self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work(self.b1[1], self.oids1, [self.sids1[1]], self.t1) expected = 4.75 self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work(self.b1[2], self.oids1, [self.sids1[2]], self.t1) expected = 4.75 self.assertAlmostEqual(actual.values, expected) actual = self.faith_pd_work(self.b1[3], self.oids1, [self.sids1[3]], self.t1) expected = 4.75 self.assertAlmostEqual(actual.values, expected) def test_faith_pd_extra_tips(self): # results are the same despite presences of unobserved tips in tree actual = self.faith_pd_work(self.b1[0], self.oids1, [self.sids1[0]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[0], self.oids1, [self.sids1[0]], self.t1) self.assertAlmostEqual(actual.values, expected.values) actual = self.faith_pd_work(self.b1[1], self.oids1, [self.sids1[1]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[1], self.oids1, [self.sids1[1]], self.t1) self.assertAlmostEqual(actual.values, expected.values) actual = self.faith_pd_work(self.b1[2], self.oids1, [self.sids1[2]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[2], self.oids1, [self.sids1[2]], self.t1) self.assertAlmostEqual(actual.values, expected.values) actual = self.faith_pd_work(self.b1[3], self.oids1, [self.sids1[3]], self.t1_w_extra_tips) expected = self.faith_pd_work(self.b1[3], self.oids1, [self.sids1[3]], self.t1) self.assertAlmostEqual(actual.values, expected.values) def test_faith_pd_minimal(self): # two tips tree = TreeNode.read(StringIO('(OTU1:0.25, OTU2:0.25)root;')) actual = self.faith_pd_work([1, 0], ['OTU1', 'OTU2'], ['foo'], tree) expected = 0.25 self.assertEqual(actual.values, expected) def test_faith_pd_series_name(self): tree = TreeNode.read(StringIO('(OTU1:0.25, OTU2:0.25)root;')) actual = self.faith_pd_work([1, 0], ['OTU1', 'OTU2'], ['foo'], tree) self.assertEqual("faith_pd", actual.name) def test_faith_pd_root_not_observed(self): # expected values computed by hand tree = TreeNode.read( StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)' 'root;')) otu_ids = ['OTU%d' % i for i in range(1, 5)] # root node not observed, but branch between (OTU1, OTU2) and root # is considered observed actual = self.faith_pd_work([1, 1, 0, 0], otu_ids, ['foo'], tree) expected = 0.6 self.assertAlmostEqual(actual[0], expected) # root node not observed, but branch between (OTU3, OTU4) and root # is considered observed actual = self.faith_pd_work([0, 0, 1, 1], otu_ids, ['foo'], tree) expected = 2.3 self.assertAlmostEqual(actual[0], expected) def test_faith_pd_invalid_input(self): # tests are based of skbio tests, checking for duplicate ids, # negative counts are not included but should be incorporated # tree has duplicated 
tip ids tree = TreeNode.read( StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)' 'root;')) otu_ids = ['OTU%d' % i for i in range(1, 5)] u_counts = [1, 1, 0, 0] data = np.array([u_counts]).T bt = Table(data, otu_ids, ['u']) ta = os.path.join(gettempdir(), 'table.biom') tr = os.path.join(gettempdir(), 'tree.biom') self.files_to_delete.append(ta) self.files_to_delete.append(tr) with biom_open(ta, 'w') as fhdf5: bt.to_hdf5(fhdf5, 'Table for unit testing') tree.write(tr) self.assertRaises(IOError, faith_pd, 'dne.biom', tr) self.assertRaises(IOError, faith_pd, ta, 'dne.tre') if __name__ == "__main__": unittest.main() unifrac-0.10.0/unifrac/tests/test_methods.py000066400000000000000000000071471351072301000210650ustar00rootroot00000000000000# ---------------------------------------------------------------------------- # Copyright (c) 2016-2017, QIIME 2 development team. # # Distributed under the terms of the Modified BSD License. # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import unittest import pkg_resources import numpy as np import numpy.testing as npt from unifrac import meta class StateUnifracTests(unittest.TestCase): package = 'unifrac.tests' def get_data_path(self, filename): # adapted from qiime2.plugin.testing.TestPluginBase return pkg_resources.resource_filename(self.package, 'data/%s' % filename) def test_meta_unifrac(self): """meta_unifrac should give correct result on sample trees""" t1 = self.get_data_path('t1.newick') t2 = self.get_data_path('t2.newick') e1 = self.get_data_path('e1.biom') e2 = self.get_data_path('e2.biom') result = meta([e1, e2], [t1, t2], weights=[1, 1], consolidation='skipping-missing-values', method='unweighted') u1_distances = np.array([[0, 10/16., 8/13.], [10/16., 0, 8/17.], [8/13., 8/17., 0]]) u2_distances = np.array([[0, 11/14., 6/13.], [11/14., 0, 7/13.], [6/13., 7/13., 0]]) exp = (u1_distances + u2_distances) / 2 npt.assert_almost_equal(exp, result.data) self.assertEqual(tuple('ABC'), result.ids) def test_meta_unifrac_unbalanced(self): with self.assertRaisesRegex(ValueError, ("Number of trees and tables " "must be the same.")): meta(('a', ), ('a', 'b')) with self.assertRaisesRegex(ValueError, ("Number of trees and tables " "must be the same.")): meta(('a', 'b'), ('a', )) def test_meta_unifrac_unbalanced_weights(self): with self.assertRaisesRegex(ValueError, "Number of weights does not " "match number of trees and " "tables."): meta(('c', 'd'), ('a', 'b'), weights=(1, 2, 3)) def test_meta_unifrac_missing(self): with self.assertRaisesRegex(ValueError, "No trees specified."): meta(('a', ), tuple()) with self.assertRaisesRegex(ValueError, "No tables specified."): meta(tuple(), ('a', )) def test_meta_unifrac_no_method(self): with self.assertRaisesRegex(ValueError, "No method specified."): meta(('a', ), ('b', )) def test_meta_unifrac_bad_method(self): with self.assertRaisesRegex(ValueError, r"Method \(bar\) " "unrecognized."): meta(('a', ), ('b', ), method='bar') def test_meta_unifrac_bad_consolidation(self): with self.assertRaisesRegex(ValueError, r"Consolidation \(foo\) unrecognized."): meta(('a', ), ('b', ), method='unweighted', consolidation='foo') def test_meta_unifrac_alpha_not_generalized(self): with self.assertRaisesRegex(ValueError, "The alpha parameter can"): meta(('a', ), ('b', ), method='generalized', alpha=1, consolidation='skipping_missing_matrices') if __name__ == "__main__": unittest.main()
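Taken together, the tests above double as the clearest description of the Python-facing API. The sketch below is a hedged end-to-end usage example, not part of the package: the file names assume the unifrac/tests/data directory, and the meanings of the trailing ssu() positional arguments (variance adjustment, alpha, bypass tips, thread count) are inferred from the calls in test_api.py rather than taken from reference documentation.

from unifrac import ssu, faith_pd, meta

# Pairwise unweighted UniFrac between all samples in an HDF5 BIOM table,
# using a Newick tree; returns a skbio DistanceMatrix-like object.
dm = ssu('unifrac/tests/data/crawford.biom',
         'unifrac/tests/data/crawford.tre',
         'unweighted', False, 1.0, False, 1)
print(dm.ids)                                  # sample identifiers
print(dm['10084.PC.354', '10084.PC.355'])      # one pairwise distance

# Faith's phylogenetic diversity; the result behaves like a pandas Series
# named "faith_pd", indexed by sample identifier.
pd_series = faith_pd('unifrac/tests/data/crawford.biom',
                     'unifrac/tests/data/crawford.tre')
print(pd_series.name, pd_series.values)

# Meta (multi-table) UniFrac: per-table distance matrices are combined
# using the supplied weights and consolidation strategy.
combined = meta(('unifrac/tests/data/e1.biom', 'unifrac/tests/data/e2.biom'),
                ('unifrac/tests/data/t1.newick', 'unifrac/tests/data/t2.newick'),
                weights=(1, 1), consolidation='skipping-missing-values',
                method='unweighted')
print(combined.ids)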