berkeleylab-metabat-0db17a40717d/.dockerignore
.dockerignore Dockerfile #.git* bitbucket*.yml metabat metabat1 metabat2 jgi_summarize_bam_contig_depths contigOverlaps *.o .sconsign.dblite build/ samtools* /include/ /lib/ bin /.autotools /.cproject .settings /.project /.pydevproject /Build (GNU)/

berkeleylab-metabat-0db17a40717d/.edison_deploy/build.sh
#!/bin/bash -l
. $(dirname $0)/env.sh
scons BOOST_ROOT=$BOOST_ROOT SAMTOOLS_DIR=$SAMTOOLS_DIR

berkeleylab-metabat-0db17a40717d/.edison_deploy/env.sh
#!/bin/bash -l
set -ex
module swap PrgEnv-intel PrgEnv-gnu/4.9
module load boost
module load binutils
module load scons
module load samtools

berkeleylab-metabat-0db17a40717d/.edison_deploy/install.sh
#!/bin/bash -l
. $(dirname $0)/env.sh
scons install PREFIX=$PREFIX BOOST_ROOT=$BOOST_ROOT SAMTOOLS_DIR=$SAMTOOLS_DIR
cp .genepool_deploy/module_dependencies $PREFIX/.deps

berkeleylab-metabat-0db17a40717d/.edison_deploy/module_dependencies (empty file)

berkeleylab-metabat-0db17a40717d/.edison_deploy/test.sh
#!/bin/bash -l
. $(dirname $0)/env.sh
scons test BOOST_ROOT=$BOOST_ROOT SAMTOOLS_DIR=$SAMTOOLS_DIR

berkeleylab-metabat-0db17a40717d/.genepool_deploy/build.sh
#!/bin/bash -l
. $(dirname $0)/env.sh
scons BOOST_ROOT=$BOOST_ROOT SAMTOOLS_DIR=$SAMTOOLS_DIR

berkeleylab-metabat-0db17a40717d/.genepool_deploy/env.sh
#!/bin/bash -l
set -ex
module purge
module load PrgEnv-gnu/4.9
module load binutils/2.28
module load scons
module load boost/1.59.0

berkeleylab-metabat-0db17a40717d/.genepool_deploy/install.sh
#!/bin/bash -l
. $(dirname $0)/env.sh
scons install PREFIX=$PREFIX BOOST_ROOT=$BOOST_ROOT SAMTOOLS_DIR=$SAMTOOLS_DIR
cp .genepool_deploy/module_dependencies $PREFIX/.deps

berkeleylab-metabat-0db17a40717d/.genepool_deploy/module_dependencies (empty file)

berkeleylab-metabat-0db17a40717d/.genepool_deploy/test.sh
#!/bin/bash -l
.
$(dirname $0)/env.sh scons test BOOST_ROOT=$BOOST_ROOT SAMTOOLS_DIR=$SAMTOOLS_DIR berkeleylab-metabat-0db17a40717d/.gitignore000066400000000000000000000004171360417103500204670ustar00rootroot00000000000000metabat_version.h metabat metabat1 metabat2 jgi_summarize_bam_contig_depths contigOverlaps *.o .sconsign.dblite test/*.bins.txt.fa test/*.depth* test/*.pruned.* build*/ samtools* /include/ /lib/ bin /.autotools /.cproject .settings /.project /.pydevproject /Build (GNU)/ berkeleylab-metabat-0db17a40717d/.gitlab-ci.yml000066400000000000000000000023011360417103500211250ustar00rootroot00000000000000# This file is a template, and might need editing before it works on your project. # Official docker image. image: docker:latest services: - docker:dind before_script: - set -e - export METABAT_VERSION=$(git describe --tags) - docker info - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY - cat /etc/issue - apk add --no-cache openssh-client git - export METABAT_VERSION=$(git describe --tags) build-master: stage: build script: - set -e - docker build -t "$CI_REGISTRY_IMAGE" -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG" -t "$DOCKERHUB_USER/metabat:$CI_COMMIT_SHA" -t "$DOCKERHUB_USER/metabat:latest" -t "$DOCKERHUB_USER/metabat:$METABAT_VERSION" . - docker push "$CI_REGISTRY_IMAGE" - docker login -u "$DOCKERHUB_USER" -p "$DOCKERHUB_PASSWORD" $DOCKERHUB_REGISTRY - docker push $DOCKERHUB_USER/metabat only: - master build: stage: build script: - set -e - docker build -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG" -t "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHA" . - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG" - docker push "$CI_REGISTRY_IMAGE:$CI_COMMIT_SHA" except: - master after_script: - echo "Done" berkeleylab-metabat-0db17a40717d/CMakeLists.txt000066400000000000000000000030641360417103500212400ustar00rootroot00000000000000cmake_minimum_required (VERSION 3.5.1) project (MetaBAT) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() message("Installing ${CMAKE_BUILD_TYPE} MetaBAT into ${CMAKE_INSTALL_PREFIX}") include(${CMAKE_ROOT}/Modules/ExternalProject.cmake) include(cmake/zlib.cmake) include(cmake/htslib.cmake) set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD_REQUIRED ON) set(CMAKE_C_EXTENSIONS OFF) add_definitions(-D_XOPEN_SOURCE=700) set(PRE_CONFIGURE_FILE "metabat_version.h.in") set(POST_CONFIGURE_FILE "metabat_version.h") include(cmake/git-watcher.cmake) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # using Clang elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") # using AppleClang elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # using GCC if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mtune=native") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mtune=native") endif() elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") # using Intel C++ endif() include_directories(${CMAKE_SOURCE_DIR}) add_subdirectory(src) if (NOT NO_TESTING) enable_testing() add_subdirectory(test) endif() install(PROGRAMS runMetaBat.sh merge_depths.pl aggregateBinDepths.pl aggregateContigOverlapsByBin.pl DESTINATION bin/ ) INSTALL(CODE "execute_process( \ COMMAND ${CMAKE_COMMAND} -E create_symlink \ metabat2 \ ${CMAKE_INSTALL_PREFIX}/bin/metabat \ )" ) berkeleylab-metabat-0db17a40717d/Dockerfile000066400000000000000000000021311360417103500204640ustar00rootroot00000000000000FROM ubuntu:18.04 AS run-env LABEL Maintainer="Rob Egan" 
WORKDIR /root # This is necessary because the upgrade sometimes prompts for input ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ apt-get install -y libgomp1 && \ apt-get autoremove -y && \ apt-get clean && \ apt-get autoclean && \ rm -rf /var/lib/apt/lists/* FROM run-env as builder RUN apt-get update && \ apt-get install -y build-essential autoconf libboost-all-dev cmake git curl libncurses5-dev zlib1g-dev # copy the git tree (minus Dockerfile) to metabat subdir COPY . metabat RUN cd metabat && \ mkdir build && \ cd build && \ cmake -DCMAKE_INSTALL_PREFIX=/usr/local .. && \ make -j8 && \ make install && \ cd .. && \ rm -rf build FROM run-env WORKDIR /root ENV PATH=$PATH:/root/bin COPY --from=builder /usr/local /usr/local env PATH=/usr/local/bin:$PATH CMD ["/usr/local/bin/runMetaBat.sh"] # build and deploy with this command # docker build --tag robegan21/metabat:$(git describe --tags) --tag robegan21/metabat:latest . && docker push robegan21/metabat berkeleylab-metabat-0db17a40717d/INSTALL.md000066400000000000000000000036361360417103500201350ustar00rootroot00000000000000#Install instructions for supported Operating Systems # Recent Linux Distributions with packages for MetaBAT pre-requisites: ## Docker: ----------------- ``` git clone https://bitbucket.org/berkeleylab/metabat.git cd metabat docker build --tag metabat . docker run metabat runMetaBat.sh ... ``` ### Prerequisites for Linux Ubuntu 16.04 ---------------- ``` # install boost and a build environment sudo apt-get update sudo apt-get install -y build-essential libboost-all-dev git cmake curl libncurses5-dev zlib1g-dev mkdir build ; cd build && cmake -DCMAKE_INSTALL_PREFIX=$HOME/metabat .. && make && make install ``` ### Prerequisties for Linux Fedora 20 -------------------- ``` #### install g++, boost and other build dependencies sudo yum install gcc-c++ boost.x86_64 boost-devel.x86_64 zlib-devel.x86_64 libstdc++-static cmake mkdir build ; cd build && cmake -DCMAKE_INSTALL_PREFIX=$HOME/metabat .. && make && make install ``` ### Prerequisties for MacOS X (10.14 Mojave) : ( using Homebrew http://brew.sh/ ) --------------------- ``` # First install Xcode from the App Store (version 10.2) # Second install Homebrew # Third install llvm with openmpi and boost and cmake brew tap homebrew/versions brew install llvm libomp boost cmake brew link libomp # use the latest llvm compiler and flags export CPPFLAGS="-I/usr/local/opt/llvm/include" export LDFLAGS="-L/usr/local/opt/llvm/lib" export CC=/usr/local/opt/llvm/bin/clang export CXX=/usr/local/opt/llvm/bin/clang++ mkdir build ; cd build && cmake -DCMAKE_INSTALL_PREFIX=$HOME/metabat .. && make && make install ``` ### Older distributions must build and install: ``` gcc/g++ >= 4.9 or intel >= 18.0.1 or llvm >= 8.0 boost >= 1.53 cmake >= 3.8.2 make >= 4.1 ``` # Build and install MetaBAT ``` git clone https://bitbucket.org/berkeleylab/metabat.git cd metabat mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=$HOME/metabat .. make make install cd .. rm -rf build ``` berkeleylab-metabat-0db17a40717d/Legal.txt000066400000000000000000000023311360417103500202610ustar00rootroot00000000000000********* MetaBAT Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved. 
If you have questions about your rights to use or distribute this software, please contact Berkeley Lab's technology transfer department at TTD@lbl.gov referring to "MetaBAT (2014-075)." NOTICE. This software was developed under funding from the U.S. Department of Energy. As such, the U.S. Government has been granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, and perform publicly and display publicly. Beginning five (5) years after the date permission to assert copyright is obtained from the U.S. Department of Energy, and subject to any subsequent five (5) year renewals, the U.S. Government is granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, distribute copies to the public, perform publicly and display publicly, and to permit others to do so. ********* berkeleylab-metabat-0db17a40717d/MetaBAT2PaperSupplementaryScripts/000077500000000000000000000000001360417103500251255ustar00rootroot00000000000000berkeleylab-metabat-0db17a40717d/MetaBAT2PaperSupplementaryScripts/README000066400000000000000000000013171360417103500260070ustar00rootroot00000000000000# This folder contains several scripts that were used in the MetaBAT2 paper. ## Requirements: To replicate the results we showed in the paper, you'll need the following: 1. BAM files from each sample. If you want to generate BAM files yourself, we recommend the BBMap from the bbtools package.(https://jgi.doe.gov/data-and-tools/bbtools/) 2. MetaBAT2, CONCOCT, MaxBin or other software tools installed ## List of scripts: 1. runBBMap.sh: The BASH script to run BBMap to create the BAM files 2. benchmark.R: This runs the benchmarking and profiles performance of tools listed above using CAMI datasets. 3. binning.R is the script we used to perform the binning on CAMI datasets for our MetaBAT 2 paper. berkeleylab-metabat-0db17a40717d/MetaBAT2PaperSupplementaryScripts/benchmark.R000066400000000000000000000615271360417103500272150ustar00rootroot00000000000000requireAll <- function(packages) { dir.create("~/Rlibs", showWarnings=FALSE) .libPaths("~/Rlibs") .packages <- setdiff(packages, installed.packages()[,'Package']) if(length(.packages)>0) { suppressWarnings(rm(biocLite, envir=.GlobalEnv)) source("http://bioconductor.org/biocLite.R") biocLite(.packages, dependencies=TRUE, ask=FALSE, suppressUpdates=TRUE, lib="~/Rlibs") } for(package in packages) suppressPackageStartupMessages(do.call(library, list(package))) } requireAll(c('ggplot2','foreach','plyr','reshape2')) calcPerf <- function(type=c("MetaBAT","CONCOCT","GroopM","MaxBin","Canopy"), file="clustering_gt1000.csv", prof=NULL, minSize=200000) { type <- match.arg(type) if(!file.exists("contigs.txt")) { system("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/contigs.txt") if(!file.exists("contigs.txt")) stop("Cannot find contigs.txt. Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/") } if(!file.exists("genomes.txt")) { system("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/genomes.txt") if(!file.exists("genomes.txt")) stop("Cannot find genomes.txt. 
Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/") } set.seed(94521) contigs <- read.table("contigs.txt", sep="\t", header=T, as.is=T) genomes <- read.table("genomes.txt", sep="\t", header=T, as.is=T) if (type == 'MetaBAT') { files <- system(sprintf("ls %s.* | egrep '\\.[0-9]+$'", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "CONCOCT") { if(!file.exists(file)) stop(sprintf("Cannot find %s", file)) cc <- read.csv(file, header=F, as.is=T) cc.size <- ddply(cc, .(V2), function(x) sum(contigs$Size[match(x$V1, contigs$Name)])) stopifnot(!any(is.na(cc.size))) cc <- cc[cc$V2 %in% cc.size$V2[cc.size$V1 >= minSize],] files <- unique(cc$V2) } else if (type == "GroopM") { files <- system(sprintf("ls %s_bin_*.fna", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "MaxBin") { files <- system(sprintf("ls %s.*.fasta", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "Canopy") { if(!file.exists(file)) stop(sprintf("Cannot find %s", file)) if(is.null(prof)) stop("Cluster profile should be given") if(!file.exists(prof)) stop(sprintf("Cannot find %s", prof)) prof <- read.table(prof, as.is=T) rownames(prof) <- prof[,1]; prof <- prof[,-1] CAGs <- rownames(prof)[rowSums(t(apply(prof,1,sort, decreasing=TRUE))[,1:3])/rowSums(prof)<=0.9] cc <- read.table(file, as.is=T)[,c('V2','V1')]; colnames(cc) <- c('V1','V2') CAGs <- intersect(CAGs, names(which(table(cc$V2) > 2))) cc <- cc[cc$V2 %in% CAGs,] cc.size <- ddply(cc, .(V2), function(x) sum(contigs$Size[match(x$V1, contigs$Name)])) stopifnot(!any(is.na(cc.size))) cc <- cc[cc$V2 %in% cc.size$V2[cc.size$V1 >= minSize],] files <- unique(cc$V2) } res <- foreach(f=files, .combine=rbind) %do% { if (type == 'MetaBAT') ctgs <- read.table(f, as.is=T)$V1 else if (type %in% c("CONCOCT", "Canopy")) ctgs <- cc$V1[cc$V2 == f] else if (type %in% c("GroopM","MaxBin")) ctgs <- system(sprintf("grep '>' %s | sed 's/>//'", f), intern=TRUE) .res <- contigs[match(ctgs, contigs$Name),] stopifnot(!any(is.na(.res)) | nrow(.res) == length(ctgs)) .res$Name <- sapply(strsplit(.res$Name, "\\[|\\]"), function(x) x[2]) .res <- ddply(.res, .(Name), function(x) sum(x$Size)) colnames(.res) <- c("Genome","Size") .res <- .res[order(.res$Size,decreasing=T),] TP <- .res$Size[1] FP <- sum(.res$Size) - TP Recall <- TP / genomes$Size[genomes[,1] == .res$Genome[1]] Precision <- TP / sum(.res$Size) F1 <- 2 * Recall * Precision / (Precision + Recall) F0.5 <- (1 + .5 ^ 2) * Recall * Precision / ((.5 ^ 2) * Precision + Recall) cbind.data.frame(Genome=.res$Genome[1], Recall, Precision, F1, F0.5, stringsAsFactors=F) } while (length(unique(res$Recall)) != nrow(res)) { res$Recall = res$Recall + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$Precision)) != nrow(res)) { res$Precision = res$Precision + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$F1)) != nrow(res)) { res$F1 = res$F1 + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$'F0.5')) != nrow(res)) { res$'F0.5' = res$'F0.5' + rnorm(nrow(res), sd=1e-8) } res <- cbind(res, Rank.Recall=length(res$Recall)+1-rank(res$Recall,ties.method="max"), Rank.Precision=length(res$Precision)+1-rank(res$Precision,ties.method="max"), Rank.F1=length(res$F1)+1-rank(res$F1,ties.method="max"), Rank.F0.5=length(res$'F0.5')+1-rank(res$'F0.5',ties.method="max")) res } calcPerfBySCG <- function(f, minRec=.2, minPrec=0, removeStrain=F, skip=2) { #to prevent bias in precision 
due to smaller bin size if(is.data.frame(f)) SCG <- f else SCG <- read.table(f, comment.char='-', as.is=T, header=F, skip=skip) set.seed(94522) SCG <- SCG[order(SCG$V13,decreasing=T),] #TODO need to warn the additional '-' character in the bin name SCG.ID <- SCG$V1 if (ncol(SCG) == 14) { if(removeStrain) SCG[,13] <- SCG[,13] * (100 - SCG[,14]) / 100 SCG <- SCG[,c(12,13)] / 100 SCG$V13 <- pmax(1 - SCG$V13, 0) } else if (ncol(SCG) == 15) { if(removeStrain) SCG[,14] <- SCG[,14] * (100 - SCG[,15]) / 100 SCG <- SCG[,c(13,14)] / 100 SCG$V14 <- pmax(1 - SCG$V14, 0) } else stop("[Error!] Unexpected SCG file format") SCG <- cbind(SCG.ID, SCG, stringsAsFactors=F) colnames(SCG) <- c('ID','Recall','Precision') SCG <- SCG[SCG$Recall >= minRec & SCG$Precision >= minPrec,] SCG$F1 <- 2 * SCG$Recall * SCG$Precision / (SCG$Precision + SCG$Recall) SCG$'F0.5' <- (1 + .5 ^ 2) * SCG$Recall * SCG$Precision / ((.5 ^ 2) * SCG$Precision + SCG$Recall) while (length(unique(SCG$Recall)) != nrow(SCG)) { SCG$Recall <- SCG$Recall + rnorm(nrow(SCG), sd=1e-8) } while (length(unique(SCG$Precision)) != nrow(SCG)) { SCG$Precision <- SCG$Precision + rnorm(nrow(SCG), sd=1e-8) } while (length(unique(SCG$F1)) != nrow(SCG)) { SCG$F1 <- SCG$F1 + rnorm(nrow(SCG), sd=1e-8) } while (length(unique(SCG$'F0.5')) != nrow(SCG)) { SCG$'F0.5' <- SCG$'F0.5' + rnorm(nrow(SCG), sd=1e-8) } SCG$Recall <- pmax(pmin(SCG$Recall, 1), 0) SCG$Precision <- pmax(pmin(SCG$Precision, 1), 0) SCG$Rank.Recall <- length(SCG$Recall)+1-rank(SCG$Recall,ties.method="max") SCG$Rank.Precision <- length(SCG$Precision)+1-rank(SCG$Precision,ties.method="max") SCG$Rank.F1 <- length(SCG$F1)+1-rank(SCG$F1,ties.method="max") SCG$Rank.F0.5 <- length(SCG$'F0.5')+1-rank(SCG$'F0.5',ties.method="max") SCG } calcPerfCAMI <- function(type=c("MetaBAT","CONCOCT","MaxBin","BinSanity"), file="clustering_gt1000.csv", complexity=c('low','medium','high'), minSize=200000) { type <- match.arg(type) complexity <- match.arg(complexity) if (complexity == 'low') { fname1 <- 'contigs-low.txt' fname2 <- 'gsa_mapping.binning' } else if (complexity == 'medium') { fname1 <- 'contigs-medium.txt' fname2 <- 'pooled_gsa_mapping.binning.tsv' } else if (complexity == 'high') { fname1 <- 'contigs-high.txt' fname2 <- 'gsa_mapping_pool.binning' } if(!file.exists(fname1)) { system(sprintf("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/%s", fname1)) if(!file.exists(fname1)) stop(sprintf("Cannot find %s. Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/", fname1)) } if(!file.exists(fname2)) { system(sprintf("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/%s", fname2)) if(!file.exists(fname2)) stop(sprintf("Cannot find %s. 
Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/", fname2)) } set.seed(94521) contigs <- read.table(fname1, sep="\t", header=F, as.is=T) colnames(contigs) <- c('Name', 'Size') genomes <- read.table(fname2, skip=4, as.is=T) contigs$Genome <- genomes$V2[match(contigs$Name, genomes$V1)] genomes <- ddply(contigs, .(Genome), function(x) cbind(Ctgs=nrow(x), Size=sum(x$Size))) if (type == 'MetaBAT') { files <- system(sprintf("ls %s.*.fa", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "CONCOCT") { if(!file.exists(file)) stop(sprintf("Cannot find %s", file)) cc <- read.csv(file, header=F, as.is=T) cc.size <- ddply(cc, .(V2), function(x) sum(contigs$Size[match(x$V1, contigs$Name)])) stopifnot(!any(is.na(cc.size))) cc <- cc[cc$V2 %in% cc.size$V2[cc.size$V1 >= minSize],] files <- unique(cc$V2) } else if (type == "MaxBin") { files <- system(sprintf("ls %s.*.fasta", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "BinSanity") { files <- system(sprintf("ls %s/*.fna", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } res <- foreach(f=files, .combine=rbind) %dopar% { if (type %in% c("CONCOCT")) ctgs <- cc$V1[cc$V2 == f] else ctgs <- system(sprintf("grep '>' %s | sed 's/>//'", f), intern=TRUE) .res <- contigs[match(ctgs, contigs$Name),] stopifnot(!any(is.na(.res)) | nrow(.res) == length(ctgs)) .res <- ddply(.res, .(Genome), function(x) sum(x$Size)) colnames(.res) <- c("Genome","Size") .res <- .res[order(.res$Size,decreasing=T),] TP <- .res$Size[1] FP <- sum(.res$Size) - TP Recall <- TP / genomes$Size[genomes[,1] == .res$Genome[1]] Precision <- TP / sum(.res$Size) F1 <- 2 * Recall * Precision / (Precision + Recall) F0.5 <- (1 + .5 ^ 2) * Recall * Precision / ((.5 ^ 2) * Precision + Recall) cbind.data.frame(Genome=.res$Genome[1], Recall, Precision, F1, F0.5, stringsAsFactors=F) } while (length(unique(res$Recall)) != nrow(res)) { res$Recall = res$Recall + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$Precision)) != nrow(res)) { res$Precision = res$Precision + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$F1)) != nrow(res)) { res$F1 = res$F1 + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$'F0.5')) != nrow(res)) { res$'F0.5' = res$'F0.5' + rnorm(nrow(res), sd=1e-8) } res <- cbind(res, Rank.Recall=length(res$Recall)+1-rank(res$Recall,ties.method="max"), Rank.Precision=length(res$Precision)+1-rank(res$Precision,ties.method="max"), Rank.F1=length(res$F1)+1-rank(res$F1,ties.method="max"), Rank.F0.5=length(res$'F0.5')+1-rank(res$'F0.5',ties.method="max")) res } plotPerf2 <- function(res, rec=c(.3,.5,.7,.9), prec=c(.9,.95), stress=NULL, .xlim=NULL, .ylim=NULL) { res <- lapply(res, function(x) { x <- sapply(rec, function(rec) sapply(prec, function(prec) sum(x$Recall > rec & x$Precision > prec))); dimnames(x) <- list(Precision=prec, Recall=rec); x}) res <- melt(res) res$L1 <- factor(res$L1, levels=unique(res$L1)) p <- ggplot(res, aes(x = L1, y = value, fill = L1)) + theme_bw() p <- p + geom_bar(stat="identity") if(!is.null(stress) && stress %in% levels(res$L1)) { .col <- rep("grey50", length(levels(res$L1))) .col[grep(stress,levels(res$L1))] <- "grey20" p <- p + scale_fill_manual(values=.col) } p <- p + facet_grid(Precision ~ Recall) p <- p + xlab("") + ylab("") + theme(axis.text.x = element_text(angle = 45, hjust=1)) if (!is.null(.xlim)) p <- p + xlim(.xlim) if (!is.null(.ylim)) p <- p + ylim(.ylim) p <- p + 
theme(legend.position = "none") print(p) } plotPerf3 <- function(res, rec=seq(.3,.9,.1), prec=c(.9,.95), legend.position=c(.9,.7)) { if("Genome" %in% colnames(res[[1]])) { res <- lapply(res, function(x) { ddply(x, .(Genome), function(xx) { xx[which.max(xx$Recall),] }) }) } res <- lapply(res, function(x) { x <- sapply(rec, function(rec) sapply(prec, function(prec) sum(x$Recall > rec & x$Precision > prec))); x <- matrix(x, nrow=length(prec), byrow=F); dimnames(x) <- list(Precision=prec, Recall=rec); x}) for(i in 1:length(res)) { for(j in 1:(ncol(res[[i]])-1)) { res[[i]][,j] <- res[[i]][,j] - res[[i]][,j+1] } } res <- melt(res) res$L1 <- factor(res$L1, levels=unique(res$L1)) res$Recall <- as.character(res$Recall) #res$Recall <- factor(res$Recall, levels=rev(unique(res$Recall))) res$Precision <- factor(res$Precision, levels=rev(unique(res$Precision))) p <- ggplot(res, aes(x = L1, y = value, fill = Recall)) + theme_bw() p <- p + geom_bar(stat="identity") p <- p + scale_fill_grey(start=0.8, end=0.2) if(length(prec) > 1) p <- p + facet_wrap( ~ Precision, ncol=2) p <- p + coord_flip() p <- p + xlab("") + ylab("# of Genomes Identified") p <- p + theme(legend.position = legend.position, legend.key.size = grid::unit(1, "lines"), legend.text = element_text(size = rel(.7)), legend.title = element_text(face="bold", size = rel(.7))) p <- p + guides(fill = guide_legend(reverse=T)) suppressWarnings(print(p)) } plotPerfVenn <- function(res, rec=.3, prec=.9, sel=NULL) { requireAll(c('grid','VennDiagram')) res <- lapply(res, function(x) { ddply(x, .(Genome), function(xx) { xx[which.max(xx$Recall),] }) }) v <- lapply(res, function(x) x$Genome[x$Recall > rec & x$Precision > prec]) if(!is.null(sel) && all(sel %in% names(v))) v <- v[sel] grid.draw(venn.diagram(v, filename=NULL, fill = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3")[1:length(v)], cat.col = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3")[1:length(v)], cat.cex = 2, cex = 1.5, margin=.2)) } getCtgList <- function(res) { d <- foreach(i=1:length(res)) %do% { gs <- foreach(g=res[[i]]$ID) %dopar% { if(names(res)[i] == "MetaBAT") { ctgs <- system(sprintf("grep '>' ./1.5kb/MetaBAT/%s.fa | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "Canopy") { ctgs <- system(sprintf("grep '>' ./1.5kb/Canopy/%s.fa | cut -f1 -d' ' | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "CONCOCT") { ctgs <- system(sprintf("grep '>' ./1.5kb/CONCOCT/bins/%s.fa | cut -f1 -d' ' | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "MaxBin") { ctgs <- system(sprintf("grep '>' ./1.5kb/MaxBin/%s.fasta | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "GroopM") { ctgs <- system(sprintf("grep '>' ./1.5kb/GroopM/core_only/%s.fna | sed 's/^>//'",g), intern=T) } ctgs } names(gs) <- sprintf("S%d_B%d",i,seq(res[[i]]$ID)) gs } names(d) <- names(res) d } plotPerfVennBySCG <- function(res, ctgList, ctgSizes, minRec=.3, minPrec=.9) { getCatalogs <- function(list1, list2, sizes, SCG1, SCG2) { findMiddle <- function(summ) { b12 <- (1:nrow(summ))[rowSums(summ>0)==1]; b12S <- length(b12) #one-to-many map from b1 to b2 b21 <- (1:ncol(summ))[colSums(summ>0)==1]; b21S <- length(b21) #one-to-many map from b2 to b1 while(TRUE) { good <- T if(length(b12) > 0 && length(b21) > 0) { b12 <- b12[rowSums(summ[b12,b21,drop=F]>0)==1] if(length(b12) == 0) { good <- F } else { b21 <- b21[colSums(summ[b12,b21,drop=F]>0)==1] if(length(b21) == 0) { good <- F } } } else { good <- F } if(!good) { b12 <- b21 <- NULL break } if (length(b12) == b12S && 
length(b21) == b21S) { break } else { b12S <- length(b12) b21S <- length(b21) } } stopifnot(length(b12)==length(b21)) list(b12=b12, b21=b21) } stopifnot(all(unlist(list1) %in% sizes$V1)) stopifnot(all(unlist(list2) %in% sizes$V1)) summ <- foreach(m=list1, .combine=rbind) %dopar% { foreach(n=list2, .combine=cbind) %do% { sum(sizes$V2[match(intersect(m,n), sizes$V1)]) } } flipped <- nrow(summ) < ncol(summ) if(flipped) { tmp <- list1; list1 <- list2; list2 <- tmp tmp <- SCG1; SCG1 <- SCG2; SCG2 <- tmp summ <- t(summ) } b1 <- which(rowSums(summ>0)==0) #unique to b1 b2 <- which(colSums(summ>0)==0) #unique to b2 b121 <- findMiddle(summ) b12 <- b121$b12 b21 <- b121$b21 stopifnot(length(intersect(b1,b12))==0 && length(intersect(b2,b21))==0) b12dup <- setdiff(1:nrow(summ), c(b1,b12)) b21dup <- setdiff(1:ncol(summ), c(b2,b21)) updated <- TRUE while(updated) { #any(rowSums(ss>0)>1) || any(colSums(ss>0)>1) updated <- FALSE ss <- summ[,b21dup,drop=F] for(cc in 1:ncol(ss)) { if(sum(ss[,cc]>0) > 1) { #t1 <- which(ss[,cc] > 0) t1 <- which.max(summ[,b21dup[cc]]) if(which.max(summ[t1,]) == b21dup[cc]) { #reciprocal best.. remove the others in row and col summ[setdiff(which(summ[,b21dup[cc]]>0), t1),b21dup[cc]] <- 0 summ[t1,setdiff(which(summ[t1,]>0), b21dup[cc])] <- 0 } updated <- TRUE } } ss <- summ[b12dup,,drop=F] for(rr in 1:nrow(ss)) { if(sum(ss[rr,]>0) > 1) { t2 <- which.max(summ[b12dup[rr],]) if(which.max(summ[,t2]) == b12dup[rr]) { #reciprocal best.. remove the others in row and col summ[b12dup[rr],setdiff(which(summ[b12dup[rr],]>0), t2)] <- 0 summ[setdiff(which(summ[,t2]>0), b12dup[rr]), t2] <- 0 } updated <- TRUE } } } b1 <- which(rowSums(summ>0)==0) #unique to b1 b2 <- which(colSums(summ>0)==0) #unique to b2 b121 <- findMiddle(summ) b12 <- b121$b12 b21 <- b121$b21 stopifnot(length(intersect(b1,b12))==0 && length(intersect(b2,b21))==0) stopifnot(length(unique(b12)) == length(unique(b21))) stopifnot(all(rowSums(summ[b12,b21]>0)==1)) stopifnot(all(colSums(summ[b12,b21]>0)==1)) SCG <- SCG1[b1,] middle <- list() for(i in 1:length(b12)) { j <- which(summ[b12[i],b21] > 0) if(SCG1$Recall[b12[i]] >= SCG2$Recall[b21[j]]) { middle[[i]] <- list1[[b12[i]]] SCG <- rbind(SCG, SCG1[b12[i],]) } else { middle[[i]] <- list2[[b21[j]]] SCG <- rbind(SCG, SCG2[b21[j],]) } } SCG <- rbind(SCG, SCG2[b2,]) if(flipped) { left <- list2[b2] right <- list1[b1] } else { left <- list1[b1] right <- list2[b2] } list(left=left, middle=middle, right=right, SCG=SCG) } stopifnot(length(res) == length(ctgList)) stopifnot(length(res) >= 2) stopifnot(length(res) <= 5) requireAll(c('grid','VennDiagram','doMC')) registerDoMC() for(i in 1:length(res)) { ctgList[[i]] <- ctgList[[i]][res[[i]]$Recall >= minRec & res[[i]]$Precision>=minPrec] res[[i]] <- res[[i]][res[[i]]$Recall >= minRec & res[[i]]$Precision>=minPrec, ] } catalogs <- NULL for(i in 2:length(ctgList)) { if(i==2) catalogs <- getCatalogs(ctgList[[1]],ctgList[[2]],ctgSizes,res[[1]],res[[2]]) else catalogs <- getCatalogs(do.call(c,catalogs[1:3]), ctgList[[i]], ctgSizes, catalogs$SCG, res[[i]]) } genomes <- do.call(c,catalogs[1:3]) names(genomes) <- paste("Genome",1:length(genomes)) res.venn <- foreach(i=1:length(ctgList)) %do% { gs <- getCatalogs(genomes, ctgList[[i]], ctgSizes, catalogs$SCG, res[[i]])$middle summ <- foreach(m=genomes, .combine=rbind) %dopar% { foreach(n=gs, .combine=cbind) %do% { length(intersect(m,n)) } } names(genomes)[apply(summ,2,which.max)] } names(res.venn) <- names(ctgList) grid.draw(venn.diagram(res.venn, filename=NULL, fill = c("dodgerblue", "goldenrod1", 
"darkorange1", "seagreen3", "orchid3")[1:length(res.venn)], cat.col = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3")[1:length(res.venn)], cat.cex = 2, cex = 1.5, margin=.2)) } plotPerf <- function(res, xlim.=NULL, yrange=c(-0.001,1.001), legend.order=NULL, legend.position=c(.35,.9), what=c('Recall','Precision','F1','F0.5')) { if(is.null(xlim.)) { if(!file.exists("genomes.txt")) { system("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/genomes.txt") if(!file.exists("genomes.txt")) stop("Cannot find genomes.txt. Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/") } genomes <- read.table("genomes.txt", sep="\t", header=T, as.is=T) } if(is.null(legend.order) || length(intersect(names(res), legend.order)) != length(res)) { legend.order <- names(res) } modes <- rep(names(res), sapply(res, nrow)) res <- cbind(Mode=modes, do.call(rbind, res)) if (!all(what %in% c('Recall','Precision','F1','F0.5'))) { stop("what should be from the list of 'Recall','Precision','F1','F0.5'") } .d1 <- melt(res[, c('Mode','Recall','Precision','F1','F0.5')], id.vars=c('Mode'), variable.name='Score', value.name='Y') .d2 <- melt(res[, c('Mode','Rank.Recall','Rank.Precision','Rank.F1','Rank.F0.5')], id.vars=c('Mode'), variable.name='Rank', value.name='X') .d <- cbind(.d1, X=.d2$X) .d <- .d[.d$Score %in% what,] .d$Score <- droplevels(.d$Score) if(is.null(xlim.)) xlim. <- max(nrow(genomes), max(.d$X)) .d$Mode <- factor(.d$Mode, levels=legend.order) p <- ggplot(.d, aes(X, Y, colour=Mode)) + theme_bw() + facet_wrap(~ Score, nrow=ifelse(length(what)==4,2,1)) p <- p + geom_line(size=1) p <- p + xlab("Genome Bins (Sorted)") + ylab("Performance Metric") p <- p + ylim(yrange) + xlim(c(1,xlim.)) p <- p + theme(legend.position = legend.position); p$labels$colour <- NULL suppressWarnings(print(p)) } printPerf <- function(res, rec=c(seq(.3,.9,.1),.95), prec=c(seq(.7,.9,.1),.95,.99), uniqueGenomes=FALSE) { if (uniqueGenomes) { res <- lapply(res, function(x) { ddply(x, .(Genome), function(xx) { xx[which.max(xx$Recall),] }) }) } out <- lapply(res, function(x) { x <- sapply(rec, function(rec) sapply(prec, function(prec) sum(x$Recall >= rec & x$Precision >= prec))); dimnames(x) <- list(Precision=prec, Recall=rec); x}) out } diffPerf <- function(res1, res2, rec=c(seq(.1,.9,.1),.95), prec=c(seq(0,.9,.1),.95,.99)) { if(is.data.frame(res1) || !is.list(res1)) res1 <- list(res1) if(is.data.frame(res2) || !is.list(res2)) res2 <- list(res2) printPerf(res1, rec, prec)[[1]] - printPerf(res2, rec, prec)[[1]] } berkeleylab-metabat-0db17a40717d/MetaBAT2PaperSupplementaryScripts/binning.R000066400000000000000000000153411360417103500267000ustar00rootroot00000000000000# CAMI-1 data analysis source('benchmark.R') #How we got the bam files (BBMap is required for this method): #If you already have bam files, skip this step. 
#low #system("runBBmap.sh CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz RL_S001__insert_270.fq.gz") #system("for i in $(ls *.fq.gz); do runBBmap.sh CAMI_medium_GoldStandardAssembly.fasta.gz $i; done > /dev/null 2>&1") #system("for i in $(ls *.fq.gz); do runBBmap.sh CAMI_high_GoldStandardAssembly.fasta.gz $i; done > /dev/null 2>&1") #system("MetaBAT/jgi_summarize_bam_contig_depths --outputDepth depth.txt CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz.d/*.bam") #for medium need to re-calculate depth.txt using combined reads #system("MetaBAT/jgi_summarize_bam_contig_depths --outputDepth depth.txt CAMI_medium_GoldStandardAssembly.fasta.gz.d/*.bam") #system("MetaBAT/jgi_summarize_bam_contig_depths --outputDepth depth.txt CAMI_high_GoldStandardAssembly.fasta.gz.d/*.bam") args = commandArgs(trailingOnly=TRUE)#Example arguments will be provided in comments next to the R commands below. #low #system("zcat CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz | fastaLengths.pl - > contigs-low.txt")#We did not end up using this truth <- read.table(args[1], skip=4, as.is=T)#i.e. /cami/low/gsa_mapping.binning sizes <- read.table(args[2], as.is=T)#i.e. /cami/low/sizes res <- read.table(args[3], as.is=T)#i.e. /cami/low/resA-1.txt res <- read.table(args[4], as.is=T)#i.e. /cami/low/resB-1.txt #medium #system("zcat CAMI_medium_GoldStandardAssembly.fasta.gz | fastaLengths.pl - > contigs-medium.txt")#We did not end up using this truth <- read.table(args[5], skip=4, as.is=T)#i.e. /cami/medium/pooled_gsa_mapping.binning sizes <- read.table(args[6], as.is=T)#i.e. /cami/medium/sizes res <- read.table(args[7], as.is=T)#i.e. /cami/medium/resA-1.txt res <- read.table(args[8], as.is=T)#i.e. /cami/medium/resB-1.txt #high #system("zcat CAMI_high_GoldStandardAssembly.fasta.gz | fastaLengths.pl - > contigs-high.txt")#We did not end up using this truth <- read.table(args[9], skip=4, as.is=T)#i.e. /cami/high/gsa_mapping_pool.binning sizes <- read.table(args[10], as.is=T)#i.e. /cami/high/sizes res <- read.table(args[11], as.is=T)#i.e. /cami/high/resA-1.txt res <- read.table(args[12], as.is=T)#i.e. 
/cami/high/resB-1.txt nrow(truth) == nrow(sizes) truth$size <- sizes$V2[match(truth$V1, sizes$V1)] truth.genome <- ddply(truth, .(V2), function(x) cbind(ctgs=nrow(x), size=sum(x$size))) res <- res[res$V2 > 0, ] res <- merge(truth, res, by.x='V1', by.y='V1', all.x=T) #table(res$V2.x, res$V2.y) compB <- ddply(res, .(V2.x), function(x) ddply(x, .(V2.y), function(xx) sum(xx$size))) compB <- ddply(compB, .(V2.x), function(x) { na <- x$V1[is.na(x$V2.y)] if(length(na) == 0) na <- 0 x <- x[which(!is.na(x$V2.y)), ,drop=F] if (nrow(x) > 0) { cbind(binID=x$V2.y[which.max(x$V1)], comp=max(x$V1) / (sum(x$V1)+na)) } else { #nothing binned cbind(binID=NA, comp=0) } }) compB <- merge(truth.genome, compB, by.x='V2', by.y='V2.x') compB <- cbind(compB, prec=foreach(x=iter(compB, by='row'), .combine=c) %dopar% { denom <- sum(res$size[which(res$V2.y==x$binID)]) if (denom >= 200000) { numer <- sum(res$size[which(res$V2.x==x$V2 & res$V2.y==x$binID)]) numer / denom } else 0 }) sum(compB$comp > .50 & compB$prec > .90, na.rm=T) #21 => 87 => 432 #MaxBin 2 #low for(i in seq(4,4,2)) { system(sprintf("cut -f1,%d depth.txt| tail -n+2 > depth.txt.mb.%d", i,i)) write.table(sprintf("depth.txt.mb.%d", i), file='abund_list', col.names=F, row.names=F, quote=F, append=T) } system("~/files/MaxBin-2.2.3/run_MaxBin.pl -contig CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta -abund_list abund_list -out resMB-1/bin") #0 hours 11 minutes and 5 seconds. #medium for(i in seq(4,10,2)) { system(sprintf("cut -f1,%d depth.txt| tail -n+2 > depth.txt.mb.%d", i,i)) write.table(sprintf("depth.txt.mb.%d", i), file='abund_list', col.names=F, row.names=F, quote=F, append=T) } #high for(i in seq(4,12,2)) { system(sprintf("cut -f1,%d depth.txt| tail -n+2 > depth.txt.mb.%d", i,i)) write.table(sprintf("depth.txt.mb.%d", i), file='abund_list', col.names=F, row.names=F, quote=F, append=T) } system("~/files/MaxBin-2.2.3/run_MaxBin.pl -contig CAMI_high_GoldStandardAssembly.fasta -abund_list abund_list -out mb/bin") system('metabat2 -i CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta.gz -a depth.txt -o resB-1/bin -v') system('metabat2 -i CAMI_medium_GoldStandardAssembly.fasta.gz -a depth.txt -o resB-1/bin -v') system('metabat2 -i CAMI_high_GoldStandardAssembly.fasta.gz -a depth.txt -o resB-1/bin -v') printPerf(list(MetaBAT=calcPerfCAMI("MetaBAT","high/resB-1/bin", complexity='high'))) #CONCOCT system('concoct --composition_file CAMI_low_RL_S001__insert_270_GoldStandardAssembly.fasta --coverage_file depth.txt.mb.4 2>CONCOCT.log') #--length_threshold 2500 system("paste depth.txt.mb.* | cut -f1,2,4,6,8 -d$'\t' > depth-only-medium.txt") system('concoct --composition_file CAMI_medium_GoldStandardAssembly.fasta --coverage_file depth-only-medium.txt 2>CONCOCT.log') system("paste depth.txt.mb.* | cut -f1,2,4,6,8,10 -d$'\t' > depth-only-high.txt") system('concoct --composition_file CAMI_high_GoldStandardAssembly.fasta --coverage_file depth-only-high.txt 2>CONCOCT.log') printPerf(list(CONCOCT=calcPerfCAMI("CONCOCT"))) res <- list(MetaBAT2=calcPerfCAMI("MetaBAT","MetaBATLow/bin",complexity='low'), MaxBin2=calcPerfCAMI("MaxBin","MaxBinLow/bin",complexity='low'), CONCOCT=calcPerfCAMI("CONCOCT",complexity='low'), MyCC=calcPerfCAMI("MaxBin","20170808_0126_4mer_0.7_cov/Cluster",complexity='low'), BinSanity=calcPerfCAMI("BinSanity","BinSanity-Final-bins",complexity='low'), COCACOLA=calcPerfCAMI("CONCOCT","result.csv",complexity='low')) res <- list(MetaBAT2=calcPerfCAMI("MetaBAT","MetaBATMed/bin",complexity='medium'), 
MaxBin2=calcPerfCAMI("MaxBin","MaxBinMed/bin",complexity='medium'), CONCOCT=calcPerfCAMI("CONCOCT",complexity='medium'), MyCC=calcPerfCAMI("MaxBin","20170808_1000_4mer_0.7_cov/Cluster",complexity='medium'), BinSanity=calcPerfCAMI("BinSanity","BinSanity-Final-bins",complexity='medium'), COCACOLA=calcPerfCAMI("CONCOCT","result.csv",complexity='medium')) res <- list(MetaBAT2=calcPerfCAMI("MetaBAT","MetaBATHigh/bin",complexity='high'), MaxBin2=calcPerfCAMI("MaxBin","MaxBinHigh/bin",complexity='high'), CONCOCT=calcPerfCAMI("CONCOCT",complexity='high'), MyCC=calcPerfCAMI("MaxBin","20170808_0155_4mer_0.7_cov/Cluster",complexity='high'), BinSanity=calcPerfCAMI("BinSanity","BinSanity-Final-bins",complexity='high'), COCACOLA=calcPerfCAMI("CONCOCT","result.csv",complexity='high')) printPerf(res) res <- res[c('MetaBAT2','MaxBin2','BinSanity','COCACOLA','CONCOCT','MyCC')] pdf("Rplots.pdf", width=12, height=4) plotPerf3(res, rec=seq(.5,.9,.1), legend.position=c(.95,.7)) dev.off()
berkeleylab-metabat-0db17a40717d/MetaBAT2PaperSupplementaryScripts/img100_scripts.tar.gz (binary archive; contents omitted)
berkeleylab-metabat-0db17a40717d/MetaBAT2PaperSupplementaryScripts/runBBmap.sh (beginning of the script is missing from the dump)
exit 1 fi ref=$(realpath $1) refdir=$ref.d in1=$(realpath $2) if [ ! -f "$in1" ] then echo "Could not find $in1" 1>&2 exit 1 fi interleaved=auto in2= if [ -f "$3" ] then in2=$(realpath $3) if [ ! -f "$in2" ] then echo "Could not find $in2" 1>&2 exit 1 fi else # detect interleaved read1= read2= if [ "$in1" == "${in1%gz}" ] then read1=$(head -1 $in1 | awk '{print $1}' | sed 's/\/[12]//;') read2=$(head -5 $in1 | tail -1 | awk '{print $1}' | sed 's/\/[12]//;') else read1=$(gunzip -c $in1 | head -1 | awk '{print $1}' | sed 's/\/[12]//;') read2=$(gunzip -c $in1 | head -5 | tail -1 | awk '{print $1}' | sed 's/\/[12]//;') fi if [ "$read1" == "$read2" ] then interleaved=true echo "Detected interleaved fastq" fi in2= fi bbopts=${bbopts:="samversion=1.4 local=t kbp=f minhits=2 minratio=0.8 maxindel=50 mdtag=true requirecorrectstrand=false trd=t interleaved=$interleaved usemodulo=t "} if [ -n "${idfilter}" ] then bbopts="${bbopts} idfilter=${idfilter}" fi mkdir -p $refdir inputs="in=$in1" if [ -f "$in2" ] then # split reads inputs="$inputs in2=$in2" fi out=${ref##*/}-${in1##*/}.sam outu="outu=$out.unmap.fastq.gz" if [ "${outputunmapped}" == 'f' ] then outu="outputunmapped=${outputunmapped}" fi (cd $refdir && [ -d ref/genome ] || $bbmap $bbopts ref=$ref build=1) tmpout=$TMPDIR/${out##*/} ( cd $refdir && \ $bbmap $bbopts $inputs build=1 outm=$tmpout.tmp.sam $outu && \ samtools view -Sbu $tmpout.tmp.sam | \ samtools sort -l 0 -m 1G -@ 12 -T $tmpout.dummy | \ samtools calmd -u - $ref | \ samtools view -b -@ 12 - > $out.tmp.bam && \ mv $out.tmp.bam $out.bam && \ samtools index $out.bam && \ rm $tmpout.tmp.sam || echo "sorting and indexing $sam failed" 1>&2 )
berkeleylab-metabat-0db17a40717d/README.md
One of design goals of original MetaBAT was to enable users to explore MetaBAT parameters efficiently so that they could find the best results out of their dataset. However, the task might have been challenging due to various reasons (e.g. too many parameters to optimize, lack of knowledge about parameters, lack of computing or time resources, etc.). And typical users were just stick to the default setting and the shortcuts (e.g. --sensitive, --specific, etc) at best. Even though it is amazing to get great bins without spending much effort to optimize parameters, sometimes it does not produce the best possible bins out of datasets. And we felt that was not users' responsibility but our's. So here we introduce MetaBAT 2: * It requires virtually no parameter optimization. Now, default parameters are more reliable to use in most cases since MetaBAT adapts to the given data to find the best parameter. Hopefully it will relieve users of having full responsibility to find the best parameters for each dataset. * There are some parameters remaining for advanced users. It will help out to manage some exceptional cases by changing the amount of data used for the analysis. * MetaBAT 1 might outperform MetaBAT 2 when there are many samples and the assembly quality is good, so we kept the original version as metabat1 and added metabat2 as a separate executable. Since v2.12.1, metabat executable points to metabat2. NOTICE: ------------ ***Since v2.12.1, metabat executable points to metabat2. To run MetaBAT 1, use metabat1 executable. *** ***Check out a new tutorial [Best Binning Practices](https://bitbucket.org/berkeleylab/metabat/wiki/Best%20Binning%20Practices). *** ***Check out CAMI benchmark with MetaBAT 2 [here](https://bitbucket.org/berkeleylab/metabat/wiki/CAMI). *** ***Check out the MetaBAT paper [here](https://peerj.com/articles/1165). *** ***Be careful to have bams sorted first!*** Running with Docker: ------------- ``` docker run metabat/metabat:latest runMetaBat.sh # For example: docker run --workdir $(pwd) --volume $(pwd):$(pwd) metabat/metabat:latest runMetaBat.sh test/contigs.fa test/contigs-1000.fastq.bam # See INSTALL.md to build your own docker image ``` INSTALLATION (non-Docker): ------------- Requirements: * boost >= 1.59.0 (dev and libs for boost_graph, system, filesystem and serialization) * python >= 2.7 * cmake >= 3.8.2 * gcc/g++ >= 4.9 or intel >= 18.0.1.163 or llvm >= 8.0 (htslib 1.9 is downloaded and installed automatically if not present on the system) ``` #!bash #clean up old files rm -f master.tar.gz rm -f dev.tar.gz rm -rf berkeleylab-metabat-* #stable release version wget https://bitbucket.org/berkeleylab/metabat/get/master.tar.gz tar xzvf master.tar.gz cd berkeleylab-metabat-* #latest development version wget https://bitbucket.org/berkeleylab/metabat/get/dev.tar.gz tar xzvf dev.tar.gz cd berkeleylab-metabat-* #run the installation script mkdir build && cd build && cmake .. [ -DCMAKE_INSTALL_PREFIX=/path/to/install ] && make && make test && make install ``` See INSTALL.md for Operating System specific installation instructions For technical supports, please open an issue with bitbucket's issue tracker: https://bitbucket.org/berkeleylab/metabat/issues MetaBAT 2 USAGE: running on command line -------------------------------- ***Be careful to have bams sorted first!*** The easy way: > runMetaBat.sh assembly.fasta sample1.bam [sample2.bam ...] 
The slightly less easy way: a) Generate a depth file from BAM files >jgi_summarize_bam_contig_depths --outputDepth depth.txt *.bam b) Run metabat >metabat2 -i assembly.fasta -a depth.txt -o bins_dir/bin To check MetaBAT options: > metabat2 -h ``` Allowed options: -h [ --help ] produce help message -i [ --inFile ] arg Contigs in (gzipped) fasta file format [Mandatory] -o [ --outFile ] arg Base file name and path for each bin. The default output is fasta format. Use -l option to output only contig names [Mandatory]. -a [ --abdFile ] arg A file having mean and variance of base coverage depth (tab delimited; the first column should be contig names, and the first row will be considered as the header and be skipped) [Optional]. -m [ --minContig ] arg (=2500) Minimum size of a contig for binning (should be >=1500). --maxP arg (=95) Percentage of 'good' contigs considered for binning decided by connection among contigs. The greater, the more sensitive. --minS arg (=60) Minimum score of a edge for binning (should be between 1 and 99). The greater, the more specific. --maxEdges arg (=200) Maximum number of edges per node. The greater, the more sensitive. --pTNF arg (=0) TNF probability cutoff for building TNF graph. Use it to skip the preparation step. (0: auto). --noAdd Turning off additional binning for lost or small contigs. --cvExt When a coverage file without variance (from third party tools) is used instead of abdFile from jgi_summarize_bam_contig_depths. -x [ --minCV ] arg (=1) Minimum mean coverage of a contig in each library for binning. --minCVSum arg (=1) Minimum total effective mean coverage of a contig (sum of depth over minCV) for binning. -s [ --minClsSize ] arg (=200000) Minimum size of a bin as the output. -t [ --numThreads ] arg (=0) Number of threads to use (0: use all cores). -l [ --onlyLabel ] Output only sequence labels as a list in a column without sequences. --saveCls Save cluster memberships as a matrix format --unbinned Generate [outFile].unbinned.fa file for unbinned contigs --noBinOut No bin output. Usually combined with --saveCls to check only contig memberships --seed arg (=0) For exact reproducibility. (0: use random seed) -d [ --debug ] Debug output -v [ --verbose ] Verbose output ``` Choice of Options: * In MetaBAT 2, parameter optimization will be unnecessary, though we allowed a few parameters so that advanced users might play with them. * You can decrease -m (--minContig) when the qualities of both assembly and formed bins with default value are very good. * You can decrease --maxP and --maxEdges when the qualities of both assembly and formed bins are very bad. * You can increase --maxEdges when the completeness level is low, for many datasets we typically use 500. * You can increase --minS when the qualities of both assembly and formed bins are very bad. * Set --noAdd when added small or leftover contigs cause too much contamination. * Set --pTNF positive numbers (1-99) to skip the TNF graph building preparation step. Otherwise, it will be automatically decided based on --maxP. Use this to reproduce previous result. * Set --seed positive numbers to reproduce the result exactly. Otherwise, random seed will be set each time. 
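Putting the options above together, a minimal sketch of a tuned MetaBAT 2 run (file paths, the output prefix and the thread count are placeholders; every flag used here is described above):

```
# compute per-sample depths from the sorted BAMs, then bin
jgi_summarize_bam_contig_depths --outputDepth depth.txt *.sorted.bam
metabat2 -i assembly.fasta -a depth.txt -o bins_dir/bin \
    -t 16 -m 1500 --maxEdges 500 --seed 1234 \
    --unbinned --saveCls -v
```

Here --maxEdges 500 follows the suggestion above for low-completeness cases, and a fixed --seed makes the run reproducible.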
MetaBAT 1 USAGE: running on command line -------------------------------- Run metabat >metabat1 -i assembly.fasta -a depth.txt -o bins_dir/bin To check MetaBAT options: > metabat1 -h ``` Allowed options: -h [ --help ] produce help message -i [ --inFile ] arg Contigs in (gzipped) fasta file format [Mandatory] -o [ --outFile ] arg Base file name for each bin. The default output is fasta format. Use -l option to output only contig names [Mandatory] -a [ --abdFile ] arg A file having mean and variance of base coverage depth (tab delimited; the first column should be contig names, and the first row will be considered as the header and be skipped) [Optional] --cvExt When a coverage file without variance (from third party tools) is used instead of abdFile from jgi_summarize_bam_contig_depths -p [ --pairFile ] arg A file having paired reads mapping information. Use it to increase sensitivity. (tab delimited; should have 3 columns of contig index (ordered by), its mate contig index, and supporting mean read coverage. The first row will be considered as the header and be skipped) [Optional] --p1 arg (=0) Probability cutoff for bin seeding. It mainly controls the number of potential bins and their specificity. The higher, the more (specific) bins would be. (Percentage; Should be between 0 and 100) --p2 arg (=0) Probability cutoff for secondary neighbors. It supports p1 and better be close to p1. (Percentage; Should be between 0 and 100) --minProb arg (=0) Minimum probability for binning consideration. It controls sensitivity. Usually it should be >= 75. (Percentage; Should be between 0 and 100) --minBinned arg (=0) Minimum proportion of already binned neighbors for one's membership inference. It contorls specificity. Usually it would be <= 50 (Percentage; Should be between 0 and 100) --verysensitive For greater sensitivity, especially in a simple community. It is the shortcut for --p1 90 --p2 85 --pB 20 --minProb 75 --minBinned 20 --minCorr 90 --sensitive For better sensitivity [default]. It is the shortcut for --p1 90 --p2 90 --pB 20 --minProb 80 --minBinned 40 --minCorr 92 --specific For better specificity. Different from --sensitive when using correlation binning or ensemble binning. It is the shortcut for --p1 90 --p2 90 --pB 30 --minProb 80 --minBinned 40 --minCorr 96 --veryspecific For greater specificity. No correlation binning for short contig recruiting. It is the shortcut for --p1 90 --p2 90 --pB 40 --minProb 80 --minBinned 40 --superspecific For the best specificity. It is the shortcut for --p1 95 --p2 90 --pB 50 --minProb 80 --minBinned 20 --minCorr arg (=0) Minimum pearson correlation coefficient for binning missed contigs to increase sensitivity (Helpful when there are many samples). Should be very high (>=90) to reduce contamination. (Percentage; Should be between 0 and 100; 0 disables) --minSamples arg (=10) Minimum number of sample sizes for considering correlation based recruiting -x [ --minCV ] arg (=1) Minimum mean coverage of a contig to consider for abundance distance calculation in each library --minCVSum arg (=2) Minimum total mean coverage of a contig (sum of all libraries) to consider for abundance distance calculation -s [ --minClsSize ] arg (=200000) Minimum size of a bin to be considered as the output -m [ --minContig ] arg (=2500) Minimum size of a contig to be considered for binning (should be >=1500; ideally >=2500). If # of samples >= minSamples, small contigs (>=1000) will be given a chance to be recruited to existing bins by default. 
--minContigByCorr arg (=1000) Minimum size of a contig to be considered for recruiting by pearson correlation coefficients (activated only if # of samples >= minSamples; disabled when minContigByCorr > minContig) -t [ --numThreads ] arg (=0) Number of threads to use (0: use all cores) --minShared arg (=50) Percentage cutoff for merging fuzzy contigs --fuzzy Binning with fuzziness which assigns multiple memberships of a contig to bins (activated only with --pairFile at the moment) -l [ --onlyLabel ] Output only sequence labels as a list in a column without sequences -S [ --sumLowCV ] If set, then every sample that falls below the minCV will be used in an aggregate sample -V [ --maxVarRatio ] arg (=0) Ignore any contigs where variance / mean exceeds this ratio (0 disables) --saveTNF arg File to save (or load if exists) TNF matrix for each contig in input --saveDistance arg File to save (or load if exists) distance graph at lowest probability cutoff --saveCls Save cluster memberships as a matrix format --unbinned Generate [outFile].unbinned.fa file for unbinned contigs --noBinOut No bin output. Usually combined with --saveCls to check only contig memberships -B [ --B ] arg (=20) Number of bootstrapping for ensemble binning (Recommended to be >=20) --pB arg (=50) Proportion of shared membership in bootstrapping. Major control for sensitivity/specificity. The higher, the specific. (Percentage; Should be between 0 and 100) --seed arg (=0) For reproducibility in ensemble binning, though it might produce slightly different results. (0: use random seed) --keep Keep the intermediate files for later usage -d [ --debug ] Debug output -v [ --verbose ] Verbose output ``` Choice of Options: * '-i' input file should be either fasta or gzipped fasta file. (since v0.32.3) * If '-a [--abdFile]' option is not given, TNF only binning will be executed. * -p option is for utilizing paired info from short reads. It may improve sensitivity. * '--p1' and '--p2' should be both high to maintain great specificity. Usually p1 >= p2 performs better. * --minProb mainly controls the scope and sensitivity of binning. A smaller number improves sensitivity. It should be < p1, p2. * --minBinned mainly controls the specificity. A greater number improves specificity. Usually <= 50. * Use --verysensitive on simple community for more inclusive binning. * --minCorr would include contigs which are closely correlated in abundance but somewhat different in absolute abundance. More effective in availability of many samples. * Recruiting by correlation would be activated only if # of samples >= minSamples and be disabled (for better specificity) when minContigByCorr > minContig. * --veryspecific and --superspecific will not recruit small contigs by abundance correlation. * Coverage in a sample less than the number given with '-x [--minCV]' option will be ignored. * Bin size less than the number given with '-s [--minClsSize]' option will not be reported. * Contigs smaller than the length cutoff given with '-m [--minContig]' option will not be used in binning. The cutoff should be >= 1500; ideally >=2500. * Smaller contigs (>1000) will be given a chance to be recruited to existing bins when # of samples >= minSamples by default setting. * Use '-l [--onlyLabel]' when it is not necessary to record sequences. In this case, only the labels will be reported. In this case, use --noBinOut to prevent producing individual bins output. * If '-S [--sumLowCV]' is set, the coverages smallers than minCV will be aggregated. 
It may improve performance in certain cases. * If any number greater than 0 is given by '-V [--maxVarRatio]' option, contigs having spurious coverage pattern will be ignored. * '--saveDistance' option saves a lot of computations when multiple binning attempts are executed with different parameter settings. * '--unbinned' option generates a file for unbinned contigs. * '-B' option is for ensemble binning. Recommended to be 20 or more. Should be >= 10 for reasonable results. It tends to generate reduced number of better quality bins at the cost of some additional computation. * '--pB' option controls for sensitivity and specificity tradeoff in ensemble binning. The smaller, the sensitive. Range is between 0 to 100. The default is 50. * Produced bins would be stochastic if ensemble binning was used. --seed would minimize the stochasticity but still there would be slight difference. OUTPUT: ------- Each discovered bin will be saved as a fasta format NOTES: ------- The proper settings on the read aligner should be set to evenly distribute ambiguously mapping reads (the default option for bowtie2, bwa, and bbmap). jgi_summarize_bam_contig_depths USAGE: ----- > jgi_summarize_bam_contig_depths ``` Usage: jgi_summarize_bam_contig_depths sortedBam1 [ sortedBam2 ...] where options include: --outputDepth arg The file to put the contig by bam depth matrix (default: STDOUT) --percentIdentity arg The minimum end-to-end % identity of qualifying reads (default: 97) --pairedContigs arg The file to output the sparse matrix of contigs which paired reads span (default: none) --unmappedFastq arg The prefix to output unmapped reads from each bam file suffixed by 'bamfile.bam.fastq.gz' --noIntraDepthVariance Do not include variance from mean depth along the contig --showDepth Output a .depth file per bam for each contig base --includeEdgeBases When calculating depth & variance, include the 1-readlength edges (off by default) --maxEdgeBases When calculating depth & variance, and not --includeEdgeBases, the maximum length (default:75) Options to control shredding contigs that are under represented by the reads --referenceFasta arg The reference file. (It must be the same fasta that bams used) --shredLength arg The maximum length of the shreds --shredDepth arg The depth to generate overlapping shreds --minContigLength arg The mimimum length of contig to include for mapping and shredding --minContigDepth arg The minimum depth along contig at which to break the contig ``` MetaBAT Adjusted coverage depths -------------------------------- The algorithm that jgi_summarize_bam_contig_depths uses for calculating coverage depth for each sequence in the assembly is adjusted by a few factors to improve the fidelity of the metrics when correlating abundance coverage in the binning stage. By default the following adjustments are applied: 1) **Edge Bases are ignored** > Edge bases are not counted as coverage, by the lesser of 1 AverageReadLength or (--maxEdgeBases=75). > This is because most mappers can not reliably place a read that would extend off the edge of a sequence, and coverage depth tends to drop towards 0 at the edge of a contig or scaffold. > Use --includeEdgeBases to include the coverage in this region. 2) **Reads with high mapping errors are skipped** > Reads that map imperfectly are excluded when the %ID of the mapping drops below a threshold (--percentIdentity=97). 
> MetaBAT is designed to resolve strain variation and mapping reads with low %ID indicate that the read actually came > from a different strain/species. > %ID is calculated from the CIGAR string and/or NM/MD fields and == 100 * MatchedBases / (MatchedBases + Substituions + Insertions + Deletions) > This ensures that clips, insertions, deletions and mismatches are excluded from the coverage count. Only the read bases that exactly match the reference are counted as coverage. This generally has a small effect, except in the case of long reads from PacBio and Nanopore. Example with real data -------------------------------- Description of Data: * 2 libraries of next-gen sequencing data of a mock community. * The community is composed of 25 known genomes. * Data is available to download: http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Software/Mockup/ After downloading the assembly file and two bam files, run the following command line: >runMetaBat.sh assembly.fa *.bam MetaBAT forms about 28 bins. In this example, since we know the true membership of each contig, we can calculate the completeness and precision of a bin correctly. ```R #The following is R commands (tested on Linux) library(plyr) library(foreach) options(width=150) refs <- read.table("membership.txt", sep="\t", header=T) files <- list.files(".", pattern="*.[0-9]+.fa$", full.name=F) bins <- foreach(f=files) %do% { system(sprintf("grep '>' %s | sed 's/>//'", f), intern=TRUE) } res <- foreach(b=bins, .combine=cbind) %do% { ddply(refs[match(b, refs$contig),], .(reference), function(x) sum(x$size), .drop=F) } rownames(res) <- res[,1]; res <- res[, seq(2,ncol(res),2)] colnames(res) <- sapply(strsplit(files, "\\."), function(x) x[length(x)-1]) genome.size <- ddply(refs, .(reference), function(x) c(size=sum(x$size))) genome.size <- genome.size[match(rownames(res), genome.size$reference),] res.prec <- apply(res, 2, function(x) max(x)/sum(x)) res.comp <- apply(res, 2, function(x) max(x)/genome.size$size[which.max(x)]) res.ref <- apply(res, 2, function(x) rownames(res)[which.max(x)]) res2 <- cbind.data.frame(Bin=names(res.prec), Size=colSums(res), 'Compl.'=res.comp, 'Prec.'=res.prec, 'Ref.'=res.ref) res2 summary(res2[,2:4]) #Median bin size was 3.72Mb with 97% median completeness (or 80% by mean) and 100% median precision (or 96% by mean). length(unique(res2$Ref.)) # 24 out of 25 genomes were binned. ``` The output will look like the following: ``` >res2 Bin Size Compl. Prec. Ref. 
1 1 7860794 0.97635160 0.4832147 Natronobacterium gregoryi SP2, DSM 3393 10 10 4115670 0.81602285 1.0000000 Spirochaeta smaragdinae DSM 11293 11 11 3921774 0.86124825 1.0000000 Natronococcus occultus SP4, DSM 3396 12 12 3874303 0.98891653 1.0000000 Hirschia baltica ATCC 49814 13 13 3839986 0.99090122 1.0000000 Coraliomargarita akajimensis DSM 45221 14 14 3730710 0.98314815 1.0000000 Clostridium thermocellum ATCC 27405 15 15 3704412 0.87120817 1.0000000 Thermobacillus composti KWC4, DSM 18247 16 16 3684573 0.86379200 1.0000000 Frateuria aurantia Kondo 67, DSM 6220 17 17 3546737 0.98625152 1.0000000 Meiothermus silvanus DSM 9946 18 18 3236335 0.99686126 1.0000000 Clostridium perfringens ATCC 13124 19 19 3173259 0.99905801 1.0000000 Segniliparus rotundus DSM 44985 2 2 5619150 0.99822442 1.0000000 Echinicola vietnamensis KMM 6221, DSM 17526 20 20 2924873 0.89827162 1.0000000 Corynebacterium glutamicum ATCC 13032 21 21 2147757 0.98982458 1.0000000 Fervidobacterium pennivorans Ven5, DSM 9078 22 22 2125683 0.98312892 1.0000000 Olsenella uli DSM 7084 23 23 1805558 0.96656297 1.0000000 Streptococcus pyogenes M1 GAS 24 24 703485 0.23799984 0.7072745 Salmonella bongori NCTC 12419 25 25 576231 0.11425057 1.0000000 Spirochaeta smaragdinae DSM 11293 26 26 351671 0.06972657 1.0000000 Spirochaeta smaragdinae DSM 11293 27 27 292949 0.06889611 1.0000000 Thermobacillus composti KWC4, DSM 18247 28 28 274427 0.08428058 1.0000000 Corynebacterium glutamicum ATCC 13032 3 3 5272697 0.92686085 0.7153358 Escherichia coli str. K-12 substr. MG1655 4 4 5090248 0.97611998 1.0000000 Desulfotomaculum gibsoniae Groll, DSM 7213 5 5 4940344 0.99375427 1.0000000 Desulfosporosinus meridiei S10, DSM 13257 6 6 4829038 0.96953761 1.0000000 Desulfosporosinus acidophilus SJ4, DSM 22704 7 7 4766169 0.98037872 1.0000000 Terriglobus roseus KBS 63, DSM 18391 8 8 4753789 0.91043046 1.0000000 Salmonella enterica subsp. arizonae serovar 62 9 9 4735197 0.98375082 1.0000000 Pseudomonas stutzeri RCH2 >summary(res2[,2:4]) Size Compl. Prec. Min. : 274427 Min. :0.0689 Min. :0.4832 1st Qu.:2142238 1st Qu.:0.8632 1st Qu.:1.0000 Median :3717561 Median :0.9728 Median :1.0000 Mean :3424922 Mean :0.8031 Mean :0.9609 3rd Qu.:4756884 3rd Qu.:0.9869 3rd Qu.:1.0000 Max. :7860794 Max. :0.9991 Max. :1.0000 ``` ``` MetaBAT Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved. If you have questions about your rights to use or distribute this software, please contact Berkeley Lab's technology transfer department at TTD@lbl.gov referring to "MetaBAT (2014-075)." NOTICE. This software was developed under funding from the U.S. Department of Energy. As such, the U.S. Government has been granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, and perform publicly and display publicly. Beginning five (5) years after the date permission to assert copyright is obtained from the U.S. Department of Energy, and subject to any subsequent five (5) year renewals, the U.S. Government is granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, distribute copies to the public, perform publicly and display publicly, and to permit others to do so. 
``` berkeleylab-metabat-0db17a40717d/aggregateBinDepths.pl000077500000000000000000000025231360417103500225660ustar00rootroot00000000000000#!/usr/bin/perl use warnings; use strict; our $USAGE = "$0 depth.txt bin1.fa [...]\n\n"; die $USAGE unless scalar(@ARGV) >= 2; our %hdepths; open(my $d, "<", $ARGV[0]) || die; my $header = <$d>; my $reg = qr/^([^\t]+)\t([^\t]+)\t([^\t]+)\t/; my $reg2 = qr/^(\S+)/; while (<$d>) { my($contig, $len, $totalDepth) = $_ =~ $reg; my($contigName) = $contig =~ $reg2; $hdepths{$contigName} = [$len, $totalDepth]; } close($d) || die; shift; printf("bin\ttotalLength\tAvgDepth\tStdDev\n"); my %hbins; my @lbins; foreach my $file (@ARGV) { open(my $f, "<", $file) || die; $reg = qr/^>(\S+)/; my @l_contigs; $hbins{$file} = \@l_contigs; push @lbins, $file; while (<$f>) { if ($_ =~ $reg) { my $contig = $1; push @l_contigs, $contig; } } close($f) || die; my $totalLen = 0; my $totalDepth = 0; foreach my $contig (@l_contigs) { my $rl = $hdepths{$contig}; $totalLen += $rl->[0]; $totalDepth += $rl->[0] * $rl->[1]; } my($avgDepth, $stdDev) = (0,0); if ($totalLen > 0) { $avgDepth = $totalDepth / $totalLen; foreach my $contig (@l_contigs) { my $rl = $hdepths{$contig}; my $diff = $avgDepth - $rl->[1]; $stdDev += $diff * $diff * $rl->[0] / $totalLen; } $stdDev = sqrt( $stdDev ); } printf("%s\t%d\t%0.2f\t%0.2f\n", $file, $totalLen, $avgDepth, $stdDev); } berkeleylab-metabat-0db17a40717d/aggregateContigOverlapsByBin.pl000077500000000000000000000055331360417103500245750ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; our $USAGE = "$0 contigOverlaps.txt firstBinsContigNames.fofn secondBinsContigNames.fofn [...]\n"; die $USAGE unless scalar(@ARGV >= 3); my $contigsOverlapFile = shift; open(my $overlapfh, "<", $contigsOverlapFile) || die; my $overlapHeader = <$overlapfh>; our @llhh_assemAassemBbinAbinBCounts; our @llh_assemBinReadCounts; our @lh_assemContigBins; foreach my $binfofn (@ARGV) { open(my $fhfofn, "<", $binfofn) || die "Could not open $binfofn! $!"; my %h_contigBins; push @lh_assemContigBins, \%h_contigBins; while (my $binFile = <$fhfofn>) { chomp($binFile); open(my $fh, "<", $binFile) || die "Could not open $binFile! 
$!";; my $binName = $binFile; $binName =~ s#.*/##; while (my $contigName = <$fh>) { chomp($contigName); $contigName =~ s/[ \t].*//; # hack to fix reference name mangling $contigName =~ s/[\[]/_/; $contigName =~ s/[\]]//; $h_contigBins{$contigName} = $binName; #print STDERR "$contigName -> $binName\n"; } close($fh) || die; } close($fhfofn) || die } while (<$overlapfh>) { my ($assemblyi, $assemblyj, $contigi, $contigj, $overlap, $totali, $percent) = split(/\t/); $contigi =~ s/\s.*//; $contigj =~ s/\s.*//; if (exists $lh_assemContigBins[$assemblyi]{$contigi}) { my $binAName = $lh_assemContigBins[$assemblyi]{$contigi}; my $binBName = "*"; if ($binBName ne $contigj) { $binBName = $lh_assemContigBins[$assemblyj]{$contigj}; if (not defined $binBName) { #warn "There is no bin for $contigj in $ARGV[$assemblyj]\n"; next; } } $llh_assemBinReadCounts[$assemblyi][$assemblyj]{$binAName} += $overlap; $llhh_assemAassemBbinAbinBCounts[$assemblyi][$assemblyj]{$binAName}{$binBName} += $overlap; } } printf("assemblyA\tassemblyB\tbinA\tbinB\toverlap\ttotalA\t%%\n"); for(my $assemblyi = 0; $assemblyi < scalar(@ARGV); $assemblyi++) { for(my $assemblyj = 0; $assemblyj < scalar(@ARGV); $assemblyj++) { my $rh_binReadCounts = $llh_assemBinReadCounts[$assemblyi][$assemblyj]; if ($assemblyi == $assemblyj) { next; } my $rh_BinABinBCounts = $llhh_assemAassemBbinAbinBCounts[$assemblyi][$assemblyj]; foreach my $binAName (sort keys %{$rh_binReadCounts}) { my $total = $rh_binReadCounts->{$binAName}; while(my($binBName, $overlap) = each %{$llhh_assemAassemBbinAbinBCounts[$assemblyi][$assemblyj]{$binAName}}) { my $frac = $total > 0 ? $overlap/$total : 0; if ($frac > 0.001) { printf("%d\t%d\t%s\t%s\t%d\t%d\t%0.2f\n", $assemblyi, $assemblyj, $binAName, $binBName, $overlap, $total, 100*$frac); } } } } } berkeleylab-metabat-0db17a40717d/bitbucket-pipelines.yml000066400000000000000000000025231360417103500231640ustar00rootroot00000000000000 # This has been disabled due to costs on bitbucket. - RSE 2019 # This repo is mirrored at gitlab where the docker repo is now built # https://gitlab.com/robegan21/MetaBAT/pipelines # This is a sample build configuration for Docker. # Check our guides at https://confluence.atlassian.com/x/O1toN for more examples. # Only use spaces to indent your .yml configuration. # ----- # You can specify a custom docker image from Docker Hub as your build environment. # image: atlassian/default-image:latest # enable Docker for your repository #options: # docker: true # #pipelines: # default: # - step: # script: # Modify the commands below to build your repository. # # Set $DOCKER_HUB_USERNAME and $DOCKER_HUB_PASSWORD as environment variables in repository settings # - set -e # - export IMAGE_NAME=metabat/metabat # - docker version # # build the Docker image (this will use the Dockerfile in the root of the repo) # - docker build -t $IMAGE_NAME:$BITBUCKET_COMMIT . # - if [ "$BITBUCKET_BRANCH" == "master" ] ; then docker build -t $IMAGE_NAME:latest . ; fi # # authenticate with the Docker Hub registry # - docker login --username metabat --password $DOCKER_PASSWORD # # push the new Docker image to the Docker registry # - docker push $IMAGE_NAME berkeleylab-metabat-0db17a40717d/cmake/000077500000000000000000000000001360417103500175555ustar00rootroot00000000000000berkeleylab-metabat-0db17a40717d/cmake/LICENSE000066400000000000000000000030621360417103500205630ustar00rootroot00000000000000Firepony Copyright (c) 2014-2015, NVIDIA CORPORATION Copyright (c) 2015, Nuno Subtil All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. berkeleylab-metabat-0db17a40717d/cmake/git-watcher.cmake000066400000000000000000000167201360417103500230030ustar00rootroot00000000000000# git_watcher.cmake # # License: MIT # Source: https://raw.githubusercontent.com/andrew-hardin/cmake-git-version-tracking/master/git_watcher.cmake # This file defines the functions and targets needed to monitor # the state of a git repo. If the state changes (e.g. a commit is made), # then a file gets reconfigured. # # The behavior of this script can be modified by defining any of these variables: # # PRE_CONFIGURE_FILE (REQUIRED) # -- The path to the file that'll be configured. # # POST_CONFIGURE_FILE (REQUIRED) # -- The path to the configured PRE_CONFIGURE_FILE. # # GIT_STATE_FILE (OPTIONAL) # -- The path to the file used to store the previous build's git state. # Defaults to the current binary directory. # # GIT_WORKING_DIR (OPTIONAL) # -- The directory from which git commands will be run. # Defaults to the directory with the top level CMakeLists.txt. # # GIT_EXECUTABLE (OPTIONAL) # -- The path to the git executable. It'll automatically be set if the # user doesn't supply a path. # # Script design: # - This script was designed similar to a Python application # with a Main() function. I wanted to keep it compact to # simplify "copy + paste" usage. # # - This script is made to operate in two CMake contexts: # 1. Configure time context (when build files are created). # 2. Build time context (called via CMake -P) # If you see something odd (e.g. the NOT DEFINED clauses), # consider that it can run in one of two contexts. # Short hand for converting paths to absolute. macro(PATH_TO_ABSOLUTE var_name) get_filename_component(${var_name} "${${var_name}}" ABSOLUTE) endmacro() # Check that a required variable is set. macro(CHECK_REQUIRED_VARIABLE var_name) if(NOT DEFINED ${var_name}) message(FATAL_ERROR "The \"${var_name}\" variable must be defined.") endif() PATH_TO_ABSOLUTE(${var_name}) endmacro() # Check that an optional variable is set, or, set it to a default value. 
macro(CHECK_OPTIONAL_VARIABLE var_name default_value) if(NOT DEFINED ${var_name}) set(${var_name} ${default_value}) endif() PATH_TO_ABSOLUTE(${var_name}) endmacro() CHECK_REQUIRED_VARIABLE(PRE_CONFIGURE_FILE) CHECK_REQUIRED_VARIABLE(POST_CONFIGURE_FILE) CHECK_OPTIONAL_VARIABLE(GIT_STATE_FILE "${CMAKE_BINARY_DIR}/git-state") CHECK_OPTIONAL_VARIABLE(GIT_WORKING_DIR "${CMAKE_SOURCE_DIR}") # Check the optional git variable. # If it's not set, we'll try to find it using the CMake packaging system. if(NOT DEFINED GIT_EXECUTABLE) find_package(Git QUIET REQUIRED) endif() CHECK_REQUIRED_VARIABLE(GIT_EXECUTABLE) # Function: GitStateChangedAction # Description: this function is executed when the state of the git # repo changes (e.g. a commit is made). function(GitStateChangedAction _state_as_list) # Set variables by index, then configure the file w/ these variables defined. LIST(GET _state_as_list 0 GIT_RETRIEVED_STATE) LIST(GET _state_as_list 1 GIT_HEAD_SHA1) LIST(GET _state_as_list 2 GIT_IS_DIRTY) string(TIMESTAMP _time_stamp) set(SKIP_IT false) if (EXISTS "${POST_CONFIGURE_FILE}") if (NOT GIT_RETRIEVED_STATE) set(SKIP_IT true) message("Skipping creation of ${POST_CONFIGURE_FILE} as git is not available") endif() endif() if (NOT SKIP_IT) message("Creating new ${POST_CONFIGURE_FILE} with ${GIT_RETRIEVED_STATE} ${TIMESTAMP}") configure_file("${PRE_CONFIGURE_FILE}" "${POST_CONFIGURE_FILE}" @ONLY) endif() endfunction() # Function: GetGitState # Description: gets the current state of the git repo. # Args: # _working_dir (in) string; the directory from which git commands will be executed. # _state (out) list; a collection of variables representing the state of the # repository (e.g. commit SHA). function(GetGitState _working_dir _state) # Get the hash for HEAD. set(_success "true") execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags WORKING_DIRECTORY "${_working_dir}" RESULT_VARIABLE res OUTPUT_VARIABLE _hashvar ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if(NOT res EQUAL 0) set(_success "false") set(_hashvar "GIT-NOTFOUND") endif() # Get whether or not the working tree is dirty. execute_process(COMMAND "${GIT_EXECUTABLE}" status --porcelain WORKING_DIRECTORY "${_working_dir}" RESULT_VARIABLE res OUTPUT_VARIABLE out ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if(NOT res EQUAL 0) set(_success "false") set(_dirty "false") else() if(NOT "${out}" STREQUAL "") set(_dirty "true") else() set(_dirty "false") endif() endif() # Return a list of our variables to the parent scope. set(${_state} ${_success} ${_hashvar} ${_dirty} PARENT_SCOPE) endfunction() # Function: CheckGit # Description: check if the git repo has changed. If so, update the state file. # Args: # _working_dir (in) string; the directory from which git commands will be ran. # _state_changed (out) bool; whether or no the state of the repo has changed. # _state (out) list; the repository state as a list (e.g. commit SHA). function(CheckGit _working_dir _state_changed _state) # Get the current state of the repo. GetGitState("${_working_dir}" state) # Set the output _state variable. # (Passing by reference in CMake is awkward...) set(${_state} ${state} PARENT_SCOPE) # Check if the state has changed compared to the backup on disk. if(EXISTS "${GIT_STATE_FILE}") file(READ "${GIT_STATE_FILE}" OLD_HEAD_CONTENTS) if(OLD_HEAD_CONTENTS STREQUAL "${state}") # State didn't change. set(${_state_changed} "false" PARENT_SCOPE) return() endif() endif() # The state has changed. # We need to update the state file on disk. 
# Future builds will compare their state to this file. file(WRITE "${GIT_STATE_FILE}" "${state}") set(${_state_changed} "true" PARENT_SCOPE) endfunction() # Function: SetupGitMonitoring # Description: this function sets up custom commands that make the build system # check the state of git before every build. If the state has # changed, then a file is configured. function(SetupGitMonitoring) add_custom_target(check_git_repository ALL DEPENDS ${PRE_CONFIGURE_FILE} BYPRODUCTS ${POST_CONFIGURE_FILE} COMMENT "Checking the git repository for changes..." COMMAND ${CMAKE_COMMAND} -D_BUILD_TIME_CHECK_GIT=TRUE -DGIT_WORKING_DIR=${GIT_WORKING_DIR} -DGIT_EXECUTABLE=${GIT_EXECUTABLE} -DGIT_STATE_FILE=${GIT_STATE_FILE} -DPRE_CONFIGURE_FILE=${PRE_CONFIGURE_FILE} -DPOST_CONFIGURE_FILE=${POST_CONFIGURE_FILE} -P "${CMAKE_CURRENT_LIST_FILE}") endfunction() # Function: Main # Description: primary entry-point to the script. Functions are selected based # on whether it's configure or build time. function(Main) if(_BUILD_TIME_CHECK_GIT) # Check if the repo has changed. # If so, run the change action. CheckGit("${GIT_WORKING_DIR}" did_change state) if(did_change) GitStateChangedAction("${state}") endif() else() # >> Executes at configure time. SetupGitMonitoring() endif() endfunction() # And off we go... Main() berkeleylab-metabat-0db17a40717d/cmake/htslib.cmake000066400000000000000000000023531360417103500220470ustar00rootroot00000000000000set(htslib_PREFIX ${CMAKE_BINARY_DIR}/contrib/htslib-prefix) set(htslib_INSTALL ${CMAKE_BINARY_DIR}/contrib/htslib-install) if (CMAKE_GENERATOR STREQUAL "Unix Makefiles") # when using the makefile generator, use the special variable $(MAKE) to invoke make # this enables the jobserver to work correctly set(MAKE_COMMAND "$(MAKE)") else() # invoke make explicitly # in this case, we assume the parent build system is running in parallel already so no -j flag is added find_program(MAKE_COMMAND NAMES make gmake) endif() ExternalProject_Add(htslib PREFIX ${htslib_PREFIX} GIT_REPOSITORY "https://github.com/samtools/htslib.git" GIT_TAG "1.9" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 #CONFIGURE_COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/contrib/htslib-prefix/src/htslib/configure" #CONFIGURE_COMMAND "autoheader" #CONFIGURE_COMMAND "autoconf" CONFIGURE_COMMAND autoheader && autoconf && ./configure --disable-bz2 --disable-lzma --disable-libcurl BUILD_COMMAND ${MAKE_COMMAND} lib-static INSTALL_COMMAND ${MAKE_COMMAND} install prefix=${htslib_INSTALL} LOG_DOWNLOAD 1 ) add_dependencies(htslib zlib) include_directories(${htslib_INSTALL}/include) set(htslib_LIB ${htslib_INSTALL}/lib/libhts.a) berkeleylab-metabat-0db17a40717d/cmake/zlib.cmake000066400000000000000000000014501360417103500215170ustar00rootroot00000000000000# build zlib set(zlib_PREFIX ${CMAKE_BINARY_DIR}/contrib/zlib-prefix) set(zlib_INSTALL ${CMAKE_BINARY_DIR}/contrib/zlib-install) ExternalProject_Add(zlib PREFIX ${zlib_PREFIX} # GIT_REPOSITORY "https://github.com/jtkukunas/zlib.git" # GIT_TAG "e176b3c23ace88d5ded5b8f8371bbab6d7b02ba8" GIT_REPOSITORY "https://github.com/madler/zlib.git" GIT_TAG "v1.2.11" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 CONFIGURE_COMMAND ${zlib_PREFIX}/src/zlib/configure --prefix=${zlib_INSTALL} --static INSTALL_DIR ${zlib_INSTALL} # CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} # -DCMAKE_INSTALL_PREFIX=${zlib_INSTALL} # -DAMD64=ON LOG_DOWNLOAD 1 LOG_INSTALL 1 ) include_directories(${zlib_INSTALL}/include) set(zlib_LIB ${zlib_INSTALL}/lib/libz.a) 
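# The two ExternalProject modules above (htslib.cmake and zlib.cmake) only build
# static libhts/libz and expose them through the `htslib`/`zlib` targets and the
# ${htslib_LIB}/${zlib_LIB} variables; the consuming targets live in
# src/CMakeLists.txt, which is not shown here. As a rough, non-authoritative
# sketch of how such a consumer is typically wired up (the executable and source
# file names below are illustrative assumptions, not the project's actual code):
#
#   add_executable(jgi_summarize_bam_contig_depths jgi_summarize_bam_contig_depths.cpp)
#   # ensure the ExternalProject builds finish before this target links
#   add_dependencies(jgi_summarize_bam_contig_depths htslib)
#   # link the static archives produced above; order matters for static linking
#   target_link_libraries(jgi_summarize_bam_contig_depths ${htslib_LIB} ${zlib_LIB} pthread)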
berkeleylab-metabat-0db17a40717d/license.txt000066400000000000000000000046121360417103500206630ustar00rootroot00000000000000********** MetaBAT (2014-075), The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the features, functionality or performance of the source code ("Enhancements") to anyone; however, if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley National Laboratory, without imposing a separate written license agreement for such Enhancements, then you hereby grant the following license: a non-exclusive, royalty-free perpetual license to install, use, modify, prepare derivative works, incorporate into other computer software, distribute, and sublicense such enhancements or derivative works thereof, in binary and source code form. ********** berkeleylab-metabat-0db17a40717d/merge_depths.pl000077500000000000000000000025041360417103500215040ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; my $USAGE="$0 depth1.txt depth2.txt [..] This simple script will combine the depth files as output by jgi_summarize_bam_contig_depths into a single file. "; if (@ARGV < 2) { die($USAGE); } my @l_fh = (); my @l_headers = (); for my $filename (@ARGV) { my $fh; open($fh, "<", $filename) || die("Could not open $filename! $!\n"); push @l_fh, $fh; my $firstline = <$fh>; chomp($firstline); my @headers = split("\t", $firstline); if (scalar(@l_headers) == 0) { push @l_headers, @headers; } else { shift @headers; shift @headers; shift @headers; push @l_headers, @headers; } } print join("\t", @l_headers) . 
"\n"; my $end = 0; while(not $end) { my @line = (); my $name = undef; my $len = undef; my $avg = 0.0; foreach my $fh (@l_fh) { my $line = <$fh>; if (not defined $line) { $end = 1; last; } chomp($line); my @fields = split("\t", $line); if (not defined $name) { $name = $fields[0]; $len = $fields[1]; } elsif ($len != $fields[1]) { die("Files do not match! $name $len vs '$line'\n"); } $avg += $fields[2]; shift @fields; shift @fields; shift @fields; push @line, @fields; } if (defined $name) { print join("\t", $name, $len, $avg, @line) . "\n"; } } berkeleylab-metabat-0db17a40717d/metabat_version.h.in000066400000000000000000000005661360417103500224440ustar00rootroot00000000000000// This file was create automatically by CMake. #pragma once // Whether or not we retrieved the state of the repo. #define GIT_RETRIEVED_STATE @GIT_RETRIEVED_STATE@ // The SHA1 for the HEAD of the repo. #define VERSION "@GIT_HEAD_SHA1@" // Whether or not there were uncommited changes present. #define GIT_IS_DIRTY @GIT_IS_DIRTY@ #define BUILD_TIMESTAMP "@_time_stamp@" berkeleylab-metabat-0db17a40717d/opt/000077500000000000000000000000001360417103500172775ustar00rootroot00000000000000berkeleylab-metabat-0db17a40717d/opt/benchmark.R000066400000000000000000000645501360417103500213660ustar00rootroot00000000000000requireAll <- function(packages) { dir.create("~/Rlibs", showWarnings=FALSE) .libPaths("~/Rlibs") .packages <- setdiff(packages, installed.packages()[,'Package']) if(length(.packages)>0) { suppressWarnings(rm(biocLite, envir=.GlobalEnv)) source("http://bioconductor.org/biocLite.R") biocLite(.packages, dependencies=TRUE, ask=FALSE, suppressUpdates=TRUE, lib="~/Rlibs") } for(package in packages) suppressPackageStartupMessages(do.call(library, list(package))) } requireAll(c('ggplot2','foreach','plyr','reshape2')) calcPerf <- function(type=c("MetaBAT","CONCOCT","GroopM","MaxBin","Canopy"), file="clustering_gt1000.csv", prof=NULL, minSize=200000) { type <- match.arg(type) if(!file.exists("contigs.txt")) { system("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/contigs.txt") if(!file.exists("contigs.txt")) stop("Cannot find contigs.txt. Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/") } if(!file.exists("genomes.txt")) { system("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/genomes.txt") if(!file.exists("genomes.txt")) stop("Cannot find genomes.txt. 
Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/") } set.seed(94521) contigs <- read.table("contigs.txt", sep="\t", header=T, as.is=T) genomes <- read.table("genomes.txt", sep="\t", header=T, as.is=T) if (type == 'MetaBAT') { files <- system(sprintf("ls %s.* | egrep '\\.[0-9]+$'", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "CONCOCT") { if(!file.exists(file)) stop(sprintf("Cannot find %s", file)) cc <- read.csv(file, header=F, as.is=T) cc.size <- ddply(cc, .(V2), function(x) sum(contigs$Size[match(x$V1, contigs$Name)])) stopifnot(!any(is.na(cc.size))) cc <- cc[cc$V2 %in% cc.size$V2[cc.size$V1 >= minSize],] files <- unique(cc$V2) } else if (type == "GroopM") { files <- system(sprintf("ls %s_bin_*.fna", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "MaxBin") { files <- system(sprintf("ls %s.*.fasta", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "Canopy") { if(!file.exists(file)) stop(sprintf("Cannot find %s", file)) if(is.null(prof)) stop("Cluster profile should be given") if(!file.exists(prof)) stop(sprintf("Cannot find %s", prof)) prof <- read.table(prof, as.is=T) rownames(prof) <- prof[,1]; prof <- prof[,-1] CAGs <- rownames(prof)[rowSums(t(apply(prof,1,sort, decreasing=TRUE))[,1:3])/rowSums(prof)<=0.9] cc <- read.table(file, as.is=T)[,c('V2','V1')]; colnames(cc) <- c('V1','V2') CAGs <- intersect(CAGs, names(which(table(cc$V2) > 2))) cc <- cc[cc$V2 %in% CAGs,] cc.size <- ddply(cc, .(V2), function(x) sum(contigs$Size[match(x$V1, contigs$Name)])) stopifnot(!any(is.na(cc.size))) cc <- cc[cc$V2 %in% cc.size$V2[cc.size$V1 >= minSize],] files <- unique(cc$V2) } res <- foreach(f=files, .combine=rbind) %do% { if (type == 'MetaBAT') ctgs <- read.table(f, as.is=T)$V1 else if (type %in% c("CONCOCT", "Canopy")) ctgs <- cc$V1[cc$V2 == f] else if (type %in% c("GroopM","MaxBin")) ctgs <- system(sprintf("grep '>' %s | sed 's/>//'", f), intern=TRUE) .res <- contigs[match(ctgs, contigs$Name),] stopifnot(!any(is.na(.res)) | nrow(.res) == length(ctgs)) .res$Name <- sapply(strsplit(.res$Name, "\\[|\\]"), function(x) x[2]) .res <- ddply(.res, .(Name), function(x) sum(x$Size)) colnames(.res) <- c("Genome","Size") .res <- .res[order(.res$Size,decreasing=T),] TP <- .res$Size[1] FP <- sum(.res$Size) - TP Recall <- TP / genomes$Size[genomes[,1] == .res$Genome[1]] Precision <- TP / sum(.res$Size) F1 <- 2 * Recall * Precision / (Precision + Recall) F0.5 <- (1 + .5 ^ 2) * Recall * Precision / ((.5 ^ 2) * Precision + Recall) cbind.data.frame(Genome=.res$Genome[1], Recall, Precision, F1, F0.5, stringsAsFactors=F) } while (length(unique(res$Recall)) != nrow(res)) { res$Recall = res$Recall + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$Precision)) != nrow(res)) { res$Precision = res$Precision + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$F1)) != nrow(res)) { res$F1 = res$F1 + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$'F0.5')) != nrow(res)) { res$'F0.5' = res$'F0.5' + rnorm(nrow(res), sd=1e-8) } res <- cbind(res, Rank.Recall=length(res$Recall)+1-rank(res$Recall,ties.method="max"), Rank.Precision=length(res$Precision)+1-rank(res$Precision,ties.method="max"), Rank.F1=length(res$F1)+1-rank(res$F1,ties.method="max"), Rank.F0.5=length(res$'F0.5')+1-rank(res$'F0.5',ties.method="max")) return(res) } calcPerfBySCG <- function(f, minRec=.2, minPrec=0, removeStrain=F, skip=2) { #to prevent bias in 
precision due to smaller bin size if(is.data.frame(f)) SCG <- f else SCG <- read.table(f, comment.char='-', as.is=T, header=F, skip=skip) set.seed(94522) SCG <- SCG[order(SCG$V13,decreasing=T),] #TODO need to warn the additional '-' character in the bin name SCG.ID <- SCG$V1 if (ncol(SCG) == 14) { if(removeStrain) SCG[,13] <- SCG[,13] * (100 - SCG[,14]) / 100 SCG <- SCG[,c(12,13)] / 100 SCG$V13 <- pmax(1 - SCG$V13, 0) } else if (ncol(SCG) == 15) { if(removeStrain) SCG[,14] <- SCG[,14] * (100 - SCG[,15]) / 100 SCG <- SCG[,c(13,14)] / 100 SCG$V14 <- pmax(1 - SCG$V14, 0) } else stop("[Error!] Unexpected SCG file format") SCG <- cbind(SCG.ID, SCG, stringsAsFactors=F) colnames(SCG) <- c('ID','Recall','Precision') SCG <- SCG[SCG$Recall >= minRec & SCG$Precision >= minPrec,] SCG$F1 <- 2 * SCG$Recall * SCG$Precision / (SCG$Precision + SCG$Recall) SCG$'F0.5' <- (1 + .5 ^ 2) * SCG$Recall * SCG$Precision / ((.5 ^ 2) * SCG$Precision + SCG$Recall) while (length(unique(SCG$Recall)) != nrow(SCG)) { SCG$Recall <- SCG$Recall + rnorm(nrow(SCG), sd=1e-8) } while (length(unique(SCG$Precision)) != nrow(SCG)) { SCG$Precision <- SCG$Precision + rnorm(nrow(SCG), sd=1e-8) } while (length(unique(SCG$F1)) != nrow(SCG)) { SCG$F1 <- SCG$F1 + rnorm(nrow(SCG), sd=1e-8) } while (length(unique(SCG$'F0.5')) != nrow(SCG)) { SCG$'F0.5' <- SCG$'F0.5' + rnorm(nrow(SCG), sd=1e-8) } SCG$Recall <- pmax(pmin(SCG$Recall, 1), 0) SCG$Precision <- pmax(pmin(SCG$Precision, 1), 0) SCG$Rank.Recall <- length(SCG$Recall)+1-rank(SCG$Recall,ties.method="max") SCG$Rank.Precision <- length(SCG$Precision)+1-rank(SCG$Precision,ties.method="max") SCG$Rank.F1 <- length(SCG$F1)+1-rank(SCG$F1,ties.method="max") SCG$Rank.F0.5 <- length(SCG$'F0.5')+1-rank(SCG$'F0.5',ties.method="max") return(SCG) } calcPerfCAMI <- function(type=c("MetaBAT","CONCOCT","MaxBin","BinSanity"), file="clustering_gt1000.csv", complexity=c('low','medium','high'), minSize=200000) { type <- match.arg(type) complexity <- match.arg(complexity) if (complexity == 'low') { fname1 <- 'contigs-low.txt' fname2 <- 'gsa_mapping.binning' } else if (complexity == 'medium') { fname1 <- 'contigs-medium.txt' fname2 <- 'pooled_gsa_mapping.binning.tsv' } else if (complexity == 'high') { fname1 <- 'contigs-high.txt' fname2 <- 'gsa_mapping_pool.binning' } if(!file.exists(fname1)) { system(sprintf("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/%s", fname1)) if(!file.exists(fname1)) stop(sprintf("Cannot find %s. Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/", fname1)) } if(!file.exists(fname2)) { system(sprintf("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/%s", fname2)) if(!file.exists(fname2)) stop(sprintf("Cannot find %s. 
Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/CAMI/", fname2)) } set.seed(94521) contigs <- read.table(fname1, sep="\t", header=F, as.is=T) colnames(contigs) <- c('Name', 'Size') genomes <- read.table(fname2, skip=4, as.is=T) contigs$Genome <- genomes$V2[match(contigs$Name, genomes$V1)] genomes <- ddply(contigs, .(Genome), function(x) cbind(Ctgs=nrow(x), Size=sum(x$Size))) if (type == 'MetaBAT') { files <- system(sprintf("ls %s.*.fa", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "CONCOCT") { if(!file.exists(file)) stop(sprintf("Cannot find %s", file)) cc <- read.csv(file, header=F, as.is=T) cc.size <- ddply(cc, .(V2), function(x) sum(contigs$Size[match(x$V1, contigs$Name)])) stopifnot(!any(is.na(cc.size))) cc <- cc[cc$V2 %in% cc.size$V2[cc.size$V1 >= minSize],] files <- unique(cc$V2) } else if (type == "MaxBin") { files <- system(sprintf("ls %s.*.fasta", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } else if (type == "BinSanity") { files <- system(sprintf("ls %s/*.fna", file), intern=T) if(length(files) == 0) stop(sprintf("Cannot find bins: %s.*", file)) } res <- foreach(f=files, .combine=rbind) %dopar% { if (type %in% c("CONCOCT")) ctgs <- cc$V1[cc$V2 == f] else ctgs <- system(sprintf("grep '>' %s | sed 's/>//'", f), intern=TRUE) .res <- contigs[match(ctgs, contigs$Name),] stopifnot(!any(is.na(.res)) | nrow(.res) == length(ctgs)) .res <- ddply(.res, .(Genome), function(x) sum(x$Size)) colnames(.res) <- c("Genome","Size") .res <- .res[order(.res$Size,decreasing=T),] TP <- .res$Size[1] FP <- sum(.res$Size) - TP Recall <- TP / genomes$Size[genomes[,1] == .res$Genome[1]] Precision <- TP / sum(.res$Size) F1 <- 2 * Recall * Precision / (Precision + Recall) F0.5 <- (1 + .5 ^ 2) * Recall * Precision / ((.5 ^ 2) * Precision + Recall) cbind.data.frame(Genome=.res$Genome[1], Recall, Precision, F1, F0.5, stringsAsFactors=F) } while (length(unique(res$Recall)) != nrow(res)) { res$Recall = res$Recall + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$Precision)) != nrow(res)) { res$Precision = res$Precision + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$F1)) != nrow(res)) { res$F1 = res$F1 + rnorm(nrow(res), sd=1e-8) } while (length(unique(res$'F0.5')) != nrow(res)) { res$'F0.5' = res$'F0.5' + rnorm(nrow(res), sd=1e-8) } res <- cbind(res, Rank.Recall=length(res$Recall)+1-rank(res$Recall,ties.method="max"), Rank.Precision=length(res$Precision)+1-rank(res$Precision,ties.method="max"), Rank.F1=length(res$F1)+1-rank(res$F1,ties.method="max"), Rank.F0.5=length(res$'F0.5')+1-rank(res$'F0.5',ties.method="max")) return(res) } plotPerf2 <- function(res, rec=c(.3,.5,.7,.9), prec=c(.9,.95), stress=NULL, .xlim=NULL, .ylim=NULL) { res <- lapply(res, function(x) { x <- sapply(rec, function(rec) sapply(prec, function(prec) sum(x$Recall > rec & x$Precision > prec))); dimnames(x) <- list(Precision=prec, Recall=rec); x}) res <- melt(res) res$L1 <- factor(res$L1, levels=unique(res$L1)) p <- ggplot(res, aes(x = L1, y = value, fill = L1)) + theme_bw() p <- p + geom_bar(stat="identity") if(!is.null(stress) && stress %in% levels(res$L1)) { .col <- rep("grey50", length(levels(res$L1))) .col[grep(stress,levels(res$L1))] <- "grey20" p <- p + scale_fill_manual(values=.col) } p <- p + facet_grid(Precision ~ Recall) p <- p + xlab("") + ylab("") + theme(axis.text.x = element_text(angle = 45, hjust=1)) if (!is.null(.xlim)) p <- p + xlim(.xlim) if (!is.null(.ylim)) p <- p + ylim(.ylim) p <- p 
+ theme(legend.position = "none") print(p) } plotPerf3 <- function(res, rec=seq(.3,.9,.1), prec=c(.9,.95), legend.position=c(.9,.7)) { if("Genome" %in% colnames(res[[1]])) { res <- lapply(res, function(x) { ddply(x, .(Genome), function(xx) { xx[which.max(xx$Recall),] }) }) } res <- lapply(res, function(x) { x <- sapply(rec, function(rec) sapply(prec, function(prec) sum(x$Recall > rec & x$Precision > prec))); x <- matrix(x, nrow=length(prec), byrow=F); dimnames(x) <- list(Precision=prec, Recall=rec); x}) for(i in 1:length(res)) { for(j in 1:(ncol(res[[i]])-1)) { res[[i]][,j] <- res[[i]][,j] - res[[i]][,j+1] } } res <- melt(res) res$L1 <- factor(res$L1, levels=unique(res$L1)) res$Recall <- as.character(res$Recall) #res$Recall <- factor(res$Recall, levels=rev(unique(res$Recall))) res$Precision <- factor(res$Precision, levels=rev(unique(res$Precision))) p <- ggplot(res, aes(x = L1, y = value, fill = Recall)) + theme_bw() p <- p + geom_bar(stat="identity") p <- p + scale_fill_grey(start=0.8, end=0.2) if(length(prec) > 1) p <- p + facet_wrap( ~ Precision, ncol=2) p <- p + coord_flip() p <- p + xlab("") + ylab("# of Genomes Identified") p <- p + theme(legend.position = legend.position, legend.key.size = grid::unit(1, "lines"), legend.text = element_text(size = rel(.7)), legend.title = element_text(face="bold", size = rel(.7))) p <- p + guides(fill = guide_legend(reverse=T)) suppressWarnings(print(p)) } plotPerfVenn <- function(res, rec=.3, prec=.9, sel=NULL) { requireAll(c('grid','VennDiagram')) res <- lapply(res, function(x) { ddply(x, .(Genome), function(xx) { xx[which.max(xx$Recall),] }) }) v <- lapply(res, function(x) x$Genome[x$Recall > rec & x$Precision > prec]) if(!is.null(sel) && all(sel %in% names(v))) v <- v[sel] grid.draw(venn.diagram(v, filename=NULL, fill = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3")[1:length(v)], cat.col = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3")[1:length(v)], cat.cex = 2, cex = 1.5, margin=.2)) } getCtgList <- function(res) { d <- foreach(i=1:length(res)) %do% { gs <- foreach(g=res[[i]]$ID) %dopar% { if(names(res)[i] == "MetaBAT") { ctgs <- system(sprintf("grep '>' ./1.5kb/MetaBAT/%s.fa | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "Canopy") { ctgs <- system(sprintf("grep '>' ./1.5kb/Canopy/%s.fa | cut -f1 -d' ' | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "CONCOCT") { ctgs <- system(sprintf("grep '>' ./1.5kb/CONCOCT/bins/%s.fa | cut -f1 -d' ' | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "MaxBin") { ctgs <- system(sprintf("grep '>' ./1.5kb/MaxBin/%s.fasta | sed 's/^>//'",g), intern=T) } else if(names(res)[i] == "GroopM") { ctgs <- system(sprintf("grep '>' ./1.5kb/GroopM/core_only/%s.fna | sed 's/^>//'",g), intern=T) } ctgs } names(gs) <- sprintf("S%d_B%d",i,seq(res[[i]]$ID)) gs } names(d) <- names(res) return(d) } plotPerfVennBySCG <- function(res, ctgList, ctgSizes, minRec=.3, minPrec=.9) { getCatalogs <- function(list1, list2, sizes, SCG1, SCG2) { findMiddle <- function(summ) { b12 <- (1:nrow(summ))[rowSums(summ>0)==1]; b12S <- length(b12) #one-to-many map from b1 to b2 b21 <- (1:ncol(summ))[colSums(summ>0)==1]; b21S <- length(b21) #one-to-many map from b2 to b1 while(TRUE) { good <- T if(length(b12) > 0 && length(b21) > 0) { b12 <- b12[rowSums(summ[b12,b21,drop=F]>0)==1] if(length(b12) == 0) { good <- F } else { b21 <- b21[colSums(summ[b12,b21,drop=F]>0)==1] if(length(b21) == 0) { good <- F } } } else { good <- F } if(!good) { b12 <- b21 <- NULL break } if (length(b12) == 
b12S && length(b21) == b21S) { break } else { b12S <- length(b12) b21S <- length(b21) } } stopifnot(length(b12)==length(b21)) list(b12=b12, b21=b21) } stopifnot(all(unlist(list1) %in% sizes$V1)) stopifnot(all(unlist(list2) %in% sizes$V1)) summ <- foreach(m=list1, .combine=rbind) %dopar% { foreach(n=list2, .combine=cbind) %do% { sum(sizes$V2[match(intersect(m,n), sizes$V1)]) } } flipped <- nrow(summ) < ncol(summ) if(flipped) { tmp <- list1; list1 <- list2; list2 <- tmp tmp <- SCG1; SCG1 <- SCG2; SCG2 <- tmp summ <- t(summ) } b1 <- which(rowSums(summ>0)==0) #unique to b1 b2 <- which(colSums(summ>0)==0) #unique to b2 b121 <- findMiddle(summ) b12 <- b121$b12 b21 <- b121$b21 stopifnot(length(intersect(b1,b12))==0 && length(intersect(b2,b21))==0) b12dup <- setdiff(1:nrow(summ), c(b1,b12)) b21dup <- setdiff(1:ncol(summ), c(b2,b21)) updated <- TRUE while(updated) { #any(rowSums(ss>0)>1) || any(colSums(ss>0)>1) updated <- FALSE ss <- summ[,b21dup,drop=F] for(cc in 1:ncol(ss)) { if(sum(ss[,cc]>0) > 1) { #t1 <- which(ss[,cc] > 0) t1 <- which.max(summ[,b21dup[cc]]) if(which.max(summ[t1,]) == b21dup[cc]) { #reciprocal best.. remove the others in row and col summ[setdiff(which(summ[,b21dup[cc]]>0), t1),b21dup[cc]] <- 0 summ[t1,setdiff(which(summ[t1,]>0), b21dup[cc])] <- 0 } updated <- TRUE } } ss <- summ[b12dup,,drop=F] for(rr in 1:nrow(ss)) { if(sum(ss[rr,]>0) > 1) { t2 <- which.max(summ[b12dup[rr],]) if(which.max(summ[,t2]) == b12dup[rr]) { #reciprocal best.. remove the others in row and col summ[b12dup[rr],setdiff(which(summ[b12dup[rr],]>0), t2)] <- 0 summ[setdiff(which(summ[,t2]>0), b12dup[rr]), t2] <- 0 } updated <- TRUE } } } b1 <- which(rowSums(summ>0)==0) #unique to b1 b2 <- which(colSums(summ>0)==0) #unique to b2 b121 <- findMiddle(summ) b12 <- b121$b12 b21 <- b121$b21 stopifnot(length(intersect(b1,b12))==0 && length(intersect(b2,b21))==0) stopifnot(length(unique(b12)) == length(unique(b21))) stopifnot(all(rowSums(summ[b12,b21]>0)==1)) stopifnot(all(colSums(summ[b12,b21]>0)==1)) SCG <- SCG1[b1,] middle <- list() for(i in 1:length(b12)) { j <- which(summ[b12[i],b21] > 0) if(SCG1$Recall[b12[i]] >= SCG2$Recall[b21[j]]) { middle[[i]] <- list1[[b12[i]]] SCG <- rbind(SCG, SCG1[b12[i],]) } else { middle[[i]] <- list2[[b21[j]]] SCG <- rbind(SCG, SCG2[b21[j],]) } } SCG <- rbind(SCG, SCG2[b2,]) if(flipped) { left <- list2[b2] right <- list1[b1] } else { left <- list1[b1] right <- list2[b2] } list(left=left, middle=middle, right=right, SCG=SCG) } stopifnot(length(res) == length(ctgList)) stopifnot(length(res) >= 2) stopifnot(length(res) <= 5) requireAll(c('grid','VennDiagram','doMC')) registerDoMC() for(i in 1:length(res)) { ctgList[[i]] <- ctgList[[i]][res[[i]]$Recall >= minRec & res[[i]]$Precision>=minPrec] res[[i]] <- res[[i]][res[[i]]$Recall >= minRec & res[[i]]$Precision>=minPrec, ] } catalogs <- NULL for(i in 2:length(ctgList)) { if(i==2) catalogs <- getCatalogs(ctgList[[1]],ctgList[[2]],ctgSizes,res[[1]],res[[2]]) else catalogs <- getCatalogs(do.call(c,catalogs[1:3]), ctgList[[i]], ctgSizes, catalogs$SCG, res[[i]]) } genomes <- do.call(c,catalogs[1:3]) names(genomes) <- paste("Genome",1:length(genomes)) res.venn <- foreach(i=1:length(ctgList)) %do% { gs <- getCatalogs(genomes, ctgList[[i]], ctgSizes, catalogs$SCG, res[[i]])$middle summ <- foreach(m=genomes, .combine=rbind) %dopar% { foreach(n=gs, .combine=cbind) %do% { length(intersect(m,n)) } } names(genomes)[apply(summ,2,which.max)] } names(res.venn) <- names(ctgList) grid.draw(venn.diagram(res.venn, filename=NULL, fill = c("dodgerblue", 
"goldenrod1", "darkorange1", "seagreen3", "orchid3")[1:length(res.venn)], cat.col = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3")[1:length(res.venn)], cat.cex = 2, cex = 1.5, margin=.2)) } plotPerf <- function(res, xlim.=NULL, yrange=c(-0.001,1.001), legend.order=NULL, legend.position=c(.35,.9), what=c('Recall','Precision','F1','F0.5')) { if(is.null(xlim.)) { if(!file.exists("genomes.txt")) { system("wget http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/genomes.txt") if(!file.exists("genomes.txt")) stop("Cannot find genomes.txt. Download it from http://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/") } genomes <- read.table("genomes.txt", sep="\t", header=T, as.is=T) } if(is.null(legend.order) || length(intersect(names(res), legend.order)) != length(res)) { legend.order <- names(res) } modes <- rep(names(res), sapply(res, nrow)) res <- cbind(Mode=modes, do.call(rbind, res)) if (!all(what %in% c('Recall','Precision','F1','F0.5'))) { stop("what should be from the list of 'Recall','Precision','F1','F0.5'") } .d1 <- melt(res[, c('Mode','Recall','Precision','F1','F0.5')], id.vars=c('Mode'), variable.name='Score', value.name='Y') .d2 <- melt(res[, c('Mode','Rank.Recall','Rank.Precision','Rank.F1','Rank.F0.5')], id.vars=c('Mode'), variable.name='Rank', value.name='X') .d <- cbind(.d1, X=.d2$X) .d <- .d[.d$Score %in% what,] .d$Score <- droplevels(.d$Score) if(is.null(xlim.)) xlim. <- max(nrow(genomes), max(.d$X)) .d$Mode <- factor(.d$Mode, levels=legend.order) p <- ggplot(.d, aes(X, Y, colour=Mode)) + theme_bw() + facet_wrap(~ Score, nrow=ifelse(length(what)==4,2,1)) p <- p + geom_line(size=1) p <- p + xlab("Genome Bins (Sorted)") + ylab("Performance Metric") p <- p + ylim(yrange) + xlim(c(1,xlim.)) p <- p + theme(legend.position = legend.position); p$labels$colour <- NULL suppressWarnings(print(p)) } getPerf <- function(res, rec=c(seq(.3,.9,.1),.95), prec=c(seq(.7,.9,.1),.95,.99), uniqueGenomes=FALSE) { if (uniqueGenomes) { res <- lapply(res, function(x) { ddply(x, .(Genome), function(xx) { xx[which.max(xx$Recall),] }) }) } out <- lapply(res, function(x) { x <- sapply(rec, function(rec) sapply(prec, function(prec) sum(x$Recall >= rec & x$Precision >= prec))); dimnames(x) <- list(Precision=prec, Recall=rec); x}) return(out) } diffPerf <- function(res1, res2, rec=c(seq(.1,.9,.1),.95), prec=c(seq(0,.9,.1),.95,.99)) { if(is.data.frame(res1) || !is.list(res1)) res1 <- list(res1) if(is.data.frame(res2) || !is.list(res2)) res2 <- list(res2) out <- getPerf(res1, rec, prec)[[1]] - getPerf(res2, rec, prec)[[1]] return(out) } printPerf <- function(res, rec=c(seq(.3,.9,.1),.95), prec=c(seq(.7,.9,.1),.95,.99), uniqueGenomes=FALSE) { print(getPerf(res, rec, prec, uniqueGenomes)) } args <- commandArgs(TRUE) if (!is.na(args[1]) && !is.na(args[2])) { bindir <- args[1] checkmfile1 <- paste0(bindir, '/CheckM.txt') bindir <- args[2] checkmfile2 <- paste0(bindir, '/CheckM.txt') print(paste0("Peformance of ", checkmfile2, " vs ", checkmfile1, " including all strains")) out1 <- diffPerf(calcPerfBySCG(checkmfile2, removeStrain=F), calcPerfBySCG(checkmfile1, removeStrain=F), rec=c(seq(.1,.9,.1),.95), prec=c(seq(.6,.9,.1),.95,.99)) print(out1) print(paste0("Peformance of ", checkmfile2, " vs ", checkmfile1, " removing strains")) out2 <- diffPerf(calcPerfBySCG(checkmfile2, removeStrain=T), calcPerfBySCG(checkmfile1, removeStrain=T), rec=c(seq(.1,.9,.1),.95), prec=c(seq(.6,.9,.1),.95,.99)) print(out2) } else { bindir <- args[1] checkmfile <- paste0(bindir, '/CheckM.txt') 
print(paste0("Peformance of ",checkmfile, " including all strains")) out1 <- getPerf(list(calcPerfBySCG(checkmfile, removeStrain=F)), rec=c(seq(.1,.9,.1),.95), prec=c(seq(.6,.9,.1),.95,.99)) print(out1) print(paste0("Peformance of ",checkmfile, " removing strains")) out2 <- getPerf(list(calcPerfBySCG(checkmfile, removeStrain=T)), rec=c(seq(.1,.9,.1),.95), prec=c(seq(.6,.9,.1),.95,.99)) print(out2) print(paste0("Difference without strains")) print(out2[[1]] - out1[[1]]) } berkeleylab-metabat-0db17a40717d/opt/checkBins.sh000077500000000000000000000024651360417103500215360ustar00rootroot00000000000000#!/bin/bash USAGE="$0 BinDir" if [ $# -lt 1 ] then echo "$USAGE" 1>&2 exit 1 fi bindir=$1 # Resolve the base path and bootstrap the environment SOURCE="${BASH_SOURCE[0]}" while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" SOURCE="$(readlink "$SOURCE")" [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located done DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" echo "Starting $0 $@ at $(date)" set -e onerr() { echo "uh oh something went wrong with '$0 $@' at $(date)" 1>&2 trap "" 0 exit 1 } trap onerr 0 1 2 3 15 if [ ! -f $bindir/CheckM.txt ] then echo "Running checkm $(date)" $DIR/jgi_docker_wrapper.sh sstevens/checkm checkm lineage_wf -f $bindir/CheckM.txt.tmp -t 8 -x fa $bindir/ $bindir/SCG mv $bindir/CheckM.txt.tmp $bindir/CheckM.txt echo "Completed checkm at $(date)" fi if [ ! -f $bindir/CheckM-perf.txt ] then echo "Running R performance check $(date)" Rscript $DIR/benchmark.R $bindir > $bindir/CheckM-perf.txt.tmp mv $bindir/CheckM-perf.txt.tmp $bindir/CheckM-perf.txt echo "Completed benchmark.R at $(date)" fi trap "" 0 echo "Completed $0 $@ at $(date)" berkeleylab-metabat-0db17a40717d/opt/jgi_docker_wrapper.sh000077500000000000000000000031331360417103500234760ustar00rootroot00000000000000#!/bin/bash USAGE="Proper Usage: $0 image cmd [...]" if [ $# -lt 2 ] then echo "$USAGE" 1>&2 echo "ERROR - please specify an image and command to execute" 1>&2 exit 1 fi image=$1 shift cmd=$2 shifter=$(which shifter 2>/dev/null || true) docker=$(which docker 2>/dev/null || true) cmd2=$(which cmd 2>/dev/null || true) set -e set -o pipefail RUN_PREFIX= if [ -x "${cmd2}" ] then # no need for shifter or docker RUN_PREFIX=time elif [ -x "${shifter}" ] then img=$(${shifter}img lookup $image 1>&2 || true) [ -z "$img" ] && ${shifter}img pull $image 1>&2 && img=$(${shifter}img lookup $image) if [ -z "$img" ] then echo "$USAGE" 1>&2 echo "ERROR - shifter could not pull $image" 1>&2 exit 1 fi RUN_PREFIX="shifter --image=id:${img}" elif [ -x "${docker}" ] then if ! docker pull $image 1>&2 then echo "$USAGE" 1>&2 echo "ERROR - docker could not pull $image" 1>&2 exit 1 fi img=$image volumes="--volume=$(pwd):$(pwd) --workdir=$(pwd)" if [ -n "$VOLUMES" ] then for v in $VOLUMES do volumes="$volumes --volume=$v:$v" done fi RUN_PREFIX="docker run -i --tty=false -a STDIN -a STDOUT -a STDERR --user $(id -u):$(id -g) $volumes ${img}" else echo "$USAGE" 1>&2 echo "Could not find '$cmd' or shifter or docker for image=$image. Please update your PATH" 1>&2 exit 1 fi echo "Executing '$RUN_PREFIX $@' at $(date) on $(uname -n)" 1>&2 ret=0 $RUN_PREFIX $@ || ret=$? 
if [ $ret -ne 0 ] then echo "ERROR exit $ret for command '$RUN_PREFIX $@' at $(date)" 1>&2 exit $ret else echo "Finished at $(date) with $SECONDS s runtime" 1>&2 fi berkeleylab-metabat-0db17a40717d/opt/runStandards.sh000077500000000000000000000061511360417103500223110ustar00rootroot00000000000000#!/bin/bash USAGE="$0 /path/to/metabat2 [ binPrefix ]" mb2=$(realpath $1) if [ ! -x "$mb2" ] then echo "$USAGE" 1>&2 echo "Could not execute '$mb2'" 1>&2 exit 1 fi git=$(which git 2>/dev/null || true) gitver= if [ -x "$git" ] then gitver=$(${git} describe --tags --dirty 2>/dev/null || true) [ -z "$gitver" ] || gitver="-$gitver" fi binPrefix=$2 [ -n "$binPrefix" ] || binPrefix=standard$gitver invokedas="'$0 $@'" # Resolve the base path and bootstrap the environment SOURCE="${BASH_SOURCE[0]}" while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" SOURCE="$(readlink "$SOURCE")" [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located done DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" echo "Starting $invokedas at $(date)" set -e set -o pipefail onerr() { echo "uh oh something went wrong with '$invokedas' at $(date)" 1>&2 trap "" 0 exit 1 } trap onerr 0 1 2 3 15 if [ -n "$SCRATCH" ] && [ -w "$SCRATCH" ] then WORKDIR=/$SCRATCH/$USER-metabat-standards else WORKDIR=/tmp/$USER-metabat-standards fi mkdir -p $WORKDIR cd $WORKDIR echo "Saving files to $(pwd) and using binPrefix=$binPrefix" for case in CASE1 CASE2 CASE3 do if [ ! -d $case ] then mkdir $case.tmp cd $case.tmp wget https://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/BestPractices/V2/$case/assembly.fa.gz wget https://portal.nersc.gov/dna/RD/Metagenome_RD/MetaBAT/Files/BestPractices/V2/$case/depth.txt cd .. mv $case.tmp $case fi # default, minus abundance table bins_noa=$binPrefix-$case-noabd if [ ! -d $bins_noa ] then rm -rf $bins_noa.tmp $mb2 -i $case/assembly.fa.gz -o $bins_noa.tmp/bin -v --seed 1 2>$bins_noa.err | tee $bins_noa.log mv $bins_noa.tmp $bins_noa fi $DIR/checkBins.sh $bins_noa 2>&1 | tee $bins_noa-checkBins.log cat $bins_noa/CheckM-perf.txt # default bins=$binPrefix-$case if [ ! -d $bins ] then rm -rf $bins.tmp $mb2 -i $case/assembly.fa.gz -a $case/depth.txt -o $bins.tmp/bin -v --seed 1 2>$bins.err | tee $bins.log mv $bins.tmp $bins fi $DIR/checkBins.sh $bins 2>&1 | tee $bins-checkBins.log cat $bins/CheckM-perf.txt Rscript $DIR/benchmark.R $bins_noa $bins > CheckM-diffperf.txt.tmp mv CheckM-diffperf.txt.tmp CheckM-diffperf-noabd.txt echo "Completed diffperf between $bins_noa and $bins at $(date)" cat CheckM-diffperf-noabd.txt # default with minContig 2000 bins2=$binPrefix-$case-m2000 if [ ! 
-d $bins2 ] then rm -rf $bins2.tmp $mb2 -i $case/assembly.fa.gz -a $case/depth.txt -o $bins2.tmp/bin -v -m 2000 --seed 1 2>$bins2.err | tee $bins2.log mv $bins2.tmp $bins2 fi $DIR/checkBins.sh $bins2 2>&1 | tee $bins2-checkBins.log cat $bins2/CheckM-perf.txt Rscript $DIR/benchmark.R $bins $bins2 > CheckM-diffperf.txt.tmp mv CheckM-diffperf.txt.tmp CheckM-diffperf.txt echo "Completed diffperf between $bins and $bins2 at $(date)" done trap "" 0 echo "Done with $invokedas at $(date)" berkeleylab-metabat-0db17a40717d/runMetaBat.sh000077500000000000000000000052371360417103500211030ustar00rootroot00000000000000#!/bin/bash SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" PATH=$SCRIPTPATH:$PATH MB=metabat2 SUM=jgi_summarize_bam_contig_depths BADMAP=${BADMAP:=0} PCTID=${PCTID:=97} MINDEPTH=${MINDEPTH:=1.0} if ! $MB --help 2>/dev/null then echo "Please ensure that the MetaBAT binaries are in your PATH: Could not find $MB" 1>&2 exit 1 fi if ! $SUM 2>/dev/null then echo "Please ensure that the MetaBAT binaries are in your PATH: Could not find $SUM" 1>&2 exit 1 fi USAGE="$0