pax_global_header00006660000000000000000000000064141476412770014527gustar00rootroot0000000000000052 comment=ef1cd626a85cfd1c1b7acfca2b5fd5957f2a05f1 delly-0.9.1/000077500000000000000000000000001414764127700126475ustar00rootroot00000000000000delly-0.9.1/.github/000077500000000000000000000000001414764127700142075ustar00rootroot00000000000000delly-0.9.1/.github/workflows/000077500000000000000000000000001414764127700162445ustar00rootroot00000000000000delly-0.9.1/.github/workflows/c-cpp.yml000066400000000000000000000006561414764127700200000ustar00rootroot00000000000000name: C/C++ CI on: push: branches: [ main ] pull_request: branches: [ main ] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: make run: | sudo apt-get update sudo apt-get install -y libcurl4-gnutls-dev libhts-dev libboost-date-time-dev libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-iostreams-dev make delly-0.9.1/.github/workflows/docker.yml000066400000000000000000000011351414764127700202360ustar00rootroot00000000000000name: Docker CI on: push: branches: main jobs: main: runs-on: ubuntu-latest steps: - name: Set up QEMU uses: docker/setup-qemu-action@v1 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 - name: Login to DockerHub uses: docker/login-action@v1 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Build and push id: docker_build uses: docker/build-push-action@v2 with: push: true tags: dellytools/delly:latest delly-0.9.1/.gitignore000066400000000000000000000004631414764127700146420ustar00rootroot00000000000000.htslib *.pyc *~ # Binaries bin/ src/delly src/dellyLR src/dpe # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app 
delly-0.9.1/.gitmodules000066400000000000000000000001521414764127700150220ustar00rootroot00000000000000[submodule "src/htslib"] path = src/htslib url = https://github.com/samtools/htslib.git ignore = dirty delly-0.9.1/AUTHORS000066400000000000000000000003011414764127700137110ustar00rootroot00000000000000Delly: Integrated structural variant discovery for short- and long-read sequencing. Delly Project Authors: Tobias Rausch Contributors: Markus Hsi-Yang Fritz Sascha Meiers delly-0.9.1/CONTRIBUTING.md000066400000000000000000000024531414764127700151040ustar00rootroot00000000000000Contributing ------------ Thank you for considering contributing to Delly. Bugs ---- If you have noticed a bug in Delly please search the [issue tracker](https://github.com/dellytools/delly/issues) to see if someone else in the community had already created a ticket. If that is not the case please create one! I try to fix all bugs as soon as possible but if you want to work on one yourself please consider using the fork and pull request mechanism of github. Usage questions --------------- Please consider using the Delly discussion group [delly-users](http://groups.google.com/d/forum/delly-users) for usage and installation questions. New Features ------------ If you want to suggest a new feature please go ahead and open an [issue](https://github.com/dellytools/delly/issues)! Roadmap ---------- Delly still lacks an adequate copy-number segmentation algorithm which is currently under development. We also would love to incorporate the alignment viewers, suave and maze, into our [GEAR genomics](https://www.gear-genomics.com) platform ([https://www.gear-genomics.com](https://www.gear-genomics.com)) but currently lack man-power. If you are interested in contributing please get in touch with me, thanks! delly-0.9.1/CREDITS000066400000000000000000000004241414764127700136670ustar00rootroot00000000000000DELLY: structural variant discovery by integrated paired-end and split-read analysis. 
Tobias Rausch, Thomas Zichner, Andreas Schlattl, Adrian M. Stuetz, Vladimir Benes, Jan O. Korbel. Bioinformatics. 2012 Sep 15;28(18):i333-i339. https://doi.org/10.1093/bioinformatics/bts378 delly-0.9.1/Dockerfile000066400000000000000000000020401414764127700146350ustar00rootroot00000000000000# use the ubuntu base image FROM ubuntu:18.04 MAINTAINER Tobias Rausch rausch@embl.de # install required packages RUN apt-get update && apt-get install -y \ autoconf \ build-essential \ cmake \ g++ \ gfortran \ git \ libcurl4-gnutls-dev \ hdf5-tools \ libboost-date-time-dev \ libboost-program-options-dev \ libboost-system-dev \ libboost-filesystem-dev \ libboost-iostreams-dev \ libbz2-dev \ libhdf5-dev \ libncurses-dev \ liblzma-dev \ zlib1g-dev \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* # set environment ENV BOOST_ROOT /usr # install delly RUN cd /opt \ && git clone --recursive https://github.com/dellytools/delly.git \ && cd /opt/delly/ \ && make STATIC=1 all \ && make install # Multi-stage build FROM alpine:latest RUN mkdir -p /opt/delly/bin WORKDIR /opt/delly/bin COPY --from=0 /opt/delly/bin/delly . 
# Workdir WORKDIR /root/ # Add Delly to PATH ENV PATH="/opt/delly/bin:${PATH}" # by default /bin/sh is executed CMD ["/bin/sh"] delly-0.9.1/Dockerfile.parallel000066400000000000000000000021271414764127700164360ustar00rootroot00000000000000# use the ubuntu base image FROM ubuntu:18.04 MAINTAINER Tobias Rausch rausch@embl.de # install required packages RUN apt-get update && apt-get install -y \ autoconf \ build-essential \ cmake \ g++ \ gfortran \ git \ libcurl4-gnutls-dev \ hdf5-tools \ libboost-date-time-dev \ libboost-program-options-dev \ libboost-system-dev \ libboost-filesystem-dev \ libboost-iostreams-dev \ libbz2-dev \ libhdf5-dev \ libncurses-dev \ liblzma-dev \ zlib1g-dev \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* # set environment ENV BOOST_ROOT /usr # install delly RUN cd /opt \ && git clone --recursive https://github.com/dellytools/delly.git \ && cd /opt/delly/ \ && make STATIC=1 PARALLEL=1 all \ && make install # Multi-stage build FROM alpine:latest RUN mkdir -p /opt/delly/bin WORKDIR /opt/delly/bin COPY --from=0 /opt/delly/bin/delly . # Workdir WORKDIR /root/ # Add Delly to PATH ENV PATH="/opt/delly/bin:${PATH}" # Set OpenMP threads ENV OMP_NUM_THREADS 2 # by default /bin/sh is executed CMD ["/bin/sh"] delly-0.9.1/LICENSE000066400000000000000000000030341414764127700136540ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2012- European Molecular Biology Laboratory (EMBL) All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. delly-0.9.1/Makefile000066400000000000000000000040341414764127700143100ustar00rootroot00000000000000DEBUG ?= 0 PARALLEL ?= 0 STATIC ?= 0 # Submodules PWD = $(shell pwd) EBROOTHTSLIB ?= ${PWD}/src/htslib/ # Install dir prefix = ${PWD} exec_prefix = $(prefix) bindir ?= $(exec_prefix)/bin # Flags CXX=g++ CXXFLAGS += -isystem ${EBROOTHTSLIB} -pedantic -W -Wall -Wno-unknown-pragmas -D__STDC_LIMIT_MACROS -fno-strict-aliasing -fpermissive LDFLAGS += -L${EBROOTHTSLIB} -L${EBROOTHTSLIB}/lib -lboost_iostreams -lboost_filesystem -lboost_system -lboost_program_options -lboost_date_time # Flags for parallel computation ifeq (${PARALLEL}, 1) CXXFLAGS += -fopenmp -DOPENMP else CXXFLAGS += -DNOPENMP endif # Flags for static compile ifeq (${STATIC}, 1) LDFLAGS += -static -static-libgcc -pthread -lhts -lz -llzma -lbz2 else LDFLAGS += -lhts -lz -llzma -lbz2 -Wl,-rpath,${EBROOTHTSLIB} endif # Flags for debugging, profiling and releases ifeq (${DEBUG}, 1) CXXFLAGS += -g -O0 -fno-inline -DDEBUG else ifeq (${DEBUG}, 2) CXXFLAGS += -g -O0 -fno-inline -DPROFILE 
LDFLAGS += -lprofiler -ltcmalloc else CXXFLAGS += -O3 -fno-tree-vectorize -DNDEBUG endif ifeq (${EBROOTHTSLIB}, ${PWD}/src/htslib/) SUBMODULES += .htslib endif # External sources HTSLIBSOURCES = $(wildcard src/htslib/*.c) $(wildcard src/htslib/*.h) SOURCES = $(wildcard src/*.h) $(wildcard src/*.cpp) # Targets BUILT_PROGRAMS = src/delly TARGETS = ${SUBMODULES} ${BUILT_PROGRAMS} all: $(TARGETS) .htslib: $(HTSLIBSOURCES) if [ -r src/htslib/Makefile ]; then cd src/htslib && autoheader && autoconf && ./configure --disable-s3 --disable-gcs --disable-libcurl --disable-plugins && $(MAKE) && $(MAKE) lib-static && cd ../../ && touch .htslib; fi src/delly: ${SUBMODULES} $(SOURCES) $(CXX) $(CXXFLAGS) $@.cpp -o $@ $(LDFLAGS) src/dpe: ${SUBMODULES} $(SOURCES) $(CXX) $(CXXFLAGS) $@.cpp -o $@ $(LDFLAGS) install: ${BUILT_PROGRAMS} mkdir -p ${bindir} install -p ${BUILT_PROGRAMS} ${bindir} clean: if [ -r src/htslib/Makefile ]; then cd src/htslib && $(MAKE) clean; fi rm -f $(TARGETS) $(TARGETS:=.o) ${SUBMODULES} distclean: clean rm -f ${BUILT_PROGRAMS} .PHONY: clean distclean install all delly-0.9.1/R/000077500000000000000000000000001414764127700130505ustar00rootroot00000000000000delly-0.9.1/R/cnv.R000066400000000000000000000020731414764127700137630ustar00rootroot00000000000000library(ggplot2) library(reshape2) library(scales) args = commandArgs(trailingOnly=TRUE) x = read.table(args[1], header=F) colnames(x)[1] = c("cnv") x = melt(x, id.vars=c("cnv")) x$cn = round(x$value) if (sum(x$cn > 9)) { x[x$cn > 9,]$cn = 9; } x$cn = factor(x$cn, levels=0:9) nsamples = length(unique(x$variable)) nbins = sqrt(nsamples) if (nbins < 30) { nbins = 30; } # Plot CNVs for (CNV in unique(x$cnv)) { print(CNV) df = x[x$cnv == CNV,] p = ggplot(data=df, aes(x=value)) for(i in 0:9) { p = p + geom_histogram(data=subset(df, cn == i), aes(fill=cn), bins=nbins) } p = p + xlab("Copy-number") p = p + ylab("Count") p = p + scale_x_continuous(breaks=0:10, labels=comma) # p = p + 
scale_fill_manual(values=c("#a6cee3","#1f78b4","#b2df8a","#33a02c","#fb9a99","#e31a1c","#fdbf6f","#ff7f00","#cab2d6","#6a3d9a"), drop=F) p = p + scale_fill_manual(values=c("#ff7f00", "#1f78b4","#33a02c","#e31a1c","#6a3d9a", "#fdbf6f", "#a6cee3", "#b2df8a", "#fb9a99", "#cab2d6"), drop=F) p = p + ggtitle(CNV) ggsave(p, file=paste0(CNV, ".png"), width=24, height=6) } delly-0.9.1/R/gcbias.R000066400000000000000000000013321414764127700144220ustar00rootroot00000000000000library(ggplot2) library(reshape2) args = commandArgs(trailingOnly=TRUE) x = read.table(args[1], header=T) x = read.table("gc.table", header=T) x$gc = x$gcsum / (nrow(x)-1) x$fractionSample = x$fractionSample * 100 x$fractionReference = x$fractionReference * 100 df = melt(x[,c("gc","fractionSample","fractionReference")], id.vars=c("gc")) # Whole genome p = ggplot(data=df, aes(x=gc, y=value)) p = p + geom_bar(aes(color=variable, fill=variable), stat="identity") p = p + xlab("GC content") p = p + ylab("Obs / Exp") p = p + ylim(0, max(max(x$obsexp), max(x$fractionSample + x$fractionReference))) p = p + geom_line(data=x, aes(x=gc, y=obsexp), color="black") ggsave(p, file="gcbias.png", width=12, height=6) print(warnings()) delly-0.9.1/R/rd.R000066400000000000000000000041241414764127700136010ustar00rootroot00000000000000library(ggplot2) library(scales) library(gtable) library(grid) chrNamesLong = c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20", "chr21", "chr22", "chrX") chrNamesShort = c("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X") args = commandArgs(trailingOnly=TRUE) x = read.table(args[1], header=T) maxCN = 8 seg = data.frame() if (length(args)>1) { seg = read.table(args[2], header=F, sep="\t") colnames(seg) = c("chr", "start", "end", "id", "cn") } # Fix chromosome ordering if (sum(x$chr %in% chrNamesLong) > sum(x$chr %in% chrNamesShort)) { chrs 
= chrNamesLong; } else { chrs = chrNamesShort; } x = x[x$chr %in% chrs,] x$chr = factor(x$chr, levels=chrs) if (nrow(seg) > 0) { seg = seg[seg$chr %in% chrs,] seg$chr = factor(seg$chr, levels=chrs) } # Whole genome p = ggplot(data=x, aes(x=start, y=x[,6])) p = p + geom_point(pch=21, color="black", fill="black", size=0.5) p = p + xlab("Chromosome") p = p + ylab("Copy-number") p = p + scale_x_continuous(labels=comma) if (nrow(seg)) { p = p + geom_segment(data=seg, aes(x=start, y=cn, xend=end, yend=cn), color="#31a354", size=1.2); } p = p + facet_grid(. ~ chr, scales="free_x", space="free_x") p = p + ylim(0, maxCN) p = p + theme(axis.text.x = element_text(angle=45, hjust=1)) ggsave(p, file="plot.wholegenome.png", width=24, height=6) print(warnings()) # By chromosome for(chrname in unique(x$chr)) { print(chrname) sub = x[x$chr == chrname,] sl = seg[seg$chr == chrname,] p = ggplot(data=sub, aes(x=start, y=sub[,6])) p = p + geom_point(pch=21, color="black", fill="black", size=0.5) p = p + ylab("Copy-number") + xlab(chrname) p = p + scale_x_continuous(labels=comma, breaks = scales::pretty_breaks(n=20)) if (nrow(sl)) { p = p + geom_segment(data=sl, aes(x=start, y=cn, xend=end, yend=cn), color="#31a354", size=1.2); } p = p + ylim(0, maxCN) p = p + theme(axis.text.x = element_text(angle=45, hjust=1)) ggsave(p, file=paste0("plot.", chrname, ".png"), width=24, height=6) print(warnings()) } delly-0.9.1/README.md000066400000000000000000000275061414764127700141400ustar00rootroot00000000000000

Delly

[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/delly/README.html) [![Anaconda-Server Badge](https://anaconda.org/bioconda/delly/badges/downloads.svg)](https://anaconda.org/bioconda/delly) [![C/C++ CI](https://github.com/dellytools/delly/workflows/C/C++%20CI/badge.svg)](https://github.com/dellytools/delly/actions) [![Docker CI](https://github.com/dellytools/delly/workflows/Docker%20CI/badge.svg)](https://hub.docker.com/r/dellytools/delly/) [![GitHub license](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://github.com/dellytools/delly/blob/master/LICENSE) [![GitHub Releases](https://img.shields.io/github/release/dellytools/delly.svg)](https://github.com/dellytools/delly/releases) Delly is an integrated structural variant (SV) prediction method that can discover, genotype and visualize deletions, tandem duplications, inversions and translocations at single-nucleotide resolution in short-read massively parallel sequencing data. It uses paired-ends, split-reads and read-depth to sensitively and accurately delineate genomic rearrangements throughout the genome. Structural variants can be annotated using [Delly-sansa](https://github.com/dellytools/sansa) and visualized using [Delly-maze](https://github.com/dellytools/maze) or [Delly-suave](https://github.com/dellytools/suave). Installing Delly ---------------- The easiest way to get Delly is to download a statically linked binary or the singularity container (SIF file) from the [Delly github release page](https://github.com/dellytools/delly/releases/). Alternatively, you can download Delly from [Bioconda](https://anaconda.org/bioconda/delly). You can also build Delly from source using a recursive clone and make. 
`git clone --recursive https://github.com/dellytools/delly.git` `cd delly/` `make all` There is a Delly discussion group [delly-users](http://groups.google.com/d/forum/delly-users) for usage and installation questions and a dockerized [delly](https://hub.docker.com/r/dellytools/delly/). Delly multi-threading mode -------------------------- Delly supports parallel computing using the OpenMP API (www.openmp.org). `make PARALLEL=1 -B src/delly` You can set the number of threads using the environment variable OMP_NUM_THREADS. `export OMP_NUM_THREADS=2` Delly primarily parallelizes on the sample level. Hence, OMP_NUM_THREADS should be always smaller or equal to the number of input samples. Running Delly ------------- Delly needs a sorted, indexed and duplicate marked bam file for every input sample. An indexed reference genome is required to identify split-reads. The output is in [BCF](http://samtools.github.io/bcftools/) format with a csi index. Delly supports germline and somatic SV discovery, genotyping and filtering. Because of that, Delly has been modularized and common workflows for germline and somatic SV calling are outlined below. If you do need VCF output you need a recent version of [BCFtools](http://samtools.github.io/bcftools/) for file conversion . `delly call -x hg19.excl -o delly.bcf -g hg19.fa input.bam` `bcftools view delly.bcf > delly.vcf` Somatic SV calling ------------------ * At least one tumor sample and a matched control sample are required for SV discovery `delly call -x hg19.excl -o t1.bcf -g hg19.fa tumor1.bam control1.bam` * Somatic pre-filtering requires a tab-delimited sample description file where the first column is the sample id (as in the VCF/BCF file) and the second column is either tumor or control. `delly filter -f somatic -o t1.pre.bcf -s samples.tsv t1.bcf` * Genotype pre-filtered somatic sites across a larger panel of control samples to efficiently filter false postives and germline SVs. 
For performance reasons, this can be run in parallel for each sample of the control panel and you may want to combine multiple pre-filtered somatic site lists from multiple tumor samples. `delly call -g hg19.fa -v t1.pre.bcf -o geno.bcf -x hg19.excl tumor1.bam control1.bam ... controlN.bam` * Post-filter for somatic SVs using all control samples. `delly filter -f somatic -o t1.somatic.bcf -s samples.tsv geno.bcf` Germline SV calling ------------------- * SV calling is done by sample for high-coverage genomes or in small batches for low-coverage genomes `delly call -g hg19.fa -o s1.bcf -x hg19.excl sample1.bam` * Merge SV sites into a unified site list `delly merge -o sites.bcf s1.bcf s2.bcf ... sN.bcf` * Genotype this merged SV site list across all samples. This can be run in parallel for each sample. `delly call -g hg19.fa -v sites.bcf -o s1.geno.bcf -x hg19.excl s1.bam` `delly call -g hg19.fa -v sites.bcf -o sN.geno.bcf -x hg19.excl sN.bam` * Merge all genotyped samples to get a single VCF/BCF using bcftools merge `bcftools merge -m id -O b -o merged.bcf s1.geno.bcf s2.geno.bcf ... sN.geno.bcf` * Apply the germline SV filter which requires at least 20 unrelated samples `delly filter -f germline -o germline.bcf merged.bcf` Delly for long reads from PacBio or ONT --------------------------------------- Delly also has a long-read (lr) SV discovery mode. `delly lr -y ont -g hg19.fa -x hg19.excl input.bam` `delly lr -y pb -g hg19.fa -x hg19.excl input.bam` Read-depth profiles ------------------- You can generate read-depth profiles with delly. 
This requires a mappability map which can be downloaded here: [Mappability Maps](https://gear.embl.de/data/delly/) The command to count reads in 10kbp mappable windows and normalize the coverage is: `delly cnv -a -g hg19.fa -m hg19.map input.bam` The output file `out.cov.gz` can be plotted using [R](https://www.r-project.org/) to generate normalized copy-number profiles: `Rscript R/rd.R out.cov.gz` Copy-number segmentation ------------------------ Read-depth profiles can also be segmented at the same time. `delly cnv -a -u -g hg19.fa -m hg19.map input.bam` The segmentation is in VCF format but you can extract a BED-like file using bcftools. `bcftools query -f "%CHROM\t%POS\t%INFO/END\t%ID\t[%RDCN]\n" cnv.bcf > segmentation.bed` Plotting: `Rscript R/rd.R out.cov.gz segmentation.bed` Germline CNV calling -------------------- Delly uses GC and mappability fragment correction to call CNVs. This requires a [mappability map](https://gear.embl.de/data/delly/). * Call CNVs for each sample and optionally refine breakpoints using delly SV calls `delly cnv -o c1.bcf -g hg19.fa -m hg19.map -l delly.sv.bcf input.bam` * Merge CNVs into a unified site list `delly merge -e -p -o sites.bcf -m 1000 -n 100000 c1.bcf c2.bcf ... cN.bcf` * Genotype CNVs for each sample `delly cnv -u -v sites.bcf -g hg19.fa -m hg19.map -o geno1.bcf input.bam` * Merge genotypes using [bcftools](https://github.com/samtools/bcftools) `bcftools merge -m id -O b -o merged.bcf geno1.bcf ... genoN.bcf` * Filter for germline CNVs `delly classify -f germline -o filtered.bcf merged.bcf` * Optional: Plot copy-number distribution for large number of samples (>>100) `bcftools query -f "%ID[\t%RDCN]\n" filtered.bcf > plot.tsv` `Rscript R/cnv.R plot.tsv` Somatic copy-number alterations (SCNAs) --------------------------------------- * For somatic copy-number alterations, delly first segments the tumor genome (`-u` is required). 
Depending on the coverage, tumor purity and heterogeneity you can adapt parameters `-z`, `-t` and `-x` which control the sensitivity of SCNA detection. `delly cnv -u -z 10000 -o tumor.bcf -c tumor.cov.gz -g hg19.fa -m hg19.map tumor.bam` * Then these tumor SCNAs are genotyped in the control sample (`-u` is required). `delly cnv -u -v tumor.bcf -o control.bcf -g hg19.fa -m hg19.map control.bam` * The VCF IDs are matched between tumor and control. Thus, you can merge both files using [bcftools](https://github.com/samtools/bcftools). `bcftools merge -m id -O b -o tumor_control.bcf tumor.bcf control.bcf` * Somatic filtering requires a tab-delimited sample description file where the first column is the sample id (as in the VCF/BCF file) and the second column is either tumor or control. `delly classify -p -f somatic -o somatic.bcf -s samples.tsv tumor_control.bcf` * Optional: Plot the SCNAs using bcftools and R. `bcftools query -s tumor -f "%CHROM\t%POS\t%INFO/END\t%ID\t[%RDCN]\n" somatic.bcf > segmentation.bed` `Rscript R/rd.R tumor.cov.gz segmentation.bed` FAQ --- * Visualization of SVs You may want to try out [wally](https://github.com/tobiasrausch/wally) to plot candidate structural variants. The paired-end coloring is explained in [wally's README](https://github.com/tobiasrausch/wally#paired-end-view) file. * What is the smallest SV size Delly can call? This depends on the sharpness of the insert size distribution. For an insert size of 200-300bp with a 20-30bp standard deviation, Delly starts to call reliable SVs >=300bp. Delly also supports calling of small InDels using soft-clipped reads only, the smallest SV size called is 15bp. * Can Delly be used on a non-diploid genome? Yes and no. The SV site discovery works for any ploidy. However, Delly's genotyping model assumes diploidy (hom. reference, het. and hom. alternative). The CNV calling allows to set the baseline ploidy on the command-line. * Delly is running too slowly what can I do? 
You should exclude telomere and centromere regions and also all unplaced contigs. Delly ships with such an exclude list for human and mouse samples. In addition, you can filter input reads more stringently using -q 20 and -s 15. Lastly, `-z` can be set to 5 for high-coverage data. * Are non-unique alignments, multi-mappings and/or multiple split-read alignments allowed? Delly expects two alignment records in the bam file for every paired-end, one for the first and one for the second read. Multiple split-read alignment records of a given read are allowed if and only if one of them is a primary alignment whereas all others are marked as secondary or supplementary (flag 0x0100 or flag 0x0800). This is the default for bwa mem. * What pre-processing of bam files is required? Bam files need to be sorted, indexed and ideally duplicate marked. * Usage/discussion mailing list? There is a delly discussion group [delly-users](http://groups.google.com/d/forum/delly-users). * Docker/Singularity support? There is a dockerized delly available [here](https://hub.docker.com/r/dellytools/delly/) and singularity containers (*.sif files) are part of the [delly release](https://github.com/dellytools/delly/releases). * How can I compute a mappability map? A basic mappability map can be built using [dicey](https://github.com/gear-genomics/dicey), [samtools](https://github.com/samtools/samtools) and [bwa](https://github.com/lh3/bwa) with the below commands (as an example for the sacCer3 reference): ``` dicey chop sacCer3.fa bwa index sacCer3.fa bwa mem sacCer3.fa read1.fq.gz read2.fq.gz | samtools sort -@ 8 -o srt.bam - samtools index srt.bam dicey mappability2 srt.bam gunzip map.fa.gz && bgzip map.fa && samtools faidx map.fa.gz ``` * Bioconda support? Delly is available via [bioconda](http://bioconda.github.io/recipes/delly/README.html). Citation -------- Tobias Rausch, Thomas Zichner, Andreas Schlattl, Adrian M. Stuetz, Vladimir Benes, Jan O. Korbel. 
DELLY: structural variant discovery by integrated paired-end and split-read analysis. Bioinformatics. 2012 Sep 15;28(18):i333-i339. [https://doi.org/10.1093/bioinformatics/bts378](https://doi.org/10.1093/bioinformatics/bts378) License ------- Delly is distributed under the BSD 3-Clause license. Consult the accompanying [LICENSE](https://github.com/dellytools/delly/blob/master/LICENSE) file for more details. delly-0.9.1/excludeTemplates/000077500000000000000000000000001414764127700161575ustar00rootroot00000000000000delly-0.9.1/excludeTemplates/drosophila.dm6.excl.tsv000066400000000000000000000243501414764127700225040ustar00rootroot00000000000000chr4 1200662 1217662 other chrX 21907215 21907315 other chrX 22260554 22260654 other chrX 22281172 22281192 other chrX 22282923 22314923 other chrX 22317804 22349804 other chrX 22354228 22354328 other chrX 23020991 23021091 other chrX 23321165 23321265 other chrX 23356567 23358067 other chrX 23473918 23474018 other chrY 1490 8990 other chrY 22086 22186 other chrY 22547 22647 other chrY 83124 83224 other chrY 87411 87911 other chrY 88684 88784 other chrY 110667 110967 other chrY 124109 124209 other chrY 137254 137354 other chrY 137530 137630 other chrY 138584 146084 other chrY 148890 148990 other chrY 200877 203437 other chrY 234471 234521 other chrY 240352 240852 other chrY 242135 242635 other chrY 260151 260251 other chrY 285127 285227 other chrY 350133 350233 other chrY 352081 352781 other chrY 353242 353342 other chrY 473565 473665 other chrY 497185 497285 other chrY 509883 509983 other chrY 522151 530151 other chrY 531415 535415 other chrY 557868 557968 other chrY 561910 562010 other chrY 564137 564237 other chrY 564811 572811 other chrY 573757 581757 other chrY 582268 582368 other chrY 629765 629865 other chrY 631005 633005 other chrY 641403 648403 other chrY 650390 650490 other chrY 652830 652930 other chrY 656009 656109 other chrY 671898 671998 other chrY 846040 846140 other chrY 846883 846983 other chrY 847717 847817 
other chrY 848447 848547 other chrY 849455 849555 other chrY 850620 850720 other chrY 851478 851578 other chrY 852533 852633 other chrY 853437 853537 other chrY 855983 856083 other chrY 856797 856897 other chrY 857952 858052 other chrY 858779 858879 other chrY 859570 859670 other chrY 861325 861425 other chrY 862598 862698 other chrY 866737 866837 other chrY 868643 868743 other chrY 869791 869891 other chrY 872579 872679 other chrY 873940 874040 other chrY 875069 875169 other chrY 877729 877829 other chrY 879601 879701 other chrY 880964 881064 other chrY 882111 882211 other chrY 883396 883496 other chrY 886869 886969 other chrY 888001 888101 other chrY 889656 889756 other chrY 895558 895658 other chrY 897841 897941 other chrY 899771 899871 other chrY 902012 902112 other chrY 904758 904858 other chrY 907587 907687 other chrY 908814 908914 other chrY 912439 912539 other chrY 914301 914401 other chrY 916337 916437 other chrY 918354 918454 other chrY 922041 922141 other chrY 924135 924235 other chrY 925588 925688 other chrY 927554 927654 other chrY 929368 929468 other chrY 931732 931832 other chrY 933309 933409 other chrY 936588 936688 other chrY 939566 939666 other chrY 942020 942120 other chrY 943392 943492 other chrY 945145 945245 other chrY 946300 946400 other chrY 947403 947503 other chrY 948654 948754 other chrY 951072 951172 other chrY 952498 952598 other chrY 955434 955534 other chrY 958063 958163 other chrY 959383 959483 other chrY 960542 968542 other chrY 970063 970563 other chrY 971301 971401 other chrY 973757 973857 other chrY 976078 976178 other chrY 977521 977621 other chrY 979429 979529 other chrY 980897 980997 other chrY 983135 983235 other chrY 986927 987027 other chrY 988093 988193 other chrY 992262 992362 other chrY 995840 995940 other chrY 998682 998782 other chrY 1004639 1004739 other chrY 1009359 1009459 other chrY 1014100 1014200 other chrY 1019730 1019830 other chrY 1029671 1029771 other chrY 1032946 1033046 other chrY 1035716 1035816 other chrY 
1039082 1039182 other chrY 1042549 1050549 other chrY 1053927 1054027 other chrY 1164724 1165724 other chrY 1215120 1215220 other chrY 1322854 1332854 other chrY 1416803 1416903 other chrY 1457079 1457179 other chrY 1635807 1635907 other chrY 1638203 1638303 other chrY 1642583 1642683 other chrY 1643213 1651213 other chrY 1651664 1651764 other chrY 1655357 1655457 other chrY 1657658 1657758 other chrY 1664050 1664150 other chrY 1665409 1665509 other chrY 1667574 1667674 other chrY 1747133 1747233 other chrY 1748407 1748507 other chrY 1749730 1749830 other chrY 1751361 1751461 other chrY 1810720 1810820 other chrY 1987425 1987475 other chrY 2004934 2005034 other chrY 2086956 2087056 other chrY 2173541 2173554 other chrY 2404888 2404988 other chrY 2405775 2411975 other chrY 2415473 2421673 other chrY 2422375 2422505 other chrY 2423783 2423883 other chrY 2434904 2435104 other chrY 2436405 2436605 other chrY 2496120 2496220 other chrY 2667028 2667128 other chrY 2829290 2829390 other chrY 2854073 2859073 other chrY 2878833 2883833 other chrY 2886692 2891692 other chrY 2902230 2907230 other chrY 2913718 2918718 other chrY 2935790 2940790 other chrY 2954430 2959430 other chrY 2972594 2977594 other chrY 3011908 3012008 other chrY 3024026 3024126 other chrY 3025146 3025246 other chrY 3026143 3026243 other chrY 3044454 3044554 other chrY 3046528 3046628 other chrY 3056790 3056890 other chrY 3255043 3255143 other chrY 3260891 3262891 other chrY 3284038 3284138 other chrY 3291605 3293605 other chrY 3298126 3301826 other chrY 3303556 3318556 other chrY 3319442 3323142 other chrY 3323855 3327555 other chrY 3328279 3331979 other chrY 3332699 3336399 other chrY 3337123 3340823 other chrY 3341546 3345246 other chrY 3345950 3349650 other chrY 3350374 3354074 other chrY 3354870 3358570 other chrY 3359286 3362886 other chrY 3367786 3370886 other chrY 3371602 3375802 other chrY 3376984 3377084 other chrY 3536284 3536384 other chrY 3560203 3560303 other chrY 3560504 3561504 other chrY 
3562638 3562738 other chrY 3565630 3565730 other chrY 3567184 3567284 other chrY 3580469 3580569 other chrY 3581825 3590025 other chrY 3591385 3595550 other chrY 3596309 3600474 other chrY 3641170 3647170 other chrY 3648134 3648634 other chrY 3649394 3649494 other chrY 3652786 3652886 other chrY 3654376 3654476 other chrY 3657280 3663280 other chrY 3664653 3664703 other chrY 3665041 3665091 other chrY 3666483 3666583 other chr2L 21485538 21485638 other chr2L 22420241 22420341 other chr2R 4368 4468 other chr2R 24426 24526 other chr2R 416884 422884 other chr2R 748931 749031 other chr2R 1472350 1472450 other chr2R 1826183 1826283 other chr2R 3759567 3759667 other chr2R 20780707 20780807 other chr3L 5107766 5114766 other chr3L 24592984 24593084 other chr3L 24637105 24637205 other chr3L 26400914 26401014 other chr3L 26403999 26406999 other chr3L 26882321 26936121 other chr3L 26952525 27006385 other chr3L 27015779 27015879 other chr3L 27549160 27549260 other chr3R 27270 27370 other chr3R 30824 30924 other chr3R 44499 50290 other chr3R 52604 52704 other chr3R 122625 122725 other chr3R 168639 168739 other chr3R 169033 172514 other chr3R 189145 189245 other chr3R 268538 268638 other chr3R 304889 304989 other chr3R 1461765 1461865 other chr3R 2409541 2409641 other chr3R 2446426 2446526 other chr3R 2488683 2488783 other chr3R 2492820 2494720 other chr3R 2498072 2499972 other chr3R 2515632 2516632 other chr3R 2519338 2520338 other chr3R 2522591 2522691 other chr3R 2527791 2530091 other chr3R 2536031 2539431 other chr3R 2543862 2543962 other chr3R 3033774 3033874 other chr3R 3169914 3170014 other chr3R 4009644 4011644 other chr3R 4012024 4012124 other chr3R 4174178 4174278 other chrUn_CP007071v1 chrUn_CP007072v1 chrUn_CP007076v1 chrUn_CP007077v1 chrUn_CP007079v1 chrUn_CP007080v1 chrUn_CP007082v1 chrUn_CP007083v1 chrUn_CP007084v1 chrUn_CP007086v1 chrUn_CP007087v1 chrUn_CP007089v1 chrUn_CP007090v1 chrUn_CP007094v1 chrUn_CP007095v1 chrUn_CP007096v1 chrUn_CP007098v1 
chrUn_CP007099v1 chrUn_CP007102v1 chrUn_CP007105v1 chrUn_CP007120v1 chrUn_DS483629v1 chrUn_DS483641v1 chrUn_DS483646v1 chrUn_DS483647v1 chrUn_DS483649v1 chrUn_DS483650v1 chrUn_DS483658v1 chrUn_DS483662v1 chrUn_DS483663v1 chrUn_DS483670v1 chrUn_DS483673v1 chrUn_DS483674v1 chrUn_DS483675v1 chrUn_DS483678v1 chrUn_DS483679v1 chrUn_DS483680v1 chrUn_DS483681v1 chrUn_DS483682v1 chrUn_DS483686v1 chrUn_DS483687v1 chrUn_DS483688v1 chrUn_DS483689v1 chrUn_DS483692v1 chrUn_DS483693v1 chrUn_DS483694v1 chrUn_DS483695v1 chrUn_DS483700v1 chrUn_DS483701v1 chrUn_DS483702v1 chrUn_DS483703v1 chrUn_DS483705v1 chrUn_DS483707v1 chrUn_DS483709v1 chrUn_DS483711v1 chrUn_DS483712v1 chrUn_DS483719v1 chrUn_DS483723v1 chrUn_DS483724v1 chrUn_DS483726v1 chrUn_DS483728v1 chrUn_DS483734v1 chrUn_DS483735v1 chrUn_DS483736v1 chrUn_DS483737v1 chrUn_DS483738v1 chrUn_DS483739v1 chrUn_DS483740v1 chrUn_DS483741v1 chrUn_DS483743v1 chrUn_DS483746v1 chrUn_DS483754v1 chrUn_DS483758v1 chrUn_DS483759v1 chrUn_DS483760v1 chrUn_DS483767v1 chrUn_DS483773v1 chrUn_DS483782v1 chrUn_DS483797v1 chrUn_DS483799v1 chrUn_DS483801v1 chrUn_DS483804v1 chrUn_DS483805v1 chrUn_DS483808v1 chrUn_DS483815v1 chrUn_DS483816v1 chrUn_DS483820v1 chrUn_DS483824v1 chrUn_DS483825v1 chrUn_DS483837v1 chrUn_DS483841v1 chrUn_DS483846v1 chrUn_DS483858v1 chrUn_DS483863v1 chrUn_DS483866v1 chrUn_DS483868v1 chrUn_DS483873v1 chrUn_DS483874v1 chrUn_DS483876v1 chrUn_DS483886v1 chrUn_DS483901v1 chrUn_DS483904v1 chrUn_DS483908v1 chrUn_DS483913v1 chrUn_DS483919v1 chrUn_DS483936v1 chrUn_DS483938v1 chrUn_DS483939v1 chrUn_DS483957v1 chrUn_DS483964v1 chrUn_DS483965v1 chrUn_DS483977v1 chrUn_DS484013v1 chrUn_DS484017v1 chrUn_DS484040v1 chrUn_DS484055v1 chrUn_DS484066v1 chrUn_DS484076v1 chrUn_DS484096v1 chrUn_DS484104v1 chrUn_DS484122v1 chrUn_DS484152v1 chrUn_DS484156v1 chrUn_DS484189v1 chrUn_DS484192v1 chrUn_DS484196v1 chrUn_DS484205v1 chrUn_DS484206v1 chrUn_DS484207v1 chrUn_DS484210v1 chrUn_DS484229v1 chrUn_DS484232v1 chrUn_DS484238v1 chrUn_DS484242v1 
chrUn_DS484251v1 chrUn_DS484267v1 chrUn_DS484274v1 chrUn_DS484311v1 chrUn_DS484317v1 chrUn_DS484321v1 chrUn_DS484350v1 chrX_CP007103v1_random chrX_CP007104v1_random chrX_DS483648v1_random chrX_DS483655v1_random chrX_DS483660v1_random chrX_DS483665v1_random chrX_DS483666v1_random chrX_DS483669v1_random chrX_DS483685v1_random chrX_DS483698v1_random chrX_DS483905v1_random chrX_DS484002v1_random chrX_DS484072v1_random chrX_DS484085v1_random chrX_DS484088v1_random chrX_DS484268v1_random chrX_DS484272v1_random chrX_DS484293v1_random chrX_DS484368v1_random chrY_CP007107v1_random chrY_CP007108v1_random chrY_CP007109v1_random chrY_CP007111v1_random chrY_CP007112v1_random chrY_CP007113v1_random chrY_CP007116v1_random chrY_CP007118v1_random chrY_CP007119v1_random chrY_DS483677v1_random chrY_DS483690v1_random chrY_DS483725v1_random chrY_DS483742v1_random chrY_DS483778v1_random chrY_DS483788v1_random chrY_DS483790v1_random chrY_DS483959v1_random chrY_DS483967v1_random chrY_DS483987v1_random chrY_DS484043v1_random chrY_DS484128v1_random chrY_DS484171v1_random chrY_DS484197v1_random chrY_DS484233v1_random chrY_DS484249v1_random chrY_DS484266v1_random delly-0.9.1/excludeTemplates/human.hg19.excl.tsv000066400000000000000000000133761414764127700215400ustar00rootroot00000000000000chr1 0 10000 telomere chr1 121535434 124535434 centromere chr1 249240621 249250621 telomere chr10 0 10000 telomere chr10 39254935 42254935 centromere chr10 135524747 135534747 telomere chr11 0 10000 telomere chr11 51644205 54644205 centromere chr11 134996516 135006516 telomere chr12 0 10000 telomere chr12 34856694 37856694 centromere chr12 133841895 133851895 telomere chr13 0 10000 telomere chr13 16000000 19000000 centromere chr13 115159878 115169878 telomere chr14 0 10000 telomere chr14 16000000 19000000 centromere chr14 107339540 107349540 telomere chr15 0 10000 telomere chr15 17000000 20000000 centromere chr15 102521392 102531392 telomere chr16 0 10000 telomere chr16 35335801 38335801 centromere chr16 
46380000 46450000 lowcomplexity chr16 90344753 90354753 telomere chr17 22263006 25263006 centromere chr18 0 10000 telomere chr18 15460898 18460898 centromere chr18 78067248 78077248 telomere chr19 0 10000 telomere chr19 24681782 27681782 centromere chr19 59118983 59128983 telomere chr2 0 10000 telomere chr2 33141000 33142000 lowcomplexity chr2 92326171 95326171 centromere chr2 243189373 243199373 telomere chr20 0 10000 telomere chr20 26369569 29369569 centromere chr20 63015520 63025520 telomere chr21 0 10000 telomere chr21 11288129 14288129 centromere chr21 48119895 48129895 telomere chr22 0 10000 telomere chr22 13000000 16000000 centromere chr22 51294566 51304566 telomere chr3 0 10000 telomere chr3 90504854 93504854 centromere chr3 198012430 198022430 telomere chr4 0 10000 telomere chr4 49660117 52660117 centromere chr4 191144276 191154276 telomere chr5 0 10000 telomere chr5 46405641 49405641 centromere chr5 180905260 180915260 telomere chr6 0 10000 telomere chr6 58830166 61830166 centromere chr6 171105067 171115067 telomere chr7 0 10000 telomere chr7 58054331 61054331 centromere chr7 159128663 159138663 telomere chr8 0 10000 telomere chr8 43793000 46857000 centromere chr8 146354022 146364022 telomere chr9 0 10000 telomere chr9 47367679 50367679 centromere chr9 141203431 141213431 telomere chrX 0 10000 telomere chrX 58632012 61632012 centromere chrX 155260560 155270560 telomere chrY 0 10000 telomere chrY 10104553 13104553 centromere chrY 59363566 59373566 telomere 1 0 10000 telomere 1 121535434 124535434 centromere 1 249240621 249250621 telomere 10 0 10000 telomere 10 39254935 42254935 centromere 10 135524747 135534747 telomere 11 0 10000 telomere 11 51644205 54644205 centromere 11 134996516 135006516 telomere 12 0 10000 telomere 12 34856694 37856694 centromere 12 133841895 133851895 telomere 13 0 10000 telomere 13 16000000 19000000 centromere 13 115159878 115169878 telomere 14 0 10000 telomere 14 16000000 19000000 centromere 14 107339540 107349540 telomere 15 0 
10000 telomere 15 17000000 20000000 centromere 15 102521392 102531392 telomere 16 0 10000 telomere 16 35335801 38335801 centromere 16 46380000 46450000 lowcomplexity 16 90344753 90354753 telomere 17 22263006 25263006 centromere 18 0 10000 telomere 18 15460898 18460898 centromere 18 78067248 78077248 telomere 19 0 10000 telomere 19 24681782 27681782 centromere 19 59118983 59128983 telomere 2 0 10000 telomere 2 33141000 33142000 lowcomplexity 2 92326171 95326171 centromere 2 243189373 243199373 telomere 20 0 10000 telomere 20 26369569 29369569 centromere 20 63015520 63025520 telomere 21 0 10000 telomere 21 11288129 14288129 centromere 21 48119895 48129895 telomere 22 0 10000 telomere 22 13000000 16000000 centromere 22 51294566 51304566 telomere 3 0 10000 telomere 3 90504854 93504854 centromere 3 198012430 198022430 telomere 4 0 10000 telomere 4 49660117 52660117 centromere 4 191144276 191154276 telomere 5 0 10000 telomere 5 46405641 49405641 centromere 5 180905260 180915260 telomere 6 0 10000 telomere 6 58830166 61830166 centromere 6 171105067 171115067 telomere 7 0 10000 telomere 7 58054331 61054331 centromere 7 159128663 159138663 telomere 8 0 10000 telomere 8 43793000 46857000 centromere 8 146354022 146364022 telomere 9 0 10000 telomere 9 47367679 50367679 centromere 9 141203431 141213431 telomere X 0 10000 telomere X 58632012 61632012 centromere X 155260560 155270560 telomere Y 0 10000 telomere Y 10104553 13104553 centromere Y 59363566 59373566 telomere chrM chrMT MT GL000207.1 GL000226.1 GL000229.1 GL000231.1 GL000210.1 GL000239.1 GL000235.1 GL000201.1 GL000247.1 GL000245.1 GL000197.1 GL000203.1 GL000246.1 GL000249.1 GL000196.1 GL000248.1 GL000244.1 GL000238.1 GL000202.1 GL000234.1 GL000232.1 GL000206.1 GL000240.1 GL000236.1 GL000241.1 GL000243.1 GL000242.1 GL000230.1 GL000237.1 GL000233.1 GL000204.1 GL000198.1 GL000208.1 GL000191.1 GL000227.1 GL000228.1 GL000214.1 GL000221.1 GL000209.1 GL000218.1 GL000220.1 GL000213.1 GL000211.1 GL000199.1 GL000217.1 GL000216.1 
GL000215.1 GL000205.1 GL000219.1 GL000224.1 GL000223.1 GL000195.1 GL000212.1 GL000222.1 GL000200.1 GL000193.1 GL000194.1 GL000225.1 GL000192.1 NC_007605 hs37d5 chr11_gl000202_random chr17_gl000203_random chr17_gl000204_random chr17_gl000205_random chr17_gl000206_random chr18_gl000207_random chr19_gl000208_random chr19_gl000209_random chr1_gl000191_random chr1_gl000192_random chr21_gl000210_random chr4_gl000193_random chr4_gl000194_random chr7_gl000195_random chr8_gl000196_random chr8_gl000197_random chr9_gl000198_random chr9_gl000199_random chr9_gl000200_random chr9_gl000201_random chrUn_gl000211 chrUn_gl000212 chrUn_gl000213 chrUn_gl000214 chrUn_gl000215 chrUn_gl000216 chrUn_gl000217 chrUn_gl000218 chrUn_gl000219 chrUn_gl000220 chrUn_gl000221 chrUn_gl000222 chrUn_gl000223 chrUn_gl000224 chrUn_gl000225 chrUn_gl000226 chrUn_gl000227 chrUn_gl000228 chrUn_gl000229 chrUn_gl000230 chrUn_gl000231 chrUn_gl000232 chrUn_gl000233 chrUn_gl000234 chrUn_gl000235 chrUn_gl000236 chrUn_gl000237 chrUn_gl000238 chrUn_gl000239 chrUn_gl000240 chrUn_gl000241 chrUn_gl000242 chrUn_gl000243 chrUn_gl000244 chrUn_gl000245 chrUn_gl000246 chrUn_gl000247 chrUn_gl000248 chrUn_gl000249 delly-0.9.1/excludeTemplates/human.hg38.excl.tsv000066400000000000000000002446521414764127700215440ustar00rootroot00000000000000chr1 0 10000 telomere chr1 248946422 248956422 telomere chr10 0 10000 telomere chr10 133787422 133797422 telomere chr11 0 10000 telomere chr11 135076622 135086622 telomere chr12 0 10000 telomere chr12 133265309 133275309 telomere chr13 0 10000 telomere chr13 114354328 114364328 telomere chr14 0 10000 telomere chr14 107033718 107043718 telomere chr15 0 10000 telomere chr15 101981189 101991189 telomere chr16 0 10000 telomere chr16 90328345 90338345 telomere chr17 0 10000 telomere chr17 83247441 83257441 telomere chr18 0 10000 telomere chr18 80363285 80373285 telomere chr19 0 10000 telomere chr19 58607616 58617616 telomere chr2 0 10000 telomere chr2 242183529 242193529 telomere chr20 0 10000 
telomere chr20 64434167 64444167 telomere chr21 0 10000 telomere chr21 46699983 46709983 telomere chr22 0 10000 telomere chr22 50808468 50818468 telomere chr3 0 10000 telomere chr3 198285559 198295559 telomere chr4 0 10000 telomere chr4 190204555 190214555 telomere chr5 0 10000 telomere chr5 181528259 181538259 telomere chr6 0 10000 telomere chr6 170795979 170805979 telomere chr7 0 10000 telomere chr7 159335973 159345973 telomere chr8 0 10000 telomere chr8 145128636 145138636 telomere chr9 0 10000 telomere chr9 138384717 138394717 telomere chrX 0 10000 telomere chrX 156030895 156040895 telomere chrY 0 10000 telomere chrY 57217415 57227415 telomere 1 0 10000 telomere 1 248946422 248956422 telomere 10 0 10000 telomere 10 133787422 133797422 telomere 11 0 10000 telomere 11 135076622 135086622 telomere 12 0 10000 telomere 12 133265309 133275309 telomere 13 0 10000 telomere 13 114354328 114364328 telomere 14 0 10000 telomere 14 107033718 107043718 telomere 15 0 10000 telomere 15 101981189 101991189 telomere 16 0 10000 telomere 16 90328345 90338345 telomere 17 0 10000 telomere 17 83247441 83257441 telomere 18 0 10000 telomere 18 80363285 80373285 telomere 19 0 10000 telomere 19 58607616 58617616 telomere 2 0 10000 telomere 2 242183529 242193529 telomere 20 0 10000 telomere 20 64434167 64444167 telomere 21 0 10000 telomere 21 46699983 46709983 telomere 22 0 10000 telomere 22 50808468 50818468 telomere 3 0 10000 telomere 3 198285559 198295559 telomere 4 0 10000 telomere 4 190204555 190214555 telomere 5 0 10000 telomere 5 181528259 181538259 telomere 6 0 10000 telomere 6 170795979 170805979 telomere 7 0 10000 telomere 7 159335973 159345973 telomere 8 0 10000 telomere 8 145128636 145138636 telomere 9 0 10000 telomere 9 138384717 138394717 telomere X 0 10000 telomere X 156030895 156040895 telomere Y 0 10000 telomere Y 57217415 57227415 telomere 1 122026460 125184587 centromere 2 92188146 94090557 centromere 3 90772459 93655574 centromere 4 49708101 51743951 centromere 5 
46485901 50059807 centromere 6 58553889 59829934 centromere 7 58169654 60828234 centromere 7 61377789 61528020 hetrochromatin 8 44033745 45877265 centromere 9 43236168 45518558 centromere 10 39686683 41593521 centromere 11 51078349 54425074 centromere 12 34769408 37185252 centromere 13 16000001 18051248 centromere 14 16000001 18173523 centromere 15 17000001 19725254 centromere 16 36311159 38280682 centromere 17 22813680 26885980 centromere 18 15460900 20861206 centromere 19 24498981 27190874 centromere 20 26436233 30038348 centromere 21 10864561 12915808 centromere 22 12954789 15054318 centromere X 58605580 62412542 centromere Y 10001 2781479 PAR Y 10316945 10544039 centromere Y 56887903 57217415 PAR chr1 122026460 125184587 centromere chr2 92188146 94090557 centromere chr3 90772459 93655574 centromere chr4 49708101 51743951 centromere chr5 46485901 50059807 centromere chr6 58553889 59829934 centromere chr7 58169654 60828234 centromere chr7 61377789 61528020 hetrochromatin chr8 44033745 45877265 centromere chr9 43236168 45518558 centromere chr10 39686683 41593521 centromere chr11 51078349 54425074 centromere chr12 34769408 37185252 centromere chr13 16000001 18051248 centromere chr14 16000001 18173523 centromere chr15 17000001 19725254 centromere chr16 36311159 38280682 centromere chr17 22813680 26885980 centromere chr18 15460900 20861206 centromere chr19 24498981 27190874 centromere chr20 26436233 30038348 centromere chr21 10864561 12915808 centromere chr22 12954789 15054318 centromere chrX 58605580 62412542 centromere chrY 10001 2781479 PAR chrY 10316945 10544039 centromere chrY 56887903 57217415 PAR chr1 125184587 143184587 heterochromatin chr11 50871348 51078348 heterochromatin chr13 18051248 18071248 heterochromatin chr16 38280682 46280682 heterochromatin chr19 24448980 24498980 heterochromatin chr19 27190874 27240874 heterochromatin chr2 90402511 91402511 heterochromatin chr21 10814560 10864560 heterochromatin chr7 62456779 62506779 heterochromatin chr9 
45518558 60518558 heterochromatin chrY 26673214 56673214 heterochromatin 1 125184587 143184587 heterochromatin 11 50871348 51078348 heterochromatin 13 18051248 18071248 heterochromatin 16 38280682 46280682 heterochromatin 19 24448980 24498980 heterochromatin 19 27190874 27240874 heterochromatin 2 90402511 91402511 heterochromatin 21 10814560 10864560 heterochromatin 7 62456779 62506779 heterochromatin 9 45518558 60518558 heterochromatin Y 26673214 56673214 heterochromatin chrM M chrMT MT chr1_GL383518v1_alt chr1_GL383519v1_alt chr1_GL383520v2_alt chr1_KI270706v1_random chr1_KI270707v1_random chr1_KI270708v1_random chr1_KI270709v1_random chr1_KI270710v1_random chr1_KI270711v1_random chr1_KI270712v1_random chr1_KI270713v1_random chr1_KI270714v1_random chr1_KI270759v1_alt chr1_KI270760v1_alt chr1_KI270761v1_alt chr1_KI270762v1_alt chr1_KI270763v1_alt chr1_KI270764v1_alt chr1_KI270765v1_alt chr1_KI270766v1_alt chr1_KI270892v1_alt chr2_GL383521v1_alt chr2_GL383522v1_alt chr2_GL582966v2_alt chr2_KI270715v1_random chr2_KI270716v1_random chr2_KI270767v1_alt chr2_KI270768v1_alt chr2_KI270769v1_alt chr2_KI270770v1_alt chr2_KI270771v1_alt chr2_KI270772v1_alt chr2_KI270773v1_alt chr2_KI270774v1_alt chr2_KI270775v1_alt chr2_KI270776v1_alt chr2_KI270893v1_alt chr2_KI270894v1_alt chr3_GL000221v1_random chr3_GL383526v1_alt chr3_JH636055v2_alt chr3_KI270777v1_alt chr3_KI270778v1_alt chr3_KI270779v1_alt chr3_KI270780v1_alt chr3_KI270781v1_alt chr3_KI270782v1_alt chr3_KI270783v1_alt chr3_KI270784v1_alt chr3_KI270895v1_alt chr3_KI270924v1_alt chr3_KI270934v1_alt chr3_KI270935v1_alt chr3_KI270936v1_alt chr3_KI270937v1_alt chr4_GL000008v2_random chr4_GL000257v2_alt chr4_GL383527v1_alt chr4_GL383528v1_alt chr4_KI270785v1_alt chr4_KI270786v1_alt chr4_KI270787v1_alt chr4_KI270788v1_alt chr4_KI270789v1_alt chr4_KI270790v1_alt chr4_KI270896v1_alt chr4_KI270925v1_alt chr5_GL000208v1_random chr5_GL339449v2_alt chr5_GL383530v1_alt chr5_GL383531v1_alt chr5_GL383532v1_alt chr5_GL949742v1_alt 
chr5_KI270791v1_alt chr5_KI270792v1_alt chr5_KI270793v1_alt chr5_KI270794v1_alt chr5_KI270795v1_alt chr5_KI270796v1_alt chr5_KI270897v1_alt chr5_KI270898v1_alt chr6_GL000250v2_alt chr6_GL000251v2_alt chr6_GL000252v2_alt chr6_GL000253v2_alt chr6_GL000254v2_alt chr6_GL000255v2_alt chr6_GL000256v2_alt chr6_GL383533v1_alt chr6_KB021644v2_alt chr6_KI270758v1_alt chr6_KI270797v1_alt chr6_KI270798v1_alt chr6_KI270799v1_alt chr6_KI270800v1_alt chr6_KI270801v1_alt chr6_KI270802v1_alt chr7_GL383534v2_alt chr7_KI270803v1_alt chr7_KI270804v1_alt chr7_KI270805v1_alt chr7_KI270806v1_alt chr7_KI270807v1_alt chr7_KI270808v1_alt chr7_KI270809v1_alt chr7_KI270899v1_alt chr8_KI270810v1_alt chr8_KI270811v1_alt chr8_KI270812v1_alt chr8_KI270813v1_alt chr8_KI270814v1_alt chr8_KI270815v1_alt chr8_KI270816v1_alt chr8_KI270817v1_alt chr8_KI270818v1_alt chr8_KI270819v1_alt chr8_KI270820v1_alt chr8_KI270821v1_alt chr8_KI270822v1_alt chr8_KI270900v1_alt chr8_KI270901v1_alt chr8_KI270926v1_alt chr9_GL383539v1_alt chr9_GL383540v1_alt chr9_GL383541v1_alt chr9_GL383542v1_alt chr9_KI270717v1_random chr9_KI270718v1_random chr9_KI270719v1_random chr9_KI270720v1_random chr9_KI270823v1_alt chr10_GL383545v1_alt chr10_GL383546v1_alt chr10_KI270824v1_alt chr10_KI270825v1_alt chr11_GL383547v1_alt chr11_JH159136v1_alt chr11_JH159137v1_alt chr11_KI270721v1_random chr11_KI270826v1_alt chr11_KI270827v1_alt chr11_KI270829v1_alt chr11_KI270830v1_alt chr11_KI270831v1_alt chr11_KI270832v1_alt chr11_KI270902v1_alt chr11_KI270903v1_alt chr11_KI270927v1_alt chr12_GL383549v1_alt chr12_GL383550v2_alt chr12_GL383551v1_alt chr12_GL383552v1_alt chr12_GL383553v2_alt chr12_GL877875v1_alt chr12_GL877876v1_alt chr12_KI270833v1_alt chr12_KI270834v1_alt chr12_KI270835v1_alt chr12_KI270836v1_alt chr12_KI270837v1_alt chr12_KI270904v1_alt chr13_KI270838v1_alt chr13_KI270839v1_alt chr13_KI270840v1_alt chr13_KI270841v1_alt chr13_KI270842v1_alt chr13_KI270843v1_alt chr14_GL000009v2_random chr14_GL000194v1_random 
chr14_GL000225v1_random chr14_KI270722v1_random chr14_KI270723v1_random chr14_KI270724v1_random chr14_KI270725v1_random chr14_KI270726v1_random chr14_KI270844v1_alt chr14_KI270845v1_alt chr14_KI270846v1_alt chr14_KI270847v1_alt chr15_GL383554v1_alt chr15_GL383555v2_alt chr15_KI270727v1_random chr15_KI270848v1_alt chr15_KI270849v1_alt chr15_KI270850v1_alt chr15_KI270851v1_alt chr15_KI270852v1_alt chr15_KI270905v1_alt chr15_KI270906v1_alt chr16_GL383556v1_alt chr16_GL383557v1_alt chr16_KI270728v1_random chr16_KI270853v1_alt chr16_KI270854v1_alt chr16_KI270855v1_alt chr16_KI270856v1_alt chr17_GL000205v2_random chr17_GL000258v2_alt chr17_GL383563v3_alt chr17_GL383564v2_alt chr17_GL383565v1_alt chr17_GL383566v1_alt chr17_JH159146v1_alt chr17_JH159147v1_alt chr17_JH159148v1_alt chr17_KI270729v1_random chr17_KI270730v1_random chr17_KI270857v1_alt chr17_KI270858v1_alt chr17_KI270859v1_alt chr17_KI270860v1_alt chr17_KI270861v1_alt chr17_KI270862v1_alt chr17_KI270907v1_alt chr17_KI270908v1_alt chr17_KI270909v1_alt chr17_KI270910v1_alt chr18_GL383567v1_alt chr18_GL383568v1_alt chr18_GL383569v1_alt chr18_GL383570v1_alt chr18_GL383571v1_alt chr18_GL383572v1_alt chr18_KI270863v1_alt chr18_KI270864v1_alt chr18_KI270911v1_alt chr18_KI270912v1_alt chr19_GL000209v2_alt chr19_GL383573v1_alt chr19_GL383574v1_alt chr19_GL383575v2_alt chr19_GL383576v1_alt chr19_GL949746v1_alt chr19_GL949747v2_alt chr19_GL949748v2_alt chr19_GL949749v2_alt chr19_GL949750v2_alt chr19_GL949751v2_alt chr19_GL949752v1_alt chr19_GL949753v2_alt chr19_KI270865v1_alt chr19_KI270866v1_alt chr19_KI270867v1_alt chr19_KI270868v1_alt chr19_KI270882v1_alt chr19_KI270883v1_alt chr19_KI270884v1_alt chr19_KI270885v1_alt chr19_KI270886v1_alt chr19_KI270887v1_alt chr19_KI270888v1_alt chr19_KI270889v1_alt chr19_KI270890v1_alt chr19_KI270891v1_alt chr19_KI270914v1_alt chr19_KI270915v1_alt chr19_KI270916v1_alt chr19_KI270917v1_alt chr19_KI270918v1_alt chr19_KI270919v1_alt chr19_KI270920v1_alt chr19_KI270921v1_alt 
chr19_KI270922v1_alt chr19_KI270923v1_alt chr19_KI270929v1_alt chr19_KI270930v1_alt chr19_KI270931v1_alt chr19_KI270932v1_alt chr19_KI270933v1_alt chr19_KI270938v1_alt chr20_GL383577v2_alt chr20_KI270869v1_alt chr20_KI270870v1_alt chr20_KI270871v1_alt chr21_GL383578v2_alt chr21_GL383579v2_alt chr21_GL383580v2_alt chr21_GL383581v2_alt chr21_KI270872v1_alt chr21_KI270873v1_alt chr21_KI270874v1_alt chr22_GL383582v2_alt chr22_GL383583v2_alt chr22_KB663609v1_alt chr22_KI270731v1_random chr22_KI270732v1_random chr22_KI270733v1_random chr22_KI270734v1_random chr22_KI270735v1_random chr22_KI270736v1_random chr22_KI270737v1_random chr22_KI270738v1_random chr22_KI270739v1_random chr22_KI270875v1_alt chr22_KI270876v1_alt chr22_KI270877v1_alt chr22_KI270878v1_alt chr22_KI270879v1_alt chr22_KI270928v1_alt chrUn_GL000195v1 chrUn_GL000213v1 chrUn_GL000214v1 chrUn_GL000216v2 chrUn_GL000218v1 chrUn_GL000219v1 chrUn_GL000220v1 chrUn_GL000224v1 chrUn_GL000226v1 chrUn_KI270302v1 chrUn_KI270303v1 chrUn_KI270304v1 chrUn_KI270305v1 chrUn_KI270310v1 chrUn_KI270311v1 chrUn_KI270312v1 chrUn_KI270315v1 chrUn_KI270316v1 chrUn_KI270317v1 chrUn_KI270320v1 chrUn_KI270322v1 chrUn_KI270329v1 chrUn_KI270330v1 chrUn_KI270333v1 chrUn_KI270334v1 chrUn_KI270335v1 chrUn_KI270336v1 chrUn_KI270337v1 chrUn_KI270338v1 chrUn_KI270340v1 chrUn_KI270362v1 chrUn_KI270363v1 chrUn_KI270364v1 chrUn_KI270366v1 chrUn_KI270371v1 chrUn_KI270372v1 chrUn_KI270373v1 chrUn_KI270374v1 chrUn_KI270375v1 chrUn_KI270376v1 chrUn_KI270378v1 chrUn_KI270379v1 chrUn_KI270381v1 chrUn_KI270382v1 chrUn_KI270383v1 chrUn_KI270384v1 chrUn_KI270385v1 chrUn_KI270386v1 chrUn_KI270387v1 chrUn_KI270388v1 chrUn_KI270389v1 chrUn_KI270390v1 chrUn_KI270391v1 chrUn_KI270392v1 chrUn_KI270393v1 chrUn_KI270394v1 chrUn_KI270395v1 chrUn_KI270396v1 chrUn_KI270411v1 chrUn_KI270412v1 chrUn_KI270414v1 chrUn_KI270417v1 chrUn_KI270418v1 chrUn_KI270419v1 chrUn_KI270420v1 chrUn_KI270422v1 chrUn_KI270423v1 chrUn_KI270424v1 chrUn_KI270425v1 chrUn_KI270429v1 
chrUn_KI270435v1 chrUn_KI270438v1 chrUn_KI270442v1 chrUn_KI270448v1 chrUn_KI270465v1 chrUn_KI270466v1 chrUn_KI270467v1 chrUn_KI270468v1 chrUn_KI270507v1 chrUn_KI270508v1 chrUn_KI270509v1 chrUn_KI270510v1 chrUn_KI270511v1 chrUn_KI270512v1 chrUn_KI270515v1 chrUn_KI270516v1 chrUn_KI270517v1 chrUn_KI270518v1 chrUn_KI270519v1 chrUn_KI270521v1 chrUn_KI270522v1 chrUn_KI270528v1 chrUn_KI270529v1 chrUn_KI270530v1 chrUn_KI270538v1 chrUn_KI270539v1 chrUn_KI270544v1 chrUn_KI270548v1 chrUn_KI270579v1 chrUn_KI270580v1 chrUn_KI270581v1 chrUn_KI270582v1 chrUn_KI270583v1 chrUn_KI270584v1 chrUn_KI270587v1 chrUn_KI270588v1 chrUn_KI270589v1 chrUn_KI270590v1 chrUn_KI270591v1 chrUn_KI270593v1 chrUn_KI270741v1 chrUn_KI270742v1 chrUn_KI270743v1 chrUn_KI270744v1 chrUn_KI270745v1 chrUn_KI270746v1 chrUn_KI270747v1 chrUn_KI270748v1 chrUn_KI270749v1 chrUn_KI270750v1 chrUn_KI270751v1 chrUn_KI270752v1 chrUn_KI270753v1 chrUn_KI270754v1 chrUn_KI270755v1 chrUn_KI270756v1 chrUn_KI270757v1 chrX_KI270880v1_alt chrX_KI270881v1_alt chrX_KI270913v1_alt chrY_KI270740v1_random chrEBV chrUn_KN707606v1_decoy chrUn_KN707607v1_decoy chrUn_KN707608v1_decoy chrUn_KN707609v1_decoy chrUn_KN707610v1_decoy chrUn_KN707611v1_decoy chrUn_KN707612v1_decoy chrUn_KN707613v1_decoy chrUn_KN707614v1_decoy chrUn_KN707615v1_decoy chrUn_KN707616v1_decoy chrUn_KN707617v1_decoy chrUn_KN707618v1_decoy chrUn_KN707619v1_decoy chrUn_KN707620v1_decoy chrUn_KN707621v1_decoy chrUn_KN707622v1_decoy chrUn_KN707623v1_decoy chrUn_KN707624v1_decoy chrUn_KN707625v1_decoy chrUn_KN707626v1_decoy chrUn_KN707627v1_decoy chrUn_KN707628v1_decoy chrUn_KN707629v1_decoy chrUn_KN707630v1_decoy chrUn_KN707631v1_decoy chrUn_KN707632v1_decoy chrUn_KN707633v1_decoy chrUn_KN707634v1_decoy chrUn_KN707635v1_decoy chrUn_KN707636v1_decoy chrUn_KN707637v1_decoy chrUn_KN707638v1_decoy chrUn_KN707639v1_decoy chrUn_KN707640v1_decoy chrUn_KN707641v1_decoy chrUn_KN707642v1_decoy chrUn_KN707643v1_decoy chrUn_KN707644v1_decoy chrUn_KN707645v1_decoy 
chrUn_KN707646v1_decoy chrUn_KN707647v1_decoy chrUn_KN707648v1_decoy chrUn_KN707649v1_decoy chrUn_KN707650v1_decoy chrUn_KN707651v1_decoy chrUn_KN707652v1_decoy chrUn_KN707653v1_decoy chrUn_KN707654v1_decoy chrUn_KN707655v1_decoy chrUn_KN707656v1_decoy chrUn_KN707657v1_decoy chrUn_KN707658v1_decoy chrUn_KN707659v1_decoy chrUn_KN707660v1_decoy chrUn_KN707661v1_decoy chrUn_KN707662v1_decoy chrUn_KN707663v1_decoy chrUn_KN707664v1_decoy chrUn_KN707665v1_decoy chrUn_KN707666v1_decoy chrUn_KN707667v1_decoy chrUn_KN707668v1_decoy chrUn_KN707669v1_decoy chrUn_KN707670v1_decoy chrUn_KN707671v1_decoy chrUn_KN707672v1_decoy chrUn_KN707673v1_decoy chrUn_KN707674v1_decoy chrUn_KN707675v1_decoy chrUn_KN707676v1_decoy chrUn_KN707677v1_decoy chrUn_KN707678v1_decoy chrUn_KN707679v1_decoy chrUn_KN707680v1_decoy chrUn_KN707681v1_decoy chrUn_KN707682v1_decoy chrUn_KN707683v1_decoy chrUn_KN707684v1_decoy chrUn_KN707685v1_decoy chrUn_KN707686v1_decoy chrUn_KN707687v1_decoy chrUn_KN707688v1_decoy chrUn_KN707689v1_decoy chrUn_KN707690v1_decoy chrUn_KN707691v1_decoy chrUn_KN707692v1_decoy chrUn_KN707693v1_decoy chrUn_KN707694v1_decoy chrUn_KN707695v1_decoy chrUn_KN707696v1_decoy chrUn_KN707697v1_decoy chrUn_KN707698v1_decoy chrUn_KN707699v1_decoy chrUn_KN707700v1_decoy chrUn_KN707701v1_decoy chrUn_KN707702v1_decoy chrUn_KN707703v1_decoy chrUn_KN707704v1_decoy chrUn_KN707705v1_decoy chrUn_KN707706v1_decoy chrUn_KN707707v1_decoy chrUn_KN707708v1_decoy chrUn_KN707709v1_decoy chrUn_KN707710v1_decoy chrUn_KN707711v1_decoy chrUn_KN707712v1_decoy chrUn_KN707713v1_decoy chrUn_KN707714v1_decoy chrUn_KN707715v1_decoy chrUn_KN707716v1_decoy chrUn_KN707717v1_decoy chrUn_KN707718v1_decoy chrUn_KN707719v1_decoy chrUn_KN707720v1_decoy chrUn_KN707721v1_decoy chrUn_KN707722v1_decoy chrUn_KN707723v1_decoy chrUn_KN707724v1_decoy chrUn_KN707725v1_decoy chrUn_KN707726v1_decoy chrUn_KN707727v1_decoy chrUn_KN707728v1_decoy chrUn_KN707729v1_decoy chrUn_KN707730v1_decoy chrUn_KN707731v1_decoy 
chrUn_KN707732v1_decoy chrUn_KN707733v1_decoy chrUn_KN707734v1_decoy chrUn_KN707735v1_decoy chrUn_KN707736v1_decoy chrUn_KN707737v1_decoy chrUn_KN707738v1_decoy chrUn_KN707739v1_decoy chrUn_KN707740v1_decoy chrUn_KN707741v1_decoy chrUn_KN707742v1_decoy chrUn_KN707743v1_decoy chrUn_KN707744v1_decoy chrUn_KN707745v1_decoy chrUn_KN707746v1_decoy chrUn_KN707747v1_decoy chrUn_KN707748v1_decoy chrUn_KN707749v1_decoy chrUn_KN707750v1_decoy chrUn_KN707751v1_decoy chrUn_KN707752v1_decoy chrUn_KN707753v1_decoy chrUn_KN707754v1_decoy chrUn_KN707755v1_decoy chrUn_KN707756v1_decoy chrUn_KN707757v1_decoy chrUn_KN707758v1_decoy chrUn_KN707759v1_decoy chrUn_KN707760v1_decoy chrUn_KN707761v1_decoy chrUn_KN707762v1_decoy chrUn_KN707763v1_decoy chrUn_KN707764v1_decoy chrUn_KN707765v1_decoy chrUn_KN707766v1_decoy chrUn_KN707767v1_decoy chrUn_KN707768v1_decoy chrUn_KN707769v1_decoy chrUn_KN707770v1_decoy chrUn_KN707771v1_decoy chrUn_KN707772v1_decoy chrUn_KN707773v1_decoy chrUn_KN707774v1_decoy chrUn_KN707775v1_decoy chrUn_KN707776v1_decoy chrUn_KN707777v1_decoy chrUn_KN707778v1_decoy chrUn_KN707779v1_decoy chrUn_KN707780v1_decoy chrUn_KN707781v1_decoy chrUn_KN707782v1_decoy chrUn_KN707783v1_decoy chrUn_KN707784v1_decoy chrUn_KN707785v1_decoy chrUn_KN707786v1_decoy chrUn_KN707787v1_decoy chrUn_KN707788v1_decoy chrUn_KN707789v1_decoy chrUn_KN707790v1_decoy chrUn_KN707791v1_decoy chrUn_KN707792v1_decoy chrUn_KN707793v1_decoy chrUn_KN707794v1_decoy chrUn_KN707795v1_decoy chrUn_KN707796v1_decoy chrUn_KN707797v1_decoy chrUn_KN707798v1_decoy chrUn_KN707799v1_decoy chrUn_KN707800v1_decoy chrUn_KN707801v1_decoy chrUn_KN707802v1_decoy chrUn_KN707803v1_decoy chrUn_KN707804v1_decoy chrUn_KN707805v1_decoy chrUn_KN707806v1_decoy chrUn_KN707807v1_decoy chrUn_KN707808v1_decoy chrUn_KN707809v1_decoy chrUn_KN707810v1_decoy chrUn_KN707811v1_decoy chrUn_KN707812v1_decoy chrUn_KN707813v1_decoy chrUn_KN707814v1_decoy chrUn_KN707815v1_decoy chrUn_KN707816v1_decoy chrUn_KN707817v1_decoy 
chrUn_KN707818v1_decoy chrUn_KN707819v1_decoy chrUn_KN707820v1_decoy chrUn_KN707821v1_decoy chrUn_KN707822v1_decoy chrUn_KN707823v1_decoy chrUn_KN707824v1_decoy chrUn_KN707825v1_decoy chrUn_KN707826v1_decoy chrUn_KN707827v1_decoy chrUn_KN707828v1_decoy chrUn_KN707829v1_decoy chrUn_KN707830v1_decoy chrUn_KN707831v1_decoy chrUn_KN707832v1_decoy chrUn_KN707833v1_decoy chrUn_KN707834v1_decoy chrUn_KN707835v1_decoy chrUn_KN707836v1_decoy chrUn_KN707837v1_decoy chrUn_KN707838v1_decoy chrUn_KN707839v1_decoy chrUn_KN707840v1_decoy chrUn_KN707841v1_decoy chrUn_KN707842v1_decoy chrUn_KN707843v1_decoy chrUn_KN707844v1_decoy chrUn_KN707845v1_decoy chrUn_KN707846v1_decoy chrUn_KN707847v1_decoy chrUn_KN707848v1_decoy chrUn_KN707849v1_decoy chrUn_KN707850v1_decoy chrUn_KN707851v1_decoy chrUn_KN707852v1_decoy chrUn_KN707853v1_decoy chrUn_KN707854v1_decoy chrUn_KN707855v1_decoy chrUn_KN707856v1_decoy chrUn_KN707857v1_decoy chrUn_KN707858v1_decoy chrUn_KN707859v1_decoy chrUn_KN707860v1_decoy chrUn_KN707861v1_decoy chrUn_KN707862v1_decoy chrUn_KN707863v1_decoy chrUn_KN707864v1_decoy chrUn_KN707865v1_decoy chrUn_KN707866v1_decoy chrUn_KN707867v1_decoy chrUn_KN707868v1_decoy chrUn_KN707869v1_decoy chrUn_KN707870v1_decoy chrUn_KN707871v1_decoy chrUn_KN707872v1_decoy chrUn_KN707873v1_decoy chrUn_KN707874v1_decoy chrUn_KN707875v1_decoy chrUn_KN707876v1_decoy chrUn_KN707877v1_decoy chrUn_KN707878v1_decoy chrUn_KN707879v1_decoy chrUn_KN707880v1_decoy chrUn_KN707881v1_decoy chrUn_KN707882v1_decoy chrUn_KN707883v1_decoy chrUn_KN707884v1_decoy chrUn_KN707885v1_decoy chrUn_KN707886v1_decoy chrUn_KN707887v1_decoy chrUn_KN707888v1_decoy chrUn_KN707889v1_decoy chrUn_KN707890v1_decoy chrUn_KN707891v1_decoy chrUn_KN707892v1_decoy chrUn_KN707893v1_decoy chrUn_KN707894v1_decoy chrUn_KN707895v1_decoy chrUn_KN707896v1_decoy chrUn_KN707897v1_decoy chrUn_KN707898v1_decoy chrUn_KN707899v1_decoy chrUn_KN707900v1_decoy chrUn_KN707901v1_decoy chrUn_KN707902v1_decoy chrUn_KN707903v1_decoy 
chrUn_KN707904v1_decoy chrUn_KN707905v1_decoy chrUn_KN707906v1_decoy chrUn_KN707907v1_decoy chrUn_KN707908v1_decoy chrUn_KN707909v1_decoy chrUn_KN707910v1_decoy chrUn_KN707911v1_decoy chrUn_KN707912v1_decoy chrUn_KN707913v1_decoy chrUn_KN707914v1_decoy chrUn_KN707915v1_decoy chrUn_KN707916v1_decoy chrUn_KN707917v1_decoy chrUn_KN707918v1_decoy chrUn_KN707919v1_decoy chrUn_KN707920v1_decoy chrUn_KN707921v1_decoy chrUn_KN707922v1_decoy chrUn_KN707923v1_decoy chrUn_KN707924v1_decoy chrUn_KN707925v1_decoy chrUn_KN707926v1_decoy chrUn_KN707927v1_decoy chrUn_KN707928v1_decoy chrUn_KN707929v1_decoy chrUn_KN707930v1_decoy chrUn_KN707931v1_decoy chrUn_KN707932v1_decoy chrUn_KN707933v1_decoy chrUn_KN707934v1_decoy chrUn_KN707935v1_decoy chrUn_KN707936v1_decoy chrUn_KN707937v1_decoy chrUn_KN707938v1_decoy chrUn_KN707939v1_decoy chrUn_KN707940v1_decoy chrUn_KN707941v1_decoy chrUn_KN707942v1_decoy chrUn_KN707943v1_decoy chrUn_KN707944v1_decoy chrUn_KN707945v1_decoy chrUn_KN707946v1_decoy chrUn_KN707947v1_decoy chrUn_KN707948v1_decoy chrUn_KN707949v1_decoy chrUn_KN707950v1_decoy chrUn_KN707951v1_decoy chrUn_KN707952v1_decoy chrUn_KN707953v1_decoy chrUn_KN707954v1_decoy chrUn_KN707955v1_decoy chrUn_KN707956v1_decoy chrUn_KN707957v1_decoy chrUn_KN707958v1_decoy chrUn_KN707959v1_decoy chrUn_KN707960v1_decoy chrUn_KN707961v1_decoy chrUn_KN707962v1_decoy chrUn_KN707963v1_decoy chrUn_KN707964v1_decoy chrUn_KN707965v1_decoy chrUn_KN707966v1_decoy chrUn_KN707967v1_decoy chrUn_KN707968v1_decoy chrUn_KN707969v1_decoy chrUn_KN707970v1_decoy chrUn_KN707971v1_decoy chrUn_KN707972v1_decoy chrUn_KN707973v1_decoy chrUn_KN707974v1_decoy chrUn_KN707975v1_decoy chrUn_KN707976v1_decoy chrUn_KN707977v1_decoy chrUn_KN707978v1_decoy chrUn_KN707979v1_decoy chrUn_KN707980v1_decoy chrUn_KN707981v1_decoy chrUn_KN707982v1_decoy chrUn_KN707983v1_decoy chrUn_KN707984v1_decoy chrUn_KN707985v1_decoy chrUn_KN707986v1_decoy chrUn_KN707987v1_decoy chrUn_KN707988v1_decoy chrUn_KN707989v1_decoy 
chrUn_KN707990v1_decoy chrUn_KN707991v1_decoy chrUn_KN707992v1_decoy chrUn_JTFH01000001v1_decoy chrUn_JTFH01000002v1_decoy chrUn_JTFH01000003v1_decoy chrUn_JTFH01000004v1_decoy chrUn_JTFH01000005v1_decoy chrUn_JTFH01000006v1_decoy chrUn_JTFH01000007v1_decoy chrUn_JTFH01000008v1_decoy chrUn_JTFH01000009v1_decoy chrUn_JTFH01000010v1_decoy chrUn_JTFH01000011v1_decoy chrUn_JTFH01000012v1_decoy chrUn_JTFH01000013v1_decoy chrUn_JTFH01000014v1_decoy chrUn_JTFH01000015v1_decoy chrUn_JTFH01000016v1_decoy chrUn_JTFH01000017v1_decoy chrUn_JTFH01000018v1_decoy chrUn_JTFH01000019v1_decoy chrUn_JTFH01000020v1_decoy chrUn_JTFH01000021v1_decoy chrUn_JTFH01000022v1_decoy chrUn_JTFH01000023v1_decoy chrUn_JTFH01000024v1_decoy chrUn_JTFH01000025v1_decoy chrUn_JTFH01000026v1_decoy chrUn_JTFH01000027v1_decoy chrUn_JTFH01000028v1_decoy chrUn_JTFH01000029v1_decoy chrUn_JTFH01000030v1_decoy chrUn_JTFH01000031v1_decoy chrUn_JTFH01000032v1_decoy chrUn_JTFH01000033v1_decoy chrUn_JTFH01000034v1_decoy chrUn_JTFH01000035v1_decoy chrUn_JTFH01000036v1_decoy chrUn_JTFH01000037v1_decoy chrUn_JTFH01000038v1_decoy chrUn_JTFH01000039v1_decoy chrUn_JTFH01000040v1_decoy chrUn_JTFH01000041v1_decoy chrUn_JTFH01000042v1_decoy chrUn_JTFH01000043v1_decoy chrUn_JTFH01000044v1_decoy chrUn_JTFH01000045v1_decoy chrUn_JTFH01000046v1_decoy chrUn_JTFH01000047v1_decoy chrUn_JTFH01000048v1_decoy chrUn_JTFH01000049v1_decoy chrUn_JTFH01000050v1_decoy chrUn_JTFH01000051v1_decoy chrUn_JTFH01000052v1_decoy chrUn_JTFH01000053v1_decoy chrUn_JTFH01000054v1_decoy chrUn_JTFH01000055v1_decoy chrUn_JTFH01000056v1_decoy chrUn_JTFH01000057v1_decoy chrUn_JTFH01000058v1_decoy chrUn_JTFH01000059v1_decoy chrUn_JTFH01000060v1_decoy chrUn_JTFH01000061v1_decoy chrUn_JTFH01000062v1_decoy chrUn_JTFH01000063v1_decoy chrUn_JTFH01000064v1_decoy chrUn_JTFH01000065v1_decoy chrUn_JTFH01000066v1_decoy chrUn_JTFH01000067v1_decoy chrUn_JTFH01000068v1_decoy chrUn_JTFH01000069v1_decoy chrUn_JTFH01000070v1_decoy chrUn_JTFH01000071v1_decoy 
chrUn_JTFH01000072v1_decoy chrUn_JTFH01000073v1_decoy chrUn_JTFH01000074v1_decoy chrUn_JTFH01000075v1_decoy chrUn_JTFH01000076v1_decoy chrUn_JTFH01000077v1_decoy chrUn_JTFH01000078v1_decoy chrUn_JTFH01000079v1_decoy chrUn_JTFH01000080v1_decoy chrUn_JTFH01000081v1_decoy chrUn_JTFH01000082v1_decoy chrUn_JTFH01000083v1_decoy chrUn_JTFH01000084v1_decoy chrUn_JTFH01000085v1_decoy chrUn_JTFH01000086v1_decoy chrUn_JTFH01000087v1_decoy chrUn_JTFH01000088v1_decoy chrUn_JTFH01000089v1_decoy chrUn_JTFH01000090v1_decoy chrUn_JTFH01000091v1_decoy chrUn_JTFH01000092v1_decoy chrUn_JTFH01000093v1_decoy chrUn_JTFH01000094v1_decoy chrUn_JTFH01000095v1_decoy chrUn_JTFH01000096v1_decoy chrUn_JTFH01000097v1_decoy chrUn_JTFH01000098v1_decoy chrUn_JTFH01000099v1_decoy chrUn_JTFH01000100v1_decoy chrUn_JTFH01000101v1_decoy chrUn_JTFH01000102v1_decoy chrUn_JTFH01000103v1_decoy chrUn_JTFH01000104v1_decoy chrUn_JTFH01000105v1_decoy chrUn_JTFH01000106v1_decoy chrUn_JTFH01000107v1_decoy chrUn_JTFH01000108v1_decoy chrUn_JTFH01000109v1_decoy chrUn_JTFH01000110v1_decoy chrUn_JTFH01000111v1_decoy chrUn_JTFH01000112v1_decoy chrUn_JTFH01000113v1_decoy chrUn_JTFH01000114v1_decoy chrUn_JTFH01000115v1_decoy chrUn_JTFH01000116v1_decoy chrUn_JTFH01000117v1_decoy chrUn_JTFH01000118v1_decoy chrUn_JTFH01000119v1_decoy chrUn_JTFH01000120v1_decoy chrUn_JTFH01000121v1_decoy chrUn_JTFH01000122v1_decoy chrUn_JTFH01000123v1_decoy chrUn_JTFH01000124v1_decoy chrUn_JTFH01000125v1_decoy chrUn_JTFH01000126v1_decoy chrUn_JTFH01000127v1_decoy chrUn_JTFH01000128v1_decoy chrUn_JTFH01000129v1_decoy chrUn_JTFH01000130v1_decoy chrUn_JTFH01000131v1_decoy chrUn_JTFH01000132v1_decoy chrUn_JTFH01000133v1_decoy chrUn_JTFH01000134v1_decoy chrUn_JTFH01000135v1_decoy chrUn_JTFH01000136v1_decoy chrUn_JTFH01000137v1_decoy chrUn_JTFH01000138v1_decoy chrUn_JTFH01000139v1_decoy chrUn_JTFH01000140v1_decoy chrUn_JTFH01000141v1_decoy chrUn_JTFH01000142v1_decoy chrUn_JTFH01000143v1_decoy chrUn_JTFH01000144v1_decoy chrUn_JTFH01000145v1_decoy 
chrUn_JTFH01000146v1_decoy chrUn_JTFH01000147v1_decoy chrUn_JTFH01000148v1_decoy chrUn_JTFH01000149v1_decoy chrUn_JTFH01000150v1_decoy chrUn_JTFH01000151v1_decoy chrUn_JTFH01000152v1_decoy chrUn_JTFH01000153v1_decoy chrUn_JTFH01000154v1_decoy chrUn_JTFH01000155v1_decoy chrUn_JTFH01000156v1_decoy chrUn_JTFH01000157v1_decoy chrUn_JTFH01000158v1_decoy chrUn_JTFH01000159v1_decoy chrUn_JTFH01000160v1_decoy chrUn_JTFH01000161v1_decoy chrUn_JTFH01000162v1_decoy chrUn_JTFH01000163v1_decoy chrUn_JTFH01000164v1_decoy chrUn_JTFH01000165v1_decoy chrUn_JTFH01000166v1_decoy chrUn_JTFH01000167v1_decoy chrUn_JTFH01000168v1_decoy chrUn_JTFH01000169v1_decoy chrUn_JTFH01000170v1_decoy chrUn_JTFH01000171v1_decoy chrUn_JTFH01000172v1_decoy chrUn_JTFH01000173v1_decoy chrUn_JTFH01000174v1_decoy chrUn_JTFH01000175v1_decoy chrUn_JTFH01000176v1_decoy chrUn_JTFH01000177v1_decoy chrUn_JTFH01000178v1_decoy chrUn_JTFH01000179v1_decoy chrUn_JTFH01000180v1_decoy chrUn_JTFH01000181v1_decoy chrUn_JTFH01000182v1_decoy chrUn_JTFH01000183v1_decoy chrUn_JTFH01000184v1_decoy chrUn_JTFH01000185v1_decoy chrUn_JTFH01000186v1_decoy chrUn_JTFH01000187v1_decoy chrUn_JTFH01000188v1_decoy chrUn_JTFH01000189v1_decoy chrUn_JTFH01000190v1_decoy chrUn_JTFH01000191v1_decoy chrUn_JTFH01000192v1_decoy chrUn_JTFH01000193v1_decoy chrUn_JTFH01000194v1_decoy chrUn_JTFH01000195v1_decoy chrUn_JTFH01000196v1_decoy chrUn_JTFH01000197v1_decoy chrUn_JTFH01000198v1_decoy chrUn_JTFH01000199v1_decoy chrUn_JTFH01000200v1_decoy chrUn_JTFH01000201v1_decoy chrUn_JTFH01000202v1_decoy chrUn_JTFH01000203v1_decoy chrUn_JTFH01000204v1_decoy chrUn_JTFH01000205v1_decoy chrUn_JTFH01000206v1_decoy chrUn_JTFH01000207v1_decoy chrUn_JTFH01000208v1_decoy chrUn_JTFH01000209v1_decoy chrUn_JTFH01000210v1_decoy chrUn_JTFH01000211v1_decoy chrUn_JTFH01000212v1_decoy chrUn_JTFH01000213v1_decoy chrUn_JTFH01000214v1_decoy chrUn_JTFH01000215v1_decoy chrUn_JTFH01000216v1_decoy chrUn_JTFH01000217v1_decoy chrUn_JTFH01000218v1_decoy chrUn_JTFH01000219v1_decoy 
chrUn_JTFH01000220v1_decoy chrUn_JTFH01000221v1_decoy chrUn_JTFH01000222v1_decoy chrUn_JTFH01000223v1_decoy chrUn_JTFH01000224v1_decoy chrUn_JTFH01000225v1_decoy chrUn_JTFH01000226v1_decoy chrUn_JTFH01000227v1_decoy chrUn_JTFH01000228v1_decoy chrUn_JTFH01000229v1_decoy chrUn_JTFH01000230v1_decoy chrUn_JTFH01000231v1_decoy chrUn_JTFH01000232v1_decoy chrUn_JTFH01000233v1_decoy chrUn_JTFH01000234v1_decoy chrUn_JTFH01000235v1_decoy chrUn_JTFH01000236v1_decoy chrUn_JTFH01000237v1_decoy chrUn_JTFH01000238v1_decoy chrUn_JTFH01000239v1_decoy chrUn_JTFH01000240v1_decoy chrUn_JTFH01000241v1_decoy chrUn_JTFH01000242v1_decoy chrUn_JTFH01000243v1_decoy chrUn_JTFH01000244v1_decoy chrUn_JTFH01000245v1_decoy chrUn_JTFH01000246v1_decoy chrUn_JTFH01000247v1_decoy chrUn_JTFH01000248v1_decoy chrUn_JTFH01000249v1_decoy chrUn_JTFH01000250v1_decoy chrUn_JTFH01000251v1_decoy chrUn_JTFH01000252v1_decoy chrUn_JTFH01000253v1_decoy chrUn_JTFH01000254v1_decoy chrUn_JTFH01000255v1_decoy chrUn_JTFH01000256v1_decoy chrUn_JTFH01000257v1_decoy chrUn_JTFH01000258v1_decoy chrUn_JTFH01000259v1_decoy chrUn_JTFH01000260v1_decoy chrUn_JTFH01000261v1_decoy chrUn_JTFH01000262v1_decoy chrUn_JTFH01000263v1_decoy chrUn_JTFH01000264v1_decoy chrUn_JTFH01000265v1_decoy chrUn_JTFH01000266v1_decoy chrUn_JTFH01000267v1_decoy chrUn_JTFH01000268v1_decoy chrUn_JTFH01000269v1_decoy chrUn_JTFH01000270v1_decoy chrUn_JTFH01000271v1_decoy chrUn_JTFH01000272v1_decoy chrUn_JTFH01000273v1_decoy chrUn_JTFH01000274v1_decoy chrUn_JTFH01000275v1_decoy chrUn_JTFH01000276v1_decoy chrUn_JTFH01000277v1_decoy chrUn_JTFH01000278v1_decoy chrUn_JTFH01000279v1_decoy chrUn_JTFH01000280v1_decoy chrUn_JTFH01000281v1_decoy chrUn_JTFH01000282v1_decoy chrUn_JTFH01000283v1_decoy chrUn_JTFH01000284v1_decoy chrUn_JTFH01000285v1_decoy chrUn_JTFH01000286v1_decoy chrUn_JTFH01000287v1_decoy chrUn_JTFH01000288v1_decoy chrUn_JTFH01000289v1_decoy chrUn_JTFH01000290v1_decoy chrUn_JTFH01000291v1_decoy chrUn_JTFH01000292v1_decoy chrUn_JTFH01000293v1_decoy 
chrUn_JTFH01000294v1_decoy chrUn_JTFH01000295v1_decoy chrUn_JTFH01000296v1_decoy chrUn_JTFH01000297v1_decoy chrUn_JTFH01000298v1_decoy chrUn_JTFH01000299v1_decoy chrUn_JTFH01000300v1_decoy chrUn_JTFH01000301v1_decoy chrUn_JTFH01000302v1_decoy chrUn_JTFH01000303v1_decoy chrUn_JTFH01000304v1_decoy chrUn_JTFH01000305v1_decoy chrUn_JTFH01000306v1_decoy chrUn_JTFH01000307v1_decoy chrUn_JTFH01000308v1_decoy chrUn_JTFH01000309v1_decoy chrUn_JTFH01000310v1_decoy chrUn_JTFH01000311v1_decoy chrUn_JTFH01000312v1_decoy chrUn_JTFH01000313v1_decoy chrUn_JTFH01000314v1_decoy chrUn_JTFH01000315v1_decoy chrUn_JTFH01000316v1_decoy chrUn_JTFH01000317v1_decoy chrUn_JTFH01000318v1_decoy chrUn_JTFH01000319v1_decoy chrUn_JTFH01000320v1_decoy chrUn_JTFH01000321v1_decoy chrUn_JTFH01000322v1_decoy chrUn_JTFH01000323v1_decoy chrUn_JTFH01000324v1_decoy chrUn_JTFH01000325v1_decoy chrUn_JTFH01000326v1_decoy chrUn_JTFH01000327v1_decoy chrUn_JTFH01000328v1_decoy chrUn_JTFH01000329v1_decoy chrUn_JTFH01000330v1_decoy chrUn_JTFH01000331v1_decoy chrUn_JTFH01000332v1_decoy chrUn_JTFH01000333v1_decoy chrUn_JTFH01000334v1_decoy chrUn_JTFH01000335v1_decoy chrUn_JTFH01000336v1_decoy chrUn_JTFH01000337v1_decoy chrUn_JTFH01000338v1_decoy chrUn_JTFH01000339v1_decoy chrUn_JTFH01000340v1_decoy chrUn_JTFH01000341v1_decoy chrUn_JTFH01000342v1_decoy chrUn_JTFH01000343v1_decoy chrUn_JTFH01000344v1_decoy chrUn_JTFH01000345v1_decoy chrUn_JTFH01000346v1_decoy chrUn_JTFH01000347v1_decoy chrUn_JTFH01000348v1_decoy chrUn_JTFH01000349v1_decoy chrUn_JTFH01000350v1_decoy chrUn_JTFH01000351v1_decoy chrUn_JTFH01000352v1_decoy chrUn_JTFH01000353v1_decoy chrUn_JTFH01000354v1_decoy chrUn_JTFH01000355v1_decoy chrUn_JTFH01000356v1_decoy chrUn_JTFH01000357v1_decoy chrUn_JTFH01000358v1_decoy chrUn_JTFH01000359v1_decoy chrUn_JTFH01000360v1_decoy chrUn_JTFH01000361v1_decoy chrUn_JTFH01000362v1_decoy chrUn_JTFH01000363v1_decoy chrUn_JTFH01000364v1_decoy chrUn_JTFH01000365v1_decoy chrUn_JTFH01000366v1_decoy chrUn_JTFH01000367v1_decoy 
chrUn_JTFH01000368v1_decoy chrUn_JTFH01000369v1_decoy chrUn_JTFH01000370v1_decoy chrUn_JTFH01000371v1_decoy chrUn_JTFH01000372v1_decoy chrUn_JTFH01000373v1_decoy chrUn_JTFH01000374v1_decoy chrUn_JTFH01000375v1_decoy chrUn_JTFH01000376v1_decoy chrUn_JTFH01000377v1_decoy chrUn_JTFH01000378v1_decoy chrUn_JTFH01000379v1_decoy chrUn_JTFH01000380v1_decoy chrUn_JTFH01000381v1_decoy chrUn_JTFH01000382v1_decoy chrUn_JTFH01000383v1_decoy chrUn_JTFH01000384v1_decoy chrUn_JTFH01000385v1_decoy chrUn_JTFH01000386v1_decoy chrUn_JTFH01000387v1_decoy chrUn_JTFH01000388v1_decoy chrUn_JTFH01000389v1_decoy chrUn_JTFH01000390v1_decoy chrUn_JTFH01000391v1_decoy chrUn_JTFH01000392v1_decoy chrUn_JTFH01000393v1_decoy chrUn_JTFH01000394v1_decoy chrUn_JTFH01000395v1_decoy chrUn_JTFH01000396v1_decoy chrUn_JTFH01000397v1_decoy chrUn_JTFH01000398v1_decoy chrUn_JTFH01000399v1_decoy chrUn_JTFH01000400v1_decoy chrUn_JTFH01000401v1_decoy chrUn_JTFH01000402v1_decoy chrUn_JTFH01000403v1_decoy chrUn_JTFH01000404v1_decoy chrUn_JTFH01000405v1_decoy chrUn_JTFH01000406v1_decoy chrUn_JTFH01000407v1_decoy chrUn_JTFH01000408v1_decoy chrUn_JTFH01000409v1_decoy chrUn_JTFH01000410v1_decoy chrUn_JTFH01000411v1_decoy chrUn_JTFH01000412v1_decoy chrUn_JTFH01000413v1_decoy chrUn_JTFH01000414v1_decoy chrUn_JTFH01000415v1_decoy chrUn_JTFH01000416v1_decoy chrUn_JTFH01000417v1_decoy chrUn_JTFH01000418v1_decoy chrUn_JTFH01000419v1_decoy chrUn_JTFH01000420v1_decoy chrUn_JTFH01000421v1_decoy chrUn_JTFH01000422v1_decoy chrUn_JTFH01000423v1_decoy chrUn_JTFH01000424v1_decoy chrUn_JTFH01000425v1_decoy chrUn_JTFH01000426v1_decoy chrUn_JTFH01000427v1_decoy chrUn_JTFH01000428v1_decoy chrUn_JTFH01000429v1_decoy chrUn_JTFH01000430v1_decoy chrUn_JTFH01000431v1_decoy chrUn_JTFH01000432v1_decoy chrUn_JTFH01000433v1_decoy chrUn_JTFH01000434v1_decoy chrUn_JTFH01000435v1_decoy chrUn_JTFH01000436v1_decoy chrUn_JTFH01000437v1_decoy chrUn_JTFH01000438v1_decoy chrUn_JTFH01000439v1_decoy chrUn_JTFH01000440v1_decoy chrUn_JTFH01000441v1_decoy 
chrUn_JTFH01000442v1_decoy chrUn_JTFH01000443v1_decoy chrUn_JTFH01000444v1_decoy chrUn_JTFH01000445v1_decoy chrUn_JTFH01000446v1_decoy chrUn_JTFH01000447v1_decoy chrUn_JTFH01000448v1_decoy chrUn_JTFH01000449v1_decoy chrUn_JTFH01000450v1_decoy chrUn_JTFH01000451v1_decoy chrUn_JTFH01000452v1_decoy chrUn_JTFH01000453v1_decoy chrUn_JTFH01000454v1_decoy chrUn_JTFH01000455v1_decoy chrUn_JTFH01000456v1_decoy chrUn_JTFH01000457v1_decoy chrUn_JTFH01000458v1_decoy chrUn_JTFH01000459v1_decoy chrUn_JTFH01000460v1_decoy chrUn_JTFH01000461v1_decoy chrUn_JTFH01000462v1_decoy chrUn_JTFH01000463v1_decoy chrUn_JTFH01000464v1_decoy chrUn_JTFH01000465v1_decoy chrUn_JTFH01000466v1_decoy chrUn_JTFH01000467v1_decoy chrUn_JTFH01000468v1_decoy chrUn_JTFH01000469v1_decoy chrUn_JTFH01000470v1_decoy chrUn_JTFH01000471v1_decoy chrUn_JTFH01000472v1_decoy chrUn_JTFH01000473v1_decoy chrUn_JTFH01000474v1_decoy chrUn_JTFH01000475v1_decoy chrUn_JTFH01000476v1_decoy chrUn_JTFH01000477v1_decoy chrUn_JTFH01000478v1_decoy chrUn_JTFH01000479v1_decoy chrUn_JTFH01000480v1_decoy chrUn_JTFH01000481v1_decoy chrUn_JTFH01000482v1_decoy chrUn_JTFH01000483v1_decoy chrUn_JTFH01000484v1_decoy chrUn_JTFH01000485v1_decoy chrUn_JTFH01000486v1_decoy chrUn_JTFH01000487v1_decoy chrUn_JTFH01000488v1_decoy chrUn_JTFH01000489v1_decoy chrUn_JTFH01000490v1_decoy chrUn_JTFH01000491v1_decoy chrUn_JTFH01000492v1_decoy chrUn_JTFH01000493v1_decoy chrUn_JTFH01000494v1_decoy chrUn_JTFH01000495v1_decoy chrUn_JTFH01000496v1_decoy chrUn_JTFH01000497v1_decoy chrUn_JTFH01000498v1_decoy chrUn_JTFH01000499v1_decoy chrUn_JTFH01000500v1_decoy chrUn_JTFH01000501v1_decoy chrUn_JTFH01000502v1_decoy chrUn_JTFH01000503v1_decoy chrUn_JTFH01000504v1_decoy chrUn_JTFH01000505v1_decoy chrUn_JTFH01000506v1_decoy chrUn_JTFH01000507v1_decoy chrUn_JTFH01000508v1_decoy chrUn_JTFH01000509v1_decoy chrUn_JTFH01000510v1_decoy chrUn_JTFH01000511v1_decoy chrUn_JTFH01000512v1_decoy chrUn_JTFH01000513v1_decoy chrUn_JTFH01000514v1_decoy chrUn_JTFH01000515v1_decoy 
chrUn_JTFH01000516v1_decoy chrUn_JTFH01000517v1_decoy chrUn_JTFH01000518v1_decoy chrUn_JTFH01000519v1_decoy chrUn_JTFH01000520v1_decoy chrUn_JTFH01000521v1_decoy chrUn_JTFH01000522v1_decoy chrUn_JTFH01000523v1_decoy chrUn_JTFH01000524v1_decoy chrUn_JTFH01000525v1_decoy chrUn_JTFH01000526v1_decoy chrUn_JTFH01000527v1_decoy chrUn_JTFH01000528v1_decoy chrUn_JTFH01000529v1_decoy chrUn_JTFH01000530v1_decoy chrUn_JTFH01000531v1_decoy chrUn_JTFH01000532v1_decoy chrUn_JTFH01000533v1_decoy chrUn_JTFH01000534v1_decoy chrUn_JTFH01000535v1_decoy chrUn_JTFH01000536v1_decoy chrUn_JTFH01000537v1_decoy chrUn_JTFH01000538v1_decoy chrUn_JTFH01000539v1_decoy chrUn_JTFH01000540v1_decoy chrUn_JTFH01000541v1_decoy chrUn_JTFH01000542v1_decoy chrUn_JTFH01000543v1_decoy chrUn_JTFH01000544v1_decoy chrUn_JTFH01000545v1_decoy chrUn_JTFH01000546v1_decoy chrUn_JTFH01000547v1_decoy chrUn_JTFH01000548v1_decoy chrUn_JTFH01000549v1_decoy chrUn_JTFH01000550v1_decoy chrUn_JTFH01000551v1_decoy chrUn_JTFH01000552v1_decoy chrUn_JTFH01000553v1_decoy chrUn_JTFH01000554v1_decoy chrUn_JTFH01000555v1_decoy chrUn_JTFH01000556v1_decoy chrUn_JTFH01000557v1_decoy chrUn_JTFH01000558v1_decoy chrUn_JTFH01000559v1_decoy chrUn_JTFH01000560v1_decoy chrUn_JTFH01000561v1_decoy chrUn_JTFH01000562v1_decoy chrUn_JTFH01000563v1_decoy chrUn_JTFH01000564v1_decoy chrUn_JTFH01000565v1_decoy chrUn_JTFH01000566v1_decoy chrUn_JTFH01000567v1_decoy chrUn_JTFH01000568v1_decoy chrUn_JTFH01000569v1_decoy chrUn_JTFH01000570v1_decoy chrUn_JTFH01000571v1_decoy chrUn_JTFH01000572v1_decoy chrUn_JTFH01000573v1_decoy chrUn_JTFH01000574v1_decoy chrUn_JTFH01000575v1_decoy chrUn_JTFH01000576v1_decoy chrUn_JTFH01000577v1_decoy chrUn_JTFH01000578v1_decoy chrUn_JTFH01000579v1_decoy chrUn_JTFH01000580v1_decoy chrUn_JTFH01000581v1_decoy chrUn_JTFH01000582v1_decoy chrUn_JTFH01000583v1_decoy chrUn_JTFH01000584v1_decoy chrUn_JTFH01000585v1_decoy chrUn_JTFH01000586v1_decoy chrUn_JTFH01000587v1_decoy chrUn_JTFH01000588v1_decoy chrUn_JTFH01000589v1_decoy 
chrUn_JTFH01000590v1_decoy chrUn_JTFH01000591v1_decoy chrUn_JTFH01000592v1_decoy chrUn_JTFH01000593v1_decoy chrUn_JTFH01000594v1_decoy chrUn_JTFH01000595v1_decoy chrUn_JTFH01000596v1_decoy chrUn_JTFH01000597v1_decoy chrUn_JTFH01000598v1_decoy chrUn_JTFH01000599v1_decoy chrUn_JTFH01000600v1_decoy chrUn_JTFH01000601v1_decoy chrUn_JTFH01000602v1_decoy chrUn_JTFH01000603v1_decoy chrUn_JTFH01000604v1_decoy chrUn_JTFH01000605v1_decoy chrUn_JTFH01000606v1_decoy chrUn_JTFH01000607v1_decoy chrUn_JTFH01000608v1_decoy chrUn_JTFH01000609v1_decoy chrUn_JTFH01000610v1_decoy chrUn_JTFH01000611v1_decoy chrUn_JTFH01000612v1_decoy chrUn_JTFH01000613v1_decoy chrUn_JTFH01000614v1_decoy chrUn_JTFH01000615v1_decoy chrUn_JTFH01000616v1_decoy chrUn_JTFH01000617v1_decoy chrUn_JTFH01000618v1_decoy chrUn_JTFH01000619v1_decoy chrUn_JTFH01000620v1_decoy chrUn_JTFH01000621v1_decoy chrUn_JTFH01000622v1_decoy chrUn_JTFH01000623v1_decoy chrUn_JTFH01000624v1_decoy chrUn_JTFH01000625v1_decoy chrUn_JTFH01000626v1_decoy chrUn_JTFH01000627v1_decoy chrUn_JTFH01000628v1_decoy chrUn_JTFH01000629v1_decoy chrUn_JTFH01000630v1_decoy chrUn_JTFH01000631v1_decoy chrUn_JTFH01000632v1_decoy chrUn_JTFH01000633v1_decoy chrUn_JTFH01000634v1_decoy chrUn_JTFH01000635v1_decoy chrUn_JTFH01000636v1_decoy chrUn_JTFH01000637v1_decoy chrUn_JTFH01000638v1_decoy chrUn_JTFH01000639v1_decoy chrUn_JTFH01000640v1_decoy chrUn_JTFH01000641v1_decoy chrUn_JTFH01000642v1_decoy chrUn_JTFH01000643v1_decoy chrUn_JTFH01000644v1_decoy chrUn_JTFH01000645v1_decoy chrUn_JTFH01000646v1_decoy chrUn_JTFH01000647v1_decoy chrUn_JTFH01000648v1_decoy chrUn_JTFH01000649v1_decoy chrUn_JTFH01000650v1_decoy chrUn_JTFH01000651v1_decoy chrUn_JTFH01000652v1_decoy chrUn_JTFH01000653v1_decoy chrUn_JTFH01000654v1_decoy chrUn_JTFH01000655v1_decoy chrUn_JTFH01000656v1_decoy chrUn_JTFH01000657v1_decoy chrUn_JTFH01000658v1_decoy chrUn_JTFH01000659v1_decoy chrUn_JTFH01000660v1_decoy chrUn_JTFH01000661v1_decoy chrUn_JTFH01000662v1_decoy chrUn_JTFH01000663v1_decoy 
chrUn_JTFH01000664v1_decoy chrUn_JTFH01000665v1_decoy chrUn_JTFH01000666v1_decoy chrUn_JTFH01000667v1_decoy chrUn_JTFH01000668v1_decoy chrUn_JTFH01000669v1_decoy chrUn_JTFH01000670v1_decoy chrUn_JTFH01000671v1_decoy chrUn_JTFH01000672v1_decoy chrUn_JTFH01000673v1_decoy chrUn_JTFH01000674v1_decoy chrUn_JTFH01000675v1_decoy chrUn_JTFH01000676v1_decoy chrUn_JTFH01000677v1_decoy chrUn_JTFH01000678v1_decoy chrUn_JTFH01000679v1_decoy chrUn_JTFH01000680v1_decoy chrUn_JTFH01000681v1_decoy chrUn_JTFH01000682v1_decoy chrUn_JTFH01000683v1_decoy chrUn_JTFH01000684v1_decoy chrUn_JTFH01000685v1_decoy chrUn_JTFH01000686v1_decoy chrUn_JTFH01000687v1_decoy chrUn_JTFH01000688v1_decoy chrUn_JTFH01000689v1_decoy chrUn_JTFH01000690v1_decoy chrUn_JTFH01000691v1_decoy chrUn_JTFH01000692v1_decoy chrUn_JTFH01000693v1_decoy chrUn_JTFH01000694v1_decoy chrUn_JTFH01000695v1_decoy chrUn_JTFH01000696v1_decoy chrUn_JTFH01000697v1_decoy chrUn_JTFH01000698v1_decoy chrUn_JTFH01000699v1_decoy chrUn_JTFH01000700v1_decoy chrUn_JTFH01000701v1_decoy chrUn_JTFH01000702v1_decoy chrUn_JTFH01000703v1_decoy chrUn_JTFH01000704v1_decoy chrUn_JTFH01000705v1_decoy chrUn_JTFH01000706v1_decoy chrUn_JTFH01000707v1_decoy chrUn_JTFH01000708v1_decoy chrUn_JTFH01000709v1_decoy chrUn_JTFH01000710v1_decoy chrUn_JTFH01000711v1_decoy chrUn_JTFH01000712v1_decoy chrUn_JTFH01000713v1_decoy chrUn_JTFH01000714v1_decoy chrUn_JTFH01000715v1_decoy chrUn_JTFH01000716v1_decoy chrUn_JTFH01000717v1_decoy chrUn_JTFH01000718v1_decoy chrUn_JTFH01000719v1_decoy chrUn_JTFH01000720v1_decoy chrUn_JTFH01000721v1_decoy chrUn_JTFH01000722v1_decoy chrUn_JTFH01000723v1_decoy chrUn_JTFH01000724v1_decoy chrUn_JTFH01000725v1_decoy chrUn_JTFH01000726v1_decoy chrUn_JTFH01000727v1_decoy chrUn_JTFH01000728v1_decoy chrUn_JTFH01000729v1_decoy chrUn_JTFH01000730v1_decoy chrUn_JTFH01000731v1_decoy chrUn_JTFH01000732v1_decoy chrUn_JTFH01000733v1_decoy chrUn_JTFH01000734v1_decoy chrUn_JTFH01000735v1_decoy chrUn_JTFH01000736v1_decoy chrUn_JTFH01000737v1_decoy 
chrUn_JTFH01000738v1_decoy chrUn_JTFH01000739v1_decoy chrUn_JTFH01000740v1_decoy chrUn_JTFH01000741v1_decoy chrUn_JTFH01000742v1_decoy chrUn_JTFH01000743v1_decoy chrUn_JTFH01000744v1_decoy chrUn_JTFH01000745v1_decoy chrUn_JTFH01000746v1_decoy chrUn_JTFH01000747v1_decoy chrUn_JTFH01000748v1_decoy chrUn_JTFH01000749v1_decoy chrUn_JTFH01000750v1_decoy chrUn_JTFH01000751v1_decoy chrUn_JTFH01000752v1_decoy chrUn_JTFH01000753v1_decoy chrUn_JTFH01000754v1_decoy chrUn_JTFH01000755v1_decoy chrUn_JTFH01000756v1_decoy chrUn_JTFH01000757v1_decoy chrUn_JTFH01000758v1_decoy chrUn_JTFH01000759v1_decoy chrUn_JTFH01000760v1_decoy chrUn_JTFH01000761v1_decoy chrUn_JTFH01000762v1_decoy chrUn_JTFH01000763v1_decoy chrUn_JTFH01000764v1_decoy chrUn_JTFH01000765v1_decoy chrUn_JTFH01000766v1_decoy chrUn_JTFH01000767v1_decoy chrUn_JTFH01000768v1_decoy chrUn_JTFH01000769v1_decoy chrUn_JTFH01000770v1_decoy chrUn_JTFH01000771v1_decoy chrUn_JTFH01000772v1_decoy chrUn_JTFH01000773v1_decoy chrUn_JTFH01000774v1_decoy chrUn_JTFH01000775v1_decoy chrUn_JTFH01000776v1_decoy chrUn_JTFH01000777v1_decoy chrUn_JTFH01000778v1_decoy chrUn_JTFH01000779v1_decoy chrUn_JTFH01000780v1_decoy chrUn_JTFH01000781v1_decoy chrUn_JTFH01000782v1_decoy chrUn_JTFH01000783v1_decoy chrUn_JTFH01000784v1_decoy chrUn_JTFH01000785v1_decoy chrUn_JTFH01000786v1_decoy chrUn_JTFH01000787v1_decoy chrUn_JTFH01000788v1_decoy chrUn_JTFH01000789v1_decoy chrUn_JTFH01000790v1_decoy chrUn_JTFH01000791v1_decoy chrUn_JTFH01000792v1_decoy chrUn_JTFH01000793v1_decoy chrUn_JTFH01000794v1_decoy chrUn_JTFH01000795v1_decoy chrUn_JTFH01000796v1_decoy chrUn_JTFH01000797v1_decoy chrUn_JTFH01000798v1_decoy chrUn_JTFH01000799v1_decoy chrUn_JTFH01000800v1_decoy chrUn_JTFH01000801v1_decoy chrUn_JTFH01000802v1_decoy chrUn_JTFH01000803v1_decoy chrUn_JTFH01000804v1_decoy chrUn_JTFH01000805v1_decoy chrUn_JTFH01000806v1_decoy chrUn_JTFH01000807v1_decoy chrUn_JTFH01000808v1_decoy chrUn_JTFH01000809v1_decoy chrUn_JTFH01000810v1_decoy chrUn_JTFH01000811v1_decoy 
chrUn_JTFH01000812v1_decoy chrUn_JTFH01000813v1_decoy chrUn_JTFH01000814v1_decoy chrUn_JTFH01000815v1_decoy chrUn_JTFH01000816v1_decoy chrUn_JTFH01000817v1_decoy chrUn_JTFH01000818v1_decoy chrUn_JTFH01000819v1_decoy chrUn_JTFH01000820v1_decoy chrUn_JTFH01000821v1_decoy chrUn_JTFH01000822v1_decoy chrUn_JTFH01000823v1_decoy chrUn_JTFH01000824v1_decoy chrUn_JTFH01000825v1_decoy chrUn_JTFH01000826v1_decoy chrUn_JTFH01000827v1_decoy chrUn_JTFH01000828v1_decoy chrUn_JTFH01000829v1_decoy chrUn_JTFH01000830v1_decoy chrUn_JTFH01000831v1_decoy chrUn_JTFH01000832v1_decoy chrUn_JTFH01000833v1_decoy chrUn_JTFH01000834v1_decoy chrUn_JTFH01000835v1_decoy chrUn_JTFH01000836v1_decoy chrUn_JTFH01000837v1_decoy chrUn_JTFH01000838v1_decoy chrUn_JTFH01000839v1_decoy chrUn_JTFH01000840v1_decoy chrUn_JTFH01000841v1_decoy chrUn_JTFH01000842v1_decoy chrUn_JTFH01000843v1_decoy chrUn_JTFH01000844v1_decoy chrUn_JTFH01000845v1_decoy chrUn_JTFH01000846v1_decoy chrUn_JTFH01000847v1_decoy chrUn_JTFH01000848v1_decoy chrUn_JTFH01000849v1_decoy chrUn_JTFH01000850v1_decoy chrUn_JTFH01000851v1_decoy chrUn_JTFH01000852v1_decoy chrUn_JTFH01000853v1_decoy chrUn_JTFH01000854v1_decoy chrUn_JTFH01000855v1_decoy chrUn_JTFH01000856v1_decoy chrUn_JTFH01000857v1_decoy chrUn_JTFH01000858v1_decoy chrUn_JTFH01000859v1_decoy chrUn_JTFH01000860v1_decoy chrUn_JTFH01000861v1_decoy chrUn_JTFH01000862v1_decoy chrUn_JTFH01000863v1_decoy chrUn_JTFH01000864v1_decoy chrUn_JTFH01000865v1_decoy chrUn_JTFH01000866v1_decoy chrUn_JTFH01000867v1_decoy chrUn_JTFH01000868v1_decoy chrUn_JTFH01000869v1_decoy chrUn_JTFH01000870v1_decoy chrUn_JTFH01000871v1_decoy chrUn_JTFH01000872v1_decoy chrUn_JTFH01000873v1_decoy chrUn_JTFH01000874v1_decoy chrUn_JTFH01000875v1_decoy chrUn_JTFH01000876v1_decoy chrUn_JTFH01000877v1_decoy chrUn_JTFH01000878v1_decoy chrUn_JTFH01000879v1_decoy chrUn_JTFH01000880v1_decoy chrUn_JTFH01000881v1_decoy chrUn_JTFH01000882v1_decoy chrUn_JTFH01000883v1_decoy chrUn_JTFH01000884v1_decoy chrUn_JTFH01000885v1_decoy 
chrUn_JTFH01000886v1_decoy chrUn_JTFH01000887v1_decoy chrUn_JTFH01000888v1_decoy chrUn_JTFH01000889v1_decoy chrUn_JTFH01000890v1_decoy chrUn_JTFH01000891v1_decoy chrUn_JTFH01000892v1_decoy chrUn_JTFH01000893v1_decoy chrUn_JTFH01000894v1_decoy chrUn_JTFH01000895v1_decoy chrUn_JTFH01000896v1_decoy chrUn_JTFH01000897v1_decoy chrUn_JTFH01000898v1_decoy chrUn_JTFH01000899v1_decoy chrUn_JTFH01000900v1_decoy chrUn_JTFH01000901v1_decoy chrUn_JTFH01000902v1_decoy chrUn_JTFH01000903v1_decoy chrUn_JTFH01000904v1_decoy chrUn_JTFH01000905v1_decoy chrUn_JTFH01000906v1_decoy chrUn_JTFH01000907v1_decoy chrUn_JTFH01000908v1_decoy chrUn_JTFH01000909v1_decoy chrUn_JTFH01000910v1_decoy chrUn_JTFH01000911v1_decoy chrUn_JTFH01000912v1_decoy chrUn_JTFH01000913v1_decoy chrUn_JTFH01000914v1_decoy chrUn_JTFH01000915v1_decoy chrUn_JTFH01000916v1_decoy chrUn_JTFH01000917v1_decoy chrUn_JTFH01000918v1_decoy chrUn_JTFH01000919v1_decoy chrUn_JTFH01000920v1_decoy chrUn_JTFH01000921v1_decoy chrUn_JTFH01000922v1_decoy chrUn_JTFH01000923v1_decoy chrUn_JTFH01000924v1_decoy chrUn_JTFH01000925v1_decoy chrUn_JTFH01000926v1_decoy chrUn_JTFH01000927v1_decoy chrUn_JTFH01000928v1_decoy chrUn_JTFH01000929v1_decoy chrUn_JTFH01000930v1_decoy chrUn_JTFH01000931v1_decoy chrUn_JTFH01000932v1_decoy chrUn_JTFH01000933v1_decoy chrUn_JTFH01000934v1_decoy chrUn_JTFH01000935v1_decoy chrUn_JTFH01000936v1_decoy chrUn_JTFH01000937v1_decoy chrUn_JTFH01000938v1_decoy chrUn_JTFH01000939v1_decoy chrUn_JTFH01000940v1_decoy chrUn_JTFH01000941v1_decoy chrUn_JTFH01000942v1_decoy chrUn_JTFH01000943v1_decoy chrUn_JTFH01000944v1_decoy chrUn_JTFH01000945v1_decoy chrUn_JTFH01000946v1_decoy chrUn_JTFH01000947v1_decoy chrUn_JTFH01000948v1_decoy chrUn_JTFH01000949v1_decoy chrUn_JTFH01000950v1_decoy chrUn_JTFH01000951v1_decoy chrUn_JTFH01000952v1_decoy chrUn_JTFH01000953v1_decoy chrUn_JTFH01000954v1_decoy chrUn_JTFH01000955v1_decoy chrUn_JTFH01000956v1_decoy chrUn_JTFH01000957v1_decoy chrUn_JTFH01000958v1_decoy chrUn_JTFH01000959v1_decoy 
chrUn_JTFH01000960v1_decoy chrUn_JTFH01000961v1_decoy chrUn_JTFH01000962v1_decoy chrUn_JTFH01000963v1_decoy chrUn_JTFH01000964v1_decoy chrUn_JTFH01000965v1_decoy chrUn_JTFH01000966v1_decoy chrUn_JTFH01000967v1_decoy chrUn_JTFH01000968v1_decoy chrUn_JTFH01000969v1_decoy chrUn_JTFH01000970v1_decoy chrUn_JTFH01000971v1_decoy chrUn_JTFH01000972v1_decoy chrUn_JTFH01000973v1_decoy chrUn_JTFH01000974v1_decoy chrUn_JTFH01000975v1_decoy chrUn_JTFH01000976v1_decoy chrUn_JTFH01000977v1_decoy chrUn_JTFH01000978v1_decoy chrUn_JTFH01000979v1_decoy chrUn_JTFH01000980v1_decoy chrUn_JTFH01000981v1_decoy chrUn_JTFH01000982v1_decoy chrUn_JTFH01000983v1_decoy chrUn_JTFH01000984v1_decoy chrUn_JTFH01000985v1_decoy chrUn_JTFH01000986v1_decoy chrUn_JTFH01000987v1_decoy chrUn_JTFH01000988v1_decoy chrUn_JTFH01000989v1_decoy chrUn_JTFH01000990v1_decoy chrUn_JTFH01000991v1_decoy chrUn_JTFH01000992v1_decoy chrUn_JTFH01000993v1_decoy chrUn_JTFH01000994v1_decoy chrUn_JTFH01000995v1_decoy chrUn_JTFH01000996v1_decoy chrUn_JTFH01000997v1_decoy chrUn_JTFH01000998v1_decoy chrUn_JTFH01000999v1_decoy chrUn_JTFH01001000v1_decoy chrUn_JTFH01001001v1_decoy chrUn_JTFH01001002v1_decoy chrUn_JTFH01001003v1_decoy chrUn_JTFH01001004v1_decoy chrUn_JTFH01001005v1_decoy chrUn_JTFH01001006v1_decoy chrUn_JTFH01001007v1_decoy chrUn_JTFH01001008v1_decoy chrUn_JTFH01001009v1_decoy chrUn_JTFH01001010v1_decoy chrUn_JTFH01001011v1_decoy chrUn_JTFH01001012v1_decoy chrUn_JTFH01001013v1_decoy chrUn_JTFH01001014v1_decoy chrUn_JTFH01001015v1_decoy chrUn_JTFH01001016v1_decoy chrUn_JTFH01001017v1_decoy chrUn_JTFH01001018v1_decoy chrUn_JTFH01001019v1_decoy chrUn_JTFH01001020v1_decoy chrUn_JTFH01001021v1_decoy chrUn_JTFH01001022v1_decoy chrUn_JTFH01001023v1_decoy chrUn_JTFH01001024v1_decoy chrUn_JTFH01001025v1_decoy chrUn_JTFH01001026v1_decoy chrUn_JTFH01001027v1_decoy chrUn_JTFH01001028v1_decoy chrUn_JTFH01001029v1_decoy chrUn_JTFH01001030v1_decoy chrUn_JTFH01001031v1_decoy chrUn_JTFH01001032v1_decoy chrUn_JTFH01001033v1_decoy 
chrUn_JTFH01001034v1_decoy chrUn_JTFH01001035v1_decoy chrUn_JTFH01001036v1_decoy chrUn_JTFH01001037v1_decoy chrUn_JTFH01001038v1_decoy chrUn_JTFH01001039v1_decoy chrUn_JTFH01001040v1_decoy chrUn_JTFH01001041v1_decoy chrUn_JTFH01001042v1_decoy chrUn_JTFH01001043v1_decoy chrUn_JTFH01001044v1_decoy chrUn_JTFH01001045v1_decoy chrUn_JTFH01001046v1_decoy chrUn_JTFH01001047v1_decoy chrUn_JTFH01001048v1_decoy chrUn_JTFH01001049v1_decoy chrUn_JTFH01001050v1_decoy chrUn_JTFH01001051v1_decoy chrUn_JTFH01001052v1_decoy chrUn_JTFH01001053v1_decoy chrUn_JTFH01001054v1_decoy chrUn_JTFH01001055v1_decoy chrUn_JTFH01001056v1_decoy chrUn_JTFH01001057v1_decoy chrUn_JTFH01001058v1_decoy chrUn_JTFH01001059v1_decoy chrUn_JTFH01001060v1_decoy chrUn_JTFH01001061v1_decoy chrUn_JTFH01001062v1_decoy chrUn_JTFH01001063v1_decoy chrUn_JTFH01001064v1_decoy chrUn_JTFH01001065v1_decoy chrUn_JTFH01001066v1_decoy chrUn_JTFH01001067v1_decoy chrUn_JTFH01001068v1_decoy chrUn_JTFH01001069v1_decoy chrUn_JTFH01001070v1_decoy chrUn_JTFH01001071v1_decoy chrUn_JTFH01001072v1_decoy chrUn_JTFH01001073v1_decoy chrUn_JTFH01001074v1_decoy chrUn_JTFH01001075v1_decoy chrUn_JTFH01001076v1_decoy chrUn_JTFH01001077v1_decoy chrUn_JTFH01001078v1_decoy chrUn_JTFH01001079v1_decoy chrUn_JTFH01001080v1_decoy chrUn_JTFH01001081v1_decoy chrUn_JTFH01001082v1_decoy chrUn_JTFH01001083v1_decoy chrUn_JTFH01001084v1_decoy chrUn_JTFH01001085v1_decoy chrUn_JTFH01001086v1_decoy chrUn_JTFH01001087v1_decoy chrUn_JTFH01001088v1_decoy chrUn_JTFH01001089v1_decoy chrUn_JTFH01001090v1_decoy chrUn_JTFH01001091v1_decoy chrUn_JTFH01001092v1_decoy chrUn_JTFH01001093v1_decoy chrUn_JTFH01001094v1_decoy chrUn_JTFH01001095v1_decoy chrUn_JTFH01001096v1_decoy chrUn_JTFH01001097v1_decoy chrUn_JTFH01001098v1_decoy chrUn_JTFH01001099v1_decoy chrUn_JTFH01001100v1_decoy chrUn_JTFH01001101v1_decoy chrUn_JTFH01001102v1_decoy chrUn_JTFH01001103v1_decoy chrUn_JTFH01001104v1_decoy chrUn_JTFH01001105v1_decoy chrUn_JTFH01001106v1_decoy chrUn_JTFH01001107v1_decoy 
chrUn_JTFH01001108v1_decoy chrUn_JTFH01001109v1_decoy chrUn_JTFH01001110v1_decoy chrUn_JTFH01001111v1_decoy chrUn_JTFH01001112v1_decoy chrUn_JTFH01001113v1_decoy chrUn_JTFH01001114v1_decoy chrUn_JTFH01001115v1_decoy chrUn_JTFH01001116v1_decoy chrUn_JTFH01001117v1_decoy chrUn_JTFH01001118v1_decoy chrUn_JTFH01001119v1_decoy chrUn_JTFH01001120v1_decoy chrUn_JTFH01001121v1_decoy chrUn_JTFH01001122v1_decoy chrUn_JTFH01001123v1_decoy chrUn_JTFH01001124v1_decoy chrUn_JTFH01001125v1_decoy chrUn_JTFH01001126v1_decoy chrUn_JTFH01001127v1_decoy chrUn_JTFH01001128v1_decoy chrUn_JTFH01001129v1_decoy chrUn_JTFH01001130v1_decoy chrUn_JTFH01001131v1_decoy chrUn_JTFH01001132v1_decoy chrUn_JTFH01001133v1_decoy chrUn_JTFH01001134v1_decoy chrUn_JTFH01001135v1_decoy chrUn_JTFH01001136v1_decoy chrUn_JTFH01001137v1_decoy chrUn_JTFH01001138v1_decoy chrUn_JTFH01001139v1_decoy chrUn_JTFH01001140v1_decoy chrUn_JTFH01001141v1_decoy chrUn_JTFH01001142v1_decoy chrUn_JTFH01001143v1_decoy chrUn_JTFH01001144v1_decoy chrUn_JTFH01001145v1_decoy chrUn_JTFH01001146v1_decoy chrUn_JTFH01001147v1_decoy chrUn_JTFH01001148v1_decoy chrUn_JTFH01001149v1_decoy chrUn_JTFH01001150v1_decoy chrUn_JTFH01001151v1_decoy chrUn_JTFH01001152v1_decoy chrUn_JTFH01001153v1_decoy chrUn_JTFH01001154v1_decoy chrUn_JTFH01001155v1_decoy chrUn_JTFH01001156v1_decoy chrUn_JTFH01001157v1_decoy chrUn_JTFH01001158v1_decoy chrUn_JTFH01001159v1_decoy chrUn_JTFH01001160v1_decoy chrUn_JTFH01001161v1_decoy chrUn_JTFH01001162v1_decoy chrUn_JTFH01001163v1_decoy chrUn_JTFH01001164v1_decoy chrUn_JTFH01001165v1_decoy chrUn_JTFH01001166v1_decoy chrUn_JTFH01001167v1_decoy chrUn_JTFH01001168v1_decoy chrUn_JTFH01001169v1_decoy chrUn_JTFH01001170v1_decoy chrUn_JTFH01001171v1_decoy chrUn_JTFH01001172v1_decoy chrUn_JTFH01001173v1_decoy chrUn_JTFH01001174v1_decoy chrUn_JTFH01001175v1_decoy chrUn_JTFH01001176v1_decoy chrUn_JTFH01001177v1_decoy chrUn_JTFH01001178v1_decoy chrUn_JTFH01001179v1_decoy chrUn_JTFH01001180v1_decoy chrUn_JTFH01001181v1_decoy 
chrUn_JTFH01001182v1_decoy chrUn_JTFH01001183v1_decoy chrUn_JTFH01001184v1_decoy chrUn_JTFH01001185v1_decoy chrUn_JTFH01001186v1_decoy chrUn_JTFH01001187v1_decoy chrUn_JTFH01001188v1_decoy chrUn_JTFH01001189v1_decoy chrUn_JTFH01001190v1_decoy chrUn_JTFH01001191v1_decoy chrUn_JTFH01001192v1_decoy chrUn_JTFH01001193v1_decoy chrUn_JTFH01001194v1_decoy chrUn_JTFH01001195v1_decoy chrUn_JTFH01001196v1_decoy chrUn_JTFH01001197v1_decoy chrUn_JTFH01001198v1_decoy chrUn_JTFH01001199v1_decoy chrUn_JTFH01001200v1_decoy chrUn_JTFH01001201v1_decoy chrUn_JTFH01001202v1_decoy chrUn_JTFH01001203v1_decoy chrUn_JTFH01001204v1_decoy chrUn_JTFH01001205v1_decoy chrUn_JTFH01001206v1_decoy chrUn_JTFH01001207v1_decoy chrUn_JTFH01001208v1_decoy chrUn_JTFH01001209v1_decoy chrUn_JTFH01001210v1_decoy chrUn_JTFH01001211v1_decoy chrUn_JTFH01001212v1_decoy chrUn_JTFH01001213v1_decoy chrUn_JTFH01001214v1_decoy chrUn_JTFH01001215v1_decoy chrUn_JTFH01001216v1_decoy chrUn_JTFH01001217v1_decoy chrUn_JTFH01001218v1_decoy chrUn_JTFH01001219v1_decoy chrUn_JTFH01001220v1_decoy chrUn_JTFH01001221v1_decoy chrUn_JTFH01001222v1_decoy chrUn_JTFH01001223v1_decoy chrUn_JTFH01001224v1_decoy chrUn_JTFH01001225v1_decoy chrUn_JTFH01001226v1_decoy chrUn_JTFH01001227v1_decoy chrUn_JTFH01001228v1_decoy chrUn_JTFH01001229v1_decoy chrUn_JTFH01001230v1_decoy chrUn_JTFH01001231v1_decoy chrUn_JTFH01001232v1_decoy chrUn_JTFH01001233v1_decoy chrUn_JTFH01001234v1_decoy chrUn_JTFH01001235v1_decoy chrUn_JTFH01001236v1_decoy chrUn_JTFH01001237v1_decoy chrUn_JTFH01001238v1_decoy chrUn_JTFH01001239v1_decoy chrUn_JTFH01001240v1_decoy chrUn_JTFH01001241v1_decoy chrUn_JTFH01001242v1_decoy chrUn_JTFH01001243v1_decoy chrUn_JTFH01001244v1_decoy chrUn_JTFH01001245v1_decoy chrUn_JTFH01001246v1_decoy chrUn_JTFH01001247v1_decoy chrUn_JTFH01001248v1_decoy chrUn_JTFH01001249v1_decoy chrUn_JTFH01001250v1_decoy chrUn_JTFH01001251v1_decoy chrUn_JTFH01001252v1_decoy chrUn_JTFH01001253v1_decoy chrUn_JTFH01001254v1_decoy chrUn_JTFH01001255v1_decoy 
chrUn_JTFH01001256v1_decoy chrUn_JTFH01001257v1_decoy chrUn_JTFH01001258v1_decoy chrUn_JTFH01001259v1_decoy chrUn_JTFH01001260v1_decoy chrUn_JTFH01001261v1_decoy chrUn_JTFH01001262v1_decoy chrUn_JTFH01001263v1_decoy chrUn_JTFH01001264v1_decoy chrUn_JTFH01001265v1_decoy chrUn_JTFH01001266v1_decoy chrUn_JTFH01001267v1_decoy chrUn_JTFH01001268v1_decoy chrUn_JTFH01001269v1_decoy chrUn_JTFH01001270v1_decoy chrUn_JTFH01001271v1_decoy chrUn_JTFH01001272v1_decoy chrUn_JTFH01001273v1_decoy chrUn_JTFH01001274v1_decoy chrUn_JTFH01001275v1_decoy chrUn_JTFH01001276v1_decoy chrUn_JTFH01001277v1_decoy chrUn_JTFH01001278v1_decoy chrUn_JTFH01001279v1_decoy chrUn_JTFH01001280v1_decoy chrUn_JTFH01001281v1_decoy chrUn_JTFH01001282v1_decoy chrUn_JTFH01001283v1_decoy chrUn_JTFH01001284v1_decoy chrUn_JTFH01001285v1_decoy chrUn_JTFH01001286v1_decoy chrUn_JTFH01001287v1_decoy chrUn_JTFH01001288v1_decoy chrUn_JTFH01001289v1_decoy chrUn_JTFH01001290v1_decoy chrUn_JTFH01001291v1_decoy chrUn_JTFH01001292v1_decoy chrUn_JTFH01001293v1_decoy chrUn_JTFH01001294v1_decoy chrUn_JTFH01001295v1_decoy chrUn_JTFH01001296v1_decoy chrUn_JTFH01001297v1_decoy chrUn_JTFH01001298v1_decoy chrUn_JTFH01001299v1_decoy chrUn_JTFH01001300v1_decoy chrUn_JTFH01001301v1_decoy chrUn_JTFH01001302v1_decoy chrUn_JTFH01001303v1_decoy chrUn_JTFH01001304v1_decoy chrUn_JTFH01001305v1_decoy chrUn_JTFH01001306v1_decoy chrUn_JTFH01001307v1_decoy chrUn_JTFH01001308v1_decoy chrUn_JTFH01001309v1_decoy chrUn_JTFH01001310v1_decoy chrUn_JTFH01001311v1_decoy chrUn_JTFH01001312v1_decoy chrUn_JTFH01001313v1_decoy chrUn_JTFH01001314v1_decoy chrUn_JTFH01001315v1_decoy chrUn_JTFH01001316v1_decoy chrUn_JTFH01001317v1_decoy chrUn_JTFH01001318v1_decoy chrUn_JTFH01001319v1_decoy chrUn_JTFH01001320v1_decoy chrUn_JTFH01001321v1_decoy chrUn_JTFH01001322v1_decoy chrUn_JTFH01001323v1_decoy chrUn_JTFH01001324v1_decoy chrUn_JTFH01001325v1_decoy chrUn_JTFH01001326v1_decoy chrUn_JTFH01001327v1_decoy chrUn_JTFH01001328v1_decoy chrUn_JTFH01001329v1_decoy 
chrUn_JTFH01001330v1_decoy chrUn_JTFH01001331v1_decoy chrUn_JTFH01001332v1_decoy chrUn_JTFH01001333v1_decoy chrUn_JTFH01001334v1_decoy chrUn_JTFH01001335v1_decoy chrUn_JTFH01001336v1_decoy chrUn_JTFH01001337v1_decoy chrUn_JTFH01001338v1_decoy chrUn_JTFH01001339v1_decoy chrUn_JTFH01001340v1_decoy chrUn_JTFH01001341v1_decoy chrUn_JTFH01001342v1_decoy chrUn_JTFH01001343v1_decoy chrUn_JTFH01001344v1_decoy chrUn_JTFH01001345v1_decoy chrUn_JTFH01001346v1_decoy chrUn_JTFH01001347v1_decoy chrUn_JTFH01001348v1_decoy chrUn_JTFH01001349v1_decoy chrUn_JTFH01001350v1_decoy chrUn_JTFH01001351v1_decoy chrUn_JTFH01001352v1_decoy chrUn_JTFH01001353v1_decoy chrUn_JTFH01001354v1_decoy chrUn_JTFH01001355v1_decoy chrUn_JTFH01001356v1_decoy chrUn_JTFH01001357v1_decoy chrUn_JTFH01001358v1_decoy chrUn_JTFH01001359v1_decoy chrUn_JTFH01001360v1_decoy chrUn_JTFH01001361v1_decoy chrUn_JTFH01001362v1_decoy chrUn_JTFH01001363v1_decoy chrUn_JTFH01001364v1_decoy chrUn_JTFH01001365v1_decoy chrUn_JTFH01001366v1_decoy chrUn_JTFH01001367v1_decoy chrUn_JTFH01001368v1_decoy chrUn_JTFH01001369v1_decoy chrUn_JTFH01001370v1_decoy chrUn_JTFH01001371v1_decoy chrUn_JTFH01001372v1_decoy chrUn_JTFH01001373v1_decoy chrUn_JTFH01001374v1_decoy chrUn_JTFH01001375v1_decoy chrUn_JTFH01001376v1_decoy chrUn_JTFH01001377v1_decoy chrUn_JTFH01001378v1_decoy chrUn_JTFH01001379v1_decoy chrUn_JTFH01001380v1_decoy chrUn_JTFH01001381v1_decoy chrUn_JTFH01001382v1_decoy chrUn_JTFH01001383v1_decoy chrUn_JTFH01001384v1_decoy chrUn_JTFH01001385v1_decoy chrUn_JTFH01001386v1_decoy chrUn_JTFH01001387v1_decoy chrUn_JTFH01001388v1_decoy chrUn_JTFH01001389v1_decoy chrUn_JTFH01001390v1_decoy chrUn_JTFH01001391v1_decoy chrUn_JTFH01001392v1_decoy chrUn_JTFH01001393v1_decoy chrUn_JTFH01001394v1_decoy chrUn_JTFH01001395v1_decoy chrUn_JTFH01001396v1_decoy chrUn_JTFH01001397v1_decoy chrUn_JTFH01001398v1_decoy chrUn_JTFH01001399v1_decoy chrUn_JTFH01001400v1_decoy chrUn_JTFH01001401v1_decoy chrUn_JTFH01001402v1_decoy chrUn_JTFH01001403v1_decoy 
chrUn_JTFH01001404v1_decoy chrUn_JTFH01001405v1_decoy chrUn_JTFH01001406v1_decoy chrUn_JTFH01001407v1_decoy chrUn_JTFH01001408v1_decoy chrUn_JTFH01001409v1_decoy chrUn_JTFH01001410v1_decoy chrUn_JTFH01001411v1_decoy chrUn_JTFH01001412v1_decoy chrUn_JTFH01001413v1_decoy chrUn_JTFH01001414v1_decoy chrUn_JTFH01001415v1_decoy chrUn_JTFH01001416v1_decoy chrUn_JTFH01001417v1_decoy chrUn_JTFH01001418v1_decoy chrUn_JTFH01001419v1_decoy chrUn_JTFH01001420v1_decoy chrUn_JTFH01001421v1_decoy chrUn_JTFH01001422v1_decoy chrUn_JTFH01001423v1_decoy chrUn_JTFH01001424v1_decoy chrUn_JTFH01001425v1_decoy chrUn_JTFH01001426v1_decoy chrUn_JTFH01001427v1_decoy chrUn_JTFH01001428v1_decoy chrUn_JTFH01001429v1_decoy chrUn_JTFH01001430v1_decoy chrUn_JTFH01001431v1_decoy chrUn_JTFH01001432v1_decoy chrUn_JTFH01001433v1_decoy chrUn_JTFH01001434v1_decoy chrUn_JTFH01001435v1_decoy chrUn_JTFH01001436v1_decoy chrUn_JTFH01001437v1_decoy chrUn_JTFH01001438v1_decoy chrUn_JTFH01001439v1_decoy chrUn_JTFH01001440v1_decoy chrUn_JTFH01001441v1_decoy chrUn_JTFH01001442v1_decoy chrUn_JTFH01001443v1_decoy chrUn_JTFH01001444v1_decoy chrUn_JTFH01001445v1_decoy chrUn_JTFH01001446v1_decoy chrUn_JTFH01001447v1_decoy chrUn_JTFH01001448v1_decoy chrUn_JTFH01001449v1_decoy chrUn_JTFH01001450v1_decoy chrUn_JTFH01001451v1_decoy chrUn_JTFH01001452v1_decoy chrUn_JTFH01001453v1_decoy chrUn_JTFH01001454v1_decoy chrUn_JTFH01001455v1_decoy chrUn_JTFH01001456v1_decoy chrUn_JTFH01001457v1_decoy chrUn_JTFH01001458v1_decoy chrUn_JTFH01001459v1_decoy chrUn_JTFH01001460v1_decoy chrUn_JTFH01001461v1_decoy chrUn_JTFH01001462v1_decoy chrUn_JTFH01001463v1_decoy chrUn_JTFH01001464v1_decoy chrUn_JTFH01001465v1_decoy chrUn_JTFH01001466v1_decoy chrUn_JTFH01001467v1_decoy chrUn_JTFH01001468v1_decoy chrUn_JTFH01001469v1_decoy chrUn_JTFH01001470v1_decoy chrUn_JTFH01001471v1_decoy chrUn_JTFH01001472v1_decoy chrUn_JTFH01001473v1_decoy chrUn_JTFH01001474v1_decoy chrUn_JTFH01001475v1_decoy chrUn_JTFH01001476v1_decoy chrUn_JTFH01001477v1_decoy 
chrUn_JTFH01001478v1_decoy chrUn_JTFH01001479v1_decoy chrUn_JTFH01001480v1_decoy chrUn_JTFH01001481v1_decoy chrUn_JTFH01001482v1_decoy chrUn_JTFH01001483v1_decoy chrUn_JTFH01001484v1_decoy chrUn_JTFH01001485v1_decoy chrUn_JTFH01001486v1_decoy chrUn_JTFH01001487v1_decoy chrUn_JTFH01001488v1_decoy chrUn_JTFH01001489v1_decoy chrUn_JTFH01001490v1_decoy chrUn_JTFH01001491v1_decoy chrUn_JTFH01001492v1_decoy chrUn_JTFH01001493v1_decoy chrUn_JTFH01001494v1_decoy chrUn_JTFH01001495v1_decoy chrUn_JTFH01001496v1_decoy chrUn_JTFH01001497v1_decoy chrUn_JTFH01001498v1_decoy chrUn_JTFH01001499v1_decoy chrUn_JTFH01001500v1_decoy chrUn_JTFH01001501v1_decoy chrUn_JTFH01001502v1_decoy chrUn_JTFH01001503v1_decoy chrUn_JTFH01001504v1_decoy chrUn_JTFH01001505v1_decoy chrUn_JTFH01001506v1_decoy chrUn_JTFH01001507v1_decoy chrUn_JTFH01001508v1_decoy chrUn_JTFH01001509v1_decoy chrUn_JTFH01001510v1_decoy chrUn_JTFH01001511v1_decoy chrUn_JTFH01001512v1_decoy chrUn_JTFH01001513v1_decoy chrUn_JTFH01001514v1_decoy chrUn_JTFH01001515v1_decoy chrUn_JTFH01001516v1_decoy chrUn_JTFH01001517v1_decoy chrUn_JTFH01001518v1_decoy chrUn_JTFH01001519v1_decoy chrUn_JTFH01001520v1_decoy chrUn_JTFH01001521v1_decoy chrUn_JTFH01001522v1_decoy chrUn_JTFH01001523v1_decoy chrUn_JTFH01001524v1_decoy chrUn_JTFH01001525v1_decoy chrUn_JTFH01001526v1_decoy chrUn_JTFH01001527v1_decoy chrUn_JTFH01001528v1_decoy chrUn_JTFH01001529v1_decoy chrUn_JTFH01001530v1_decoy chrUn_JTFH01001531v1_decoy chrUn_JTFH01001532v1_decoy chrUn_JTFH01001533v1_decoy chrUn_JTFH01001534v1_decoy chrUn_JTFH01001535v1_decoy chrUn_JTFH01001536v1_decoy chrUn_JTFH01001537v1_decoy chrUn_JTFH01001538v1_decoy chrUn_JTFH01001539v1_decoy chrUn_JTFH01001540v1_decoy chrUn_JTFH01001541v1_decoy chrUn_JTFH01001542v1_decoy chrUn_JTFH01001543v1_decoy chrUn_JTFH01001544v1_decoy chrUn_JTFH01001545v1_decoy chrUn_JTFH01001546v1_decoy chrUn_JTFH01001547v1_decoy chrUn_JTFH01001548v1_decoy chrUn_JTFH01001549v1_decoy chrUn_JTFH01001550v1_decoy chrUn_JTFH01001551v1_decoy 
chrUn_JTFH01001552v1_decoy chrUn_JTFH01001553v1_decoy chrUn_JTFH01001554v1_decoy chrUn_JTFH01001555v1_decoy chrUn_JTFH01001556v1_decoy chrUn_JTFH01001557v1_decoy chrUn_JTFH01001558v1_decoy chrUn_JTFH01001559v1_decoy chrUn_JTFH01001560v1_decoy chrUn_JTFH01001561v1_decoy chrUn_JTFH01001562v1_decoy chrUn_JTFH01001563v1_decoy chrUn_JTFH01001564v1_decoy chrUn_JTFH01001565v1_decoy chrUn_JTFH01001566v1_decoy chrUn_JTFH01001567v1_decoy chrUn_JTFH01001568v1_decoy chrUn_JTFH01001569v1_decoy chrUn_JTFH01001570v1_decoy chrUn_JTFH01001571v1_decoy chrUn_JTFH01001572v1_decoy chrUn_JTFH01001573v1_decoy chrUn_JTFH01001574v1_decoy chrUn_JTFH01001575v1_decoy chrUn_JTFH01001576v1_decoy chrUn_JTFH01001577v1_decoy chrUn_JTFH01001578v1_decoy chrUn_JTFH01001579v1_decoy chrUn_JTFH01001580v1_decoy chrUn_JTFH01001581v1_decoy chrUn_JTFH01001582v1_decoy chrUn_JTFH01001583v1_decoy chrUn_JTFH01001584v1_decoy chrUn_JTFH01001585v1_decoy chrUn_JTFH01001586v1_decoy chrUn_JTFH01001587v1_decoy chrUn_JTFH01001588v1_decoy chrUn_JTFH01001589v1_decoy chrUn_JTFH01001590v1_decoy chrUn_JTFH01001591v1_decoy chrUn_JTFH01001592v1_decoy chrUn_JTFH01001593v1_decoy chrUn_JTFH01001594v1_decoy chrUn_JTFH01001595v1_decoy chrUn_JTFH01001596v1_decoy chrUn_JTFH01001597v1_decoy chrUn_JTFH01001598v1_decoy chrUn_JTFH01001599v1_decoy chrUn_JTFH01001600v1_decoy chrUn_JTFH01001601v1_decoy chrUn_JTFH01001602v1_decoy chrUn_JTFH01001603v1_decoy chrUn_JTFH01001604v1_decoy chrUn_JTFH01001605v1_decoy chrUn_JTFH01001606v1_decoy chrUn_JTFH01001607v1_decoy chrUn_JTFH01001608v1_decoy chrUn_JTFH01001609v1_decoy chrUn_JTFH01001610v1_decoy chrUn_JTFH01001611v1_decoy chrUn_JTFH01001612v1_decoy chrUn_JTFH01001613v1_decoy chrUn_JTFH01001614v1_decoy chrUn_JTFH01001615v1_decoy chrUn_JTFH01001616v1_decoy chrUn_JTFH01001617v1_decoy chrUn_JTFH01001618v1_decoy chrUn_JTFH01001619v1_decoy chrUn_JTFH01001620v1_decoy chrUn_JTFH01001621v1_decoy chrUn_JTFH01001622v1_decoy chrUn_JTFH01001623v1_decoy chrUn_JTFH01001624v1_decoy chrUn_JTFH01001625v1_decoy 
chrUn_JTFH01001626v1_decoy chrUn_JTFH01001627v1_decoy chrUn_JTFH01001628v1_decoy chrUn_JTFH01001629v1_decoy chrUn_JTFH01001630v1_decoy chrUn_JTFH01001631v1_decoy chrUn_JTFH01001632v1_decoy chrUn_JTFH01001633v1_decoy chrUn_JTFH01001634v1_decoy chrUn_JTFH01001635v1_decoy chrUn_JTFH01001636v1_decoy chrUn_JTFH01001637v1_decoy chrUn_JTFH01001638v1_decoy chrUn_JTFH01001639v1_decoy chrUn_JTFH01001640v1_decoy chrUn_JTFH01001641v1_decoy chrUn_JTFH01001642v1_decoy chrUn_JTFH01001643v1_decoy chrUn_JTFH01001644v1_decoy chrUn_JTFH01001645v1_decoy chrUn_JTFH01001646v1_decoy chrUn_JTFH01001647v1_decoy chrUn_JTFH01001648v1_decoy chrUn_JTFH01001649v1_decoy chrUn_JTFH01001650v1_decoy chrUn_JTFH01001651v1_decoy chrUn_JTFH01001652v1_decoy chrUn_JTFH01001653v1_decoy chrUn_JTFH01001654v1_decoy chrUn_JTFH01001655v1_decoy chrUn_JTFH01001656v1_decoy chrUn_JTFH01001657v1_decoy chrUn_JTFH01001658v1_decoy chrUn_JTFH01001659v1_decoy chrUn_JTFH01001660v1_decoy chrUn_JTFH01001661v1_decoy chrUn_JTFH01001662v1_decoy chrUn_JTFH01001663v1_decoy chrUn_JTFH01001664v1_decoy chrUn_JTFH01001665v1_decoy chrUn_JTFH01001666v1_decoy chrUn_JTFH01001667v1_decoy chrUn_JTFH01001668v1_decoy chrUn_JTFH01001669v1_decoy chrUn_JTFH01001670v1_decoy chrUn_JTFH01001671v1_decoy chrUn_JTFH01001672v1_decoy chrUn_JTFH01001673v1_decoy chrUn_JTFH01001674v1_decoy chrUn_JTFH01001675v1_decoy chrUn_JTFH01001676v1_decoy chrUn_JTFH01001677v1_decoy chrUn_JTFH01001678v1_decoy chrUn_JTFH01001679v1_decoy chrUn_JTFH01001680v1_decoy chrUn_JTFH01001681v1_decoy chrUn_JTFH01001682v1_decoy chrUn_JTFH01001683v1_decoy chrUn_JTFH01001684v1_decoy chrUn_JTFH01001685v1_decoy chrUn_JTFH01001686v1_decoy chrUn_JTFH01001687v1_decoy chrUn_JTFH01001688v1_decoy chrUn_JTFH01001689v1_decoy chrUn_JTFH01001690v1_decoy chrUn_JTFH01001691v1_decoy chrUn_JTFH01001692v1_decoy chrUn_JTFH01001693v1_decoy chrUn_JTFH01001694v1_decoy chrUn_JTFH01001695v1_decoy chrUn_JTFH01001696v1_decoy chrUn_JTFH01001697v1_decoy chrUn_JTFH01001698v1_decoy chrUn_JTFH01001699v1_decoy 
chrUn_JTFH01001700v1_decoy chrUn_JTFH01001701v1_decoy chrUn_JTFH01001702v1_decoy chrUn_JTFH01001703v1_decoy chrUn_JTFH01001704v1_decoy chrUn_JTFH01001705v1_decoy chrUn_JTFH01001706v1_decoy chrUn_JTFH01001707v1_decoy chrUn_JTFH01001708v1_decoy chrUn_JTFH01001709v1_decoy chrUn_JTFH01001710v1_decoy chrUn_JTFH01001711v1_decoy chrUn_JTFH01001712v1_decoy chrUn_JTFH01001713v1_decoy chrUn_JTFH01001714v1_decoy chrUn_JTFH01001715v1_decoy chrUn_JTFH01001716v1_decoy chrUn_JTFH01001717v1_decoy chrUn_JTFH01001718v1_decoy chrUn_JTFH01001719v1_decoy chrUn_JTFH01001720v1_decoy chrUn_JTFH01001721v1_decoy chrUn_JTFH01001722v1_decoy chrUn_JTFH01001723v1_decoy chrUn_JTFH01001724v1_decoy chrUn_JTFH01001725v1_decoy chrUn_JTFH01001726v1_decoy chrUn_JTFH01001727v1_decoy chrUn_JTFH01001728v1_decoy chrUn_JTFH01001729v1_decoy chrUn_JTFH01001730v1_decoy chrUn_JTFH01001731v1_decoy chrUn_JTFH01001732v1_decoy chrUn_JTFH01001733v1_decoy chrUn_JTFH01001734v1_decoy chrUn_JTFH01001735v1_decoy chrUn_JTFH01001736v1_decoy chrUn_JTFH01001737v1_decoy chrUn_JTFH01001738v1_decoy chrUn_JTFH01001739v1_decoy chrUn_JTFH01001740v1_decoy chrUn_JTFH01001741v1_decoy chrUn_JTFH01001742v1_decoy chrUn_JTFH01001743v1_decoy chrUn_JTFH01001744v1_decoy chrUn_JTFH01001745v1_decoy chrUn_JTFH01001746v1_decoy chrUn_JTFH01001747v1_decoy chrUn_JTFH01001748v1_decoy chrUn_JTFH01001749v1_decoy chrUn_JTFH01001750v1_decoy chrUn_JTFH01001751v1_decoy chrUn_JTFH01001752v1_decoy chrUn_JTFH01001753v1_decoy chrUn_JTFH01001754v1_decoy chrUn_JTFH01001755v1_decoy chrUn_JTFH01001756v1_decoy chrUn_JTFH01001757v1_decoy chrUn_JTFH01001758v1_decoy chrUn_JTFH01001759v1_decoy chrUn_JTFH01001760v1_decoy chrUn_JTFH01001761v1_decoy chrUn_JTFH01001762v1_decoy chrUn_JTFH01001763v1_decoy chrUn_JTFH01001764v1_decoy chrUn_JTFH01001765v1_decoy chrUn_JTFH01001766v1_decoy chrUn_JTFH01001767v1_decoy chrUn_JTFH01001768v1_decoy chrUn_JTFH01001769v1_decoy chrUn_JTFH01001770v1_decoy chrUn_JTFH01001771v1_decoy chrUn_JTFH01001772v1_decoy chrUn_JTFH01001773v1_decoy 
chrUn_JTFH01001774v1_decoy chrUn_JTFH01001775v1_decoy chrUn_JTFH01001776v1_decoy chrUn_JTFH01001777v1_decoy chrUn_JTFH01001778v1_decoy chrUn_JTFH01001779v1_decoy chrUn_JTFH01001780v1_decoy chrUn_JTFH01001781v1_decoy chrUn_JTFH01001782v1_decoy chrUn_JTFH01001783v1_decoy chrUn_JTFH01001784v1_decoy chrUn_JTFH01001785v1_decoy chrUn_JTFH01001786v1_decoy chrUn_JTFH01001787v1_decoy chrUn_JTFH01001788v1_decoy chrUn_JTFH01001789v1_decoy chrUn_JTFH01001790v1_decoy chrUn_JTFH01001791v1_decoy chrUn_JTFH01001792v1_decoy chrUn_JTFH01001793v1_decoy chrUn_JTFH01001794v1_decoy chrUn_JTFH01001795v1_decoy chrUn_JTFH01001796v1_decoy chrUn_JTFH01001797v1_decoy chrUn_JTFH01001798v1_decoy chrUn_JTFH01001799v1_decoy chrUn_JTFH01001800v1_decoy chrUn_JTFH01001801v1_decoy chrUn_JTFH01001802v1_decoy chrUn_JTFH01001803v1_decoy chrUn_JTFH01001804v1_decoy chrUn_JTFH01001805v1_decoy chrUn_JTFH01001806v1_decoy chrUn_JTFH01001807v1_decoy chrUn_JTFH01001808v1_decoy chrUn_JTFH01001809v1_decoy chrUn_JTFH01001810v1_decoy chrUn_JTFH01001811v1_decoy chrUn_JTFH01001812v1_decoy chrUn_JTFH01001813v1_decoy chrUn_JTFH01001814v1_decoy chrUn_JTFH01001815v1_decoy chrUn_JTFH01001816v1_decoy chrUn_JTFH01001817v1_decoy chrUn_JTFH01001818v1_decoy chrUn_JTFH01001819v1_decoy chrUn_JTFH01001820v1_decoy chrUn_JTFH01001821v1_decoy chrUn_JTFH01001822v1_decoy chrUn_JTFH01001823v1_decoy chrUn_JTFH01001824v1_decoy chrUn_JTFH01001825v1_decoy chrUn_JTFH01001826v1_decoy chrUn_JTFH01001827v1_decoy chrUn_JTFH01001828v1_decoy chrUn_JTFH01001829v1_decoy chrUn_JTFH01001830v1_decoy chrUn_JTFH01001831v1_decoy chrUn_JTFH01001832v1_decoy chrUn_JTFH01001833v1_decoy chrUn_JTFH01001834v1_decoy chrUn_JTFH01001835v1_decoy chrUn_JTFH01001836v1_decoy chrUn_JTFH01001837v1_decoy chrUn_JTFH01001838v1_decoy chrUn_JTFH01001839v1_decoy chrUn_JTFH01001840v1_decoy chrUn_JTFH01001841v1_decoy chrUn_JTFH01001842v1_decoy chrUn_JTFH01001843v1_decoy chrUn_JTFH01001844v1_decoy chrUn_JTFH01001845v1_decoy chrUn_JTFH01001846v1_decoy chrUn_JTFH01001847v1_decoy 
chrUn_JTFH01001848v1_decoy chrUn_JTFH01001849v1_decoy chrUn_JTFH01001850v1_decoy chrUn_JTFH01001851v1_decoy chrUn_JTFH01001852v1_decoy chrUn_JTFH01001853v1_decoy chrUn_JTFH01001854v1_decoy chrUn_JTFH01001855v1_decoy chrUn_JTFH01001856v1_decoy chrUn_JTFH01001857v1_decoy chrUn_JTFH01001858v1_decoy chrUn_JTFH01001859v1_decoy chrUn_JTFH01001860v1_decoy chrUn_JTFH01001861v1_decoy chrUn_JTFH01001862v1_decoy chrUn_JTFH01001863v1_decoy chrUn_JTFH01001864v1_decoy chrUn_JTFH01001865v1_decoy chrUn_JTFH01001866v1_decoy chrUn_JTFH01001867v1_decoy chrUn_JTFH01001868v1_decoy chrUn_JTFH01001869v1_decoy chrUn_JTFH01001870v1_decoy chrUn_JTFH01001871v1_decoy chrUn_JTFH01001872v1_decoy chrUn_JTFH01001873v1_decoy chrUn_JTFH01001874v1_decoy chrUn_JTFH01001875v1_decoy chrUn_JTFH01001876v1_decoy chrUn_JTFH01001877v1_decoy chrUn_JTFH01001878v1_decoy chrUn_JTFH01001879v1_decoy chrUn_JTFH01001880v1_decoy chrUn_JTFH01001881v1_decoy chrUn_JTFH01001882v1_decoy chrUn_JTFH01001883v1_decoy chrUn_JTFH01001884v1_decoy chrUn_JTFH01001885v1_decoy chrUn_JTFH01001886v1_decoy chrUn_JTFH01001887v1_decoy chrUn_JTFH01001888v1_decoy chrUn_JTFH01001889v1_decoy chrUn_JTFH01001890v1_decoy chrUn_JTFH01001891v1_decoy chrUn_JTFH01001892v1_decoy chrUn_JTFH01001893v1_decoy chrUn_JTFH01001894v1_decoy chrUn_JTFH01001895v1_decoy chrUn_JTFH01001896v1_decoy chrUn_JTFH01001897v1_decoy chrUn_JTFH01001898v1_decoy chrUn_JTFH01001899v1_decoy chrUn_JTFH01001900v1_decoy chrUn_JTFH01001901v1_decoy chrUn_JTFH01001902v1_decoy chrUn_JTFH01001903v1_decoy chrUn_JTFH01001904v1_decoy chrUn_JTFH01001905v1_decoy chrUn_JTFH01001906v1_decoy chrUn_JTFH01001907v1_decoy chrUn_JTFH01001908v1_decoy chrUn_JTFH01001909v1_decoy chrUn_JTFH01001910v1_decoy chrUn_JTFH01001911v1_decoy chrUn_JTFH01001912v1_decoy chrUn_JTFH01001913v1_decoy chrUn_JTFH01001914v1_decoy chrUn_JTFH01001915v1_decoy chrUn_JTFH01001916v1_decoy chrUn_JTFH01001917v1_decoy chrUn_JTFH01001918v1_decoy chrUn_JTFH01001919v1_decoy chrUn_JTFH01001920v1_decoy chrUn_JTFH01001921v1_decoy 
chrUn_JTFH01001922v1_decoy chrUn_JTFH01001923v1_decoy chrUn_JTFH01001924v1_decoy chrUn_JTFH01001925v1_decoy chrUn_JTFH01001926v1_decoy chrUn_JTFH01001927v1_decoy chrUn_JTFH01001928v1_decoy chrUn_JTFH01001929v1_decoy chrUn_JTFH01001930v1_decoy chrUn_JTFH01001931v1_decoy chrUn_JTFH01001932v1_decoy chrUn_JTFH01001933v1_decoy chrUn_JTFH01001934v1_decoy chrUn_JTFH01001935v1_decoy chrUn_JTFH01001936v1_decoy chrUn_JTFH01001937v1_decoy chrUn_JTFH01001938v1_decoy chrUn_JTFH01001939v1_decoy chrUn_JTFH01001940v1_decoy chrUn_JTFH01001941v1_decoy chrUn_JTFH01001942v1_decoy chrUn_JTFH01001943v1_decoy chrUn_JTFH01001944v1_decoy chrUn_JTFH01001945v1_decoy chrUn_JTFH01001946v1_decoy chrUn_JTFH01001947v1_decoy chrUn_JTFH01001948v1_decoy chrUn_JTFH01001949v1_decoy chrUn_JTFH01001950v1_decoy chrUn_JTFH01001951v1_decoy chrUn_JTFH01001952v1_decoy chrUn_JTFH01001953v1_decoy chrUn_JTFH01001954v1_decoy chrUn_JTFH01001955v1_decoy chrUn_JTFH01001956v1_decoy chrUn_JTFH01001957v1_decoy chrUn_JTFH01001958v1_decoy chrUn_JTFH01001959v1_decoy chrUn_JTFH01001960v1_decoy chrUn_JTFH01001961v1_decoy chrUn_JTFH01001962v1_decoy chrUn_JTFH01001963v1_decoy chrUn_JTFH01001964v1_decoy chrUn_JTFH01001965v1_decoy chrUn_JTFH01001966v1_decoy chrUn_JTFH01001967v1_decoy chrUn_JTFH01001968v1_decoy chrUn_JTFH01001969v1_decoy chrUn_JTFH01001970v1_decoy chrUn_JTFH01001971v1_decoy chrUn_JTFH01001972v1_decoy chrUn_JTFH01001973v1_decoy chrUn_JTFH01001974v1_decoy chrUn_JTFH01001975v1_decoy chrUn_JTFH01001976v1_decoy chrUn_JTFH01001977v1_decoy chrUn_JTFH01001978v1_decoy chrUn_JTFH01001979v1_decoy chrUn_JTFH01001980v1_decoy chrUn_JTFH01001981v1_decoy chrUn_JTFH01001982v1_decoy chrUn_JTFH01001983v1_decoy chrUn_JTFH01001984v1_decoy chrUn_JTFH01001985v1_decoy chrUn_JTFH01001986v1_decoy chrUn_JTFH01001987v1_decoy chrUn_JTFH01001988v1_decoy chrUn_JTFH01001989v1_decoy chrUn_JTFH01001990v1_decoy chrUn_JTFH01001991v1_decoy chrUn_JTFH01001992v1_decoy chrUn_JTFH01001993v1_decoy chrUn_JTFH01001994v1_decoy chrUn_JTFH01001995v1_decoy 
chrUn_JTFH01001996v1_decoy chrUn_JTFH01001997v1_decoy chrUn_JTFH01001998v1_decoy HLA-A*01:01:01:01 HLA-A*01:01:01:02N HLA-A*01:01:38L HLA-A*01:02 HLA-A*01:03 HLA-A*01:04N HLA-A*01:09 HLA-A*01:11N HLA-A*01:14 HLA-A*01:16N HLA-A*01:20 HLA-A*02:01:01:01 HLA-A*02:01:01:02L HLA-A*02:01:01:03 HLA-A*02:01:01:04 HLA-A*02:02:01 HLA-A*02:03:01 HLA-A*02:03:03 HLA-A*02:05:01 HLA-A*02:06:01 HLA-A*02:07:01 HLA-A*02:10 HLA-A*02:251 HLA-A*02:259 HLA-A*02:264 HLA-A*02:265 HLA-A*02:266 HLA-A*02:269 HLA-A*02:279 HLA-A*02:32N HLA-A*02:376 HLA-A*02:43N HLA-A*02:455 HLA-A*02:48 HLA-A*02:51 HLA-A*02:533 HLA-A*02:53N HLA-A*02:57 HLA-A*02:60:01 HLA-A*02:65 HLA-A*02:68 HLA-A*02:77 HLA-A*02:81 HLA-A*02:89 HLA-A*02:95 HLA-A*03:01:01:01 HLA-A*03:01:01:02N HLA-A*03:01:01:03 HLA-A*03:02:01 HLA-A*03:11N HLA-A*03:21N HLA-A*03:36N HLA-A*11:01:01 HLA-A*11:01:18 HLA-A*11:02:01 HLA-A*11:05 HLA-A*11:110 HLA-A*11:25 HLA-A*11:50Q HLA-A*11:60 HLA-A*11:69N HLA-A*11:74 HLA-A*11:75 HLA-A*11:77 HLA-A*23:01:01 HLA-A*23:09 HLA-A*23:38N HLA-A*24:02:01:01 HLA-A*24:02:01:02L HLA-A*24:02:01:03 HLA-A*24:02:03Q HLA-A*24:02:10 HLA-A*24:03:01 HLA-A*24:07:01 HLA-A*24:08 HLA-A*24:09N HLA-A*24:10:01 HLA-A*24:11N HLA-A*24:152 HLA-A*24:20 HLA-A*24:215 HLA-A*24:61 HLA-A*24:86N HLA-A*25:01:01 HLA-A*26:01:01 HLA-A*26:11N HLA-A*26:15 HLA-A*26:50 HLA-A*29:01:01:01 HLA-A*29:01:01:02N HLA-A*29:02:01:01 HLA-A*29:02:01:02 HLA-A*29:46 HLA-A*30:01:01 HLA-A*30:02:01:01 HLA-A*30:02:01:02 HLA-A*30:04:01 HLA-A*30:89 HLA-A*31:01:02 HLA-A*31:01:23 HLA-A*31:04 HLA-A*31:14N HLA-A*31:46 HLA-A*32:01:01 HLA-A*32:06 HLA-A*33:01:01 HLA-A*33:03:01 HLA-A*33:07 HLA-A*34:01:01 HLA-A*34:02:01 HLA-A*36:01 HLA-A*43:01 HLA-A*66:01:01 HLA-A*66:17 HLA-A*68:01:01:01 HLA-A*68:01:01:02 HLA-A*68:01:02:01 HLA-A*68:01:02:02 HLA-A*68:02:01:01 HLA-A*68:02:01:02 HLA-A*68:02:01:03 HLA-A*68:02:02 HLA-A*68:03:01 HLA-A*68:08:01 HLA-A*68:113 HLA-A*68:17 HLA-A*68:18N HLA-A*68:22 HLA-A*68:71 HLA-A*69:01 HLA-A*74:01 HLA-A*74:02:01:01 HLA-A*74:02:01:02 HLA-A*80:01:01:01 
HLA-A*80:01:01:02 HLA-B*07:02:01 HLA-B*07:05:01 HLA-B*07:06 HLA-B*07:156 HLA-B*07:33:01 HLA-B*07:41 HLA-B*07:44 HLA-B*07:50 HLA-B*08:01:01 HLA-B*08:08N HLA-B*08:132 HLA-B*08:134 HLA-B*08:19N HLA-B*08:20 HLA-B*08:33 HLA-B*08:79 HLA-B*13:01:01 HLA-B*13:02:01 HLA-B*13:02:03 HLA-B*13:02:09 HLA-B*13:08 HLA-B*13:15 HLA-B*13:25 HLA-B*14:01:01 HLA-B*14:02:01 HLA-B*14:07N HLA-B*15:01:01:01 HLA-B*15:01:01:02N HLA-B*15:01:01:03 HLA-B*15:02:01 HLA-B*15:03:01 HLA-B*15:04:01 HLA-B*15:07:01 HLA-B*15:108 HLA-B*15:10:01 HLA-B*15:11:01 HLA-B*15:13:01 HLA-B*15:16:01 HLA-B*15:17:01:01 HLA-B*15:17:01:02 HLA-B*15:18:01 HLA-B*15:220 HLA-B*15:25:01 HLA-B*15:27:01 HLA-B*15:32:01 HLA-B*15:42 HLA-B*15:58 HLA-B*15:66 HLA-B*15:77 HLA-B*15:83 HLA-B*18:01:01:01 HLA-B*18:01:01:02 HLA-B*18:02 HLA-B*18:03 HLA-B*18:17N HLA-B*18:26 HLA-B*18:94N HLA-B*27:04:01 HLA-B*27:05:02 HLA-B*27:05:18 HLA-B*27:06 HLA-B*27:07:01 HLA-B*27:131 HLA-B*27:24 HLA-B*27:25 HLA-B*27:32 HLA-B*35:01:01:01 HLA-B*35:01:01:02 HLA-B*35:01:22 HLA-B*35:02:01 HLA-B*35:03:01 HLA-B*35:05:01 HLA-B*35:08:01 HLA-B*35:14:02 HLA-B*35:241 HLA-B*35:41 HLA-B*37:01:01 HLA-B*37:01:05 HLA-B*38:01:01 HLA-B*38:02:01 HLA-B*38:14 HLA-B*39:01:01:01 HLA-B*39:01:01:02L HLA-B*39:01:01:03 HLA-B*39:01:03 HLA-B*39:01:16 HLA-B*39:01:21 HLA-B*39:05:01 HLA-B*39:06:02 HLA-B*39:10:01 HLA-B*39:13:02 HLA-B*39:14 HLA-B*39:34 HLA-B*39:38Q HLA-B*40:01:01 HLA-B*40:01:02 HLA-B*40:02:01 HLA-B*40:03 HLA-B*40:06:01:01 HLA-B*40:06:01:02 HLA-B*40:10:01 HLA-B*40:150 HLA-B*40:40 HLA-B*40:72:01 HLA-B*40:79 HLA-B*41:01:01 HLA-B*41:02:01 HLA-B*42:01:01 HLA-B*42:02 HLA-B*42:08 HLA-B*44:02:01:01 HLA-B*44:02:01:02S HLA-B*44:02:01:03 HLA-B*44:02:17 HLA-B*44:02:27 HLA-B*44:03:01 HLA-B*44:03:02 HLA-B*44:04 HLA-B*44:09 HLA-B*44:138Q HLA-B*44:150 HLA-B*44:23N HLA-B*44:26 HLA-B*44:46 HLA-B*44:49 HLA-B*44:56N HLA-B*45:01:01 HLA-B*45:04 HLA-B*46:01:01 HLA-B*46:01:05 HLA-B*47:01:01:01 HLA-B*47:01:01:02 HLA-B*48:01:01 HLA-B*48:03:01 HLA-B*48:04 HLA-B*48:08 HLA-B*49:01:01 HLA-B*49:32 
HLA-B*50:01:01 HLA-B*51:01:01 HLA-B*51:01:02 HLA-B*51:02:01 HLA-B*51:07:01 HLA-B*51:42 HLA-B*52:01:01:01 HLA-B*52:01:01:02 HLA-B*52:01:01:03 HLA-B*52:01:02 HLA-B*53:01:01 HLA-B*53:11 HLA-B*54:01:01 HLA-B*54:18 HLA-B*55:01:01 HLA-B*55:01:03 HLA-B*55:02:01 HLA-B*55:12 HLA-B*55:24 HLA-B*55:48 HLA-B*56:01:01 HLA-B*56:03 HLA-B*56:04 HLA-B*57:01:01 HLA-B*57:03:01 HLA-B*57:06 HLA-B*57:11 HLA-B*57:29 HLA-B*58:01:01 HLA-B*58:31N HLA-B*59:01:01:01 HLA-B*59:01:01:02 HLA-B*67:01:01 HLA-B*67:01:02 HLA-B*67:02 HLA-B*73:01 HLA-B*78:01:01 HLA-B*81:01 HLA-B*82:02:01 HLA-C*01:02:01 HLA-C*01:02:11 HLA-C*01:02:29 HLA-C*01:02:30 HLA-C*01:03 HLA-C*01:06 HLA-C*01:08 HLA-C*01:14 HLA-C*01:21 HLA-C*01:30 HLA-C*01:40 HLA-C*02:02:02:01 HLA-C*02:02:02:02 HLA-C*02:10 HLA-C*02:11 HLA-C*02:16:02 HLA-C*02:69 HLA-C*02:85 HLA-C*02:86 HLA-C*02:87 HLA-C*03:02:01 HLA-C*03:02:02:01 HLA-C*03:02:02:02 HLA-C*03:02:02:03 HLA-C*03:03:01 HLA-C*03:04:01:01 HLA-C*03:04:01:02 HLA-C*03:04:02 HLA-C*03:04:04 HLA-C*03:05 HLA-C*03:06 HLA-C*03:100 HLA-C*03:13:01 HLA-C*03:20N HLA-C*03:219 HLA-C*03:261 HLA-C*03:40:01 HLA-C*03:41:02 HLA-C*03:46 HLA-C*03:61 HLA-C*04:01:01:01 HLA-C*04:01:01:02 HLA-C*04:01:01:03 HLA-C*04:01:01:04 HLA-C*04:01:01:05 HLA-C*04:01:62 HLA-C*04:03:01 HLA-C*04:06 HLA-C*04:09N HLA-C*04:128 HLA-C*04:161 HLA-C*04:177 HLA-C*04:70 HLA-C*04:71 HLA-C*05:01:01:01 HLA-C*05:01:01:02 HLA-C*05:08 HLA-C*05:09:01 HLA-C*05:93 HLA-C*06:02:01:01 HLA-C*06:02:01:02 HLA-C*06:02:01:03 HLA-C*06:23 HLA-C*06:24 HLA-C*06:46N HLA-C*07:01:01:01 HLA-C*07:01:01:02 HLA-C*07:01:02 HLA-C*07:01:19 HLA-C*07:01:27 HLA-C*07:01:45 HLA-C*07:02:01:01 HLA-C*07:02:01:02 HLA-C*07:02:01:03 HLA-C*07:02:01:04 HLA-C*07:02:01:05 HLA-C*07:02:05 HLA-C*07:02:06 HLA-C*07:02:64 HLA-C*07:04:01 HLA-C*07:04:02 HLA-C*07:06 HLA-C*07:149 HLA-C*07:18 HLA-C*07:19 HLA-C*07:26 HLA-C*07:30 HLA-C*07:32N HLA-C*07:384 HLA-C*07:385 HLA-C*07:386 HLA-C*07:391 HLA-C*07:392 HLA-C*07:49 HLA-C*07:56:02 HLA-C*07:66 HLA-C*07:67 HLA-C*08:01:01 HLA-C*08:01:03 
HLA-C*08:02:01:01 HLA-C*08:02:01:02 HLA-C*08:03:01 HLA-C*08:04:01 HLA-C*08:112 HLA-C*08:20 HLA-C*08:21 HLA-C*08:22 HLA-C*08:24 HLA-C*08:27 HLA-C*08:36N HLA-C*08:40 HLA-C*08:41 HLA-C*08:62 HLA-C*12:02:02 HLA-C*12:03:01:01 HLA-C*12:03:01:02 HLA-C*12:08 HLA-C*12:13 HLA-C*12:19 HLA-C*12:22 HLA-C*12:99 HLA-C*14:02:01 HLA-C*14:03 HLA-C*14:21N HLA-C*14:23 HLA-C*15:02:01 HLA-C*15:05:01 HLA-C*15:05:02 HLA-C*15:13 HLA-C*15:16 HLA-C*15:17 HLA-C*15:96Q HLA-C*16:01:01 HLA-C*16:02:01 HLA-C*16:04:01 HLA-C*17:01:01:01 HLA-C*17:01:01:02 HLA-C*17:01:01:03 HLA-C*17:03 HLA-C*18:01 HLA-DQA1*01:01:02 HLA-DQA1*01:02:01:01 HLA-DQA1*01:02:01:02 HLA-DQA1*01:02:01:03 HLA-DQA1*01:02:01:04 HLA-DQA1*01:03:01:01 HLA-DQA1*01:03:01:02 HLA-DQA1*01:04:01:01 HLA-DQA1*01:04:01:02 HLA-DQA1*01:05:01 HLA-DQA1*01:07 HLA-DQA1*01:10 HLA-DQA1*01:11 HLA-DQA1*02:01 HLA-DQA1*03:01:01 HLA-DQA1*03:02 HLA-DQA1*03:03:01 HLA-DQA1*04:01:02:01 HLA-DQA1*04:01:02:02 HLA-DQA1*04:02 HLA-DQA1*05:01:01:01 HLA-DQA1*05:01:01:02 HLA-DQA1*05:03 HLA-DQA1*05:05:01:01 HLA-DQA1*05:05:01:02 HLA-DQA1*05:05:01:03 HLA-DQA1*05:11 HLA-DQA1*06:01:01 HLA-DQB1*02:01:01 HLA-DQB1*02:02:01 HLA-DQB1*03:01:01:01 HLA-DQB1*03:01:01:02 HLA-DQB1*03:01:01:03 HLA-DQB1*03:02:01 HLA-DQB1*03:03:02:01 HLA-DQB1*03:03:02:02 HLA-DQB1*03:03:02:03 HLA-DQB1*03:05:01 HLA-DQB1*05:01:01:01 HLA-DQB1*05:01:01:02 HLA-DQB1*05:03:01:01 HLA-DQB1*05:03:01:02 HLA-DQB1*06:01:01 HLA-DQB1*06:02:01 HLA-DQB1*06:03:01 HLA-DQB1*06:09:01 HLA-DRB1*01:01:01 HLA-DRB1*01:02:01 HLA-DRB1*03:01:01:01 HLA-DRB1*03:01:01:02 HLA-DRB1*04:03:01 HLA-DRB1*07:01:01:01 HLA-DRB1*07:01:01:02 HLA-DRB1*08:03:02 HLA-DRB1*09:21 HLA-DRB1*10:01:01 HLA-DRB1*11:01:01 HLA-DRB1*11:01:02 HLA-DRB1*11:04:01 HLA-DRB1*12:01:01 HLA-DRB1*12:17 HLA-DRB1*13:01:01 HLA-DRB1*13:02:01 HLA-DRB1*14:05:01 HLA-DRB1*14:54:01 HLA-DRB1*15:01:01:01 HLA-DRB1*15:01:01:02 HLA-DRB1*15:01:01:03 HLA-DRB1*15:01:01:04 HLA-DRB1*15:02:01 HLA-DRB1*15:03:01:01 HLA-DRB1*15:03:01:02 HLA-DRB1*16:02:01 
delly-0.9.1/excludeTemplates/mouse.mm10.excl.tsv000066400000000000000000000120661414764127700215550ustar00rootroot00000000000000chr1 0 100000 telomere chr1 110000 3000000 centromere chr1 195371971 195471971 telomere chr2 0 100000 telomere chr2 110000 3000000 centromere chr2 182013224 182113224 telomere chr3 0 100000 telomere chr3 110000 3000000 centromere chr3 159939680 160039680 telomere chr4 0 100000 telomere chr4 110000 3000000 centromere chr4 156408116 156508116 telomere chr5 0 100000 telomere chr5 110000 3000000 centromere chr5 151734684 151834684 telomere chr6 0 100000 telomere chr6 110000 3000000 centromere chr6 149636546 149736546 telomere chr7 0 100000 telomere chr7 110000 3000000 centromere chr7 145341459 145441459 telomere chr8 0 100000 telomere chr8 110000 3000000 centromere chr8 129301213 129401213 telomere chr9 0 100000 telomere chr9 110000 3000000 centromere chr9 124495110 124595110 telomere chrX 110000 3000000 centromere chrX 0 100000 telomere chrX 170931299 171031299 telomere chrY 0 100000 telomere chrY 91644698 91744698 telomere chr10 0 100000 telomere chr10 110000 3000000 centromere chr10 130594993 130694993 telomere chr11 0 100000 telomere chr11 110000 3000000 centromere chr11 121982543 122082543 telomere chr12 0 100000 telomere chr12 110000 3000000 centromere chr12 120029022 120129022 telomere chr13 0 100000 telomere chr13 110000 3000000 centromere chr13 120321639 120421639 telomere chr14 0 100000 telomere chr14 110000 3000000 centromere chr14 124802244 124902244 telomere chr15 110000 3000000 centromere chr15 0 100000 telomere chr15 103943685 104043685 telomere chr16 0 100000 telomere chr16 110000 3000000 centromere chr16 98107768 98207768 telomere chr17 110000 3000000 centromere chr17 94887271 94987271 telomere chr17 0 100000 telomere chr18 0 100000 telomere chr18 110000 3000000 centromere chr18 90602639 90702639 telomere chr19 0 100000 telomere chr19 110000 3000000 centromere chr19 61331566 61431566 telomere 1 0 100000 telomere 1 110000 
3000000 centromere 1 195371971 195471971 telomere 2 0 100000 telomere 2 110000 3000000 centromere 2 182013224 182113224 telomere 3 0 100000 telomere 3 110000 3000000 centromere 3 159939680 160039680 telomere 4 0 100000 telomere 4 110000 3000000 centromere 4 156408116 156508116 telomere 5 0 100000 telomere 5 110000 3000000 centromere 5 151734684 151834684 telomere 6 0 100000 telomere 6 110000 3000000 centromere 6 149636546 149736546 telomere 7 0 100000 telomere 7 110000 3000000 centromere 7 145341459 145441459 telomere 8 0 100000 telomere 8 110000 3000000 centromere 8 129301213 129401213 telomere 9 0 100000 telomere 9 110000 3000000 centromere 9 124495110 124595110 telomere X 110000 3000000 centromere X 0 100000 telomere X 170931299 171031299 telomere Y 0 100000 telomere Y 91644698 91744698 telomere 10 0 100000 telomere 10 110000 3000000 centromere 10 130594993 130694993 telomere 11 0 100000 telomere 11 110000 3000000 centromere 11 121982543 122082543 telomere 12 0 100000 telomere 12 110000 3000000 centromere 12 120029022 120129022 telomere 13 0 100000 telomere 13 110000 3000000 centromere 13 120321639 120421639 telomere 14 0 100000 telomere 14 110000 3000000 centromere 14 124802244 124902244 telomere 15 110000 3000000 centromere 15 0 100000 telomere 15 103943685 104043685 telomere 16 0 100000 telomere 16 110000 3000000 centromere 16 98107768 98207768 telomere 17 110000 3000000 centromere 17 94887271 94987271 telomere 17 0 100000 telomere 18 0 100000 telomere 18 110000 3000000 centromere 18 90602639 90702639 telomere 19 0 100000 telomere 19 110000 3000000 centromere 19 61331566 61431566 telomere chr1_GL456210_random chr1_GL456211_random chr1_GL456212_random chr1_GL456213_random chr1_GL456221_random chr4_GL456216_random chr4_GL456350_random chr4_JH584292_random chr4_JH584293_random chr4_JH584294_random chr4_JH584295_random chr5_GL456354_random chr5_JH584296_random chr5_JH584297_random chr5_JH584298_random chr5_JH584299_random chr7_GL456219_random chrUn_GL456239 
chrUn_GL456359 chrUn_GL456360 chrUn_GL456366 chrUn_GL456367 chrUn_GL456368 chrUn_GL456370 chrUn_GL456372 chrUn_GL456378 chrUn_GL456379 chrUn_GL456381 chrUn_GL456382 chrUn_GL456383 chrUn_GL456385 chrUn_GL456387 chrUn_GL456389 chrUn_GL456390 chrUn_GL456392 chrUn_GL456393 chrUn_GL456394 chrUn_GL456396 chrUn_JH584304 chrX_GL456233_random chrY_JH584300_random chrY_JH584301_random chrY_JH584302_random chrY_JH584303_random GL456210 GL456211 GL456212 GL456213 GL456221 GL456216 GL456350 JH584292 JH584293 JH584294 JH584295 GL456354 JH584296 JH584297 JH584298 JH584299 GL456219 GL456239 GL456359 GL456360 GL456366 GL456367 GL456368 GL456370 GL456372 GL456378 GL456379 GL456381 GL456382 GL456383 GL456385 GL456387 GL456389 GL456390 GL456392 GL456393 GL456394 GL456396 JH584304 GL456233 JH584300 JH584301 JH584302 JH584303 GL456210.1 GL456211.1 GL456212.1 GL456213.1 GL456221.1 GL456216.1 GL456350.1 JH584292.1 JH584293.1 JH584294.1 JH584295.1 GL456354.1 JH584296.1 JH584297.1 JH584298.1 JH584299.1 GL456219.1 GL456239.1 GL456359.1 GL456360.1 GL456366.1 GL456367.1 GL456368.1 GL456370.1 GL456372.1 GL456378.1 GL456379.1 GL456381.1 GL456382.1 GL456383.1 GL456385.1 GL456387.1 GL456389.1 GL456390.1 GL456392.1 GL456393.1 GL456394.1 GL456396.1 JH584304.1 GL456233.1 JH584300.1 JH584301.1 JH584302.1 JH584303.1 chrMT chrM MT delly-0.9.1/excludeTemplates/mouse.mm9.excl.tsv000066400000000000000000000024761414764127700215110ustar00rootroot00000000000000chr1 0 3000000 centromere chr2 0 3000000 centromere chr3 0 3000000 centromere chr4 0 3000000 centromere chr5 0 3000000 centromere chr6 0 3000000 centromere chr7 0 3000000 centromere chr8 0 3000000 centromere chr9 0 3000000 centromere chrX 0 3000000 centromere chrY 2902555 5902555 centromere chr10 0 3000000 centromere chr11 0 3000000 centromere chr12 0 3000000 centromere chr13 0 3000000 centromere chr14 0 3000000 centromere chr15 0 3000000 centromere chr16 0 3000000 centromere chr17 0 3000000 centromere chr18 0 3000000 centromere chr19 0 3000000 
centromere 1 0 3000000 centromere 2 0 3000000 centromere 3 0 3000000 centromere 4 0 3000000 centromere 5 0 3000000 centromere 6 0 3000000 centromere 7 0 3000000 centromere 8 0 3000000 centromere 9 0 3000000 centromere X 0 3000000 centromere Y 2902555 5902555 centromere 10 0 3000000 centromere 11 0 3000000 centromere 12 0 3000000 centromere 13 0 3000000 centromere 14 0 3000000 centromere 15 0 3000000 centromere 16 0 3000000 centromere 17 0 3000000 centromere 18 0 3000000 centromere 19 0 3000000 centromere chr13_random chr16_random chr17_random chr1_random chr3_random chr4_random chr5_random chr7_random chr8_random chr9_random chrUn_random chrX_random chrY_random 13_random 16_random 17_random 1_random 3_random 4_random 5_random 7_random 8_random 9_random Un_random X_random Y_random delly-0.9.1/excludeTemplates/yeast.sacCer3.excl.tsv000066400000000000000000000026561414764127700222670ustar00rootroot00000000000000chrI 1 801 telomere chrI 229411 230218 telomere chrII 1 6608 telomere chrII 307587 308887 telomere chrII 812379 813184 telomere chrIII 1 1098 telomere chrIII 315354 316620 telomere chrIV 1 904 telomere chrIV 1524625 1531933 telomere chrIX 1 7784 telomere chrIX 439068 439888 telomere chrV 1 6473 telomere chrV 549566 549814 telomere chrV 569599 576874 telomere chrVI 1 6155 telomere chrVI 269731 270161 telomere chrVII 1 781 telomere chrVII 1083635 1090940 telomere chrVIII 1 5505 telomere chrVIII 556105 562643 telomere chrX 1 7767 telomere chrX 744902 745751 telomere chrXI 1 807 telomere chrXI 665904 666816 telomere chrXII 1 12085 telomere chrXII 1064281 1078177 telomere chrXIII 1 6344 telomere chrXIII 923541 924431 telomere chrXIV 1 7428 telomere chrXIV 783278 784333 telomere chrXV 1 847 telomere chrXV 1083922 1091291 telomere chrXVI 1 7223 telomere chrXVI 942396 948010 telomere chrI 151465 151582 centromere chrII 238207 238323 centromere chrIII 114385 114501 centromere chrIV 449711 449821 centromere chrIX 355629 355745 centromere chrV 151987 152104 centromere 
chrVI 148510 148627 centromere chrVII 496920 497038 centromere chrVIII 105586 105703 centromere chrX 436307 436425 centromere chrXI 440129 440246 centromere chrXII 5724 11196 centromere chrXII 150828 150947 centromere chrXII 1065156 1071645 centromere chrXIII 268031 268149 centromere chrXIV 628758 628875 centromere chrXV 326584 326702 centromere chrXVI 555957 556073 centromere delly-0.9.1/singularity/000077500000000000000000000000001414764127700152215ustar00rootroot00000000000000delly-0.9.1/singularity/README.md000066400000000000000000000003731414764127700165030ustar00rootroot00000000000000You can build a [delly](https://github.com/dellytools/delly) singularity container (SIF file) using `sudo singularity build delly.sif delly.def` Once you have built the container you can run analysis using `singularity exec delly.sif delly --help` delly-0.9.1/singularity/delly.def000066400000000000000000000022231414764127700170110ustar00rootroot00000000000000# Build image BootStrap: library From: ubuntu:16.04 Stage: build %post apt-get -y update apt-get install -y autoconf build-essential cmake g++ gfortran git libcurl4-gnutls-dev hdf5-tools libboost-date-time-dev libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-iostreams-dev libbz2-dev libhdf5-dev libncurses-dev liblzma-dev zlib1g-dev apt-get clean rm -rf /var/lib/apt/lists/* cd /opt git clone --recursive https://github.com/dellytools/delly.git cd /opt/delly/ make STATIC=1 all make install # Final image BootStrap: library From: alpine:latest Stage: final %files from build /opt/delly/bin/delly /bin/delly %post DELLYVERSION=`./bin/delly --version` echo "export DELLYVERSION=\"${DELLYVERSION}\"" >> $SINGULARITY_ENVIRONMENT CREATEDATE=`date` echo "export CREATEDATE=\"${CREATEDATE}\"" >> $SINGULARITY_ENVIRONMENT %environment export PATH=/bin:$PATH %runscript delly %labels Singularity definition file version v0.0.1 %help This is a container running delly. 
You can run delly on files inside the directory where you start the container, i.e.: singularity exec delly.sif delly call -g ref.fa input.bam delly-0.9.1/src/000077500000000000000000000000001414764127700134365ustar00rootroot00000000000000delly-0.9.1/src/align.h000066400000000000000000000163671414764127700147160ustar00rootroot00000000000000#ifndef ALIGN_H #define ALIGN_H #include #include namespace torali { template struct DnaScore { typedef TScoreValue TValue; TScoreValue match; TScoreValue mismatch; TScoreValue go; TScoreValue ge; TScoreValue inf; DnaScore() { match = 5; mismatch = -4; go = -10; ge = -1; inf = 1000000; } DnaScore(TScoreValue m, TScoreValue mm, TScoreValue gapopen, TScoreValue gapextension) : match(m), mismatch(mm), go(gapopen), ge(gapextension) { inf = 1000000; } }; // Configure the DP matrix template class AlignConfig; template<> class AlignConfig {}; template<> class AlignConfig {}; template<> class AlignConfig {}; template<> class AlignConfig {}; template inline TCost _verticalGap(AlignConfig const&, TPos1 const, TPos2 const, TCost const cost) { return cost; } template inline TCost _verticalGap(AlignConfig const&, TPos1 const i, TPos2 const iend, TCost const cost) { if ((i == (TPos1) 0) || (i == (TPos1) iend)) return 0; else return cost; } template inline TCost _horizontalGap(AlignConfig const&, TPos1 const, TPos2 const, TCost const cost) { return cost; } template inline TCost _horizontalGap(AlignConfig const&, TPos1 const i, TPos2 const iend, TCost const cost) { if ((i == (TPos1) 0) || (i == (TPos1) iend)) return 0; else return cost; } template inline std::size_t _size(boost::multi_array const& a, TDimension const i) { return a.shape()[i]; } template inline std::size_t _size(std::string const& s, TDimension const i) { if (i) return s.size(); return 1; } template inline int _score(std::string const& s1, std::string const& s2, TProfile const&, TProfile const&, TAIndex row, TAIndex col, TScore const& sc) { return (s1[row] == s2[col] ? 
sc.match : sc.mismatch ); } template inline int _score(boost::multi_array const& a1, boost::multi_array const& a2, TProfile const& p1, TProfile const& p2, TAIndex row, TAIndex col, TScore const& sc) { if ((a1.shape()[0] == 1) && (a2.shape()[0] == 1)) { if (a1[0][row] == a2[0][col]) return sc.match; else return sc.mismatch; } else { typedef typename TProfile::index TPIndex; float score = 0; for(TPIndex k1 = 0; k1<5; ++k1) for(TPIndex k2 = 0; k2<5; ++k2) score += p1[k1][row] * p2[k2][col] * ( (k1 == k2) ? sc.match : sc.mismatch ); return ((int) score); } } template inline void _createProfile(std::string const& s, TProfile& p) { typedef typename TProfile::index TPIndex; p.resize(boost::extents[6][s.size()]); // 'A', 'C', 'G', 'T', 'N', '-' for (std::size_t j = 0; j < s.size(); ++j) { for(TPIndex k = 0; k < 6; ++k) p[k][j] = 0; if ((s[j] == 'A') || (s[j] == 'a')) p[0][j] += 1; else if ((s[j] == 'C') || (s[j] == 'c')) p[1][j] += 1; else if ((s[j] == 'G') || (s[j] == 'g')) p[2][j] += 1; else if ((s[j] == 'T') || (s[j] == 't')) p[3][j] += 1; else if ((s[j] == 'N') || (s[j] == 'n')) p[4][j] += 1; else if (s[j] == '-') p[5][j] += 1; } } template inline void _createProfile(boost::multi_array const& a, TProfile& p) { typedef typename boost::multi_array::index TAIndex; typedef typename TProfile::index TPIndex; p.resize(boost::extents[6][a.shape()[1]]); // 'A', 'C', 'G', 'T', 'N', '-' // Ignore leading and trailing gaps std::vector firstAlignedNuc(a.shape()[0], -1); std::vector lastAlignedNuc(a.shape()[0], a.shape()[1]); for(TAIndex i = 0; i < (TAIndex) a.shape()[0]; ++i) { for (TAIndex j = 0; j < (TAIndex) a.shape()[1]; ++j) { if (firstAlignedNuc[i] == -1) { if (a[i][j] != '-') firstAlignedNuc[i] = j; } if (firstAlignedNuc[i] != -1) { if (a[i][j] != '-') lastAlignedNuc[i] = j; } } } // Compute alignment profile for (TAIndex j = 0; j < (TAIndex) a.shape()[1]; ++j) { for(TPIndex k = 0; k < 6; ++k) p[k][j] = 0; int sum = 0; for(TAIndex i = 0; i < (TAIndex) a.shape()[0]; ++i) { if 
((firstAlignedNuc[i] <= j) && (j <= lastAlignedNuc[i])) { ++sum; if ((a[i][j] == 'A') || (a[i][j] == 'a')) p[0][j] += 1; else if ((a[i][j] == 'C') || (a[i][j] == 'c')) p[1][j] += 1; else if ((a[i][j] == 'G') || (a[i][j] == 'g')) p[2][j] += 1; else if ((a[i][j] == 'T') || (a[i][j] == 't')) p[3][j] += 1; else if ((a[i][j] == 'N') || (a[i][j] == 'n')) p[4][j] += 1; else if (a[i][j] == '-') p[5][j] += 1; else --sum; } } for(TPIndex k = 0; k<6; ++k) p[k][j] /= sum; } } template inline void _createLocalAlignment(TTrace const& trace, std::string const& s1, std::string const& s2, TAlign& align, int32_t const maxRow, int32_t const maxCol) { align.resize(boost::extents[2][trace.size()]); std::size_t row = maxRow; std::size_t col = maxCol; std::size_t ai = 0; for(typename TTrace::const_reverse_iterator itT = trace.rbegin(); itT != trace.rend(); ++itT, ++ai) { if (*itT == 's') { align[0][ai] = s1[row++]; align[1][ai] = s2[col++]; } else if (*itT =='h') { align[0][ai] = '-'; align[1][ai] = s2[col++]; } else { align[0][ai] = s1[row++]; align[1][ai] = '-'; } } } template inline void _createAlignment(TTrace const& trace, std::string const& s1, std::string const& s2, TAlign& align) { _createLocalAlignment(trace, s1, s2, align, 0, 0); } template inline void _createAlignment(TTrace const& trace, boost::multi_array const& a1, boost::multi_array const& a2, TAlign& align) { typedef typename TAlign::index TAIndex; TAIndex numN = a1.shape()[0]; TAIndex numM = a2.shape()[0]; align.resize(boost::extents[numN + numM][trace.size()]); TAIndex row = 0; TAIndex col = 0; TAIndex ai = 0; for(typename TTrace::const_reverse_iterator itT = trace.rbegin(); itT != trace.rend(); ++itT, ++ai) { if (*itT == 's') { for(TAIndex i = 0; i #include #include "msa.h" #include "split.h" #include "gotoh.h" #include "needle.h" namespace torali { struct SeqSlice { int32_t svid; int32_t sstart; int32_t inslen; int32_t qual; // Only required for junction count map SeqSlice() : svid(-1), sstart(-1), inslen(-1), 
qual(-1) {} SeqSlice(int32_t const sv, int32_t const sst, int32_t const il, int32_t q) : svid(sv), sstart(sst), inslen(il), qual(q) {} }; template inline void assemble(TConfig const& c, TValidRegion const& validRegions, std::vector& svs, TSRStore& srStore) { // Sequence store typedef std::set TSequences; typedef std::vector TSVSequences; TSVSequences seqStore(svs.size(), TSequences()); // SV consensus done std::vector svcons(svs.size(), false); // Open file handles typedef std::vector TSamFile; typedef std::vector TIndex; TSamFile samfile(c.files.size()); TIndex idx(c.files.size()); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r"); hts_set_fai_filename(samfile[file_c], c.genome.string().c_str()); idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str()); } bam_hdr_t* hdr = sam_hdr_read(samfile[0]); // Parse BAM boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Split-read assembly" << std::endl; boost::progress_display show_progress( hdr->n_targets ); faidx_t* fai = fai_load(c.genome.string().c_str()); for(int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { ++show_progress; if (validRegions[refIndex].empty()) continue; // Load sequence int32_t seqlen = -1; std::string tname(hdr->target_name[refIndex]); char* seq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr->target_len[refIndex], &seqlen); // Collect reads from all samples for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { // Read alignments (full chromosome because primary alignments might be somewhere else) hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, 0, hdr->target_len[refIndex]); bam1_t* rec = bam_init1(); while (sam_itr_next(samfile[file_c], iter, rec) >= 0) { // Only primary alignments with the full sequence information if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | 
BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) continue; std::size_t seed = hash_lr(rec); if (srStore.find(seed) != srStore.end()) { for(uint32_t ri = 0; ri < srStore[seed].size(); ++ri) { int32_t svid = srStore[seed][ri].svid; //std::cerr << svs[svid].svStart << ',' << svs[svid].svEnd << ',' << svs[svid].svt << ',' << svid << " SV" << std::endl; //std::cerr << seed << '\t' << srStore[seed][ri].svid << '\t' << srStore[seed][ri].sstart << '\t' << srStore[seed][ri].inslen << '\t' << sv[srStore[seed][ri].svid].srSupport << '\t' << sv[srStore[seed][ri].svid].svt << std::endl; if ((!svcons[svid]) && (seqStore[svid].size() < c.maxReadPerSV)) { // Get sequence std::string sequence; sequence.resize(rec->core.l_qseq); uint8_t* seqptr = bam_get_seq(rec); for (int i = 0; i < rec->core.l_qseq; ++i) sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)]; int32_t readlen = sequence.size(); // Extract subsequence (otherwise MSA takes forever) int32_t window = 1000; int32_t sPos = srStore[seed][ri].sstart - window; int32_t ePos = srStore[seed][ri].sstart + srStore[seed][ri].inslen + window; if (rec->core.flag & BAM_FREVERSE) { sPos = (readlen - (srStore[seed][ri].sstart + srStore[seed][ri].inslen)) - window; ePos = (readlen - srStore[seed][ri].sstart) + window; } if (sPos < 0) sPos = 0; if (ePos > (int32_t) readlen) ePos = readlen; // Min. seq length and max insertion size, 10kbp? if (((ePos - sPos) > window) && ((ePos - sPos) <= (10000 + window))) { std::string seqalign = sequence.substr(sPos, (ePos - sPos)); if ((svs[svid].svt == 5) || (svs[svid].svt == 6)) { if (svs[svid].chr == refIndex) reverseComplement(seqalign); } seqStore[svid].insert(seqalign); // Enough split-reads? 
if ((!_translocation(svs[svid].svt)) && (svs[svid].chr == refIndex)) { if ((seqStore[svid].size() == c.maxReadPerSV) || ((int32_t) seqStore[svid].size() == svs[svid].srSupport)) { bool msaSuccess = false; if (seqStore[svid].size() > 1) { //std::cerr << svs[svid].svStart << ',' << svs[svid].svEnd << ',' << svs[svid].svt << ',' << svid << " SV" << std::endl; //for(typename TSequences::iterator it = seqStore[svid].begin(); it != seqStore[svid].end(); ++it) std::cerr << *it << std::endl; msa(c, seqStore[svid], svs[svid].consensus); //outputConsensus(hdr, svs[svid], svs[svid].consensus); if ((svs[svid].svt == 1) || (svs[svid].svt == 5)) reverseComplement(svs[svid].consensus); //std::cerr << svs[svid].consensus << std::endl; if (alignConsensus(c, hdr, seq, NULL, svs[svid])) msaSuccess = true; //std::cerr << msaSuccess << std::endl; } if (!msaSuccess) { svs[svid].consensus = ""; svs[svid].srSupport = 0; svs[svid].srAlignQuality = 0; } seqStore[svid].clear(); svcons[svid] = true; } } } } } } } bam_destroy1(rec); hts_itr_destroy(iter); } // Handle left-overs and translocations for(int32_t refIndex2 = 0; refIndex2 <= refIndex; ++refIndex2) { char* sndSeq = NULL; for(uint32_t svid = 0; svid < svcons.size(); ++svid) { if (!svcons[svid]) { if ((svs[svid].chr != refIndex) || (svs[svid].chr2 != refIndex2)) continue; bool msaSuccess = false; if (seqStore[svid].size() > 1) { // Lazy loading of references if (refIndex != refIndex2) { if (sndSeq == NULL) { int32_t seqlen = -1; std::string tname(hdr->target_name[refIndex2]); sndSeq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr->target_len[refIndex2], &seqlen); } } //std::cerr << svs[svid].svStart << ',' << svs[svid].svEnd << ',' << svs[svid].svt << ',' << svid << " SV" << std::endl; //for(typename TSequences::iterator it = seqStore[svid].begin(); it != seqStore[svid].end(); ++it) std::cerr << *it << std::endl; msa(c, seqStore[svid], svs[svid].consensus); //outputConsensus(hdr, svs[svid], svs[svid].consensus); if ((svs[svid].svt == 1) 
|| (svs[svid].svt == 5)) reverseComplement(svs[svid].consensus); //std::cerr << "Consensus: " << svs[svid].consensus << std::endl; if (alignConsensus(c, hdr, seq, sndSeq, svs[svid])) msaSuccess = true; //std::cerr << msaSuccess << std::endl; } if (!msaSuccess) { svs[svid].consensus = ""; svs[svid].srSupport = 0; svs[svid].srAlignQuality = 0; } seqStore[svid].clear(); svcons[svid] = true; } } if (sndSeq != NULL) free(sndSeq); } // Clean-up if (seq != NULL) free(seq); } // Clean-up fai_destroy(fai); bam_hdr_destroy(hdr); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { hts_idx_destroy(idx[file_c]); sam_close(samfile[file_c]); } // Clean-up unfinished SVs for(uint32_t svid = 0; svid < svcons.size(); ++svid) { if (!svcons[svid]) { //std::cerr << "Missing: " << svid << ',' << svs[svid].svt << std::endl; svs[svid].consensus = ""; svs[svid].srSupport = 0; svs[svid].srAlignQuality = 0; } } } } #endif delly-0.9.1/src/bed.h000066400000000000000000000075351414764127700143530ustar00rootroot00000000000000#ifndef BED_H #define BED_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace torali { // Flattens overlapping intervals template inline int32_t _parseBedIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) { typedef typename TRegionsGenome::value_type TChrIntervals; typedef typename TChrIntervals::interval_type TIVal; int32_t intervals = 0; if (filePresent) { bedRegions.resize(hdr->n_targets, TChrIntervals()); std::ifstream chrFile(filename.c_str(), std::ifstream::in); if (chrFile.is_open()) { while (chrFile.good()) { std::string chrFromFile; getline(chrFile, chrFromFile); typedef boost::tokenizer< boost::char_separator > Tokenizer; boost::char_separator sep(" \t,;"); Tokenizer tokens(chrFromFile, sep); Tokenizer::iterator tokIter = tokens.begin(); if (tokIter!=tokens.end()) { 
std::string chrName = *tokIter++; int32_t tid = bam_name2id(hdr, chrName.c_str()); if (tid >= 0) { if (tokIter!=tokens.end()) { int32_t start = boost::lexical_cast(*tokIter++); int32_t end = boost::lexical_cast(*tokIter++); bedRegions[tid].insert(TIVal::right_open(start, end)); ++intervals; } } } } } chrFile.close(); } return intervals; } // Keeps overlapping intervals template inline int32_t _parsePotOverlappingIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) { typedef typename TRegionsGenome::value_type TChrIntervals; int32_t intervals = 0; if (filePresent) { bedRegions.resize(hdr->n_targets, TChrIntervals()); std::ifstream chrFile(filename.c_str(), std::ifstream::in); if (chrFile.is_open()) { while (chrFile.good()) { std::string chrFromFile; getline(chrFile, chrFromFile); typedef boost::tokenizer< boost::char_separator > Tokenizer; boost::char_separator sep(" \t,;"); Tokenizer tokens(chrFromFile, sep); Tokenizer::iterator tokIter = tokens.begin(); if (tokIter!=tokens.end()) { std::string chrName = *tokIter++; int32_t tid = bam_name2id(hdr, chrName.c_str()); if (tid >= 0) { if (tokIter!=tokens.end()) { int32_t start = boost::lexical_cast(*tokIter++); int32_t end = boost::lexical_cast(*tokIter++); bedRegions[tid].insert(std::make_pair(start, end)); ++intervals; } } } } } chrFile.close(); } return intervals; } template inline void _mergeOverlappingBedEntries(TChrIntervals const& bedRegions, TChrIntervals& citv) { typedef boost::icl::interval_set TUniqueIntervals; typedef typename TUniqueIntervals::interval_type TIVal; TUniqueIntervals uitv; // Insert intervals for(typename TChrIntervals::const_iterator it = bedRegions.begin(); it != bedRegions.end(); ++it) uitv.insert(TIVal::right_open(it->first, it->second)); // Fetch unique intervals for(typename TUniqueIntervals::iterator it = uitv.begin(); it != uitv.end(); ++it) citv.insert(std::make_pair(it->lower(), it->upper())); } } #endif 
delly-0.9.1/src/bolog.h000066400000000000000000000117401414764127700147140ustar00rootroot00000000000000#ifndef BOLOG_H #define BOLOG_H #include #include namespace torali { #define SMALLEST_GL -1000 template struct BoLog { typedef TPrecision value_type; std::vector phred2prob; BoLog() { for(int i = 0; i <= boost::math::round(-10 * SMALLEST_GL); ++i) phred2prob.push_back(std::pow(TPrecision(10), -(TPrecision(i)/TPrecision(10)))); } }; template inline void _computeGLs(TBoLog const& bl, TMapqVector const& mapqRef, TMapqVector const& mapqAlt, float* gls, int32_t* gqval, int32_t* gts, int const file_c) { typedef typename TBoLog::value_type FLP; FLP gl[3]; // Compute genotype likelihoods for(unsigned int geno=0; geno<=2; ++geno) gl[geno]=0; unsigned int peDepth=mapqRef.size() + mapqAlt.size(); for(typename TMapqVector::const_iterator mapqRefIt = mapqRef.begin();mapqRefIt!=mapqRef.end();++mapqRefIt) { gl[0] += std::log10(bl.phred2prob[*mapqRefIt]); gl[1] += std::log10(bl.phred2prob[*mapqRefIt] + (FLP(1) - bl.phred2prob[*mapqRefIt])); gl[2] += std::log10(FLP(1) - bl.phred2prob[*mapqRefIt]); } for(typename TMapqVector::const_iterator mapqAltIt = mapqAlt.begin();mapqAltIt!=mapqAlt.end();++mapqAltIt) { gl[0] += std::log10(FLP(1) - bl.phred2prob[*mapqAltIt]); gl[1] += std::log10((FLP(1) - bl.phred2prob[*mapqAltIt]) + bl.phred2prob[*mapqAltIt]); gl[2] += std::log10(bl.phred2prob[*mapqAltIt]); } gl[1] += -FLP(peDepth) * std::log10(FLP(2)); unsigned int glBest=0; FLP glBestVal=gl[glBest]; for(unsigned int geno=1; geno<=2; ++geno) { if (gl[geno] > glBestVal) { glBestVal=gl[geno]; glBest = geno; } } // Rescale by best genotype for(unsigned int geno=0; geno<=2; ++geno) { gl[geno] -= glBestVal; // Cap at smallest GL gl[geno] = (gl[geno] > SMALLEST_GL) ? 
gl[geno] : SMALLEST_GL; } // Phred-scaled genotype likelihoods uint32_t pl[3]; pl[0] = (uint32_t) boost::math::round(-10 * gl[0]); pl[1] = (uint32_t) boost::math::round(-10 * gl[1]); pl[2] = (uint32_t) boost::math::round(-10 * gl[2]); if ((peDepth) && (pl[0] + pl[1] + pl[2] > 0)) { FLP likelihood = (FLP) std::log10((1-1/(bl.phred2prob[pl[0]]+bl.phred2prob[pl[1]]+bl.phred2prob[pl[2]]))); likelihood = (likelihood > SMALLEST_GL) ? likelihood : SMALLEST_GL; gqval[file_c] = (int32_t) boost::math::round(-10 * likelihood); if (glBest==0) { gts[file_c * 2] = bcf_gt_unphased(1); gts[file_c * 2 + 1] = bcf_gt_unphased(1); } else if (glBest==1) { gts[file_c * 2] = bcf_gt_unphased(0); gts[file_c * 2 + 1] = bcf_gt_unphased(1); } else { gts[file_c * 2] = bcf_gt_unphased(0); gts[file_c * 2 + 1] = bcf_gt_unphased(0); } } else { gts[file_c * 2] = bcf_gt_missing; gts[file_c * 2 + 1] = bcf_gt_missing; gqval[file_c] = 0; } gls[file_c * 3 + 2] = (float) gl[0]; gls[file_c * 3 + 1] = (float) gl[1]; gls[file_c * 3] = (float) gl[2]; } template inline int32_t _computeCNLs(TConfig const& c, double const mean, double const sd, float* gl, int32_t* gqval, int32_t const file_c) { // Compute copy-number likelihoods boost::math::normal s(mean, sd); for(uint32_t geno=0; geno< MAX_CN; ++geno) { double prob = boost::math::pdf(s, geno); gl[file_c * MAX_CN + geno] = std::log10(prob); gl[file_c * MAX_CN + geno] = (gl[file_c * MAX_CN + geno] > SMALLEST_GL) ? 
gl[file_c * MAX_CN + geno] : SMALLEST_GL; } uint32_t glBest=file_c * MAX_CN + 0; uint32_t glBest2nd=file_c * MAX_CN + 1; if (gl[glBest] < gl[glBest2nd]) { glBest = file_c * MAX_CN + 1; glBest2nd = file_c * MAX_CN + 0; } for(uint32_t geno=2; geno < MAX_CN; ++geno) { if (gl[file_c * MAX_CN + geno] > gl[glBest2nd]) { if (gl[file_c * MAX_CN + geno] > gl[glBest]) { glBest2nd = glBest; glBest = file_c * MAX_CN + geno; } else { glBest2nd = file_c * MAX_CN + geno; } } } // Variant quality double glObs = std::log10(boost::math::pdf(s, mean)); glObs = (glObs > SMALLEST_GL) ? glObs : SMALLEST_GL; uint32_t plVariant = (uint32_t) boost::math::round(-10 * glObs); uint32_t plPloidy = (uint32_t) boost::math::round(-10 * gl[file_c * MAX_CN + c.ploidy]); int32_t varqual = plPloidy - plVariant; // GQ uint32_t plBest = (uint32_t) boost::math::round(-10 * gl[glBest]); uint32_t plBest2nd = (uint32_t) boost::math::round(-10 * gl[glBest2nd]); gqval[file_c] = plBest2nd - plBest; // Rescale by best genotype double glBestVal = gl[glBest]; for(uint32_t geno=0; geno< MAX_CN; ++geno) gl[file_c * MAX_CN + geno] -= glBestVal; // Variant quality return varqual; } template inline int32_t _computeCNLs(TConfig const& c, double const mean, double const sd, float* gl, int32_t* gqval) { return _computeCNLs(c, mean, sd, gl, gqval, 0); } } #endif delly-0.9.1/src/classify.h000066400000000000000000000424601414764127700154320ustar00rootroot00000000000000#ifndef CLASSIFY_H #define CLASSIFY_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "tags.h" #include "version.h" #include "util.h" #include "modvcf.h" namespace torali { struct ClassifyConfig { bool filterForPass; bool hasSampleFile; int32_t minsize; int32_t maxsize; int32_t qual; uint16_t ploidy; float pgerm; float 
maxsd; float cn_offset; std::string filter; std::set tumorSet; std::set controlSet; boost::filesystem::path outfile; boost::filesystem::path samplefile; boost::filesystem::path vcffile; }; template inline int classifyRun(TClassifyConfig const& c) { // Load bcf file htsFile* ifile = hts_open(c.vcffile.string().c_str(), "r"); bcf_hdr_t* hdr = bcf_hdr_read(ifile); // Open output VCF file htsFile *ofile = hts_open(c.outfile.string().c_str(), "wb"); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); if (c.filter == "somatic") { bcf_hdr_remove(hdr_out, BCF_HL_INFO, "SOMATIC"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "PGERM"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "CNDIFF"); bcf_hdr_append(hdr_out, "##INFO="); } else { bcf_hdr_remove(hdr_out, BCF_HL_INFO, "CNSHIFT"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "CNSD"); bcf_hdr_append(hdr_out, "##INFO="); } if (bcf_hdr_write(ofile, hdr_out) != 0) std::cerr << "Error: Failed to write BCF header!" 
<< std::endl; // VCF fields int32_t nsvend = 0; int32_t* svend = NULL; int32_t nsvt = 0; char* svt = NULL; int ngqval = 0; int32_t* gqval = NULL; int ncnval = 0; int32_t* cnval = NULL; int ncnl = 0; float* cnl = NULL; int nrdcn = 0; float* rdcn = NULL; int nrdsd = 0; float* rdsd = NULL; bool germline = false; if (c.filter == "germline") germline = true; // Parse BCF boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Filtering VCF/BCF file" << std::endl; bcf1_t* rec = bcf_init1(); while (bcf_read(ifile, hdr, rec) == 0) { bcf_unpack(rec, BCF_UN_INFO); // Check SV type bcf_get_info_string(hdr, rec, "SVTYPE", &svt, &nsvt); if (std::string(svt) != "CNV") continue; // Check PASS bool pass = true; if (c.filterForPass) pass = (bcf_has_filter(hdr, rec, const_cast("PASS"))==1); if (!pass) continue; // Check size int32_t svStart= rec->pos - 1; bcf_get_info_int32(hdr, rec, "END", &svend, &nsvend); int32_t svEnd = *svend; if (svStart > svEnd) continue; int32_t svlen = svEnd - svStart; if ((svlen < c.minsize) || (svlen > c.maxsize)) continue; // Check copy-number bcf_unpack(rec, BCF_UN_ALL); bcf_get_format_int32(hdr, rec, "GQ", &gqval, &ngqval); bcf_get_format_int32(hdr, rec, "CN", &cnval, &ncnval); bcf_get_format_float(hdr, rec, "CNL", &cnl, &ncnl); bcf_get_format_float(hdr, rec, "RDCN", &rdcn, &nrdcn); bcf_get_format_float(hdr, rec, "RDSD", &rdsd, &nrdsd); typedef std::pair TCnSd; typedef std::vector TSampleDist; TSampleDist control; TSampleDist tumor; bool invalidCNV = false; for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { if ((!std::isfinite(rdcn[i])) || (rdcn[i] == -1)) { invalidCNV = true; break; } if ((germline) || (c.controlSet.find(hdr->samples[i]) != c.controlSet.end())) { // Control or population genomics control.push_back(std::make_pair(rdcn[i], rdsd[i])); } else if ((!germline) && (c.tumorSet.find(hdr->samples[i]) != c.tumorSet.end())) { // Tumor 
tumor.push_back(std::make_pair(rdcn[i], rdsd[i])); } } if (invalidCNV) continue; // Classify if (!germline) { // Somatic mode double bestCnOffset = 0; bool somaticcnv = false; double lowestp = 1; for(uint32_t i = 0; i < tumor.size(); ++i) { bool germcnv = false; double highestprob = 0; double tcnoffset = -1; for(uint32_t k = 0; k < control.size(); ++k) { boost::math::normal s1(control[k].first, control[k].second); double prob1 = boost::math::pdf(s1, tumor[i].first); boost::math::normal s2(tumor[i].first, tumor[i].second); double prob2 = boost::math::pdf(s2, control[k].first); double prob = std::max(prob1, prob2); if (prob > c.pgerm) germcnv = true; else { // Among all controls, take highest p-value (most likely germline CNV) if (prob > highestprob) highestprob = prob; } double cndiff = std::abs(tumor[i].first - control[k].first); if (cndiff < c.cn_offset) germcnv = true; else { // Among all controls, take smallest CN difference if ((tcnoffset == -1) || (cndiff < tcnoffset)) tcnoffset = cndiff; } } // Among all tumors take best CN difference and lowest p-value if (!germcnv) { somaticcnv = true; if ((highestprob < lowestp) && (tcnoffset > bestCnOffset)) { lowestp = highestprob; bestCnOffset = tcnoffset; } } } if (!somaticcnv) continue; _remove_info_tag(hdr_out, rec, "SOMATIC"); bcf_update_info_flag(hdr_out, rec, "SOMATIC", NULL, 1); float pgerm = (float) lowestp; _remove_info_tag(hdr_out, rec, "PGERM"); bcf_update_info_float(hdr_out, rec, "PGERM", &pgerm, 1); float cndiv = (float) bestCnOffset; _remove_info_tag(hdr_out, rec, "CNDIFF"); bcf_update_info_float(hdr_out, rec, "CNDIFF", &cndiv, 1); } else { // Correct CN shift int32_t cnmain = 0; { std::vector cncount(MAX_CN, 0); { bool validsite = true; boost::accumulators::accumulator_set > acc; for(uint32_t k = 0; k < control.size(); ++k) { if ((boost::math::isinf(control[k].first)) || (boost::math::isnan(control[k].first))) validsite = false; else acc(boost::math::round(control[k].first) - control[k].first); } if 
(!validsite) continue; double cnshift = boost::accumulators::mean(acc); float cnshiftval = cnshift; _remove_info_tag(hdr_out, rec, "CNSHIFT"); bcf_update_info_float(hdr_out, rec, "CNSHIFT", &cnshiftval, 1); for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { rdcn[i] += cnshift; cnval[i] = boost::math::round(rdcn[i]); if ((cnval[i] >= 0) && (cnval[i] < MAX_CN)) ++cncount[cnval[i]]; } } // Find max CN for(uint32_t i = 1; i < MAX_CN; ++i) { if (cncount[i] > cncount[cnmain]) cnmain = i; } } // Calculate SD boost::accumulators::accumulator_set > accLocal; for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { if (cnval[i] == cnmain) accLocal(rdcn[i]); } double sd = sqrt(boost::accumulators::variance(accLocal)); if (sd < 0.025) sd = 0.025; float cnsdval = sd; _remove_info_tag(hdr_out, rec, "CNSD"); bcf_update_info_float(hdr_out, rec, "CNSD", &cnsdval, 1); if (cnsdval > c.maxsd) continue; // Re-compute CNLs std::vector ftarr(bcf_hdr_nsamples(hdr)); int32_t altqual = 0; int32_t altcount = 0; for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { int32_t qval = _computeCNLs(c, rdcn[i], sd, cnl, gqval, i); if (cnval[i] != c.ploidy) { altqual += qval; ++altcount; } if (gqval[i] < 15) ftarr[i] = "LowQual"; else ftarr[i] = "PASS"; } if (altcount == 0) continue; altqual /= altcount; if (altqual < c.qual) continue; if (altqual > 10000) altqual = 10000; // Update QUAL and FILTER rec->qual = altqual; int32_t tmpi = bcf_hdr_id2int(hdr_out, BCF_DT_ID, "PASS"); if (rec->qual < 15) tmpi = bcf_hdr_id2int(hdr_out, BCF_DT_ID, "LowQual"); bcf_update_filter(hdr_out, rec, &tmpi, 1); // Update GT fields std::vector strp(bcf_hdr_nsamples(hdr)); std::transform(ftarr.begin(), ftarr.end(), strp.begin(), cstyle_str()); bcf_update_format_int32(hdr_out, rec, "CN", cnval, bcf_hdr_nsamples(hdr)); bcf_update_format_float(hdr_out, rec, "CNL", cnl, bcf_hdr_nsamples(hdr) * MAX_CN); bcf_update_format_int32(hdr_out, rec, "GQ", gqval, bcf_hdr_nsamples(hdr)); bcf_update_format_string(hdr_out, rec, "FT", &strp[0], 
bcf_hdr_nsamples(hdr)); bcf_update_format_float(hdr_out, rec, "RDCN", rdcn, bcf_hdr_nsamples(hdr)); } bcf_write1(ofile, hdr_out, rec); } bcf_destroy(rec); // Clean-up if (svend != NULL) free(svend); if (svt != NULL) free(svt); if (gqval != NULL) free(gqval); if (cnval != NULL) free(cnval); if (cnl != NULL) free(cnl); if (rdcn != NULL) free(rdcn); if (rdsd != NULL) free(rdsd); // Close output VCF bcf_hdr_destroy(hdr_out); hts_close(ofile); // Build index bcf_index_build(c.outfile.string().c_str(), 14); // Close VCF bcf_hdr_destroy(hdr); bcf_close(ifile); // End now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl; return 0; } int classify(int argc, char **argv) { ClassifyConfig c; // Define generic options boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") ("filter,f", boost::program_options::value(&c.filter)->default_value("somatic"), "Filter mode (somatic, germline)") ("outfile,o", boost::program_options::value(&c.outfile)->default_value("cnv.bcf"), "Filtered CNV BCF output file") ("minsize,m", boost::program_options::value(&c.minsize)->default_value(1000), "min. CNV size") ("maxsize,n", boost::program_options::value(&c.maxsize)->default_value(500000000), "max. CNV size") ("pass,p", "Filter sites for PASS") ; // Define somatic options boost::program_options::options_description somatic("Somatic options"); somatic.add_options() ("samples,s", boost::program_options::value(&c.samplefile), "Two-column sample file listing sample name and tumor or control") ("pgerm,e", boost::program_options::value(&c.pgerm)->default_value(0.001), "probability germline") ("cn-offset,t", boost::program_options::value(&c.cn_offset)->default_value(0.2), "min. 
CN offset") ; // Define germline options boost::program_options::options_description germline("Germline options"); germline.add_options() ("ploidy,y", boost::program_options::value(&c.ploidy)->default_value(2), "baseline ploidy") ("qual,q", boost::program_options::value(&c.qual)->default_value(50), "min. site quality") ("maxsd,x", boost::program_options::value(&c.maxsd)->default_value(0.15), "max. population SD") ; // Define hidden options boost::program_options::options_description hidden("Hidden options"); hidden.add_options() ("input-file", boost::program_options::value(&c.vcffile), "input file") ; boost::program_options::positional_options_description pos_args; pos_args.add("input-file", -1); // Set the visibility boost::program_options::options_description cmdline_options; cmdline_options.add(generic).add(somatic).add(germline).add(hidden); boost::program_options::options_description visible_options; visible_options.add(generic).add(somatic).add(germline); boost::program_options::variables_map vm; boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); boost::program_options::notify(vm); // Check command line arguments if ((vm.count("help")) || (!vm.count("input-file"))) { std::cout << std::endl; std::cout << "Usage: delly " << argv[0] << " [OPTIONS] " << std::endl; std::cout << visible_options << "\n"; return 0; } // Filter for PASS if (vm.count("pass")) c.filterForPass = true; else c.filterForPass = false; // Check sample file std::set tSet; std::set cSet; if (c.filter == "somatic") { c.hasSampleFile = true; if (!(boost::filesystem::exists(c.samplefile) && boost::filesystem::is_regular_file(c.samplefile) && boost::filesystem::file_size(c.samplefile))) { std::cerr << "Sample file is missing " << c.samplefile.string() << std::endl; return 1; } else { // Get samples std::ifstream sampleFile(c.samplefile.string().c_str(), std::ifstream::in); if (sampleFile.is_open()) { while 
(sampleFile.good()) { std::string sampleFromFile; getline(sampleFile, sampleFromFile); typedef boost::tokenizer< boost::char_separator > Tokenizer; boost::char_separator sep(",\t "); Tokenizer tokens(sampleFromFile, sep); Tokenizer::iterator tokIter = tokens.begin(); if (tokIter != tokens.end()) { std::string sample = *tokIter++; if (tokIter != tokens.end()) { std::string type = *tokIter; if (type == "control") cSet.insert(sample); else if (type == "tumor") tSet.insert(sample); else { std::cerr << "Sample type for " << sample << " is neither tumor nor control" << std::endl; return 1; } } } } sampleFile.close(); } if (tSet.empty()) { std::cerr << "No tumor samples specified." << std::endl; return 1; } if (cSet.empty()) { std::cerr << "No control samples specified." << std::endl; return 1; } std::vector intersection; std::set_intersection(cSet.begin(), cSet.end(), tSet.begin(), tSet.end(), std::back_inserter(intersection)); if (!intersection.empty()) { std::cerr << "Sample " << intersection[0] << " is both a tumor and control sample." 
<< std::endl; return 1; } } } else c.hasSampleFile = false; // Check input VCF file if (vm.count("input-file")) { if (!(boost::filesystem::exists(c.vcffile) && boost::filesystem::is_regular_file(c.vcffile) && boost::filesystem::file_size(c.vcffile))) { std::cerr << "Input VCF/BCF file is missing: " << c.vcffile.string() << std::endl; return 1; } htsFile* ifile = bcf_open(c.vcffile.string().c_str(), "r"); if (ifile == NULL) { std::cerr << "Fail to open file " << c.vcffile.string() << std::endl; return 1; } hts_idx_t* bcfidx = NULL; tbx_t* tbx = NULL; if (hts_get_format(ifile)->format==vcf) tbx = tbx_index_load(c.vcffile.string().c_str()); else bcfidx = bcf_index_load(c.vcffile.string().c_str()); if ((bcfidx == NULL) && (tbx == NULL)) { std::cerr << "Fail to open index file for " << c.vcffile.string() << std::endl; return 1; } bcf_hdr_t* hdr = bcf_hdr_read(ifile); if (hdr == NULL) { std::cerr << "Fail to header for " << c.vcffile.string() << std::endl; return 1; } if (!(bcf_hdr_nsamples(hdr)>0)) { std::cerr << "BCF/VCF file has no sample genotypes!" << std::endl; return 1; } // Check sample names if (c.filter == "somatic") { for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { if (tSet.find(hdr->samples[i]) != tSet.end()) c.tumorSet.insert(hdr->samples[i]); else if (cSet.find(hdr->samples[i]) != cSet.end()) c.controlSet.insert(hdr->samples[i]); else std::cerr << "Warning: Sample " << hdr->samples[i] << " is missing in sample file." << std::endl; } if (c.tumorSet.empty()) { std::cerr << "No tumor samples specified." << std::endl; return 1; } if (c.controlSet.empty()) { std::cerr << "No control samples specified." 
<< std::endl; return 1; } } bcf_hdr_destroy(hdr); if (bcfidx) hts_idx_destroy(bcfidx); if (tbx) tbx_destroy(tbx); bcf_close(ifile); } // Show cmd boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] "; std::cout << "delly "; for(int i=0; i #include #include #include #include #include #include #include #include #include #include "util.h" #include "junction.h" namespace torali { // Reduced bam alignment record data structure struct BamAlignRecord { int32_t tid; int32_t pos; int32_t mtid; int32_t mpos; int32_t alen; int32_t malen; int32_t Median; int32_t Mad; int32_t maxNormalISize; uint32_t flag; uint8_t MapQuality; BamAlignRecord(bam1_t* rec, uint8_t pairQuality, uint16_t a, uint16_t ma, int32_t median, int32_t mad, int32_t maxISize) : tid(rec->core.tid), pos(rec->core.pos), mtid(rec->core.mtid), mpos(rec->core.mpos), alen(a), malen(ma), Median(median), Mad(mad), maxNormalISize(maxISize), flag(rec->core.flag), MapQuality(pairQuality) {} }; // Sort reduced bam alignment records template struct SortBamRecords : public std::binary_function { inline bool operator()(TRecord const& s1, TRecord const& s2) const { if (s1.tid==s1.mtid) { return ((std::min(s1.pos, s1.mpos) < std::min(s2.pos, s2.mpos)) || ((std::min(s1.pos, s1.mpos) == std::min(s2.pos, s2.mpos)) && (std::max(s1.pos, s1.mpos) < std::max(s2.pos, s2.mpos))) || ((std::min(s1.pos, s1.mpos) == std::min(s2.pos, s2.mpos)) && (std::max(s1.pos, s1.mpos) == std::max(s2.pos, s2.mpos)) && (s1.maxNormalISize < s2.maxNormalISize))); } else { return ((s1.pos < s2.pos) || ((s1.pos == s2.pos) && (s1.mpos < s2.mpos)) || ((s1.pos == s2.pos) && (s1.mpos == s2.mpos) && (s1.maxNormalISize < s2.maxNormalISize))); } } }; // Edge struct template struct EdgeRecord { typedef TVertex TVertexType; TVertex source; TVertex target; TWeight weight; EdgeRecord(TVertex s, TVertex t, TWeight w) : source(s), target(t), weight(w) {} }; // Sort edge records 
template struct SortEdgeRecords : public std::binary_function { inline bool operator()(TRecord const& e1, TRecord const& e2) const { return ((e1.weight < e2.weight) || ((e1.weight == e2.weight) && (e1.source < e2.source)) || ((e1.weight == e2.weight) && (e1.source == e2.source) && (e1.target < e2.target))); } }; // Initialize clique, deletions template inline void _initClique(TBamRecord const& el, TSize& svStart, TSize& svEnd, TSize& wiggle, int32_t const svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); if (ct%2==0) { svStart = el.pos + el.alen; if (ct>=2) svEnd = el.mpos; else svEnd = el.mpos + el.malen; } else { svStart = el.pos; if (ct>=2) svEnd = el.mpos + el.malen; else svEnd = el.mpos; } wiggle=el.maxNormalISize; } else { if (svt == 0) { svStart = el.mpos + el.malen; svEnd = el.pos + el.alen; wiggle = el.maxNormalISize - std::max(el.alen, el.malen); } else if (svt == 1) { svStart = el.mpos; svEnd = el.pos; wiggle = el.maxNormalISize - std::max(el.alen, el.malen); } else if (svt == 2) { svStart = el.mpos + el.malen; svEnd = el.pos; wiggle = -el.maxNormalISize; } else if (svt == 3) { svStart = el.mpos; svEnd = el.pos + el.alen; wiggle = el.maxNormalISize; } } } // Update clique, deletions template inline bool _updateClique(TBamRecord const& el, TSize& svStart, TSize& svEnd, TSize& wiggle, int32_t const svt) { if (_translocation(svt)) { int ct = _getSpanOrientation(svt); TSize newSvStart; TSize newSvEnd; TSize newWiggle = wiggle; if (ct%2==0) { newSvStart = std::max(svStart, el.pos + el.alen); newWiggle -= (newSvStart - svStart); if (ct>=2) { newSvEnd = std::min(svEnd, el.mpos); newWiggle -= (svEnd - newSvEnd); } else { newSvEnd = std::max(svEnd, el.mpos + el.malen); newWiggle -= (newSvEnd - svEnd); } } else { newSvStart = std::min(svStart, el.pos); newWiggle -= (svStart - newSvStart); if (ct>=2) { newSvEnd = std::max(svEnd, el.mpos + el.malen); newWiggle -= (newSvEnd - svEnd); } else { newSvEnd = std::min(svEnd, el.mpos); newWiggle -= 
(svEnd - newSvEnd); } } // Is this still a valid translocation cluster? if (newWiggle>0) { svStart = newSvStart; svEnd = newSvEnd; wiggle = newWiggle; return true; } return false; } else { if ((svt == 0) || (svt == 1)) { int ct = _getSpanOrientation(svt); TSize newSvStart; TSize newSvEnd; TSize newWiggle; TSize wiggleChange; if (!ct) { newSvStart = std::max(svStart, el.mpos + el.malen); newSvEnd = std::max(svEnd, el.pos + el.alen); newWiggle = std::min(el.maxNormalISize - (newSvStart - el.mpos), el.maxNormalISize - (newSvEnd - el.pos)); wiggleChange = wiggle - std::max(newSvStart - svStart, newSvEnd - svEnd); } else { newSvStart = std::min(svStart, el.mpos); newSvEnd = std::min(svEnd, el.pos); newWiggle = std::min(el.maxNormalISize - (el.mpos + el.malen - newSvStart), el.maxNormalISize - (el.pos + el.alen - newSvEnd)); wiggleChange = wiggle - std::max(svStart - newSvStart, svEnd - newSvEnd); } if (wiggleChange < newWiggle) newWiggle=wiggleChange; // Does the new inversion size agree with all pairs if ((newSvStart < newSvEnd) && (newWiggle>=0)) { svStart = newSvStart; svEnd = newSvEnd; wiggle = newWiggle; return true; } return false; } else if (svt == 2) { TSize newSvStart = std::max(svStart, el.mpos + el.malen); TSize newSvEnd = std::min(svEnd, el.pos); TSize newWiggle = el.pos + el.alen - el.mpos - el.maxNormalISize - (newSvEnd - newSvStart); TSize wiggleChange = wiggle + (svEnd-svStart) - (newSvEnd - newSvStart); if (wiggleChange > newWiggle) newWiggle=wiggleChange; // Does the new deletion size agree with all pairs if ((newSvStart < newSvEnd) && (newWiggle<=0)) { svStart = newSvStart; svEnd = newSvEnd; wiggle = newWiggle; return true; } return false; } else if (svt == 3) { TSize newSvStart = std::min(svStart, el.mpos); TSize newSvEnd = std::max(svEnd, el.pos + el.alen); TSize newWiggle = el.pos - (el.mpos + el.malen) + el.maxNormalISize - (newSvEnd - newSvStart); TSize wiggleChange = wiggle - ((newSvEnd - newSvStart) - (svEnd-svStart)); if (wiggleChange < 
newWiggle) newWiggle = wiggleChange; // Does the new duplication size agree with all pairs if ((newSvStart < newSvEnd) && (newWiggle>=0)) { svStart = newSvStart; svEnd = newSvEnd; wiggle = newWiggle; return true; } return false; } } return false; } template inline void _searchCliques(TConfig const& c, TCompEdgeList& compEdge, std::vector& br, std::vector& sv, uint32_t const wiggle, int32_t const svt) { typedef typename TCompEdgeList::mapped_type TEdgeList; typedef typename TEdgeList::value_type TEdgeRecord; typedef typename TEdgeRecord::TVertexType TVertex; // Iterate all components for(typename TCompEdgeList::iterator compIt = compEdge.begin(); compIt != compEdge.end(); ++compIt) { // Sort edges by weight std::sort(compIt->second.begin(), compIt->second.end(), SortEdgeRecords()); // Find a large clique typename TEdgeList::const_iterator itWEdge = compIt->second.begin(); typename TEdgeList::const_iterator itWEdgeEnd = compIt->second.end(); typedef std::set TCliqueMembers; typedef std::set TSeeds; TCliqueMembers clique; TCliqueMembers incompatible; TSeeds seeds; // Initialize clique clique.insert(itWEdge->source); seeds.insert(br[itWEdge->source].id); int32_t chr = br[itWEdge->source].chr; int32_t chr2 = br[itWEdge->source].chr2; int32_t ciposlow = br[itWEdge->source].pos; uint64_t pos = br[itWEdge->source].pos; int32_t ciposhigh = br[itWEdge->source].pos; int32_t ciendlow = br[itWEdge->source].pos2; uint64_t pos2 = br[itWEdge->source].pos2; int32_t ciendhigh = br[itWEdge->source].pos2; int32_t mapq = br[itWEdge->source].qual; int32_t inslen = br[itWEdge->source].inslen; // Grow clique bool cliqueGrow = true; while (cliqueGrow) { itWEdge = compIt->second.begin(); cliqueGrow = false; // Find next best edge for extension for(;(!cliqueGrow) && (itWEdge != itWEdgeEnd);++itWEdge) { TVertex v; if ((clique.find(itWEdge->source) == clique.end()) && (clique.find(itWEdge->target) != clique.end())) v = itWEdge->source; else if ((clique.find(itWEdge->source) != clique.end()) && 
(clique.find(itWEdge->target) == clique.end())) v = itWEdge->target; else continue; if (incompatible.find(v) != incompatible.end()) continue; if (seeds.find(br[v].id) != seeds.end()) continue; // Try to update clique with this vertex int32_t newCiPosLow = std::min(br[v].pos, ciposlow); int32_t newCiPosHigh = std::max(br[v].pos, ciposhigh); int32_t newCiEndLow = std::min(br[v].pos2, ciendlow); int32_t newCiEndHigh = std::max(br[v].pos2, ciendhigh); if (((newCiPosHigh - newCiPosLow) < (int32_t) wiggle) && ((newCiEndHigh - newCiEndLow) < (int32_t) wiggle)) cliqueGrow = true; if (cliqueGrow) { // Accept new vertex clique.insert(v); seeds.insert(br[v].id); ciposlow = newCiPosLow; pos += br[v].pos; ciposhigh = newCiPosHigh; ciendlow = newCiEndLow; pos2 += br[v].pos2; ciendhigh = newCiEndHigh; mapq += br[v].qual; inslen += br[v].inslen; } else incompatible.insert(v); } } // Enough split reads? if (clique.size() >= c.minCliqueSize) { int32_t svStart = (int32_t) (pos / (uint64_t) clique.size()); int32_t svEnd = (int32_t) (pos2 / (uint64_t) clique.size()); int32_t svInsLen = (int32_t) (inslen / (int32_t) clique.size()); if (_svSizeCheck(svStart, svEnd, svt, svInsLen)) { if ((ciposlow > svStart) || (ciposhigh < svStart) || (ciendlow > svEnd) || (ciendhigh < svEnd)) { std::cerr << "Warning: Confidence intervals out of bounds: " << ciposlow << ',' << svStart << ',' << ciposhigh << ':' << ciendlow << ',' << svEnd << ',' << ciendhigh << std::endl; } int32_t svid = sv.size(); sv.push_back(StructuralVariantRecord(chr, svStart, chr2, svEnd, (ciposlow - svStart), (ciposhigh - svStart), (ciendlow - svEnd), (ciendhigh - svEnd), clique.size(), mapq / clique.size(), mapq, svInsLen, svt, svid)); // Reads assigned for(typename TCliqueMembers::iterator itC = clique.begin(); itC != clique.end(); ++itC) { //std::cerr << svid << ',' << br[*itC].id << std::endl; br[*itC].svid = svid; } } } } } template inline void cluster(TConfig const& c, std::vector& br, std::vector& sv, uint32_t const 
varisize, int32_t const svt) { uint32_t count = 0; for(int32_t refIdx = 0; refIdx < c.nchr; ++refIdx) { // Components typedef std::vector TComponent; TComponent comp; comp.resize(br.size(), 0); uint32_t numComp = 0; // Edge lists for each component typedef uint32_t TWeightType; typedef uint32_t TVertex; typedef EdgeRecord TEdgeRecord; typedef std::vector TEdgeList; typedef std::map TCompEdgeList; TCompEdgeList compEdge; std::size_t lastConnectedNode = 0; std::size_t lastConnectedNodeStart = 0; for(uint32_t i = 0; i lastConnectedNode) { // Clean edge lists if (!compEdge.empty()) { // Search cliques _searchCliques(c, compEdge, br, sv, varisize, svt); lastConnectedNodeStart = lastConnectedNode; compEdge.clear(); } } for(uint32_t j = i + 1; j varisize) break; if ((svt == 4) && (std::abs(br[j].inslen - br[i].inslen) > varisize)) continue; if ( (uint32_t) std::abs(br[j].pos2 - br[i].pos2) < varisize) { // Update last connected node if (j > lastConnectedNode) lastConnectedNode = j; // Assign components uint32_t compIndex = 0; if (!comp[i]) { if (!comp[j]) { // Both vertices have no component compIndex = ++numComp; comp[i] = compIndex; comp[j] = compIndex; compEdge.insert(std::make_pair(compIndex, TEdgeList())); } else { compIndex = comp[j]; comp[i] = compIndex; } } else { if (!comp[j]) { compIndex = comp[i]; comp[j] = compIndex; } else { // Both vertices have a component if (comp[j] == comp[i]) { compIndex = comp[j]; } else { // Merge components compIndex = comp[i]; uint32_t otherIndex = comp[j]; if (otherIndex < compIndex) { compIndex = comp[j]; otherIndex = comp[i]; } // Re-label other index for(uint32_t k = lastConnectedNodeStart; k <= lastConnectedNode; ++k) { if (otherIndex == comp[k]) comp[k] = compIndex; } // Merge edge lists TCompEdgeList::iterator compEdgeIt = compEdge.find(compIndex); TCompEdgeList::iterator compEdgeOtherIt = compEdge.find(otherIndex); compEdgeIt->second.insert(compEdgeIt->second.end(), compEdgeOtherIt->second.begin(), 
compEdgeOtherIt->second.end()); compEdge.erase(compEdgeOtherIt); } } } // Append new edge TCompEdgeList::iterator compEdgeIt = compEdge.find(compIndex); if (compEdgeIt->second.size() < c.graphPruning) { // Breakpoint distance TWeightType weight = std::abs(br[j].pos2 - br[i].pos2) + std::abs(br[j].pos - br[i].pos); compEdgeIt->second.push_back(TEdgeRecord(i, j, weight)); } } } } } } // Search cliques if (!compEdge.empty()) { _searchCliques(c, compEdge, br, sv, varisize, svt); compEdge.clear(); } } } template inline void _searchCliques(TConfig const& c, TCompEdgeList& compEdge, TBamRecord const& bamRecord, TSVs& svs, int32_t const svt) { typedef typename TCompEdgeList::mapped_type TEdgeList; typedef typename TEdgeList::value_type TEdgeRecord; // Iterate all components for(typename TCompEdgeList::iterator compIt = compEdge.begin(); compIt != compEdge.end(); ++compIt) { // Sort edges by weight std::sort(compIt->second.begin(), compIt->second.end(), SortEdgeRecords()); // Find a large clique typename TEdgeList::const_iterator itWEdge = compIt->second.begin(); typename TEdgeList::const_iterator itWEdgeEnd = compIt->second.end(); typedef std::set TCliqueMembers; TCliqueMembers clique; TCliqueMembers incompatible; int32_t svStart = -1; int32_t svEnd = -1; int32_t wiggle = 0; int32_t clusterRefID=bamRecord[itWEdge->source].tid; int32_t clusterMateRefID=bamRecord[itWEdge->source].mtid; _initClique(bamRecord[itWEdge->source], svStart, svEnd, wiggle, svt); if ((clusterRefID==clusterMateRefID) && (svStart >= svEnd)) continue; clique.insert(itWEdge->source); // Grow the clique from the seeding edge bool cliqueGrow=true; while (cliqueGrow) { itWEdge = compIt->second.begin(); cliqueGrow = false; for(;(!cliqueGrow) && (itWEdge != itWEdgeEnd);++itWEdge) { std::size_t v; if ((clique.find(itWEdge->source) == clique.end()) && (clique.find(itWEdge->target) != clique.end())) v = itWEdge->source; else if ((clique.find(itWEdge->source) != clique.end()) && (clique.find(itWEdge->target) == 
clique.end())) v = itWEdge->target; else continue; if (incompatible.find(v) != incompatible.end()) continue; cliqueGrow = _updateClique(bamRecord[v], svStart, svEnd, wiggle, svt); if (cliqueGrow) clique.insert(v); else incompatible.insert(v); } } // Enough paired-ends if ((clique.size() >= c.minCliqueSize) && (_svSizeCheck(svStart, svEnd, svt))) { StructuralVariantRecord svRec; svRec.chr = clusterRefID; svRec.chr2 = clusterMateRefID; svRec.svStart = (uint32_t) svStart + 1; svRec.svEnd = (uint32_t) svEnd + 1; svRec.peSupport = clique.size(); int32_t ci_wiggle = std::max(abs(wiggle), 50); svRec.ciposlow = -ci_wiggle; svRec.ciposhigh = ci_wiggle; svRec.ciendlow = -ci_wiggle; svRec.ciendhigh = ci_wiggle; svRec.mapq = 0; std::vector mapQV; for(typename TCliqueMembers::const_iterator itC = clique.begin(); itC!=clique.end(); ++itC) { mapQV.push_back(bamRecord[*itC].MapQuality); svRec.mapq += bamRecord[*itC].MapQuality; } std::sort(mapQV.begin(), mapQV.end()); svRec.peMapQuality = mapQV[mapQV.size()/2]; svRec.srSupport=0; svRec.srAlignQuality=0; svRec.precise=false; svRec.svt = svt; svRec.insLen = 0; svRec.homLen = 0; svs.push_back(svRec); } } } template inline void cluster(TConfig const& c, std::vector& bamRecord, std::vector& svs, uint32_t const varisize, int32_t const svt) { typedef typename std::vector TBamRecord; // Components typedef std::vector TComponent; TComponent comp; comp.resize(bamRecord.size(), 0); uint32_t numComp = 0; // Edge lists for each component typedef uint32_t TWeightType; typedef uint32_t TVertex; typedef EdgeRecord TEdgeRecord; typedef std::vector TEdgeList; typedef std::map TCompEdgeList; TCompEdgeList compEdge; // Iterate the chromosome range std::size_t lastConnectedNode = 0; std::size_t lastConnectedNodeStart = 0; std::size_t bamItIndex = 0; for(TBamRecord::const_iterator bamIt = bamRecord.begin(); bamIt != bamRecord.end(); ++bamIt, ++bamItIndex) { // Safe to clean the graph? 
if (bamItIndex > lastConnectedNode) { // Clean edge lists if (!compEdge.empty()) { _searchCliques(c, compEdge, bamRecord, svs, svt); lastConnectedNodeStart = lastConnectedNode; compEdge.clear(); } } int32_t const minCoord = _minCoord(bamIt->pos, bamIt->mpos, svt); int32_t const maxCoord = _maxCoord(bamIt->pos, bamIt->mpos, svt); TBamRecord::const_iterator bamItNext = bamIt; ++bamItNext; std::size_t bamItIndexNext = bamItIndex + 1; for(; ((bamItNext != bamRecord.end()) && ((uint32_t) std::abs(_minCoord(bamItNext->pos, bamItNext->mpos, svt) + bamItNext->alen - minCoord) <= varisize)) ; ++bamItNext, ++bamItIndexNext) { // Check that mate chr agree (only for translocations) if (bamIt->mtid != bamItNext->mtid) continue; // Check combinability of pairs if (_pairsDisagree(minCoord, maxCoord, bamIt->alen, bamIt->maxNormalISize, _minCoord(bamItNext->pos, bamItNext->mpos, svt), _maxCoord(bamItNext->pos, bamItNext->mpos, svt), bamItNext->alen, bamItNext->maxNormalISize, svt)) continue; // Update last connected node if (bamItIndexNext > lastConnectedNode ) lastConnectedNode = bamItIndexNext; // Assign components uint32_t compIndex = 0; if (!comp[bamItIndex]) { if (!comp[bamItIndexNext]) { // Both vertices have no component compIndex = ++numComp; comp[bamItIndex] = compIndex; comp[bamItIndexNext] = compIndex; compEdge.insert(std::make_pair(compIndex, TEdgeList())); } else { compIndex = comp[bamItIndexNext]; comp[bamItIndex] = compIndex; } } else { if (!comp[bamItIndexNext]) { compIndex = comp[bamItIndex]; comp[bamItIndexNext] = compIndex; } else { // Both vertices have a component if (comp[bamItIndexNext] == comp[bamItIndex]) { compIndex = comp[bamItIndexNext]; } else { // Merge components compIndex = comp[bamItIndex]; uint32_t otherIndex = comp[bamItIndexNext]; if (otherIndex < compIndex) { compIndex = comp[bamItIndexNext]; otherIndex = comp[bamItIndex]; } // Re-label other index for(std::size_t i = lastConnectedNodeStart; i <= lastConnectedNode; ++i) { if (otherIndex == 
comp[i]) comp[i] = compIndex; } // Merge edge lists TCompEdgeList::iterator compEdgeIt = compEdge.find(compIndex); TCompEdgeList::iterator compEdgeOtherIt = compEdge.find(otherIndex); compEdgeIt->second.insert(compEdgeIt->second.end(), compEdgeOtherIt->second.begin(), compEdgeOtherIt->second.end()); compEdge.erase(compEdgeOtherIt); } } } // Append new edge TCompEdgeList::iterator compEdgeIt = compEdge.find(compIndex); if (compEdgeIt->second.size() < c.graphPruning) { TWeightType weight = (TWeightType) ( std::log((double) abs( abs( (_minCoord(bamItNext->pos, bamItNext->mpos, svt) - minCoord) - (_maxCoord(bamItNext->pos, bamItNext->mpos, svt) - maxCoord) ) - abs(bamIt->Median - bamItNext->Median)) + 1) / std::log(2) ); compEdgeIt->second.push_back(TEdgeRecord(bamItIndex, bamItIndexNext, weight)); } } } if (!compEdge.empty()) { _searchCliques(c, compEdge, bamRecord, svs, svt); compEdge.clear(); } } } #endif delly-0.9.1/src/cnv.h000066400000000000000000000657671414764127700144220ustar00rootroot00000000000000#ifndef CNV_H #define CNV_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" namespace torali { struct SVBreakpoint { int32_t pos; int32_t cilow; int32_t cihigh; int32_t qual; explicit SVBreakpoint(int32_t const p) : pos(p), cilow(0), cihigh(0), qual(0) {} SVBreakpoint(int32_t const p, int32_t const cil, int32_t const cih, int32_t q) : pos(p), cilow(cil), cihigh(cih), qual(q) {} }; template struct SortSVBreakpoint : public std::binary_function { inline bool operator()(TSVBp const& sv1, TSVBp const& sv2) { return ((sv1.pos inline void mergeCNVs(TConfig const& c, std::vector& chrcnv, std::vector& cnvs) { // Merge neighboring segments if too similar bool merged = true; std::vector newcnv; while(merged) { int32_t k = -1; for(int32_t i = 0; i < (int32_t) chrcnv.size(); ++i) { if (i <= k) continue; k = i; for(int32_t j = i + 
1; j < (int32_t) chrcnv.size(); ++j) { bool allValid = true; for(int32_t pre = i; pre < j; ++pre) { double diff = std::abs(chrcnv[pre].cn - chrcnv[j].cn); if (diff >= c.cn_offset) { allValid = false; break; } } if (allValid) k = j; else break; } if (k > i) { // Merge double cn = (chrcnv[i].cn + chrcnv[k].cn) / 2.0; double mp = (chrcnv[i].mappable + chrcnv[k].mappable) / 2.0; newcnv.push_back(CNV(chrcnv[i].chr, chrcnv[i].start, chrcnv[k].end, chrcnv[i].ciposlow, chrcnv[i].ciposhigh, chrcnv[k].ciendlow, chrcnv[k].ciendhigh, cn, mp)); } else { newcnv.push_back(chrcnv[i]); } } if (newcnv.size() == chrcnv.size()) merged = false; else { chrcnv = newcnv; newcnv.clear(); } } // Insert into global CNV vector for(uint32_t i = 0; i < chrcnv.size(); ++i) { cnvs.push_back(chrcnv[i]); //std::cerr << chrcnv[i].chr << '\t' << chrcnv[i].start << '\t' << chrcnv[i].end << "\tMerged" << std::endl; } } template inline void breakpointRefinement(TConfig const& c, std::pair const& gcbound, std::vector const& gcContent, std::vector const& uniqContent, TGcBias const& gcbias, TCoverage const& cov, bam_hdr_t const* hdr, int32_t const refIndex, TGenomicBreakpoints const& svbp, std::vector& cnvs) { typedef typename TGenomicBreakpoints::value_type TSVs; // Estimate CN shift for(uint32_t n = 1; n < cnvs.size(); ++n) { if ((cnvs[n-1].chr != refIndex) || (cnvs[n].chr != refIndex)) continue; double precovsum = 0; double preexpcov = 0; double succovsum = 0; double sucexpcov = 0; int32_t pos = cnvs[n-1].start; while((pos < cnvs[n].end) && (pos < (int32_t) hdr->target_len[refIndex])) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { if (pos < cnvs[n-1].end) { precovsum += cov[pos]; preexpcov += gcbias[gcContent[pos]].coverage; } else { succovsum += cov[pos]; sucexpcov += gcbias[gcContent[pos]].coverage; } } ++pos; } double precndiff = std::abs((c.ploidy * precovsum / preexpcov) - (c.ploidy * succovsum / sucexpcov)); // 
Intersect with delly SVs typename TSVs::const_iterator itbest = svbp[refIndex].end(); int32_t searchStart = std::max(0, std::min(cnvs[n-1].ciendlow, cnvs[n-1].end - 1000)); int32_t searchEnd = std::max(cnvs[n].ciposhigh, cnvs[n].start + 1000); int32_t midpoint = (int32_t) ((cnvs[n-1].start + cnvs[n-1].end) / 2); if (searchStart < midpoint) searchStart = midpoint; midpoint = (int32_t) ((cnvs[n].start + cnvs[n].end) / 2); if (searchEnd > midpoint) searchEnd = midpoint; // Current CNV start for this breakpoint typename TSVs::const_iterator itsv = std::lower_bound(svbp[refIndex].begin(), svbp[refIndex].end(), SVBreakpoint(searchStart), SortSVBreakpoint()); for(; itsv != svbp[refIndex].end(); ++itsv) { if (itsv->pos > searchEnd) break; if ((itbest == svbp[refIndex].end()) || (itsv->qual > itbest->qual)) itbest = itsv; } if ((itbest != svbp[refIndex].end()) && (itbest->qual >= 50)) { // Check refined CNV precovsum = 0; preexpcov = 0; succovsum = 0; sucexpcov = 0; pos = cnvs[n-1].start; while((pos < cnvs[n].end) && (pos < (int32_t) hdr->target_len[refIndex])) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { if (pos < itbest->pos) { precovsum += cov[pos]; preexpcov += gcbias[gcContent[pos]].coverage; } else { succovsum += cov[pos]; sucexpcov += gcbias[gcContent[pos]].coverage; } } ++pos; } double postcndiff = std::abs((c.ploidy * precovsum / preexpcov) - (c.ploidy * succovsum / sucexpcov)); //std::cerr << cnvs[n-1].end << ',' << itbest->pos << ',' << precndiff << ',' << postcndiff << std::endl; if ((precndiff < postcndiff + c.cn_offset) && (std::abs(cnvs[n].start - itbest->pos) < 50000)) { // Accept new breakpoint cnvs[n-1].end = itbest->pos; cnvs[n].start = itbest->pos; cnvs[n-1].ciendlow = itbest->pos + itbest->cilow; cnvs[n-1].ciendhigh = itbest->pos + itbest->cihigh; cnvs[n].ciposlow = itbest->pos + itbest->cilow; cnvs[n].ciposhigh = itbest->pos + itbest->cihigh; } } } } template 
inline void breakpointRefinement2(TConfig const& c, std::pair const& gcbound, std::vector const& gcContent, std::vector const& uniqContent, TGcBias const& gcbias, TCoverage const& cov, bam_hdr_t const* hdr, int32_t const refIndex, std::vector& cnvs) { int32_t maxbpshift = 10000; // Breakpoint refinement for(uint32_t n = 1; n < cnvs.size(); ++n) { int32_t prehalf = (cnvs[n-1].start + cnvs[n-1].end) / 2; prehalf = std::max(cnvs[n-1].end - maxbpshift, prehalf); int32_t suchalf = (cnvs[n].start + cnvs[n].end) / 2; suchalf = std::min(cnvs[n].start + maxbpshift, suchalf); double precovsum = 0; double preexpcov = 0; double succovsum = 0; double sucexpcov = 0; int32_t pos = cnvs[n-1].start; std::vector validpos; while((pos < cnvs[n].end) && (pos < (int32_t) hdr->target_len[refIndex])) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { if (pos < prehalf) { precovsum += cov[pos]; preexpcov += gcbias[gcContent[pos]].coverage; } else { if (pos <= suchalf) validpos.push_back(pos); succovsum += cov[pos]; sucexpcov += gcbias[gcContent[pos]].coverage; } } ++pos; } double precn = c.ploidy * precovsum / preexpcov; double succn = c.ploidy * succovsum / sucexpcov; // Shift Bp std::vector diffcn(validpos.size(), 0); for(uint32_t idx = 0; idx < validpos.size(); ++idx) { if ((preexpcov > 0) && (sucexpcov > 0)) { precn = c.ploidy * precovsum / preexpcov; succn = c.ploidy * succovsum / sucexpcov; diffcn[idx] = std::abs(precn - succn); //if (validpos[idx] == cnvs[n-1].end) std::cerr << "-->"; //std::cerr << validpos[idx] << ',' << precn << ',' << succn << ',' << diffcn[idx] << std::endl; } // Add to pre, remove from suc precovsum += cov[validpos[idx]]; preexpcov += gcbias[gcContent[validpos[idx]]].coverage; succovsum -= cov[validpos[idx]]; sucexpcov -= gcbias[gcContent[validpos[idx]]].coverage; } // Find best int32_t bestIdx = -1; for(uint32_t idx = 0; idx < validpos.size(); ++idx) { if ((bestIdx == -1) || 
(diffcn[idx] > diffcn[bestIdx])) bestIdx = idx; } if (bestIdx != -1) { // Update breakpoint cnvs[n-1].end = validpos[bestIdx]; cnvs[n].start = validpos[bestIdx]; } } //for(uint32_t n = 0; n < cnvs.size(); ++n) std::cerr << hdr->target_name[cnvs[n].chr] << '\t' << cnvs[n].start << '\t' << cnvs[n].end << "\tRefinement" << std::endl; } template inline void genotypeCNVs(TConfig const& c, std::pair const& gcbound, std::vector const& gcContent, std::vector const& uniqContent, TGcBias const& gcbias, TCoverage const& cov, bam_hdr_t const* hdr, int32_t const refIndex, std::vector& cnvs) { for(uint32_t n = 0; n < cnvs.size(); ++n) { if (cnvs[n].chr != refIndex) continue; double covsum = 0; double expcov = 0; int32_t winlen = 0; int32_t pos = cnvs[n].start; while((pos < cnvs[n].end) && (pos < (int32_t) hdr->target_len[refIndex])) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { covsum += cov[pos]; expcov += gcbias[gcContent[pos]].coverage; ++winlen; } ++pos; } double cn = c.ploidy; if (expcov > 0) cn = c.ploidy * covsum / expcov; double mp = (double) winlen / (double) (cnvs[n].end - cnvs[n].start); cnvs[n].cn = cn; cnvs[n].mappable = mp; // Estimate SD boost::accumulators::accumulator_set > acc; uint32_t wsz = winlen / 10; if (wsz > 1) { covsum = 0; expcov = 0; winlen = 0; pos = cnvs[n].start; while((pos < cnvs[n].end) && (pos < (int32_t) hdr->target_len[refIndex])) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { covsum += cov[pos]; expcov += gcbias[gcContent[pos]].coverage; ++winlen; if (winlen % wsz == 0) { double cn = c.ploidy; if (expcov > 0) cn = c.ploidy * covsum / expcov; acc(cn); covsum = 0; expcov = 0; } } ++pos; } cnvs[n].sd = sqrt(boost::accumulators::variance(acc)); if (cnvs[n].sd < 0.025) cnvs[n].sd = 0.025; } else { // Invalid cnvs[n].cn = -1; cnvs[n].sd = 0.025; } } } template inline void 
callCNVs(TConfig const& c, std::pair const& gcbound, std::vector const& gcContent, std::vector const& uniqContent, TGcBias const& gcbias, TCoverage const& cov, bam_hdr_t const* hdr, int32_t const refIndex, std::vector& cnvs) { // Parameters int32_t smallestWin = c.minCnvSize / 10; int32_t biggestWin = smallestWin * 200; uint32_t chain = 10; // Find breakpoints std::vector bpmax; if (bpmax.empty()) { // Scanning window sizes std::vector winsize; int32_t wsize = smallestWin; while (wsize < biggestWin) { winsize.push_back(wsize); wsize *= 2; } // Iterate window sizes typedef int32_t TCnVal; typedef std::vector TCN; typedef std::vector TChrPos; std::vector bpvec; for(uint32_t idx = 0; idx < winsize.size(); ++idx) { uint32_t idxOffset = winsize[idx] / winsize[0]; //std::cerr << idx << ',' << winsize[idx] << ',' << idxOffset << ',' << bpvec.size() << ',' << hdr->target_len[refIndex] << std::endl; TCN cnvec; TChrPos wpos; uint32_t wstart = 0; while(wstart < hdr->target_len[refIndex]) { double covsum = 0; double expcov = 0; int32_t winlen = 0; uint32_t pos = wstart; while ((winlen < winsize[idx]) && (pos < hdr->target_len[refIndex])) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { covsum += cov[pos]; expcov += gcbias[gcContent[pos]].coverage; ++winlen; } ++pos; } if (winlen == winsize[idx]) { // Full window if (expcov > 0) cnvec.push_back((int32_t) boost::math::round(c.ploidy * covsum / expcov * 100.0)); else cnvec.push_back((int32_t) boost::math::round(c.ploidy * 100.0)); wpos.push_back(wstart); } wstart = pos; } // Identify breakpoints TCN pre(chain, -1); TCN suc(chain, -1); TChrPos prep(chain, 0); TChrPos sucp(chain, 0); uint32_t idxbp = 0; for(uint32_t k = 0; k < cnvec.size(); ++k) { if (k < chain) { pre[k % chain] = cnvec[k]; prep[k % chain] = wpos[k]; if (k + 1 < cnvec.size()) { if (idx == 0 ) bpvec.push_back(BpCNV(wpos[k], wpos[k+1], 0)); else idxbp += idxOffset; } } else if (k < 
2 * chain) { suc[k % chain] = cnvec[k]; sucp[k % chain] = wpos[k]; } else { // Midpoint TCnVal val = suc[k%chain]; int32_t pos = sucp[k%chain]; int32_t posNext = sucp[(k+1)%chain]; suc[k%chain] = cnvec[k]; sucp[k%chain] = wpos[k]; // Debug //for(uint32_t m = 0; m < pre.size(); ++m) std::cerr << prep[m] << '\t' << pre[m] << std::endl; //std::cerr << "M:" << pos << '\t' << val << std::endl; //for(uint32_t m = 0; m < suc.size(); ++m) std::cerr << sucp[m] << '\t' << suc[m] << std::endl; // Any shift in CN? boost::accumulators::accumulator_set > accpre; boost::accumulators::accumulator_set > accsuc; for(uint32_t m = 0; m < pre.size(); ++m) accpre(pre[m]); for(uint32_t m = 0; m < suc.size(); ++m) accsuc(suc[m]); double diff = std::abs(boost::accumulators::mean(accsuc) - boost::accumulators::mean(accpre)); // Breakpoint candidate double zscore = 0; if ((diff > c.stringency * sqrt(boost::accumulators::variance(accpre))) && (diff > c.stringency * sqrt(boost::accumulators::variance(accsuc)))) { zscore = diff / std::max(sqrt(boost::accumulators::variance(accpre)), sqrt(boost::accumulators::variance(accsuc))); } if (idx == 0) bpvec.push_back(BpCNV(pos, posNext, zscore)); else { for(uint32_t sub = idxbp; sub < idxbp + idxOffset; ++sub) bpvec[sub].zscore += zscore; idxbp += idxOffset; } pre[k%chain] = val; prep[k%chain] = pos; } } } // Local maxima if (bpvec.size()) { int32_t pos = bpvec[0].start; int32_t posNext = bpvec[0].end; double bestDiff = bpvec[0].zscore; for(uint32_t n = 1; n < bpvec.size(); ++n) { //std::cerr << "B:" << bpvec[n].start << '-' << bpvec[n].end << ':' << bpvec[n].zscore << std::endl; if (bpvec[n].zscore == 0) { if (bestDiff != 0) { //std::cerr << "M:" << pos << '-' << posNext << ':' << bestDiff << std::endl; bpmax.push_back(BpCNV(pos, posNext, bestDiff)); pos = bpvec[n].start; posNext = bpvec[n].end; bestDiff = bpvec[n].zscore; } } else { if (bpvec[n].zscore > bestDiff) { // Replace local max pos = bpvec[n].start; posNext = bpvec[n].end; bestDiff = 
bpvec[n].zscore; } else if (bpvec[n].zscore == bestDiff) { // Extend local max posNext = bpvec[n].end; } } } } } // Breakpoints for(uint32_t n = 0; n <= bpmax.size(); ++n) { int32_t cil = 0; int32_t cih = 0; if (n > 0) { cil = bpmax[n-1].start; cih = bpmax[n-1].end; } int32_t cel = hdr->target_len[refIndex] - 1; int32_t ceh = hdr->target_len[refIndex] - 1; if (n < bpmax.size()) { cel = bpmax[n].start; ceh = bpmax[n].end; } //std::cerr << (cih - cil) << ';' << (ceh - cel) << std::endl; int32_t cnvstart = (int32_t) ((cil + cih)/2); int32_t cnvend = (int32_t) ((cel + ceh)/2); int32_t estcnvstart = -1; int32_t estcnvend = -1; double covsum = 0; double expcov = 0; int32_t winlen = 0; int32_t pos = cnvstart; while((pos < cnvend) && (pos < (int32_t) hdr->target_len[refIndex])) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { if (estcnvstart == -1) estcnvstart = pos; estcnvend = pos; covsum += cov[pos]; expcov += gcbias[gcContent[pos]].coverage; ++winlen; } ++pos; } if ((estcnvstart != -1) && (estcnvend != -1) && (estcnvend - estcnvstart > 0)) { double cn = c.ploidy; if (expcov > 0) cn = c.ploidy * covsum / expcov; double mp = (double) winlen / (double) (estcnvend - estcnvstart); cnvs.push_back(CNV(refIndex, estcnvstart, estcnvend, cil, cih, cel, ceh, cn, mp)); //std::cerr << hdr->target_name[refIndex] << '\t' << estcnvstart << '\t' << estcnvend << '\t' << '(' << cil << ',' << cih << ')' << '\t' << '(' << cel << ',' << ceh << ')' << '\t' << cn << '\t' << mp << std::endl; } } } // Parse Delly CNV VCF file template inline void parseVcfCNV(TConfig const& c, bam_hdr_t* hd, std::vector& cnvs) { // Load bcf file htsFile* ifile = bcf_open(c.genofile.string().c_str(), "r"); bcf_hdr_t* hdr = bcf_hdr_read(ifile); bcf1_t* rec = bcf_init(); // Parse bcf int32_t nsvend = 0; int32_t* svend = NULL; int32_t ncipos = 0; int32_t* cipos = NULL; int32_t nmp = 0; float* mp = NULL; int32_t nsvt = 0; char* svt 
= NULL; int32_t nmethod = 0; char* method = NULL; uint16_t wimethod = 0; while (bcf_read(ifile, hdr, rec) == 0) { bcf_unpack(rec, BCF_UN_INFO); // Delly BCF file? if (!wimethod) { wimethod = 2; if (bcf_get_info_string(hdr, rec, "SVMETHOD", &method, &nmethod) > 0) { std::string mstr = std::string(method); if ((mstr.size() >= 10) && (mstr.substr(0, 10) == "EMBL.DELLY")) wimethod = 1; } } // Delly if (wimethod == 1) { // Fill SV record CNV cnv; std::string chrName = bcf_hdr_id2name(hdr, rec->rid); int32_t tid = bam_name2id(hd, chrName.c_str()); cnv.chr = tid; cnv.start = rec->pos - 1; cnv.qval = rec->qual; // Parse CNV type if (bcf_get_info_string(hdr, rec, "SVTYPE", &svt, &nsvt) > 0) { if (std::string(svt) != "CNV") continue; } else continue; // Parse INFO if (bcf_get_info_int32(hdr, rec, "END", &svend, &nsvend) > 0) cnv.end = *svend; else continue; if (bcf_get_info_int32(hdr, rec, "CIPOS", &cipos, &ncipos) > 0) { cnv.ciposlow = cnv.start + cipos[0]; cnv.ciposhigh = cnv.start + cipos[1]; } else { cnv.ciposlow = cnv.start - 50; cnv.ciposhigh = cnv.start + 50; } if (bcf_get_info_int32(hdr, rec, "CIEND", &cipos, &ncipos) > 0) { cnv.ciendlow = cnv.end + cipos[0]; cnv.ciendhigh = cnv.end + cipos[1]; } else { cnv.ciendlow = cnv.end - 50; cnv.ciendhigh = cnv.end + 50; } if (bcf_get_info_float(hdr, rec, "MP", &mp, &nmp) > 0) cnv.mappable = (double) *mp; else cnv.mappable = 0; cnvs.push_back(cnv); } } // Clean-up free(svend); free(svt); free(method); free(cipos); free(mp); // Close VCF bcf_hdr_destroy(hdr); bcf_close(ifile); bcf_destroy(rec); } template inline void cnvVCF(TConfig const& c, std::vector const& cnvs) { // Open one bam file header samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); hts_set_fai_filename(samfile, c.genome.string().c_str()); bam_hdr_t* bamhd = sam_hdr_read(samfile); // Output all copy-number variants htsFile *fp = hts_open(c.cnvfile.string().c_str(), "wb"); bcf_hdr_t *hdr = bcf_hdr_init("w"); // Print vcf header boost::posix_time::ptime 
now = boost::posix_time::second_clock::local_time(); boost::gregorian::date today = now.date(); std::string datestr("##fileDate="); datestr += boost::gregorian::to_iso_string(today); bcf_hdr_append(hdr, datestr.c_str()); bcf_hdr_append(hdr, "##ALT="); bcf_hdr_append(hdr, "##FILTER="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); // Add reference std::string refloc("##reference="); refloc += c.genome.string(); bcf_hdr_append(hdr, refloc.c_str()); for (int i = 0; in_targets; ++i) { std::string refname("##contig=target_name[i]) + ",length=" + boost::lexical_cast(bamhd->target_len[i]) + ">"; bcf_hdr_append(hdr, refname.c_str()); } // Add samples bcf_hdr_add_sample(hdr, c.sampleName.c_str()); bcf_hdr_add_sample(hdr, NULL); if (bcf_hdr_write(fp, hdr) != 0) std::cerr << "Error: Failed to write BCF header!" 
<< std::endl; uint32_t cnvid = 0; if (!cnvs.empty()) { // Genotype arrays int32_t *gts = (int*) malloc(bcf_hdr_nsamples(hdr) * 2 * sizeof(int)); int32_t *gqval = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *cnval = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); float *cnrdval = (float*) malloc(bcf_hdr_nsamples(hdr) * sizeof(float)); float *cnsdval = (float*) malloc(bcf_hdr_nsamples(hdr) * sizeof(float)); float *cnl = (float*) malloc(bcf_hdr_nsamples(hdr) * MAX_CN * sizeof(float)); std::vector ftarr; ftarr.resize(bcf_hdr_nsamples(hdr)); // Iterate all structural variants now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Genotyping" << std::endl; boost::progress_display show_progress( cnvs.size() ); bcf1_t *rec = bcf_init(); for(uint32_t i = 0; i < cnvs.size(); ++i) { ++show_progress; // Invalid CNV? if ((!c.hasGenoFile) && (cnvs[i].cn == -1)) continue; // Integer copy-number int32_t absCN = (int32_t) boost::math::round(cnvs[i].cn); if ((!c.segmentation) && (absCN == c.ploidy)) continue; // Output main vcf fields rec->rid = bcf_hdr_name2id(hdr, bamhd->target_name[cnvs[i].chr]); int32_t svStartPos = cnvs[i].start + 1; int32_t svEndPos = cnvs[i].end; if (svEndPos >= (int32_t) bamhd->target_len[cnvs[i].chr]) svEndPos = bamhd->target_len[cnvs[i].chr] - 1; rec->pos = svStartPos; std::string id("CNV"); std::string padNumber = boost::lexical_cast(++cnvid); padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0'); id += padNumber; bcf_update_id(hdr, rec, id.c_str()); std::string svtype = "CNV"; std::string alleles = "N,<" + svtype + ">"; bcf_update_alleles_str(hdr, rec, alleles.c_str()); // Add INFO fields bcf_update_info_flag(hdr, rec, "IMPRECISE", NULL, 1); bcf_update_info_string(hdr, rec, "SVTYPE", svtype.c_str()); std::string dellyVersion("EMBL.DELLYv"); dellyVersion += dellyVersionNumber; bcf_update_info_string(hdr,rec, "SVMETHOD", dellyVersion.c_str()); int32_t tmpi 
= svEndPos; bcf_update_info_int32(hdr, rec, "END", &tmpi, 1); int32_t ciend[2]; ciend[0] = cnvs[i].ciendlow - cnvs[i].end; ciend[1] = cnvs[i].ciendhigh - cnvs[i].end; int32_t cipos[2]; cipos[0] = cnvs[i].ciposlow - cnvs[i].start; cipos[1] = cnvs[i].ciposhigh - cnvs[i].start; bcf_update_info_int32(hdr, rec, "CIPOS", cipos, 2); bcf_update_info_int32(hdr, rec, "CIEND", ciend, 2); float tmpf = cnvs[i].mappable; bcf_update_info_float(hdr, rec, "MP", &tmpf, 1); // Genotyping cnval[0] = absCN; cnrdval[0] = cnvs[i].cn; cnsdval[0] = cnvs[i].sd; gts[0] = bcf_gt_missing; gts[1] = bcf_gt_missing; int32_t qval = _computeCNLs(c, cnvs[i].cn, cnvs[i].sd, cnl, gqval); if (c.hasGenoFile) rec->qual = cnvs[i].qval; // Leave site quality in genotyping mode else rec->qual = qval; tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS"); if (rec->qual < 15) tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "LowQual"); bcf_update_filter(hdr, rec, &tmpi, 1); if (gqval[0] < 15) ftarr[0] = "LowQual"; else ftarr[0] = "PASS"; std::vector strp(bcf_hdr_nsamples(hdr)); std::transform(ftarr.begin(), ftarr.end(), strp.begin(), cstyle_str()); bcf_update_genotypes(hdr, rec, gts, bcf_hdr_nsamples(hdr) * 2); bcf_update_format_int32(hdr, rec, "CN", cnval, bcf_hdr_nsamples(hdr)); bcf_update_format_float(hdr, rec, "CNL", cnl, bcf_hdr_nsamples(hdr) * MAX_CN); bcf_update_format_int32(hdr, rec, "GQ", gqval, bcf_hdr_nsamples(hdr)); bcf_update_format_string(hdr, rec, "FT", &strp[0], bcf_hdr_nsamples(hdr)); bcf_update_format_float(hdr, rec, "RDCN", cnrdval, bcf_hdr_nsamples(hdr)); bcf_update_format_float(hdr, rec, "RDSD", cnsdval, bcf_hdr_nsamples(hdr)); bcf_write1(fp, hdr, rec); bcf_clear1(rec); } bcf_destroy1(rec); // Clean-up free(gts); free(gqval); free(cnval); free(cnrdval); free(cnsdval); free(cnl); } // Close BAM file bam_hdr_destroy(bamhd); sam_close(samfile); // Close VCF file bcf_hdr_destroy(hdr); hts_close(fp); // Build index bcf_index_build(c.cnvfile.string().c_str(), 14); } } #endif 
delly-0.9.1/src/coral.h000066400000000000000000000711731414764127700147200ustar00rootroot00000000000000#ifndef CORAL_H #define CORAL_H #include #include #include #include #include #include #include #include #include #include #include "bed.h" #include "scan.h" #include "gcbias.h" #include "cnv.h" #include "version.h" namespace torali { struct CountDNAConfig { bool adaptive; bool hasStatsFile; bool hasBedFile; bool hasScanFile; bool noScanWindowSelection; bool segmentation; bool hasGenoFile; bool hasVcfFile; uint32_t nchr; uint32_t meanisize; uint32_t window_size; uint32_t window_offset; uint32_t scanWindow; uint32_t minChrLen; uint32_t minCnvSize; uint16_t minQual; uint16_t mad; uint16_t ploidy; float exclgc; float uniqueToTotalCovRatio; float fracWindow; float fragmentUnique; float controlMaf; float stringency; float cn_offset; std::string sampleName; boost::filesystem::path vcffile; boost::filesystem::path genofile; boost::filesystem::path cnvfile; boost::filesystem::path covfile; boost::filesystem::path genome; boost::filesystem::path statsFile; boost::filesystem::path mapFile; boost::filesystem::path bamFile; boost::filesystem::path bedFile; boost::filesystem::path scanFile; }; struct CountDNAConfigLib { uint16_t madCutoff; uint16_t madNormalCutoff; boost::filesystem::path genome; std::vector files; }; template inline int32_t bamCount(TConfig const& c, LibraryInfo const& li, std::vector const& gcbias, std::pair const& gcbound) { // Load bam file samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); hts_set_fai_filename(samfile, c.genome.string().c_str()); hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str()); bam_hdr_t* hdr = sam_hdr_read(samfile); // BED regions typedef std::set > TChrIntervals; typedef std::vector TRegionsGenome; TRegionsGenome bedRegions; if (c.hasBedFile) { if (!_parsePotOverlappingIntervals(c.bedFile.string(), c.hasBedFile, hdr, bedRegions)) { std::cerr << "Couldn't parse BED intervals. Do the chromosome names match?" 
<< std::endl; return 1; } } // Parse BAM file boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Count fragments" << std::endl; boost::progress_display show_progress( hdr->n_targets ); // Open output files boost::iostreams::filtering_ostream dataOut; dataOut.push(boost::iostreams::gzip_compressor()); dataOut.push(boost::iostreams::file_sink(c.covfile.c_str(), std::ios_base::out | std::ios_base::binary)); dataOut << "chr\tstart\tend\t" << c.sampleName << "_mappable\t" << c.sampleName << "_counts\t" << c.sampleName << "_CN" << std::endl; // CNVs std::vector cnvs; if (c.hasGenoFile) parseVcfCNV(c, hdr, cnvs); // SVs for breakpoint refinement typedef std::vector TChrBreakpoints; typedef std::vector TGenomicBreakpoints; TGenomicBreakpoints svbp(c.nchr, TChrBreakpoints()); if (c.hasVcfFile) { std::vector svs; vcfParse(c, hdr, svs); for(uint32_t i = 0; i < svs.size(); ++i) { svbp[svs[i].chr].push_back(SVBreakpoint(svs[i].svStart, svs[i].ciposlow, svs[i].ciposhigh, svs[i].mapq)); svbp[svs[i].chr2].push_back(SVBreakpoint(svs[i].svEnd, svs[i].ciendlow, svs[i].ciendhigh, svs[i].mapq)); } for (uint32_t i = 0; i < svbp.size(); ++i) sort(svbp[i].begin(), svbp[i].end(), SortSVBreakpoint()); } // Iterate chromosomes faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); faidx_t* faiRef = fai_load(c.genome.string().c_str()); for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) { ++show_progress; if ((!c.hasGenoFile) && (chrNoData(c, refIndex, idx))) continue; // Check presence in mappability map std::string tname(hdr->target_name[refIndex]); int32_t seqlen = faidx_seq_len(faiMap, tname.c_str()); if (seqlen == - 1) continue; else seqlen = -1; char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen); // Check presence in reference seqlen = faidx_seq_len(faiRef, tname.c_str()); if (seqlen == - 1) continue; else seqlen = 
-1; char* ref = faidx_fetch_seq(faiRef, tname.c_str(), 0, faidx_seq_len(faiRef, tname.c_str()), &seqlen); // Get GC and Mappability std::vector uniqContent(hdr->target_len[refIndex], 0); std::vector gcContent(hdr->target_len[refIndex], 0); { // Mappability map typedef boost::dynamic_bitset<> TBitSet; TBitSet uniq(hdr->target_len[refIndex], false); for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { if (seq[i] == 'C') uniq[i] = 1; } // GC map typedef boost::dynamic_bitset<> TBitSet; TBitSet gcref(hdr->target_len[refIndex], false); for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { if ((ref[i] == 'c') || (ref[i] == 'C') || (ref[i] == 'g') || (ref[i] == 'G')) gcref[i] = 1; } // Sum across fragment int32_t halfwin = (int32_t) (c.meanisize / 2); int32_t usum = 0; int32_t gcsum = 0; for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) { if (pos == halfwin) { for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) { usum += uniq[i]; gcsum += gcref[i]; } } else { usum -= uniq[pos - halfwin - 1]; gcsum -= gcref[pos - halfwin - 1]; usum += uniq[pos + halfwin]; gcsum += gcref[pos + halfwin]; } gcContent[pos] = gcsum; uniqContent[pos] = usum; } } // Coverage track typedef uint16_t TCount; uint32_t maxCoverage = std::numeric_limits::max(); typedef std::vector TCoverage; TCoverage cov(hdr->target_len[refIndex], 0); { // Mate map typedef boost::unordered_map TMateMap; TMateMap mateMap; // Count reads hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]); bam1_t* rec = bam_init1(); int32_t lastAlignedPos = 0; std::set lastAlignedPosReads; while (sam_itr_next(samfile, iter, rec) >= 0) { if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) continue; if (rec->core.qual < c.minQual) continue; if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue; int32_t midPoint = rec->core.pos + halfAlignmentLength(rec); if 
(rec->core.flag & BAM_FPAIRED) { // Clean-up the read store for identical alignment positions if (rec->core.pos > lastAlignedPos) { lastAlignedPosReads.clear(); lastAlignedPos = rec->core.pos; } if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) { // First read lastAlignedPosReads.insert(hash_string(bam_get_qname(rec))); std::size_t hv = hash_pair(rec); mateMap[hv] = true; continue; } else { // Second read std::size_t hv = hash_pair_mate(rec); if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded mateMap[hv] = false; } // update midpoint int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos; if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) midPoint = rec->core.mpos + (int32_t) (isize/2); } // Count fragment if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex]) && (cov[midPoint] < maxCoverage - 1)) ++cov[midPoint]; } // Clean-up if (seq != NULL) free(seq); if (ref != NULL) free(ref); bam_destroy1(rec); hts_itr_destroy(iter); } // CNV discovery if (!c.hasGenoFile) { // Call CNVs std::vector chrcnv; callCNVs(c, gcbound, gcContent, uniqContent, gcbias, cov, hdr, refIndex, chrcnv); // Merge adjacent CNVs lacking read-depth shift mergeCNVs(c, chrcnv, cnvs); // Refine breakpoints if (c.hasVcfFile) breakpointRefinement(c, gcbound, gcContent, uniqContent, gcbias, cov, hdr, refIndex, svbp, cnvs); } // CNV genotyping genotypeCNVs(c, gcbound, gcContent, uniqContent, gcbias, cov, hdr, refIndex, cnvs); // BED File (target intervals) if (c.hasBedFile) { if (c.adaptive) { // Merge overlapping BED entries TChrIntervals citv; _mergeOverlappingBedEntries(bedRegions[refIndex], citv); // Tile merged intervals double covsum = 0; double expcov = 0; double obsexp = 0; uint32_t winlen = 0; uint32_t start = 0; bool endOfWindow = true; typename TChrIntervals::iterator it = citv.begin(); if (it 
!= citv.end()) start = it->first; while(endOfWindow) { endOfWindow = false; for(it = citv.begin(); ((it != citv.end()) && (!endOfWindow)); ++it) { if ((it->first < it->second) && (it->second <= hdr->target_len[refIndex])) { if (start >= it->second) { if (start == it->second) { // Special case typename TChrIntervals::iterator itNext = it; ++itNext; if (itNext != citv.end()) start = itNext->first; } continue; } for(uint32_t pos = it->first; ((pos < it->second) && (!endOfWindow)); ++pos) { if (pos < start) continue; if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { covsum += cov[pos]; obsexp += gcbias[gcContent[pos]].obsexp; expcov += gcbias[gcContent[pos]].coverage; ++winlen; if (winlen == c.window_size) { obsexp /= (double) winlen; double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen; double cn = c.ploidy; if (expcov > 0) cn = c.ploidy * covsum / expcov; dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; // reset covsum = 0; expcov = 0; obsexp = 0; winlen = 0; if (c.window_offset == c.window_size) { // Move on start = pos + 1; endOfWindow = true; } else { // Rewind for(typename TChrIntervals::iterator sit = citv.begin(); ((sit != citv.end()) && (!endOfWindow)); ++sit) { if ((sit->first < sit->second) && (sit->second <= hdr->target_len[refIndex])) { if (start >= sit->second) continue; for(uint32_t k = sit->first; ((k < sit->second) && (!endOfWindow)); ++k) { if (k < start) continue; if ((gcContent[k] > gcbound.first) && (gcContent[k] < gcbound.second) && (uniqContent[k] >= c.fragmentUnique * c.meanisize)) { ++winlen; if (winlen == c.window_offset) { start = k + 1; winlen = 0; endOfWindow = true; } } } } } } } } } } } } } else { // Fixed Window Length for(typename TChrIntervals::iterator it = bedRegions[refIndex].begin(); it != bedRegions[refIndex].end(); 
++it) { if ((it->first < it->second) && (it->second <= hdr->target_len[refIndex])) { double covsum = 0; double expcov = 0; double obsexp = 0; uint32_t winlen = 0; for(uint32_t pos = it->first; pos < it->second; ++pos) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { covsum += cov[pos]; obsexp += gcbias[gcContent[pos]].obsexp; expcov += gcbias[gcContent[pos]].coverage; ++winlen; } } if (winlen >= c.fracWindow * (it->second - it->first)) { obsexp /= (double) winlen; double count = ((double) covsum / obsexp ) * (double) (it->second - it->first) / (double) winlen; double cn = c.ploidy; if (expcov > 0) cn = c.ploidy * covsum / expcov; dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; } else { dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\tNA\tNA\tNA" << std::endl; } } } } } else { // Genome-wide if (c.adaptive) { double covsum = 0; double expcov = 0; double obsexp = 0; uint32_t winlen = 0; uint32_t start = 0; uint32_t pos = 0; while(pos < hdr->target_len[refIndex]) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { covsum += cov[pos]; obsexp += gcbias[gcContent[pos]].obsexp; expcov += gcbias[gcContent[pos]].coverage; ++winlen; if (winlen == c.window_size) { obsexp /= (double) winlen; double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen; double cn = c.ploidy; if (expcov > 0) cn = c.ploidy * covsum / expcov; dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; // reset covsum = 0; expcov = 0; obsexp = 0; winlen = 0; if (c.window_offset == c.window_size) { // Move on start = pos + 1; } else { // Rewind for(uint32_t k = 
start; k < hdr->target_len[refIndex]; ++k) { if ((gcContent[k] > gcbound.first) && (gcContent[k] < gcbound.second) && (uniqContent[k] >= c.fragmentUnique * c.meanisize)) { ++winlen; if (winlen == c.window_offset) { start = k + 1; pos = k; winlen = 0; break; } } } } } } ++pos; } } else { // Fixed windows (genomic tiling) for(uint32_t start = 0; start < hdr->target_len[refIndex]; start = start + c.window_offset) { if (start + c.window_size < hdr->target_len[refIndex]) { double covsum = 0; double expcov = 0; double obsexp = 0; uint32_t winlen = 0; for(uint32_t pos = start; pos < start + c.window_size; ++pos) { if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) { covsum += cov[pos]; obsexp += gcbias[gcContent[pos]].obsexp; expcov += gcbias[gcContent[pos]].coverage; ++winlen; } } if (winlen >= c.fracWindow * c.window_size) { obsexp /= (double) winlen; double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen; double cn = c.ploidy; if (expcov > 0) cn = c.ploidy * covsum / expcov; dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (start + c.window_size) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl; } } } } } } // Sort CNVs sort(cnvs.begin(), cnvs.end(), SortCNVs()); // Genotype CNVs cnvVCF(c, cnvs); // clean-up fai_destroy(faiRef); fai_destroy(faiMap); bam_hdr_destroy(hdr); hts_idx_destroy(idx); sam_close(samfile); dataOut.pop(); dataOut.pop(); return 0; } int coral(int argc, char **argv) { CountDNAConfig c; // Parameter boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") ("genome,g", boost::program_options::value(&c.genome), "genome file") ("quality,q", boost::program_options::value(&c.minQual)->default_value(10), "min. 
mapping quality") ("mappability,m", boost::program_options::value(&c.mapFile), "input mappability map") ("ploidy,y", boost::program_options::value(&c.ploidy)->default_value(2), "baseline ploidy") ("outfile,o", boost::program_options::value(&c.cnvfile)->default_value("cnv.bcf"), "output CNV file") ("covfile,c", boost::program_options::value(&c.covfile)->default_value("cov.gz"), "output coverage file") ; boost::program_options::options_description cnv("CNV calling"); cnv.add_options() ("sdrd,x", boost::program_options::value(&c.stringency)->default_value(2), "min. SD read-depth shift") ("cn-offset,t", boost::program_options::value(&c.cn_offset)->default_value(0.1), "min. CN offset") ("cnv-size,z", boost::program_options::value(&c.minCnvSize)->default_value(1000), "min. CNV size") ("svfile,l", boost::program_options::value(&c.vcffile), "delly SV file for breakpoint refinement") ("vcffile,v", boost::program_options::value(&c.genofile), "input VCF/BCF file for re-genotyping") ("segmentation,u", "copy-number segmentation") ; boost::program_options::options_description window("Read-depth windows"); window.add_options() ("window-size,i", boost::program_options::value(&c.window_size)->default_value(10000), "window size") ("window-offset,j", boost::program_options::value(&c.window_offset)->default_value(10000), "window offset") ("bed-intervals,b", boost::program_options::value(&c.bedFile), "input BED file") ("fraction-window,k", boost::program_options::value(&c.fracWindow)->default_value(0.25), "min. 
callable window fraction [0,1]") ("adaptive-windowing,a", "use mappable bases for window size") ; boost::program_options::options_description gcopt("GC fragment normalization"); gcopt.add_options() ("scan-window,w", boost::program_options::value(&c.scanWindow)->default_value(10000), "scanning window size") ("fraction-unique,f", boost::program_options::value(&c.uniqueToTotalCovRatio)->default_value(0.8), "uniqueness filter for scan windows [0,1]") ("scan-regions,r", boost::program_options::value(&c.scanFile), "scanning regions in BED format") ("mad-cutoff,d", boost::program_options::value(&c.mad)->default_value(3), "median + 3 * mad count cutoff") ("percentile,p", boost::program_options::value(&c.exclgc)->default_value(0.0005), "excl. extreme GC fraction") ("no-window-selection,n", "no scan window selection") ; boost::program_options::options_description hidden("Hidden options"); hidden.add_options() ("input-file", boost::program_options::value(&c.bamFile), "input bam file") ("fragment,e", boost::program_options::value(&c.fragmentUnique)->default_value(0.97), "min. 
fragment uniqueness [0,1]") ("statsfile,s", boost::program_options::value(&c.statsFile), "gzipped stats output file (optional)") ; boost::program_options::positional_options_description pos_args; pos_args.add("input-file", -1); // Set the visibility boost::program_options::options_description cmdline_options; cmdline_options.add(generic).add(cnv).add(window).add(gcopt).add(hidden); boost::program_options::options_description visible_options; visible_options.add(generic).add(cnv).add(window).add(gcopt); // Parse command-line boost::program_options::variables_map vm; boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); boost::program_options::notify(vm); // Check command line arguments if ((vm.count("help")) || (!vm.count("input-file")) || (!vm.count("genome")) || (!vm.count("mappability"))) { std::cout << std::endl; std::cout << "Usage: delly " << argv[0] << " [OPTIONS] -g -m " << std::endl; std::cout << visible_options << "\n"; return 1; } // Show cmd boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] "; std::cout << "delly "; for(int i=0; i c.window_size) c.window_offset = c.window_size; if (c.window_size == 0) c.window_size = 1; if (c.window_offset == 0) c.window_offset = 1; // Check input VCF file (CNV genotyping) if (vm.count("vcffile")) { if (!(boost::filesystem::exists(c.genofile) && boost::filesystem::is_regular_file(c.genofile) && boost::filesystem::file_size(c.genofile))) { std::cerr << "Input VCF/BCF file is missing: " << c.genofile.string() << std::endl; return 1; } htsFile* ifile = bcf_open(c.genofile.string().c_str(), "r"); if (ifile == NULL) { std::cerr << "Fail to open file " << c.genofile.string() << std::endl; return 1; } bcf_hdr_t* hdr = bcf_hdr_read(ifile); if (hdr == NULL) { std::cerr << "Fail to open index file " << c.genofile.string() << std::endl; return 1; } 
bcf_hdr_destroy(hdr); bcf_close(ifile); c.hasGenoFile = true; } else c.hasGenoFile = false; // Check input VCF file (delly SV file) if (vm.count("svfile")) { if (!(boost::filesystem::exists(c.vcffile) && boost::filesystem::is_regular_file(c.vcffile) && boost::filesystem::file_size(c.vcffile))) { std::cerr << "Input VCF/BCF file is missing: " << c.vcffile.string() << std::endl; return 1; } htsFile* ifile = bcf_open(c.vcffile.string().c_str(), "r"); if (ifile == NULL) { std::cerr << "Fail to open file " << c.vcffile.string() << std::endl; return 1; } bcf_hdr_t* hdr = bcf_hdr_read(ifile); if (hdr == NULL) { std::cerr << "Fail to open index file " << c.vcffile.string() << std::endl; return 1; } bcf_hdr_destroy(hdr); bcf_close(ifile); c.hasVcfFile = true; } else c.hasVcfFile = false; // Check bam file LibraryInfo li; if (!(boost::filesystem::exists(c.bamFile) && boost::filesystem::is_regular_file(c.bamFile) && boost::filesystem::file_size(c.bamFile))) { std::cerr << "Alignment file is missing: " << c.bamFile.string() << std::endl; return 1; } else { // Get scan regions typedef boost::icl::interval_set TChrIntervals; typedef typename TChrIntervals::interval_type TIVal; typedef std::vector TRegionsGenome; TRegionsGenome scanRegions; // Open BAM file samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); if (samfile == NULL) { std::cerr << "Fail to open file " << c.bamFile.string() << std::endl; return 1; } hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str()); if (idx == NULL) { if (bam_index_build(c.bamFile.string().c_str(), 0) != 0) { std::cerr << "Fail to open index for " << c.bamFile.string() << std::endl; return 1; } } bam_hdr_t* hdr = sam_hdr_read(samfile); if (hdr == NULL) { std::cerr << "Fail to open header for " << c.bamFile.string() << std::endl; return 1; } c.nchr = hdr->n_targets; c.minChrLen = setMinChrLen(hdr, 0.95); std::string sampleName = "unknown"; getSMTag(std::string(hdr->text), c.bamFile.stem().string(), sampleName); c.sampleName 
= sampleName; // Check matching chromosome names faidx_t* faiRef = fai_load(c.genome.string().c_str()); faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); uint32_t mapFound = 0; uint32_t refFound = 0; for(int32_t refIndex=0; refIndex < hdr->n_targets; ++refIndex) { std::string tname(hdr->target_name[refIndex]); if (faidx_has_seq(faiMap, tname.c_str())) ++mapFound; if (faidx_has_seq(faiRef, tname.c_str())) ++refFound; else { std::cerr << "Warning: BAM chromosome " << tname << " not present in reference genome!" << std::endl; } } fai_destroy(faiRef); fai_destroy(faiMap); if (!mapFound) { std::cerr << "Mappability map chromosome naming disagrees with BAM file!" << std::endl; return 1; } if (!refFound) { std::cerr << "Reference genome chromosome naming disagrees with BAM file!" << std::endl; return 1; } // Estimate library params if (c.hasScanFile) { if (!_parseBedIntervals(c.scanFile.string(), c.hasScanFile, hdr, scanRegions)) { std::cerr << "Warning: Couldn't parse BED intervals. Do the chromosome names match?" 
<< std::endl; return 1; } } else { scanRegions.resize(hdr->n_targets); for (int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { scanRegions[refIndex].insert(TIVal::right_open(0, hdr->target_len[refIndex])); } } typedef std::vector TSampleLibrary; TSampleLibrary sampleLib(1, LibraryInfo()); CountDNAConfigLib dellyConf; dellyConf.genome = c.genome; dellyConf.files.push_back(c.bamFile); dellyConf.madCutoff = 9; dellyConf.madNormalCutoff = c.mad; getLibraryParams(dellyConf, scanRegions, sampleLib); li = sampleLib[0]; if (!li.median) { li.median = 250; li.mad = 15; li.minNormalISize = 0; li.maxNormalISize = 400; } c.meanisize = ((int32_t) (li.median / 2)) * 2 + 1; // Clean-up bam_hdr_destroy(hdr); hts_idx_destroy(idx); sam_close(samfile); } // GC bias estimation typedef std::pair TGCBound; TGCBound gcbound; std::vector gcbias(c.meanisize + 1, GcBias()); { // Scan genomic windows typedef std::vector TWindowCounts; typedef std::vector TGenomicWindowCounts; TGenomicWindowCounts scanCounts(c.nchr, TWindowCounts()); scan(c, li, scanCounts); // Select stable windows selectWindows(c, scanCounts); // Estimate GC bias gcBias(c, scanCounts, li, gcbias, gcbound); // Statistics output if (c.hasStatsFile) { // Open stats file boost::iostreams::filtering_ostream statsOut; statsOut.push(boost::iostreams::gzip_compressor()); statsOut.push(boost::iostreams::file_sink(c.statsFile.string().c_str(), std::ios_base::out | std::ios_base::binary)); // Library Info statsOut << "LP\t" << li.rs << ',' << li.median << ',' << li.mad << ',' << li.minNormalISize << ',' << li.maxNormalISize << std::endl; // Scan window summry samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); bam_hdr_t* hdr = sam_hdr_read(samfile); statsOut << "SW\tchrom\tstart\tend\tselected\tcoverage\tuniqcov" << std::endl; for(uint32_t refIndex = 0; refIndex < (uint32_t) hdr->n_targets; ++refIndex) { for(uint32_t i = 0; i < scanCounts[refIndex].size(); ++i) { statsOut << "SW\t" << hdr->target_name[refIndex] << 
'\t' << scanCounts[refIndex][i].start << '\t' << scanCounts[refIndex][i].end << '\t' << scanCounts[refIndex][i].select << '\t' << scanCounts[refIndex][i].cov << '\t' << scanCounts[refIndex][i].uniqcov << std::endl; } } bam_hdr_destroy(hdr); sam_close(samfile); // GC bias summary statsOut << "GC\tgcsum\tsample\treference\tpercentileSample\tpercentileReference\tfractionSample\tfractionReference\tobsexp\tmeancoverage" << std::endl; for(uint32_t i = 0; i < gcbias.size(); ++i) statsOut << "GC\t" << i << "\t" << gcbias[i].sample << "\t" << gcbias[i].reference << "\t" << gcbias[i].percentileSample << "\t" << gcbias[i].percentileReference << "\t" << gcbias[i].fractionSample << "\t" << gcbias[i].fractionReference << "\t" << gcbias[i].obsexp << "\t" << gcbias[i].coverage << std::endl; statsOut << "BoundsGC\t" << gcbound.first << "," << gcbound.second << std::endl; statsOut.pop(); statsOut.pop(); } } // Count reads if (bamCount(c, li, gcbias, gcbound)) { std::cerr << "Read counting error!" << std::endl; return 1; } // Done now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Done." 
<< std::endl; return 0; } } #endif delly-0.9.1/src/coverage.h000066400000000000000000000726661414764127700154230ustar00rootroot00000000000000#ifndef COVERAGE_H #define COVERAGE_H #include #include #include #include #include #include #include #include #include #include #include "tags.h" #include "util.h" #include "msa.h" #include "split.h" namespace torali { struct SpanPoint { int32_t bppos; int32_t svt; uint32_t id; int32_t chr2; int32_t otherBppos; SpanPoint() : bppos(0), svt(0), id(0), chr2(0), otherBppos(0) {} explicit SpanPoint(int32_t const bp) : bppos(bp), svt(0), id(0), chr2(0), otherBppos(0) {} SpanPoint(int32_t const bp, int32_t const s, uint32_t const identifier, int32_t const tid, int32_t const obp) : bppos(bp), svt(s), id(identifier), chr2(tid), otherBppos(obp) {} }; struct BpRegion { int32_t regionStart; int32_t regionEnd; int32_t bppos; int32_t homLeft; int32_t homRight; int32_t svt; uint32_t id; uint8_t bpPoint; BpRegion() : regionStart(0), regionEnd(0), bppos(0), homLeft(0), homRight(0), svt(0), id(0), bpPoint(0) {} explicit BpRegion(int32_t bp) : regionStart(0), regionEnd(0), bppos(bp), homLeft(0), homRight(0), svt(0), id(0), bpPoint(0) {} BpRegion(int32_t rs, int32_t re, int32_t bpos, int32_t hl, int32_t hr, int32_t s, uint32_t identifier, uint8_t bpp) : regionStart(rs), regionEnd(re), bppos(bpos), homLeft(hl), homRight(hr), svt(s), id(identifier), bpPoint(bpp) {} }; template struct SortBp : public std::binary_function { inline bool operator()(TRecord const& s1, TRecord const& s2) const { return (s1.bppos < s2.bppos); } }; struct SpanningCount { int32_t refh1; int32_t refh2; int32_t alth1; int32_t alth2; std::vector ref; std::vector alt; SpanningCount() : refh1(0), refh2(0), alth1(0), alth2(0) {} }; struct JunctionCount { int32_t refh1; int32_t refh2; int32_t alth1; int32_t alth2; std::vector ref; std::vector alt; JunctionCount() : refh1(0), refh2(0), alth1(0), alth2(0) {} }; template inline uint32_t _getAlignmentQual(TAlign const& align, 
TQualities const& qual) { typedef typename TAlign::index TAIndex; uint32_t baseQualSum = 0; uint32_t seqPtr = 0; uint32_t alignedBases = 0; for(TAIndex j = 0; j < (TAIndex) align.shape()[1]; ++j) { if (align[1][j] != '-') { if (align[0][j] != '-') { ++alignedBases; baseQualSum += qual[seqPtr]; } ++seqPtr; } } return (baseQualSum / alignedBases); } template inline int32_t _cutRefStart(TPos const rStart, TPos const rEnd, TPos const offset, unsigned int bpPoint, int32_t const svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); if (ct == 3) { if (!bpPoint) return rEnd - offset; else return rStart - offset; } else { if (bpPoint) return rEnd - offset; else return rStart - offset; } } else { if (svt == 3) { if (!bpPoint) return rEnd - offset; else return rStart - offset; } else { if (bpPoint) return rEnd - offset; else return rStart - offset; } } } template inline int32_t _cutRefEnd(TPos const rStart, TPos const rEnd, TPos const offset, unsigned int bpPoint, int32_t const svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); if (ct == 3) { if (!bpPoint) return rEnd + offset; else return rStart + offset; } else { if (bpPoint) return rEnd + offset; else return rStart + offset; } } else { if (svt == 3) { if (!bpPoint) return rEnd + offset; else return rStart + offset; } else { if (bpPoint) return rEnd + offset; else return rStart + offset; } } } template inline void _generateProbes(TConfig const& c, bam_hdr_t* hdr, TSVs& svs, TBreakProbes& refProbeArr, TBreakProbes& consProbeArr, TGenomicBpRegion& bpRegion, std::vector& svOnChr) { typedef typename TBreakProbes::value_type TProbes; // Preprocess REF and ALT boost::posix_time::ptime noww = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(noww) << "] " << "Generate REF and ALT probes" << std::endl; boost::progress_display show_progresss( hdr->n_targets ); TProbes refProbes(svs.size()); faidx_t* fai = 
fai_load(c.genome.string().c_str()); for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) { ++show_progresss; char* seq = NULL; // Iterate all structural variants for(typename TSVs::iterator itSV = svs.begin(); itSV != svs.end(); ++itSV) { if ((itSV->chr != refIndex) && (itSV->chr2 != refIndex)) continue; svOnChr[refIndex] = true; // Lazy loading of reference sequence if (seq == NULL) { int32_t seqlen = -1; std::string tname(hdr->target_name[refIndex]); seq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr->target_len[refIndex], &seqlen); } // Set tag alleles if (itSV->chr == refIndex) { itSV->alleles = _addAlleles(boost::to_upper_copy(std::string(seq + itSV->svStart - 1, seq + itSV->svStart)), std::string(hdr->target_name[itSV->chr2]), *itSV, itSV->svt); } if (!itSV->precise) continue; // Get the reference sequence if ((itSV->chr != itSV->chr2) && (itSV->chr2 == refIndex)) { Breakpoint bp(*itSV); _initBreakpoint(hdr, bp, (int32_t) itSV->consensus.size(), itSV->svt); refProbes[itSV->id] = _getSVRef(seq, bp, refIndex, itSV->svt); } if (itSV->chr == refIndex) { Breakpoint bp(*itSV); if (_translocation(itSV->svt)) bp.part1 = refProbes[itSV->id]; if (itSV->svt ==4) { int32_t bufferSpace = std::max((int32_t) ((itSV->consensus.size() - itSV->insLen) / 3), c.minimumFlankSize); _initBreakpoint(hdr, bp, bufferSpace, itSV->svt); } else _initBreakpoint(hdr, bp, (int32_t) itSV->consensus.size(), itSV->svt); std::string svRefStr = _getSVRef(seq, bp, refIndex, itSV->svt); // Find breakpoint to reference typedef boost::multi_array TAlign; TAlign align; if (!_consRefAlignment(itSV->consensus, svRefStr, align, itSV->svt)) continue; AlignDescriptor ad; if (!_findSplit(c, itSV->consensus, svRefStr, align, ad, itSV->svt)) continue; // Debug consensus to reference alignment //std::cerr << itSV->id << std::endl; //for(uint32_t i = 0; ichr2; regionStart = std::max(0, itSV->svEnd - c.minimumFlankSize); regionEnd = std::min((uint32_t) (itSV->svEnd + c.minimumFlankSize), 
hdr->target_len[itSV->chr2]); cutConsStart = ad.cEnd - ad.homLeft - c.minimumFlankSize; cutConsEnd = ad.cEnd + ad.homRight + c.minimumFlankSize; cutRefStart = _cutRefStart(ad.rStart, ad.rEnd, ad.homLeft + c.minimumFlankSize, bpPoint, itSV->svt); cutRefEnd = _cutRefEnd(ad.rStart, ad.rEnd, ad.homRight + c.minimumFlankSize, bpPoint, itSV->svt); bppos = itSV->svEnd; } else { regionChr = itSV->chr; regionStart = std::max(0, itSV->svStart - c.minimumFlankSize); regionEnd = std::min((uint32_t) (itSV->svStart + c.minimumFlankSize), hdr->target_len[itSV->chr]); cutConsStart = ad.cStart - ad.homLeft - c.minimumFlankSize; cutConsEnd = ad.cStart + ad.homRight + c.minimumFlankSize; cutRefStart = _cutRefStart(ad.rStart, ad.rEnd, ad.homLeft + c.minimumFlankSize, bpPoint, itSV->svt); cutRefEnd = _cutRefEnd(ad.rStart, ad.rEnd, ad.homRight + c.minimumFlankSize, bpPoint, itSV->svt); bppos = itSV->svStart; } consProbeArr[bpPoint][itSV->id] = itSV->consensus.substr(cutConsStart, (cutConsEnd - cutConsStart)); refProbeArr[bpPoint][itSV->id] = svRefStr.substr(cutRefStart, (cutRefEnd - cutRefStart)); bpRegion[regionChr].push_back(BpRegion(regionStart, regionEnd, bppos, ad.homLeft, ad.homRight, itSV->svt, itSV->id, bpPoint)); } } } if (seq != NULL) free(seq); } // Clean-up fai_destroy(fai); for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) { // Sort breakpoint regions std::sort(bpRegion[refIndex].begin(), bpRegion[refIndex].end(), SortBp()); } } template inline void annotateCoverage(TConfig& c, TSampleLibrary& sampleLib, TSVs& svs, TCoverageCount& covCount, TCountMap& countMap, TSpanMap& spanMap) { typedef typename TCoverageCount::value_type::value_type TCovPair; typedef typename TSpanMap::value_type::value_type TSpanPair; typedef typename TCountMap::value_type::value_type TCountPair; typedef std::vector TQuality; // Open file handles typedef std::vector TSamFile; typedef std::vector TIndex; typedef std::vector THeader; TSamFile samfile(c.files.size()); TIndex 
idx(c.files.size()); THeader hdr(c.files.size()); int32_t totalTarget = 0; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r"); hts_set_fai_filename(samfile[file_c], c.genome.string().c_str()); idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str()); hdr[file_c] = sam_hdr_read(samfile[file_c]); totalTarget += hdr[file_c]->n_targets; } // Initialize coverage count maps covCount.resize(c.files.size()); countMap.resize(c.files.size()); spanMap.resize(c.files.size()); for(uint32_t file_c = 0; file_c < c.files.size(); ++file_c) { covCount[file_c].resize(svs.size(), TCovPair()); countMap[file_c].resize(svs.size(), TCountPair()); spanMap[file_c].resize(svs.size(), TSpanPair()); } // Reference and consensus probes typedef std::vector TProbes; typedef std::vector TBreakProbes; TBreakProbes refProbeArr(2, TProbes()); // Left and right breakpoint TBreakProbes consProbeArr(2, TProbes()); // Left and right breakpoint for(uint32_t k = 0; k < 2; ++k) { refProbeArr[k].resize(svs.size()); consProbeArr[k].resize(svs.size()); } typedef std::vector TBpRegion; typedef std::vector TGenomicBpRegion; TGenomicBpRegion bpRegion(hdr[0]->n_targets, TBpRegion()); std::vector svOnChr(hdr[0]->n_targets, false); // Generate probes _generateProbes(c, hdr[0], svs, refProbeArr, consProbeArr, bpRegion, svOnChr); // Debug //for(uint32_t k = 0; k < 2; ++k) { //for(uint32_t i = 0; i < svs.size(); ++i) { //std::cerr << k << ',' << i << ',' << refProbeArr[k][i] << ',' << consProbeArr[k][i] << std::endl; //} //} // Iterate all samples boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "SV annotation" << std::endl; boost::progress_display show_progress( totalTarget ); typedef std::vector TRefAlignCount; typedef std::vector TFileRefAlignCount; TFileRefAlignCount refAlignedReadCount(c.files.size(), 
TRefAlignCount()); TFileRefAlignCount refAlignedSpanCount(c.files.size(), TRefAlignCount()); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { refAlignedReadCount[file_c].resize(svs.size(), 0); refAlignedSpanCount[file_c].resize(svs.size(), 0); } // Dump file boost::iostreams::filtering_ostream dumpOut; if (c.hasDumpFile) { dumpOut.push(boost::iostreams::gzip_compressor()); dumpOut.push(boost::iostreams::file_sink(c.dumpfile.string().c_str(), std::ios_base::out | std::ios_base::binary)); dumpOut << "#svid\tbam\tqname\tchr\tpos\tmatechr\tmatepos\tmapq\ttype" << std::endl; } #pragma omp parallel for default(shared) for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { // Pair qualities and features typedef boost::unordered_map TQualities; TQualities qualities; TQualities qualitiestra; typedef boost::unordered_map TClip; TClip clip; TClip cliptra; // Iterate chromosomes for(int32_t refIndex=0; refIndex < (int32_t) hdr[file_c]->n_targets; ++refIndex) { ++show_progress; // Any SV breakpoints on this chromosome? 
if (!svOnChr[refIndex]) continue; // Check we have mapped reads on this chromosome bool nodata = true; std::string suffix("cram"); std::string str(c.files[file_c].string()); if ((str.size() >= suffix.size()) && (str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0)) nodata = false; uint64_t mapped = 0; uint64_t unmapped = 0; hts_idx_get_stat(idx[file_c], refIndex, &mapped, &unmapped); if (mapped) nodata = false; if (nodata) continue; // Coverage track typedef uint16_t TCount; uint32_t maxCoverage = std::numeric_limits::max(); typedef std::vector TCoverage; TCoverage covFragment(hdr[file_c]->target_len[refIndex], 0); TCoverage covBases(hdr[file_c]->target_len[refIndex], 0); // Flag breakpoint regions typedef boost::dynamic_bitset<> TBitSet; TBitSet bpOccupied(hdr[file_c]->target_len[refIndex]); for(uint32_t i = 0; i < bpRegion[refIndex].size(); ++i) { for(int32_t k = bpRegion[refIndex][i].regionStart; k < bpRegion[refIndex][i].regionEnd; ++k) { bpOccupied[k] = 1; } } // Flag spanning breakpoints typedef std::vector TSpanPoint; TSpanPoint spanPoint; typedef boost::dynamic_bitset<> TBitSet; TBitSet spanBp(hdr[file_c]->target_len[refIndex]); for(typename TSVs::iterator itSV = svs.begin(); itSV != svs.end(); ++itSV) { if (itSV->peSupport == 0) continue; if ((itSV->chr == refIndex) && (itSV->svStart < (int32_t) hdr[file_c]->target_len[refIndex])) { spanBp[itSV->svStart] = 1; spanPoint.push_back(SpanPoint(itSV->svStart, itSV->svt, itSV->id, itSV->chr2, itSV->svEnd)); } if ((itSV->chr2 == refIndex) && (itSV->svEnd < (int32_t) hdr[file_c]->target_len[refIndex])) { spanBp[itSV->svEnd] = 1; spanPoint.push_back(SpanPoint(itSV->svEnd, itSV->svt, itSV->id, itSV->chr, itSV->svStart)); } } std::sort(spanPoint.begin(), spanPoint.end(), SortBp()); // Count reads hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, 0, hdr[file_c]->target_len[refIndex]); bam1_t* rec = bam_init1(); int32_t lastAlignedPos = 0; std::set lastAlignedPosReads; while 
(sam_itr_next(samfile[file_c], iter, rec) >= 0) { if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP | BAM_FMUNMAP)) continue; if (rec->core.qual < c.minGenoQual) continue; // Count aligned basepair (small InDels) { uint32_t rp = 0; // reference pointer uint32_t* cigar = bam_get_cigar(rec); for (std::size_t i = 0; i < rec->core.n_cigar; ++i) { if (bam_cigar_op(cigar[i]) == BAM_CMATCH) { for(std::size_t k = 0; kcore.pos + rp < hdr[file_c]->target_len[refIndex]) && (covBases[rec->core.pos + rp] < maxCoverage - 1)) ++covBases[rec->core.pos + rp]; ++rp; } } else if (bam_cigar_op(cigar[i]) == BAM_CDEL) { rp += bam_cigar_oplen(cigar[i]); } else if (bam_cigar_op(cigar[i]) == BAM_CREF_SKIP) { rp += bam_cigar_oplen(cigar[i]); } } } // Any (leading) soft clip bool hasSoftClip = false; bool hasClip = false; int32_t leadingSC = 0; uint32_t* cigar = bam_get_cigar(rec); for (std::size_t i = 0; i < rec->core.n_cigar; ++i) { if (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) { hasClip = true; hasSoftClip = true; if (i == 0) leadingSC = bam_cigar_oplen(cigar[i]); } else if (bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP) hasClip = true; } // Check read length for junction annotation if (rec->core.l_qseq >= (2 * c.minimumFlankSize)) { bool bpvalid = false; int32_t rbegin = std::max(0, (int32_t) rec->core.pos - leadingSC); for(int32_t k = rbegin; ((k < (rec->core.pos + rec->core.l_qseq)) && (k < (int32_t) hdr[file_c]->target_len[refIndex])); ++k) { if (bpOccupied[k]) { bpvalid = true; break; } } if (bpvalid) { // Fetch all relevant SVs typename TBpRegion::iterator itBp = std::lower_bound(bpRegion[refIndex].begin(), bpRegion[refIndex].end(), BpRegion(rbegin), SortBp()); for(; ((itBp != bpRegion[refIndex].end()) && (rec->core.pos + rec->core.l_qseq >= itBp->bppos)); ++itBp) { if ((countMap[file_c][itBp->id].ref.size() + countMap[file_c][itBp->id].alt.size()) >= c.maxGenoReadCount) continue; // Read spans breakpoint? 
if ((hasSoftClip) || ((!hasClip) && (rec->core.pos + c.minimumFlankSize + itBp->homLeft <= itBp->bppos) && (rec->core.pos + rec->core.l_qseq >= itBp->bppos + c.minimumFlankSize + itBp->homRight))) { std::string consProbe = consProbeArr[itBp->bpPoint][itBp->id]; std::string refProbe = refProbeArr[itBp->bpPoint][itBp->id]; // Get sequence std::string sequence; sequence.resize(rec->core.l_qseq); uint8_t* seqptr = bam_get_seq(rec); for (int i = 0; i < rec->core.l_qseq; ++i) sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)]; _adjustOrientation(sequence, itBp->bpPoint, itBp->svt); // Compute alignment to alternative haplotype typedef boost::multi_array TAlign; TAlign alignAlt; DnaScore simple(5, -4, -4, -4); AlignConfig semiglobal; int32_t scoreA = needle(consProbe, sequence, alignAlt, semiglobal, simple); int32_t scoreAltThreshold = (int32_t) (c.flankQuality * consProbe.size() * simple.match + (1.0 - c.flankQuality) * consProbe.size() * simple.mismatch); double scoreAlt = (double) scoreA / (double) scoreAltThreshold; // Compute alignment to reference haplotype TAlign alignRef; int32_t scoreR = needle(refProbe, sequence, alignRef, semiglobal, simple); int32_t scoreRefThreshold = (int32_t) (c.flankQuality * refProbe.size() * simple.match + (1.0 - c.flankQuality) * refProbe.size() * simple.mismatch); double scoreRef = (double) scoreR / (double) scoreRefThreshold; // Any confident alignment? 
if ((scoreRef > 1) || (scoreAlt > 1)) { // Debug alignment to REF and ALT //std::cerr << "Alt:\t" << scoreAlt << "\tRef:\t" << scoreRef << std::endl; //for(TAIndex i = 0; i< (TAIndex) alignAlt.shape()[0]; ++i) { //for(TAIndex j = 0; j< (TAIndex) alignAlt.shape()[1]; ++j) std::cerr << alignAlt[i][j]; //std::cerr << std::endl; //} //for(TAIndex i = 0; i< (TAIndex) alignRef.shape()[0]; ++i) { //for(TAIndex j = 0; j< (TAIndex) alignRef.shape()[1]; ++j) std::cerr << alignRef[i][j]; //std::cerr << std::endl; //} if (scoreRef > scoreAlt) { // Account for reference bias if (++refAlignedReadCount[file_c][itBp->id] % 2) { TQuality quality; quality.resize(rec->core.l_qseq); uint8_t* qualptr = bam_get_qual(rec); for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i]; uint32_t rq = _getAlignmentQual(alignRef, quality); if (rq >= c.minGenoQual) { uint8_t* hpptr = bam_aux_get(rec, "HP"); #pragma omp critical { countMap[file_c][itBp->id].ref.push_back((uint8_t) std::min(rq, (uint32_t) rec->core.qual)); if (hpptr) { c.isHaplotagged = true; int hap = bam_aux2i(hpptr); if (hap == 1) ++countMap[file_c][itBp->id].refh1; else ++countMap[file_c][itBp->id].refh2; } } } } } else { TQuality quality; quality.resize(rec->core.l_qseq); uint8_t* qualptr = bam_get_qual(rec); for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i]; uint32_t aq = _getAlignmentQual(alignAlt, quality); if (aq >= c.minGenoQual) { uint8_t* hpptr = bam_aux_get(rec, "HP"); #pragma omp critical { if (c.hasDumpFile) { std::string svid(_addID(itBp->svt)); std::string padNumber = boost::lexical_cast(itBp->id); padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0'); svid += padNumber; dumpOut << svid << "\t" << c.files[file_c].string() << "\t" << bam_get_qname(rec) << "\t" << hdr[file_c]->target_name[rec->core.tid] << "\t" << rec->core.pos << "\t" << hdr[file_c]->target_name[rec->core.mtid] << "\t" << rec->core.mpos << "\t" << (int32_t) rec->core.qual << "\tSR" << std::endl; } 
countMap[file_c][itBp->id].alt.push_back((uint8_t) std::min(aq, (uint32_t) rec->core.qual)); if (hpptr) { c.isHaplotagged = true; int hap = bam_aux2i(hpptr); if (hap == 1) ++countMap[file_c][itBp->id].alth1; else ++countMap[file_c][itBp->id].alth2; } } } } } } } } } // Read-count and spanning annotation if ((!(rec->core.flag & BAM_FPAIRED)) || (!svOnChr[rec->core.mtid])) continue; // Clean-up the read store for identical alignment positions if (rec->core.pos > lastAlignedPos) { lastAlignedPosReads.clear(); lastAlignedPos = rec->core.pos; } if (_firstPairObs(rec, lastAlignedPosReads)) { // First read lastAlignedPosReads.insert(hash_string(bam_get_qname(rec))); std::size_t hv = hash_pair(rec); if (rec->core.tid == rec->core.mtid) { qualities[hv] = rec->core.qual; clip[hv] = hasSoftClip; } else { qualitiestra[hv] = rec->core.qual; cliptra[hv] = hasSoftClip; } } else { // Second read std::size_t hv = hash_pair_mate(rec); uint8_t pairQuality = 0; bool pairClip = false; if (rec->core.tid == rec->core.mtid) { if (qualities.find(hv) == qualities.end()) continue; // Mate discarded pairQuality = std::min((uint8_t) qualities[hv], (uint8_t) rec->core.qual); if ((clip[hv]) || (hasSoftClip)) pairClip = true; qualities[hv] = 0; clip[hv] = false; } else { if (qualitiestra.find(hv) == qualitiestra.end()) continue; // Mate discarded pairQuality = std::min((uint8_t) qualitiestra[hv], (uint8_t) rec->core.qual); if ((cliptra[hv]) || (hasSoftClip)) pairClip = true; qualitiestra[hv] = 0; cliptra[hv] = false; } // Pair quality if (pairQuality < c.minGenoQual) continue; // Low quality pair // Read-depth fragment counting if (rec->core.tid == rec->core.mtid) { // Count mid point (fragment counting) int32_t midPoint = rec->core.pos + halfAlignmentLength(rec); if ((midPoint < (int32_t) hdr[file_c]->target_len[refIndex]) && (covFragment[midPoint] < maxCoverage - 1)) ++covFragment[midPoint]; } // Spanning counting int32_t outerISize = 0; if (rec->core.pos < rec->core.mpos) outerISize = 
rec->core.mpos + rec->core.l_qseq - rec->core.pos; else outerISize = rec->core.pos + rec->core.l_qseq - rec->core.mpos; // Get the library information if (sampleLib[file_c].median == 0) continue; // Single-end library or non-valid library // Normal spanning pair if ((!pairClip) && (getSVType(rec) == 2) && (outerISize >= sampleLib[file_c].minNormalISize) && (outerISize <= sampleLib[file_c].maxNormalISize) && (rec->core.tid==rec->core.mtid)) { // Take X% of the outerisize as the spanned interval int32_t spanlen = 0.8 * outerISize; int32_t pbegin = std::min((int32_t) rec->core.pos, (int32_t) rec->core.mpos); int32_t st = pbegin + (outerISize - spanlen) / 2; bool spanvalid = false; for(int32_t i = st; ((i < (st + spanlen)) && (i < (int32_t) hdr[file_c]->target_len[refIndex])); ++i) { if (spanBp[i]) { spanvalid = true; break; } } if (spanvalid) { // Fetch all relevant SVs typename TSpanPoint::iterator itSpan = std::lower_bound(spanPoint.begin(), spanPoint.end(), SpanPoint(st), SortBp()); for(; ((itSpan != spanPoint.end()) && (st + spanlen >= itSpan->bppos)); ++itSpan) { // Account for reference bias if (++refAlignedSpanCount[file_c][itSpan->id] % 2) { uint8_t* hpptr = bam_aux_get(rec, "HP"); #pragma omp critical { spanMap[file_c][itSpan->id].ref.push_back(pairQuality); if (hpptr) { c.isHaplotagged = true; int hap = bam_aux2i(hpptr); if (hap == 1) ++spanMap[file_c][itSpan->id].refh1; else ++spanMap[file_c][itSpan->id].refh2; } } } } } } // Abnormal spanning coverage if ((getSVType(rec) != 2) || (outerISize < sampleLib[file_c].minNormalISize) || (outerISize > sampleLib[file_c].maxNormalISize) || (rec->core.tid!=rec->core.mtid)) { // SV type int32_t svt = _isizeMappingPos(rec, sampleLib[file_c].maxISizeCutoff); if (svt == -1) continue; // Spanning a breakpoint? 
bool spanvalid = false; int32_t pbegin = rec->core.pos; int32_t pend = std::min((int32_t) rec->core.pos + sampleLib[file_c].maxNormalISize, (int32_t) hdr[file_c]->target_len[refIndex]); if (rec->core.flag & BAM_FREVERSE) { pbegin = std::max(0, (int32_t) rec->core.pos + rec->core.l_qseq - sampleLib[file_c].maxNormalISize); pend = std::min((int32_t) rec->core.pos + rec->core.l_qseq, (int32_t) hdr[file_c]->target_len[refIndex]); } for(int32_t i = pbegin; i < pend; ++i) { if (spanBp[i]) { spanvalid = true; break; } } if (spanvalid) { // Fetch all relevant SVs typename TSpanPoint::iterator itSpan = std::lower_bound(spanPoint.begin(), spanPoint.end(), SpanPoint(pbegin), SortBp()); for(; ((itSpan != spanPoint.end()) && (pend >= itSpan->bppos)); ++itSpan) { if (svt == itSpan->svt) { // Make sure, mate is correct if (rec->core.mtid == itSpan->chr2) { if (std::abs((int32_t) rec->core.mpos - itSpan->otherBppos) < sampleLib[file_c].maxNormalISize) { uint8_t* hpptr = bam_aux_get(rec, "HP"); #pragma omp critical { if (c.hasDumpFile) { std::string svid(_addID(itSpan->svt)); std::string padNumber = boost::lexical_cast(itSpan->id); padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0'); svid += padNumber; dumpOut << svid << "\t" << c.files[file_c].string() << "\t" << bam_get_qname(rec) << "\t" << hdr[file_c]->target_name[rec->core.tid] << "\t" << rec->core.pos << "\t" << hdr[file_c]->target_name[rec->core.mtid] << "\t" << rec->core.mpos << "\t" << (int32_t) rec->core.qual << "\tPE" << std::endl; } spanMap[file_c][itSpan->id].alt.push_back(pairQuality); if (hpptr) { c.isHaplotagged = true; int hap = bam_aux2i(hpptr); if (hap == 1) ++spanMap[file_c][itSpan->id].alth1; else ++spanMap[file_c][itSpan->id].alth2; } } } } } } } } } } // Clean-up bam_destroy1(rec); hts_itr_destroy(iter); qualities.clear(); clip.clear(); // Assign fragment and base counts to SVs for(uint32_t i = 0; i < svs.size(); ++i) { if (svs[i].chr == refIndex) { // Small or large SV bool smallSV = false; 
int32_t halfSize = (svs[i].svEnd - svs[i].svStart)/2; if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) { halfSize = 500; smallSV = true; } else { if ((svs[i].svEnd - svs[i].svStart) <= c.indelsize) smallSV = true; } // Left region int32_t lstart = std::max(svs[i].svStart - halfSize, 0); int32_t lend = svs[i].svStart; int32_t covbase = 0; for(uint32_t k = lstart; ((k < (uint32_t) lend) && (k < hdr[0]->target_len[refIndex])); ++k) { if (smallSV) covbase += covBases[k]; else covbase += covFragment[k]; } covCount[file_c][svs[i].id].leftRC = covbase; // Actual SV covbase = 0; int32_t mstart = svs[i].svStart; int32_t mend = svs[i].svEnd; if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) { mstart = std::max(svs[i].svStart - halfSize, 0); mend = std::min(svs[i].svStart + halfSize, (int32_t) hdr[0]->target_len[refIndex]); } for(uint32_t k = mstart; ((k < (uint32_t) mend) && (k < hdr[0]->target_len[refIndex])); ++k) { if (smallSV) covbase += covBases[k]; else covbase += covFragment[k]; } covCount[file_c][svs[i].id].rc = covbase; // Right region covbase = 0; int32_t rstart = svs[i].svEnd; int32_t rend = std::min(svs[i].svEnd + halfSize, (int32_t) hdr[0]->target_len[refIndex]); if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) { rstart = svs[i].svStart; rend = std::min(svs[i].svStart + halfSize, (int32_t) hdr[0]->target_len[refIndex]); } for(uint32_t k = rstart; ((k < (uint32_t) rend) && (k < hdr[0]->target_len[refIndex])); ++k) { if (smallSV) covbase += covBases[k]; else covbase += covFragment[k]; } covCount[file_c][svs[i].id].rightRC = covbase; } } } } // Clean-up for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { bam_hdr_destroy(hdr[file_c]); hts_idx_destroy(idx[file_c]); sam_close(samfile[file_c]); } } } #endif delly-0.9.1/src/delly.cpp000066400000000000000000000061371414764127700152620ustar00rootroot00000000000000#define _SECURE_SCL 0 #define _SCL_SECURE_NO_WARNINGS #include #include #define BOOST_DISABLE_ASSERTS #ifdef OPENMP #include 
#endif #ifdef PROFILE #include "gperftools/profiler.h" #endif #include "version.h" #include "delly.h" #include "filter.h" #include "classify.h" #include "merge.h" #include "tegua.h" #include "coral.h" using namespace torali; inline void displayUsage() { std::cout << "Usage: delly " << std::endl; std::cout << std::endl; std::cout << "Short-read SV calling:" << std::endl; std::cout << " call discover and genotype structural variants" << std::endl; std::cout << " merge merge structural variants across VCF/BCF files and within a single VCF/BCF file" << std::endl; std::cout << " filter filter somatic or germline structural variants" << std::endl; std::cout << std::endl; std::cout << "Long-read SV calling:" << std::endl; std::cout << " lr long-read SV discovery" << std::endl; std::cout << std::endl; std::cout << "Copy-number variant calling:" << std::endl; std::cout << " cnv discover and genotype copy-number variants" << std::endl; std::cout << " classify classify somatic or germline copy-number variants" << std::endl; std::cout << std::endl; std::cout << std::endl; } int main(int argc, char **argv) { if (argc < 2) { printTitle("Delly"); displayUsage(); return 0; } if ((std::string(argv[1]) == "version") || (std::string(argv[1]) == "--version") || (std::string(argv[1]) == "--version-only") || (std::string(argv[1]) == "-v")) { std::cout << "Delly version: v" << dellyVersionNumber << std::endl; std::cout << " using Boost: v" << BOOST_VERSION / 100000 << "." << BOOST_VERSION / 100 % 1000 << "." 
<< BOOST_VERSION % 100 << std::endl; std::cout << " using HTSlib: v" << hts_version() << std::endl; return 0; } else if ((std::string(argv[1]) == "help") || (std::string(argv[1]) == "--help") || (std::string(argv[1]) == "-h") || (std::string(argv[1]) == "-?")) { printTitle("Delly"); displayUsage(); return 0; } else if ((std::string(argv[1]) == "warranty") || (std::string(argv[1]) == "--warranty") || (std::string(argv[1]) == "-w")) { displayWarranty(); return 0; } else if ((std::string(argv[1]) == "license") || (std::string(argv[1]) == "--license") || (std::string(argv[1]) == "-l")) { bsd(); return 0; } else if ((std::string(argv[1]) == "call")) { return delly(argc-1,argv+1); } else if ((std::string(argv[1]) == "lr")) { return tegua(argc-1,argv+1); } else if ((std::string(argv[1]) == "cnv")) { return coral(argc-1,argv+1); } else if ((std::string(argv[1]) == "classify")) { return classify(argc-1,argv+1); } else if ((std::string(argv[1]) == "filter")) { return filter(argc-1,argv+1); } else if ((std::string(argv[1]) == "merge")) { return merge(argc-1,argv+1); } std::cerr << "Unrecognized command " << std::string(argv[1]) << std::endl; return 1; } delly-0.9.1/src/delly.h000066400000000000000000000347611414764127700147330ustar00rootroot00000000000000#ifndef DELLY_H #define DELLY_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "version.h" #include "util.h" #include "bolog.h" #include "tags.h" #include "coverage.h" #include "msa.h" #include "split.h" #include "shortpe.h" #include "modvcf.h" #include #include #include #include #include namespace torali { // Config arguments struct Config { bool islr; uint16_t minMapQual; uint16_t minTraQual; uint16_t minGenoQual; uint16_t madCutoff; uint16_t madNormalCutoff; int32_t nchr; int32_t minimumFlankSize; int32_t indelsize; uint32_t graphPruning; uint32_t 
minRefSep; uint32_t maxReadSep; uint32_t minClip; uint32_t maxGenoReadCount; uint32_t minCliqueSize; float flankQuality; bool hasExcludeFile; bool hasVcfFile; bool isHaplotagged; bool hasDumpFile; bool svtcmd; std::set svtset; DnaScore aliscore; boost::filesystem::path outfile; boost::filesystem::path vcffile; boost::filesystem::path genome; boost::filesystem::path exclude; boost::filesystem::path dumpfile; std::vector files; std::vector sampleName; }; template inline int dellyRun(TConfigStruct& c) { #ifdef PROFILE ProfilerStart("delly.prof"); #endif // Collect all promising structural variants typedef std::vector TVariants; TVariants svs; // Open header samFile* samfile = sam_open(c.files[0].string().c_str(), "r"); bam_hdr_t* hdr = sam_hdr_read(samfile); // Exclude intervals typedef boost::icl::interval_set TChrIntervals; typedef std::vector TRegionsGenome; TRegionsGenome validRegions; if (!_parseExcludeIntervals(c, hdr, validRegions)) { std::cerr << "Delly couldn't parse exclude intervals!" 
<< std::endl; bam_hdr_destroy(hdr); sam_close(samfile); return 1; } // Debug code //for(int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { //for(typename TChrIntervals::const_iterator vRIt = validRegions[refIndex].begin(); vRIt != validRegions[refIndex].end(); ++vRIt) { //std::cerr << std::string(hdr->target_name[refIndex]) << "\t" << vRIt->lower() << "\t" << vRIt->upper() << std::endl; //} //} // Create library objects typedef std::vector TSampleLibrary; TSampleLibrary sampleLib(c.files.size(), LibraryInfo()); getLibraryParams(c, validRegions, sampleLib); for(uint32_t i = 0; i TVariants; TVariants srSVs; // SR Store { typedef std::pair TPosRead; typedef boost::unordered_map TPosReadSV; typedef std::vector TGenomicPosReadSV; TGenomicPosReadSV srStore(c.nchr, TPosReadSV()); scanPEandSR(c, validRegions, svs, srSVs, srStore, sampleLib); // Assemble split-read calls assembleSplitReads(c, validRegions, srStore, srSVs); } // Sort and merge PE and SR calls mergeSort(svs, srSVs); } else vcfParse(c, hdr, svs); // Clean-up bam_hdr_destroy(hdr); sam_close(samfile); // Re-number SVs sort(svs.begin(), svs.end(), SortSVs()); uint32_t cliqueCount = 0; for(typename TVariants::iterator svIt = svs.begin(); svIt != svs.end(); ++svIt, ++cliqueCount) svIt->id = cliqueCount; // Annotate junction reads typedef std::vector TSVJunctionMap; typedef std::vector TSampleSVJunctionMap; TSampleSVJunctionMap jctMap; // Annotate spanning coverage typedef std::vector TSVSpanningMap; typedef std::vector TSampleSVSpanningMap; TSampleSVSpanningMap spanMap; // Annotate coverage typedef std::vector TSVReadCount; typedef std::vector TSampleSVReadCount; TSampleSVReadCount rcMap; // SV Genotyping if (!svs.empty()) annotateCoverage(c, sampleLib, svs, rcMap, jctMap, spanMap); // VCF output vcfOutput(c, svs, jctMap, rcMap, spanMap); // Output library statistics boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) 
<< "] " << "Sample statistics" << std::endl; for(uint32_t file_c = 0; file_c < c.files.size(); ++file_c) { std::cout << "Sample:" << c.sampleName[file_c] << ",ReadSize=" << sampleLib[file_c].rs << ",Median=" << sampleLib[file_c].median << ",MAD=" << sampleLib[file_c].mad << ",UniqueDiscordantPairs=" << sampleLib[file_c].abnormal_pairs << std::endl; } #ifdef PROFILE ProfilerStop(); #endif // End now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;; return 0; } int delly(int argc, char **argv) { Config c; c.isHaplotagged = false; c.madNormalCutoff = 5; c.islr = false; // Define generic options std::string svtype; boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") ("svtype,t", boost::program_options::value(&svtype)->default_value("ALL"), "SV type to compute [DEL, INS, DUP, INV, BND, ALL]") ("genome,g", boost::program_options::value(&c.genome), "genome fasta file") ("exclude,x", boost::program_options::value(&c.exclude), "file with regions to exclude") ("outfile,o", boost::program_options::value(&c.outfile)->default_value("sv.bcf"), "SV BCF output file") ; boost::program_options::options_description disc("Discovery options"); disc.add_options() ("map-qual,q", boost::program_options::value(&c.minMapQual)->default_value(1), "min. paired-end (PE) mapping quality") ("qual-tra,r", boost::program_options::value(&c.minTraQual)->default_value(20), "min. PE quality for translocation") ("mad-cutoff,s", boost::program_options::value(&c.madCutoff)->default_value(9), "insert size cutoff, median+s*MAD (deletions only)") ("minclip,c", boost::program_options::value(&c.minClip)->default_value(25), "min. clipping length") ("min-clique-size,z", boost::program_options::value(&c.minCliqueSize)->default_value(2), "min. PE/SR clique size") ("minrefsep,m", boost::program_options::value(&c.minRefSep)->default_value(25), "min. 
reference separation") ("maxreadsep,n", boost::program_options::value(&c.maxReadSep)->default_value(40), "max. read separation") ; boost::program_options::options_description geno("Genotyping options"); geno.add_options() ("vcffile,v", boost::program_options::value(&c.vcffile), "input VCF/BCF file for genotyping") ("geno-qual,u", boost::program_options::value(&c.minGenoQual)->default_value(5), "min. mapping quality for genotyping") ("dump,d", boost::program_options::value(&c.dumpfile), "gzipped output file for SV-reads (optional)") ; // Define hidden options boost::program_options::options_description hidden("Hidden options"); hidden.add_options() ("input-file", boost::program_options::value< std::vector >(&c.files), "input file") ("pruning,j", boost::program_options::value(&c.graphPruning)->default_value(1000), "PE graph pruning cutoff") ("max-geno-count,a", boost::program_options::value(&c.maxGenoReadCount)->default_value(250), "max. number of reads aligned for SR genotyping") ; boost::program_options::positional_options_description pos_args; pos_args.add("input-file", -1); // Set the visibility boost::program_options::options_description cmdline_options; cmdline_options.add(generic).add(disc).add(geno).add(hidden); boost::program_options::options_description visible_options; visible_options.add(generic).add(disc).add(geno); boost::program_options::variables_map vm; boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); boost::program_options::notify(vm); // Check command line arguments if ((vm.count("help")) || (!vm.count("input-file")) || (!vm.count("genome"))) { std::cout << std::endl; std::cout << "Usage: delly " << argv[0] << " [OPTIONS] -g ..." << std::endl; std::cout << visible_options << "\n"; return 0; } // SV types to compute? _svTypesToCompute(c, svtype, vm.count("svtype")); // Dump PE and SR support? 
if (vm.count("dump")) c.hasDumpFile = true; else c.hasDumpFile = false; // Clique size if (c.minCliqueSize < 2) c.minCliqueSize = 2; // Check quality cuts if (c.minMapQual > c.minTraQual) c.minTraQual = c.minMapQual; // Check reference if (!(boost::filesystem::exists(c.genome) && boost::filesystem::is_regular_file(c.genome) && boost::filesystem::file_size(c.genome))) { std::cerr << "Reference file is missing: " << c.genome.string() << std::endl; return 1; } else { faidx_t* fai = fai_load(c.genome.string().c_str()); if (fai == NULL) { if (fai_build(c.genome.string().c_str()) == -1) { std::cerr << "Fail to open genome fai index for " << c.genome.string() << std::endl; return 1; } else fai = fai_load(c.genome.string().c_str()); } fai_destroy(fai); } // Check input files c.sampleName.resize(c.files.size()); c.nchr = 0; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { if (!(boost::filesystem::exists(c.files[file_c]) && boost::filesystem::is_regular_file(c.files[file_c]) && boost::filesystem::file_size(c.files[file_c]))) { std::cerr << "Alignment file is missing: " << c.files[file_c].string() << std::endl; return 1; } samFile* samfile = sam_open(c.files[file_c].string().c_str(), "r"); if (samfile == NULL) { std::cerr << "Fail to open file " << c.files[file_c].string() << std::endl; return 1; } hts_idx_t* idx = sam_index_load(samfile, c.files[file_c].string().c_str()); if (idx == NULL) { std::cerr << "Fail to open index for " << c.files[file_c].string() << std::endl; return 1; } bam_hdr_t* hdr = sam_hdr_read(samfile); if (hdr == NULL) { std::cerr << "Fail to open header for " << c.files[file_c].string() << std::endl; return 1; } if (!c.nchr) c.nchr = hdr->n_targets; else { if (c.nchr != hdr->n_targets) { std::cerr << "BAM files have different number of chromosomes!" 
<< std::endl; return 1; } } faidx_t* fai = fai_load(c.genome.string().c_str()); for(int32_t refIndex=0; refIndex < hdr->n_targets; ++refIndex) { std::string tname(hdr->target_name[refIndex]); if (!faidx_has_seq(fai, tname.c_str())) { std::cerr << "BAM file chromosome " << hdr->target_name[refIndex] << " is NOT present in your reference file " << c.genome.string() << std::endl; return 1; } } fai_destroy(fai); std::string sampleName = "unknown"; getSMTag(std::string(hdr->text), c.files[file_c].stem().string(), sampleName); c.sampleName[file_c] = sampleName; bam_hdr_destroy(hdr); hts_idx_destroy(idx); sam_close(samfile); } // Check exclude file if (vm.count("exclude")) { if (!(boost::filesystem::exists(c.exclude) && boost::filesystem::is_regular_file(c.exclude) && boost::filesystem::file_size(c.exclude))) { std::cerr << "Exclude file is missing: " << c.exclude.string() << std::endl; return 1; } c.hasExcludeFile = true; } else c.hasExcludeFile = false; // Check input VCF file if (vm.count("vcffile")) { if (!(boost::filesystem::exists(c.vcffile) && boost::filesystem::is_regular_file(c.vcffile) && boost::filesystem::file_size(c.vcffile))) { std::cerr << "Input VCF/BCF file is missing: " << c.vcffile.string() << std::endl; return 1; } htsFile* ifile = bcf_open(c.vcffile.string().c_str(), "r"); if (ifile == NULL) { std::cerr << "Fail to open file " << c.vcffile.string() << std::endl; return 1; } bcf_hdr_t* hdr = bcf_hdr_read(ifile); if (hdr == NULL) { std::cerr << "Fail to open index file " << c.vcffile.string() << std::endl; return 1; } bcf_hdr_destroy(hdr); bcf_close(ifile); c.hasVcfFile = true; } else c.hasVcfFile = false; // Check output directory if (!_outfileValid(c.outfile)) return 1; // Show cmd boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] "; std::cout << "delly "; for(int i=0; i(5, -4, -10, -1); c.flankQuality = 0.95; c.minimumFlankSize = 13; c.indelsize = 500; 
return dellyRun(c); } } #endif delly-0.9.1/src/dpe.cpp000066400000000000000000000333011414764127700147120ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef OPENMP #include #endif #include "tags.h" #include "coverage.h" #include "version.h" #include "util.h" #include "modvcf.h" using namespace torali; struct DoublePEConfig { int32_t wiggle; int32_t svsize; float carconc; boost::filesystem::path outfile; boost::filesystem::path infile; }; struct SVCarrier { typedef boost::dynamic_bitset<> TBitSet; int32_t start; int32_t end; std::string id; TBitSet carrier; SVCarrier(int32_t s, int32_t e, std::string i, TBitSet c) : start(s), end(e), id(i), carrier(c) {} }; struct DPERecord { int32_t start1; int32_t end1; int32_t start2; int32_t end2; float carconc; std::string id1; std::string id2; DPERecord(int32_t s1, int32_t e1, int32_t s2, int32_t e2, float cc, std::string i1, std::string i2) : start1(s1), end1(e1), start2(s2), end2(e2), carconc(cc), id1(i1), id2(i2) {} }; inline int dpeRun(DoublePEConfig const& c) { // Open BCF file htsFile* ifile = bcf_open(c.infile.string().c_str(), "r"); hts_idx_t* bcfidx = bcf_index_load(c.infile.string().c_str()); bcf_hdr_t* hdr = bcf_hdr_read(ifile); // Read BCF file int32_t nsvend = 0; int32_t* svend = NULL; int32_t nsvt = 0; char* svt = NULL; int32_t nchr2 = 0; char* chr2 = NULL; int32_t nct = 0; char* ct = NULL; int ngt = 0; int32_t* gt = NULL; // Get sequences int32_t nseq = 0; const char** seqnames = bcf_hdr_seqnames(hdr, &nseq); boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Searching complex SVs" << std::endl; boost::progress_display show_progress( nseq ); // Open output file htsFile *ofile = 
hts_open(c.outfile.string().c_str(), "wb"); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "LINKID"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "REGION"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "REGION1"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "REGION2"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "REGION3"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "CARCONC"); bcf_hdr_append(hdr_out, "##INFO="); if (bcf_hdr_write(ofile, hdr_out) != 0) std::cerr << "Error: Failed to write BCF header!" << std::endl; // Parse BCF for(int32_t refIndex = 0; refIndex < nseq; ++refIndex) { ++show_progress; // Fetch SVs on this chromosome int32_t maxCTs = 5; typedef std::vector TSVCarrier; typedef std::vector TCTs; TCTs cts(maxCTs); hts_itr_t* itervcf = bcf_itr_querys(bcfidx, hdr, bcf_hdr_id2name(hdr, refIndex)); bcf1_t* rec = bcf_init(); while (bcf_itr_next(ifile, itervcf, rec) >= 0) { // Fetch info bcf_unpack(rec, BCF_UN_ALL); bcf_get_format_int32(hdr, rec, "GT", >, &ngt); bcf_get_info_int32(hdr, rec, "END", &svend, &nsvend); bcf_get_info_string(hdr, rec, "SVTYPE", &svt, &nsvt); std::string chr2Name("NA"); if (bcf_get_info_string(hdr, rec, "CHR2", &chr2, &nchr2) > 0) chr2Name = std::string(chr2); uint8_t ict = 0; if (bcf_get_info_string(hdr, rec, "CT", &ct, &nct) > 0) ict = _decodeOrientation(std::string(ct)); // Fetch carriers if ((*svend - rec->pos) < c.svsize) { SVCarrier::TBitSet car(bcf_hdr_nsamples(hdr)); for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { if ((bcf_gt_allele(gt[i*2]) != -1) && (bcf_gt_allele(gt[i*2 + 1]) != -1)) { int gt_type = bcf_gt_allele(gt[i*2]) + bcf_gt_allele(gt[i*2 + 1]); if (gt_type > 0) car[i] = true; } } cts[(int32_t) ict].push_back(SVCarrier(rec->pos, *svend, rec->d.id, car)); } } bcf_destroy(rec); hts_itr_destroy(itervcf); // Process SVs typedef std::vector Tdper; 
Tdper dper; typedef std::set TSvIds; TSvIds svIds; for(int32_t i = 0; i TJPCount; TJPCount jpc(cts[j].size(), 0); typedef std::map, float> TScoreMap; TScoreMap scm; for(int32_t ip = 0; ip < (int32_t) cts[i].size(); ++ip) { int32_t bestJP = -1; float bestCC = -1; for(int32_t jp = 0; jp < (int32_t) cts[j].size(); ++jp) { if ((cts[i][ip].end < cts[j][jp].start) || (cts[j][jp].end < cts[i][ip].start)) continue; if (((cts[i][ip].start - c.wiggle < cts[j][jp].start) && (cts[j][jp].start < cts[i][ip].end) && (cts[i][ip].end - c.wiggle < cts[j][jp].end)) || ((cts[j][jp].start - c.wiggle < cts[i][ip].start) && (cts[i][ip].start < cts[j][jp].end) && (cts[j][jp].end - c.wiggle < cts[i][ip].end))) { int32_t common = (cts[i][ip].carrier & cts[j][jp].carrier).count(); int32_t all = (cts[i][ip].carrier | cts[j][jp].carrier).count(); float cc = 0; if (all > 0) cc = (float) common / (float) all; if ((cc >= c.carconc) && (cc > bestCC)) { bestJP = jp; bestCC = cc; } } } if (bestJP >= 0) { scm.insert(std::make_pair(std::make_pair(ip, bestJP), bestCC)); ++jpc[bestJP]; } } for(TScoreMap::iterator scmIt = scm.begin(); scmIt != scm.end(); ++scmIt) { int32_t ip = scmIt->first.first; int32_t jp = scmIt->first.second; float cc = scmIt->second; bool savePair = true; if (jpc[jp] > 1) { for(TScoreMap::iterator scmSec = scm.begin(); scmSec != scm.end(); ++scmSec) { if ((ip != scmSec->first.first) && (jp == scmSec->first.second)) { if ((cc < scmSec->second) || ((cc == scmSec->second) && (ip > scmSec->first.first))) { savePair = false; break; } } } } if (savePair) { dper.push_back(DPERecord(cts[i][ip].start, cts[i][ip].end, cts[j][jp].start, cts[j][jp].end, cc, cts[i][ip].id, cts[j][jp].id)); if (!svIds.insert(cts[i][ip].id).second) std::cerr << "SV already exists!" << std::endl; if (!svIds.insert(cts[j][jp].id).second) std::cerr << "SV already exists!" 
<< std::endl; } } } } } } hts_itr_t* ivcf = bcf_itr_querys(bcfidx, hdr, bcf_hdr_id2name(hdr, refIndex)); bcf1_t* r = bcf_init(); while (bcf_itr_next(ifile, ivcf, r) >= 0) { bcf_unpack(r, BCF_UN_ALL); bcf_get_info_int32(hdr, r, "END", &svend, &nsvend); std::string id = std::string(r->d.id); if (svIds.find(id) != svIds.end()) { // Find matching DPERecord for(int32_t i = 0; i < (int32_t) dper.size(); ++i) { if (((dper[i].id1 == id) && (dper[i].start1 == r->pos) && (dper[i].end1 == *svend)) || ((dper[i].id2 == id) && (dper[i].start2 == r->pos) && (dper[i].end2 == *svend))) { std::string linkid = dper[i].id1 + "," + dper[i].id2; _remove_info_tag(hdr_out, r, "LINKID"); bcf_update_info_string(hdr_out, r, "LINKID", linkid.c_str()); _remove_info_tag(hdr_out, r, "CARCONC"); bcf_update_info_float(hdr_out, r, "CARCONC", &dper[i].carconc, 1); std::string reg = bcf_hdr_id2name(hdr, refIndex); reg += "," + boost::lexical_cast(std::min(dper[i].start1 + 1, dper[i].start2 + 1)); reg += "," + boost::lexical_cast(std::max(dper[i].end1, dper[i].end2)); _remove_info_tag(hdr_out, r, "REGION"); bcf_update_info_string(hdr_out, r, "REGION", reg.c_str()); std::string reg1 = bcf_hdr_id2name(hdr, refIndex); reg1 += "," + boost::lexical_cast(std::min(dper[i].start1 + 1, dper[i].start2 + 1)); reg1 += "," + boost::lexical_cast(std::max(dper[i].start1 + 1, dper[i].start2 + 1)); _remove_info_tag(hdr_out, r, "REGION1"); bcf_update_info_string(hdr_out, r, "REGION1", reg1.c_str()); std::string reg2 = bcf_hdr_id2name(hdr, refIndex); reg2 += "," + boost::lexical_cast(std::max(dper[i].start1 + 1, dper[i].start2 + 1)); reg2 += "," + boost::lexical_cast(std::min(dper[i].end1, dper[i].end2)); _remove_info_tag(hdr_out, r, "REGION2"); bcf_update_info_string(hdr_out, r, "REGION2", reg2.c_str()); std::string reg3 = bcf_hdr_id2name(hdr, refIndex); reg3 += "," + boost::lexical_cast(std::min(dper[i].end1, dper[i].end2)); reg3 += "," + boost::lexical_cast(std::max(dper[i].end1, dper[i].end2)); 
_remove_info_tag(hdr_out, r, "REGION3"); bcf_update_info_string(hdr_out, r, "REGION3", reg3.c_str()); bcf_write1(ofile, hdr_out, r); } } } } bcf_destroy(r); hts_itr_destroy(ivcf); } if (nseq) free(seqnames); // Close output BCF bcf_hdr_destroy(hdr_out); hts_close(ofile); // Build index bcf_index_build(c.outfile.string().c_str(), 14); // Clean-up if (svend != NULL) free(svend); if (svt != NULL) free(svt); if (chr2 != NULL) free(chr2); if (ct != NULL) free(ct); if (gt != NULL) free(gt); // BCF clean-up bcf_hdr_destroy(hdr); hts_idx_destroy(bcfidx); bcf_close(ifile); now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;; return 0; } int main(int argc, char **argv) { DoublePEConfig c; c.wiggle = 150; // Define generic options boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") ("svsize,s", boost::program_options::value(&c.svsize)->default_value(50000), "max. SV size") ("carconc,c", boost::program_options::value(&c.carconc)->default_value(0.75), "min. 
carrier concordance") ("outfile,f", boost::program_options::value(&c.outfile)->default_value("complexSV.bcf"), "complex SV output file") ; // Define hidden options boost::program_options::options_description hidden("Hidden options"); hidden.add_options() ("input-file", boost::program_options::value(&c.infile), "input BCF file") ("license,l", "show license") ("warranty,w", "show warranty") ; boost::program_options::positional_options_description pos_args; pos_args.add("input-file", -1); // Set the visibility boost::program_options::options_description cmdline_options; cmdline_options.add(generic).add(hidden); boost::program_options::options_description visible_options; visible_options.add(generic); boost::program_options::variables_map vm; boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); boost::program_options::notify(vm); // Check command line arguments if ((vm.count("help")) || (!vm.count("input-file"))) { printTitle("Complex SVs using double paired-end signatures"); if (vm.count("warranty")) { displayWarranty(); } else if (vm.count("license")) { bsd(); } else { std::cout << "Usage: " << argv[0] << " [OPTIONS] " << std::endl; std::cout << visible_options << "\n"; } return 1; } // Check input VCF file if (vm.count("input-file")) { if (!(boost::filesystem::exists(c.infile) && boost::filesystem::is_regular_file(c.infile) && boost::filesystem::file_size(c.infile))) { std::cerr << "Input BCF file is missing: " << c.infile.string() << std::endl; return 1; } htsFile* ifile = bcf_open(c.infile.string().c_str(), "r"); if (ifile == NULL) { std::cerr << "Fail to open file " << c.infile.string() << std::endl; return 1; } hts_idx_t* bcfidx = bcf_index_load(c.infile.string().c_str()); if (bcfidx == NULL) { std::cerr << "Fail to open index file for " << c.infile.string() << std::endl; return 1; } bcf_hdr_t* hdr = bcf_hdr_read(ifile); if (hdr == NULL) { std::cerr << "Fail to header for 
" << c.infile.string() << std::endl; return 1; } bcf_hdr_destroy(hdr); hts_idx_destroy(bcfidx); bcf_close(ifile); } // Show cmd boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] "; for(int i=0; i #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "tags.h" #include "version.h" #include "util.h" #include "modvcf.h" namespace torali { struct FilterConfig { bool filterForPass; bool hasSampleFile; int32_t minsize; int32_t maxsize; int32_t coverage; float ratiogeno; float altaf; float controlcont; float gq; float rddel; float rddup; std::string filter; std::set tumorSet; std::set controlSet; boost::filesystem::path outfile; boost::filesystem::path samplefile; boost::filesystem::path vcffile; }; template inline int filterRun(TFilterConfig const& c) { // Load bcf file htsFile* ifile = hts_open(c.vcffile.string().c_str(), "r"); bcf_hdr_t* hdr = bcf_hdr_read(ifile); // Open output VCF file htsFile *ofile = hts_open(c.outfile.string().c_str(), "wb"); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); if (c.filter == "somatic") { bcf_hdr_remove(hdr_out, BCF_HL_INFO, "RDRATIO"); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_remove(hdr_out, BCF_HL_INFO, "SOMATIC"); bcf_hdr_append(hdr_out, "##INFO="); } else if (c.filter == "germline") { bcf_hdr_remove(hdr_out, BCF_HL_INFO, "RDRATIO"); bcf_hdr_append(hdr_out, "##INFO="); } if (bcf_hdr_write(ofile, hdr_out) != 0) std::cerr << "Error: Failed to write BCF header!" 
<< std::endl; // VCF fields int32_t nsvend = 0; int32_t* svend = NULL; int32_t nsvt = 0; char* svt = NULL; int32_t ninslen = 0; int32_t* inslen = NULL; int ngt = 0; int32_t* gt = NULL; int ngq = 0; int32_t* gq = NULL; float* gqf = NULL; int nrc = 0; int32_t* rc = NULL; int nrcl = 0; int32_t* rcl = NULL; int nrcr = 0; int32_t* rcr = NULL; int ndv = 0; int32_t* dv = NULL; int ndr = 0; int32_t* dr = NULL; int nrv = 0; int32_t* rv = NULL; int nrr = 0; int32_t* rr = NULL; bool germline = false; if (c.filter == "germline") germline = true; // Parse BCF boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Filtering VCF/BCF file" << std::endl; bcf1_t* rec = bcf_init1(); while (bcf_read(ifile, hdr, rec) == 0) { bcf_unpack(rec, BCF_UN_INFO); // Check SV type bcf_get_info_string(hdr, rec, "SVTYPE", &svt, &nsvt); // Check size and PASS bcf_get_info_int32(hdr, rec, "END", &svend, &nsvend); bool pass = true; if (c.filterForPass) pass = (bcf_has_filter(hdr, rec, const_cast("PASS"))==1); int32_t svlen = 1; if (svend != NULL) svlen = *svend - rec->pos; int32_t inslenVal = 0; if (bcf_get_info_int32(hdr, rec, "INSLEN", &inslen, &ninslen) > 0) inslenVal = *inslen; if ((pass) && ((std::string(svt) == "BND") || ((std::string(svt) == "INS") && (inslenVal >= c.minsize) && (inslenVal <= c.maxsize)) || ((std::string(svt) != "BND") && (std::string(svt) != "INS") && (svlen >= c.minsize) && (svlen <= c.maxsize)))) { // Check genotypes bcf_unpack(rec, BCF_UN_ALL); bool precise = false; if (bcf_get_info_flag(hdr, rec, "PRECISE", 0, 0) > 0) precise = true; bcf_get_format_int32(hdr, rec, "GT", >, &ngt); if (_getFormatType(hdr, "GQ") == BCF_HT_INT) bcf_get_format_int32(hdr, rec, "GQ", &gq, &ngq); else if (_getFormatType(hdr, "GQ") == BCF_HT_REAL) bcf_get_format_float(hdr, rec, "GQ", &gqf, &ngq); bcf_get_format_int32(hdr, rec, "RC", &rc, &nrc); if (_isKeyPresent(hdr, "RCL")) bcf_get_format_int32(hdr, 
rec, "RCL", &rcl, &nrcl); if (_isKeyPresent(hdr, "RCR")) bcf_get_format_int32(hdr, rec, "RCR", &rcr, &nrcr); bcf_get_format_int32(hdr, rec, "DV", &dv, &ndv); bcf_get_format_int32(hdr, rec, "DR", &dr, &ndr); bcf_get_format_int32(hdr, rec, "RV", &rv, &nrv); bcf_get_format_int32(hdr, rec, "RR", &rr, &nrr); std::vector rcraw; std::vector rcControl; std::vector rcTumor; std::vector rcAlt; std::vector rRefVar; std::vector rAltVar; std::vector gqRef; std::vector gqAlt; uint32_t nCount = 0; uint32_t tCount = 0; uint32_t controlpass = 0; uint32_t tumorpass = 0; int32_t ac[2]; ac[0] = 0; ac[1] = 0; for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i) { if ((bcf_gt_allele(gt[i*2]) != -1) && (bcf_gt_allele(gt[i*2 + 1]) != -1)) { int gt_type = bcf_gt_allele(gt[i*2]) + bcf_gt_allele(gt[i*2 + 1]); ++ac[bcf_gt_allele(gt[i*2])]; ++ac[bcf_gt_allele(gt[i*2 + 1])]; if ((germline) || (c.controlSet.find(hdr->samples[i]) != c.controlSet.end())) { // Control or population genomics ++nCount; if (gt_type == 0) { rcraw.push_back(rc[i]); if (_getFormatType(hdr, "GQ") == BCF_HT_INT) gqRef.push_back(gq[i]); else if (_getFormatType(hdr, "GQ") == BCF_HT_REAL) gqRef.push_back(gqf[i]); if ((rcl != NULL) && (rcr != NULL) && (rcl[i] + rcr[i] != 0)) rcControl.push_back((float) rc[i] / ((float) (rcl[i] + rcr[i]))); else rcControl.push_back(rc[i]); float rVar = 0; if (!precise) rVar = (float) dv[i] / (float) (dr[i] + dv[i]); else rVar = (float) rv[i] / (float) (rr[i] + rv[i]); rRefVar.push_back(rVar); if (rVar <= c.controlcont) ++controlpass; } else if ((germline) && (gt_type >= 1)) { if (_getFormatType(hdr, "GQ") == BCF_HT_INT) gqAlt.push_back(gq[i]); else if (_getFormatType(hdr, "GQ") == BCF_HT_REAL) gqAlt.push_back(gqf[i]); if ((rcl != NULL) && (rcr != NULL) && (rcl[i] + rcr[i] != 0)) rcAlt.push_back((float) rc[i] / ((float) (rcl[i] + rcr[i]))); else rcAlt.push_back(rc[i]); float rVar = 0; if (!precise) rVar = (float) dv[i] / (float) (dr[i] + dv[i]); else rVar = (float) rv[i] / (float) (rr[i] + rv[i]); 
rAltVar.push_back(rVar); } } else if ((!germline) && (c.tumorSet.find(hdr->samples[i]) != c.tumorSet.end())) { // Tumor ++tCount; if ((rcl != NULL) && (rcr != NULL) && (rcl[i] + rcr[i] != 0)) rcTumor.push_back((float) rc[i] / ((float) (rcl[i] + rcr[i]))); else rcTumor.push_back(rc[i]); if (!precise) { if ((((float) dv[i] / (float) (dr[i] + dv[i])) >= c.altaf) && (dr[i] + dv[i] >= c.coverage)) ++tumorpass; } else { if ((((float) rv[i] / (float) (rr[i] + rv[i])) >= c.altaf) && (rr[i] + rv[i] >= c.coverage)) ++tumorpass; } } } } if (c.filter == "somatic") { float genotypeRatio = (float) (nCount + tCount) / (float) (c.controlSet.size() + c.tumorSet.size()); if ((controlpass) && (tumorpass) && (controlpass == nCount) && (genotypeRatio >= c.ratiogeno)) { float rccontrolmed = 0; getMedian(rcControl.begin(), rcControl.end(), rccontrolmed); float rctumormed = 0; getMedian(rcTumor.begin(), rcTumor.end(), rctumormed); float rdRatio = 1; if (rccontrolmed != 0) rdRatio = rctumormed/rccontrolmed; _remove_info_tag(hdr_out, rec, "RDRATIO"); bcf_update_info_float(hdr_out, rec, "RDRATIO", &rdRatio, 1); _remove_info_tag(hdr_out, rec, "SOMATIC"); bcf_update_info_flag(hdr_out, rec, "SOMATIC", NULL, 1); bcf_write1(ofile, hdr_out, rec); } } else if (c.filter == "germline") { float genotypeRatio = (float) (nCount + tCount) / (float) (bcf_hdr_nsamples(hdr)); float rrefvarpercentile = 0; if (!rRefVar.empty()) getPercentile(rRefVar, 0.9, rrefvarpercentile); float raltvarmed = 0; if (!rAltVar.empty()) getMedian(rAltVar.begin(), rAltVar.end(), raltvarmed); float rccontrolmed = 0; if (!rcControl.empty()) getMedian(rcControl.begin(), rcControl.end(), rccontrolmed); float rcaltmed = 0; if (!rcAlt.empty()) getMedian(rcAlt.begin(), rcAlt.end(), rcaltmed); float rdRatio = 1; if (rccontrolmed != 0) rdRatio = rcaltmed/rccontrolmed; float gqaltmed = 0; if (!gqAlt.empty()) getMedian(gqAlt.begin(), gqAlt.end(), gqaltmed); float gqrefmed = 0; if (!gqRef.empty()) getMedian(gqRef.begin(), gqRef.end(), 
gqrefmed); float af = (float) ac[1] / (float) (ac[0] + ac[1]); //std::cerr << bcf_hdr_id2name(hdr, rec->rid) << '\t' << (rec->pos + 1) << '\t' << *svend << '\t' << rec->d.id << '\t' << svlen << '\t' << ac[1] << '\t' << af << '\t' << genotypeRatio << '\t' << std::string(svt) << '\t' << precise << '\t' << rrefvarpercentile << '\t' << raltvarmed << '\t' << gqrefmed << '\t' << gqaltmed << '\t' << rdRatio << std::endl; if ((af>0) && (gqaltmed >= c.gq) && (gqrefmed >= c.gq) && (raltvarmed >= c.altaf) && (genotypeRatio >= c.ratiogeno)) { if ((std::string(svt)=="DEL") && (rdRatio > c.rddel)) continue; if ((std::string(svt)=="DUP") && (rdRatio < c.rddup)) continue; if ((std::string(svt)!="DEL") && (std::string(svt)!="DUP") && (rrefvarpercentile > 0)) continue; _remove_info_tag(hdr_out, rec, "RDRATIO"); bcf_update_info_float(hdr_out, rec, "RDRATIO", &rdRatio, 1); bcf_write1(ofile, hdr_out, rec); } } } } bcf_destroy(rec); // Clean-up if (svend != NULL) free(svend); if (svt != NULL) free(svt); if (inslen != NULL) free(inslen); if (gt != NULL) free(gt); if (gq != NULL) free(gq); if (gqf != NULL) free(gqf); if (rc != NULL) free(rc); if (rcl != NULL) free(rcl); if (rcr != NULL) free(rcr); if (dv != NULL) free(dv); if (dr != NULL) free(dr); if (rv != NULL) free(rv); if (rr != NULL) free(rr); // Close output VCF bcf_hdr_destroy(hdr_out); hts_close(ofile); // Build index bcf_index_build(c.outfile.string().c_str(), 14); // Close VCF bcf_hdr_destroy(hdr); bcf_close(ifile); // End now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." 
<< std::endl; return 0; } int filter(int argc, char **argv) { FilterConfig c; // Define generic options boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") ("filter,f", boost::program_options::value(&c.filter)->default_value("somatic"), "Filter mode (somatic, germline)") ("outfile,o", boost::program_options::value(&c.outfile)->default_value("sv.bcf"), "Filtered SV BCF output file") ("altaf,a", boost::program_options::value(&c.altaf)->default_value(0.2), "min. fractional ALT support") ("minsize,m", boost::program_options::value(&c.minsize)->default_value(0), "min. SV size") ("maxsize,n", boost::program_options::value(&c.maxsize)->default_value(500000000), "max. SV size") ("ratiogeno,r", boost::program_options::value(&c.ratiogeno)->default_value(0.75), "min. fraction of genotyped samples") ("pass,p", "Filter sites for PASS") ; // Define somatic options boost::program_options::options_description somatic("Somatic options"); somatic.add_options() ("samples,s", boost::program_options::value(&c.samplefile), "Two-column sample file listing sample name and tumor or control") ("coverage,v", boost::program_options::value(&c.coverage)->default_value(10), "min. coverage in tumor") ("controlcontamination,c", boost::program_options::value(&c.controlcont)->default_value(0.0), "max. fractional ALT support in control") ; // Define germline options boost::program_options::options_description germline("Germline options"); germline.add_options() ("gq,q", boost::program_options::value(&c.gq)->default_value(15), "min. median GQ for carriers and non-carriers") ("rddel,e", boost::program_options::value(&c.rddel)->default_value(0.8), "max. read-depth ratio of carrier vs. non-carrier for a deletion") ("rddup,u", boost::program_options::value(&c.rddup)->default_value(1.2), "min. read-depth ratio of carrier vs. 
non-carrier for a duplication") ; // Define hidden options boost::program_options::options_description hidden("Hidden options"); hidden.add_options() ("input-file", boost::program_options::value(&c.vcffile), "input file") ; boost::program_options::positional_options_description pos_args; pos_args.add("input-file", -1); // Set the visibility boost::program_options::options_description cmdline_options; cmdline_options.add(generic).add(somatic).add(germline).add(hidden); boost::program_options::options_description visible_options; visible_options.add(generic).add(somatic).add(germline); boost::program_options::variables_map vm; boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); boost::program_options::notify(vm); // Check command line arguments if ((vm.count("help")) || (!vm.count("input-file"))) { std::cout << std::endl; std::cout << "Usage: delly " << argv[0] << " [OPTIONS] " << std::endl; std::cout << visible_options << "\n"; return 0; } // Filter for PASS if (vm.count("pass")) c.filterForPass = true; else c.filterForPass = false; // Population Genomics if (c.filter == "germline") c.controlcont = 1.0; // Check sample file std::set tSet; std::set cSet; if (c.filter == "somatic") { c.hasSampleFile = true; if (!(boost::filesystem::exists(c.samplefile) && boost::filesystem::is_regular_file(c.samplefile) && boost::filesystem::file_size(c.samplefile))) { std::cerr << "Sample file is missing " << c.samplefile.string() << std::endl; return 1; } else { // Get samples std::ifstream sampleFile(c.samplefile.string().c_str(), std::ifstream::in); if (sampleFile.is_open()) { while (sampleFile.good()) { std::string sampleFromFile; getline(sampleFile, sampleFromFile); typedef boost::tokenizer< boost::char_separator > Tokenizer; boost::char_separator sep(",\t "); Tokenizer tokens(sampleFromFile, sep); Tokenizer::iterator tokIter = tokens.begin(); if (tokIter != tokens.end()) { std::string 
sample = *tokIter++; if (tokIter != tokens.end()) { std::string type = *tokIter; if (type == "control") cSet.insert(sample); else if (type == "tumor") tSet.insert(sample); else { std::cerr << "Sample type for " << sample << " is neither tumor nor control" << std::endl; return 1; } } } } sampleFile.close(); } if (tSet.empty()) { std::cerr << "No tumor samples specified." << std::endl; return 1; } if (cSet.empty()) { std::cerr << "No control samples specified." << std::endl; return 1; } std::vector intersection; std::set_intersection(cSet.begin(), cSet.end(), tSet.begin(), tSet.end(), std::back_inserter(intersection)); if (!intersection.empty()) { std::cerr << "Sample " << intersection[0] << " is both a tumor and control sample." << std::endl; return 1; } } } else c.hasSampleFile = false; // Check input VCF file if (vm.count("input-file")) { if (!(boost::filesystem::exists(c.vcffile) && boost::filesystem::is_regular_file(c.vcffile) && boost::filesystem::file_size(c.vcffile))) { std::cerr << "Input VCF/BCF file is missing: " << c.vcffile.string() << std::endl; return 1; } htsFile* ifile = bcf_open(c.vcffile.string().c_str(), "r"); if (ifile == NULL) { std::cerr << "Fail to open file " << c.vcffile.string() << std::endl; return 1; } hts_idx_t* bcfidx = NULL; tbx_t* tbx = NULL; if (hts_get_format(ifile)->format==vcf) tbx = tbx_index_load(c.vcffile.string().c_str()); else bcfidx = bcf_index_load(c.vcffile.string().c_str()); if ((bcfidx == NULL) && (tbx == NULL)) { std::cerr << "Fail to open index file for " << c.vcffile.string() << std::endl; return 1; } bcf_hdr_t* hdr = bcf_hdr_read(ifile); if (hdr == NULL) { std::cerr << "Fail to header for " << c.vcffile.string() << std::endl; return 1; } if (!(bcf_hdr_nsamples(hdr)>0)) { std::cerr << "BCF/VCF file has no sample genotypes!" 
// Per-GC-bin accumulator used while estimating GC bias.
// Every member starts at zero.
struct GcBias {
  int32_t sample = 0;              // summed sample count for this GC bin
  int32_t reference = 0;           // number of reference positions in this GC bin
  double fractionSample = 0;       // sample / total sample count
  double fractionReference = 0;    // reference / total reference count
  double percentileSample = 0;     // cumulative sample fraction up to this bin
  double percentileReference = 0;  // cumulative reference fraction up to this bin
  double obsexp = 0;               // fractionSample / fractionReference
  double coverage = 0;             // coverage summary for this bin

  GcBias() = default;
};

// Determine the range of GC bins that lies outside the excluded tails.
//
// c       Configuration object; only c.exclgc is read (tail fraction excluded
//         on each side).
// gcbias  Per-bin statistics; percentileSample and percentileReference are
//         read, nothing is modified.
//
// Returns (lower, upper):
//   lower = index of the LAST bin whose sample or reference percentile is
//           below c.exclgc (0 if there is none),
//   upper = index of the FIRST bin whose sample or reference percentile plus
//           c.exclgc exceeds 1 (gcbias.size() if there is none).
// If the two bounds meet or cross, upper is forced to lower + 1.
template<typename TConfig>
inline std::pair<uint32_t, uint32_t>
gcBound(TConfig const& c, std::vector<GcBias>& gcbias) {
  uint32_t const nBins = static_cast<uint32_t>(gcbias.size());
  uint32_t lo = 0;
  uint32_t hi = nBins;

  for (uint32_t bin = 0; bin < nBins; ++bin) {
    GcBias const& g = gcbias[bin];

    bool const inLowerTail =
      (g.percentileSample < c.exclgc) || (g.percentileReference < c.exclgc);
    bool const inUpperTail =
      (g.percentileSample + c.exclgc > 1) || (g.percentileReference + c.exclgc > 1);

    // Lower bound keeps advancing; upper bound only ever moves down.
    if (inLowerTail) lo = bin;
    if (inUpperTail && (bin < hi)) hi = bin;
  }

  // Guarantee a well-ordered pair even for degenerate distributions.
  if (lo >= hi) hi = lo + 1;

  return std::make_pair(lo, hi);
}
std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Estimate GC bias" << std::endl; boost::progress_display show_progress( hdr->n_targets ); faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); faidx_t* faiRef = fai_load(c.genome.string().c_str()); for (int refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { ++show_progress; if (scanCounts[refIndex].empty()) continue; // Bin map std::vector binMap; if (c.hasScanFile) { // Fill bin map binMap.resize(hdr->target_len[refIndex], LAST_BIN); for(uint32_t bin = 0;((bin < scanCounts[refIndex].size()) && (bin < LAST_BIN)); ++bin) { for(int32_t k = scanCounts[refIndex][bin].start; k < scanCounts[refIndex][bin].end; ++k) binMap[k] = bin; } } // Check presence in mappability map std::string tname(hdr->target_name[refIndex]); int32_t seqlen = faidx_seq_len(faiMap, tname.c_str()); if (seqlen == - 1) continue; else seqlen = -1; char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen); // Check presence in reference seqlen = faidx_seq_len(faiRef, tname.c_str()); if (seqlen == - 1) continue; else seqlen = -1; char* ref = faidx_fetch_seq(faiRef, tname.c_str(), 0, faidx_seq_len(faiRef, tname.c_str()), &seqlen); // Get GC and Mappability std::vector uniqContent(hdr->target_len[refIndex], 0); std::vector gcContent(hdr->target_len[refIndex], 0); { // Mappability map typedef boost::dynamic_bitset<> TBitSet; TBitSet uniq(hdr->target_len[refIndex], false); for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { if (seq[i] == 'C') uniq[i] = true; } // GC map typedef boost::dynamic_bitset<> TBitSet; TBitSet gcref(hdr->target_len[refIndex], false); for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { if ((ref[i] == 'c') || (ref[i] == 'C') || (ref[i] == 'g') || (ref[i] == 'G')) gcref[i] = 1; } // Sum across fragments int32_t halfwin = (int32_t) (c.meanisize / 2); int32_t usum = 0; int32_t gcsum = 0; for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] 
- halfwin; ++pos) { if (pos == halfwin) { for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) { usum += uniq[i]; gcsum += gcref[i]; } } else { usum -= uniq[pos - halfwin - 1]; gcsum -= gcref[pos - halfwin - 1]; usum += uniq[pos + halfwin]; gcsum += gcref[pos + halfwin]; } gcContent[pos] = gcsum; uniqContent[pos] = usum; } } // Coverage track typedef uint16_t TCount; uint32_t maxCoverage = std::numeric_limits::max(); typedef std::vector TCoverage; TCoverage cov(hdr->target_len[refIndex], 0); // Mate map typedef boost::unordered_map TMateMap; TMateMap mateMap; // Parse BAM hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]); bam1_t* rec = bam_init1(); int32_t lastAlignedPos = 0; std::set lastAlignedPosReads; while (sam_itr_next(samfile, iter, rec) >= 0) { if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue; if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue; if (rec->core.qual < c.minQual) continue; int32_t midPoint = rec->core.pos + halfAlignmentLength(rec); if (rec->core.flag & BAM_FPAIRED) { // Clean-up the read store for identical alignment positions if (rec->core.pos > lastAlignedPos) { lastAlignedPosReads.clear(); lastAlignedPos = rec->core.pos; } // Process pair if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) { // First read lastAlignedPosReads.insert(hash_string(bam_get_qname(rec))); std::size_t hv = hash_pair(rec); mateMap[hv]= true; continue; } else { // Second read std::size_t hv = hash_pair_mate(rec); if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded mateMap[hv] = false; } // Insert size filter int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos; if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) { midPoint = 
rec->core.mpos + (int32_t) (isize/2); } else { if (rec->core.flag & BAM_FREVERSE) midPoint = rec->core.pos + alignmentLength(rec) - (c.meanisize / 2); else midPoint = rec->core.pos + (c.meanisize / 2); } } // Count fragment if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex]) && (cov[midPoint] < maxCoverage - 1)) ++cov[midPoint]; } bam_destroy1(rec); hts_itr_destroy(iter); if (seq != NULL) free(seq); if (ref != NULL) free(ref); // Summarize GC coverage for this chromosome for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { if (uniqContent[i] >= c.fragmentUnique * c.meanisize) { // Valid bin? int32_t bin = _findScanWindow(c, hdr->target_len[refIndex], binMap, i); if ((bin >= 0) && (scanCounts[refIndex][bin].select)) { ++gcbias[gcContent[i]].reference; gcbias[gcContent[i]].sample += cov[i]; gcbias[gcContent[i]].coverage += cov[i]; } } } } // Normalize GC coverage for(uint32_t i = 0; i < gcbias.size(); ++i) { if (gcbias[i].reference) gcbias[i].coverage /= (double) gcbias[i].reference; else gcbias[i].coverage = 0; } // Determine percentiles uint64_t totalSampleCount = 0; uint64_t totalReferenceCount = 0; for(uint32_t i = 0; i < gcbias.size(); ++i) { totalSampleCount += gcbias[i].sample; totalReferenceCount += gcbias[i].reference; } uint64_t cumSample = 0; uint64_t cumReference = 0; for(uint32_t i = 0; i < gcbias.size(); ++i) { cumSample += gcbias[i].sample; cumReference += gcbias[i].reference; gcbias[i].fractionSample = (double) gcbias[i].sample / (double) totalSampleCount; gcbias[i].fractionReference = (double) gcbias[i].reference / (double) totalReferenceCount; gcbias[i].percentileSample = (double) cumSample / (double) totalSampleCount; gcbias[i].percentileReference = (double) cumReference / (double) totalReferenceCount; gcbias[i].obsexp = 1; if (gcbias[i].fractionReference > 0) gcbias[i].obsexp = gcbias[i].fractionSample / gcbias[i].fractionReference; } // Estimate correctable GC range gcbound = gcBound(c, gcbias); // Adjust correction to 
// Genotyping probe description for one structural variant.
// All integer members start at -1 (meaning "not set"); ref and alt start empty.
struct Geno {
  int32_t svStartPrefix;
  int32_t svStartSuffix;
  int32_t svEndPrefix;
  int32_t svEndSuffix;
  int32_t svStart;
  int32_t svEnd;
  int32_t svt;
  std::string ref;
  std::string alt;

  Geno() : svStartPrefix(-1), svStartSuffix(-1), svEndPrefix(-1), svEndSuffix(-1), svStart(-1), svEnd(-1), svt(-1) {}
};

// Percent identity of a pairwise alignment on both sides of a split position.
//
// s1        First alignment row ('-' marks a gap). Positions (splitpos, window)
//           are counted in non-gap characters of this row.
// s2        Second alignment row ('-' marks a gap).
// splitpos  Split position, in non-gap characters of s1.
// window    Number of s1 characters evaluated on each side of splitpos.
//
// Returns the smaller of the left-flank identity (window start up to splitpos)
// and the right-flank identity (splitpos up to window end / end of alignment),
// each computed as matches / (matches + mismatches). A run of gap columns
// counts one mismatch per column, but only once it is closed by a non-gap
// column inside the same flank. Leading columns before both rows have shown
// a non-gap character are ignored.
// Returns -1 if splitpos is never reached while walking s1.
//
// The rows are expected to have equal length; if they differ, only the
// columns present in both rows are evaluated (the original indexed s2 up to
// s1.size(), reading out of bounds for a shorter s2).
inline float
percentIdentity(std::string const& s1, std::string const& s2, int32_t splitpos, int32_t window) {
  // Window boundaries in s1 (non-gap) coordinates
  int32_t ws = std::max(splitpos - window, 0);
  int32_t we = std::min(splitpos + window, (int32_t) s1.size());

  // Never index past the end of either row
  std::size_t const columns = std::min(s1.size(), s2.size());

  bool varSeen = false;   // s2 has shown a non-gap character
  bool refSeen = false;   // s1 has shown a non-gap character
  int32_t refpos = 0;     // number of non-gap s1 characters consumed so far
  uint32_t gapMM = 0;     // length of the currently open gap run
  uint32_t mm = 0;        // mismatches in the current flank
  uint32_t ma = 0;        // matches in the current flank
  float leftPerc = -1;
  float rightPerc = -1;
  bool inGap = false;
  for(std::size_t j = 0; j < columns; ++j) {
    if (s2[j] != '-') varSeen = true;
    if (s1[j] != '-') {
      refSeen = true;
      if ((refpos == splitpos) || (refpos == ws) || (refpos == we)) {
        // Flank boundary: report the finished flank, then restart counting
        if (refpos == splitpos) {
          leftPerc = 0;
          if (ma + mm > 0) leftPerc = (float) ma / (float) (ma + mm);
        }
        if (refpos == we) {
          rightPerc = 0;
          if (ma + mm > 0) rightPerc = (float) ma / (float) (ma + mm);
        }
        mm = 0;
        ma = 0;
        gapMM = 0;
      }
      ++refpos;
    }
    if ((refSeen) && (varSeen)) {
      // Internal gap?
      if ((s2[j] == '-') || (s1[j] == '-')) {
        if (!inGap) {
          inGap = true;
          gapMM = 0;
        }
        gapMM += 1;
      } else {
        if (inGap) {
          // Gap run closed: charge its length as mismatches
          mm += gapMM;
          inGap = false;
        }
        if (s2[j] == s1[j]) ma += 1;
        else mm += 1;
      }
    }
  }
  // Alignment ended before the window end was reached
  if (rightPerc == -1) {
    rightPerc = 0;
    if (ma + mm > 0) rightPerc = (float) ma / (float) (ma + mm);
  }
  return std::min(leftPerc, rightPerc);
}
boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "SV annotation" << std::endl; boost::progress_display show_progress( hdr[0]->n_targets ); // Ref aligned reads typedef std::vector TRefAlignCount; typedef std::vector TFileRefAlignCount; TFileRefAlignCount refAlignedReadCount(c.files.size(), TRefAlignCount()); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) refAlignedReadCount[file_c].resize(svs.size(), 0); // Coverage distribution typedef uint16_t TMaxCoverage; uint32_t maxCoverage = std::numeric_limits::max(); typedef std::vector TCovDist; typedef std::vector TSampleCovDist; TSampleCovDist covDist(c.files.size(), TCovDist()); for(uint32_t i = 0; i < c.files.size(); ++i) covDist[i].resize(maxCoverage, 0); // Error rates std::vector matchCount(c.files.size(), 0); std::vector mismatchCount(c.files.size(), 0); std::vector delCount(c.files.size(), 0); std::vector insCount(c.files.size(), 0); // Read length distribution typedef uint16_t TMaxReadLength; uint32_t maxReadLength = std::numeric_limits::max(); uint32_t rlBinSize = 100; typedef std::vector TReadLengthDist; typedef std::vector TSampleRLDist; TSampleRLDist rlDist(c.files.size(), TReadLengthDist()); for(uint32_t i = 0; i < c.files.size(); ++i) rlDist[i].resize(maxReadLength * rlBinSize, 0); // Dump file boost::iostreams::filtering_ostream dumpOut; if (c.hasDumpFile) { dumpOut.push(boost::iostreams::gzip_compressor()); dumpOut.push(boost::iostreams::file_sink(c.dumpfile.string().c_str(), std::ios_base::out | std::ios_base::binary)); dumpOut << "#svid\tbam\tqname\tchr\tpos\tmatechr\tmatepos\tmapq\ttype" << std::endl; } // Iterate chromosomes std::vector refProbes(svs.size()); faidx_t* fai = fai_load(c.genome.string().c_str()); for(int32_t refIndex=0; refIndex < (int32_t) hdr[0]->n_targets; ++refIndex) { ++show_progress; char* seq = NULL; // Reference and consensus probes for this chromosome typedef std::vector TGenoRegion; 
TGenoRegion gbp(svs.size(), Geno()); // Iterate all structural variants for(typename TSVs::iterator itSV = svs.begin(); itSV != svs.end(); ++itSV) { if ((itSV->chr != refIndex) && (itSV->chr2 != refIndex)) continue; if ((itSV->svt != 2) && (itSV->svt != 4)) continue; // Lazy loading of reference sequence if (seq == NULL) { int32_t seqlen = -1; std::string tname(hdr[0]->target_name[refIndex]); seq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr[0]->target_len[refIndex], &seqlen); } // Set tag alleles if (itSV->chr == refIndex) { itSV->alleles = _addAlleles(boost::to_upper_copy(std::string(seq + itSV->svStart - 1, seq + itSV->svStart)), std::string(hdr[0]->target_name[itSV->chr2]), *itSV, itSV->svt); } if (!itSV->precise) continue; // Get the reference sequence if ((itSV->chr != itSV->chr2) && (itSV->chr2 == refIndex)) { Breakpoint bp(*itSV); _initBreakpoint(hdr[0], bp, (int32_t) itSV->consensus.size(), itSV->svt); refProbes[itSV->id] = _getSVRef(seq, bp, refIndex, itSV->svt); } if (itSV->chr == refIndex) { Breakpoint bp(*itSV); if (_translocation(itSV->svt)) bp.part1 = refProbes[itSV->id]; if (itSV->svt ==4) { int32_t bufferSpace = std::max((int32_t) ((itSV->consensus.size() - itSV->insLen) / 3), c.minimumFlankSize); _initBreakpoint(hdr[0], bp, bufferSpace, itSV->svt); } else _initBreakpoint(hdr[0], bp, (int32_t) itSV->consensus.size(), itSV->svt); std::string svRefStr = _getSVRef(seq, bp, refIndex, itSV->svt); // Find breakpoint to reference TAlign align; if (!_consRefAlignment(itSV->consensus, svRefStr, align, itSV->svt)) continue; AlignDescriptor ad; if (!_findSplit(c, itSV->consensus, svRefStr, align, ad, itSV->svt)) continue; // Get exact alleles for INS and DEL if ((itSV->svt == 2) || (itSV->svt == 4)) { std::string refVCF; std::string altVCF; int32_t cpos = 0; bool inSV = false; for(uint32_t j = 0; jalleles = _addAlleles(refVCF, altVCF); } // Debug consensus to reference alignment //std::cerr << "svid:" << itSV->id << ",consensus-to-reference-alignment" << 
std::endl; //for(uint32_t i = 0; iid].svStartPrefix = std::max(ad.cStart - leadCrop, 0); gbp[itSV->id].svStartSuffix = std::max((int32_t) altSeq.size() - gbp[itSV->id].svStartPrefix, 0); gbp[itSV->id].svStart = itSV->svStart; if (itSV->chr2 == refIndex) { gbp[itSV->id].svEndPrefix = std::max(ad.cEnd - leadCrop, 0); gbp[itSV->id].svEndSuffix = std::max((int32_t) altSeq.size() - gbp[itSV->id].svEndPrefix, 0); gbp[itSV->id].svEnd = itSV->svEnd; } gbp[itSV->id].ref = refSeq; gbp[itSV->id].alt = altSeq; gbp[itSV->id].svt = itSV->svt; } } if (seq != NULL) free(seq); // Genotype // Iterate samples for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { // Check we have mapped reads on this chromosome bool nodata = true; std::string suffix("cram"); std::string str(c.files[file_c].string()); if ((str.size() >= suffix.size()) && (str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0)) nodata = false; uint64_t mapped = 0; uint64_t unmapped = 0; hts_idx_get_stat(idx[file_c], refIndex, &mapped, &unmapped); if (mapped) nodata = false; if (nodata) continue; // Coverage track typedef std::vector TBpCoverage; TBpCoverage covBases(hdr[file_c]->target_len[refIndex], 0); // Flag breakpoints typedef std::set TIdSet; typedef std::map TBpToIdMap; TBpToIdMap bpid; typedef boost::dynamic_bitset<> TBitSet; TBitSet bpOccupied(hdr[file_c]->target_len[refIndex], false); for(uint32_t i = 0; i < gbp.size(); ++i) { if (gbp[i].svStart != -1) { bpOccupied[gbp[i].svStart] = 1; if (bpid.find(gbp[i].svStart) == bpid.end()) bpid.insert(std::make_pair(gbp[i].svStart, TIdSet())); bpid[gbp[i].svStart].insert(i); } if (gbp[i].svEnd != -1) { bpOccupied[gbp[i].svEnd] = 1; if (bpid.find(gbp[i].svEnd) == bpid.end()) bpid.insert(std::make_pair(gbp[i].svEnd, TIdSet())); bpid[gbp[i].svEnd].insert(i); } } // Count reads hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, 0, hdr[file_c]->target_len[refIndex]); bam1_t* rec = bam_init1(); while (sam_itr_next(samfile[file_c], iter, rec) >= 
0) { // Genotyping only primary alignments if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue; // Read length int32_t readlen = readLength(rec); if (readlen < (int32_t) (maxReadLength * rlBinSize)) ++rlDist[file_c][(int32_t) (readlen / rlBinSize)]; // Reference and sequence pointer uint32_t rp = rec->core.pos; // reference pointer uint32_t sp = 0; // sequence pointer // All SV hits typedef std::pair TRefSeq; typedef std::map TSVSeqHit; TSVSeqHit genoMap; // Parse the CIGAR uint32_t* cigar = bam_get_cigar(rec); for (std::size_t i = 0; i < rec->core.n_cigar; ++i) { if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF)) { // Fetch reference alignments for(uint32_t k = 0; k < bam_cigar_oplen(cigar[i]); ++k) { if ((rp < hdr[file_c]->target_len[refIndex]) && (covBases[rp] < maxCoverage - 1)) ++covBases[rp]; if (bpOccupied[rp]) { for(typename TIdSet::const_iterator it = bpid[rp].begin(); it != bpid[rp].end(); ++it) { // Ensure fwd alignment and each SV only once if (genoMap.find(*it) == genoMap.end()) { if (rec->core.flag & BAM_FREVERSE) genoMap.insert(std::make_pair(*it, std::make_pair(rp, readlen - sp))); else genoMap.insert(std::make_pair(*it, std::make_pair(rp, sp))); } } } if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL)) ++matchCount[file_c]; else if (bam_cigar_op(cigar[i]) == BAM_CDIFF) ++mismatchCount[file_c]; ++sp; ++rp; } } else if ((bam_cigar_op(cigar[i]) == BAM_CDEL) || (bam_cigar_op(cigar[i]) == BAM_CREF_SKIP)) { ++delCount[file_c]; for(uint32_t k = 0; k < bam_cigar_oplen(cigar[i]); ++k) { if (bpOccupied[rp]) { for(typename TIdSet::const_iterator it = bpid[rp].begin(); it != bpid[rp].end(); ++it) { // Ensure fwd alignment and each SV only once if (genoMap.find(*it) == genoMap.end()) { if (rec->core.flag & BAM_FREVERSE) genoMap.insert(std::make_pair(*it, std::make_pair(rp, readlen - sp))); else 
genoMap.insert(std::make_pair(*it, std::make_pair(rp, sp))); } } } ++rp; } } else if (bam_cigar_op(cigar[i]) == BAM_CINS) { ++insCount[file_c]; sp += bam_cigar_oplen(cigar[i]); } else if (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) { sp += bam_cigar_oplen(cigar[i]); } else if (bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP) { // Do nothing } else { std::cerr << "Unknown Cigar options" << std::endl; } } // Read for genotyping? if (!genoMap.empty()) { // Get sequence std::string sequence; sequence.resize(rec->core.l_qseq); uint8_t* seqptr = bam_get_seq(rec); for (int i = 0; i < rec->core.l_qseq; ++i) sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)]; // Genotype all SVs covered by this read for(typename TSVSeqHit::iterator git = genoMap.begin(); git != genoMap.end(); ++git) { int32_t svid = git->first; uint32_t maxGenoReadCount = 500; if ((jctMap[file_c][svid].ref.size() + jctMap[file_c][svid].alt.size()) >= maxGenoReadCount) continue; int32_t rpHit = git->second.first; int32_t spHit = git->second.second; // Require spanning reads std::string subseq; if (rpHit == gbp[svid].svStart) { if (rec->core.flag & BAM_FREVERSE) { if (spHit < gbp[svid].svStartSuffix) continue; if (readlen < gbp[svid].svStartPrefix + spHit) continue; int32_t st = std::max((readlen - spHit) - gbp[svid].svStartPrefix - c.minimumFlankSize, 0); subseq = sequence.substr(st, gbp[svid].svStartPrefix + gbp[svid].svStartSuffix + 2 * c.minimumFlankSize); } else { if (spHit < gbp[svid].svStartPrefix) continue; if (readlen < gbp[svid].svStartSuffix + spHit) continue; int32_t st = std::max(spHit - gbp[svid].svStartPrefix - c.minimumFlankSize, 0); subseq = sequence.substr(st, gbp[svid].svStartPrefix + gbp[svid].svStartSuffix + 2 * c.minimumFlankSize); } } else { if (rec->core.flag & BAM_FREVERSE) { if (spHit < gbp[svid].svEndSuffix) continue; if (readlen < gbp[svid].svEndPrefix + spHit) continue; int32_t st = std::max((readlen - spHit) - gbp[svid].svEndPrefix - c.minimumFlankSize, 0); subseq = sequence.substr(st, 
gbp[svid].svEndPrefix + gbp[svid].svEndSuffix + 2 * c.minimumFlankSize); } else { if (spHit < gbp[svid].svEndPrefix) continue; if (readlen < gbp[svid].svEndSuffix + spHit) continue; int32_t st = std::max(spHit - gbp[svid].svEndPrefix - c.minimumFlankSize, 0); subseq = sequence.substr(st, gbp[svid].svEndPrefix + gbp[svid].svEndSuffix + 2 * c.minimumFlankSize); } } // Compute alignment to alternative haplotype DnaScore simple(c.aliscore.match, c.aliscore.mismatch, c.aliscore.mismatch, c.aliscore.mismatch); AlignConfig semiglobal; double scoreAlt = needleBanded(gbp[svid].alt, subseq, semiglobal, simple); scoreAlt /= (double) (c.flankQuality * gbp[svid].alt.size() * simple.match + (1.0 - c.flankQuality) * gbp[svid].alt.size() * simple.mismatch); // Compute alignment to reference haplotype double scoreRef = needleBanded(gbp[svid].ref, subseq, semiglobal, simple); scoreRef /= (double) (c.flankQuality * gbp[svid].ref.size() * simple.match + (1.0 - c.flankQuality) * gbp[svid].ref.size() * simple.mismatch); // Any confident alignment? 
if ((scoreRef > 1) || (scoreAlt > 1)) { if (scoreRef > scoreAlt) { // Account for reference bias if (++refAlignedReadCount[file_c][svid] % 2) { TQuality quality; quality.resize(rec->core.l_qseq); uint8_t* qualptr = bam_get_qual(rec); for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i]; uint32_t rq = scoreRef * 35; if (rq >= c.minGenoQual) { uint8_t* hpptr = bam_aux_get(rec, "HP"); jctMap[file_c][svid].ref.push_back((uint8_t) std::min(rq, (uint32_t) rec->core.qual)); if (hpptr) { c.isHaplotagged = true; int hap = bam_aux2i(hpptr); if (hap == 1) ++jctMap[file_c][svid].refh1; else ++jctMap[file_c][svid].refh2; } } } } else { TQuality quality; quality.resize(rec->core.l_qseq); uint8_t* qualptr = bam_get_qual(rec); for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i]; uint32_t aq = scoreAlt * 35; if (aq >= c.minGenoQual) { uint8_t* hpptr = bam_aux_get(rec, "HP"); if (c.hasDumpFile) { std::string svidStr(_addID(gbp[svid].svt)); std::string padNumber = boost::lexical_cast(svid); padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0'); svidStr += padNumber; dumpOut << svidStr << "\t" << c.files[file_c].string() << "\t" << bam_get_qname(rec) << "\t" << hdr[file_c]->target_name[rec->core.tid] << "\t" << rec->core.pos << "\t" << hdr[file_c]->target_name[rec->core.mtid] << "\t" << rec->core.mpos << "\t" << (int32_t) rec->core.qual << "\tSR" << std::endl; } jctMap[file_c][svid].alt.push_back((uint8_t) std::min(aq, (uint32_t) rec->core.qual)); if (hpptr) { c.isHaplotagged = true; int hap = bam_aux2i(hpptr); if (hap == 1) ++jctMap[file_c][svid].alth1; else ++jctMap[file_c][svid].alth2; } } } } } } } // Clean-up bam_destroy1(rec); hts_itr_destroy(iter); // Summarize coverage for this chromosome for(uint32_t i = 0; i < hdr[file_c]->target_len[refIndex]; ++i) ++covDist[file_c][covBases[i]]; // Assign SV support for(uint32_t i = 0; i < svs.size(); ++i) { if (svs[i].chr == refIndex) { int32_t halfSize = (svs[i].svEnd - svs[i].svStart)/2; if 
((_translocation(svs[i].svt)) || (svs[i].svt == 4)) halfSize = 500; // Left region int32_t lstart = std::max(svs[i].svStart - halfSize, 0); int32_t lend = svs[i].svStart; int32_t covbase = 0; for(uint32_t k = lstart; ((k < (uint32_t) lend) && (k < hdr[file_c]->target_len[refIndex])); ++k) covbase += covBases[k]; covMap[file_c][svs[i].id].leftRC = covbase; // Actual SV covbase = 0; int32_t mstart = svs[i].svStart; int32_t mend = svs[i].svEnd; if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) { mstart = std::max(svs[i].svStart - halfSize, 0); mend = std::min(svs[i].svStart + halfSize, (int32_t) hdr[file_c]->target_len[refIndex]); } for(uint32_t k = mstart; ((k < (uint32_t) mend) && (k < hdr[file_c]->target_len[refIndex])); ++k) covbase += covBases[k]; covMap[file_c][svs[i].id].rc = covbase; // Right region covbase = 0; int32_t rstart = svs[i].svEnd; int32_t rend = std::min(svs[i].svEnd + halfSize, (int32_t) hdr[file_c]->target_len[refIndex]); if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) { rstart = svs[i].svStart; rend = std::min(svs[i].svStart + halfSize, (int32_t) hdr[file_c]->target_len[refIndex]); } for(uint32_t k = rstart; ((k < (uint32_t) rend) && (k < hdr[file_c]->target_len[refIndex])); ++k) covbase += covBases[k]; covMap[file_c][svs[i].id].rightRC = covbase; } } } } // Clean-up fai_destroy(fai); // Output coverage info std::cout << "Coverage distribution (^COV)" << std::endl; for(uint32_t file_c = 0; file_c < c.files.size(); ++file_c) { uint64_t totalCovCount = 0; for (uint32_t i = 0; i < covDist[file_c].size(); ++i) totalCovCount += covDist[file_c][i]; std::vector covPercentiles(5, 0); // 5%, 25%, 50%, 75%, 95% uint64_t cumCovCount = 0; for (uint32_t i = 0; i < covDist[file_c].size(); ++i) { cumCovCount += covDist[file_c][i]; double frac = (double) cumCovCount / (double) totalCovCount; if (frac < 0.05) covPercentiles[0] = i + 1; if (frac < 0.25) covPercentiles[1] = i + 1; if (frac < 0.5) covPercentiles[2] = i + 1; if (frac < 0.75) 
covPercentiles[3] = i + 1; if (frac < 0.95) covPercentiles[4] = i + 1; } std::cout << "COV\t" << c.sampleName[file_c] << "\t95% of bases are >= " << covPercentiles[0] << "x" << std::endl; std::cout << "COV\t" << c.sampleName[file_c] << "\t75% of bases are >= " << covPercentiles[1] << "x" << std::endl; std::cout << "COV\t" << c.sampleName[file_c] << "\t50% of bases are >= " << covPercentiles[2] << "x" << std::endl; std::cout << "COV\t" << c.sampleName[file_c] << "\t25% of bases are >= " << covPercentiles[3] << "x" << std::endl; std::cout << "COV\t" << c.sampleName[file_c] << "\t5% of bases are >= " << covPercentiles[4] << "x" << std::endl; } // Output read length info std::cout << "Read-length distribution (^RL)" << std::endl; for(uint32_t file_c = 0; file_c < c.files.size(); ++file_c) { uint64_t totalRlCount = 0; for (uint32_t i = 0; i < rlDist[file_c].size(); ++i) totalRlCount += rlDist[file_c][i]; std::vector rlPercentiles(5, 0); // 5%, 25%, 50%, 75%, 95% uint64_t cumRlCount = 0; for (uint32_t i = 0; i < rlDist[file_c].size(); ++i) { cumRlCount += rlDist[file_c][i]; double frac = (double) cumRlCount / (double) totalRlCount; if (frac < 0.05) rlPercentiles[0] = (i + 1) * rlBinSize; if (frac < 0.25) rlPercentiles[1] = (i + 1) * rlBinSize; if (frac < 0.5) rlPercentiles[2] = (i + 1) * rlBinSize; if (frac < 0.75) rlPercentiles[3] = (i + 1) * rlBinSize; if (frac < 0.95) rlPercentiles[4] = (i + 1) * rlBinSize; } std::cout << "RL\t" << c.sampleName[file_c] << "\t95% of reads are >= " << rlPercentiles[0] << "bp" << std::endl; std::cout << "RL\t" << c.sampleName[file_c] << "\t75% of reads are >= " << rlPercentiles[1] << "bp" << std::endl; std::cout << "RL\t" << c.sampleName[file_c] << "\t50% of reads are >= " << rlPercentiles[2] << "bp" << std::endl; std::cout << "RL\t" << c.sampleName[file_c] << "\t25% of reads are >= " << rlPercentiles[3] << "bp" << std::endl; std::cout << "RL\t" << c.sampleName[file_c] << "\t5% of reads are >= " << rlPercentiles[4] << "bp" << std::endl; 
// Genotypes structural variants against long-read alignments and collects
// per-SV read-depth in the same pass.
//
// For every chromosome of the first input's header and every input file in
// c.files, all primary alignments are walked and
//   * ALT support is added to jctMap[file][svid].alt for reads whose
//     hash_lr() value is present in srStore (entries with svid == -1 are
//     ignored),
//   * REF support is added to jctMap[file][svid].ref for reads that cover a
//     flagged breakpoint with at least c.minimumFlankSize reference bases on
//     both sides and a score of at least c.minGenoQual,
//   * the HP aux tag, if present, increments alth1/alth2 or refh1/refh2 and
//     sets c.isHaplotagged,
//   * a capped per-base coverage track is built and summed into
//     covMap[file][id].leftRC / .rc / .rightRC.
// If c.hasDumpFile is set, every ALT-supporting read is also written as a
// tab-separated line to the gzip-compressed file c.dumpfile.
//
// Parameters:
//   c        configuration (input files, genome, thresholds, dump settings);
//            the only field assigned here is c.isHaplotagged
//   svs      SV records; the function returns immediately if it is empty
//   srStore  lookup from read hash to the split-read records of that read
//   jctMap   per-file, per-SV junction support (output)
//   covMap   per-file, per-SV read-depth counts (output)
//
// NOTE(review): in this copy of the file the template parameter list and the
// template arguments of several types (std::vector, std::set, std::map,
// std::numeric_limits, boost::lexical_cast) appear to be missing. They are
// left untouched here and need to be restored from the upstream source.
template inline void
genotypeLR(TConfig& c, std::vector& svs, TSRStore& srStore, TJunctionMap& jctMap, TReadCountMap& covMap) {
  typedef std::vector TSVs;
  // Nothing to genotype
  if (svs.empty()) return;

  // Upper bound for the per-base coverage counters (see the CIGAR walk below)
  typedef uint16_t TMaxCoverage;
  uint32_t maxCoverage = std::numeric_limits::max();

  // Open file handles: one alignment file, index and header per input
  // NOTE(review): the results of sam_open / sam_index_load / sam_hdr_read are used without a NULL check
  typedef std::vector TSamFile;
  typedef std::vector TIndex;
  typedef std::vector THeader;
  TSamFile samfile(c.files.size());
  TIndex idx(c.files.size());
  THeader hdr(c.files.size());
  // NOTE(review): totalTarget is accumulated but not read again in this function
  int32_t totalTarget = 0;
  for(uint32_t file_c = 0; file_c < c.files.size(); ++file_c) {
    samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r");
    hts_set_fai_filename(samfile[file_c], c.genome.string().c_str());
    idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str());
    hdr[file_c] = sam_hdr_read(samfile[file_c]);
    totalTarget += hdr[file_c]->n_targets;
  }

  // Parse genome chr-by-chr
  boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
  std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "SV annotation" << std::endl;
  boost::progress_display show_progress( hdr[0]->n_targets );

  // Ref aligned reads: per file and SV, number of reads that qualified as REF support
  typedef std::vector TRefAlignCount;
  typedef std::vector TFileRefAlignCount;
  TFileRefAlignCount refAlignedReadCount(c.files.size(), TRefAlignCount());
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) refAlignedReadCount[file_c].resize(svs.size(), 0);

  // Dump file (gzip-compressed, tab-separated, one header line)
  boost::iostreams::filtering_ostream dumpOut;
  if (c.hasDumpFile) {
    dumpOut.push(boost::iostreams::gzip_compressor());
    dumpOut.push(boost::iostreams::file_sink(c.dumpfile.string().c_str(), std::ios_base::out | std::ios_base::binary));
    dumpOut << "#svid\tbam\tqname\tchr\tpos\tmatechr\tmatepos\tmapq\ttype" << std::endl;
  }

  faidx_t* fai = fai_load(c.genome.string().c_str());
  // Iterate chromosomes
  // NOTE(review): chromosome indices and names come from hdr[0]; this assumes all inputs share the same target order - confirm
  for(int32_t refIndex=0; refIndex < (int32_t) hdr[0]->n_targets; ++refIndex) {
    ++show_progress;
    // Reference sequence of this chromosome, fetched on first use and shared by all files
    char* seq = NULL;

    // Iterate samples
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      // Check we have mapped reads on this chromosome.
      // Files whose name ends in "cram" are always scanned, regardless of the index statistics
      // (presumably because these statistics are not reliable for CRAM - TODO confirm)
      bool nodata = true;
      std::string suffix("cram");
      std::string str(c.files[file_c].string());
      if ((str.size() >= suffix.size()) && (str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0)) nodata = false;
      uint64_t mapped = 0;
      uint64_t unmapped = 0;
      hts_idx_get_stat(idx[file_c], refIndex, &mapped, &unmapped);
      if (mapped) nodata = false;
      if (nodata) continue;

      // Flag breakpoints: bpOccupied marks breakpoint positions, bpid maps a position to the SV ids located there.
      // svStart is registered for SVs with chr == refIndex, svEnd for SVs with chr2 == refIndex.
      // NOTE(review): svStart / svEnd index bpOccupied without a range check against target_len
      typedef std::set TIdSet;
      typedef std::map TBpToIdMap;
      TBpToIdMap bpid;
      typedef boost::dynamic_bitset<> TBitSet;
      TBitSet bpOccupied(hdr[file_c]->target_len[refIndex], false);
      for(typename TSVs::iterator itSV = svs.begin(); itSV != svs.end(); ++itSV) {
        if (itSV->chr == refIndex) {
          bpOccupied[itSV->svStart] = 1;
          if (bpid.find(itSV->svStart) == bpid.end()) bpid.insert(std::make_pair(itSV->svStart, TIdSet()));
          bpid[itSV->svStart].insert(itSV->id);
        }
        if (itSV->chr2 == refIndex) {
          bpOccupied[itSV->svEnd] = 1;
          if (bpid.find(itSV->svEnd) == bpid.end()) bpid.insert(std::make_pair(itSV->svEnd, TIdSet()));
          bpid[itSV->svEnd].insert(itSV->id);
        }
      }
      // No breakpoint on this chromosome: skip the file (this also skips the coverage summary below)
      if (bpid.empty()) continue;

      // Lazy loading of reference sequence
      // NOTE(review): seq is not checked for NULL after the fetch and seqlen is not inspected
      if (seq == NULL) {
        int32_t seqlen = -1;
        std::string tname(hdr[0]->target_name[refIndex]);
        seq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr[0]->target_len[refIndex], &seqlen);
      }

      // Coverage track
      typedef std::vector TBpCoverage;
      TBpCoverage covBases(hdr[file_c]->target_len[refIndex], 0);

      // Count reads
      hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, 0, hdr[file_c]->target_len[refIndex]);
      bam1_t* rec = bam_init1();
      while (sam_itr_next(samfile[file_c], iter, rec) >= 0) {
        // Genotyping only primary alignments
        if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue;

        // Read hash, the key into srStore
        std::size_t seed = hash_lr(rec);

        // Reference and sequence pointer
        uint32_t rp = rec->core.pos; // reference pointer
        uint32_t sp = 0; // sequence pointer

        // Get sequence: decode the 4-bit packed bases into characters
        std::string sequence;
        sequence.resize(rec->core.l_qseq);
        uint8_t* seqptr = bam_get_seq(rec);
        for (int i = 0; i < rec->core.l_qseq; ++i) sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)];

        // Any REF support: walk the CIGAR and build the gapped reference (refAlign) and read (altAlign)
        // strings column by column; hits collects every flagged breakpoint position the alignment covers
        std::string refAlign = "";
        std::string altAlign = "";
        std::vector hits;
        uint32_t* cigar = bam_get_cigar(rec);
        for (std::size_t i = 0; i < rec->core.n_cigar; ++i) {
          if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF)) {
            // Fetch reference alignments
            for(uint32_t k = 0; k < bam_cigar_oplen(cigar[i]); ++k) {
              // Coverage is only counted for aligned bases and stops increasing at maxCoverage - 1
              if ((rp < hdr[file_c]->target_len[refIndex]) && (covBases[rp] < maxCoverage - 1)) ++covBases[rp];
              // NOTE(review): unlike covBases, seq[rp] and bpOccupied[rp] are accessed without the rp < target_len check
              refAlign += seq[rp];
              altAlign += sequence[sp];
              if (bpOccupied[rp]) hits.push_back(rp);
              ++sp;
              ++rp;
            }
          } else if ((bam_cigar_op(cigar[i]) == BAM_CDEL) || (bam_cigar_op(cigar[i]) == BAM_CREF_SKIP)) {
            // Reference-only columns: gap in the read
            for(uint32_t k = 0; k < bam_cigar_oplen(cigar[i]); ++k) {
              refAlign += seq[rp];
              altAlign += "-";
              if (bpOccupied[rp]) hits.push_back(rp);
              ++rp;
            }
          } else if (bam_cigar_op(cigar[i]) == BAM_CINS) {
            // Read-only columns: gap in the reference
            for(uint32_t k = 0; k < bam_cigar_oplen(cigar[i]); ++k) {
              refAlign += "-";
              altAlign += sequence[sp];
              ++sp;
            }
          } else if (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) {
            // Soft-clipped bases are present in the read but contribute no alignment column
            sp += bam_cigar_oplen(cigar[i]);
          } else if (bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP) {
            // Do nothing
          } else {
            std::cerr << "Unknown Cigar options" << std::endl;
          }
        }

        // Any ALT support?
        // Every SV this read was assigned to in srStore receives one ALT observation;
        // altAssigned remembers these SVs so the read is not counted as REF for them below
        TIdSet altAssigned;
        if (srStore.find(seed) != srStore.end()) {
          for(uint32_t ri = 0; ri < srStore[seed].size(); ++ri) {
            int32_t svid = srStore[seed][ri].svid;
            if (svid == -1) continue;
            //if ((svs[svid].svt == 2) || (svs[svid].svt == 4)) continue;
            altAssigned.insert(svid);
            uint8_t* hpptr = bam_aux_get(rec, "HP");
            if (c.hasDumpFile) {
              // SV identifier: type prefix followed by the id left-padded with zeros to 8 digits
              std::string svidStr(_addID(svs[svid].svt));
              std::string padNumber = boost::lexical_cast(svid);
              padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0');
              svidStr += padNumber;
              // NOTE(review): target_name is indexed with rec->core.mtid without checking mtid >= 0;
              // htslib reports -1 when a read has no mate reference - confirm this cannot occur here
              dumpOut << svidStr << "\t" << c.files[file_c].string() << "\t" << bam_get_qname(rec) << "\t" << hdr[file_c]->target_name[rec->core.tid] << "\t" << rec->core.pos << "\t" << hdr[file_c]->target_name[rec->core.mtid] << "\t" << rec->core.mpos << "\t" << (int32_t) rec->core.qual << "\tSR" << std::endl;
            }

            // ToDo
            //jctMap[file_c][svid].alt.push_back((uint8_t) std::min((uint32_t) score, (uint32_t) rec->core.qual));
            // ALT quality is the mapping quality, capped at a fixed value of 20
            jctMap[file_c][svid].alt.push_back((uint8_t) std::min((uint32_t) 20, (uint32_t) rec->core.qual));
            if (hpptr) {
              c.isHaplotagged = true;
              // HP value 1 counts towards haplotype 1, any other value towards haplotype 2
              int hap = bam_aux2i(hpptr);
              if (hap == 1) ++jctMap[file_c][svid].alth1;
              else ++jctMap[file_c][svid].alth2;
            }
          }
        }

        // Any REF support
        if (hits.empty()) continue;

        // Sufficiently long flank mapping? (rp is now the reference end of the alignment)
        if ((rp - rec->core.pos) < c.minimumFlankSize) continue;

        // Iterate all spanned SVs
        // NOTE(review): this loop variable shadows the index-handle vector `idx` declared at function scope
        for(uint32_t idx = 0; idx < hits.size(); ++idx) {
          //std::cerr << hits[idx] - rec->core.pos << ',' << rp - hits[idx] << std::endl;

          // Long enough flanking sequence on both sides of the breakpoint
          if (hits[idx] < rec->core.pos + c.minimumFlankSize) continue;
          if (rp < hits[idx] + c.minimumFlankSize) continue;

          // Confident mapping?
          // percentIdentity is evaluated at the breakpoint offset with a window argument of 2 * c.minRefSep
          // (exact window semantics are defined by percentIdentity, not visible here);
          // the score is identity^8 * 30, so it drops quickly as identity decreases
          float percid = percentIdentity(refAlign, altAlign, hits[idx] - rec->core.pos, c.minRefSep * 2);
          double score = percid * percid * percid * percid * percid * percid * percid * percid * 30;
          if (score < c.minGenoQual) continue;

          for(typename TIdSet::const_iterator its = bpid[hits[idx]].begin(); its != bpid[hits[idx]].end(); ++its) {
            int32_t svid = *its;
            //if ((svs[svid].svt == 2) || (svs[svid].svt == 4)) continue;
            if (altAssigned.find(svid) != altAssigned.end()) continue;
            //std::cerr << svs[svid].chr << ',' << svs[svid].svStart << ',' << svs[svid].chr2 << ',' << svs[svid].svEnd << std::endl;
            // Only every other qualifying read (odd counter values) is recorded as REF support;
            // the short-read genotyper labels the same construct "Account for reference bias"
            if (++refAlignedReadCount[file_c][svid] % 2) {
              uint8_t* hpptr = bam_aux_get(rec, "HP");
              jctMap[file_c][svid].ref.push_back((uint8_t) std::min((uint32_t) score, (uint32_t) rec->core.qual));
              if (hpptr) {
                c.isHaplotagged = true;
                int hap = bam_aux2i(hpptr);
                if (hap == 1) ++jctMap[file_c][svid].refh1;
                else ++jctMap[file_c][svid].refh2;
              }
            }
          }
        }
      }
      // Clean-up
      bam_destroy1(rec);
      hts_itr_destroy(iter);

      // Assign SV support: sum the coverage track over a left flank, the SV body and a right flank.
      // The flank size is half the SV length; translocations and svt == 4 use a fixed 500 bp instead
      // and are measured around svStart only
      for(uint32_t i = 0; i < svs.size(); ++i) {
        if (svs[i].chr == refIndex) {
          int32_t halfSize = (svs[i].svEnd - svs[i].svStart)/2;
          if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) halfSize = 500;

          // Left region
          int32_t lstart = std::max(svs[i].svStart - halfSize, 0);
          int32_t lend = svs[i].svStart;
          int32_t covbase = 0;
          for(uint32_t k = lstart; ((k < (uint32_t) lend) && (k < hdr[file_c]->target_len[refIndex])); ++k) covbase += covBases[k];
          covMap[file_c][svs[i].id].leftRC = covbase;

          // Actual SV
          covbase = 0;
          int32_t mstart = svs[i].svStart;
          int32_t mend = svs[i].svEnd;
          if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) {
            mstart = std::max(svs[i].svStart - halfSize, 0);
            mend = std::min(svs[i].svStart + halfSize, (int32_t) hdr[file_c]->target_len[refIndex]);
          }
          for(uint32_t k = mstart; ((k < (uint32_t) mend) && (k < hdr[file_c]->target_len[refIndex])); ++k) covbase += covBases[k];
          covMap[file_c][svs[i].id].rc = covbase;

          // Right region
          covbase = 0;
          int32_t rstart = svs[i].svEnd;
          int32_t rend = std::min(svs[i].svEnd + halfSize, (int32_t) hdr[file_c]->target_len[refIndex]);
          if ((_translocation(svs[i].svt)) || (svs[i].svt == 4)) {
            rstart = svs[i].svStart;
            rend = std::min(svs[i].svStart + halfSize, (int32_t) hdr[file_c]->target_len[refIndex]);
          }
          for(uint32_t k = rstart; ((k < (uint32_t) rend) && (k < hdr[file_c]->target_len[refIndex])); ++k) covbase += covBases[k];
          covMap[file_c][svs[i].id].rightRC = covbase;
        }
      }
    }
    // Release the reference sequence of this chromosome (allocated by faidx_fetch_seq)
    if (seq != NULL) free(seq);
  }
  // Clean-up
  fai_destroy(fai);

  // Clean-up
  for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
    bam_hdr_destroy(hdr[file_c]);
    hts_idx_destroy(idx[file_c]);
    sam_close(samfile[file_c]);
  }
}
// Pairwise alignment with affine gap costs (gap open sc.go, gap extension
// sc.ge) that also reconstructs the alignment.
//
// a1 is laid out along the rows (length m = _size(a1, 1)), a2 along the
// columns (length n = _size(a2, 1)). If either input holds more than one
// sequence (_size(x, 0) != 1), profiles are built for both inputs and passed
// to _score. The score matrices are kept as rolling rows:
//   s[col]  best score of the cell in the current row ("substitution" matrix)
//   v[col]  best score of the cell ending in a vertical gap
//   newhoz  best score of the current cell ending in a horizontal gap
// Gap costs go through _horizontalGap / _verticalGap together with the
// alignment configuration ac and the current row / column.
// NOTE(review): presumably these helpers waive the cost for end gaps that ac
// declares free - confirm against align.h.
//
// Only the traceback information is stored for the full matrix, as four
// bitsets of (m+1)*(n+1) bits indexed by row * (n+1) + col:
//   bit3  the cell's best score equals the horizontal-gap score
//   bit4  otherwise, the cell's best score equals the vertical-gap score
//   bit1  the horizontal-gap score of the cell is not a gap extension,
//         i.e. the gap was opened at this cell
//   bit2  same for the vertical-gap score
//
// Parameters:
//   a1, a2  the two inputs to align
//   align   receives the alignment built by _createAlignment
//   ac      alignment configuration forwarded to the gap helpers
//   sc      scoring object providing TValue, go, ge and inf
// Returns the score of the last cell (row m, column n).
//
// NOTE(review): in this copy of the file the template parameter lists and the
// template arguments of std::vector, boost::multi_array, DnaScore and
// AlignConfig appear to be missing; they are left untouched here and need to
// be restored from the upstream source.
template inline int
gotoh(TAlign1 const& a1, TAlign2 const& a2, TAlign& align, TAlignConfig const& ac, TScoreObject const& sc)
{
  typedef typename TScoreObject::TValue TScoreValue;

  // DP variables
  std::size_t m = _size(a1, 1);
  std::size_t n = _size(a2, 1);
  std::vector s(n+1, 0);
  std::vector v(n+1, 0);
  TScoreValue newhoz = 0;
  // s value of the previous row, one column to the left of the cell processed next (the diagonal predecessor)
  TScoreValue prevsub = 0;

  // Trace Matrix (mf is the row stride of the flattened bitsets)
  std::size_t mf = n+1;
  typedef boost::dynamic_bitset<> TBitSet;
  TBitSet bit1( (m+1) * (n+1), false);
  TBitSet bit2( (m+1) * (n+1), false);
  TBitSet bit3( (m+1) * (n+1), false);
  TBitSet bit4( (m+1) * (n+1), false);

  // Create profile (only needed when an input is not a single sequence)
  typedef boost::multi_array TProfile;
  TProfile p1;
  TProfile p2;
  if ((_size(a1, 0) != 1) || (_size(a2, 0) != 1)) {
    _createProfile(a1, p1);
    _createProfile(a2, p2);
  }

  // DP
  for(std::size_t row = 0; row <= m; ++row) {
    for(std::size_t col = 0; col <= n; ++col) {
      // Initialization
      if ((row == 0) && (col == 0)) {
        s[0] = 0;
        v[0] = -sc.inf;
        newhoz = -sc.inf;
        bit1[0] = true;
        bit2[0] = true;
      } else if (row == 0) {
        // First row: only a leading horizontal gap of length col is possible
        v[col] = -sc.inf;
        s[col] = _horizontalGap(ac, 0, m, sc.go + col * sc.ge);
        newhoz = _horizontalGap(ac, 0, m, sc.go + col * sc.ge);
        bit3[col] = true;
      } else if (col == 0) {
        // First column: only a leading vertical gap of length row is possible
        newhoz = -sc.inf;
        s[0] = _verticalGap(ac, 0, n, sc.go + row * sc.ge);
        // Seed the diagonal predecessor for column 1 with the first-column value of the previous row
        if (row - 1 == 0) prevsub = 0;
        else prevsub = _verticalGap(ac, 0, n, sc.go + (row - 1) * sc.ge);
        v[0] = _verticalGap(ac, 0, n, sc.go + row * sc.ge);
        bit4[row * mf] = true;
      } else {
        // Recursion
        // Save the values of the neighbouring cells before they are overwritten:
        // prevhoz = horizontal-gap score of the left cell, prevver = vertical-gap score of the upper cell,
        // prevprevsub = s of the diagonal cell, prevsub = s of the upper cell
        TScoreValue prevhoz = newhoz;
        TScoreValue prevver = v[col];
        TScoreValue prevprevsub = prevsub;
        prevsub = s[col];
        // Open a new gap from the s matrix or extend the existing gap
        newhoz = std::max(s[col-1] + _horizontalGap(ac, row, m, sc.go + sc.ge), prevhoz + _horizontalGap(ac, row, m, sc.ge));
        v[col] = std::max(prevsub + _verticalGap(ac, col, n, sc.go + sc.ge), prevver + _verticalGap(ac, col, n, sc.ge));
        s[col] = std::max(std::max(prevprevsub + _score(a1, a2, p1, p2, row-1, col-1, sc), newhoz), v[col]);

        // Trace
        // On ties the horizontal gap takes precedence over the vertical gap; no bit set means diagonal
        if (s[col] == newhoz) bit3[row * mf + col] = true;
        else if (s[col] == v[col]) bit4[row * mf + col] = true;
        if (newhoz != prevhoz + _horizontalGap(ac, row, m, sc.ge)) bit1[row * mf + col] = true;
        if (v[col] != prevver + _verticalGap(ac, col, n, sc.ge)) bit2[row * mf + col] = true;
      }
    }
  }

  // Trace-back using pointers
  // Starting at (m, n) in the s matrix, emit 's' (diagonal), 'h' (horizontal gap) or 'v' (vertical gap)
  // steps until (0, 0) is reached; lastMatrix is the matrix the path is currently in
  std::size_t row = m;
  std::size_t col = n;
  char lastMatrix = 's';
  typedef std::vector TTrace;
  TTrace btr;
  while ((row>0) || (col>0)) {
    if (lastMatrix == 's') {
      // Switching matrices consumes no character, only a diagonal step does
      if (bit3[row * mf + col]) lastMatrix = 'h';
      else if (bit4[row * mf + col]) lastMatrix = 'v';
      else {
        --row;
        --col;
        btr.push_back('s');
      }
    } else if (lastMatrix == 'h') {
      // Leave the gap matrix once the cell where the gap was opened has been consumed
      if (bit1[row * mf + col]) lastMatrix = 's';
      --col;
      btr.push_back('h');
    } else if (lastMatrix == 'v') {
      if (bit2[row * mf + col]) lastMatrix = 's';
      --row;
      btr.push_back('v');
    }
  }

  // Create alignment
  _createAlignment(btr, a1, a2, align);

  // Score
  return s[n];
}

// Convenience overload: aligns with a default-constructed DnaScore.
template inline int
gotoh(TAlign1 const& a1, TAlign2 const& a2, TAlign& align, TAlignConfig const& ac)
{
  DnaScore dnasc;
  return gotoh(a1, a2, align, ac, dnasc);
}

// Convenience overload: aligns with a default-constructed AlignConfig and DnaScore.
template inline int
gotoh(TAlign1 const& a1, TAlign2 const& a2, TAlign& align)
{
  AlignConfig ac;
  return gotoh(a1, a2, align, ac);
}
// One split-read junction pair of a single read, i.e. a candidate SV breakpoint
// pair in reference coordinates plus the read-level evidence behind it.
struct SRBamRecord {
  int32_t chr;      // reference index of the first breakpoint
  int32_t pos;      // reference position of the first breakpoint
  int32_t chr2;     // reference index of the second breakpoint
  int32_t pos2;     // reference position of the second breakpoint
  int32_t rstart;   // alignment start of the read (the junction selectors pass -1 when none is usable)
  int32_t sstart;   // smaller of the two junction positions within the read sequence
  int32_t qual;     // quality of the pair (the junction selectors pass the mean of both junction qualities)
  int32_t inslen;   // distance between the two junction positions within the read sequence
  int32_t svid;     // id of the SV this record belongs to; always starts out as -1 (unassigned)
  std::size_t id;   // read identifier (the junction selectors pass the read's hash key)

  // Stores all arguments in the members of the same order; svid is not an
  // argument and is initialised to -1.
  SRBamRecord(int32_t const refIndex1,
              int32_t const refPos1,
              int32_t const refIndex2,
              int32_t const refPos2,
              int32_t const readStart,
              int32_t const seqStart,
              int32_t const quality,
              int32_t const insertLength,
              std::size_t const readId)
    : chr(refIndex1),
      pos(refPos1),
      chr2(refIndex2),
      pos2(refPos2),
      rstart(readStart),
      sstart(seqStart),
      qual(quality),
      inslen(insertLength),
      svid(-1),
      id(readId)
  {}
};
readBp.insert(std::make_pair(seed, TJunctionVector(1, Junction(fw, scleft, rec->core.tid, readStart, rp, sp, rec->core.qual)))); } } } template struct SortJunction : public std::binary_function { inline bool operator()(TJunction const& j1, TJunction const& j2) { return ((j1.seqpos inline void selectDeletions(TConfig const& c, TReadBp const& readBp, std::vector >& br) { for(typename TReadBp::const_iterator it = readBp.begin(); it != readBp.end(); ++it) { if (it->second.size() > 1) { for(uint32_t i = 0; i < it->second.size(); ++i) { for(uint32_t j = i+1; j < it->second.size(); ++j) { if ((uint32_t) (it->second[j].seqpos - it->second[i].seqpos) > c.maxReadSep) break; // Same chr, same direction, opposing soft-clips if ((it->second[j].refidx == it->second[i].refidx) && (it->second[j].forward == it->second[i].forward) && (it->second[i].scleft != it->second[j].scleft)) { // Min. deletion size if ( (uint32_t) std::abs(it->second[j].refpos - it->second[i].refpos) > c.minRefSep) { int32_t rst = it->second[i].rstart; if (rst == -1) rst = it->second[j].rstart; // Avg. 
qval int32_t qval = (int32_t) (((int32_t) it->second[i].qual + (int32_t) it->second[j].qual) / 2); // Correct clipping architecture, note: soft-clipping of error-prone reads can lead to switching left/right breakpoints if (it->second[i].refpos <= it->second[j].refpos) { if ((!it->second[i].scleft) && (it->second[j].scleft)) { br[2].push_back(SRBamRecord(it->second[i].refidx, it->second[i].refpos, it->second[j].refidx, it->second[j].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } else { if ((it->second[i].scleft) && (!it->second[j].scleft)) { br[2].push_back(SRBamRecord(it->second[j].refidx, it->second[j].refpos, it->second[i].refidx, it->second[i].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } } } } } } } } // Duplication junctions template inline void selectDuplications(TConfig const& c, TReadBp const& readBp, std::vector >& br) { for(typename TReadBp::const_iterator it = readBp.begin(); it != readBp.end(); ++it) { if (it->second.size() > 1) { for(uint32_t i = 0; i < it->second.size(); ++i) { for(uint32_t j = i+1; j < it->second.size(); ++j) { if ((uint32_t) (it->second[j].seqpos - it->second[i].seqpos) > c.maxReadSep) break; // Same chr, same direction, opposing soft-clips if ((it->second[j].refidx == it->second[i].refidx) && (it->second[j].forward == it->second[i].forward) && (it->second[i].scleft != it->second[j].scleft)) { // Min. duplication size if ( (uint32_t) std::abs(it->second[j].refpos - it->second[i].refpos) > c.minRefSep) { int32_t rst = it->second[i].rstart; if (rst == -1) rst = it->second[j].rstart; // Avg. 
qval int32_t qval = (int32_t) (((int32_t) it->second[i].qual + (int32_t) it->second[j].qual) / 2); // Correct clipping architecture, note: soft-clipping of error-prone reads can lead to switching left/right breakpoints if (it->second[i].refpos <= it->second[j].refpos) { if ((it->second[i].scleft) && (!it->second[j].scleft)) { br[3].push_back(SRBamRecord(it->second[i].refidx, it->second[i].refpos, it->second[j].refidx, it->second[j].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } else { if ((!it->second[i].scleft) && (it->second[j].scleft)) { br[3].push_back(SRBamRecord(it->second[j].refidx, it->second[j].refpos, it->second[i].refidx, it->second[i].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } } } } } } } } // Inversion junctions template inline void selectInversions(TConfig const& c, TReadBp const& readBp, std::vector >& br) { for(typename TReadBp::const_iterator it = readBp.begin(); it != readBp.end(); ++it) { if (it->second.size() > 1) { for(uint32_t i = 0; i < it->second.size(); ++i) { for(uint32_t j = i+1; j < it->second.size(); ++j) { if ((uint32_t) (it->second[j].seqpos - it->second[i].seqpos) > c.maxReadSep) break; // Same chr, different direction, agreeing soft-clips if ((it->second[j].refidx == it->second[i].refidx) && (it->second[j].forward != it->second[i].forward) && (it->second[i].scleft == it->second[j].scleft)) { // Min. inversion size if ( (uint32_t) std::abs(it->second[j].refpos - it->second[i].refpos) > c.minRefSep) { int32_t rst = it->second[i].rstart; if (rst == -1) rst = it->second[j].rstart; // Avg. 
qval int32_t qval = (int32_t) (((int32_t) it->second[i].qual + (int32_t) it->second[j].qual) / 2); if (it->second[i].refpos <= it->second[j].refpos) { // Need to differentiate 3to3 and 5to5 if (it->second[i].scleft) br[1].push_back(SRBamRecord(it->second[i].refidx, it->second[i].refpos, it->second[j].refidx, it->second[j].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); else br[0].push_back(SRBamRecord(it->second[i].refidx, it->second[i].refpos, it->second[j].refidx, it->second[j].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } else { // Need to differentiate 3to3 and 5to5 if (it->second[i].scleft) br[1].push_back(SRBamRecord(it->second[j].refidx, it->second[j].refpos, it->second[i].refidx, it->second[i].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); else br[0].push_back(SRBamRecord(it->second[j].refidx, it->second[j].refpos, it->second[i].refidx, it->second[i].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } } } } } } } // Insertion junctions template inline void selectInsertions(TConfig const& c, TReadBp const& readBp, std::vector >& br) { for(typename TReadBp::const_iterator it = readBp.begin(); it != readBp.end(); ++it) { if (it->second.size() > 1) { for(uint32_t i = 0; i < it->second.size(); ++i) { for(uint32_t j = i+1; j < it->second.size(); ++j) { // Same chr, same direction, opposing soft-clips if ((it->second[j].refidx == it->second[i].refidx) && (it->second[j].forward == it->second[i].forward) && (it->second[i].scleft != it->second[j].scleft)) { // Reference insertion footprint should be small if ( (uint32_t) std::abs(it->second[j].refpos - it->second[i].refpos) < c.maxReadSep) { // 
Large separation in sequence space if ((uint32_t) (it->second[j].seqpos - it->second[i].seqpos) > c.minRefSep) { int32_t rst = it->second[i].rstart; if (rst == -1) rst = it->second[j].rstart; // Avg. qval int32_t qval = (int32_t) (((int32_t) it->second[i].qual + (int32_t) it->second[j].qual) / 2); if (it->second[i].refpos <= it->second[j].refpos) { br[4].push_back(SRBamRecord(it->second[i].refidx, it->second[i].refpos, it->second[j].refidx, it->second[j].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } else { br[4].push_back(SRBamRecord(it->second[j].refidx, it->second[j].refpos, it->second[i].refidx, it->second[i].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } } } } } } } } // Translocation junctions template inline void selectTranslocations(TConfig const& c, TReadBp const& readBp, std::vector >& br) { for(typename TReadBp::const_iterator it = readBp.begin(); it != readBp.end(); ++it) { if (it->second.size() > 1) { for(uint32_t i = 0; i < it->second.size(); ++i) { for(uint32_t j = i+1; j < it->second.size(); ++j) { if ((uint32_t) (it->second[j].seqpos - it->second[i].seqpos) > c.maxReadSep) break; // Different chr if (it->second[j].refidx != it->second[i].refidx) { int32_t chr1ev = j; int32_t chr2ev = i; if (it->second[i].refidx < it->second[j].refidx) { chr1ev = i; chr2ev = j; } int32_t rst = it->second[i].rstart; if (rst == -1) rst = it->second[j].rstart; // Avg. 
qval int32_t qval = (int32_t) (((int32_t) it->second[i].qual + (int32_t) it->second[j].qual) / 2); if (it->second[chr1ev].forward == it->second[chr2ev].forward) { // Same direction, opposing soft-clips if (it->second[chr1ev].scleft != it->second[chr2ev].scleft) { if (it->second[chr1ev].scleft) { // 3to5 br[DELLY_SVT_TRANS + 2].push_back(SRBamRecord(it->second[chr2ev].refidx, it->second[chr2ev].refpos, it->second[chr1ev].refidx, it->second[chr1ev].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } else { // 5to3 br[DELLY_SVT_TRANS + 3].push_back(SRBamRecord(it->second[chr2ev].refidx, it->second[chr2ev].refpos, it->second[chr1ev].refidx, it->second[chr1ev].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } } else { // Opposing direction, same soft-clips if (it->second[chr1ev].scleft == it->second[chr2ev].scleft) { if (it->second[chr1ev].scleft) { // 5to5 br[DELLY_SVT_TRANS + 1].push_back(SRBamRecord(it->second[chr2ev].refidx, it->second[chr2ev].refpos, it->second[chr1ev].refidx, it->second[chr1ev].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } else { // 3to3 br[DELLY_SVT_TRANS + 0].push_back(SRBamRecord(it->second[chr2ev].refidx, it->second[chr2ev].refpos, it->second[chr1ev].refidx, it->second[chr1ev].refpos, rst, std::min(it->second[j].seqpos, it->second[i].seqpos), qval, std::abs(it->second[j].seqpos - it->second[i].seqpos), it->first)); } } } } } } } } }

  // Collect split-read junctions from all input alignment files.
  //
  // For every reference sequence with a non-empty entry in validRegions, each
  // file in c.files is queried region by region. Records flagged QC-fail,
  // duplicate or unmapped, records with mapping quality below c.minMapQual and
  // records with a negative tid are skipped. The CIGAR of every remaining
  // record is walked and _insertJunction() is called for
  //   - deletions and insertions longer than c.minRefSep (once at the start of
  //     the event and once at its, possibly extended, end), and
  //   - soft or hard clips longer than c.minClip.
  // Junctions are keyed by hash_lr(rec). After scanning, each per-read junction
  // list in readBp is sorted with SortJunction.
  //
  // c            : configuration (files, genome, minMapQual, minRefSep, minClip, indelExtension)
  // validRegions : per-reference interval containers, indexed by reference id
  // readBp       : in/out container receiving the junctions
  //
  // NOTE(review): the results of sam_open / sam_index_load / sam_hdr_read are
  // used unchecked here; presumably the caller validated the inputs - confirm.
  template<typename TConfig, typename TValidRegion, typename TReadBp>
  inline void
  findJunctions(TConfig const& c, TValidRegion const& validRegions, TReadBp& readBp) {
    typedef typename TValidRegion::value_type TChrIntervals;

    // Open file handles
    typedef std::vector<samFile*> TSamFile;
    typedef std::vector<hts_idx_t*> TIndex;
    TSamFile samfile(c.files.size());
    TIndex idx(c.files.size());
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r");
      hts_set_fai_filename(samfile[file_c], c.genome.string().c_str());
      idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str());
    }
    // The header of the first file defines the reference sequences that are iterated
    bam_hdr_t* hdr = sam_hdr_read(samfile[0]);

    // Parse genome chr-by-chr
    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Split-read scanning" << std::endl;
    boost::progress_display show_progress( hdr->n_targets );

    // Iterate chromosomes
    for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) {
      ++show_progress;
      if (validRegions[refIndex].empty()) continue;

      // Collect reads from all samples
      for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
        // Read alignments
        for(typename TChrIntervals::const_iterator vRIt = validRegions[refIndex].begin(); vRIt != validRegions[refIndex].end(); ++vRIt) {
          hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, vRIt->lower(), vRIt->upper());
          bam1_t* rec = bam_init1();
          while (sam_itr_next(samfile[file_c], iter, rec) >= 0) {
            // Keep secondary alignments
            if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | BAM_FUNMAP)) continue;
            if ((rec->core.qual < c.minMapQual) || (rec->core.tid<0)) continue;

            std::size_t seed = hash_lr(rec);
            //std::cerr << bam_get_qname(rec) << '\t' << seed << std::endl;
            uint32_t rp = rec->core.pos; // reference pointer
            uint32_t sp = 0; // sequence pointer

            // Parse the CIGAR
            uint32_t* cigar = bam_get_cigar(rec);
            for (std::size_t i = 0; i < rec->core.n_cigar; ++i) {
              if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF)) {
                sp += bam_cigar_oplen(cigar[i]);
                rp += bam_cigar_oplen(cigar[i]);
              } else if (bam_cigar_op(cigar[i]) == BAM_CDEL) {
                if (bam_cigar_oplen(cigar[i]) > c.minRefSep) _insertJunction(readBp, seed, rec, rp, sp, false);
                rp += bam_cigar_oplen(cigar[i]);
                if (bam_cigar_oplen(cigar[i]) > c.minRefSep) { // Try look-ahead
                  // Merge following large deletions into this one as long as the
                  // aligned sequence in between stays below c.indelExtension
                  // relative to the deleted reference span.
                  uint32_t spOrig = sp;
                  uint32_t rpTmp = rp;
                  uint32_t spTmp = sp;
                  uint32_t dlen = bam_cigar_oplen(cigar[i]);
                  for (std::size_t j = i + 1; j < rec->core.n_cigar; ++j) {
                    if ((bam_cigar_op(cigar[j]) == BAM_CMATCH) || (bam_cigar_op(cigar[j]) == BAM_CEQUAL) || (bam_cigar_op(cigar[j]) == BAM_CDIFF)) {
                      spTmp += bam_cigar_oplen(cigar[j]);
                      rpTmp += bam_cigar_oplen(cigar[j]);
                      if ((double) (spTmp - sp) / (double) (dlen + (rpTmp - rp)) > c.indelExtension) break;
                    } else if (bam_cigar_op(cigar[j]) == BAM_CDEL) {
                      rpTmp += bam_cigar_oplen(cigar[j]);
                      if (bam_cigar_oplen(cigar[j]) > c.minRefSep) {
                        // Extend deletion
                        dlen += (rpTmp - rp);
                        rp = rpTmp;
                        sp = spTmp;
                        i = j; // the outer loop resumes after the absorbed operations
                      }
                    } else if (bam_cigar_op(cigar[j]) == BAM_CINS) {
                      if (bam_cigar_oplen(cigar[j]) > c.minRefSep) break; // No extension
                      spTmp += bam_cigar_oplen(cigar[j]);
                    } else break; // No extension
                  }
                  _insertJunction(readBp, seed, rec, rp, spOrig, true);
                }
              } else if (bam_cigar_op(cigar[i]) == BAM_CINS) {
                if (bam_cigar_oplen(cigar[i]) > c.minRefSep) _insertJunction(readBp, seed, rec, rp, sp, false);
                sp += bam_cigar_oplen(cigar[i]);
                if (bam_cigar_oplen(cigar[i]) > c.minRefSep) { // Try look-ahead
                  // Mirror image of the deletion case: absorb following large insertions
                  uint32_t rpOrig = rp;
                  uint32_t rpTmp = rp;
                  uint32_t spTmp = sp;
                  uint32_t ilen = bam_cigar_oplen(cigar[i]);
                  for (std::size_t j = i + 1; j < rec->core.n_cigar; ++j) {
                    if ((bam_cigar_op(cigar[j]) == BAM_CMATCH) || (bam_cigar_op(cigar[j]) == BAM_CEQUAL) || (bam_cigar_op(cigar[j]) == BAM_CDIFF)) {
                      spTmp += bam_cigar_oplen(cigar[j]);
                      rpTmp += bam_cigar_oplen(cigar[j]);
                      if ((double) (rpTmp - rp) / (double) (ilen + (spTmp - sp)) > c.indelExtension) break;
                    } else if (bam_cigar_op(cigar[j]) == BAM_CDEL) {
                      if (bam_cigar_oplen(cigar[j]) > c.minRefSep) break; // No extension
                      rpTmp += bam_cigar_oplen(cigar[j]);
                    } else if (bam_cigar_op(cigar[j]) == BAM_CINS) {
                      spTmp += bam_cigar_oplen(cigar[j]);
                      if (bam_cigar_oplen(cigar[j]) > c.minRefSep) {
                        // Extend insertion
                        ilen += (spTmp - sp);
                        rp = rpTmp;
                        sp = spTmp;
                        i = j; // the outer loop resumes after the absorbed operations
                      }
                    } else {
                      break; // No extension
                    }
                  }
                  _insertJunction(readBp, seed, rec, rpOrig, sp, true);
                }
              } else if (bam_cigar_op(cigar[i]) == BAM_CREF_SKIP) {
                rp += bam_cigar_oplen(cigar[i]);
              } else if ((bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) || (bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP)) {
                int32_t finalsp = sp;
                bool scleft = false;
                if (sp == 0) {
                  finalsp += bam_cigar_oplen(cigar[i]); // Leading soft-clip / hard-clip
                  scleft = true;
                }
                sp += bam_cigar_oplen(cigar[i]);
                //std::cerr << bam_get_qname(rec) << ',' << rp << ',' << finalsp << ',' << scleft << std::endl;
                if (bam_cigar_oplen(cigar[i]) > c.minClip) _insertJunction(readBp, seed, rec, rp, finalsp, scleft);
              } else {
                std::cerr << "Unknown Cigar options" << std::endl;
              }
            }
          }
          bam_destroy1(rec);
          hts_itr_destroy(iter);
        }
      }
    }

    // Sort junctions
    for(typename TReadBp::iterator it = readBp.begin(); it != readBp.end(); ++it) {
      std::sort(it->second.begin(), it->second.end(), SortJunction());
    }

    // Clean-up
    bam_hdr_destroy(hdr);
    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      hts_idx_destroy(idx[file_c]);
      sam_close(samfile[file_c]);
    }
  }

  // Run the per-type junction selectors on readBp and append the resulting
  // split-read records to br. When c.svtcmd is false every selector runs;
  // otherwise a selector runs only if one of its SV type codes is in c.svtset
  // (2: deletions, 3: duplications, 0/1: inversions, 4: insertions,
  // 5-8: translocations).
  template<typename TConfig, typename TReadBp>
  inline void
  fetchSVs(TConfig const& c, TReadBp& readBp, std::vector<std::vector<SRBamRecord> >& br) {
    // Extract BAM records
    if ((!c.svtcmd) || (c.svtset.find(2) != c.svtset.end())) selectDeletions(c, readBp, br);
    if ((!c.svtcmd) || (c.svtset.find(3) != c.svtset.end())) selectDuplications(c, readBp, br);
    if ((!c.svtcmd) || (c.svtset.find(0) != c.svtset.end()) || (c.svtset.find(1) != c.svtset.end())) selectInversions(c, readBp, br);
    if ((!c.svtcmd) || (c.svtset.find(4) != c.svtset.end())) selectInsertions(c, readBp, br);
    if ((!c.svtcmd) || (c.svtset.find(5) != c.svtset.end()) || (c.svtset.find(6) != c.svtset.end()) || (c.svtset.find(7) != c.svtset.end()) || (c.svtset.find(8) != c.svtset.end())) selectTranslocations(c, readBp, br);
  }

  // Collect junctions for all reads (findJunctions) and turn them into
  // per-SV-type split-read records in srBR (fetchSVs).
  template<typename TConfig, typename TValidRegions, typename TSvtSRBamRecord>
  inline void
  _findSRBreakpoints(TConfig const& c, TValidRegions const& validRegions, TSvtSRBamRecord& srBR) {
    // Breakpoints
    typedef std::vector<Junction> TJunctionVector;
    typedef std::map<std::size_t, TJunctionVector> TReadBp;
    TReadBp readBp;
    findJunctions(c, validRegions, readBp);
    fetchSVs(c, readBp, srBR);
  }

  // Find split-read breakpoints, cluster them per SV type into svc and record,
  // for every read that was assigned to an SV (svid != -1), a SeqSlice in
  // srStore keyed by the read id.
  template<typename TConfig, typename TValidRegions, typename TSVs, typename TSRStore>
  inline void
  _clusterSRReads(TConfig const& c, TValidRegions const& validRegions, TSVs& svc, TSRStore& srStore) {
    // Split-reads
    typedef std::vector<SRBamRecord> TSRBamRecord;
    typedef std::vector<TSRBamRecord> TSvtSRBamRecord;
    TSvtSRBamRecord srBR(2 * DELLY_SVT_TRANS, TSRBamRecord());
    _findSRBreakpoints(c, validRegions, srBR);

    // Debug
    //outputSRBamRecords(c, srBR);

    // Cluster BAM records
    for(uint32_t svt = 0; svt < srBR.size(); ++svt) {
      if (srBR[svt].empty()) continue;

      // Sort
      std::sort(srBR[svt].begin(), srBR[svt].end(), SortSRBamRecord());

      // Cluster
      cluster(c, srBR[svt], svc, c.maxReadSep, svt);

      // Debug
      //outputStructuralVariants(c, svc, srBR, svt);

      // Track split-reads
      for(uint32_t i = 0; i < srBR[svt].size(); ++i) {
        // Read assigned?
        if (srBR[svt][i].svid != -1) {
          // operator[] creates the (empty) per-read vector on first use
          srStore[srBR[svt][i].id].push_back(SeqSlice(srBR[svt][i].svid, srBR[svt][i].sstart, srBR[svt][i].inslen, srBR[svt][i].qual));
        }
      }
    }
  }

  // Debug helper: dump all split-read records as a tab-separated table to
  // std::cerr. Reference names are taken from the header of c.files[0].
  template<typename TConfig>
  inline void
  outputSRBamRecords(TConfig const& c, std::vector<std::vector<SRBamRecord> > const& br) {
    samFile* samfile = sam_open(c.files[0].string().c_str(), "r");
    hts_set_fai_filename(samfile, c.genome.string().c_str());
    bam_hdr_t* hdr = sam_hdr_read(samfile);

    // Header
    std::cerr << "id\tchr1\tpos1\tchr2\tpos2\tsvtype\tct\tinslen" << std::endl;

    // SVs
    for(uint32_t svt = 0; svt < br.size(); ++svt) {
      for(uint32_t i = 0; i < br[svt].size(); ++i) {
        std::cerr << br[svt][i].id << '\t' << hdr->target_name[br[svt][i].chr] << '\t' << br[svt][i].pos << '\t' << hdr->target_name[br[svt][i].chr2] << '\t' << br[svt][i].pos2 << '\t' << _addID(svt) << '\t' << _addOrientation(svt) << '\t' << br[svt][i].inslen << std::endl;
      }
    }

    // Clean-up
    bam_hdr_destroy(hdr);
    sam_close(samfile);
  }

template inline void
outputStructuralVariants(TConfig const& c, std::vector const& svs, TSvtSRBamRecord const& srBR, int32_t const svt) { // Header std::cerr << "chr1\tpos1\tchr2\tpos2\tsvtype\tct\tpeSupport\tsrSupport" << std::endl; // Hash reads typedef std::map THashMap; THashMap hm; typedef std::vector TSamFile; typedef std::vector TIndex; TSamFile samfile(c.files.size()); TIndex idx(c.files.size()); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r"); hts_set_fai_filename(samfile[file_c], c.genome.string().c_str()); idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str()); } bam_hdr_t* hdr = sam_hdr_read(samfile[0]); for(int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, 0, hdr->target_len[refIndex]); bam1_t* rec = bam_init1(); while (sam_itr_next(samfile[file_c], iter, rec) >= 0) { if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | BAM_FUNMAP)) continue; std::size_t seed = hash_lr(rec); std::string qname = bam_get_qname(rec); if (hm.find(seed) == hm.end()) hm.insert(std::make_pair(seed, qname)); else { if (hm[seed] != qname) { std::cerr << "Warning: Hash collision! 
" << seed << ',' << hm[seed] << ',' << qname << std::endl; } } } bam_destroy1(rec); hts_itr_destroy(iter); } } // Track split-reads typedef std::vector TReadNameVector; typedef std::vector TSVReadNames; TSVReadNames svReadNames(svs.size(), TReadNameVector()); for(uint32_t i = 0; i < srBR[svt].size(); ++i) { if (srBR[svt][i].svid != -1) { svReadNames[srBR[svt][i].svid].push_back(hm[srBR[svt][i].id]); } } // SVs for(uint32_t i = 0; i < svs.size(); ++i) { if (svs[i].svt != svt) continue; std::cerr << hdr->target_name[svs[i].chr] << '\t' << svs[i].svStart << '\t' << hdr->target_name[svs[i].chr2] << '\t' << svs[i].svEnd << '\t' << _addID(svs[i].svt) << '\t' << _addOrientation(svs[i].svt) << '\t' << svs[i].peSupport << '\t' << svs[i].srSupport << '\t'; for(uint32_t k = 0; k < svReadNames[svs[i].id].size(); ++k) std::cerr << svReadNames[svs[i].id][k] << ','; std::cerr << std::endl; } // Clean-up bam_hdr_destroy(hdr); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { hts_idx_destroy(idx[file_c]); sam_close(samfile[file_c]); } } template inline void outputStructuralVariants(TConfig const& c, std::vector const& svs, int32_t const svt) { samFile* samfile = sam_open(c.files[0].string().c_str(), "r"); hts_set_fai_filename(samfile, c.genome.string().c_str()); bam_hdr_t* hdr = sam_hdr_read(samfile); // Header std::cerr << "chr1\tpos1\tchr2\tpos2\tsvtype\tct\tpeSupport\tsrSupport\tconsensus" << std::endl; // SVs for(uint32_t i = 0; i < svs.size(); ++i) { if (svs[i].svt != svt) continue; std::cerr << hdr->target_name[svs[i].chr] << '\t' << svs[i].svStart << '\t' << hdr->target_name[svs[i].chr2] << '\t' << svs[i].svEnd << '\t' << _addID(svs[i].svt) << '\t' << _addOrientation(svs[i].svt) << '\t' << svs[i].peSupport << '\t' << svs[i].srSupport << '\t' << svs[i].consensus << std::endl; } // Clean-up bam_hdr_destroy(hdr); sam_close(samfile); } template inline void outputStructuralVariants(TConfig const& c, std::vector const& svs) { for(uint32_t i = 0; i <= 8; ++i) 
outputStructuralVariants(c, svs, i); } } #endif delly-0.9.1/src/merge.h000066400000000000000000001246771414764127700147270ustar00rootroot00000000000000#ifndef MERGE_H #define MERGE_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "tags.h" #include "version.h" #include "util.h" #include "modvcf.h" namespace torali {

  // Options of the merge step.
  struct MergeConfig {
    bool filterForPass;      // keep only records whose FILTER is PASS
    bool filterForPrecise;   // keep only records carrying the PRECISE flag
    bool cnvMode;
    uint32_t chunksize;
    uint32_t svcounter;      // running number used to build output record ids
    uint32_t bpoffset;       // maximum start/end distance for two records to be compared
    uint32_t minsize;        // minimum SV size (insertion length for insertions)
    uint32_t maxsize;        // maximum SV size (insertion length for insertions)
    uint32_t coverage;       // minimum supporting coverage
    float recoverlap;        // minimum reciprocal overlap
    float vaf;               // minimum variant allele frequency
    boost::filesystem::path outfile;
    std::vector<boost::filesystem::path> files;
  };

  // An SV interval [start, end] on one contig together with its score.
  struct IntervalScore {
    uint32_t start;
    uint32_t end;
    int32_t score;

    IntervalScore(uint32_t s, uint32_t e, int32_t c) : start(s), end(e), score(c) {}
  };

  // Strict weak ordering of interval records by (start, end).
  // Deliberately not derived from std::binary_function, which was removed in
  // C++17; the standard algorithms do not need its nested typedefs.
  template<typename TRecord>
  struct SortIScores
  {
    inline bool operator()(TRecord const& s1, TRecord const& s2) const {
      return ((s1.start < s2.start) || ((s1.start == s2.start) && (s1.end < s2.end)));
    }
  };

  // Reciprocal overlap of the intervals [s1, e1] and [s2, e2]: the length of
  // their intersection divided by the length of the longer interval.
  // Returns 0 for disjoint, empty or inverted intervals. All range checks are
  // done before any subtraction so that unsigned position types cannot wrap.
  template<typename TPos>
  double recOverlap(TPos const s1, TPos const e1, TPos const s2, TPos const e2) {
    if ((e1 < s2) || (s1 > e2)) return 0;
    if ((e1 <= s1) || (e2 <= s2)) return 0;
    TPos const lo = std::max(s1, s2);
    TPos const hi = std::min(e1, e2);
    if (hi <= lo) return 0;
    double const lenA = (double) (e1 - s1);
    double const lenB = (double) (e2 - s2);
    double const overlapLen = (double) (hi - lo);
    return (overlapLen / std::max(lenA, lenB));
  }

  // Read every input VCF/BCF file and append one IntervalScore per accepted
  // record of SV type svtin to iScore[tid], where tid = cMap[contig name].
  //
  // A record is accepted if it passes, in this order: the optional PASS filter,
  // the SV type match, the size window [c.minsize, c.maxsize] (insertion length
  // for type 4, END - POS otherwise), the optional PRECISE filter and, when
  // c.vaf > 0 or c.coverage > 0, the per-sample VAF / coverage filter (not
  // enforced for type 9). For insertions the stored end is start + INSLEN so
  // that reciprocal overlap can be computed later.
  //
  // If the FORMAT fields needed for the VAF / coverage filter (GT plus RR/RV
  // for precise or DR/DV for imprecise records) are missing or shorter than
  // expected, no sample is evaluated, so the record fails any positive
  // threshold instead of reading through an unset pointer.
  template<typename TGenomeIntervals, typename TContigMap>
  void _fillIntervalMap(MergeConfig const& c, TGenomeIntervals& iScore, TContigMap& cMap, int32_t const svtin) {
    typedef typename TGenomeIntervals::value_type TIntervalScores;
    typedef typename TIntervalScores::value_type IntervalScore;

    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Reading input VCF/BCF files" << std::endl;
    boost::progress_display show_progress( c.files.size() );

    for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) {
      ++show_progress;
      htsFile* ifile = bcf_open(c.files[file_c].string().c_str(), "r");
      bcf_hdr_t* hdr = bcf_hdr_read(ifile);
      bcf1_t* rec = bcf_init();

      // Buffers that htslib (re)allocates on demand; released after the file is done
      int32_t nsvend = 0;
      int32_t* svend = NULL;
      int32_t ninslen = 0;
      int32_t* inslen = NULL;
      int32_t nct = 0;
      char* ct = NULL;
      int32_t nsvt = 0;
      char* svt = NULL;
      while (bcf_read(ifile, hdr, rec) == 0) {
        bcf_unpack(rec, BCF_UN_INFO);

        // Check PASS
        bool pass = true;
        if (c.filterForPass) pass = (bcf_has_filter(hdr, rec, const_cast<char*>("PASS"))==1);
        if (!pass) continue;

        // Correct SV type
        int32_t recsvt = -1;
        if (bcf_get_info_string(hdr, rec, "SVTYPE", &svt, &nsvt) > 0) {
          if (bcf_get_info_string(hdr, rec, "CT", &ct, &nct) > 0) recsvt = _decodeOrientation(std::string(ct), std::string(svt));
          else recsvt = _decodeOrientation(std::string("NA"), std::string(svt));
        }
        if (recsvt != svtin) continue;

        // Correct size?
        std::string chrName(bcf_hdr_id2name(hdr, rec->rid));
        uint32_t tid = cMap[chrName];
        uint32_t svStart = rec->pos;
        uint32_t svEnd = rec->pos + 2;
        if (bcf_get_info_int32(hdr, rec, "END", &svend, &nsvend) > 0) svEnd = *svend;
        if (recsvt == 4) {
          // Insertion
          uint32_t inslenVal = 0;
          if (bcf_get_info_int32(hdr, rec, "INSLEN", &inslen, &ninslen) > 0) inslenVal = *inslen;
          if ((inslenVal < c.minsize) || (inslenVal > c.maxsize)) continue;
          svEnd = svStart + inslenVal; // To enable reciprocal overlap
        } else {
          // Other intra-chr SV
          if ((svEnd - svStart < c.minsize) || (svEnd - svStart > c.maxsize)) continue;
        }

        // Precise?
        bool precise = false;
        if (bcf_get_info_flag(hdr, rec, "PRECISE", 0, 0) > 0) precise=true;
        if ((c.filterForPrecise) && (!precise)) continue;

        // Variant allele frequency filter
        if ((c.vaf > 0) || (c.coverage > 0)) {
          float maxvaf = 0;
          uint32_t maxcov = 0;
          bcf_unpack(rec, BCF_UN_ALL);
          int ndv = 0;
          int32_t* dv = NULL;
          int ndr = 0;
          int32_t* dr = NULL;
          int nrv = 0;
          int32_t* rv = NULL;
          int nrr = 0;
          int32_t* rr = NULL;
          int ngt = 0;
          int32_t* gt = NULL;
          int const dvRet = bcf_get_format_int32(hdr, rec, "DV", &dv, &ndv);
          int const drRet = bcf_get_format_int32(hdr, rec, "DR", &dr, &ndr);
          int const rvRet = bcf_get_format_int32(hdr, rec, "RV", &rv, &nrv);
          int const rrRet = bcf_get_format_int32(hdr, rec, "RR", &rr, &nrr);
          int const gtRet = bcf_get_format_int32(hdr, rec, "GT", &gt, &ngt);

          // The loop below reads two GT values and one count per sample; only
          // run it when htslib actually returned that many values.
          int32_t const nsamples = bcf_hdr_nsamples(hdr);
          bool const haveGt = (gtRet >= 2 * nsamples);
          bool const haveCounts = precise ? ((rrRet >= nsamples) && (rvRet >= nsamples)) : ((drRet >= nsamples) && (dvRet >= nsamples));
          if (haveGt && haveCounts) {
            for(int32_t i = 0; i < nsamples; ++i) {
              if ((bcf_gt_allele(gt[i*2]) != -1) && (bcf_gt_allele(gt[i*2 + 1]) != -1)) {
                uint32_t supportsum = 0;
                if (precise) supportsum = rr[i] + rv[i];
                else supportsum = dr[i] + dv[i];
                if (supportsum > 0) {
                  double vaf = 0;
                  if (precise) vaf = (double) rv[i] / (double) supportsum;
                  else vaf = (double) dv[i] / (double) supportsum;
                  if (vaf > maxvaf) maxvaf = vaf;
                  if (supportsum > maxcov) maxcov = supportsum;
                }
              }
            }
          }
          // Debug
          //std::cerr << maxcov << '\t' << maxvaf << std::endl;
          if (dv != NULL) free(dv);
          if (dr != NULL) free(dr);
          if (rv != NULL) free(rv);
          if (rr != NULL) free(rr);
          if (gt != NULL) free(gt);
          if (recsvt != 9) {
            if ((maxvaf < c.vaf) || (maxcov < c.coverage)) continue;
          }
        }

        // Store the interval
        //std::cerr << tid << ',' << svStart << ',' << svEnd << ',' << rec->qual << std::endl;
        iScore[tid].push_back(IntervalScore(svStart, svEnd, rec->qual));
      }
      if (svend != NULL) free(svend);
      if (inslen != NULL) free(inslen);
      if (ct != NULL) free(ct);
      if (svt != NULL) free(svt);
      bcf_hdr_destroy(hdr);
      bcf_close(ifile);
      bcf_destroy(rec);
    }
  }

  // Remove redundant intervals per contig and copy the survivors to iSelected.
  //
  // Each interval is compared with the following intervals of the same contig
  // until the start distance exceeds c.bpoffset. Two intervals are treated as
  // the same event when their ends differ by less than c.bpoffset and either
  // svtin is a translocation type or their reciprocal overlap is at least
  // c.recoverlap. Of such a pair the lower-scoring interval is dropped; on
  // equal scores the later one is dropped when the earlier one starts or ends
  // first, otherwise the earlier one is dropped.
  // NOTE(review): the early break on the start distance presumably relies on
  // each contig's intervals being sorted by start - confirm against the caller.
  template<typename TGenomeIntervals>
  void _processIntervalMap(MergeConfig const& c, TGenomeIntervals const& iScore, TGenomeIntervals& iSelected, int32_t const svtin) {
    typedef typename TGenomeIntervals::value_type TIntervalScores;
    typedef typename TIntervalScores::value_type IntervalScore;

    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Merging SVs" << std::endl;
    boost::progress_display show_progress( iScore.size() );

    unsigned int seqId = 0;
    for(typename TGenomeIntervals::const_iterator iG = iScore.begin(); iG != iScore.end(); ++iG, ++seqId) {
      ++show_progress;
      // One keep/drop flag per interval, advanced in lock-step with the interval iterators
      typedef std::vector<bool> TIntervalSelector;
      TIntervalSelector keepInterval;
      keepInterval.resize(iG->size(), true);
      typename TIntervalSelector::iterator iK = keepInterval.begin();
      for(typename TIntervalScores::const_iterator iS = iG->begin(); iS != iG->end(); ++iS, ++iK) {
        typename TIntervalScores::const_iterator iSNext = iS;
        typename TIntervalSelector::iterator iKNext = iK;
        ++iSNext; ++iKNext;
        for(; iSNext != iG->end(); ++iSNext, ++iKNext) {
          if (iSNext->start - iS->start > c.bpoffset) break;
          else {
            if (((iSNext->end > iS->end) && (iSNext->end - iS->end < c.bpoffset)) || ((iSNext->end <= iS->end) &&(iS->end - iSNext->end < c.bpoffset))) {
              if ((_translocation(svtin)) || (recOverlap(iS->start, iS->end, iSNext->start, iSNext->end) >= c.recoverlap)) {
                if (iS->score < iSNext->score) *iK = false;
                else if (iSNext ->score < iS->score) *iKNext = false;
                else {
                  if (iS->start < iSNext->start) *iKNext = false;
                  else if (iS->end < iSNext->end) *iKNext = false;
                  else *iK = false;
                }
              }
            }
          }
        }
        if (*iK) iSelected[seqId].push_back(IntervalScore(iS->start, iS->end, iS->score));
      }
    }
  }

template void _outputSelectedIntervals(MergeConfig& c, TGenomeIntervals const& iSelected, TContigMap& cMap, int32_t const svtin) { typedef typename TGenomeIntervals::value_type TIntervalScores; typedef typename TIntervalScores::value_type IntervalScore; boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Filtering SVs" << std::endl; // Open output VCF file htsFile *fp = 
hts_open(c.outfile.string().c_str(), "wb"); bcf_hdr_t *hdr_out = bcf_hdr_init("w"); // Write VCF header boost::gregorian::date today = now.date(); std::string datestr("##fileDate="); datestr += boost::gregorian::to_iso_string(today); bcf_hdr_append(hdr_out, datestr.c_str()); bcf_hdr_append(hdr_out, "##ALT="); bcf_hdr_append(hdr_out, "##ALT="); bcf_hdr_append(hdr_out, "##ALT="); bcf_hdr_append(hdr_out, "##ALT="); bcf_hdr_append(hdr_out, "##ALT="); bcf_hdr_append(hdr_out, "##FILTER="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); // Add reference contigs uint32_t numseq = 0; typedef std::map TReverseMap; TReverseMap rMap; for(typename TContigMap::iterator cIt = cMap.begin(); cIt != cMap.end(); ++cIt, ++numseq) rMap[cIt->second] = cIt->first; for(typename TReverseMap::iterator rIt = rMap.begin(); rIt != rMap.end(); ++rIt) { std::string refname("##contig=second + ">"; bcf_hdr_append(hdr_out, refname.c_str()); } bcf_hdr_add_sample(hdr_out, NULL); if (bcf_hdr_write(fp, hdr_out) != 0) std::cerr << "Error: Failed to write BCF header!" 
<< std::endl; // Duplicate filter (identical start, end, score values) typedef std::pair TStartEnd; typedef std::set TIntervalSet; typedef std::vector TGenomicIntervalSet; TGenomicIntervalSet gis(numseq); // Parse input VCF files bcf1_t *rout = bcf_init(); typedef std::vector THtsFile; typedef std::vector TBcfHeader; typedef std::vector TBcfRecord; typedef std::vector TEof; THtsFile ifile(c.files.size()); TBcfHeader hdr(c.files.size()); TBcfRecord rec(c.files.size()); TEof eof(c.files.size()); uint32_t allEOF = 0; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { ifile[file_c] = bcf_open(c.files[file_c].string().c_str(), "r"); hdr[file_c] = bcf_hdr_read(ifile[file_c]); if (bcf_hdr_set_samples(hdr[file_c], NULL, false) != 0) std::cerr << "Error: Failed to set sample information!" << std::endl; rec[file_c] = bcf_init(); if (bcf_read(ifile[file_c], hdr[file_c], rec[file_c]) == 0) { bcf_unpack(rec[file_c], BCF_UN_INFO); eof[file_c] = false; } else { ++allEOF; eof[file_c] = true; } } int32_t nsvend = 0; int32_t* svend = NULL; int32_t npe = 0; int32_t* pe = NULL; int32_t nsr = 0; int32_t* sr = NULL; int32_t ninslen = 0; int32_t* inslen = NULL; int32_t npos2 = 0; int32_t* pos2 = NULL; int32_t nhomlen = 0; int32_t* homlen = NULL; int32_t nmapq = 0; int32_t* mapq = NULL; int32_t nsrmapq = 0; int32_t* srmapq = NULL; int32_t nsrq = 0; float* srq = NULL; int32_t nct = 0; char* ct = NULL; int32_t nsvt = 0; char* svt = NULL; int32_t nchr2 = 0; char* chr2 = NULL; int32_t ncipos = 0; int32_t* cipos = NULL; int32_t nciend = 0; int32_t* ciend = NULL; int32_t nce = 0; float* ce = NULL; int32_t ncons = 0; char* cons = NULL; while (allEOF < c.files.size()) { // Find next sorted record int32_t idx = -1; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { if (!eof[file_c]) { if ((idx < 0) || (rec[idx]->rid > rec[file_c]->rid) || ((rec[idx]->rid == rec[file_c]->rid) && (rec[idx]->pos > rec[file_c]->pos))) idx = file_c; } } // Correct SV type int32_t recsvt = 
-1; if ((bcf_get_info_string(hdr[idx], rec[idx], "SVTYPE", &svt, &nsvt) > 0) && (bcf_get_info_string(hdr[idx], rec[idx], "CT", &ct, &nct) > 0)) recsvt = _decodeOrientation(std::string(ct), std::string(svt)); if (recsvt == svtin) { // Check PASS bool pass = true; if (c.filterForPass) pass = (bcf_has_filter(hdr[idx], rec[idx], const_cast("PASS"))==1); // Check PRECISE bool precise = false; bool passPrecise = true; if (bcf_get_info_flag(hdr[idx], rec[idx], "PRECISE", 0, 0) > 0) precise=true; if ((c.filterForPrecise) && (!precise)) passPrecise = false; // Check PASS and precise if ((passPrecise) && (pass)) { // Correct size std::string chrName(bcf_hdr_id2name(hdr[idx], rec[idx]->rid)); uint32_t tid = cMap[chrName]; uint32_t svStart = rec[idx]->pos; uint32_t svEnd = svStart + 1; if (bcf_get_info_int32(hdr[idx], rec[idx], "END", &svend, &nsvend) > 0) svEnd = *svend; unsigned int inslenVal = 0; if (bcf_get_info_int32(hdr[idx], rec[idx], "INSLEN", &inslen, &ninslen) > 0) inslenVal = *inslen; if (recsvt == 4) svEnd = svStart + inslenVal; // To enable reciprocal overlap // Parse INFO fields if ((std::string(svt) == "BND") || ((std::string(svt) == "INS") && (inslenVal >= c.minsize) && (inslenVal <= c.maxsize)) || ((std::string(svt) != "BND") && (std::string(svt) != "INS") && (svEnd - svStart >= c.minsize) && (svEnd - svStart <= c.maxsize))) { unsigned int peSupport = 0; if (bcf_get_info_int32(hdr[idx], rec[idx], "PE", &pe, &npe) > 0) peSupport = *pe; unsigned int srSupport = 0; if (bcf_get_info_int32(hdr[idx], rec[idx], "SR", &sr, &nsr) > 0) srSupport = *sr; // Remove this line //if (srSupport > 0) precise = true; int32_t peMapQuality = 0; if (bcf_get_info_int32(hdr[idx], rec[idx], "MAPQ", &mapq, &nmapq) > 0) peMapQuality = *mapq; int32_t srMapQuality = 0; if (bcf_get_info_int32(hdr[idx], rec[idx], "SRMAPQ", &srmapq, &nsrmapq) > 0) srMapQuality = *srmapq; std::string chr2Name = chrName; int32_t pos2val = 0; if (bcf_get_info_string(hdr[idx], rec[idx], "CHR2", &chr2, &nchr2) > 
0) { chr2Name = std::string(chr2); if (bcf_get_info_int32(hdr[idx], rec[idx], "POS2", &pos2, &npos2) > 0) pos2val = *pos2; //mtid = cMap[chr2Name]; } int32_t score = rec[idx]->qual; // Is this a selected interval typename TIntervalScores::const_iterator iter = std::lower_bound(iSelected[tid].begin(), iSelected[tid].end(), IntervalScore(svStart, svEnd, score), SortIScores()); bool foundInterval = false; for(; (iter != iSelected[tid].end()) && (iter->start == svStart); ++iter) { if ((iter->start == svStart) && (iter->end == svEnd) && (iter->score == score)) { // Duplicate? if (gis[tid].find(std::make_pair(svStart, svEnd)) == gis[tid].end()) { foundInterval = true; gis[tid].insert(std::make_pair(svStart, svEnd)); } break; } } if (foundInterval) { // Fetch missing INFO fields unsigned int homlenVal = 0; if (bcf_get_info_int32(hdr[idx], rec[idx], "HOMLEN", &homlen, &nhomlen) > 0) homlenVal = *homlen; bcf_get_info_int32(hdr[idx], rec[idx], "CIPOS", &cipos, &ncipos); bcf_get_info_int32(hdr[idx], rec[idx], "CIEND", &ciend, &nciend); float srAlignQuality = 0; if (bcf_get_info_float(hdr[idx], rec[idx], "SRQ", &srq, &nsrq) > 0) srAlignQuality = *srq; std::string consensus; float ceVal = 0; if (precise) { if (bcf_get_info_float(hdr[idx], rec[idx], "CE", &ce, &nce) > 0) ceVal = *ce; if (bcf_get_info_string(hdr[idx], rec[idx], "CONSENSUS", &cons, &ncons) > 0) consensus = boost::to_upper_copy(std::string(cons)); } // Create new record rout->rid = bcf_hdr_name2id(hdr_out, chrName.c_str()); rout->pos = rec[idx]->pos; rout->qual = rec[idx]->qual; std::string id; if (c.files.size() == 1) id = std::string(rec[idx]->d.id); // Within one VCF file IDs are unique else { id += _addID(svtin); std::string padNumber = boost::lexical_cast(c.svcounter++); padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0'); id += padNumber; } bcf_update_id(hdr_out, rout, id.c_str()); std::string refAllele = rec[idx]->d.allele[0]; std::string altAllele = rec[idx]->d.allele[1]; std::string alleles = 
refAllele + "," + altAllele; bcf_update_alleles_str(hdr_out, rout, alleles.c_str()); int32_t tmppass = bcf_hdr_id2int(hdr_out, BCF_DT_ID, "PASS"); bcf_update_filter(hdr_out, rout, &tmppass, 1); // Add INFO fields if (precise) bcf_update_info_flag(hdr_out, rout, "PRECISE", NULL, 1); else bcf_update_info_flag(hdr_out, rout, "IMPRECISE", NULL, 1); bcf_update_info_string(hdr_out, rout, "SVTYPE", _addID(svtin).c_str()); std::string dellyVersion("EMBL.DELLYv"); dellyVersion += dellyVersionNumber; bcf_update_info_string(hdr_out,rout, "SVMETHOD", dellyVersion.c_str()); bcf_update_info_int32(hdr_out, rout, "END", &svEnd, 1); if (svtin >= DELLY_SVT_TRANS) { bcf_update_info_string(hdr_out,rout, "CHR2", chr2Name.c_str()); bcf_update_info_int32(hdr_out, rout, "POS2", &pos2val, 1); } if (svtin == 4) { bcf_update_info_int32(hdr_out, rout, "SVLEN", &inslenVal, 1); } bcf_update_info_int32(hdr_out, rout, "PE", &peSupport, 1); int32_t tmpi = peMapQuality; bcf_update_info_int32(hdr_out, rout, "MAPQ", &tmpi, 1); bcf_update_info_string(hdr_out, rout, "CT", _addOrientation(svtin).c_str()); bcf_update_info_int32(hdr_out, rout, "CIPOS", cipos, 2); bcf_update_info_int32(hdr_out, rout, "CIEND", ciend, 2); if (precise) { int32_t tmpi = srMapQuality; bcf_update_info_int32(hdr_out, rout, "SRMAPQ", &tmpi, 1); bcf_update_info_int32(hdr_out, rout, "INSLEN", &inslenVal, 1); bcf_update_info_int32(hdr_out, rout, "HOMLEN", &homlenVal, 1); bcf_update_info_int32(hdr_out, rout, "SR", &srSupport, 1); bcf_update_info_float(hdr_out, rout, "SRQ", &srAlignQuality, 1); if (consensus.size()) { bcf_update_info_string(hdr_out, rout, "CONSENSUS", consensus.c_str()); bcf_update_info_float(hdr_out, rout, "CE", &ceVal, 1); } } // Write record bcf_write1(fp, hdr_out, rout); bcf_clear1(rout); //std::cerr << bcf_hdr_id2name(hdr[idx], tid) << '\t' << svStart << '\t' << svEnd << std::endl; } } } } // Fetch next record if (bcf_read(ifile[idx], hdr[idx], rec[idx]) == 0) bcf_unpack(rec[idx], BCF_UN_INFO); else { ++allEOF; 
eof[idx] = true; } } if (svend != NULL) free(svend); if (pe != NULL) free(pe); if (sr != NULL) free(sr); if (homlen != NULL) free(homlen); if (inslen != NULL) free(inslen); if (pos2 != NULL) free(pos2); if (mapq != NULL) free(mapq); if (srmapq != NULL) free(srmapq); if (ct != NULL) free(ct); if (srq != NULL) free(srq); if (svt != NULL) free(svt); if (chr2 != NULL) free(chr2); if (cipos != NULL) free(cipos); if (ciend != NULL) free(ciend); if (ce != NULL) free(ce); if (cons != NULL) free(cons); // Clean-up for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { bcf_hdr_destroy(hdr[file_c]); bcf_close(ifile[file_c]); bcf_destroy(rec[file_c]); } // Close VCF file bcf_destroy(rout); bcf_hdr_destroy(hdr_out); hts_close(fp); // Build index bcf_index_build(c.outfile.string().c_str(), 14); } template void _outputSelectedIntervalsCNVs(MergeConfig& c, TGenomeIntervals const& iSelected, TContigMap& cMap) { typedef typename TGenomeIntervals::value_type TIntervalScores; typedef typename TIntervalScores::value_type IntervalScore; boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Filtering SVs" << std::endl; // Open output VCF file htsFile *fp = hts_open(c.outfile.string().c_str(), "wb"); bcf_hdr_t *hdr_out = bcf_hdr_init("w"); // Write VCF header boost::gregorian::date today = now.date(); std::string datestr("##fileDate="); datestr += boost::gregorian::to_iso_string(today); bcf_hdr_append(hdr_out, datestr.c_str()); bcf_hdr_append(hdr_out, "##ALT="); bcf_hdr_append(hdr_out, "##FILTER="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); bcf_hdr_append(hdr_out, "##INFO="); // Add reference contigs uint32_t numseq = 0; typedef std::map TReverseMap; TReverseMap rMap; for(typename TContigMap::iterator cIt = 
cMap.begin(); cIt != cMap.end(); ++cIt, ++numseq) rMap[cIt->second] = cIt->first; for(typename TReverseMap::iterator rIt = rMap.begin(); rIt != rMap.end(); ++rIt) { std::string refname("##contig=second + ">"; bcf_hdr_append(hdr_out, refname.c_str()); } bcf_hdr_add_sample(hdr_out, NULL); if (bcf_hdr_write(fp, hdr_out) != 0) std::cerr << "Error: Failed to write BCF header!" << std::endl; // Duplicate filter (identical start, end, score values) typedef std::pair TStartEnd; typedef std::set TIntervalSet; typedef std::vector TGenomicIntervalSet; TGenomicIntervalSet gis(numseq); // Parse input VCF files bcf1_t *rout = bcf_init(); typedef std::vector THtsFile; typedef std::vector TBcfHeader; typedef std::vector TBcfRecord; typedef std::vector TEof; THtsFile ifile(c.files.size()); TBcfHeader hdr(c.files.size()); TBcfRecord rec(c.files.size()); TEof eof(c.files.size()); uint32_t allEOF = 0; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { ifile[file_c] = bcf_open(c.files[file_c].string().c_str(), "r"); hdr[file_c] = bcf_hdr_read(ifile[file_c]); if (bcf_hdr_set_samples(hdr[file_c], NULL, false) != 0) std::cerr << "Error: Failed to set sample information!" 
<< std::endl; rec[file_c] = bcf_init(); if (bcf_read(ifile[file_c], hdr[file_c], rec[file_c]) == 0) { bcf_unpack(rec[file_c], BCF_UN_INFO); eof[file_c] = false; } else { ++allEOF; eof[file_c] = true; } } int32_t nsvend = 0; int32_t* svend = NULL; int32_t nmp = 0; float* mp = NULL; int32_t nsvt = 0; char* svt = NULL; int32_t ncipos = 0; int32_t* cipos = NULL; int32_t nciend = 0; int32_t* ciend = NULL; while (allEOF < c.files.size()) { // Find next sorted record int32_t idx = -1; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { if (!eof[file_c]) { if ((idx < 0) || (rec[idx]->rid > rec[file_c]->rid) || ((rec[idx]->rid == rec[file_c]->rid) && (rec[idx]->pos > rec[file_c]->pos))) idx = file_c; } } // Correct SV type int32_t recsvt = -1; if (bcf_get_info_string(hdr[idx], rec[idx], "SVTYPE", &svt, &nsvt) > 0) recsvt = _decodeOrientation(std::string("NA"), std::string(svt)); // CNV ? if (recsvt == 9) { // Check PASS bool pass = true; if (c.filterForPass) pass = (bcf_has_filter(hdr[idx], rec[idx], const_cast("PASS"))==1); // Check PRECISE bool precise = false; bool passPrecise = true; if (bcf_get_info_flag(hdr[idx], rec[idx], "PRECISE", 0, 0) > 0) precise=true; if ((c.filterForPrecise) && (!precise)) passPrecise = false; // Check PASS and precise if ((passPrecise) && (pass)) { // Correct size std::string chrName(bcf_hdr_id2name(hdr[idx], rec[idx]->rid)); uint32_t tid = cMap[chrName]; uint32_t svStart = rec[idx]->pos; uint32_t svEnd = svStart + 1; if (bcf_get_info_int32(hdr[idx], rec[idx], "END", &svend, &nsvend) > 0) svEnd = *svend; // Parse INFO fields if ((svEnd - svStart >= c.minsize) && (svEnd - svStart <= c.maxsize)) { int32_t score = rec[idx]->qual; // Is this a selected interval typename TIntervalScores::const_iterator iter = std::lower_bound(iSelected[tid].begin(), iSelected[tid].end(), IntervalScore(svStart, svEnd, score), SortIScores()); bool foundInterval = false; for(; (iter != iSelected[tid].end()) && (iter->start == svStart); ++iter) { if 
((iter->start == svStart) && (iter->end == svEnd) && (iter->score == score)) { // Duplicate? if (gis[tid].find(std::make_pair(svStart, svEnd)) == gis[tid].end()) { foundInterval = true; gis[tid].insert(std::make_pair(svStart, svEnd)); } break; } } if (foundInterval) { // Fetch missing INFO fields bcf_get_info_int32(hdr[idx], rec[idx], "CIPOS", &cipos, &ncipos); bcf_get_info_int32(hdr[idx], rec[idx], "CIEND", &ciend, &nciend); float mpval = 0; if (bcf_get_info_float(hdr[idx], rec[idx], "MP", &mp, &nmp) > 0) mpval = *mp; // Create new record rout->rid = bcf_hdr_name2id(hdr_out, chrName.c_str()); rout->pos = rec[idx]->pos; rout->qual = rec[idx]->qual; std::string id; if (c.files.size() == 1) id = std::string(rec[idx]->d.id); // Within one VCF file IDs are unique else { id += _addID(recsvt); std::string padNumber = boost::lexical_cast(c.svcounter++); padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0'); id += padNumber; } bcf_update_id(hdr_out, rout, id.c_str()); std::string refAllele = rec[idx]->d.allele[0]; std::string altAllele = rec[idx]->d.allele[1]; std::string alleles = refAllele + "," + altAllele; bcf_update_alleles_str(hdr_out, rout, alleles.c_str()); int32_t tmppass = bcf_hdr_id2int(hdr_out, BCF_DT_ID, "PASS"); bcf_update_filter(hdr_out, rout, &tmppass, 1); // Add INFO fields if (precise) bcf_update_info_flag(hdr_out, rout, "PRECISE", NULL, 1); else bcf_update_info_flag(hdr_out, rout, "IMPRECISE", NULL, 1); bcf_update_info_string(hdr_out, rout, "SVTYPE", _addID(recsvt).c_str()); std::string dellyVersion("EMBL.DELLYv"); dellyVersion += dellyVersionNumber; bcf_update_info_string(hdr_out,rout, "SVMETHOD", dellyVersion.c_str()); bcf_update_info_int32(hdr_out, rout, "END", &svEnd, 1); bcf_update_info_int32(hdr_out, rout, "CIPOS", cipos, 2); bcf_update_info_int32(hdr_out, rout, "CIEND", ciend, 2); bcf_update_info_float(hdr_out, rout, "MP", &mpval, 1); // Write record bcf_write1(fp, hdr_out, rout); bcf_clear1(rout); //std::cerr << 
bcf_hdr_id2name(hdr[idx], tid) << '\t' << svStart << '\t' << svEnd << std::endl; } } } } // Fetch next record if (bcf_read(ifile[idx], hdr[idx], rec[idx]) == 0) bcf_unpack(rec[idx], BCF_UN_INFO); else { ++allEOF; eof[idx] = true; } } if (svend != NULL) free(svend); if (mp != NULL) free(mp); if (svt != NULL) free(svt); if (cipos != NULL) free(cipos); if (ciend != NULL) free(ciend); // Clean-up for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { bcf_hdr_destroy(hdr[file_c]); bcf_close(ifile[file_c]); bcf_destroy(rec[file_c]); } // Close VCF file bcf_destroy(rout); bcf_hdr_destroy(hdr_out); hts_close(fp); // Build index bcf_index_build(c.outfile.string().c_str(), 14); } inline void mergeBCFs(MergeConfig& c, std::vector const& cts) { boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Merging SV types" << std::endl; boost::progress_display show_progress( 1 ); // Parse temporary input VCF files typedef std::vector THtsFile; typedef std::vector TBcfHeader; typedef std::vector TBcfRecord; typedef std::vector TEof; THtsFile ifile(cts.size()); TBcfHeader hdr(cts.size()); TBcfRecord rec(cts.size()); TEof eof(cts.size()); uint32_t allEOF = 0; for(unsigned int file_c = 0; file_c < cts.size(); ++file_c) { ifile[file_c] = bcf_open(cts[file_c].string().c_str(), "r"); hdr[file_c] = bcf_hdr_read(ifile[file_c]); rec[file_c] = bcf_init(); if (bcf_read(ifile[file_c], hdr[file_c], rec[file_c]) == 0) { bcf_unpack(rec[file_c], BCF_UN_INFO); eof[file_c] = false; } else { ++allEOF; eof[file_c] = true; } } // Open output VCF file htsFile *fp = hts_open(c.outfile.string().c_str(), "wb"); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr[0]); if (bcf_hdr_write(fp, hdr_out) != 0) std::cerr << "Error: Failed to write BCF header!" 
<< std::endl; // Merge files while (allEOF < cts.size()) { // Find next sorted record int32_t idx = -1; for(unsigned int file_c = 0; file_c < cts.size(); ++file_c) { if (!eof[file_c]) { if ((idx < 0) || (rec[idx]->rid > rec[file_c]->rid) || ((rec[idx]->rid == rec[file_c]->rid) && (rec[idx]->pos > rec[file_c]->pos))) idx = file_c; } } // Write record bcf_write1(fp, hdr_out, rec[idx]); // Fetch next record if (bcf_read(ifile[idx], hdr[idx], rec[idx]) == 0) bcf_unpack(rec[idx], BCF_UN_INFO); else { ++allEOF; eof[idx] = true; } } ++show_progress; // Clean-up for(unsigned int file_c = 0; file_c < cts.size(); ++file_c) { bcf_hdr_destroy(hdr[file_c]); bcf_close(ifile[file_c]); bcf_destroy(rec[file_c]); } // Close VCF file bcf_hdr_destroy(hdr_out); hts_close(fp); // Build index bcf_index_build(c.outfile.string().c_str(), 14); // End now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl; } inline int mergeRun(MergeConfig& c, int32_t const svt) { // All files may use a different set of chromosomes typedef std::map TContigMap; TContigMap contigMap; uint32_t numseq = 0; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { htsFile* ifile = bcf_open(c.files[file_c].string().c_str(), "r"); bcf_hdr_t* hdr = bcf_hdr_read(ifile); int nseq=0; const char** seqnames = bcf_hdr_seqnames(hdr, &nseq); for(int32_t i = 0; i TIntervalScores; typedef std::vector TGenomeIntervals; TGenomeIntervals iScore; iScore.resize(numseq, TIntervalScores()); _fillIntervalMap(c, iScore, contigMap, svt); for(uint32_t i = 0; i()); // Filter intervals TGenomeIntervals iSelected; iSelected.resize(numseq, TIntervalScores()); _processIntervalMap(c, iScore, iSelected, svt); iScore.clear(); for(uint32_t i = 0; i()); // Output best intervals if (svt == 9) _outputSelectedIntervalsCNVs(c, iSelected, contigMap); else _outputSelectedIntervals(c, iSelected, contigMap, svt); // End boost::posix_time::ptime now = 
boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl; return 0; } int merge(int argc, char **argv) { MergeConfig c; c.svcounter = 1; // Define generic options boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") ("outfile,o", boost::program_options::value(&c.outfile)->default_value("sv.bcf"), "Merged SV BCF output file") ("chunks,u", boost::program_options::value(&c.chunksize)->default_value(500), "max. chunk size to merge groups of BCF files") ("vaf,a", boost::program_options::value(&c.vaf)->default_value(0.15), "min. fractional ALT support") ("coverage,v", boost::program_options::value(&c.coverage)->default_value(10), "min. coverage") ("minsize,m", boost::program_options::value(&c.minsize)->default_value(0), "min. SV size") ("maxsize,n", boost::program_options::value(&c.maxsize)->default_value(1000000), "max. SV size") ("cnvmode,e", "Merge delly CNV files") ("precise,c", "Filter sites for PRECISE") ("pass,p", "Filter sites for PASS") ; // Define overlap options boost::program_options::options_description overlap("Overlap options"); overlap.add_options() ("bp-offset,b", boost::program_options::value(&c.bpoffset)->default_value(1000), "max. breakpoint offset") ("rec-overlap,r", boost::program_options::value(&c.recoverlap)->default_value(0.8), "min. 
reciprocal overlap") ; // Define hidden options boost::program_options::options_description hidden("Hidden options"); hidden.add_options() ("input-file", boost::program_options::value< std::vector >(&c.files), "input file") ; boost::program_options::positional_options_description pos_args; pos_args.add("input-file", -1); // Set the visibility boost::program_options::options_description cmdline_options; cmdline_options.add(generic).add(overlap).add(hidden); boost::program_options::options_description visible_options; visible_options.add(generic).add(overlap); boost::program_options::variables_map vm; boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); boost::program_options::notify(vm); // Check command line arguments if ((vm.count("help")) || (!vm.count("input-file"))) { std::cout << std::endl; std::cout << "Usage: delly " << argv[0] << " [OPTIONS] [ ... | ]" << std::endl; std::cout << visible_options << "\n"; return 0; } // Filter for PASS if (vm.count("pass")) c.filterForPass = true; else c.filterForPass = false; // Filter for PRECISE if (vm.count("precise")) c.filterForPrecise = true; else c.filterForPrecise = false; // Merge CNVs if (vm.count("cnvmode")) c.cnvMode = true; else c.cnvMode = false; // Check output files if (!_outfileValid(c.outfile)) return 1; if (!_outfileValid(boost::filesystem::path(c.outfile.string() + ".csi"))) return 1; // Show cmd boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] "; std::cout << "delly "; for(int i=0; i c.chunksize) { int32_t bestChunkSize = c.chunksize; int32_t bestBinSize = 0; for(uint32_t i = 50; i < c.chunksize; ++i) { int32_t chunks = ((c.files.size() - 1) / i); int32_t lastBin = c.files.size() - chunks * i; if (lastBin > bestBinSize) { bestBinSize = lastBin; bestChunkSize = i; } } c.chunksize = bestChunkSize; } // Run merging int32_t 
minSVT = 0; int32_t maxSVT = 9; if (c.cnvMode) { minSVT = 9; maxSVT = 10; } boost::filesystem::path oldPath = c.outfile; std::vector svtCollect(maxSVT); for(int32_t svt = minSVT; svt < maxSVT; ++svt) { boost::uuids::uuid uuid = boost::uuids::random_generator()(); std::string filename = "svt" + boost::lexical_cast(svt) + "_" + boost::lexical_cast(uuid) + ".bcf"; svtCollect[svt] = filename; if (c.files.size() <= c.chunksize) { // Merge in one go c.outfile = svtCollect[svt]; mergeRun(c, svt); } else { // Merge in chunks std::vector fileRestore = c.files; uint32_t chunks = ((c.files.size() - 1) / c.chunksize) + 1; std::vector chunkCollect(chunks); for(uint32_t ic = 0; ic < chunks; ++ic) { boost::uuids::uuid uuid = boost::uuids::random_generator()(); std::string chunkfile = "chunk" + boost::lexical_cast(ic) + "_" + boost::lexical_cast(uuid) + ".bcf"; chunkCollect[ic] = chunkfile; c.files.clear(); for(uint32_t k = ic * c.chunksize; ((k < ((ic+1) * c.chunksize)) && (k < fileRestore.size())); ++k) c.files.push_back(fileRestore[k]); c.outfile = chunkCollect[ic]; mergeRun(c, svt); } // Merge chunks c.files = chunkCollect; c.outfile = svtCollect[svt]; // Reset VAF and coverage because these are site lists! 
float vafStore = c.vaf; uint32_t coverageStore = c.coverage; c.vaf = 0; c.coverage = 0; mergeRun(c, svt); c.vaf = vafStore; c.coverage = coverageStore; // Clean-up for(uint32_t ic = 0; ic < chunks; ++ic) { boost::filesystem::remove(chunkCollect[ic]); boost::filesystem::remove(boost::filesystem::path(chunkCollect[ic].string() + ".csi")); } c.files = fileRestore; } } // Merge temporary files c.outfile = oldPath; if (c.cnvMode) { // Copy boost::filesystem::copy_file(svtCollect[9], c.outfile); boost::filesystem::copy_file(boost::filesystem::path(svtCollect[9].string() + ".csi"), boost::filesystem::path(c.outfile.string() + ".csi")); // Delete boost::filesystem::remove(svtCollect[9]); boost::filesystem::remove(boost::filesystem::path(svtCollect[9].string() + ".csi")); } else { mergeBCFs(c, svtCollect); for(int32_t svt = minSVT; svt < maxSVT; ++svt) { boost::filesystem::remove(svtCollect[svt]); boost::filesystem::remove(boost::filesystem::path(svtCollect[svt].string() + ".csi")); } } return 0; } } #endif delly-0.9.1/src/modvcf.h000066400000000000000000000732371414764127700151010ustar00rootroot00000000000000#ifndef MODVCF_H #define MODVCF_H #include #include #include #include "bolog.h" namespace torali { void _remove_info_tag(bcf_hdr_t* hdr, bcf1_t* rec, std::string const& tag) { bcf_update_info(hdr, rec, tag.c_str(), NULL, 0, BCF_HT_INT); // Type does not matter for n = 0 } void _remove_format_tag(bcf_hdr_t* hdr, bcf1_t* rec, std::string const& tag) { bcf_update_format(hdr, rec, tag.c_str(), NULL, 0, BCF_HT_INT); // Type does not matter for n = 0 } void _remove_info(bcf_hdr_t* hdr, bcf1_t* rec) { std::string tmp[] = {"CT", "PRECISE", "IMPRECISE", "SVTYPE", "SVMETHOD", "CIEND", "CIPOS", "CHR2", "POS2", "END", "PE", "MAPQ", "SRMAPQ", "SR", "SRQ", "CONSENSUS"}; std::set keepInfo(tmp, tmp + sizeof(tmp)/sizeof(tmp[0])); if (!(rec->unpacked & BCF_UN_INFO)) bcf_unpack(rec, BCF_UN_INFO); for (uint32_t i = 0; i < rec->n_info; ++i){ bcf_info_t* inf = &rec->d.info[i]; const char* 
key = bcf_hdr_int2id(hdr, BCF_DT_ID, inf->key); if (keepInfo.find(std::string(key)) != keepInfo.end()) continue; if (inf->vptr_free) { free(inf->vptr - inf->vptr_off); inf->vptr_free = 0; } rec->d.shared_dirty |= BCF1_DIRTY_INF; inf->vptr = NULL; } } void _remove_format(bcf_hdr_t* hdr, bcf1_t* rec) { if (!(rec->unpacked & BCF_UN_FMT)) bcf_unpack(rec, BCF_UN_FMT); for(uint32_t i = 0; in_fmt; ++i) { bcf_fmt_t* fmt = &rec->d.fmt[i]; const char* key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt->id); bcf_update_format(hdr, rec, key, NULL, 0, BCF_HT_INT); // the type is irrelevant for n = 0 // Keep GT //if ((key[0]=='G') && key[1]=='T' && (!key[2])) continue; if (fmt->p_free) { free(fmt->p - fmt->p_off); fmt->p_free = 0; } rec->d.indiv_dirty = 1; fmt->p = NULL; } } inline int _getInfoType(bcf_hdr_t const* hdr, std::string const& key) { return bcf_hdr_id2type(hdr, BCF_HL_INFO, bcf_hdr_id2int(hdr, BCF_DT_ID, key.c_str())); } inline int _getFormatType(bcf_hdr_t const* hdr, std::string const& key) { return bcf_hdr_id2type(hdr, BCF_HL_FMT, bcf_hdr_id2int(hdr, BCF_DT_ID, key.c_str())); } inline bool _missing(bool const value) { return !value; } inline bool _missing(float const value) { return bcf_float_is_missing(value); } inline bool _missing(int8_t const value) { return (value == bcf_int8_missing); } inline bool _missing(int16_t const value) { return (value == bcf_int16_missing); } inline bool _missing(int32_t const value) { return (value == bcf_int32_missing); } inline bool _missing(std::string const& value) { return ((value.empty()) || (value == ".")); } inline bool _isKeyPresent(bcf_hdr_t const* hdr, std::string const& key) { return (bcf_hdr_id2int(hdr, BCF_DT_ID, key.c_str())>=0); } inline bool _isDNA(std::string const& allele) { for(uint32_t i = 0; i out(alleles.size()); int32_t inTag = 0; for(uint32_t i = 0; i') || (alleles[i] == ']') || (alleles[i] == '[') || (alleles[i] == ',')) { out[i] = alleles[i]; if (alleles[i] == '<') inTag = 1; else if (alleles[i] == ']') inTag = 2; 
else if (alleles[i] == '[') inTag = 3; else if ((alleles[i] == '>') && (inTag == 1)) inTag = 0; else if ((alleles[i] == ']') && (inTag == 2)) inTag = 0; else if ((alleles[i] == '[') && (inTag == 3)) inTag = 0; } else { // Replace IUPAC if ((alleles[i] == 'U') || (alleles[i] == 'u')) out[i] = 'T'; else if ((alleles[i] == 'R') || (alleles[i] == 'r')) out[i] = 'A'; else if ((alleles[i] == 'Y') || (alleles[i] == 'y')) out[i] = 'C'; else if ((alleles[i] == 'S') || (alleles[i] == 's')) out[i] = 'C'; else if ((alleles[i] == 'W') || (alleles[i] == 'w')) out[i] = 'A'; else if ((alleles[i] == 'K') || (alleles[i] == 'k')) out[i] = 'G'; else if ((alleles[i] == 'M') || (alleles[i] == 'm')) out[i] = 'A'; else if ((alleles[i] == 'B') || (alleles[i] == 'b')) out[i] = 'C'; else if ((alleles[i] == 'D') || (alleles[i] == 'd')) out[i] = 'A'; else if ((alleles[i] == 'H') || (alleles[i] == 'h')) out[i] = 'A'; else if ((alleles[i] == 'V') || (alleles[i] == 'v')) out[i] = 'A'; else out[i] = 'N'; } } return std::string(out.begin(), out.end()); } // Convert string to char* struct cstyle_str { const char* operator ()(const std::string& s) { return s.c_str(); } }; // Parse Delly vcf file template inline void vcfParse(TConfig const& c, bam_hdr_t* hd, std::vector& svs) { // Load bcf file htsFile* ifile = bcf_open(c.vcffile.string().c_str(), "r"); bcf_hdr_t* hdr = bcf_hdr_read(ifile); bcf1_t* rec = bcf_init(); // Parse genome if necessary faidx_t* fai = fai_load(c.genome.string().c_str()); char* seq = NULL; int32_t lastRefIndex = -1; // Parse bcf int32_t nsvend = 0; int32_t* svend = NULL; int32_t npos2 = 0; int32_t* pos2 = NULL; int32_t nsvlen = 0; int32_t* svlen = NULL; int32_t npe = 0; int32_t* pe = NULL; int32_t ninslen = 0; int32_t* inslen = NULL; int32_t nhomlen = 0; int32_t* homlen = NULL; int32_t nsr = 0; int32_t* sr = NULL; int32_t ncipos = 0; int32_t* cipos = NULL; int32_t nmapq = 0; int32_t* mapq = NULL; int32_t nct = 0; char* ct = NULL; int32_t nsrq = 0; float* srq = NULL; int32_t 
nsvt = 0; char* svt = NULL; int32_t nmethod = 0; char* method = NULL; int32_t ncons = 0; char* cons = NULL; int32_t nchr2 = 0; char* chr2 = NULL; uint16_t wimethod = 0; while (bcf_read(ifile, hdr, rec) == 0) { bcf_unpack(rec, BCF_UN_INFO); // Delly BCF file? if (!wimethod) { wimethod = 2; if (bcf_get_info_string(hdr, rec, "SVMETHOD", &method, &nmethod) > 0) { std::string mstr = std::string(method); if ((mstr.size() >= 10) && (mstr.substr(0, 10) == "EMBL.DELLY")) wimethod = 1; } } // Delly if (wimethod == 1) { // Fill SV record StructuralVariantRecord svRec; std::string chrName = bcf_hdr_id2name(hdr, rec->rid); int32_t tid = bam_name2id(hd, chrName.c_str()); svRec.chr = tid; svRec.svStart = rec->pos + 1; svRec.id = svs.size(); svRec.mapq = rec->qual; std::string refAllele = rec->d.allele[0]; std::string altAllele = rec->d.allele[1]; svRec.alleles = refAllele + "," + altAllele; // Parse SV type if ((bcf_get_info_string(hdr, rec, "SVTYPE", &svt, &nsvt) > 0) && (bcf_get_info_string(hdr, rec, "CT", &ct, &nct) > 0)) svRec.svt = _decodeOrientation(std::string(ct), std::string(svt)); else continue; // Parse INFO if (bcf_get_info_flag(hdr, rec, "PRECISE", 0, 0) > 0) svRec.precise=true; else svRec.precise = false; if (bcf_get_info_int32(hdr, rec, "PE", &pe, &npe) > 0) svRec.peSupport = *pe; else { if (svRec.precise) svRec.peSupport = 0; else svRec.peSupport = 2; } if (svRec.svt != 4) { if (bcf_get_info_int32(hdr, rec, "INSLEN", &inslen, &ninslen) > 0) svRec.insLen = *inslen; else svRec.insLen = 0; } else { // Insertions must have INFO/SVLEN if (bcf_get_info_int32(hdr, rec, "SVLEN", &svlen, &nsvlen) > 0) svRec.insLen = *svlen; else continue; } if (bcf_get_info_int32(hdr, rec, "HOMLEN", &homlen, &nhomlen) > 0) svRec.homLen = *homlen; else svRec.homLen = 0; if (bcf_get_info_int32(hdr, rec, "SR", &sr, &nsr) > 0) svRec.srSupport = *sr; else svRec.srSupport = 0; // SV end assignment svRec.chr2 = tid; svRec.svEnd = rec->pos + 2; if (svRec.svt < DELLY_SVT_TRANS) { // 
Intra-chromosomal SV if (bcf_get_info_int32(hdr, rec, "END", &svend, &nsvend) > 0) svRec.svEnd = *svend; } else { // Inter-chromosomal SV if (bcf_get_info_string(hdr, rec, "CHR2", &chr2, &nchr2) > 0) { std::string chr2Name = std::string(chr2); svRec.chr2 = bam_name2id(hd, chr2Name.c_str()); } if (bcf_get_info_int32(hdr, rec, "POS2", &pos2, &npos2) > 0) svRec.svEnd = *pos2; } if (bcf_get_info_string(hdr, rec, "CONSENSUS", &cons, &ncons) > 0) svRec.consensus = std::string(cons); else svRec.precise = false; if (bcf_get_info_int32(hdr, rec, "CIPOS", &cipos, &ncipos) > 0) { svRec.ciposlow = cipos[0]; svRec.ciposhigh = cipos[1]; } else { svRec.ciposlow = -50; svRec.ciposhigh = 50; } if (bcf_get_info_int32(hdr, rec, "CIEND", &cipos, &ncipos) > 0) { svRec.ciendlow = cipos[0]; svRec.ciendhigh = cipos[1]; } else { svRec.ciendlow = -50; svRec.ciendhigh = 50; } if (bcf_get_info_int32(hdr, rec, "MAPQ", &mapq, &nmapq) > 0) svRec.peMapQuality = (uint8_t) *mapq; else svRec.peMapQuality = 0; if (bcf_get_info_int32(hdr, rec, "SRMAPQ", &mapq, &nmapq) > 0) svRec.srMapQuality = (uint8_t) *mapq; else svRec.srMapQuality = 0; if (bcf_get_info_float(hdr, rec, "SRQ", &srq, &nsrq) > 0) svRec.srAlignQuality = (double) *srq; else svRec.srAlignQuality = 0; svs.push_back(svRec); } else if (wimethod == 2) { // Assume precise SV, only deletions supported and INFO:END is required!!! 
if (rec->n_allele == 2) { std::string refAllele = rec->d.allele[0]; std::string altAllele = rec->d.allele[1]; StructuralVariantRecord svRec; bool tagUse; bool insertion = false; if (altAllele == "") { svRec.svt = 2; tagUse = true; } else if (altAllele == "") { // No precise insertion sequence, cannot be genotyped by Delly continue; } else { if ((refAllele.size() > altAllele.size()) && (_isDNA(refAllele)) && (_isDNA(altAllele))) { svRec.svt = 2; tagUse = false; } else if ((altAllele.size() > refAllele.size()) && (_isDNA(refAllele)) && (_isDNA(altAllele))) { insertion = true; svRec.svt = 4; tagUse = false; } else continue; } if (tagUse) { if (bcf_get_info_int32(hdr, rec, "END", &svend, &nsvend) > 0) svRec.svEnd = *svend; else continue; } else { if (insertion) { svRec.svEnd = rec->pos + 2; int32_t diff = altAllele.size() - refAllele.size(); svRec.insLen = diff; } else { int32_t diff = refAllele.size() - altAllele.size(); svRec.svEnd = rec->pos + diff + 2; svRec.insLen = 0; } } std::string chrName = bcf_hdr_id2name(hdr, rec->rid); int32_t tid = bam_name2id(hd, chrName.c_str()); svRec.chr = tid; svRec.chr2 = tid; svRec.svStart = rec->pos + 1; svRec.id = svs.size(); svRec.alleles = refAllele + "," + altAllele; svRec.precise=true; svRec.peSupport = 0; svRec.homLen = 0; svRec.srSupport = 5; svRec.peMapQuality = 20; svRec.srAlignQuality = 1; svRec.ciposlow = -50; svRec.ciposhigh = 50; svRec.ciendlow = -50; svRec.ciendhigh = 50; // Lazy loading of reference sequence if ((seq == NULL) || (tid != lastRefIndex)) { if (seq != NULL) free(seq); int32_t seqlen = -1; seq = faidx_fetch_seq(fai, chrName.c_str(), 0, faidx_seq_len(fai, chrName.c_str()), &seqlen); lastRefIndex = tid; } // Build consensus sequence if ((seq != NULL) && ((svRec.svStart + 15 < svRec.svEnd) || (svRec.insLen >= 15))) { int32_t buffer = 75; if (tagUse) { int32_t prefix = 0; if (buffer < rec->pos) prefix = rec->pos - buffer; std::string pref = boost::to_upper_copy(std::string(seq + prefix, seq + rec->pos + 1)); 
int32_t suffix = svRec.svEnd + buffer; std::string suf = boost::to_upper_copy(std::string(seq + svRec.svEnd, seq + suffix)); svRec.consensus = pref + suf; } else { int32_t prefix = 0; if (buffer < rec->pos) prefix = rec->pos - buffer; std::string pref = boost::to_upper_copy(std::string(seq + prefix, seq + rec->pos)); int32_t suffix = svRec.svEnd + buffer; std::string suf = boost::to_upper_copy(std::string(seq + svRec.svEnd - 1, seq + suffix)); svRec.consensus = pref + altAllele + suf; } svs.push_back(svRec); } } } } // Clean-up free(svend); free(pos2); free(svlen); free(svt); free(method); free(pe); free(inslen); free(homlen); free(sr); free(cons); free(cipos); free(mapq); free(srq); free(ct); free(chr2); // Clean-up index if (seq != NULL) free(seq); fai_destroy(fai); // Close VCF bcf_hdr_destroy(hdr); bcf_close(ifile); bcf_destroy(rec); } template inline void vcfOutput(TConfig const& c, std::vector const& svs, TJunctionCountMap const& jctCountMap, TReadCountMap const& readCountMap, TCountMap const& spanCountMap) { // BoLog class BoLog bl; // Open one bam file header samFile* samfile = sam_open(c.files[0].string().c_str(), "r"); hts_set_fai_filename(samfile, c.genome.string().c_str()); bam_hdr_t* bamhd = sam_hdr_read(samfile); // Output all structural variants htsFile *fp = hts_open(c.outfile.string().c_str(), "wb"); bcf_hdr_t *hdr = bcf_hdr_init("w"); // Print vcf header boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); boost::gregorian::date today = now.date(); std::string datestr("##fileDate="); datestr += boost::gregorian::to_iso_string(today); bcf_hdr_append(hdr, datestr.c_str()); bcf_hdr_append(hdr, "##ALT="); bcf_hdr_append(hdr, "##ALT="); bcf_hdr_append(hdr, "##ALT="); bcf_hdr_append(hdr, "##ALT="); bcf_hdr_append(hdr, "##ALT="); bcf_hdr_append(hdr, "##FILTER="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); 
bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##INFO="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); if (c.isHaplotagged) { bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); } bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); if (c.isHaplotagged) { bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); bcf_hdr_append(hdr, "##FORMAT="); } // Add reference std::string refloc("##reference="); refloc += c.genome.string(); bcf_hdr_append(hdr, refloc.c_str()); for (int i = 0; in_targets; ++i) { std::string refname("##contig=target_name[i]) + ",length=" + boost::lexical_cast(bamhd->target_len[i]) + ">"; bcf_hdr_append(hdr, refname.c_str()); } // Add samples for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) bcf_hdr_add_sample(hdr, c.sampleName[file_c].c_str()); bcf_hdr_add_sample(hdr, NULL); if (bcf_hdr_write(fp, hdr) != 0) std::cerr << "Error: Failed to write BCF header!" 
<< std::endl; if (!svs.empty()) { // Genotype arrays int32_t *gts = (int*) malloc(bcf_hdr_nsamples(hdr) * 2 * sizeof(int)); float *gls = (float*) malloc(bcf_hdr_nsamples(hdr) * 3 * sizeof(float)); int32_t *rcl = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *rc = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *rcr = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *cnest = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *drcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *dvcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp1drcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp2drcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp1dvcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp2dvcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *rrcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *rvcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp1rrcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp2rrcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp1rvcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *hp2rvcount = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); int32_t *gqval = (int*) malloc(bcf_hdr_nsamples(hdr) * sizeof(int)); std::vector ftarr; ftarr.resize(bcf_hdr_nsamples(hdr)); // Iterate all structural variants typedef std::vector TSVs; now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Genotyping" << std::endl; boost::progress_display show_progress( svs.size() ); bcf1_t *rec = bcf_init(); for(typename TSVs::const_iterator svIter = svs.begin(); svIter!=svs.end(); ++svIter) { ++show_progress; if ((svIter->srSupport == 0) && (svIter->peSupport == 0)) continue; // Output main vcf fields int32_t tmpi = bcf_hdr_id2int(hdr, 
BCF_DT_ID, "PASS"); if (svIter->chr == svIter->chr2) { // Intra-chromosomal if (((svIter->peSupport < 3) || (svIter->peMapQuality < 20)) && ((svIter->srSupport < 3) || (svIter->srMapQuality < 20))) tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "LowQual"); } else { // Inter-chromosomal if (((svIter->peSupport < 5) || (svIter->peMapQuality < 20)) && ((svIter->srSupport < 5) || (svIter->srMapQuality < 20))) tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "LowQual"); } rec->rid = bcf_hdr_name2id(hdr, bamhd->target_name[svIter->chr]); int32_t svStartPos = svIter->svStart - 1; if (svStartPos < 1) svStartPos = 1; int32_t svEndPos = svIter->svEnd; if (svEndPos < 1) svEndPos = 1; if (svEndPos >= (int32_t) bamhd->target_len[svIter->chr2]) svEndPos = bamhd->target_len[svIter->chr2] - 1; rec->pos = svStartPos; std::string id(_addID(svIter->svt)); std::string padNumber = boost::lexical_cast(svIter->id); padNumber.insert(padNumber.begin(), 8 - padNumber.length(), '0'); id += padNumber; bcf_update_id(hdr, rec, id.c_str()); std::string alleles = _replaceIUPAC(svIter->alleles); bcf_update_alleles_str(hdr, rec, alleles.c_str()); bcf_update_filter(hdr, rec, &tmpi, 1); // Add INFO fields if (svIter->precise) bcf_update_info_flag(hdr, rec, "PRECISE", NULL, 1); else bcf_update_info_flag(hdr, rec, "IMPRECISE", NULL, 1); bcf_update_info_string(hdr, rec, "SVTYPE", _addID(svIter->svt).c_str()); std::string dellyVersion("EMBL.DELLYv"); dellyVersion += dellyVersionNumber; bcf_update_info_string(hdr,rec, "SVMETHOD", dellyVersion.c_str()); if (svIter->svt < DELLY_SVT_TRANS) { tmpi = svEndPos; bcf_update_info_int32(hdr, rec, "END", &tmpi, 1); } else { tmpi = svStartPos + 2; bcf_update_info_int32(hdr, rec, "END", &tmpi, 1); bcf_update_info_string(hdr,rec, "CHR2", bamhd->target_name[svIter->chr2]); tmpi = svEndPos; bcf_update_info_int32(hdr, rec, "POS2", &tmpi, 1); } if (svIter->svt == 4) { tmpi = svIter->insLen; bcf_update_info_int32(hdr, rec, "SVLEN", &tmpi, 1); } tmpi = svIter->peSupport; 
bcf_update_info_int32(hdr, rec, "PE", &tmpi, 1); tmpi = svIter->peMapQuality; bcf_update_info_int32(hdr, rec, "MAPQ", &tmpi, 1); bcf_update_info_string(hdr, rec, "CT", _addOrientation(svIter->svt).c_str()); int32_t ciend[2]; ciend[0] = svIter->ciendlow; ciend[1] = svIter->ciendhigh; int32_t cipos[2]; cipos[0] = svIter->ciposlow; cipos[1] = svIter->ciposhigh; bcf_update_info_int32(hdr, rec, "CIPOS", cipos, 2); bcf_update_info_int32(hdr, rec, "CIEND", ciend, 2); if (svIter->precise) { tmpi = svIter->srMapQuality; bcf_update_info_int32(hdr, rec, "SRMAPQ", &tmpi, 1); tmpi = svIter->insLen; bcf_update_info_int32(hdr, rec, "INSLEN", &tmpi, 1); tmpi = svIter->homLen; bcf_update_info_int32(hdr, rec, "HOMLEN", &tmpi, 1); tmpi = svIter->srSupport; bcf_update_info_int32(hdr, rec, "SR", &tmpi, 1); float tmpf = svIter->srAlignQuality; bcf_update_info_float(hdr, rec, "SRQ", &tmpf, 1); if (svIter->consensus.size()) { bcf_update_info_string(hdr, rec, "CONSENSUS", svIter->consensus.c_str()); tmpf = entropy(svIter->consensus); bcf_update_info_float(hdr, rec, "CE", &tmpf, 1); } } // Add genotype columns for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { // Counters rcl[file_c] = 0; rc[file_c] = 0; rcr[file_c] = 0; cnest[file_c] = 0; drcount[file_c] = 0; dvcount[file_c] = 0; if (c.isHaplotagged) { hp1drcount[file_c] = 0; hp2drcount[file_c] = 0; hp1dvcount[file_c] = 0; hp2dvcount[file_c] = 0; } rrcount[file_c] = 0; rvcount[file_c] = 0; if (c.isHaplotagged) { hp1rrcount[file_c] = 0; hp2rrcount[file_c] = 0; hp1rvcount[file_c] = 0; hp2rvcount[file_c] = 0; } drcount[file_c] = spanCountMap[file_c][svIter->id].ref.size(); dvcount[file_c] = spanCountMap[file_c][svIter->id].alt.size(); if (c.isHaplotagged) { hp1drcount[file_c] = spanCountMap[file_c][svIter->id].refh1; hp2drcount[file_c] = spanCountMap[file_c][svIter->id].refh2; hp1dvcount[file_c] = spanCountMap[file_c][svIter->id].alth1; hp2dvcount[file_c] = spanCountMap[file_c][svIter->id].alth2; } rrcount[file_c] = 
jctCountMap[file_c][svIter->id].ref.size(); rvcount[file_c] = jctCountMap[file_c][svIter->id].alt.size(); if (c.isHaplotagged) { hp1rrcount[file_c] = jctCountMap[file_c][svIter->id].refh1; hp2rrcount[file_c] = jctCountMap[file_c][svIter->id].refh2; hp1rvcount[file_c] = jctCountMap[file_c][svIter->id].alth1; hp2rvcount[file_c] = jctCountMap[file_c][svIter->id].alth2; } // Compute GLs if (svIter->precise) _computeGLs(bl, jctCountMap[file_c][svIter->id].ref, jctCountMap[file_c][svIter->id].alt, gls, gqval, gts, file_c); else _computeGLs(bl, spanCountMap[file_c][svIter->id].ref, spanCountMap[file_c][svIter->id].alt, gls, gqval, gts, file_c); // Compute RCs rcl[file_c] = readCountMap[file_c][svIter->id].leftRC; rc[file_c] = readCountMap[file_c][svIter->id].rc; rcr[file_c] = readCountMap[file_c][svIter->id].rightRC; cnest[file_c] = -1; if ((rcl[file_c] + rcr[file_c]) > 0) cnest[file_c] = boost::math::iround( 2.0 * (double) rc[file_c] / (double) (rcl[file_c] + rcr[file_c]) ); // Genotype filter if (gqval[file_c] < 15) ftarr[file_c] = "LowQual"; else ftarr[file_c] = "PASS"; } int32_t qvalout = svIter->mapq; if (qvalout < 0) qvalout = 0; if (qvalout > 10000) qvalout = 10000; rec->qual = qvalout; bcf_update_genotypes(hdr, rec, gts, bcf_hdr_nsamples(hdr) * 2); bcf_update_format_float(hdr, rec, "GL", gls, bcf_hdr_nsamples(hdr) * 3); bcf_update_format_int32(hdr, rec, "GQ", gqval, bcf_hdr_nsamples(hdr)); std::vector strp(bcf_hdr_nsamples(hdr)); std::transform(ftarr.begin(), ftarr.end(), strp.begin(), cstyle_str()); bcf_update_format_string(hdr, rec, "FT", &strp[0], bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "RCL", rcl, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "RC", rc, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "RCR", rcr, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "RDCN", cnest, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "DR", drcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "DV", 
dvcount, bcf_hdr_nsamples(hdr)); if (c.isHaplotagged) { bcf_update_format_int32(hdr, rec, "HP1DR", hp1drcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "HP2DR", hp2drcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "HP1DV", hp1dvcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "HP2DV", hp2dvcount, bcf_hdr_nsamples(hdr)); } bcf_update_format_int32(hdr, rec, "RR", rrcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "RV", rvcount, bcf_hdr_nsamples(hdr)); if (c.isHaplotagged) { bcf_update_format_int32(hdr, rec, "HP1RR", hp1rrcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "HP2RR", hp2rrcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "HP1RV", hp1rvcount, bcf_hdr_nsamples(hdr)); bcf_update_format_int32(hdr, rec, "HP2RV", hp2rvcount, bcf_hdr_nsamples(hdr)); } bcf_write1(fp, hdr, rec); bcf_clear1(rec); } bcf_destroy1(rec); // Clean-up free(gts); free(gls); free(rcl); free(rc); free(rcr); free(cnest); free(drcount); free(dvcount); free(hp1drcount); free(hp2drcount); free(hp1dvcount); free(hp2dvcount); free(rrcount); free(rvcount); free(hp1rrcount); free(hp2rrcount); free(hp1rvcount); free(hp2rvcount); free(gqval); } // Close BAM file bam_hdr_destroy(bamhd); sam_close(samfile); // Close VCF file bcf_hdr_destroy(hdr); hts_close(fp); // Build index bcf_index_build(c.outfile.string().c_str(), 14); } } #endif delly-0.9.1/src/msa.h000066400000000000000000000220241414764127700143670ustar00rootroot00000000000000#ifndef MSA_H #define MSA_H #include #include "needle.h" #include "gotoh.h" namespace torali { inline int32_t lcs(std::string const& s1, std::string const& s2) { uint32_t m = s1.size(); uint32_t n = s2.size(); int32_t prevdiag = 0; int32_t prevprevdiag = 0; std::vector onecol(n+1, 0); for(uint32_t i = 0; i <= m; ++i) { for(uint32_t j = 0; j <= n; ++j) { if ((i==0) || (j==0)) { onecol[j] = 0; prevprevdiag = 0; prevdiag = 0; } else { prevprevdiag = prevdiag; prevdiag = onecol[j]; 
if (s1[i-1] == s2[j-1]) onecol[j] = prevprevdiag + 1; else onecol[j] = (onecol[j] > onecol[j-1]) ? onecol[j] : onecol[j-1]; } } } return onecol[n]; } template inline void distanceMatrix(TSplitReadSet const& sps, TDistArray& d) { typedef typename TDistArray::index TDIndex; typename TSplitReadSet::const_iterator sIt1 = sps.begin(); for (TDIndex i = 0; sIt1 != sps.end(); ++sIt1, ++i) { typename TSplitReadSet::const_iterator sIt2 = sIt1; ++sIt2; for (TDIndex j = i+1; sIt2 != sps.end(); ++sIt2, ++j) { d[i][j] = (lcs(*sIt1, *sIt2) * 100) / std::min(sIt1->size(), sIt2->size()); } } } template inline int closestPair(TDistArray const& d, TDIndex num, TDIndex& dI, TDIndex& dJ) { int dMax = -1; for (TDIndex i = 0; idMax) { dMax = d[i][j]; dI = i; dJ = j; } } } return dMax; } template inline void updateDistanceMatrix(TDistArray& d, TPhylogeny const& p, TDIndex num, TDIndex& dI, TDIndex& dJ) { for (TDIndex i = 0; i < num; ++i) if (p[i][0] == -1) d[i][num] = (((dI < i) ? d[dI][i] : d[i][dI]) + ((dJ < i) ? d[dJ][i] : d[i][dJ])) / 2; for (TDIndex i = 0; i inline TDIndex upgma(TDistArray& d, TPhylogeny& p, TDIndex num) { TDIndex nn = num; for(;nn<2*num+1; ++nn) { TDIndex dI = 0; TDIndex dJ = 0; if (closestPair(d, nn, dI, dJ) == -1) break; p[dI][0] = nn; p[dJ][0] = nn; p[nn][1] = dI; p[nn][2] = dJ; updateDistanceMatrix(d, p, nn, dI, dJ); } return (nn > 0) ? 
(nn - 1) : 0; } template inline void palign(TConfig const& c, TSplitReadSet const& sps, TPhylogeny const& p, TDIndex root, TAlign& align) { typedef typename TAlign::index TAIndex; if ((p[root][1] == -1) && (p[root][2] == -1)) { typename TSplitReadSet::const_iterator sIt = sps.begin(); if (root) std::advance(sIt, root); align.resize(boost::extents[1][sIt->size()]); TAIndex ind = 0; for(typename std::string::const_iterator str = sIt->begin(); str != sIt->end(); ++str) align[0][ind++] = *str; } else { TAlign align1; palign(c, sps, p, p[root][1], align1); TAlign align2; palign(c, sps, p, p[root][2], align2); AlignConfig endFreeAlign; gotoh(align1, align2, align, endFreeAlign, c.aliscore); } } template inline void sprealign(TAlign& align) { typedef typename TAlign::index TAIndex; for(TAIndex i = 0; i gaps; for(TAIndex j = 0; j endFreeAlign; gotoh(align1, align2, align, endFreeAlign); } } template inline void consensus(TAlign const& align, std::string& gapped, std::string& cs) { typedef typename TAlign::index TAIndex; // Calculate coverage typedef boost::multi_array TFlag; TFlag fl; fl.resize(boost::extents[align.shape()[0]][align.shape()[1]]); typedef std::vector TCoverage; TCoverage cov; cov.resize(align.shape()[1], 0); for(TAIndex i = 0; i < (TAIndex) align.shape()[0]; ++i) { int start = 0; int end = -1; for(TAIndex j = 0; j < (TAIndex) align.shape()[1]; ++j) { fl[i][j] = false; if (align[i][j] != '-') end = j; else if (end == -1) start = j + 1; } for(TAIndex j = start; j<=end; ++j) { ++cov[j]; fl[i][j] = true; } } int covThreshold = 3; TAIndex j = 0; std::vector cons(align.shape()[1], '-'); for(typename TCoverage::const_iterator itCov = cov.begin(); itCov != cov.end(); ++itCov, ++j) { int32_t maxIdx = 4; // Leading/trailing gaps until min. 
coverage is reached if (*itCov >= covThreshold) { // Get consensus letter std::vector count(5, 0); // ACGT- for(TAIndex i = 0; i < (TAIndex) align.shape()[0]; ++i) { if (fl[i][j]) { if ((align[i][j] == 'A') || (align[i][j] == 'a')) ++count[0]; else if ((align[i][j] == 'C') || (align[i][j] == 'c')) ++count[1]; else if ((align[i][j] == 'G') || (align[i][j] == 'g')) ++count[2]; else if ((align[i][j] == 'T') || (align[i][j] == 't')) ++count[3]; else ++count[4]; } } maxIdx = 0; int32_t maxCount = count[0]; for(uint32_t i = 1; i<5; ++i) { if (count[i] > maxCount) { maxCount = count[i]; maxIdx = i; } } } switch (maxIdx) { case 0: cons[j] = 'A'; break; case 1: cons[j] = 'C'; break; case 2: cons[j] = 'G'; break; case 3: cons[j] = 'T'; break; default: break; } } gapped = std::string(cons.begin(), cons.end()); for(uint32_t i = 0; i inline void consensus(TAlign const& align, std::string& cs) { std::string gapped; consensus(align, gapped, cs); //std::cerr << "Consensus:" << std::endl; //std::cerr << gapped << std::endl; //std::cerr << cs << std::endl; } template inline int msa(TConfig const& c, TSplitReadSet const& sps, std::string& cs) { // Compute distance matrix typedef boost::multi_array TDistArray; typedef typename TDistArray::index TDIndex; TDIndex num = sps.size(); TDistArray d(boost::extents[2*num+1][2*num+1]); for (TDIndex i = 0; i<(2*num+1); ++i) for (TDIndex j = i+1; j<(2*num+1); ++j) d[i][j]=-1; distanceMatrix(sps, d); // UPGMA typedef boost::multi_array TPhylogeny; TPhylogeny p(boost::extents[2*num+1][3]); for(TDIndex i = 0; i<(2*num+1); ++i) for (TDIndex j = 0; j<3; ++j) p[i][j] = -1; TDIndex root = upgma(d, p, num); // Debug guide tree //std::cerr << "Phylogeny" << std::endl; //std::cerr << "#Sequences: " << sps.size() << std::endl; //std::cerr << "Root: " << root << std::endl; //std::cerr << "Node:Parent\tLeftChild\tRightChild" << std::endl; //for(TDIndex i = 0; i<(2*num+1); ++i) { //std::cerr << i << ':' << '\t'; //for (TDIndex j = 0; j<3; ++j) { //std::cerr << 
p[i][j] << '\t'; //} //std::cerr << std::endl; //} // Progressive Alignment typedef boost::multi_array TAlign; TAlign align; palign(c, sps, p, root, align); // Debug MSA //for(uint32_t i = 0; i inline void outputConsensus(bam_hdr_t* hdr, TStructuralVariant const& sv, std::string const& cons) { std::cerr << ">" << hdr->target_name[sv.chr] << ':' << sv.svStart << ',' << hdr->target_name[sv.chr2] << ':' << sv.svEnd << " SVT:" << sv.svt << " SR:" << sv.srSupport << " PE:" << sv.peSupport << std::endl; std::cerr << cons << std::endl; } } #endif delly-0.9.1/src/needle.h000066400000000000000000000324641414764127700150540ustar00rootroot00000000000000#ifndef NEEDLE_H #define NEEDLE_H #define BOOST_DISABLE_ASSERTS #include #include #include #include "align.h" namespace torali { inline int32_t longestHomology(std::string const& s1, std::string const& s2, int32_t scoreThreshold) { // DP Matrix typedef boost::multi_array TMatrix; int32_t m = s1.size(); int32_t n = s2.size(); TMatrix mat(boost::extents[m+1][n+1]); // Initialization int32_t k = std::abs(scoreThreshold); mat[0][0] = 0; for(int32_t col = 1; col <= k; ++col) mat[0][col] = mat[0][col-1] - 1; for(int32_t row = 1; row <= k; ++row) mat[row][0] = mat[row-1][0] - 1; // Edit distance for(int32_t row = 1; row <= m; ++row) { int32_t bestCol = scoreThreshold - 1; for(int32_t h = -k; h <= k; ++h) { int32_t col = row + h; if ((col >= 1) && (col <= n)) { mat[row][col] = mat[row-1][col-1] + (s1[row-1] == s2[col-1] ? 
0 : -1); if ((row - 1 - col >= -k) && (row - 1 - col <= k)) mat[row][col] = std::max(mat[row][col], mat[row-1][col] - 1); if ((row - col + 1 >= -k) && (row - col + 1 <= k)) mat[row][col] = std::max(mat[row][col], mat[row][col-1] - 1); if (mat[row][col] > bestCol) bestCol = mat[row][col]; } } if (bestCol < scoreThreshold) return row - 1; } return 0; } template inline bool longNeedle(std::string const& s1, std::string const& s2, TAlign& align, TAlignConfig const& ac, TScoreObject const& sc) { typedef typename TScoreObject::TValue TScoreValue; typedef typename TAlign::index TAIndex; // DP Matrix typedef boost::multi_array TMatrix; std::size_t m = s1.size(); std::size_t n = s2.size(); TMatrix mat(boost::extents[m+1][n+1]); // Initialization mat[0][0] = 0; for(std::size_t col = 1; col <= n; ++col) mat[0][col] = mat[0][col-1] + _horizontalGap(ac, 0, m, sc.ge); for(std::size_t row = 1; row <= m; ++row) mat[row][0] = mat[row-1][0] + _verticalGap(ac, 0, n, sc.ge); // Forward alignment for(std::size_t row = 1; row <= m; ++row) for(std::size_t col = 1; col <= n; ++col) mat[row][col] = std::max(std::max(mat[row-1][col-1] + (s1[row-1] == s2[col-1] ? sc.match : sc.mismatch), mat[row-1][col] + _verticalGap(ac, col, n, sc.ge)), mat[row][col-1] + _horizontalGap(ac, row, m, sc.ge)); // Reverse input sequences std::string sRev1 = s1; reverseComplement(sRev1); std::string sRev2 = s2; reverseComplement(sRev2); // Reverse alignment TMatrix rev(boost::extents[m+1][n+1]); rev[0][0] = 0; for(std::size_t col = 1; col <= n; ++col) rev[0][col] = rev[0][col-1] + _horizontalGap(ac, 0, m, sc.ge); for(std::size_t row = 1; row <= m; ++row) rev[row][0] = rev[row-1][0] + _verticalGap(ac, 0, n, sc.ge); for(std::size_t row = 1; row <= m; ++row) for(std::size_t col = 1; col <= n; ++col) rev[row][col] = std::max(std::max(rev[row-1][col-1] + (sRev1[row-1] == sRev2[col-1] ? 
sc.match : sc.mismatch), rev[row-1][col] + _verticalGap(ac, col, n, sc.ge)), rev[row][col-1] + _horizontalGap(ac, row, m, sc.ge)); if (mat[m][n] != rev[m][n]) { //std::cerr << "Warning: Alignment scores disagree!" << std::endl; return false; } else { // Find best join TMatrix bestMat(boost::extents[m+1][n+1]); for(std::size_t row = 0; row <= m; ++row) { bestMat[row][0] = mat[row][0]; for(std::size_t col = 1; col <= n; ++col) { if (mat[row][col] > bestMat[row][col-1]) bestMat[row][col] = mat[row][col]; else bestMat[row][col] = bestMat[row][col-1]; } } TMatrix bestRev(boost::extents[m+1][n+1]); for(std::size_t row = 0; row <= m; ++row) { bestRev[row][0] = rev[row][0]; for(std::size_t col = 1; col <= n; ++col) { if (rev[row][col] > bestRev[row][col-1]) bestRev[row][col] = rev[row][col]; else bestRev[row][col] = bestRev[row][col-1]; } } TScoreValue bestScore = mat[m][n]; std::size_t consLeft = 0; std::size_t refLeft = 0; for(std::size_t row = 0; row<=m; ++row) { for(std::size_t col = 0; col<=n; ++col) { if (bestMat[row][col]+bestRev[m-row][n-col] > bestScore) { bestScore=bestMat[row][col]+bestRev[m-row][n-col]; consLeft = row; refLeft = col; } } } std::size_t consRight = m - consLeft; std::size_t refRight = 0; // Find right bound for(std::size_t right = 0; right<=(n-refLeft); ++right) { if (mat[consLeft][refLeft] + rev[consRight][right] == bestScore) { refRight = right; } } // Debug best join /* TScoreValue bScore = mat[m][n]; std::size_t cLeft = 0; std::size_t cRight = 0; std::size_t rLeft = 0; std::size_t rRight = 0; for(std::size_t fwdcut = 0; fwdcut<=m; ++fwdcut) { std::size_t revcut = m - fwdcut; // Iterate all valid collinear alignments on the reference for(std::size_t left = 0; left<=n; ++left) { for(std::size_t right = 0; right<=(n-left); ++right) { if (mat[fwdcut][left] + rev[revcut][right] > bScore) { bScore = mat[fwdcut][left] + rev[revcut][right]; cLeft = fwdcut; cRight = revcut; rLeft = left; rRight = right; } } } } std::cerr << mat[m][n] << ',' << bScore 
<< ';' << s1.size() << ',' << s2.size() << ';' << cLeft << ',' << (m-cRight) << ',' << rLeft << ',' << (n-rRight) << std::endl; std::cerr << mat[m][n] << ',' << bestScore << ';' << s1.size() << ',' << s2.size() << ';' << consLeft << ',' << (m-consRight) << ',' << refLeft << ',' << (n-refRight) << std::endl; */ // Better split found? if (bestScore == mat[m][n]) return false; // No split found // Trace-back fwd std::size_t rr = consLeft; std::size_t cc = refLeft; typedef std::vector TTrace; TTrace trace; while ((rr>0) || (cc>0)) { if ((rr>0) && (mat[rr][cc] == mat[rr-1][cc] + _verticalGap(ac, cc, n, sc.ge))) { --rr; trace.push_back('v'); } else if ((cc>0) && (mat[rr][cc] == mat[rr][cc-1] + _horizontalGap(ac, rr, m, sc.ge))) { --cc; trace.push_back('h'); } else { --rr; --cc; trace.push_back('s'); } } TAlign fwd; _createAlignment(trace, s1.substr(0, consLeft), s2.substr(0, refLeft), fwd); // Trace-back rev rr = consRight; cc = refRight; typedef std::vector TTrace; TTrace rtrace; while ((rr>0) || (cc>0)) { if ((rr>0) && (rev[rr][cc] == rev[rr-1][cc] + _verticalGap(ac, cc, n, sc.ge))) { --rr; rtrace.push_back('v'); } else if ((cc>0) && (rev[rr][cc] == rev[rr][cc-1] + _horizontalGap(ac, rr, m, sc.ge))) { --cc; rtrace.push_back('h'); } else { --rr; --cc; rtrace.push_back('s'); } } TAlign rvs; _createAlignment(rtrace, sRev1.substr(0, consRight), sRev2.substr(0, refRight), rvs); // Concat alignments std::size_t gapref = (n-refRight) - refLeft; std::size_t alilen = fwd.shape()[1] + rvs.shape()[1] + gapref; align.resize(boost::extents[2][alilen]); TAIndex jEnd = rvs.shape()[1]; for(TAIndex i = 0; i < (TAIndex) fwd.shape()[0]; ++i) { TAIndex alicol = 0; for(;alicol < (TAIndex) fwd.shape()[1]; ++alicol) align[i][alicol]=fwd[i][alicol]; for(TAIndex j = refLeft; j < (TAIndex) (n-refRight); ++j, ++alicol) { if (i==0) align[i][alicol] = '-'; else align[i][alicol] = s2[j]; } for(TAIndex j = 0; j < (TAIndex) rvs.shape()[1]; ++j, ++alicol) { switch (rvs[i][jEnd-j-1]) { case 'A': 
align[i][alicol] = 'T'; break; case 'C': align[i][alicol] = 'G'; break; case 'G': align[i][alicol] = 'C'; break; case 'T': align[i][alicol] = 'A'; break; case 'N': align[i][alicol] = 'N'; break; case '-': align[i][alicol] = '-'; break; default: break; } } } } return true; } template inline int needleScore(TAlign1 const& a1, TAlign2 const& a2, TAlignConfig const& ac, TScoreObject const& sc) { typedef typename TScoreObject::TValue TScoreValue; // DP Matrix std::size_t m = _size(a1, 1); std::size_t n = _size(a2, 1); std::vector s(n+1, 0); TScoreValue prevsub = 0; // Create profile typedef boost::multi_array TProfile; TProfile p1; TProfile p2; if ((_size(a1, 0) != 1) || (_size(a2, 0) != 1)) { _createProfile(a1, p1); _createProfile(a2, p2); } // DP for(std::size_t row = 0; row <= m; ++row) { for(std::size_t col = 0; col <= n; ++col) { // Initialization if ((row == 0) && (col == 0)) { s[0] = 0; prevsub = 0; } else if (row == 0) { s[col] = _horizontalGap(ac, 0, m, col * sc.ge); } else if (col == 0) { s[0] = _verticalGap(ac, 0, n, row * sc.ge); if (row - 1 == 0) prevsub = 0; else prevsub = _verticalGap(ac, 0, n, (row - 1) * sc.ge); } else { // Recursion TScoreValue prevprevsub = prevsub; prevsub = s[col]; s[col] = std::max(std::max(prevprevsub + _score(a1, a2, p1, p2, row-1, col-1, sc), prevsub + _verticalGap(ac, col, n, sc.ge)), s[col-1] + _horizontalGap(ac, row, m, sc.ge)); } } } // Score return s[n]; } template inline int32_t needleBanded(std::string const& s1, std::string const& s2, TAlignConfig const& ac, TScoreObject const& sc) { typedef typename TScoreObject::TValue TScoreValue; // DP Matrix int32_t m = s1.size(); int32_t n = s2.size(); int32_t band = 100; int32_t lowBand = band; int32_t highBand = band; if (m < n) highBand += n - m; else lowBand += m - n; std::vector s(n+1, 0); TScoreValue prevsub = 0; TScoreValue prevprevsub = 0; // DP for(int32_t row = 0; row <= m; ++row) { for(int32_t col = std::max(0, row - lowBand); col <= std::min(n, row + highBand); ++col) { 
// Initialization if ((row == 0) && (col == 0)) { s[0] = 0; prevsub = 0; } else if (row == 0) { s[col] = _horizontalGap(ac, 0, m, col * sc.ge); } else if (col == 0) { s[0] = _verticalGap(ac, 0, n, row * sc.ge); if (row - 1 == 0) prevsub = 0; else prevsub = _verticalGap(ac, 0, n, (row - 1) * sc.ge); } else { // Recursion prevprevsub = prevsub; prevsub = s[col]; if (col == row - lowBand) { prevprevsub = s[col-1]; s[col - 1] = DELLY_OUTOFBAND; } else if (col == row + highBand) prevsub = DELLY_OUTOFBAND; s[col] = std::max(std::max(prevprevsub + (s1[row-1] == s2[col-1] ? sc.match : sc.mismatch), prevsub + _verticalGap(ac, col, n, sc.ge)), s[col-1] + _horizontalGap(ac, row, m, sc.ge)); } } } // Score return s[n]; } template inline int needle(TAlign1 const& a1, TAlign2 const& a2, TAlign& align, TAlignConfig const& ac, TScoreObject const& sc) { typedef typename TScoreObject::TValue TScoreValue; // DP Matrix std::size_t m = _size(a1, 1); std::size_t n = _size(a2, 1); std::vector s(n+1, 0); TScoreValue prevsub = 0; // Trace Matrix std::size_t mf = n+1; typedef boost::dynamic_bitset<> TBitSet; TBitSet bit3( (m+1) * (n+1), false); TBitSet bit4( (m+1) * (n+1), false); // Create profile typedef boost::multi_array TProfile; TProfile p1; TProfile p2; if ((_size(a1, 0) != 1) || (_size(a2, 0) != 1)) { _createProfile(a1, p1); _createProfile(a2, p2); } // DP for(std::size_t row = 0; row <= m; ++row) { for(std::size_t col = 0; col <= n; ++col) { // Initialization if ((row == 0) && (col == 0)) { s[0] = 0; prevsub = 0; } else if (row == 0) { s[col] = _horizontalGap(ac, 0, m, col * sc.ge); bit3[col] = true; } else if (col == 0) { s[0] = _verticalGap(ac, 0, n, row * sc.ge); if (row - 1 == 0) prevsub = 0; else prevsub = _verticalGap(ac, 0, n, (row - 1) * sc.ge); bit4[row * mf] = true; } else { // Recursion TScoreValue prevprevsub = prevsub; prevsub = s[col]; s[col] = std::max(std::max(prevprevsub + _score(a1, a2, p1, p2, row-1, col-1, sc), prevsub + _verticalGap(ac, col, n, sc.ge)), 
s[col-1] + _horizontalGap(ac, row, m, sc.ge)); // Trace if (s[col] == s[col-1] + _horizontalGap(ac, row, m, sc.ge)) bit3[row * mf + col] = true; else if (s[col] == prevsub + _verticalGap(ac, col, n, sc.ge)) bit4[row * mf + col] = true; } } } // Trace-back using pointers std::size_t row = m; std::size_t col = n; typedef std::vector TTrace; TTrace trace; while ((row>0) || (col>0)) { if (bit3[row * mf + col]) { --col; trace.push_back('h'); } else if (bit4[row * mf + col]) { --row; trace.push_back('v'); } else { --row; --col; trace.push_back('s'); } } // Create alignment _createAlignment(trace, a1, a2, align); // Score return s[n]; } template inline int needle(TAlign1 const& a1, TAlign2 const& a2, TAlign& align, TAlignConfig const& ac) { DnaScore dnasc; return needle(a1, a2, align, ac, dnasc); } template inline int needle(TAlign1 const& a1, TAlign2 const& a2, TAlign& align) { AlignConfig ac; return needle(a1, a2, align, ac); } } #endif delly-0.9.1/src/scan.h000066400000000000000000000251671414764127700145460ustar00rootroot00000000000000#ifndef SCAN_H #define SCAN_H #include #include #include #include #include #include #include #include #include "version.h" #include "util.h" namespace torali { struct ScanWindow { bool select; int32_t start; int32_t end; uint32_t cov; uint32_t uniqcov; ScanWindow() : select(false), start(0), end(0), cov(0), uniqcov(0) {} explicit ScanWindow(int32_t const s) : select(false), start(s), end(s+1), cov(0), uniqcov(0) {} }; template struct SortScanWindow : public std::binary_function { inline bool operator()(TScanWindow const& sw1, TScanWindow const& sw2) { return ((sw1.start inline int32_t _findScanWindow(TConfig const& c, uint32_t const reflen, std::vector const& binMap, int32_t const midPoint) { if (c.hasScanFile) { if (binMap[midPoint] == LAST_BIN) return -1; else return binMap[midPoint]; } else { uint32_t bin = midPoint / c.scanWindow; uint32_t allbins = reflen / c.scanWindow; if (bin >= allbins) return -1; else return bin; } return -1; } 
inline std::pair estCountBounds(std::vector< std::vector > const& scanCounts) { std::vector all; for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { for(uint32_t i = 0; i absdev; for(uint32_t i = 0; i inline void scan(TConfig const& c, LibraryInfo const& li, std::vector< std::vector >& scanCounts) { // Load bam file samFile* samfile = sam_open(c.bamFile.string().c_str(), "r"); hts_set_fai_filename(samfile, c.genome.string().c_str()); hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str()); bam_hdr_t* hdr = sam_hdr_read(samfile); // Pre-defined scanning windows if (c.hasScanFile) { typedef boost::icl::interval_set TChrIntervals; typedef std::vector TRegionsGenome; TRegionsGenome scanRegions; if (!_parseBedIntervals(c.scanFile.string(), c.hasScanFile, hdr, scanRegions)) { std::cerr << "Warning: Couldn't parse BED intervals. Do the chromosome names match?" << std::endl; } for (int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { for(typename TChrIntervals::iterator it = scanRegions[refIndex].begin(); it != scanRegions[refIndex].end(); ++it) { if (it->lower() < it->upper()) { if ((it->lower() >= 0) && (it->upper() < hdr->target_len[refIndex])) { ScanWindow sw; sw.start = it->lower(); sw.end = it->upper(); sw.select = true; scanCounts[refIndex].push_back(sw); } } } // Sort scan windows sort(scanCounts[refIndex].begin(), scanCounts[refIndex].end(), SortScanWindow()); } } // Parse BAM file boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Scanning Windows" << std::endl; boost::progress_display show_progress( hdr->n_targets ); // Iterate chromosomes uint64_t totalCov = 0; faidx_t* faiMap = fai_load(c.mapFile.string().c_str()); for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) { ++show_progress; if (chrNoData(c, refIndex, idx)) continue; // Exclude small chromosomes if ((hdr->target_len[refIndex] < c.minChrLen) 
&& (totalCov > 1000000)) continue; // Exclude sex chromosomes if ((std::string(hdr->target_name[refIndex]) == "chrX") || (std::string(hdr->target_name[refIndex]) == "chrY") || (std::string(hdr->target_name[refIndex]) == "X") || (std::string(hdr->target_name[refIndex]) == "Y")) continue; // Check presence in mappability map std::string tname(hdr->target_name[refIndex]); int32_t seqlen = faidx_seq_len(faiMap, tname.c_str()); if (seqlen == -1) continue; else seqlen = -1; char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen); // Get Mappability std::vector uniqContent(hdr->target_len[refIndex], 0); { // Mappability map typedef boost::dynamic_bitset<> TBitSet; TBitSet uniq(hdr->target_len[refIndex], false); for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) { if (seq[i] == 'C') uniq[i] = 1; } // Sum across fragments int32_t halfwin = (int32_t) (c.meanisize / 2); int32_t usum = 0; for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) { if (pos == halfwin) { for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) usum += uniq[i]; } else { usum -= uniq[pos - halfwin - 1]; usum += uniq[pos + halfwin]; } uniqContent[pos] = usum; } } // Bins on this chromosome std::vector binMap; if (!c.hasScanFile) { uint32_t allbins = hdr->target_len[refIndex] / c.scanWindow; scanCounts[refIndex].resize(allbins, ScanWindow()); for(uint32_t i = 0; i < allbins; ++i) { scanCounts[refIndex][i].start = i * c.scanWindow; scanCounts[refIndex][i].end = (i+1) * c.scanWindow; } } else { // Fill bin map binMap.resize(hdr->target_len[refIndex], LAST_BIN); if (scanCounts[refIndex].size() >= LAST_BIN) { std::cerr << "Warning: Too many scan windows on " << hdr->target_name[refIndex] << std::endl; } for(uint32_t bin = 0;((bin < scanCounts[refIndex].size()) && (bin < LAST_BIN)); ++bin) { for(int32_t k = scanCounts[refIndex][bin].start; k < scanCounts[refIndex][bin].end; ++k) binMap[k] = bin; } } // Mate map typedef 
boost::unordered_map TMateMap; TMateMap mateMap; // Count reads hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]); bam1_t* rec = bam_init1(); int32_t lastAlignedPos = 0; std::set lastAlignedPosReads; while (sam_itr_next(samfile, iter, rec) >= 0) { if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue; if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue; if (rec->core.qual < c.minQual) continue; if (getSVType(rec) != 2) continue; int32_t midPoint = rec->core.pos + halfAlignmentLength(rec); if (rec->core.flag & BAM_FPAIRED) { // Clean-up the read store for identical alignment positions if (rec->core.pos > lastAlignedPos) { lastAlignedPosReads.clear(); lastAlignedPos = rec->core.pos; } if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) { // First read lastAlignedPosReads.insert(hash_string(bam_get_qname(rec))); std::size_t hv = hash_pair(rec); mateMap[hv] = true; continue; } else { // Second read std::size_t hv = hash_pair_mate(rec); if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded mateMap[hv] = false; } // Insert size filter int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos; if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) midPoint = rec->core.mpos + (int32_t) (isize/2); else continue; } // Count fragment if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex])) { int32_t bin = _findScanWindow(c, hdr->target_len[refIndex], binMap, midPoint); if (bin >= 0) { ++scanCounts[refIndex][bin].cov; if (uniqContent[midPoint] >= c.fragmentUnique * c.meanisize) ++scanCounts[refIndex][bin].uniqcov; ++totalCov; } } } // Clean-up bam_destroy1(rec); hts_itr_destroy(iter); if (seq != NULL) free(seq); } // clean-up fai_destroy(faiMap); 
bam_hdr_destroy(hdr); hts_idx_destroy(idx); sam_close(samfile); } template inline void selectWindows(TConfig const& c, std::vector< std::vector >& scanCounts) { if (c.noScanWindowSelection) { // Select all windows for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { for(uint32_t i = 0; i 0) uniqratio = (double) scanCounts[refIndex][i].uniqcov / scanCounts[refIndex][i].cov; if (uniqratio > c.uniqueToTotalCovRatio) scanCounts[refIndex][i].select = true; else scanCounts[refIndex][i].select = false; } } // Normalize user-defined scan windows to same length (10,000bp) if (c.hasScanFile) { for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { for(uint32_t i = 0; i TCountBounds; TCountBounds cb = estCountBounds(scanCounts); // Select CN2 windows for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) { for(uint32_t i = 0; i cb.first) && (scanCounts[refIndex][i].cov < cb.second)) scanCounts[refIndex][i].select = true; else scanCounts[refIndex][i].select = false; } } } } } } #endif delly-0.9.1/src/shortpe.h000066400000000000000000000570051414764127700153020ustar00rootroot00000000000000#ifndef SHORTPE_H #define SHORTPE_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "version.h" #include "util.h" #include "bolog.h" #include "tags.h" #include "coverage.h" #include "msa.h" #include "split.h" #include "junction.h" #include "cluster.h" #include #include #include #include #include namespace torali { template inline void assembleSplitReads(TConfig const& c, TValidRegion const& validRegions, TSRStore const& srStore, std::vector& svs) { typedef typename TValidRegion::value_type TChrIntervals; typedef typename TSRStore::value_type TPosReadSV; // Open file handles typedef std::vector TSamFile; typedef std::vector TIndex; TSamFile samfile(c.files.size()); TIndex 
idx(c.files.size()); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r"); hts_set_fai_filename(samfile[file_c], c.genome.string().c_str()); idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str()); } bam_hdr_t* hdr = sam_hdr_read(samfile[0]); // Reads per SV typedef std::set TSequences; typedef std::vector TSVSequences; TSVSequences traStore(svs.size(), TSequences()); uint32_t maxReadPerSV = 20; typedef std::vector TQualities; typedef std::vector TQualVectors; TQualVectors traQualStore(svs.size(), TQualities()); // Parse BAM boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Split-read assembly" << std::endl; boost::progress_display show_progress( 2 * hdr->n_targets ); faidx_t* fai = fai_load(c.genome.string().c_str()); for(int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { ++show_progress; if (validRegions[refIndex].empty()) continue; if (srStore[refIndex].empty()) continue; // Load sequence int32_t seqlen = -1; std::string tname(hdr->target_name[refIndex]); char* seq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr->target_len[refIndex], &seqlen); // Collect all split-read pos typedef boost::dynamic_bitset<> TBitSet; TBitSet hits(hdr->target_len[refIndex]); for(typename TPosReadSV::const_iterator it = srStore[refIndex].begin(); it != srStore[refIndex].end(); ++it) hits[it->first.first] = 1; // Sequences TSVSequences seqStore(svs.size(), TSequences()); TQualVectors qualStore(svs.size(), TQualities()); // Collect reads from all samples for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { // Read alignments for(typename TChrIntervals::const_iterator vRIt = validRegions[refIndex].begin(); vRIt != validRegions[refIndex].end(); ++vRIt) { hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, vRIt->lower(), vRIt->upper()); bam1_t* rec = bam_init1(); 
while (sam_itr_next(samfile[file_c], iter, rec) >= 0) { if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) continue; if ((rec->core.qual < c.minMapQual) || (rec->core.tid<0)) continue; if (!hits[rec->core.pos]) continue; // Valid split-read std::size_t seed = hash_string(bam_get_qname(rec)); typename TPosReadSV::const_iterator it = srStore[refIndex].find(std::make_pair(rec->core.pos, seed)); if (it != srStore[refIndex].end()) { int32_t svid = it->second; // Get the sequence if (svid == (int32_t) svs[svid].id) { // Should be always true std::string sequence; sequence.resize(rec->core.l_qseq); uint8_t* seqptr = bam_get_seq(rec); for (int i = 0; i < rec->core.l_qseq; ++i) sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)]; // Adjust orientation bool bpPoint = false; if (_translocation(svs[svid].svt)) { if (rec->core.tid == svs[svid].chr2) bpPoint = true; } else { // Only relevant for inversions if (svs[svid].svt == 0) { if (rec->core.pos + 25 > svs[svid].svStart) bpPoint = true; else bpPoint = false; } else if (svs[svid].svt == 1) { if (rec->core.pos + 25 > svs[svid].svEnd) bpPoint = true; else bpPoint = false; } } _adjustOrientation(sequence, bpPoint, svs[svid].svt); // At most n split-reads if (seqStore[svid].size() < maxReadPerSV) { bool insertSuccess = false; if (_translocation(svs[svid].svt)) insertSuccess = traStore[svid].insert(sequence).second; else insertSuccess = seqStore[svid].insert(sequence).second; // Store qualities if (insertSuccess) { if (_translocation(svs[svid].svt)) traQualStore[svid].push_back(rec->core.qual); else qualStore[svid].push_back(rec->core.qual); } } } } } bam_destroy1(rec); hts_itr_destroy(iter); } } // Process all SVs on this chromosome for(uint32_t svid = 0; svid < seqStore.size(); ++svid) { if (_translocation(svs[svid].svt)) continue; if (svs[svid].chr != refIndex) continue; // MSA bool msaSuccess = false; if (seqStore[svid].size() > 1) { msa(c, seqStore[svid], svs[svid].consensus); 
if (alignConsensus(c, hdr, seq, NULL, svs[svid])) msaSuccess = true; } if (!msaSuccess) { svs[svid].consensus = ""; svs[svid].srSupport = 0; svs[svid].srAlignQuality = 0; } else { // SR support and qualities std::sort(qualStore[svid].begin(), qualStore[svid].end()); svs[svid].mapq = 0; for(uint32_t i = 0; i < qualStore[svid].size(); ++i) svs[svid].mapq += qualStore[svid][i]; svs[svid].srSupport = seqStore[svid].size(); svs[svid].srMapQuality = qualStore[svid][qualStore[svid].size()/2]; } } // Clean-up if (seq != NULL) free(seq); } // Process translocations for(int32_t refIndex2 = 0; refIndex2 < hdr->n_targets; ++refIndex2) { ++show_progress; if (validRegions[refIndex2].empty()) continue; char* sndSeq = NULL; for(int32_t refIndex = refIndex2 + 1; refIndex < hdr->n_targets; ++refIndex) { if (validRegions[refIndex].empty()) continue; char* seq = NULL; // Iterate SVs for(uint32_t svid = 0; svid < traStore.size(); ++svid) { if (!_translocation(svs[svid].svt)) continue; if ((svs[svid].chr != refIndex) || (svs[svid].chr2 != refIndex2)) continue; bool msaSuccess = false; if (traStore[svid].size() > 1) { // Lazy loading of references if (seq == NULL) { int32_t seqlen = -1; std::string tname(hdr->target_name[refIndex]); seq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr->target_len[refIndex], &seqlen); } if (sndSeq == NULL) { int32_t seqlen = -1; std::string tname(hdr->target_name[refIndex2]); sndSeq = faidx_fetch_seq(fai, tname.c_str(), 0, hdr->target_len[refIndex2], &seqlen); } msa(c, traStore[svid], svs[svid].consensus); if (alignConsensus(c, hdr, seq, sndSeq, svs[svid])) msaSuccess = true; } if (!msaSuccess) { svs[svid].consensus = ""; svs[svid].srSupport = 0; svs[svid].srAlignQuality = 0; } else { // SR support and qualities std::sort(traQualStore[svid].begin(), traQualStore[svid].end()); svs[svid].mapq = 0; for(uint32_t i = 0; i < traQualStore[svid].size(); ++i) svs[svid].mapq += traQualStore[svid][i]; svs[svid].srSupport = traStore[svid].size(); svs[svid].srMapQuality = 
traQualStore[svid][traQualStore[svid].size()/2]; } } if (seq != NULL) free(seq); } if (sndSeq != NULL) free(sndSeq); } // Clean-up fai_destroy(fai); bam_hdr_destroy(hdr); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { hts_idx_destroy(idx[file_c]); sam_close(samfile[file_c]); } } template inline void scanPEandSR(TConfig const& c, TValidRegion const& validRegions, std::vector& svs, std::vector& srSVs, TSRStore& srStore, TSampleLib& sampleLib) { typedef typename TValidRegion::value_type TChrIntervals; // Open file handles typedef std::vector TSamFile; typedef std::vector TIndex; TSamFile samfile(c.files.size()); TIndex idx(c.files.size()); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r"); hts_set_fai_filename(samfile[file_c], c.genome.string().c_str()); idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str()); } bam_hdr_t* hdr = sam_hdr_read(samfile[0]); // Split-read records typedef std::vector TSRBamRecord; typedef std::vector TSvtSRBamRecord; TSvtSRBamRecord srBR(2 * DELLY_SVT_TRANS, TSRBamRecord()); // Create bam alignment record vector typedef std::vector TBamRecord; typedef std::vector TSvtBamRecord; TSvtBamRecord bamRecord(2 * DELLY_SVT_TRANS, TBamRecord()); // Parse genome, process chromosome by chromosome boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Paired-end and split-read scanning" << std::endl; boost::progress_display show_progress( c.files.size() * hdr->n_targets ); // Iterate all samples #pragma omp parallel for default(shared) for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { // Inter-chromosomal mate map and alignment length typedef std::pair TQualLen; typedef boost::unordered_map TMateMap; std::vector matetra(c.files.size()); // Split-read junctions typedef std::vector TJunctionVector; typedef std::map 
TReadBp; TReadBp readBp; // Iterate all chromosomes for that sample for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) { ++show_progress; // Any data? if (validRegions[refIndex].empty()) continue; bool nodata = true; std::string suffix("cram"); std::string str(c.files[file_c].string()); if ((str.size() >= suffix.size()) && (str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0)) nodata = false; uint64_t mapped = 0; uint64_t unmapped = 0; hts_idx_get_stat(idx[file_c], refIndex, &mapped, &unmapped); if (mapped) nodata = false; if (nodata) continue; // Intra-chromosomal mate map and alignment length TMateMap mateMap; // Read alignments for(typename TChrIntervals::const_iterator vRIt = validRegions[refIndex].begin(); vRIt != validRegions[refIndex].end(); ++vRIt) { hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, vRIt->lower(), vRIt->upper()); bam1_t* rec = bam_init1(); int32_t lastAlignedPos = 0; std::set lastAlignedPosReads; while (sam_itr_next(samfile[file_c], iter, rec) >= 0) { if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | BAM_FUNMAP)) continue; if ((rec->core.qual < c.minMapQual) || (rec->core.tid<0)) continue; unsigned seed = hash_string(bam_get_qname(rec)); // SV detection using single-end read uint32_t rp = rec->core.pos; // reference pointer uint32_t sp = 0; // sequence pointer // Parse the CIGAR uint32_t* cigar = bam_get_cigar(rec); for (std::size_t i = 0; i < rec->core.n_cigar; ++i) { if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF)) { sp += bam_cigar_oplen(cigar[i]); rp += bam_cigar_oplen(cigar[i]); } else if (bam_cigar_op(cigar[i]) == BAM_CDEL) { if (bam_cigar_oplen(cigar[i]) > c.minRefSep) _insertJunction(readBp, seed, rec, rp, sp, false); rp += bam_cigar_oplen(cigar[i]); if (bam_cigar_oplen(cigar[i]) > c.minRefSep) _insertJunction(readBp, seed, rec, rp, sp, true); } else if (bam_cigar_op(cigar[i]) == BAM_CINS) { if 
(bam_cigar_oplen(cigar[i]) > c.minRefSep) _insertJunction(readBp, seed, rec, rp, sp, false); sp += bam_cigar_oplen(cigar[i]); if (bam_cigar_oplen(cigar[i]) > c.minRefSep) _insertJunction(readBp, seed, rec, rp, sp, true); } else if ((bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) || (bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP)) { int32_t finalsp = sp; bool scleft = false; if (sp == 0) { finalsp += bam_cigar_oplen(cigar[i]); // Leading soft-clip / hard-clip scleft = true; } sp += bam_cigar_oplen(cigar[i]); if (bam_cigar_oplen(cigar[i]) > c.minClip) _insertJunction(readBp, seed, rec, rp, finalsp, scleft); } else if (bam_cigar_op(cigar[i]) == BAM_CREF_SKIP) { rp += bam_cigar_oplen(cigar[i]); } else { std::cerr << "Warning: Unknown Cigar operation!" << std::endl; } } // Paired-end clustering if (rec->core.flag & BAM_FPAIRED) { // Single-end library if (sampleLib[file_c].median == 0) continue; // Single-end library // Secondary/supplementary alignments, mate unmapped or blacklisted chr if (rec->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) continue; if ((rec->core.mtid<0) || (rec->core.flag & BAM_FMUNMAP)) continue; if (validRegions[rec->core.mtid].empty()) continue; if ((_translocation(rec)) && (rec->core.qual < c.minTraQual)) continue; // SV type int32_t svt = _isizeMappingPos(rec, sampleLib[file_c].maxISizeCutoff); if (svt == -1) continue; if ((c.svtcmd) && (c.svtset.find(svt) == c.svtset.end())) continue; // Check library-specific insert size for deletions if ((svt == 2) && (sampleLib[file_c].maxISizeCutoff > std::abs(rec->core.isize))) continue; // Clean-up the read store for identical alignment positions if (rec->core.pos > lastAlignedPos) { lastAlignedPosReads.clear(); lastAlignedPos = rec->core.pos; } // Get or store the mapping quality for the partner if (_firstPairObs(rec, lastAlignedPosReads)) { // First read lastAlignedPosReads.insert(seed); std::size_t hv = hash_pair(rec); if (_translocation(svt)) matetra[file_c][hv]= std::make_pair((uint8_t) rec->core.qual, 
alignmentLength(rec)); else mateMap[hv]= std::make_pair((uint8_t) rec->core.qual, alignmentLength(rec)); } else { // Second read std::size_t hv = hash_pair_mate(rec); int32_t alenmate = 0; uint8_t pairQuality = 0; if (_translocation(svt)) { // Inter-chromosomal if ((matetra[file_c].find(hv) == matetra[file_c].end()) || (!matetra[file_c][hv].first)) continue; // Mate discarded TQualLen p = matetra[file_c][hv]; pairQuality = std::min((uint8_t) p.first, (uint8_t) rec->core.qual); alenmate = p.second; matetra[file_c][hv].first = 0; } else { // Intra-chromosomal if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv].first)) continue; // Mate discarded TQualLen p = mateMap[hv]; pairQuality = std::min((uint8_t) p.first, (uint8_t) rec->core.qual); alenmate = p.second; mateMap[hv].first = 0; } #pragma omp critical { bamRecord[svt].push_back(BamAlignRecord(rec, pairQuality, alignmentLength(rec), alenmate, sampleLib[file_c].median, sampleLib[file_c].mad, sampleLib[file_c].maxNormalISize)); } ++sampleLib[file_c].abnormal_pairs; } } } bam_destroy1(rec); hts_itr_destroy(iter); } } // Process all junctions for this BAM file for(typename TReadBp::iterator it = readBp.begin(); it != readBp.end(); ++it) { std::sort(it->second.begin(), it->second.end(), SortJunction()); } // Collect split-read SVs #pragma omp critical { if ((!c.svtcmd) || (c.svtset.find(2) != c.svtset.end())) selectDeletions(c, readBp, srBR); if ((!c.svtcmd) || (c.svtset.find(3) != c.svtset.end())) selectDuplications(c, readBp, srBR); if ((!c.svtcmd) || (c.svtset.find(0) != c.svtset.end()) || (c.svtset.find(1) != c.svtset.end())) selectInversions(c, readBp, srBR); if ((!c.svtcmd) || (c.svtset.find(4) != c.svtset.end())) selectInsertions(c, readBp, srBR); if ((!c.svtcmd) || (c.svtset.find(DELLY_SVT_TRANS) != c.svtset.end()) || (c.svtset.find(DELLY_SVT_TRANS + 1) != c.svtset.end()) || (c.svtset.find(DELLY_SVT_TRANS + 2) != c.svtset.end()) || (c.svtset.find(DELLY_SVT_TRANS + 3) != c.svtset.end())) 
selectTranslocations(c, readBp, srBR); } } // Debug abnormal paired-ends and split-reads //outputSRBamRecords(c, srBR); // Cluster split-read records now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Split-read clustering" << std::endl; boost::progress_display spSR( srBR.size() ); for(uint32_t svt = 0; svt < srBR.size(); ++svt) { ++spSR; if ((c.svtcmd) && (c.svtset.find(svt) == c.svtset.end())) continue; if (srBR[svt].empty()) continue; // Sort std::sort(srBR[svt].begin(), srBR[svt].end(), SortSRBamRecord()); // Cluster cluster(c, srBR[svt], srSVs, c.maxReadSep, svt); // Debug SR SVs //outputStructuralVariants(c, srSVs, svt); } // Cluster paired-end records now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Paired-end clustering" << std::endl; boost::progress_display spPE( bamRecord.size() ); // Maximum variability in insert size int32_t varisize = getVariability(c, sampleLib); for(int32_t svt = 0; svt < (int32_t) bamRecord.size(); ++svt) { ++spPE; if ((c.svtcmd) && (c.svtset.find(svt) == c.svtset.end())) continue; if (bamRecord[svt].empty()) continue; // Sort BAM records according to position std::sort(bamRecord[svt].begin(), bamRecord[svt].end(), SortBamRecords()); // Cluster cluster(c, bamRecord[svt], svs, varisize, svt); } // Track split-reads for(uint32_t svt = 0; svt < srBR.size(); ++svt) { for(uint32_t i = 0; i < srBR[svt].size(); ++i) { // Read assigned? 
if ((srBR[svt][i].svid != -1) && (srBR[svt][i].rstart != -1)) { if (srBR[svt][i].rstart < (int32_t) hdr->target_len[srBR[svt][i].chr]) srStore[srBR[svt][i].chr].insert(std::make_pair(std::make_pair(srBR[svt][i].rstart, srBR[svt][i].id), srBR[svt][i].svid)); if (srBR[svt][i].chr != srBR[svt][i].chr2) { // Unclear which chr was primary alignment so insert both if and only if rstart < reference length if (srBR[svt][i].rstart < (int32_t) hdr->target_len[srBR[svt][i].chr2]) srStore[srBR[svt][i].chr2].insert(std::make_pair(std::make_pair(srBR[svt][i].rstart, srBR[svt][i].id), srBR[svt][i].svid)); } } } } // Clean-up bam_hdr_destroy(hdr); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { hts_idx_destroy(idx[file_c]); sam_close(samfile[file_c]); } } inline void mergeSort(std::vector& pe, std::vector& sr) { typedef typename std::vector TVariants; // Sort PE records for look-up sort(pe.begin(), pe.end(), SortSVs()); // Sort SR records for look-up sort(sr.begin(), sr.end(), SortSVs()); // Augment PE SVs and append missing SR SVs for(int32_t svt = 0; svt < 10; ++svt) { for(int32_t i = 0; i < (int32_t) sr.size(); ++i) { if (sr[i].svt != svt) continue; if ((sr[i].srSupport == 0) || (sr[i].srAlignQuality == 0)) continue; // SR assembly failed // Precise duplicates int32_t searchWindow = 500; bool svExists = false; typename TVariants::iterator itOther = std::lower_bound(pe.begin(), pe.end(), StructuralVariantRecord(sr[i].chr, std::max(0, sr[i].svStart - searchWindow), sr[i].svEnd), SortSVs()); for(; ((itOther != pe.end()) && (std::abs(itOther->svStart - sr[i].svStart) < searchWindow)); ++itOther) { if ((itOther->svt != svt) || (itOther->precise)) continue; if ((sr[i].chr != itOther->chr) || (sr[i].chr2 != itOther->chr2)) continue; // Mismatching chr // Breakpoints within PE confidence interval? 
if ((itOther->svStart + itOther->ciposlow < sr[i].svStart) && (sr[i].svStart < itOther->svStart + itOther->ciposhigh)) { if ((itOther->svEnd + itOther->ciendlow < sr[i].svEnd) && (sr[i].svEnd < itOther->svEnd + itOther->ciendhigh)) { svExists = true; // Augment PE record itOther->svStart = sr[i].svStart; itOther->svEnd = sr[i].svEnd; itOther->ciposlow = sr[i].ciposlow; itOther->ciposhigh = sr[i].ciposhigh; itOther->ciendlow = sr[i].ciendlow; itOther->ciendhigh = sr[i].ciendhigh; itOther->srMapQuality = sr[i].srMapQuality; itOther->srSupport = sr[i].srSupport; itOther->insLen = sr[i].insLen; itOther->homLen = sr[i].homLen; itOther->srAlignQuality = sr[i].srAlignQuality; itOther->precise = true; itOther->consensus = sr[i].consensus; itOther->mapq += sr[i].mapq; } } } // SR only SV if (!svExists) { // Make sure there is no PRECISE duplicate int32_t precSearchWindow = 10; bool preciseDuplicate = false; for(int32_t j = i + 1; j < (int32_t) sr.size(); ++j) { if (std::abs(sr[i].svStart - sr[j].svStart) > precSearchWindow) break; if (sr[i].svt != sr[j].svt) continue; // Mismatching SV types if ((sr[i].chr != sr[j].chr) || (sr[i].chr2 != sr[j].chr2)) continue; // Mismatching chr // Breakpoints within PE confidence interval? if ((sr[j].svStart + sr[j].ciposlow <= sr[i].svStart) && (sr[i].svStart <= sr[j].svStart + sr[j].ciposhigh)) { if ((sr[j].svEnd + sr[j].ciendlow <= sr[i].svEnd) && (sr[i].svEnd <= sr[j].svEnd + sr[j].ciendhigh)) { // Duplicate, keep better call if ((sr[i].srSupport < sr[j].srSupport) || ((i < j) && (sr[i].srSupport == sr[j].srSupport))) preciseDuplicate = true; } } } for(int32_t j = i - 1; j>=0; --j) { if (std::abs(sr[i].svStart - sr[j].svStart) > precSearchWindow) break; if (sr[i].svt != sr[j].svt) continue; // Mismatching SV types if ((sr[i].chr != sr[j].chr) || (sr[i].chr2 != sr[j].chr2)) continue; // Mismatching chr // Breakpoints within PE confidence interval? 
if ((sr[j].svStart + sr[j].ciposlow < sr[i].svStart) && (sr[i].svStart < sr[j].svStart + sr[j].ciposhigh)) { if ((sr[j].svEnd + sr[j].ciendlow < sr[i].svEnd) && (sr[i].svEnd < sr[j].svEnd + sr[j].ciendhigh)) { // Duplicate, keep better call if ((sr[i].srSupport < sr[j].srSupport) || ((i < j) && (sr[i].srSupport == sr[j].srSupport))) preciseDuplicate = true; } } } if (!preciseDuplicate) { pe.push_back(sr[i]); sort(pe.begin(), pe.end(), SortSVs()); } } } } } } #endif delly-0.9.1/src/split.h000066400000000000000000000435441414764127700147540ustar00rootroot00000000000000#ifndef SPLIT_H #define SPLIT_H #include #include "gotoh.h" #include "needle.h" namespace torali { struct AlignDescriptor { int32_t cStart; int32_t cEnd; int32_t rStart; int32_t rEnd; int32_t homLeft; int32_t homRight; float percId; AlignDescriptor() : cStart(0), cEnd(0), rStart(0), rEnd(0), homLeft(0), homRight(0), percId(0) {} }; template inline void _adjustOrientation(std::string& sequence, TBPoint bpPoint, int32_t const svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); if (((ct==0) && (bpPoint)) || ((ct==1) && (!bpPoint))) reverseComplement(sequence); } else { if (svt == 0) { if (bpPoint) reverseComplement(sequence); } else if (svt == 1) { if (!bpPoint) reverseComplement(sequence); } } } inline bool _largeClipFraction(int32_t const clipSize, int32_t const qlen, int32_t const svt) { if (svt == 2) return (((double) clipSize / (double) qlen) > 0.5); else return false; } inline bool _validSoftClip(bam1_t* rec, int32_t& clipSize, int32_t& splitPoint, bool& leadingSC, unsigned short qualCut, int32_t const svt) { // Check read-length if (rec->core.l_qseq < 35) return false; // Check for single soft-clip unsigned int numSoftClip = 0; uint32_t* cigar = bam_get_cigar(rec); for (unsigned int i = 0; i < rec->core.n_cigar; ++i) { if (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) { ++numSoftClip; clipSize = bam_cigar_oplen(cigar[i]); } } if (numSoftClip != 1) return false; // Check clip 
fraction if (_largeClipFraction(clipSize, rec->core.l_qseq, svt)) return false; // Get quality vector typedef std::vector TQuality; TQuality quality; quality.resize(rec->core.l_qseq); uint8_t* qualptr = bam_get_qual(rec); for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i]; // Get soft-clips unsigned int alen = 0; unsigned int lastIns = 0; unsigned int meanQuality = 0; for (unsigned int i = 0; i < rec->core.n_cigar; ++i) { if (bam_cigar_op(cigar[i]) == BAM_CMATCH) { alen += bam_cigar_oplen(cigar[i]) + lastIns; lastIns = 0; } else if (bam_cigar_op(cigar[i]) == BAM_CINS) { lastIns = bam_cigar_oplen(cigar[i]); // Only add if followed by 'M' } else if (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) { if (!alen) leadingSC = true; else leadingSC = false; splitPoint = rec->core.pos + alen; unsigned int qualSum = 0; for(unsigned int i = alen; i < (alen+clipSize); ++i) qualSum += quality[i]; meanQuality = qualSum / clipSize; } } //std::cerr << clipSize << ',' << meanQuality << ',' << splitPoint << std::endl; return (meanQuality >= (unsigned int) qualCut); } template inline bool _validSCOrientation(TBPoint bpPoint, bool leadingSC, int32_t const svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); if (ct == 0) return (!leadingSC); else if (ct == 1) return leadingSC; else if (ct == 2) { if (((!bpPoint) && (!leadingSC)) || ((bpPoint) && (leadingSC))) return true; else return false; } else if (ct == 3) { if (((!bpPoint) && (leadingSC)) || ((bpPoint) && (!leadingSC))) return true; else return false; } else return false; } else { if (svt == 0) return (!leadingSC); else if (svt == 1) return leadingSC; else if (svt == 2) { if (((!bpPoint) && (!leadingSC)) || ((bpPoint) && (leadingSC))) return true; else return false; } else if (svt == 3) { if (((!bpPoint) && (leadingSC)) || ((bpPoint) && (!leadingSC))) return true; else return false; } else if (svt == 4) { if (((!bpPoint) && (!leadingSC)) || ((bpPoint) && (leadingSC))) return true; else return false; } 
} return false; } // Deletions template inline std::string _getSVRef(TSeq const* const ref, TSVRecord const& svRec, TRef const refIndex, int32_t const svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); if (svRec.chr==refIndex) { if ((ct==0) || (ct == 2)) return boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svStartEnd)) + svRec.part1; else if (ct == 1) { std::string strEnd=boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svStartEnd)); std::string refPart=strEnd; std::string::reverse_iterator itR = strEnd.rbegin(); std::string::reverse_iterator itREnd = strEnd.rend(); for(unsigned int i = 0; itR!=itREnd; ++itR, ++i) { switch (*itR) { case 'A': refPart[i]='T'; break; case 'C': refPart[i]='G'; break; case 'G': refPart[i]='C'; break; case 'T': refPart[i]='A'; break; case 'N': refPart[i]='N'; break; default: break; } } return refPart + svRec.part1; } else return svRec.part1 + boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svStartEnd)); } else { // chr2 if (ct==0) { std::string strEnd=boost::to_upper_copy(std::string(ref + svRec.svEndBeg, ref + svRec.svEndEnd)); std::string refPart=strEnd; std::string::reverse_iterator itR = strEnd.rbegin(); std::string::reverse_iterator itREnd = strEnd.rend(); for(unsigned int i = 0; itR!=itREnd; ++itR, ++i) { switch (*itR) { case 'A': refPart[i]='T'; break; case 'C': refPart[i]='G'; break; case 'G': refPart[i]='C'; break; case 'T': refPart[i]='A'; break; case 'N': refPart[i]='N'; break; default: break; } } return refPart; } else return boost::to_upper_copy(std::string(ref + svRec.svEndBeg, ref + svRec.svEndEnd)); } } else { if (svt == 2) { if (svRec.svEnd - svRec.svStart <= DELLY_CHOP_REFSIZE) return boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svEndEnd)); else return boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svStartEnd)) + boost::to_upper_copy(std::string(ref + svRec.svEndBeg, ref + svRec.svEndEnd)); 
} else if (svt == 4) { return boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svEndEnd)); } else if (svt == 3) { return boost::to_upper_copy(std::string(ref + svRec.svEndBeg, ref + svRec.svEndEnd)) + boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svStartEnd)); } else if (svt == 0) { std::string strEnd=boost::to_upper_copy(std::string(ref + svRec.svEndBeg, ref + svRec.svEndEnd)); std::string strRevComp=strEnd; std::string::reverse_iterator itR = strEnd.rbegin(); std::string::reverse_iterator itREnd = strEnd.rend(); for(unsigned int i = 0; itR!=itREnd; ++itR, ++i) { switch (*itR) { case 'A': strRevComp[i]='T'; break; case 'C': strRevComp[i]='G'; break; case 'G': strRevComp[i]='C'; break; case 'T': strRevComp[i]='A'; break; case 'N': strRevComp[i]='N'; break; default: break; } } return boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svStartEnd)) + strRevComp; } else if (svt == 1) { std::string strStart=boost::to_upper_copy(std::string(ref + svRec.svStartBeg, ref + svRec.svStartEnd)); std::string strRevComp=strStart; std::string::reverse_iterator itR = strStart.rbegin(); std::string::reverse_iterator itREnd = strStart.rend(); for(unsigned int i = 0; itR!=itREnd; ++itR, ++i) { switch (*itR) { case 'A': strRevComp[i]='T'; break; case 'C': strRevComp[i]='G'; break; case 'G': strRevComp[i]='C'; break; case 'T': strRevComp[i]='A'; break; case 'N': strRevComp[i]='N'; break; default: break; } } return strRevComp + boost::to_upper_copy(std::string(ref + svRec.svEndBeg, ref + svRec.svEndEnd)); } } return ""; } template inline bool _coordTransform(TString const& ref, TSvRecord const& sv, TAlignDescriptor const& ad, TPosition& finalGapStart, TPosition& finalGapEnd, int32_t svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); if (ct == 0) { int32_t annealed = sv.svStartEnd - sv.svStartBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + ad.rStart; 
finalGapEnd = sv.svEndBeg + (ref.size() - ad.rEnd) + 1; } else if (ct == 1) { int32_t annealed = sv.svStartEnd - sv.svStartBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + (annealed - ad.rStart) + 1; finalGapEnd = sv.svEndBeg + (ad.rEnd - annealed); } else if (ct == 2) { int32_t annealed = sv.svStartEnd - sv.svStartBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + ad.rStart; finalGapEnd = sv.svEndBeg + (ad.rEnd - annealed); } else if (ct == 3) { int32_t annealed = sv.svEndEnd - sv.svEndBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + (ad.rEnd - annealed); finalGapEnd = sv.svEndBeg + ad.rStart; } else return false; return true; } else { if (svt == 2) { if (sv.svEnd - sv.svStart > DELLY_CHOP_REFSIZE) { int32_t annealed = sv.svStartEnd - sv.svStartBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + ad.rStart; finalGapEnd = sv.svEndBeg + (ad.rEnd - annealed); } else { finalGapStart = sv.svStartBeg + ad.rStart; finalGapEnd = sv.svStartBeg + ad.rEnd; } return true; } else if (svt == 3) { int32_t annealed = sv.svEndEnd - sv.svEndBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + (ad.rEnd - annealed); finalGapEnd = sv.svEndBeg + ad.rStart; return true; } else if (svt == 0) { int32_t annealed = sv.svStartEnd - sv.svStartBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + ad.rStart; finalGapEnd = sv.svEndBeg + (ref.size() - ad.rEnd) + 1; return true; } else if (svt == 1) { int32_t annealed = sv.svStartEnd - sv.svStartBeg; if ((ad.rStart >= annealed) || (ad.rEnd < annealed)) return false; finalGapStart = sv.svStartBeg + (annealed - ad.rStart) + 1; finalGapEnd = sv.svEndBeg + (ad.rEnd - annealed); return true; } else if (svt == 4) { finalGapStart = sv.svStartBeg 
+ ad.rStart; finalGapEnd = sv.svStartBeg + ad.rEnd; return true; } } return true; } template inline bool _validSRAlignment(TPos const cStart, TPos const cEnd, TPos const rStart, TPos const rEnd, int32_t const svt) { if (svt == 4) return (((rEnd - rStart) < 5) && ((cEnd - cStart) > 15)); else return (((cEnd - cStart) < 5) && ((rEnd - rStart) > 15)); } template inline bool _checkSVGap(TGap const refGap, TGap const oldRefGap, TGap const varGap, TGap const oldVarGap, int32_t const svt) { if (svt == 4) return (varGap > oldVarGap); else return (refGap > oldRefGap); } template inline void _findHomology(std::string const& consensus, std::string const& svRefStr, TAlignDescriptor& ad, int32_t const svt) { if (svt == 4) { ad.homRight = longestHomology(consensus.substr(ad.cStart), svRefStr.substr(ad.rEnd -1), -1); std::string preC = consensus.substr(0, ad.cEnd - 1); std::string preR = svRefStr.substr(0, ad.rStart); std::reverse(preC.begin(), preC.end()); std::reverse(preR.begin(), preR.end()); ad.homLeft = longestHomology(preC, preR, -1); } else { ad.homRight = longestHomology(consensus.substr(ad.cEnd - 1), svRefStr.substr(ad.rStart), -1); std::string preC = consensus.substr(0, ad.cStart); std::string preR = svRefStr.substr(0, ad.rEnd - 1); std::reverse(preC.begin(), preC.end()); std::reverse(preR.begin(), preR.end()); ad.homLeft = longestHomology(preC, preR, -1); } } template inline void _percentIdentity(TAlign const& align, TAIndex const gS, TAIndex const gE, TFloat& percId) { // Find percent identity bool varSeen = false; bool refSeen = false; uint32_t gapMM = 0; uint32_t mm = 0; uint32_t ma = 0; bool inGap=false; for(TAIndex j = 0; j < (TAIndex) align.shape()[1]; ++j) { if ((j < gS) || (j > gE)) { if (align[0][j] != '-') varSeen = true; if (align[1][j] != '-') refSeen = true; // Internal gap? 
if ((align[0][j] == '-') || (align[1][j] == '-')) { if ((refSeen) && (varSeen)) { if (!inGap) { inGap = true; gapMM = 0; } gapMM += 1; } } else { if (inGap) { mm += gapMM; inGap=false; } if (align[0][j] == align[1][j]) ma += 1; else mm += 1; } } } percId = (TFloat) ma / (TFloat) (ma + mm); } template inline bool _findSplit(TConfig const& c, std::string const& consensus, std::string const& svRefStr, TAlign const& align, TAlignDescriptor& ad, int32_t const svt) { // Initializiation int32_t gS=0; int32_t gE=0; // Find longest internal gap int32_t refIndex=0; int32_t varIndex=0; int32_t gapStartRefIndex=0; int32_t gapStartVarIndex=0; int32_t a1 = 0; bool inGap=false; for(int32_t j = 0; j < (int32_t) align.shape()[1]; ++j) { if (align[0][j] != '-') ++varIndex; if (align[1][j] != '-') ++refIndex; // Internal gap? if (((align[0][j] == '-') || (align[1][j] == '-')) && (refIndex>0) && (varIndex>0)) { if (!inGap) { gapStartVarIndex = (align[0][j] != '-') ? (varIndex - 1) : varIndex; gapStartRefIndex = (align[1][j] != '-') ? (refIndex - 1) : refIndex; a1 = j; inGap = true; } } else { if ((inGap) && (_checkSVGap((refIndex - gapStartRefIndex), (ad.rEnd - ad.rStart), (varIndex - gapStartVarIndex), (ad.cEnd - ad.cStart), svt))) { ad.rStart=gapStartRefIndex; ad.rEnd=refIndex; ad.cStart=gapStartVarIndex; ad.cEnd=varIndex; gS = a1; gE = j - 1; } inGap=false; } } if (ad.rEnd <= ad.rStart) return false; // Is this a valid split-read alignment? 
if (!_validSRAlignment(ad.cStart, ad.cEnd, ad.rStart, ad.rEnd, svt)) return false; // Check percent identity _percentIdentity(align, gS, gE, ad.percId); if (ad.percId < c.flankQuality) return false; // Find homology _findHomology(consensus, svRefStr, ad, svt); // Check flanking alignment length if ((ad.homLeft + c.minimumFlankSize > ad.cStart) || ( varIndex < ad.cEnd + ad.homRight + c.minimumFlankSize)) return false; if ((ad.homLeft + c.minimumFlankSize > ad.rStart) || ( refIndex < ad.rEnd + ad.homRight + c.minimumFlankSize)) return false; // Valid split-read alignment return true; } template inline bool _consRefAlignment(std::string const& cons, std::string const& svRefStr, TAlign& aln, int32_t const svt) { AlignConfig semiglobal; DnaScore lnsc(5, -4, -4, -4); bool reNeedle = false; if (svt == 4) { reNeedle = longNeedle(svRefStr, cons, aln, semiglobal, lnsc); for(uint32_t j = 0; j < aln.shape()[1]; ++j) { char tmp = aln[0][j]; aln[0][j] = aln[1][j]; aln[1][j] = tmp; } } else { reNeedle = longNeedle(cons, svRefStr, aln, semiglobal, lnsc); } return reNeedle; } template inline bool alignConsensus(TConfig const& c, bam_hdr_t* hdr, char const* seq, char const* sndSeq, StructuralVariantRecord& sv) { if ( (int32_t) sv.consensus.size() < (2 * c.minimumFlankSize + sv.insLen)) return false; // Get reference slice Breakpoint bp(sv); if (sv.svt ==4) { int32_t bufferSpace = std::max((int32_t) ((sv.consensus.size() - sv.insLen) / 3), c.minimumFlankSize); _initBreakpoint(hdr, bp, bufferSpace, sv.svt); } else _initBreakpoint(hdr, bp, sv.consensus.size(), sv.svt); if (bp.chr != bp.chr2) bp.part1 = _getSVRef(sndSeq, bp, bp.chr2, sv.svt); std::string svRefStr = _getSVRef(seq, bp, bp.chr, sv.svt); // Consensus to reference alignment typedef boost::multi_array TAlign; TAlign align; if (!_consRefAlignment(sv.consensus, svRefStr, align, sv.svt)) return false; // Debug consensus to reference alignment //std::cerr << "Consensus-to-Reference alignment" << std::endl; //for(uint32_t i = 0; i 
< align.shape()[0]; ++i) { //for(uint32_t j = 0; j< align.shape()[1]; ++j) { //std::cerr << align[i][j]; //} //std::cerr << std::endl; //} // Check breakpoint AlignDescriptor ad; if (!_findSplit(c, sv.consensus, svRefStr, align, ad, sv.svt)) return false; // Get the start and end of the structural variant unsigned int finalGapStart = 0; unsigned int finalGapEnd = 0; if (!_coordTransform(svRefStr, bp, ad, finalGapStart, finalGapEnd, sv.svt)) return false; sv.precise=true; sv.svStart=finalGapStart; sv.svEnd=finalGapEnd; sv.srAlignQuality = ad.percId; sv.insLen=ad.cEnd - ad.cStart - 1; sv.homLen=std::max(0, ad.homLeft + ad.homRight - 2); int32_t ci_wiggle = std::max(ad.homLeft, ad.homRight); sv.ciposlow = -ci_wiggle; sv.ciposhigh = ci_wiggle; sv.ciendlow = -ci_wiggle; sv.ciendhigh = ci_wiggle; if (c.islr) { // Set alleles sv.alleles = _addAlleles(boost::to_upper_copy(std::string(seq + sv.svStart - 1, seq + sv.svStart)), std::string(hdr->target_name[sv.chr2]), sv, sv.svt); // Get exact alleles for INS and DEL if ((sv.svt == 2) || (sv.svt == 4)) { std::string refVCF; std::string altVCF; int32_t cpos = 0; bool inSV = false; for(uint32_t j = 0; jcore.tid != rec->core.mtid); } // Deletions inline uint8_t _getSpanOrientation(int32_t const svt) { if (_translocation(svt)) { return svt - DELLY_SVT_TRANS; } else { return svt; } } // Structural variant record struct StructuralVariantRecord { int32_t chr; int32_t svStart; int32_t chr2; int32_t svEnd; int32_t ciposlow; int32_t ciposhigh; int32_t ciendlow; int32_t ciendhigh; int32_t srSupport; int32_t srMapQuality; int32_t mapq; int32_t insLen; int32_t svt; int32_t id; int32_t homLen; int32_t peSupport; int32_t peMapQuality; float srAlignQuality; bool precise; std::string alleles; std::string consensus; StructuralVariantRecord() : chr(0), svStart(0), chr2(0), svEnd(0), ciposlow(0), ciposhigh(0), ciendlow(0), ciendhigh(0), srSupport(0), srMapQuality(0), mapq(0), insLen(0), svt(-1), id(0), homLen(0), peSupport(0), peMapQuality(0), 
srAlignQuality(0), precise(false) {} StructuralVariantRecord(int32_t const c, int32_t const s, int32_t const e) : chr(c), svStart(s), chr2(c), svEnd(e), ciposlow(0), ciposhigh(0), ciendlow(0), ciendhigh(0), srSupport(0), srMapQuality(0), mapq(0), insLen(0), svt(-1), id(0), homLen(0), peSupport(0), peMapQuality(0), srAlignQuality(0), precise(false) {} StructuralVariantRecord(int32_t const c1, int32_t const s, int32_t const c2, int32_t const e, int32_t const cipl, int32_t const ciph, int32_t const ciel, int32_t const cieh, int32_t const sup, int32_t const srmapq, int32_t const qval, int32_t const ilen, int32_t const svtype, int32_t const idval): chr(c1), svStart(s), chr2(c2), svEnd(e), ciposlow(cipl), ciposhigh(ciph), ciendlow(ciel), ciendhigh(cieh), srSupport(sup), srMapQuality(srmapq), mapq(qval), insLen(ilen), svt(svtype), id(idval), homLen(0), peSupport(0), peMapQuality(0), srAlignQuality(0), precise(true) {} }; template struct SortSVs : public std::binary_function { inline bool operator()(TSV const& sv1, TSV const& sv2) { return ((sv1.chr sv2.peSupport)) || ((sv1.chr==sv2.chr) && (sv1.svStart==sv2.svStart) && (sv1.chr2==sv2.chr2) && (sv1.svEnd==sv2.svEnd) && (sv1.peSupport == sv2.peSupport) && (sv1.srSupport > sv2.srSupport))); } }; struct Breakpoint { int32_t svStartBeg; int32_t svStartEnd; int32_t svEndBeg; int32_t svEndEnd; int32_t svStart; int32_t svEnd; int32_t peSupport; int32_t svt; int32_t chr; int32_t chr2; std::string part1; Breakpoint() : svStartBeg(0), svStartEnd(0), svEndBeg(0), svEndEnd(0), svStart(0), svEnd(0), peSupport(0), svt(-1), chr(0), chr2(0) {} explicit Breakpoint(StructuralVariantRecord const& sv) : svStartBeg(sv.svStart), svStartEnd(sv.svStart), svEndBeg(sv.svEnd), svEndEnd(sv.svEnd), svStart(sv.svStart), svEnd(sv.svEnd), peSupport(sv.peSupport), svt(sv.svt), chr(sv.chr), chr2(sv.chr2) {} }; // Initialize breakpoint template inline void _initBreakpoint(bam_hdr_t* hdr, TBreakpoint& bp, int32_t const boundary, int32_t const svt) { if 
(_translocation(svt)) { bp.svStartBeg = std::max(0, bp.svStart - boundary); bp.svStartEnd = std::min((int32_t) (hdr->target_len[bp.chr]), bp.svStart + boundary); bp.svEndBeg = std::max(0, bp.svEnd - boundary); bp.svEndEnd = std::min((int32_t) (hdr->target_len[bp.chr2]), bp.svEnd + boundary); } else { if (svt == 4) { bp.svStartBeg = std::max(0, bp.svStart - boundary); bp.svStartEnd = std::min((int32_t) (hdr->target_len[bp.chr]), bp.svStart + boundary); bp.svEndBeg = std::max(0, bp.svEnd - boundary); bp.svEndEnd = std::min((int32_t) (hdr->target_len[bp.chr2]), bp.svEnd + boundary); } else { bp.svStartBeg = std::max(0, bp.svStart - boundary); bp.svStartEnd = std::min(bp.svStart + boundary, (bp.svStart + bp.svEnd)/2); bp.svEndBeg = std::max((bp.svStart + bp.svEnd)/2 + 1, bp.svEnd - boundary); bp.svEndEnd = std::min((int32_t) (hdr->target_len[bp.chr2]), bp.svEnd + boundary); } } } template inline TPos _minCoord(TPos const position, TPos const matePosition, int32_t const svt) { if (_translocation(svt)) return position; else return std::min(position, matePosition); } template inline TPos _maxCoord(TPos const position, TPos const matePosition, int32_t const svt) { if (_translocation(svt)) return matePosition; else return std::max(position, matePosition); } // Deletions, duplications and inversions template inline bool _mappingPosGeno(TRef const refID, TRef const mateRefID, TPos const position, TPos const matePosition, int32_t const svt) { if (_translocation(svt)) return ((refID==mateRefID) && (position==matePosition)); else { if (svt == 3) return ((refID!=mateRefID) || (std::abs(position - matePosition) < 100 )); else return ((refID!=mateRefID) || (position==matePosition)); } } template inline bool _svSizeCheck(TSize const s, TSize const e, int32_t const svt) { // Short reads if (svt == 0) return (( e - s ) >= 300); else if (svt == 1) return (( e - s ) >= 300); else if (svt == 2) return (( e - s ) >= 300); else if (svt == 3) return (( e - s ) >= 100); else return true; } 
template inline bool _svSizeCheck(TSize const s, TSize const e, int32_t const svt, int32_t const inslen) { // Long reads if (svt == 0) return (( e - s ) >= 15); else if (svt == 1) return (( e - s ) >= 15); else if (svt == 2) return (( e - s ) >= 15); else if (svt == 3) return (( e - s ) >= 15); else if (svt == 4) return (inslen >= 15); else return true; } // 0: Left-spanning inversion // 1: Right-spanning inversion // 2: Deletion-type // 3: Duplication-type inline uint8_t getSVType(bam1_t* rec) { if (!(rec->core.flag & BAM_FREVERSE)) { if (!(rec->core.flag & BAM_FMREVERSE)) return 0; else return (rec->core.pos < rec->core.mpos) ? 2 : 3; } else { if (!(rec->core.flag & BAM_FMREVERSE)) return (rec->core.pos > rec->core.mpos) ? 2 : 3; else return 1; } } inline int32_t _isizeMappingPos(bam1_t* rec, int32_t isize) { if (_translocation(rec)) { uint8_t orient = getSVType(rec); if (orient == 0) return DELLY_SVT_TRANS + 0; else if (orient == 1) return DELLY_SVT_TRANS + 1; else { // 3to5 or 5to3? 
if (rec->core.tid > rec->core.mtid) { if (!(rec->core.flag & BAM_FREVERSE)) return DELLY_SVT_TRANS + 2; else return DELLY_SVT_TRANS + 3; } else { if (!(rec->core.flag & BAM_FREVERSE)) return DELLY_SVT_TRANS + 3; else return DELLY_SVT_TRANS + 2; } } } else { if (rec->core.pos == rec->core.mpos) return -1; // No SV uint8_t orient = getSVType(rec); if (orient == 0) return 0; else if (orient == 1) return 1; else if (orient == 2) { if (isize > std::abs(rec->core.isize)) return -1; else return 2; } else { if (std::abs(rec->core.pos - rec->core.mpos) < 100) return -1; // Too small return 3; } } } inline unsigned hash_string(const char *s) { unsigned h = 37; while (*s) { h = (h * 54059) ^ (s[0] * 76963); s++; } return h; } template inline bool _firstPairObs(bam1_t* rec, TAlignedReads const& lastAlignedPosReads) { if (rec->core.tid == rec->core.mtid) return ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))); else return (rec->core.tid < rec->core.mtid); } // Deletions template inline bool _pairsDisagree(TSize const pair1Min, TSize const pair1Max, TSize const pair1ReadLength, TISize const pair1maxNormalISize, TSize const pair2Min, TSize const pair2Max, TSize const pair2ReadLength, TISize const pair2maxNormalISize, int32_t const svt) { if (_translocation(svt)) { uint8_t ct = _getSpanOrientation(svt); // Check read offsets if (ct%2==0) { if ((pair2Min + pair2ReadLength - pair1Min) > pair1maxNormalISize) return true; if (ct>=2) { if (pair2Max < pair1Max) { if ((pair1Max + pair1ReadLength - pair2Max) > pair1maxNormalISize) return true; } else { if ((pair2Max + pair2ReadLength - pair1Max) > pair2maxNormalISize) return true; } } else { if (pair2Max < pair1Max) { if ((pair1Max + pair1ReadLength - pair2Max) > pair2maxNormalISize) return true; } else { if ((pair2Max + pair2ReadLength - pair1Max) > pair1maxNormalISize) return true; } } } else { if ((pair2Min + 
pair2ReadLength - pair1Min) > pair2maxNormalISize) return true; if (ct>=2) { if (pair2Max < pair1Max) { if ((pair1Max + pair1ReadLength - pair2Max) > pair2maxNormalISize) return true; } else { if ((pair2Max + pair2ReadLength - pair1Max) > pair1maxNormalISize) return true; } } else { if (pair2Max < pair1Max) { if ((pair1Max + pair1ReadLength - pair2Max) > pair1maxNormalISize) return true; } else { if ((pair2Max + pair2ReadLength - pair1Max) > pair2maxNormalISize) return true; } } } return false; } else { if (svt < 2) { // Inversion if (!svt) { // Left-spanning inversions if ((pair2Min + pair2ReadLength - pair1Min) > pair1maxNormalISize) return true; if ((pair2Max < pair1Max) && ((pair1Max + pair1ReadLength - pair2Max) > pair2maxNormalISize)) return true; if ((pair2Max >= pair1Max) && ((pair2Max + pair2ReadLength - pair1Max) > pair1maxNormalISize)) return true; } else { // Right-spanning inversions if ((pair2Min + pair2ReadLength - pair1Min) > pair2maxNormalISize) return true; if ((pair2Max < pair1Max) && ((pair1Max + pair1ReadLength - pair2Max) > pair1maxNormalISize)) return true; if ((pair2Max >= pair1Max) && ((pair2Max + pair2ReadLength - pair1Max) > pair2maxNormalISize)) return true; } return false; } else if (svt == 2) { // Deletion if ((pair2Min + pair2ReadLength - pair1Min) > pair1maxNormalISize) return true; if ((pair2Max < pair1Max) && ((pair1Max + pair1ReadLength - pair2Max) > pair1maxNormalISize)) return true; if ((pair2Max >= pair1Max) && ((pair2Max + pair2ReadLength - pair1Max) > pair2maxNormalISize)) return true; if ((pair1Max < pair2Min) || (pair2Max < pair1Min)) return true; return false; } else if (svt == 3) { if ((pair2Min + pair2ReadLength - pair1Min) > pair2maxNormalISize) return true; if ((pair2Max < pair1Max) && ((pair1Max + pair1ReadLength - pair2Max) > pair2maxNormalISize)) return true; if ((pair2Max >= pair1Max) && ((pair2Max + pair2ReadLength - pair1Max) > pair1maxNormalISize)) return true; return false; } } return false; } } #endif 
delly-0.9.1/src/tegua.h000066400000000000000000000336171414764127700147260ustar00rootroot00000000000000#ifndef TEGUA_H #define TEGUA_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "delly.h" #include "coverage.h" #include "genotype.h" #include "util.h" #include "junction.h" #include "cluster.h" #include "assemble.h" #include "modvcf.h" namespace torali { struct TeguaConfig { bool islr; bool hasDumpFile; bool hasExcludeFile; bool isHaplotagged; bool svtcmd; uint16_t minMapQual; uint16_t minGenoQual; uint32_t minClip; uint32_t minRefSep; uint32_t maxReadSep; uint32_t graphPruning; uint32_t minCliqueSize; uint32_t maxReadPerSV; int32_t nchr; int32_t minimumFlankSize; float indelExtension; float flankQuality; std::set svtset; DnaScore aliscore; boost::filesystem::path dumpfile; boost::filesystem::path outfile; std::vector files; boost::filesystem::path genome; boost::filesystem::path exclude; std::vector sampleName; }; template inline void _alignmentScore(TConfig& c, std::string const& scoring) { typedef boost::tokenizer< boost::char_separator > Tokenizer; boost::char_separator sep(",\t "); Tokenizer tokens(scoring, sep); Tokenizer::iterator tokIter = tokens.begin(); int32_t match = boost::lexical_cast(*tokIter++); int32_t mismatch = boost::lexical_cast(*tokIter++); int32_t go = boost::lexical_cast(*tokIter++); int32_t ge = boost::lexical_cast(*tokIter++); c.aliscore = DnaScore(match, mismatch, go, ge); //std::cerr << c.aliscore.match << ',' << c.aliscore.mismatch << ',' << c.aliscore.go << ',' << c.aliscore.ge << std::endl; return; } template inline int32_t runTegua(TConfig& c) { #ifdef PROFILE ProfilerStart("delly.prof"); #endif // Structural Variants typedef std::vector TVariants; TVariants svs; // Open header samFile* samfile = sam_open(c.files[0].string().c_str(), "r"); bam_hdr_t* hdr = sam_hdr_read(samfile); // Exclude intervals typedef 
boost::icl::interval_set TChrIntervals; typedef std::vector TRegionsGenome; TRegionsGenome validRegions; if (!_parseExcludeIntervals(c, hdr, validRegions)) { std::cerr << "Delly couldn't parse exclude intervals!" << std::endl; bam_hdr_destroy(hdr); sam_close(samfile); return 1; } // SR Store typedef std::vector TSvPosVector; typedef boost::unordered_map TReadSV; TReadSV srStore; // Identify SVs if (srStore.empty()) { // Structural Variant Candidates typedef std::vector TVariants; TVariants svc; // Temporary split-read store TReadSV tmpStore; // SV Discovery _clusterSRReads(c, validRegions, svc, tmpStore); // Assemble assemble(c, validRegions, svc, tmpStore); // Sort SVs sort(svc.begin(), svc.end(), SortSVs()); // Keep assembled SVs only StructuralVariantRecord lastSV; for(typename TVariants::iterator svIter = svc.begin(); svIter != svc.end(); ++svIter) { if ((svIter->srSupport == 0) && (svIter->peSupport == 0)) continue; // Duplicate? if (!svs.empty()) { if ((lastSV.chr == svIter->chr) && (lastSV.chr2 == svIter->chr2) && (std::abs(svIter->svStart - lastSV.svStart) < c.minRefSep) && (std::abs(svIter->svEnd - lastSV.svEnd) < c.minRefSep)) continue; } lastSV = *svIter; svs.push_back(*svIter); } // Sort sort(svs.begin(), svs.end(), SortSVs()); // Re-number SVs and update SR Store typedef std::map TIdMap; TIdMap idmap; uint32_t cliqueCount = 0; for(typename TVariants::iterator svIt = svs.begin(); svIt != svs.end(); ++svIt, ++cliqueCount) { idmap.insert(std::make_pair(svIt->id, cliqueCount)); svIt->id = cliqueCount; } for(typename TReadSV::iterator ts = tmpStore.begin(); ts != tmpStore.end(); ++ts) { bool keep = false; for(uint32_t idx = 0; idx < ts->second.size(); ++idx) { if (idmap.find(ts->second[idx].svid) == idmap.end()) ts->second[idx].svid = -1; else { ts->second[idx].svid = idmap.find(ts->second[idx].svid)->second; keep = true; } } if (keep) srStore.insert(*ts); } //outputStructuralVariants(c, svs); } // Clean-up bam_hdr_destroy(hdr); sam_close(samfile); // 
Annotate junction reads typedef std::vector TSVJunctionMap; typedef std::vector TSampleSVJunctionMap; TSampleSVJunctionMap jctMap(c.files.size()); // Annotate spanning coverage typedef std::vector TSVSpanningMap; typedef std::vector TSampleSVSpanningMap; TSampleSVSpanningMap spanMap(c.files.size()); // Annotate coverage typedef std::vector TSVReadCount; typedef std::vector TSampleSVReadCount; TSampleSVReadCount rcMap(c.files.size()); // Initialize count maps for(uint32_t file_c = 0; file_c < c.files.size(); ++file_c) { jctMap[file_c].resize(svs.size(), JunctionCount()); spanMap[file_c].resize(svs.size(), SpanningCount()); rcMap[file_c].resize(svs.size(), ReadCount()); } // Reference SV Genotyping //trackRef(c, svs, jctMap, rcMap); genotypeLR(c, svs, srStore, jctMap, rcMap); // VCF Output vcfOutput(c, svs, jctMap, rcMap, spanMap); #ifdef PROFILE ProfilerStop(); #endif // End boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] Done." << std::endl;; return 0; } int tegua(int argc, char **argv) { TeguaConfig c; c.isHaplotagged = false; c.islr = true; // Parameter std::string svtype; std::string scoring; std::string mode; boost::program_options::options_description generic("Generic options"); generic.add_options() ("help,?", "show help message") ("svtype,t", boost::program_options::value(&svtype)->default_value("ALL"), "SV type to compute [DEL, INS, DUP, INV, BND, ALL]") ("technology,y", boost::program_options::value(&mode)->default_value("ont"), "seq. 
technology [pb, ont]") ("genome,g", boost::program_options::value(&c.genome), "genome fasta file") ("exclude,x", boost::program_options::value(&c.exclude), "file with regions to exclude") ("outfile,o", boost::program_options::value(&c.outfile)->default_value("sv.bcf"), "SV BCF output file") ; boost::program_options::options_description disc("Discovery options"); disc.add_options() ("mapqual,q", boost::program_options::value(&c.minMapQual)->default_value(10), "min. mapping quality") ("minclip,c", boost::program_options::value(&c.minClip)->default_value(25), "min. clipping length") ("min-clique-size,z", boost::program_options::value(&c.minCliqueSize)->default_value(2), "min. clique size") ("minrefsep,m", boost::program_options::value(&c.minRefSep)->default_value(30), "min. reference separation") ("maxreadsep,n", boost::program_options::value(&c.maxReadSep)->default_value(75), "max. read separation") ; boost::program_options::options_description cons("Consensus options"); cons.add_options() ("max-reads,p", boost::program_options::value(&c.maxReadPerSV)->default_value(5), "max. reads for consensus computation") ("flank-size,f", boost::program_options::value(&c.minimumFlankSize)->default_value(100), "min. flank size") ("flank-quality,a", boost::program_options::value(&c.flankQuality)->default_value(0.9), "min. flank quality") ; boost::program_options::options_description geno("Genotyping options"); geno.add_options() ("geno-qual,u", boost::program_options::value(&c.minGenoQual)->default_value(5), "min. 
mapping quality for genotyping") ("dump,d", boost::program_options::value(&c.dumpfile), "gzipped output file for SV-reads") ; boost::program_options::options_description hidden("Hidden options"); hidden.add_options() ("input-file", boost::program_options::value< std::vector >(&c.files), "input file") ("pruning,j", boost::program_options::value(&c.graphPruning)->default_value(1000), "graph pruning cutoff") ("extension,e", boost::program_options::value(&c.indelExtension)->default_value(0.5), "enforce indel extension") ("scoring,s", boost::program_options::value(&scoring)->default_value("3,-2,-3,-1"), "alignment scoring") ; boost::program_options::positional_options_description pos_args; pos_args.add("input-file", -1); boost::program_options::options_description cmdline_options; cmdline_options.add(generic).add(disc).add(cons).add(geno).add(hidden); boost::program_options::options_description visible_options; visible_options.add(generic).add(disc).add(cons).add(geno); boost::program_options::variables_map vm; boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm); boost::program_options::notify(vm); // Check command line arguments if ((vm.count("help")) || (!vm.count("input-file")) || (!vm.count("genome"))) { std::cout << std::endl; std::cout << "Usage: delly " << argv[0] << " [OPTIONS] -g ..." << std::endl; std::cout << visible_options << "\n"; return 0; } // Set alignment score _alignmentScore(c, scoring); // SV types to compute? 
_svTypesToCompute(c, svtype, vm.count("svtype")); // Dump reads if (vm.count("dump")) c.hasDumpFile = true; else c.hasDumpFile = false; // Clique size if (c.minCliqueSize < 2) c.minCliqueSize = 2; // Check reference if (!(boost::filesystem::exists(c.genome) && boost::filesystem::is_regular_file(c.genome) && boost::filesystem::file_size(c.genome))) { std::cerr << "Reference file is missing: " << c.genome.string() << std::endl; return 1; } else { faidx_t* fai = fai_load(c.genome.string().c_str()); if (fai == NULL) { if (fai_build(c.genome.string().c_str()) == -1) { std::cerr << "Fail to open genome fai index for " << c.genome.string() << std::endl; return 1; } else fai = fai_load(c.genome.string().c_str()); } fai_destroy(fai); } // Check input files c.sampleName.resize(c.files.size()); c.nchr = 0; for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { if (!(boost::filesystem::exists(c.files[file_c]) && boost::filesystem::is_regular_file(c.files[file_c]) && boost::filesystem::file_size(c.files[file_c]))) { std::cerr << "Alignment file is missing: " << c.files[file_c].string() << std::endl; return 1; } samFile* samfile = sam_open(c.files[file_c].string().c_str(), "r"); if (samfile == NULL) { std::cerr << "Fail to open file " << c.files[file_c].string() << std::endl; return 1; } hts_idx_t* idx = sam_index_load(samfile, c.files[file_c].string().c_str()); if (idx == NULL) { std::cerr << "Fail to open index for " << c.files[file_c].string() << std::endl; return 1; } bam_hdr_t* hdr = sam_hdr_read(samfile); if (hdr == NULL) { std::cerr << "Fail to open header for " << c.files[file_c].string() << std::endl; return 1; } if (!c.nchr) c.nchr = hdr->n_targets; else { if (c.nchr != hdr->n_targets) { std::cerr << "BAM files have different number of chromosomes!" 
<< std::endl; return 1; } } faidx_t* fai = fai_load(c.genome.string().c_str()); for(int32_t refIndex=0; refIndex < hdr->n_targets; ++refIndex) { std::string tname(hdr->target_name[refIndex]); if (!faidx_has_seq(fai, tname.c_str())) { std::cerr << "BAM file chromosome " << hdr->target_name[refIndex] << " is NOT present in your reference file " << c.genome.string() << std::endl; return 1; } } fai_destroy(fai); std::string sampleName = "unknown"; getSMTag(std::string(hdr->text), c.files[file_c].stem().string(), sampleName); c.sampleName[file_c] = sampleName; bam_hdr_destroy(hdr); hts_idx_destroy(idx); sam_close(samfile); } // Check exclude file if (vm.count("exclude")) { if (!(boost::filesystem::exists(c.exclude) && boost::filesystem::is_regular_file(c.exclude) && boost::filesystem::file_size(c.exclude))) { std::cerr << "Exclude file is missing: " << c.exclude.string() << std::endl; return 1; } c.hasExcludeFile = true; } else c.hasExcludeFile = false; // Check output directory if (!_outfileValid(c.outfile)) return 1; // Show cmd boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); std::cout << '[' << boost::posix_time::to_simple_string(now) << "] "; std::cout << "delly "; for(int i=0; i #include #include #include #include #include #include #include #include #include "tags.h" namespace torali { #ifndef LAST_BIN #define LAST_BIN 65535 #endif #ifndef MAX_CN #define MAX_CN 10 #endif struct LibraryInfo { int32_t rs; int32_t median; int32_t mad; int32_t minNormalISize; int32_t minISizeCutoff; int32_t maxNormalISize; int32_t maxISizeCutoff; uint32_t abnormal_pairs; LibraryInfo() : rs(0), median(0), mad(0), minNormalISize(0), minISizeCutoff(0), maxNormalISize(0), maxISizeCutoff(0), abnormal_pairs(0) {} }; struct CNV { int32_t chr; int32_t start; int32_t end; int32_t ciposlow; int32_t ciposhigh; int32_t ciendlow; int32_t ciendhigh; int32_t qval; double cn; double mappable; double sd; CNV() : chr(0), start(0), end(0), ciposlow(0), ciposhigh(0), 
ciendlow(0), ciendhigh(0), qval(0), cn(-1), mappable(0), sd(1) {} CNV(int32_t const c, int32_t const s, int32_t const e, int32_t const cil, int32_t const cih, int32_t const cel, int32_t ceh, double const estcn, double const mp) : chr(c), start(s), end(e), ciposlow(cil), ciposhigh(cih), ciendlow(cel), ciendhigh(ceh), qval(0), cn(estcn), mappable(mp), sd(1) {} }; template struct SortCNVs : public std::binary_function { inline bool operator()(TCNV const& sv1, TCNV const& sv2) { return ((sv1.chr(sv.svEnd) + "]"; } else if (ct == 1) { return ref + "," + "[" + chr2 + ":" + boost::lexical_cast(sv.svEnd) + "[" + ref; } else if (ct == 2) { return ref + "," + ref + "[" + chr2 + ":" + boost::lexical_cast(sv.svEnd) + "["; } else if (ct == 3) { return ref + "," + "]" + chr2 + ":" + boost::lexical_cast(sv.svEnd) + "]" + ref; } else { return ref + ",<" + _addID(svt) + ">"; } } else return ref + ",<" + _addID(svt) + ">"; } // Add Orientation inline std::string _addOrientation(int32_t const svt) { uint8_t ct = _getSpanOrientation(svt); if (ct==0) return "3to3"; else if (ct==1) return "5to5"; else if (ct==2) return "3to5"; else if (ct==3) return "5to3"; else return "NtoN"; } // Output directory/file checks inline bool _outfileValid(boost::filesystem::path const& outfile) { try { boost::filesystem::path outdir; if (outfile.has_parent_path()) outdir = outfile.parent_path(); else outdir = boost::filesystem::current_path(); if (!boost::filesystem::exists(outdir)) { std::cerr << "Output directory does not exist: " << outdir << std::endl; return false; } else { boost::filesystem::file_status s = boost::filesystem::status(outdir); boost::filesystem::ofstream file(outfile.string()); file.close(); if (!(boost::filesystem::exists(outfile) && boost::filesystem::is_regular_file(outfile))) { std::cerr << "Fail to open output file " << outfile.string() << std::endl; std::cerr << "Output directory permissions: " << s.permissions() << std::endl; return false; } else { 
boost::filesystem::remove(outfile.string()); } } } catch (boost::filesystem::filesystem_error const& e) { std::cerr << e.what() << std::endl; return false; } return true; } template inline void _svTypesToCompute(TConfig& c, std::string const& svtype, bool const specified) { c.svtcmd = false; if (specified) { c.svtcmd = true; if (svtype == "DEL") { c.svtset.insert(2); } else if (svtype == "INS") { c.svtset.insert(4); } else if (svtype == "DUP") { c.svtset.insert(3); } else if (svtype == "INV") { c.svtset.insert(0); c.svtset.insert(1); } else if (svtype == "INV_3to3") { c.svtset.insert(0); } else if (svtype == "INV_5to5") { c.svtset.insert(1); } else if (svtype == "BND") { c.svtset.insert(DELLY_SVT_TRANS + 0); c.svtset.insert(DELLY_SVT_TRANS + 1); c.svtset.insert(DELLY_SVT_TRANS + 2); c.svtset.insert(DELLY_SVT_TRANS + 3); } else if (svtype == "BND_3to3") { c.svtset.insert(DELLY_SVT_TRANS + 0); } else if (svtype == "BND_5to5") { c.svtset.insert(DELLY_SVT_TRANS + 1); } else if (svtype == "BND_3to5") { c.svtset.insert(DELLY_SVT_TRANS + 2); } else if (svtype == "BND_5to3") { c.svtset.insert(DELLY_SVT_TRANS + 3); } else { c.svtcmd = false; } } } inline uint32_t sequenceLength(bam1_t const* rec) { uint32_t* cigar = bam_get_cigar(rec); uint32_t slen = 0; for (uint32_t i = 0; i < rec->core.n_cigar; ++i) if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF) || (bam_cigar_op(cigar[i]) == BAM_CINS) || (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) || (bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP)) slen += bam_cigar_oplen(cigar[i]); return slen; } inline int32_t readLength(bam1_t const* rec) { //int32_t slen = rec->core.l_qseq; # Incorrect for seq. 
with hard-clips return sequenceLength(rec); } inline uint32_t alignmentLength(bam1_t const* rec) { uint32_t* cigar = bam_get_cigar(rec); uint32_t alen = 0; for (uint32_t i = 0; i < rec->core.n_cigar; ++i) if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF) || (bam_cigar_op(cigar[i]) == BAM_CDEL) || (bam_cigar_op(cigar[i]) == BAM_CREF_SKIP)) alen += bam_cigar_oplen(cigar[i]); return alen; } inline uint32_t halfAlignmentLength(bam1_t const* rec) { return (alignmentLength(rec) / 2); } inline uint32_t lastAlignedPosition(bam1_t const* rec) { return rec->core.pos + alignmentLength(rec); } inline std::size_t hash_lr(bam1_t* rec) { boost::hash string_hash; std::string qname = bam_get_qname(rec); std::size_t seed = hash_string(qname.c_str()); boost::hash_combine(seed, string_hash(qname)); return seed; } inline std::size_t hash_pair(bam1_t* rec) { std::size_t seed = hash_string(bam_get_qname(rec)); boost::hash_combine(seed, rec->core.tid); boost::hash_combine(seed, rec->core.pos); boost::hash_combine(seed, rec->core.mtid); boost::hash_combine(seed, rec->core.mpos); return seed; } inline std::size_t hash_pair_mate(bam1_t* rec) { std::size_t seed = hash_string(bam_get_qname(rec)); boost::hash_combine(seed, rec->core.mtid); boost::hash_combine(seed, rec->core.mpos); boost::hash_combine(seed, rec->core.tid); boost::hash_combine(seed, rec->core.pos); return seed; } inline void reverseComplement(std::string& sequence) { std::string rev = boost::to_upper_copy(std::string(sequence.rbegin(), sequence.rend())); std::size_t i = 0; for(std::string::iterator revIt = rev.begin(); revIt != rev.end(); ++revIt, ++i) { switch (*revIt) { case 'A': sequence[i]='T'; break; case 'C': sequence[i]='G'; break; case 'G': sequence[i]='C'; break; case 'T': sequence[i]='A'; break; case 'N': sequence[i]='N'; break; default: break; } } } inline std::string compressStr(std::string const& data) { std::stringstream compressed; 
std::stringstream origin(data); boost::iostreams::filtering_streambuf out; out.push(boost::iostreams::gzip_compressor(boost::iostreams::gzip_params(boost::iostreams::gzip::best_speed))); out.push(origin); boost::iostreams::copy(out, compressed); return compressed.str(); } inline std::string decompressStr(std::string const& data) { std::stringstream compressed(data); std::stringstream decompressed; boost::iostreams::filtering_streambuf out; out.push(boost::iostreams::gzip_decompressor()); out.push(compressed); boost::iostreams::copy(out, decompressed); return decompressed.str(); } inline double entropy(std::string const& st) { typedef double TPrecision; std::vector stvec(st.begin(), st.end()); std::set alphabet(stvec.begin(), stvec.end()); TPrecision ent = 0; for(std::set::const_iterator c = alphabet.begin(); c != alphabet.end(); ++c) { int ctr = 0; for (std::vector::const_iterator s = stvec.begin(); s != stvec.end(); ++s) if (*s == *c) ++ctr; TPrecision freq = (TPrecision) ctr / (TPrecision) stvec.size(); ent += (freq) * log(freq)/log(2); } return -ent; } inline uint32_t setMinChrLen(bam_hdr_t const* hdr, double const xx) { uint32_t minChrLen = 0; std::vector chrlen(hdr->n_targets, 0); uint64_t genomelen = 0; for(int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) { chrlen[refIndex] = hdr->target_len[refIndex]; genomelen += hdr->target_len[refIndex]; } std::sort(chrlen.begin(), chrlen.end(), std::greater()); uint64_t cumsum = 0; for(uint32_t i = 0; i < chrlen.size(); ++i) { cumsum += chrlen[i]; minChrLen = chrlen[i]; if (cumsum > genomelen * xx) break; } return minChrLen; } template inline bool chrNoData(TConfig const& c, uint32_t const refIndex, hts_idx_t const* idx) { // Check we have mapped reads on this chromosome std::string suffix("cram"); std::string str(c.bamFile.string()); if ((str.size() >= suffix.size()) && (str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0)) return false; uint64_t mapped = 0; uint64_t unmapped = 0; 
hts_idx_get_stat(idx, refIndex, &mapped, &unmapped); if (mapped) return false; else return true; } inline std::size_t hash_se(bam1_t* rec) { std::size_t seed = hash_string(bam_get_qname(rec)); boost::hash_combine(seed, rec->core.tid); boost::hash_combine(seed, rec->core.pos); return seed; } inline void getSMTag(std::string const& header, std::string const& fileName, std::string& sampleName) { std::set smIdentifiers; std::string delimiters("\n"); typedef std::vector TStrParts; TStrParts lines; boost::split(lines, header, boost::is_any_of(delimiters)); TStrParts::const_iterator itH = lines.begin(); TStrParts::const_iterator itHEnd = lines.end(); bool rgPresent = false; for(;itH!=itHEnd; ++itH) { if (itH->find("@RG")==0) { std::string delim("\t"); TStrParts keyval; boost::split(keyval, *itH, boost::is_any_of(delim)); TStrParts::const_iterator itKV = keyval.begin(); TStrParts::const_iterator itKVEnd = keyval.end(); for(;itKV != itKVEnd; ++itKV) { size_t sp = itKV->find(":"); if (sp != std::string::npos) { std::string field = itKV->substr(0, sp); if (field == "SM") { rgPresent = true; std::string rgSM = itKV->substr(sp+1); smIdentifiers.insert(rgSM); } } } } } if (!rgPresent) { sampleName = fileName; } else if (smIdentifiers.size() == 1) { sampleName = *(smIdentifiers.begin()); } else if (smIdentifiers.size() > 1) { sampleName = *(smIdentifiers.begin()); std::cerr << "Warning: Multiple sample names (@RG:SM) present in the BAM file!" 
<< std::endl; } } template inline int32_t _parseExcludeIntervals(TConfig const& c, bam_hdr_t* hdr, TRegionsGenome& validRegions) { typedef typename TRegionsGenome::value_type TChrIntervals; typedef typename TChrIntervals::interval_type TIVal; validRegions.resize(hdr->n_targets); TRegionsGenome exclg; exclg.resize(hdr->n_targets); std::vector validChr; validChr.resize(hdr->n_targets, true); if (c.hasExcludeFile) { std::ifstream chrFile(c.exclude.string().c_str(), std::ifstream::in); if (chrFile.is_open()) { while (chrFile.good()) { std::string chrFromFile; getline(chrFile, chrFromFile); typedef boost::tokenizer< boost::char_separator > Tokenizer; boost::char_separator sep(" \t,;"); Tokenizer tokens(chrFromFile, sep); Tokenizer::iterator tokIter = tokens.begin(); if (tokIter!=tokens.end()) { std::string chrName = *tokIter++; int32_t tid = bam_name2id(hdr, chrName.c_str()); if (tid >= 0) { if (tokIter!=tokens.end()) { int32_t start = 0; try { start = boost::lexical_cast(*tokIter++); } catch (boost::bad_lexical_cast&) { std::cerr << "Exclude file needs to be in tab-delimited format: chr, start, end" << std::endl; std::cerr << "Offending line: " << chrFromFile << std::endl; return false; } if (tokIter!=tokens.end()) { int32_t end = start + 1; try { end = boost::lexical_cast(*tokIter++); } catch (boost::bad_lexical_cast&) { std::cerr << "Exclude file needs to be in tab-delimited format: chr, start, end" << std::endl; std::cerr << "Offending line: " << chrFromFile << std::endl; return false; } if (start < end) { exclg[tid].insert(TIVal::right_open(start, end)); } else { std::cerr << "Exclude file needs to be in tab-delimited format (chr, start, end) and start < end." 
<< std::endl; std::cerr << "Offending line: " << chrFromFile << std::endl; return false; } } else { std::cerr << "Exclude file needs to be in tab-delimited format: chr, start, end" << std::endl; std::cerr << "Offending line: " << chrFromFile << std::endl; return false; } } else validChr[tid] = false; // Exclude entire chromosome } } } chrFile.close(); } } // Create the valid regions for (int32_t i = 0; in_targets; ++i) { if (!validChr[i]) continue; uint32_t istart = 0; for(typename TChrIntervals::iterator it = exclg[i].begin(); it != exclg[i].end(); ++it) { if (istart + 1 < it->lower()) validRegions[i].insert(TIVal::right_open(istart, it->lower() - 1)); istart = it->upper(); } if (istart + 1 < hdr->target_len[i]) validRegions[i].insert(TIVal::right_open(istart, hdr->target_len[i])); } exclg.clear(); return true; } template inline void getMedian(TIterator begin, TIterator end, TValue& median) { std::nth_element(begin, begin + (end - begin) / 2, end); median = *(begin + (end - begin) / 2); } template inline void getPercentile(TVector& vec, TPercentile p, TValue& percentile) { std::nth_element(vec.begin(), vec.begin() + int((vec.size() * p)), vec.end()); percentile = *(vec.begin() + int(vec.size() * p)); } template inline int32_t getVariability(TConfig const&, std::vector const& lib) { int32_t overallVariability = 0; for(uint32_t libIdx = 0; libIdx < lib.size(); ++libIdx) { if (lib[libIdx].maxNormalISize > overallVariability) overallVariability = lib[libIdx].maxNormalISize; if (lib[libIdx].rs > overallVariability) overallVariability = lib[libIdx].rs; } return overallVariability; } template inline void getMAD(TIterator begin, TIterator end, TValue median, TValue& mad) { std::vector absDev; for(;begin inline void getMean(TIterator begin, TIterator end, TValue& mean) { mean = 0; unsigned int count = 0; for(; begin inline void getStdDev(TIterator begin, TIterator end, TValue mean, TValue& stdDev) { stdDev = 0; unsigned int count = 0; for(;begin inline void 
getLibraryParams(TConfig const& c, TValidRegion const& validRegions, TSampleLibrary& sampleLib) { typedef typename TValidRegion::value_type TChrIntervals; // Open file handles typedef std::vector TSamFile; typedef std::vector TIndex; typedef std::vector TSamHeader; TSamFile samfile(c.files.size()); TIndex idx(c.files.size()); TSamHeader hdr(c.files.size()); for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { samfile[file_c] = sam_open(c.files[file_c].string().c_str(), "r"); hts_set_fai_filename(samfile[file_c], c.genome.string().c_str()); idx[file_c] = sam_index_load(samfile[file_c], c.files[file_c].string().c_str()); hdr[file_c] = sam_hdr_read(samfile[file_c]); } // Iterate all samples for(uint32_t file_c = 0; file_c < c.files.size(); ++file_c) { uint32_t maxAlignmentsScreened=10000000; uint32_t maxNumAlignments=1000000; uint32_t minNumAlignments=1000; uint32_t alignmentCount=0; uint32_t processedNumPairs = 0; uint32_t processedNumReads = 0; uint32_t rplus = 0; uint32_t nonrplus = 0; typedef std::vector TSizeVector; TSizeVector vecISize; TSizeVector readSize; // Collect insert sizes bool libCharacterized = false; for(uint32_t refIndex=0; refIndex < (uint32_t) hdr[0]->n_targets; ++refIndex) { if (validRegions[refIndex].empty()) continue; for(typename TChrIntervals::const_iterator vRIt = validRegions[refIndex].begin(); ((vRIt != validRegions[refIndex].end()) && (!libCharacterized)); ++vRIt) { hts_itr_t* iter = sam_itr_queryi(idx[file_c], refIndex, vRIt->lower(), vRIt->upper()); bam1_t* rec = bam_init1(); while (sam_itr_next(samfile[file_c], iter, rec) >= 0) { if (!(rec->core.flag & BAM_FREAD2) && (rec->core.l_qseq < 65000)) { if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue; if ((alignmentCount > maxAlignmentsScreened) || ((processedNumReads >= maxNumAlignments) && (processedNumPairs == 0)) || (processedNumPairs >= maxNumAlignments)) { // Paired-end library with enough pairs libCharacterized = 
true; break; } ++alignmentCount; // Single-end if (processedNumReads < maxNumAlignments) { readSize.push_back(rec->core.l_qseq); ++processedNumReads; } // Paired-end if ((rec->core.flag & BAM_FPAIRED) && !(rec->core.flag & BAM_FMUNMAP) && (rec->core.tid==rec->core.mtid)) { if (processedNumPairs < maxNumAlignments) { vecISize.push_back(abs(rec->core.isize)); if (getSVType(rec) == 2) ++rplus; else ++nonrplus; ++processedNumPairs; } } } } bam_destroy1(rec); hts_itr_destroy(iter); if (libCharacterized) break; } if (libCharacterized) break; } // Get library parameters if (processedNumReads >= minNumAlignments) { std::sort(readSize.begin(), readSize.end()); sampleLib[file_c].rs = readSize[readSize.size() / 2]; } if (processedNumPairs >= minNumAlignments) { std::sort(vecISize.begin(), vecISize.end()); int32_t median = vecISize[vecISize.size() / 2]; std::vector absDev; for(uint32_t i = 0; i < vecISize.size(); ++i) absDev.push_back(std::abs((int32_t) vecISize[i] - median)); std::sort(absDev.begin(), absDev.end()); int32_t mad = absDev[absDev.size() / 2]; // Get default library orientation if ((median >= 50) && (median<=100000)) { if (rplus < nonrplus) { std::cerr << "Warning: Sample has a non-default paired-end layout! File: " << c.files[file_c].string() << std::endl; std::cerr << "The expected paired-end orientation is ---Read1---> <---Read2--- which is the default illumina paired-end layout." 
<< std::endl; } else { sampleLib[file_c].median = median; sampleLib[file_c].mad = mad; sampleLib[file_c].maxNormalISize = median + (c.madNormalCutoff * mad); sampleLib[file_c].minNormalISize = median - (c.madNormalCutoff * mad); if (sampleLib[file_c].minNormalISize < 0) sampleLib[file_c].minNormalISize=0; sampleLib[file_c].maxISizeCutoff = median + (c.madCutoff * mad); sampleLib[file_c].minISizeCutoff = median - (c.madCutoff * mad); // Deletion insert-size sanity checks sampleLib[file_c].maxISizeCutoff = std::max(sampleLib[file_c].maxISizeCutoff, 2*sampleLib[file_c].rs); sampleLib[file_c].maxISizeCutoff = std::max(sampleLib[file_c].maxISizeCutoff, 500); if (sampleLib[file_c].minISizeCutoff < 0) sampleLib[file_c].minISizeCutoff=0; } } } } // Clean-up for(unsigned int file_c = 0; file_c < c.files.size(); ++file_c) { bam_hdr_destroy(hdr[file_c]); hts_idx_destroy(idx[file_c]); sam_close(samfile[file_c]); } } template inline uint32_t _trimAlignedSequences(TAlign const& align, std::string& s0, std::string& s1) { int32_t s = -1; int32_t e = -1; uint32_t leadCrop = 0; for(uint32_t j = 0; j