pax_global_header00006660000000000000000000000064131441555030014513gustar00rootroot0000000000000052 comment=825432a2672435389f3d6b1676f7b0f80706eba2 HINGE-0.5.0/000077500000000000000000000000001314415550300123475ustar00rootroot00000000000000HINGE-0.5.0/.gitignore000066400000000000000000000001671314415550300143430ustar00rootroot00000000000000data .DS_Store *.pyc src/build src/.idea/ scripts/.ipynb_checkpoints/ scripts/figures/ build notebook demo .idea inst/ HINGE-0.5.0/.gitmodules000066400000000000000000000011221314415550300145200ustar00rootroot00000000000000[submodule "DALIGNER"] path = thirdparty/DALIGNER url = https://github.com/thegenemyers/DALIGNER.git [submodule "DAZZ_DB"] path = thirdparty/DAZZ_DB url = https://github.com/Eureka22/DAZZ_DB.git [submodule "DEXTRACTOR"] path = thirdparty/DEXTRACTOR url = https://github.com/thegenemyers/DEXTRACTOR.git [submodule "DASCRUBBER"] path = thirdparty/DASCRUBBER url = https://github.com/thegenemyers/DASCRUBBER.git [submodule "graphmap"] path = graphmap url = https://github.com/isovic/graphmap.git [submodule "src/spdlog"] path = src/spdlog url = https://github.com/gabime/spdlog.git HINGE-0.5.0/.travis.yml000066400000000000000000000014121314415550300144560ustar00rootroot00000000000000language: cpp compiler: gcc sudo: required install: sudo apt-get update && sudo apt-get install build-essential && sudo apt-get install libboost-dev && sudo apt-get install libboost-all-dev && sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y && sudo apt-get update; sudo apt-get install gcc-4.8 g++-4.8 -y && gcc-4.8 --version && which gcc-4.8 && wget http://www.cmake.org/files/v3.2/cmake-3.2.2.tar.gz && tar xf cmake-3.2.2.tar.gz && cd cmake-3.2.2 && ./configure && make -j 8 && sudo make install && cmake --version && pwd && cd .. 
script: ./utils/build.sh notifications: email: - xf1280@gmail.com - govinda.kamath@gmail.com HINGE-0.5.0/CMakeLists.txt000066400000000000000000000002721314415550300151100ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.2) project(AwAssembler) set(libexec "lib/hinge") add_subdirectory(src bin) install( DIRECTORY scripts/ DESTINATION ${libexec} USE_SOURCE_PERMISSIONS ) HINGE-0.5.0/README.md000066400000000000000000000132431314415550300136310ustar00rootroot00000000000000# HINGE Software accompanying "HINGE: Long-Read Assembly Achieves Optimal Repeat Resolution" - Preprint: http://biorxiv.org/content/early/2016/08/01/062117 - Paper: http://genome.cshlp.org/content/early/2017/03/20/gr.216465.116.abstract - An ipython notebook to reproduce results in the paper can be found in this [repository](https://github.com/govinda-kamath/HINGE-analyses). CI Status: ![image](https://travis-ci.org/HingeAssembler/HINGE.svg?branch=master) ## Introduction HINGE is a long read assembler based on an idea called _hinging_. ## Pipeline Overview HINGE is an OLC(Overlap-Layout-Consensus) assembler. The idea of the pipeline is shown below. ![image](misc/High_level_overview.png) At a high level, the algorithm can be thought of a variation of the classical greedy algorithm. The main difference with the greedy algorithm is that rather than each read having a single successor, and a single predecessor, we allow a small subset of reads to have a higher number of successors/predecessors. This subset is identified by a process called _hinging_. This helps us to recover the graph structure directly during assembly. Another significant difference from HGAP or Falcon pipeline is that it does not have a pre-assembly or read correction step. ## Algorithm Details ### Reads filtering Reads filtering filters reads that have long chimer in the middle, and short reads. Reads which can have higher number of predecessors/successors are also identified there. 
This is implemented in `filter/filter.cpp` ### Layout The layout is implemented in `layout/hinging.cpp`. It is done by a variant of the greedy algorithm. The graph output by the layout stage is post-processed by running `scripts/pruning_and_clipping.py`. One output is a graphml file which is the graph representation of the backbone. This removes dead ends and Z-structures from the graph enabling easy condensation. It can be analyzed and visualized, etc. ## Parameters In the pipeline described above, several programs load their parameters from a configuration file in the ini format. All tunable parameters are described in [this document](parameter_description.md). # Installation ## Dependencies - g++ 4.9 - cmake 3.x - libhdf5 - boost - Python 2.7 The following python packages are necessary: - numpy - ujson - configparser - colormap - easydev.tools This software is still at prototype stage so it is not well packaged, however it is designed in a modular flavor so different combinations of methods can be tested. Installing the software is very easy. ``` git clone https://github.com/fxia22/HINGE.git git submodule init git submodule update ./utils/build.sh ``` Alternatively, you can use docker to build and use HINGE, see [this guide](https://github.com/HingeAssembler/HINGE/tree/master/docker) for more information. # Running In order to call the programs from anywhere, I suggest one export the directory of binary file to system environment, you can do that by using the script `setup.sh`. The parameters are initialised in `utils/nominal.ini`. The path to nominal.ini has to be specified to run the scripts. 
A demo run for assembling the ecoli genome is the following: ``` source utils/setup.sh mkdir data/ecoli cd data/ecoli # reads.fasta should be in data/ecoli fasta2DB ecoli reads.fasta DBsplit -x500 -s100 ecoli HPC.daligner -t5 ecoli | csh -v # alternatively, you can put output of HPC.daligner to a bash file and edit it to support rm ecoli.*.ecoli.* LAmerge ecoli.las ecoli.+([[:digit:]]).las rm ecoli.*.las # we only need ecoli.las DASqv -c100 ecoli ecoli.las # Run filter mkdir log hinge filter --db ecoli --las ecoli.las -x ecoli --config # Get maximal reads hinge maximal --db ecoli --las ecoli.las -x ecoli --config # Run layout hinge layout --db ecoli --las ecoli.las -x ecoli --config -o ecoli # Run postprocessing hinge clip ecoli.edges.hinges ecoli.hinge.list # get draft assembly hinge draft-path ecoli ecoli.G2.graphml hinge draft --db ecoli --las ecoli.las --prefix ecoli --config --out ecoli.draft # get consensus assembly hinge correct-head ecoli.draft.fasta ecoli.draft.pb.fasta draft_map.txt fasta2DB draft ecoli.draft.pb.fasta HPC.daligner ecoli draft | zsh -v hinge consensus draft ecoli draft.ecoli.las ecoli.consensus.fasta hinge gfa ecoli ecoli.consensus.fasta #results should be in ecoli_consensus.gfa ``` ## Analysis of Results ### showing ground truth on graph Some programs are for debugging and oberservation. For example, one can get the ground truth by mapping reads to reference and get `ecoli.ecoli.ref.las`. This `las` file can be parsed to json file for other programs to use. ``` run_mapping.py ecoli ecoli.ref ecoli.ecoli.ref.las 1-$ ``` In the prune step, if `ecoli.mapping.json` exists, the output `graphml` file will contain the information of ground truth. 
### drawing alignment graphs and mapping graphs Draw a read, for example 60947, and output figure to `sample` folder (need plus 1 as LAshow counts from 1): ``` draw2.py ecoli ecoli.las 60948 sample 100 ``` Draw pileup on draft assembly, given a region(start,end): ``` draw2_pileup_region.py 3600000 4500000 ``` # Results: For ecoli 160X dataset, after shortening reads to have a mean length of 3500 (with a variance of 1500), the graph is preserved. ![image](misc/ecoli_shortened.png) Results on the bacterial genomes of the [NCTC 3000](http://www.sanger.ac.uk/resources/downloads/bacteria/nctc/) project can be found at [web.stanford.edu/~gkamath/NCTC/report.html](https://web.stanford.edu/~gkamath/NCTC/report.html) HINGE-0.5.0/demo/000077500000000000000000000000001314415550300132735ustar00rootroot00000000000000HINGE-0.5.0/demo/NCTC9657_demo/000077500000000000000000000000001314415550300153615ustar00rootroot00000000000000HINGE-0.5.0/demo/NCTC9657_demo/run.sh000066400000000000000000000022451314415550300165240ustar00rootroot00000000000000hinge correct-head NCTC9657_reads.fasta reads.pb.fasta map.txt fasta2DB NCTC9657 reads.pb.fasta DBsplit NCTC9657 HPC.daligner NCTC9657 | bash -v rm NCTC9657.*.NCTC9657.*.las LAmerge NCTC9657.las NCTC9657.[0-9].las DASqv -c100 NCTC9657 NCTC9657.las mkdir log hinge filter --db NCTC9657 --las NCTC9657 --mlas -x NCTC9657 --config ../../utils/nominal.ini hinge maximal --db NCTC9657 --las NCTC9657 --mlas -x NCTC9657 --config ../../utils/nominal.ini hinge layout --db NCTC9657 --las NCTC9657.las -x NCTC9657 --config ../../utils/nominal.ini -o NCTC9657 hinge clip NCTC9657.edges.hinges NCTC9657.hinge.list demo hinge draft-path $PWD NCTC9657 NCTC9657demo.G2.graphml hinge draft --db NCTC9657 --las NCTC9657.las --prefix NCTC9657 --config ../../utils/nominal.ini --out NCTC9657.draft hinge correct-head NCTC9657.draft.fasta NCTC9657.draft.pb.fasta draft_map.txt fasta2DB draft NCTC9657.draft.pb.fasta HPC.daligner NCTC9657 draft | bash -v # rm 
draft.*.NCTC9657.*.las # LAmerge draft.NCTC9657.las draft.NCTC9657.*.las hinge consensus draft NCTC9657 draft.NCTC9657.las NCTC9657.consensus.fasta ../../utils/nominal.ini hinge gfa $PWD NCTC9657 NCTC9657.consensus.fasta HINGE-0.5.0/demo/ecoli_P4_demo/000077500000000000000000000000001314415550300157355ustar00rootroot00000000000000HINGE-0.5.0/demo/ecoli_P4_demo/run.sh000066400000000000000000000023101314415550300170710ustar00rootroot00000000000000wget -nv http://files.pacb.com/datasets/secondary-analysis/ecoli-k12-P4C2-20KSS/ecoliK12.tar.gz tar -zxf ecoliK12.tar.gz dextract -o ecoliK12/Analysis_Results/*.bax.h5 fasta2DB ecoli m130404_014004_sidney_c100506902550000001823076808221337_s1_p0.fasta DBsplit ecoli HPC.daligner ecoli | bash -v rm ecoli.*.ecoli.*.las LAmerge ecoli.las ecoli.[0-9].las DASqv -c100 ecoli ecoli.las mkdir log hinge filter --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini hinge maximal --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini hinge layout --db ecoli --las ecoli.las -x ecoli --config ../../utils/nominal.ini -o ecoli hinge clip ecoli.edges.hinges ecoli.hinge.list demo hinge draft-path $PWD ecoli ecolidemo.G2.graphml hinge draft --db ecoli --las ecoli.las --prefix ecoli --config ../../utils/nominal.ini --out ecoli.draft hinge correct-head ecoli.draft.fasta ecoli.draft.pb.fasta draft_map.txt fasta2DB draft ecoli.draft.pb.fasta HPC.daligner ecoli draft | bash -v #rm draft.*.ecoli.*.las #LAmerge draft.ecoli.las draft.ecoli.*.las hinge consensus draft ecoli draft.ecoli.las ecoli.consensus.fasta ../../utils/nominal.ini hinge gfa $PWD ecoli ecoli.consensus.fasta HINGE-0.5.0/demo/ecoli_demo/000077500000000000000000000000001314415550300153725ustar00rootroot00000000000000HINGE-0.5.0/demo/ecoli_demo/run.sh000066400000000000000000000022771314415550300165420ustar00rootroot00000000000000wget http://gembox.cbcb.umd.edu/mhap/raw/ecoli_p4_filtered.fastq.gz gunzip ecoli_p4_filtered.fastq.gz seqtk seq -a 
ecoli_p4_filtered.fastq > reads.fasta hinge correct-head reads.fasta reads.pb.fasta map.txt fasta2DB ecoli reads.pb.fasta DBsplit ecoli HPC.daligner ecoli | bash -v rm ecoli.*.ecoli.*.las LAmerge ecoli.las ecoli.[0-9].las DASqv -c100 ecoli ecoli.las mkdir log hinge filter --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini hinge maximal --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini hinge layout --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini -o ecoli hinge clip ecoli.edges.hinges ecoli.hinge.list demo hinge draft-path $PWD ecoli ecolidemo.G2.graphml hinge draft --db ecoli --las ecoli --mlas --prefix ecoli --config ../../utils/nominal.ini --out ecoli.draft hinge correct-head ecoli.draft.fasta ecoli.draft.pb.fasta draft_map.txt fasta2DB draft ecoli.draft.pb.fasta HPC.daligner ecoli draft | bash -v #rm draft.*.ecoli.*.las #LAmerge draft.ecoli.las draft.ecoli.*.las hinge consensus draft ecoli draft.ecoli.las ecoli.consensus.fasta ../../utils/nominal.ini hinge gfa $PWD ecoli ecoli.consensus.fasta HINGE-0.5.0/demo/ecoli_demo/run_norevcomp.sh000066400000000000000000000022741314415550300206270ustar00rootroot00000000000000wget http://gembox.cbcb.umd.edu/mhap/raw/ecoli_p4_filtered.fastq.gz gunzip ecoli_p4_filtered.fastq.gz seqtk seq -a ecoli_p4_filtered.fastq > reads.fasta correct_head.py reads.fasta reads.pb.fasta map.txt fasta2DB ecoli reads.pb.fasta DBsplit ecoli HPC.daligner ecoli | bash -v rm ecoli.*.ecoli.*.las LAmerge ecoli.las ecoli.[0-9].las DASqv -c100 ecoli ecoli.las mkdir -p log Reads_filter --db ecoli --las ecoli --mlas -x ecoli --config ~/AwesomeAssembler/utils/nominal.ini hinging --db ecoli --las ecoli.las -x ecoli --config ~/AwesomeAssembler/utils/nominal.ini -o ecoli pruning_and_clipping.py ecoli.edges.hinges ecoli.hinge.list demo get_draft_path.py $PWD ecoli ecolidemo.G2.graphml draft_assembly --db ecoli --las ecoli.las --prefix ecoli --confi ~/AwesomeAssembler/utils/nominal.ini --out 
ecoli.draft get_draft_path_norevcomp.py ecoli.draft.fasta ecoli.draft.norevcomp.fasta correct_head.py ecoli.draft.norevcomp.fasta ecoli.draft.pb.fasta draft_map.txt fasta2DB draft ecoli.draft.pb.fasta HPC.daligner ecoli draft | bash -v rm draft.*.ecoli.*.las LAmerge draft.ecoli.las draft.ecoli.*.las consensus draft ecoli draft.ecoli.las ecoli.consensus.fasta ~/AwesomeAssembler/utils/nominal.ini HINGE-0.5.0/demo/ecoli_nanopore/000077500000000000000000000000001314415550300162675ustar00rootroot00000000000000HINGE-0.5.0/demo/ecoli_nanopore/run.sh000066400000000000000000000024051314415550300174300ustar00rootroot00000000000000wget http://s3.climb.ac.uk/nanopore/R9_Ecoli_K12_MG1655_lambda_MinKNOW_0.51.1.62.all.fasta #gunzip ecoli_p4_filtered.fastq.gz #seqtk seq -a ecoli_p4_filtered.fastq > reads.fasta hinge correct-head R9_Ecoli_K12_MG1655_lambda_MinKNOW_0.51.1.62.all.fasta reads.pb.fasta map.txt fasta2DB ecoli reads.pb.fasta DBsplit ecoli HPC.daligner ecoli | bash -v rm ecoli.*.ecoli.*.las LAmerge ecoli.las ecoli.[0-9].las DASqv -c100 ecoli ecoli.las mkdir log hinge filter --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini hinge maximal --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini hinge layout --db ecoli --las ecoli.las -x ecoli --config ../../utils/nominal.ini -o ecoli hinge clip-nanopore ecoli.edges.hinges ecoli.hinge.list demo hinge draft-path $PWD ecoli ecolidemo.G2.graphml hinge draft --db ecoli --las ecoli.las --prefix ecoli --config ../../utils/nominal.ini --out ecoli.draft hinge correct-head ecoli.draft.fasta ecoli.draft.pb.fasta draft_map.txt fasta2DB draft ecoli.draft.pb.fasta HPC.daligner ecoli draft | bash -v #rm draft.*.ecoli.*.las #LAmerge draft.ecoli.las draft.ecoli.*.las hinge consensus draft ecoli draft.ecoli.las ecoli.consensus.fasta ../../utils/nominal.ini hinge gfa $PWD ecoli ecoli.consensus.fasta 
HINGE-0.5.0/demo/yeast_W303_demo/000077500000000000000000000000001314415550300161405ustar00rootroot00000000000000HINGE-0.5.0/demo/yeast_W303_demo/nominal.ini000066400000000000000000000007151314415550300203010ustar00rootroot00000000000000 [filter] length_threshold = 1000; quality_threshold = 0.23; n_iter = 3; // filter iteration aln_threshold = 1000; min_cov = 5; cut_off = 300; theta = 300; use_qv = true; [running] n_proc = 12; [draft] min_cov = 10; trim = 200; edge_safe = 100; tspace = 900; step = 50; [consensus] min_length = 4000; trim_end = 200; best_n = 1; quality_threshold = 0.23; [layout] hinge_slack = 1000 min_connected_component_size = 8 del_telomere = 1 aggressive_pruning = 1 HINGE-0.5.0/demo/yeast_W303_demo/run.sh000066400000000000000000000021561314415550300173040ustar00rootroot00000000000000wget -nc -i https://gist.githubusercontent.com/pb-jchin/6359919/raw/9c172c7ff7cbc0193ce89e715215ce912f3f30e6/gistfile1.txt dextract -o *.bax.h5 fasta2DB yeast m130605_000141_42207_c100515142550000001823076608221372_s1_p0.fasta DBsplit yeast HPC.daligner yeast | bash -v rm yeast.*.yeast.*.las LAmerge yeast.las yeast.[0-9].las DASqv -c100 yeast yeast.las mkdir log hinge filter --db yeast --las yeast --mlas -x yeast --config nominal.ini hinge maximal --db yeast --las yeast --mlas -x yeast --config nominal.ini hinge layout --db yeast --las yeast -x yeast --config nominal.ini -o yeast hinge clip yeast.edges.hinges yeast.hinge.list demo hinge draft-path $PWD yeast yeastdemo.G3.graphml hinge draft --db yeast --las yeast.las --prefix yeast --config nominal.ini --out yeast.draft hinge correct-head yeast.draft.fasta yeast.draft.pb.fasta draft_map.txt fasta2DB draft yeast.draft.pb.fasta HPC.daligner yeast draft | bash -v rm draft.*.yeast.*.las LAmerge draft.yeast.las draft.yeast.*.las hinge consensus draft yeast draft.yeast.las yeast.consensus.fasta nominal.ini hinge gfa $PWD yeast yeast.consensus.fasta 
HINGE-0.5.0/docker/000077500000000000000000000000001314415550300136165ustar00rootroot00000000000000HINGE-0.5.0/docker/README.md000066400000000000000000000003061314415550300150740ustar00rootroot00000000000000# Docker Image Build Guide This folder contains dockerfiles to build hinge for certain linux distributions. To use copy the dockerfile to root directory of the repository and run `docker build .` HINGE-0.5.0/docker/centos6/000077500000000000000000000000001314415550300151775ustar00rootroot00000000000000HINGE-0.5.0/docker/centos6/Dockerfile000066400000000000000000000020601314415550300171670ustar00rootroot00000000000000FROM centos:6 RUN rpm --import http://ftp.scientificlinux.org/linux/scientific/5x/x86_64/RPM-GPG-KEYs/RPM-GPG-KEY-cern RUN yum install wget -y RUN wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo RUN yum install devtoolset-2-gcc devtoolset-2-binutils -y RUN yum install devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran -y RUN source /opt/rh/devtoolset-2/enable ENV PATH=$PATH:/opt/rh/devtoolset-2/root/usr/bin/ RUN wget http://www.cmake.org/files/v3.2/cmake-3.2.2.tar.gz --no-check-certificate && tar xf cmake-3.2.2.tar.gz RUN cd cmake-3.2.2 && ./configure && make && make install RUN wget http://sourceforge.net/projects/boost/files/boost/1.55.0/boost_1_55_0.tar.gz --no-check-certificate RUN tar -xvzf boost_1_55_0.tar.gz WORKDIR /boost_1_55_0/ RUN ./bootstrap.sh --with-libraries=graph RUN ./b2 install RUN yum install zlib-devel -y RUN ln -s /opt/rh/devtoolset-2/root/usr/bin/gcc /usr/bin/gcc-4.8 RUN ln -s /opt/rh/devtoolset-2/root/usr/bin/g++ /usr/bin/g++-4.8 ADD . 
/hinge/ WORKDIR /hinge/ RUN ./utils/build.sh HINGE-0.5.0/docker/ubuntu12/000077500000000000000000000000001314415550300153035ustar00rootroot00000000000000HINGE-0.5.0/docker/ubuntu12/Dockerfile000066400000000000000000000012421314415550300172740ustar00rootroot00000000000000FROM ubuntu:12.04 RUN apt-get update RUN apt-get install zlibc zlib1g zlib1g-dev -y RUN apt-get install software-properties-common python-software-properties -y RUN apt-get install build-essential wget -y RUN apt-get install libboost-graph-dev -y RUN wget http://www.cmake.org/files/v3.2/cmake-3.2.2.tar.gz --no-check-certificate && tar xf cmake-3.2.2.tar.gz RUN cd cmake-3.2.2 && ./configure && make && make install RUN cmake --version RUN add-apt-repository ppa:ubuntu-toolchain-r/test -y RUN apt-get update; apt-get install gcc-4.8 g++-4.8 -y RUN gcc-4.8 --version RUN which gcc-4.8 ADD . /hinge/ WORKDIR /hinge/ RUN ./utils/build.sh HINGE-0.5.0/docker/ubuntu14/000077500000000000000000000000001314415550300153055ustar00rootroot00000000000000HINGE-0.5.0/docker/ubuntu14/Dockerfile000066400000000000000000000012061314415550300172760ustar00rootroot00000000000000FROM ubuntu:14.04 RUN apt-get update RUN apt-get install zlibc zlib1g zlib1g-dev -y RUN apt-get install software-properties-common -y RUN apt-get install build-essential wget -y RUN apt-get install libboost-graph-dev -y RUN wget http://www.cmake.org/files/v3.2/cmake-3.2.2.tar.gz --no-check-certificate && tar xf cmake-3.2.2.tar.gz RUN cd cmake-3.2.2 && ./configure && make && make install RUN cmake --version RUN add-apt-repository ppa:ubuntu-toolchain-r/test -y RUN apt-get update; apt-get install gcc-4.8 g++-4.8 -y RUN gcc-4.8 --version RUN which gcc-4.8 ADD . /hinge/ WORKDIR /hinge/ RUN ./utils/build.sh HINGE-0.5.0/licence.txt000066400000000000000000000051671314415550300145230ustar00rootroot00000000000000Copyright (c) 2016, Govinda Kamath, Fei Xia, Ilan Shomorony, Thomas Courtade and David Tse. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: · Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. · Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. · The name of HINGE may not be used to endorse or promote products derived from this software without specific prior written permission. · This project uses open source code from Dr. Eugene W. Myers, Jason Chin and Heng Li, their code is protected by their licenses which are explicitly contained in the source files. Redistributions of source code and binary must also reproduce their copyright notice. THIS SOFTWARE IS PROVIDED BY Govinda Kamath, Fei Xia, Ilan Shomorony, Thomas Courtade and David Tse ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Govinda Kamath, Fei Xia, Ilan Shomorony, Thomas Courtade, and David Tse BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. For any issues regarding this software and its use, contact Fei Xia at xf1280@gmail.com. 
HINGE-0.5.0/misc/000077500000000000000000000000001314415550300133025ustar00rootroot00000000000000HINGE-0.5.0/misc/Falcon_ecoli_shortened.png000066400000000000000000004372201314415550300204500ustar00rootroot00000000000000PNG  IHDR? iCCPICC ProfileHXSBB D@JM^t*vdQ"+"*EE"Ƃ.lI躯||˙3g3w , U /a&&%3IbE ' Fwa4+\@ r9k|o0+_ ᷐UP De%T[Kcb}!@Xt$tGAٚ!o`q !Oə Y 4<˙:JcXF Ysߖ-Cj0(Z2f8ofJjGU q! g9 PaBs2DYq#lJx4;©™#~vDHeQnjƤ!Õ*̈MD; x weń_1#EK4B~& `9y¬,i_ꐽ3bdmDn^bب_pq#0|GږFnv`ly1m/&a&+$J{'ȏiq|`,`&~KVX@XxF[$Hk o ˘Wvii,r{nxzb;.혊~ blLΆEx \8:#oϝ/y!|g yLoE2l L[kG$w CFrp)o>G@g^8 d>\r! P@S8&[܀! Ă$0zȁgy`1(e`5X{ p\F/x;0  !tDE qF<$ F$#"d)Cʑ*dRAN C^#P 6jNDQo4Eh.Z+J݃6' 5T@1c LĜ1_,K0!+*Zl Lcq"NǙ%\Axs ߅7>%Z +!HH'"*; KxG$D|7ĹFb;8H"4H$wR$E'6.zId]-9L擋c!9%9#9WH9Ur;Z. Q)&wJ,%RIG9EKy#///"?Y'HR~Y*Ts/u*UD]ISoQh4c-O[Iݧ}P+X)+p*T+4)\Vx(h8]PB%~%9%c%_%j#J76ʑ9+w+S~BR1VWlW9 t6} }WjZWKu@ME^-^mZQ51c3ٌUOy[>n߸ޫWR窗7_SXѬqO4ל9KFfxnK[ 2׊֚]֠v@{I~N:c:}t]]:ϙjLof6 m7я/oԿg@1p6H3Xga0`kn8ϰ𶑜Q3FM7?3Q7 6)4i0kJ3455jF4s62lm;gW_@--x-z&&LOpÒjmY``afUdlrk&:z"V׶lj۫v4v-v-57K:8:9 99:8mr eKGWG|Yev{6dwҎIYLbO=OgC//Nfޙ{_X}uuWHhtD Zt#X;\<2?3Z0̊upls,0\܎yz{0{Ȃ /]hbſY]XxQj(Q(Xt2|oYr-唞/.(6?W<2me*U5\\MJ׽]?c -(Dĕa- 7*ZOu&M7|ƫf-e[>mm-p[SqmvOv8/;5wRǯYT_[kAзg~{[Yh,?5Btt>ѡMK9M▤#!G:ZZf[]^[QQ>^x|]"ģwN&9T詳N<}Ym\9|ㅦ.ǮKNZ][{&y+_p-Z7oL!ɹVW nYtpҽZk0Q(>Ň1̧OOf}&}bk׻9% `iiGQΌReg4Kq{X S]X4;[Y.*<>  /;[}#=V+ u?ڿjmv>B* pHYs%%IR$iTXtXML:com.adobe.xmp 1310 1262 %iDOTw(wwnWo@IDATx ¸-ȒYdd}ilcdiK~TD @ @&𸠦 @ @< @ @,. 8  @ @x @ @X\@qqR @ @  @ @ 2$@ @ @@Q @ @ @`qIeH @ @>@ @ @ʐ @ @G} @ @'! @ @ @ @ <.N*C @ @ @ @x\T @ @< @ @,. 8  @ @x @ @X\@qqR @ @  @ @ 2$@ @ @@Q @ @ @`qIeH @ @>@ @ @ʐ @ @G} @ @'! @ @ @ @ <.N*C @ @ @ @x\T @ @< @ @,. 8  @ @x @ @X\@qqR @ @  @ @ 2$@ @ @@Q @ @ @`qIeH @ @>@ @ @ʐ @ @G} @ @'! @ @ @ @ <.N*C @ @ @ @x\T @ @< @ @,. 8  @ @x @ @X\@qqR @ @  @ @ 2$@ @ @@Q @ @ @`qIeH @ @>@ @ @ʐ @ @G} @ @'! @ @ @ @ <.N*C @ @ @ @x\T @ @< @ @,. 8  @ @x @ @X\@qqR @ @  @ @ 2$@ @ @@Q @ @ @`qIeH @ @>@ @ @ʐ @ @G} @ @'! @ @ @ @ <.N*C @ @ @ @x\T @ @< @ @,. 
8  @ @x @ @X\@qqR @ @  @ @ 2$@ @f? >R@#]  @ @UC 6u%L |/O.<'  @&sKq҆RzpL/ @ p85 @[ -eGK <>o @ \Zq҆Rrj!@iOӔ*B2` g3: \*3 z#XZ@qiQ @G/UU O1T @l/ 𸽹-  6&%oN3Hz].]< @!?m>$ @ <.F)#.O3mʀ^JpAxL_k g  @x\T|@ &ؘHL r.oSy,'  @`5heL2]3 : <6- ?bٚc]s;&0@u1xVR @glUu"@`5@bdj[Hݸ[o tRM0%@# 𸎫\ 8@6p .v{=uWA7-y3ʱ @e'b&ic<uF3 <2@~z=ZE @ d%M h/g`ݿ_<[d6v7ey_,|iܸy @) xvU+^ A %Lkp0A fYK[Iyd>3M4:z63:糭gr @x]H`*A-k.7h aye>)ip0Hlys 5䑀bMFR]}}YaG:(I @C@ 0$믿^n H o/6$Еmem58n(&P `3SS^sYώtNY7ˋe-__w{֫AԸ(ywyc%nv.KA,| f)_ܒڎY'ˋZLkǬB @+˞H *Aן n%WM|U6tLl]xli,O?c}O?|>~If>tNn۲ @׳3v̴)~R)>rv]ujw\7y=3M4 }˲D @c9YVHЧw9&w] ;Ocq|VU[ln1=?GLHllߙ[D @/ 𸾱- @ $HjP$JY%]q|j?J?_,uu}?S @ <~Z/&f ApƔuP?iy?˯S>;.g& 㵚 ?w:~.}/#NZl\ޗ @]@-~P` ,6p@Lk`1]ӟ;^FkSK?/hjLLU @M@l->X LlQ.k1:q]h_}v5#e @P@񀍦8@iПf>A]^6y/*bzL$do}gL) @ @$ $%`x=e 4u|t؅I߻o'ؿMz]G@ @6" @  DAc "&H;R7ؘgw#xGCo| X.egL  @l- 𸵸 @Atfj o~DR)A#mw\h[=NuRMYe]ǔ @Zkʗ $ [)Kzcϑ2YIiO rU6 @~JO' (G >6M#Ǔs 飁GiO @xq&R0*xt7䛸>NxB@ , @,giI @11V ?.|#cdI=^^^ Oy?_ (f  @xWyYlOFsxRltL'&I @OxԖWo|c1n48"WKwA D6[]5 @HGj-e%@6A)r ߃cT򮛿yo1/ @< Ν zCt"@t B~w'd*@ckwFC @xfWi,26I` @&8pU @gx^@&O @`~n=C*G@qPJGwwOhw7 _I/ϭ_ p> @`3?یچ @@/ Is^@q&R@+;[޹3fޚ)>[̗RԞT@ @IOꞦB Ȥc~l)R'! @xVS,. 8  0@MO?9+9S @3?՚ ;QʈS $cRIԅ %  @[~R @^2_J3lZ&@ <.)3|@%H@"@Lǟ`70 ) @+ mXD Sw,)N Ȧ $[2 @3?՚ 'uZtC2#@izX@wÏ  @ <~^1f'!+П`NH6 @&[^ :0 gףy jIUUyeN 믗cxfTp t>Hȝ-\31%@ܐZ<'>;5_$A?[z%#@gne#@5 @`#?d~vn92_.zj_o$@ <~zP,$ l p2Wc@NVSd a3{y,%Әgۤv=ܾ @\jO!@n9]^[0ϭoA ,u򺷬¼qL_Nn:~cLy~@dpy\DS&8#ŮZ'(vdyW_xZ-e䵌ln'@b$Ge.#8@m? VslrNi=]S (&. &jeں]9u|V+ mXT\.a @ h%Y9cʝє2瑴݊B}rS bU= YjK2`̝1˘x& 6U\kq{)K2#i2]6r~9_Fc @syY`~8@;gޯUz1NOK|xTq=ôhyv;[JgѲ.wA Hp% n a] 2&Sh>Z'ɣ^x[Ҕ <>o pG0Tb(_Z_b*o> ffyRϣIO_Bdk3, _0ƾO jMU Wa)H"Lo7XKo׸`Q^*O|lMtXv_nwls<ՂSnXNQ) @ 4(vU|yLOǟO 2>ix|k% xT* # *vqN+/b,ď@cJ糞4@$Mnv#%!@xk3%# ^YbVx\XFYLA_s;sנU߿5:s֭yYڤmvc;TSޥ&=s~xIEA.Fs" @) C rxQ/IxE|y$|內iz}#'ҕKJ4KʏOx>Y]{jig%L ' ]/YzlLy>z3CZl',~ӶMA >m_~r!I&  +j>3@J.\rѲօeQ\ %~iց-X^Y@qQ?ray-3I9C)~ܵ>7"РF3K~@cv}ecP&dJ[o2l]癷3;g7 ڶ@. s1 ^43DV” $0{W>յd@^1+L%3=훹|kr]-6-ߣ#i블q4|uY/=~mv~tm @O? 
yw (ֹH{3_bՑBX>* lȱ*i#,}4i (\x"@)JjiϹy?7GtM?}Gq|9[gi#$M}q'zq|'1v 8f?RYa#//;`m&| lT,!pSͱG]cfx|q6{,W6c;,z'@ت'S.4M#\IN/.` S4ؘI|qk $LU`8z'@Z'x=M#iQ_./x>O=].߆r$@3?U\"#\䑴xV\]7?F ܇hYI2+Ǯӌ)hcos,ϼ#GvԺmˌU|\ƪM)s>H@U]vyZ %>_0 4@Ae99解sqJ4cKI:0{w1ָ5w>k3ԷuTW.;P56-j|rܿƌbSWk,zލEԠ/^*vIRj,@M?KggV&`n٫u>J(-_9r2G&+ԇhDJq?AaM5 ܙ 7R/bee4eHy4ڏq#̧u9B-د'} \w?~ r\@w/ (,:O`Yu7Af_V 7 E`G?3MB3UkxkwO/X @{  lBz z,nX Q\k 8{Nn`kr 5vXl_i;ru1>yo>庢Yu{1@WC96?fm-<# uWCNI|cy*.^? M-Nlꒀw Ri1G2c1oz}i5KG;Ȁx9,ػM.2 (}0?@m @%}`KG3mA6K9+AKᯞ➀cG랶;ԧۜIiqSK~<әOY+ <+:`#sR}O:YϹY6:s9sI^#GWY?=z4Ș@C?g쁄U O˧_[6]gg=v<D @g <~vOQ\\!V@xt3E?kL/}{ Upm:Te_i(uqzyS wwE[+` j;rNʣ˿+y@NIE Ņcڧ^6x du]4w}ͧ?9;N|&}uu|\m's 0v~\ %{OMuero1/a4oG,!o>|XgZR_O9ffYy$ddxJ[sxީ[Lh|jk<<|3q>Ŭǚ-:O#e2v꾑3/,Hz9Wk @JYoq^@up"8-bW-N| GNӤU>~ 68iRi=X9VS~gO}RWݜ#&?Ӝ﯃Yǭ%`@ڰ|~}m|˿t+Q ɑ <+or. , vM DQ?8@MǸ2S@/z!-U?פ sA6#^ZR }$l+tkg|qYS :3MS[׾Į8ժ.KƓ=9EL-Ӵeꝺܝroii|ҿگZw#@{6sbOEąc2Pɴ  <ޒl&Ӗ:M:˹9$v|^')iҸ^uy\#ST( †OievHzhWm+>e#c_Xqoeݾo1p' sJ}snx'x;fq)V{)=Fʴm:mח >^ 67Y8U{iR@Y ? <$r!Q.3e@ڭ_Ҏ@M"K 4r>Y*h<xԔ2]i9ct%Kẇ<Ln}i+y?_ޫOSV >LGΚ:-^l- m-(ed^i7=\'-и-/Pj^e@Mϙ߶.\:w,y^ξoo+-3mk;'wBGoR/w9oי?~=1t\};/ @xM^9[ޙf}K[ &+t,{T 1ؒq`h>Kt| 4u@^Χy3 zJϧ͒zWd_}!ϤgK(뵏51g:_o%3(xBCy/sBaMr{ 2/ʺܔ(@IJ/'cNRK ;Ͷr<g{Kí[ox~no1}`V9_[_A+_R} 罤L= @C?f.zx =y?L(^ BQ듎LSq_리u^ @[Y s_'6oՅrakag$qF~j&eIr~K-{1:mi}t3uj9_H 5Ҿo֎X[hiQ ("Ǭ1z*fxNX\$\vzBUvԆ*ޕ*2/ dO6Z[< ;WE˶6㽖yگ}y?=S˗r,ѷ~*)rͰ/(?' ymzsz&JN./,^]o*hk\Z4ݯ,Kylo2-16TKShNWۇ3M MRN/[?~bm?2헤sK=Tu ?Y[H 9)lgi9M xO_:ܺ콜}c"M,kwc :5^78t9'3f\uMgi^__VZpʲ}٭'g)cW)3*7Kgm]|'uΉ'ebeM ЋD̞T @ w~z ܼt-K͘_/|y?_K5lrLcXS]ߗnI-/DxІ_9dԓk],# rieM.;4u g5H*['[ܣ>0}o|u'|V^ضZPҫǫԩ:m"@-[*,osaq/6|P>Hfb~ .:]+uO *JyޛedKfqr?+M_Ǿw^k>[On˽v)˞狔AwW bJ@q+NAIzAc<"R6=[?'e:`k}Y6GsX+/]"SG7d}%)MmK>rn?Lŭ[8ZlD"v Jh1Pjյt6|ǭwY/ }=5#3k>m; oS׬xD;)}1u#yy}>Knl?'{Ŭr,Ⱦ@b:Vrw9i9ޝȾ(&uzj 'g~LoYȾ9~\[) c7/>\ |kL2͹%){)O,wk(lZ=;9GZSGmsZN9PmZ8áL^RǓzN]OǾz[Br)X8 ]i[سn;ǒJcɔO <> f d1s_,!pv^|v` 4gO;Z :3ۘoq>%]ٯNk=~e>)ث<xZTc}&$Ǫ%oY. 
@;7r7|I6'Oo @ 9{EMAWVg=V<ѺffۜRt^Yg8v$}}} wЂ%sMrIJ8N'cU dړkN䝋ϑSS1!\K dջ<:j۴/2%HeXrL={~ڪ3k5 ,)~k_ٯ2#eݿdʲk2{9]ӞϮ?>|kn=Q&@ۛjg9. yyvpyRԸA H$O=9iRߛoYۥ:g{gcy\I縲Z9h_Ogiʚy vmG0ؔgzN\nkS|'u\: vA`Y8@I4\ z3L; u޻upg9U@w<'wA-kc@y )zY7NJLgNVM?}vq8vYcg< L  R@N9-'> 0сXaL5u~W˲Դ9LL93e¤[/}=~hk[)M}yrI[fz:0Sn绽y ~ue?1"<^~2MJ^Iͯ/ =M)Q /}Ҧik[Ӕ+H(Cv]9ey%u\s_v۪.9˶vHk[ w<~]=H 䛺w(#.?T6դ[35 d;v.;OROc{7͒wz}ezMkݪ0vdbfK_[zuEcT2K#)}2']vycXj#p!@` )g2XJI-X\Lߒ :@koI֒xvf6澑s}0k۷\)Ś>j i}A2sg_Zx$iwoƁq/ 8@.?dB-kBg ̥n9_ӧIZc&X;}9nW>>ǿôISιmqyb>9^(< prǓ7R 0u^>!@`+dүk9~fqcjt߇k 2ӟu0E@D)vGp5_ׂ=N䭵= R͞Gol% ӓ7@ /7koM{ #O,9l,=lkArZ>@9yjjml_NnʓiRsyiôCRylqܼl [^@M~rBE[3ׯ- @X[MWʷgbL5<'N?<&I\;e[ٖ/>O VI e[/l}^kL}?eY3U}#m0,}ӯZ p6dz/](-+;'+&=Z1uq)βL2LX+34J1)Bu$оYYm^fvzҶmoq|)[ƣex\K$B1'OR5 @`a >[d9sSslO )fB)}f؆knGAǵϙ*yc̒58ygy)KʗiZ=ȱ,}"Dz3.7 `z/?ՕL}Rృ0X耪e4mn\ӬGRls}ﲂw >=huAǧ~~/i~icVPg?eEԏ\ogOeM.C&xf-B-\/i@eC;<8gҴe4}7ye[O]N!cڠǷg[KGe9qd:dhq{cX;eKǬyz\KH. 8׷| 0~e* @i9~ .8>ƷeH{d>[N^gu3]gz+yg4g{ 6Ϊ߭{)_r[?@xXc1i8:{yg|lusk>dmɟ[ #U H'}~({"@w;{zgD5ezNn/+.<<3|r^K'E^sg;W_|O1mVjҮokmK%G}y=5t˛鸹M+=}pc\6[i[!@gm7ꕋ fg<Q-%@`e](wPA`3sl`:8o,{ʟ]^&en.iQخiJ>[}8. i|ۇ{ɲSʗrs[?_2TǾT!@^{Ofg< O̦h>^`ӑ`ZLr߁ H2]-SHy뼿5eIjpe˲&萔OJigz5@/]G/cY[\wfq1Y} g);}o6 [Oik$@`_}zNr< '?fxꁧ?cKʗ]1 .oI={ u{S꘺dYe#׵فeKSa1][ tRu=/v Yӗjm ^m76흋ɪ8l$07ۛ4:HRΤqLGT7AwϺ=GeYSGo}e4=@<hIW. Oxq$>)EY{,L{]ΔqɿvyOkk~ݿ;xC ?rF 82-k<2OO]罼δۮyn7ص5.[bm%!fugfGҫ}<$ly^2>SʝkRrzݻs 3mQfnke#@`}ܧj.dr -|p/D9:@ܪb)cR9T|)cX/o|="m 2ncpq쳵:5oYvwfX{|J>|쿮חmgv> ED෯n׉^hk0ePiw35eJ9良mu]kv=z <-KvԲ}7M~uo|G` wOejo룧ԡǵwltc9_*R!@8x:|z:·|FO3:w~ .v%ef@.LVhLh:u577wCszu=S3)Ǹq{+X+ez3QMz_?ƿJ9l XO81R%}W ߐ1]oO3O?)Oo=]lkxyS:Ǧ 1r##'q^wLVkvH^f=NǸy3 CϖE3ԧNJ x9{_I9_W^pS:uN&~o{Wu[Rܝc>9n'ezI۪<=̧]Ne>3rݶ? 
26%/y,)-z}{?< d?o>lr~^+ͧmcm+,# 㴹r|3HdfT0 xǑցxu`K 8R?J>iäg]lvYͲi%Θw>fɲq_?JYt%~>wks|c:&8DQίmO{Lgo_* iɾ1) t8K^%8cݻgкg9汦;imk0흶O||Y]23ͺ}>K>N^L8@{I\6#SZG ^k뇄eRV_0mnS% 1>^hu,?1ZH) _ ǖk./6z@!jmv{s[ /=uH0/o.nй,ȱ;v}X]%/?k?u VX3g9 b#H||rI1b- π$iOOR.˾rNƺ'+7z~y6Ȓ&O= :/4Hl?[ ն>zS@DAwӕs7= @q@#=lUx%Z˸g}P\ޚ}|KW^^1RaS2us 닣k]='gmxoswrq`EN>O%Ӥ, <^X<XM ZL?f>S>܋˾s[RW?vvO@w=fFisHr'Z+WLgsZ5˾W+9Ͼ*sv]g 6 <~s:X%Uc*sʹm) d~؜sDi%}'wIooҒm;'//&yx!V.FeOVaOl3lROGuEZaݎ)^x|EmtporPydI'/}`E}2'2sW~IY/)3Ǟwqkm?cNyFKm_eeF<@ss~vȹ$)?)xW>ɩum?xg~1s<-I ,Ӌk:\瑴T`qclTt,Uc11m\|R)ʼT?%9?{wק8g XAu<"cIu<>Jv |]J'DyU.~Rzͼ' @q|Y>Z2e)]2pO jO6I76G5ruOx:-N|㕔b#frAD }*Rw]2~|O6k=X?},pcY>eOS3GS{$~f֩;Vfmʕ}`z\VN~$~YYoOe_g[zG/동~}|6Q~EĽ pȣBz`sb U#@/t/g|K=:?)ǂR7˘>{__?8bUgc]xTÜtk-۟~W!-Xε͘ukq#@;t^2V#D@HrhӼweEvz,ͱ)O9n[ǵϿZ[ۚuYgm}ViOo{=&漑#Ϻ-Wup/[0G=|@{7nl' [EX.ri 4",dY j]t#m ,v}Dzߏ:j3M.N{~6m{o?{I߳ܶM`o}w|9%/z ?g9| | \ |$/ak`vlf`2=aU_d'#Fu_N?%n'[Ӳd٧|E nc?[}Ǻ'@ ݇WN߭NY٘tpz,U#!m* @`O=Wv.A}]H.zW^Kr~:vd<IY/ǧ[emG~dgX'd>Zg\?o}ڧ^^OOqvߖNsַ&ϷzQ;@q$n-^'DW֦O_6/Wdf%;[W#`"Dz6&d6I@&Tk W4p?{x՗Qcjء4ӍGt֨d@@Ar}jB+g̏z￧5[_K/ற#%[5MVd+&7%iwS}/6~ͣ L:ڪs(|.N+F+Ebd9Ĕ֟O3^\cWݵZÁ3~?>Y2oY# WLcbgFB~ NdOl{z"{;A@Z?CpJ.8LF#*N{}uĚ5)y542#̲z˺^]-!؊YԳ"_Ph*rwh!E(.&/へ s%v0p>LEfGҟ8˕Zp]e}zg{†k u@|ԹoI8l:]iT]:6UFeVFoGfѮFZLu2n{E@W }K hQg?abP> i ?wѶדxy")8xIqC@* UZ?!~˾sSm=Avo5F`F8U ~e`C.":E i,7Db>'c QHf4@ ߇>(V=& T?F×sVx9|Zk =iB]#^-q!=g@" ܬAtͱ^&x/y8B)6x7s_ajF1:JvxF.ğOaZ 0P9R9D$[o&=n:rMy(AMiޢ *l}+oHB2#@ܰ)zb8s՚3zefٖC` H@l .+!oٕtEWoot93,z_핾W籶51!׺.G0VJ͇5%FFd@mӱ&F H$?5#?IXHXhGq7#PztB*HO\sgǻTsXquDiK!wD5,?UӸȎ[D{kΨ 9`şz7a,6>IQ0 _ s7;Ʊۙo }%gc9.~lx̴DeW\ zccT V#p3qMYB.Ս#5`F![XUlv[cm&r"D%NZ0omI q98QfÈvh@џ,zMdͿg#i ֢Hث\J0WͫHˆ&#` |kͶb-rڄ$A4Ǔ|2Ɛ1@ @[4x ntxSZzyCy&^3FM- hUK|n9o#Rj%g5HPc"oݑ,I*dJjAdpla ϩؠFfLrʗoPs$S6$m6.xHJɷ; .> X◘qZ~)y04ƣoWeq`VLK!% mc[Pi[͜C2@m;`8)尥rڷ),3lku/1:qgz]_rY -j1%=g!&#`#(QZd'dj#aCpm=5ؐ\R>T{=s[.fCwZ?}=ET?Ɩȟ:qg5C*kAU (q9=ȒwF#;=x7+1= S`D@Nk|Oe]9P[#{k`#㧜EjCgRm0{}o:Yv+:IᙅkS%ikJ7o0#`>!, =-|pac.>@-*&?OhKWNwNkN9uEmܳdJwusȎٵ+YF.\3f>şԧIEj|ң[ 3opX#P 
o<&"$S"qs@M?eMz{TWrQWs%_|{(09V,97505.=X!V;)k~}\¥]V kp!!gg@-BB=<)9\$ywib -᯿m Xw6s.sB yKȎ3?cǑuguR<#ʿ[LU|]Gm?(V~f@ ks #!nw9U#0x`+-T$𿚚:¯PweyR8AlA5l￯ 6d96[v!?p֊7U4uت~L)'vwjy/9N9z751lj{@K_ }8\ @t##\+W}|o9@a.-?7`ƚ{UMzECj9g^OXW@r@W9~9QGrn %q2us}|@_֛X_Xנ5hU ΌpQz3M |uUAe>Ѓا͔Mdg F7v(P{] ;pI9ƳCg8$bR~skcߘasi3GӏL~$wyYRב;Yg?~[+ yqꡃ>aR$(+{c! {ID_Ʌ82wLY؍7ոϏ%|`+瑓l _|-鬱 8鉟@\xłk79 EIҪka\!a-- V"k 977})z[5/~do]X!-=Ä{\ON:8JLxřڣ\7}G8X☵'[O6ڿ |- ^){XlsOd!d;#SX_˃Qa!TucsE?6eb\9}azW:+~9x~rVʭǨ*w>vaUogK95փɥ,*(vLeʹ=gSެ{:H7/xNtU#}bƟ͢HC@6ڣz@Or]zС J|)s2cc*ܔsC4s,4>{"'LQJi{7HO;5K_;?ٽ ?^99e aLȗ8ukGJ/7Q_1Ok]Oֻ',7k#~O ̢F[OsuTޣ,*7ŭD*Ir"d]8/m.ķlg >OC ]gZǸ$#}姟݋̜/<ϧa}w5f/1i(JT]l+ЮB`WJopϱb#|JyI>Z+fbfQ3zr!OZ0Dox;=a-2J WފR91>#1O) w-|(s!Gy0r-&?F9EH1PtVym?Y6|F)x|X*\aɃڨLGTz3d"|emQ F6ؿP*/}Zyr6'ϳ^o%y_zՑvJ~%>߭9t.!w<[=cP.d(9/^{6hcm HAm½^lՌh@ LѪ>ISMI}sZmJ97C[{!l*(@cdtJsz%?ØaE؇yfV|7RpePg=9ujWճg9!3rtL=n_5gP}~8[HV^yF`+L *|%B_PoFY\}4f%JMtiF9oE2>!b4Xi\Y?:b}_3(*5J. :x2o ҷiM7pAjǣ=-Kg|9B7ܩLfL" ?%>2[ fHk:~=H|{Iv| ?|9W+1Vo0że1גy35ɣx'786>M,+F3y5Oh50kCLs#XbF5#Ž$NzH\G,-zp ψ?~Tj8$C?m.'œJ}- 7~J?8iAkOx1gzҏ:ν_C |ヨu@8"n\cGZg1LN#z<]B>%㏜=x'x2J*9{j)PɃ1B\!眖ҵCz㱍W$#ǼO͑_> _#m6G! 
Նs ״٧K8x~\C_g=֬ y 1g~7װbێF'ؚ.Oz9z?e]ы--*!=;O-f{;l-lqwh -N~/rcRnU{9Q/ݧ꣖qI:y/#e55[]oEǖ8ߒUl#Fq_}sݧUq/z#A$uU$~~5QQE93.| +lxWs3۴-ϤHF9V#u]-ɹGRޥYWu?wK\Acyѱwȍw)~z2PkN;w]aP֓eOFmSk{圧/QFs~ ,t@]Ε ۛ[2]uvGm7\V :Ϯ?}Z\^,/xx0oĐ]G5z-|뽒|J2Q^}8@_t!u=פV6BO§&%Lk,AA(@D '>5i1W9=gK<@*~q[k߈ǶߎКD?|߆[:{15Zx~K^XxFƶX:RְߊcJٹ7sh% b\oXsnE.dR=WN8@IW}~҆km7_Qgdd%[|Ⱦh梧WB~G'Ws7cbTzzQ>1vħ=6Yc%K*y`D+i^"p,Py =ca!_۾tDCۢoW413}q|nȁ(3V =\!fLRZms35IR|Yz _>9{>kJ T⤘+OȊ ^ sɠ{a>Ć W}룞jB<#/@@vxoC$8>B5bVՖ9>%pSQẠğ!U ƣf9c厔>#0*C>$F1uHfv>{QϺ!*vu߭XoXϰ6Ϲx<^[p&5o L䬚x61Zp4 ˫UOhiR75Y*bxS[#w֦\zԖ;v>G3zLEVA,%xZ kÄ~y%'2P(A1GG& ,~1^|V_}3kL2>- c_}Vh-0Jes$kWIm<i?0"b6;oZЌo{.kxI?T-Kϳ+Qnxw<b#q@KX E(;vCK,ƂO9B =r~n|fc>ڰ)|3vΕb }8?%?^xg"%yHJO-"v%-MIU˱l\3v8bd9w\7]#Z+WSf,5"nϵ ~@]7c+F`z8>)vG=!4=YDmP@Yć1k|>ާ7'q9 OE ;6eOR@?ί(>O+_䞮_j,UZჯ0wT2>8C(.'6 )6WkQuG8 d u\sbs0`R,ۯfw ۳>D}OZ#7SP#08*vW+\:g6Ra!^a?]i>P9vOK`0Caf 560Եc?p6x?i5pqn(r?GWq^Ӕ9<֎z+e#<\(ab[Aq&J~c' UqE1aP>Oe#&F>*^{'fEl,@Zݿk/Zx~WyRXxO}g}ϮcZ 7i+YcBZop=ŭV-.O[ok@yK|fnĄ|guW欯Snҍ;|zI[z'/7!,j}@D,,9 BCU1K!DLJ > ?3x as=3 ήD :'wO|rC%'/٪s!Twa+?=$,[兕pϭH{U sj# {ONǝCdυ9_/'Eǎ+\z_i37k@'Bq K #mr" P~daywl'5}=#>}>kùv y#FAO0A(#JYٚ1V!r !ȇ`Gpwպu/6U䗘‰تȋfN@D7 Rk%Ok9<m2#[߶@%4ϫVPHV>;bc@<]WogȑkŜ*GMZSr~p`87wv~XŘs?Os9gW)Qh!3-0vsDyoreYe/X恶EVu?7V<9fU FG=%|׷=!qi[ ]/[X]/.v}G@~pMy;W_6[#UF0%^{]kqlOHv~"3M{ 6UŚ>#P Z0 -Qlo\3V=~jv<9$_|?@?Q-՚<)>l_{ʳg\ݻ4؉XDZ/b~xɧ~ݓ%a'Q-|zm0NoK_I*F`㖱LF`EX!HYiVPLf;k[`g󷾦oܑt> 6o 8A0jش'{ ylQ.o*|oG譺p c|?١$E}FȌO@=|&co.#p*;[XZ=k˦K+a.?wޔ8!C/US3} r 6ӯ>ktg+ٕ~w}tRҚ~"ڌmֲ6C<0PbRm=zC/,EY$ 'JMXE$LV< ̢{ Qodj_Mnvl O}Cq%O'H|(V{D+5dvl v$ry*eEjׯnX)d8Zpra{z(t38<ΟuD|+%$i{u F[鸒;闃>5*cew;oq䶝!eX>ʶ.wQՂsccD[Q?7H !鹕7RZ/%6Fø;o@`O8gG?3єVS؆㊔SR늧j3^W檚`kΕ5-n.lۻ dS<1];Kt7Jƻ?W:y>H5՜酏?7%Cۜ]+1y@k|XUXIcZO# w*TrEweB;ϢWmj\oPLBS Foò~fM:$%x-7's!fOף3^7غg?=K7i [#`vR*G/|0CטB߅ur8_v㟥tKiJ_’-Cm>LspWNzxꉮ{dDSߛ?̡0{e#p@ nGz {R08b \9F$U(S2_jW霂רy"Z[؄88?Ks?"?8;U0~%ˉd=<{6@+/!4c>, @SPY)`g"|a$kc6+ *{:2)9aE&Õ=J֬\zY rz:ӻ^91Z7W5]4A`OE)0#@ |V"U7X{{(6笙Z7$(seԘ=5pҁ^`+jW UҦʪY$g2F`Q]mg{[kGx:$w֨+NBr XL5o_T1tN-8K<5f[i=39ZFFx癿|#@RCm/%0`dl 
_,Qۊ-UzO|BJ]>:zT6/&CY)G1wdcH^Jgq>> wrj<=r |9F$V#d%'|̹eEmU3Л[Q< |;țۂKevopJ7+ۘB孏m9Syl%_8G@<Z^yfPl1K`aF@kʋ5.r O>I='}rȗ Or9Yu>p`/Df Ypޒ >ʃ%)Ī<ُٗDEj~z zd<97ް-;XA{_}b~}n_pW3gXR!s$߶eu"g.~D׽^}`yk9p;~dXc?ЧV\``O\(6jX֦4x,—MU[(ͽ3zo*NǁP[:b//h_t_(׃ʕrnfFgxa/W ϳ7R-*>}Oȯeߴyc2Uk[51EL䮽@6;*X9P?Яd]l-i/]gy,dM!55F -ߺ_y}+Pys39[ h@Og]*QRq"e}9mǘgXZ w]bME^3 g#ً^Xq+9OF|f2FcbFk-bZҩ<{$ۧEK׽}VZ7ϖQhYRoe/sW!/kg\SwezOcdzTr^_f|#h9r>xou}FƢowz˧x%x\čb6&R89)1Zz:wSvj-q~&]{ж6-pj4"²hZLS 72a ? G|Kzioe_i|ɜP Gt(Eko'G\n<9@]߻l mS.$-g)戵:Ѻ =XZQkY<h."P"WgKf *ROn%]b"M V?יW fa1Z^h߸ﲷmeMx\V#@qMMڪ❻Gr>0 ;>ωμZ)ZAl7<>#HNYsnb|8T?sSmGH6rOӦ'=tbq);VA3ٔyX3Zx/UH}{G$P"Ť U[oSq yaSnk.[69@~ƿ#vz-$%lp֒5Ο@9̧;) Ϛɑ^js[|=4~lԒ7kQO, bq'L##-pQ,a+l~BȵhMyo>yg6`偻~f<˷c>Vz(?ɯ:H!_ŧ~_ +s9ZW rqa2 76#߾o 7 w[8ߺ-M ] 3=`!z?ۃXa;sCOl@ ".B@1͝_A&T1*`z: JH~a,RToswѨ:FGzxeXw;gCKo<~WPP- kAs&.FCBOp?>@`d8Z4_\– Zyoc! _ MH`zbF6"DPZv(_f~ 5 K#)M=+=נTlOo>%5{[#իʷGr!4<L@С$kߴI^%CcO 2|T>[@gԍp=Cls}]߱??iYi#\Ou#c)/"Cj+T:#9ʗ![ObW_P0sa 2φ#!yWsڞ7"anB ^s !pyCW>%6)V$\lf0 &GqE1w%6=pXdĞuc!m˱lgi#XR\mDQE\\涏|s"7&|2wʼ-e~ʿ9s7 XX| _96^!N|KZNW_pG2?.1]]?d{9ȆÉЧs ԫ"-B&#`E`+zտ>C:]^ dYCz0O?tOsm>c!ѣG[=xb8(h8 /uUrq9XOяq?s=Q G4uiWr8sy[Jp*^ԯlTKGlt>_&_Oj>0״I Ka A \GJDHO>OWuY*jm.M8r/4O| m +mݎ1HOk(mVwDoƕ/zΥxo87ʱ>}uϸ8|=ILhKBcC]ϨreUsX,|^qg]K7\_ I|# egc651<1zu~r`PՓCc{yxyC/$Ӄ7IO /dj韒㉍3r\!?t/AroN7'PN<ȭ^s1Uĥ4;yIDATКCT%n7[[BhbJݰ8 bQ%8 `#ƌ0)ӭ9bb) ƀ%툤RBX<_jCɭ, JxjJ>?$v)4XwL| &˷*P[B=Թœ*ǍR# ~ڒgOd` 8Ǿd'}#m;ϐU~WSn-=-k9lW3Gsυ7粧Y9%֋ͅM*MW0Oz@~YȮփO OT(@@yG;J&qܪ8- QW[2sxd<~Unw;Ăm_o"H7ᕓ_䴿H]`_xjOqCzD\(><ixݞeBٜd??E߿>8?"G<@? 
GcǏ=h<΋&#En'~P/9Lm#ǕCug7Ck>|X-Q|BCF?iu|bp/5dqiW(#h2F`,~x wv~E!~-=ekNǭm>"|zx-}[(_Ж"bj4DnA8;\omL.qnef2#!O>j4D@ c9j#H)4U8c|-_ҽzA_'1Ӌ?[h}/p}.Ia>+s>PUF44Q!`>z5xf|xq>ZIPAŦTazO %>[ Z7=X*"#-u!_Zs+x!՞䏑OK8E3C #k)_I;bV|; 'zZH]<T8,D6ynG '|"rݓ f_0^f7rr a=dI3e|_T0Fx,yP=)3N*YKV9Ct݂7y)E< ]Jy_rIQ+^9Ϊ'oΒS_+Qk :[V<ܶmᏭ8k&3d /dG-!wV]Im2CT2whq̴6i:Í`xaょ9?I Vٚ=nn-Jnm>5Oi=ViZcSV ׌m󼡕bzcB@X3˭ z$gȃ\1Dߚx>PKZcű؅C:>K\<d46Wk՞&ڊ-CoFlŌT x391au_lZlb-=ƪkxK)-s;$IqWrUδ^C g4qSM 5=Z6#0;[qÿ%Ţ{uYzo [6KZ"5O9rç9{26.W>3f0M⎃x-asNJ5[>&xJNmXB,5kp]̍xV_h["|+.?3[!c+2K\`l+g7[@V1;-t._XxV+MpLv"XѝK·D>cSbq$+Fj ~laIAbRE߈ (R.$k~zC̞O7_JmP@p*k>ta/l&B/;r}!;7Wz';Zt_)5mR{ 7kV!1vv5ԱW[!l;L3ߵJ?zk_#=(0=p ԢA٢'[?Sq@,yw!U?SbcE\56)g#[Ⱥ bf%@'}C[Xu3a$К@om4xo @շ@Jso=vp @Q v jl*1o9˯3'wommJW@; r+b:lvcm[C3M3!yEWɇLڝx?15sn6΂)4^N9ϵVSm˩GȋھRyFӿG@y?*A'NZ1z 2Eo1g.>݃_ҭ0 JH>sԢ۪৬֖Abe>MZ&?&#o:nǶФ~ ߬- 0孃Vѷ-qAV{('Jx8n̄mZ.>;na$cb|O$W$M 3ᘢǤ!ߠ3DQҶXά&poL_ on5V?~q ܪ\@m@l 鯿c-> ZI2֯=VW}i %1?%;3Gn?[bC.Vb;br5s3'>A]@B76Q4.\M`e͍{#bߣC{=lEoS~l.Bh?PtTj;\j`9CO{1|_sY֬]A4>~_ƴdF=CoZglі @a_sȰs $}cz|rʔ/^%ߜzWؑd e\C>-n@ N/軂MGѻZrG ,HaW|'.,gx{n<$T=6tU99,g+*1^[XJCז9$k }%l2O!%n[bFC_sc- }cEsydZ!_"dPxK]QZK[!cs6l춯WROH cS ϕOb&jZO_銙[nZkx#&=ar2u|8?W+-׵Vko݂g֘xx>枱sH[}cb3<,\hQlwΞ”~ݸRxlRW?$W*kn (6bUckYœ*Ďk } 6R⸦ k'\yknMKh\ۍǸ)1h<;fYoy8E`!E *T^Q>j~ /iyy˟#(t{橹񨍀s*cf*=Co݊ˎrEW=o)sΏSU?V$ /k P[pgs8uc{X9TjܙL(bU>L)cW{5/gOqO2E[7[9zZ~Rc%38enZ ]i@1skg~}k>oVD@>o7zd"_"㑮rQ^ 9?k04~zd:~q˂u7!ۂV p^VLRqa<8o\*6of{h*&׷Ƿ ;ɕw+4}RsSQ"StO 0zfg!\Z>PnYXOɺ8QD1]Y9}pCZr@.OzUr|?y[]~>7!?L_締j'c[s35!LWwSM.\r.W(l^,ч?>C$OI3$/JOCҙ\{U˖ιZ]O᭯1!"k>Lgʁ-{b=`{H0Zoϴ~UP[ ȵ6[!c[;FѯXl)+rrc1썝Ջ6+ǷC@A*o rE צ YrU^x%(35YrcJș>6.a^UACy($^b9<D@ OWDSx~&Ͳ}Ǿc2"$KêX-X0{ z|Ȕ?ϞPl"cɕajk>(%>j Gu?e`xG0 k>{738#f7 ZMҾy>K%FlA6{�vV#P:r3oźLTʷEv oOƍ[ cJ3I)۷γ!x'.9IOט uC/|V'a@{&#r >6wGr(n%g2 9bWUت_HpdqS \Qd0 ~m)_q?aO,**6ՖC:b/KC{:np&#~wCh_}@+wJ>ˇm9?ydj `Oh6 N'&,lz;w/t?zAEH>KSO(*iV:Xh![rV7!vGԧ,wro'ki\Ir 1)ߦo/G )*#jٽ8%)$S28%A^SԐ¼ ܶI<粫Z{%oܗd@ZTsjыCK+}ג7χr| ZsmW"ս1е!?>K/ݲA} '}#WgEe_ 0pԽ7Ix|}kN?O퐷rU7_ϸ>?ՠZ{Vµ`r'ΡwIt\ߍ{r#z@Uu,F -$R 6xh<=DNȫEiЕCzt_ 
/2*$;3p@@w#O:B?[W<+̥K+}xՖMelR(nQa_d$V_u#~SW>+bdH{1`-oqwcg|uΧϔA2sȧCT<F o<Ơ> Bĩ/q [F fs#体}aZO䈱M-\Ox)?iF?Jߗ 32 od%gtc}=c("k(DZhtyV*{d 97yDk=3#?\1C?q\<9u='HcY hc9wD?i=L(z]?f?R=^'3Y.}Vq (}8-Da23#PH}!_A@>6zX?S+1n[!%"ňΌEOdC*]d_Zϝlw2~dks Ѥ !Oy1mr'VGƽkg4L-o?(r9'9t-Vj"s047̭׋c]fee2C ?ؒ/Y9uWO믿vgB-h[SZK_]WЇ3f>a;V09 5;~ɁPJ2I r`ӑ}2眅W_2& _{4-%5{Q¦GICq5#Yje;b[]ϒ_x[E* 3ty_cW0㎇E'xE@ ߋ;e%2p&EEݙab$VX$QWvG*/=a,S]:Ԛ+#ڶ'2`œq5x՜-'2א9zƦgz(yhl8͡r7cz`3>>#ac%Y97㺶REz0IŸ,G'Shŕ{=Xʬz")m/~5} /${'5q.09. 2+繟畳h9+|1ؾ1ܧ<Kk 5]cy494捓'I}N[0/8,wM~{-IPگS:wYtj*q\[0uO381's7I߷?WƠo ?)v:bc|Wsr+G|8CgZUHf)"cWO`*R V>+1ʾ [+?BbRq53p cdQu{zೊw#G2esҿ8{jcПlg {6r^I7&#{[=Z8h&R%ç^ `Cb ЃA=JuE_Y\s µ$cq̌eե_h.|kc; Mb&x#k>:R7؛E⛲*."r# :3Sr^{<%HKPB<[.?kO1 O\,3Ѽ*z5Vp.㸾IIX9/ [Gۦq([-!_RN*V1p o+%2}eؽe s|P9]Oƹ{Cg{(r5S>d.HEy#(оc}kZlF֡|NNd.xC7|cK<1gH躌"?.aۃwi֐F/@TlsZ>CLOkc'i~&|uF!5s}~]lGokR-9I"$WBu:8ŨM84/ > O1ANvw!.&.sރM$E_orNy5:)1Tz5_{ - J0fU0 6֢JR9-tG&#='5E@ 6LIJ,u"s/r[KwCeփrq/'Y?P|gk-ZS]Ă|\OKOⶥ}e^\!*{zRyYƍT& vmwc6?rPO>=y |]iγf@mnLHonK˕@sWtS,|D-j{w.oϝN_S3sqtc]qq~x_OZ|ReU3|F@yo{9nI12tw79)y)JH%xaO ^3/5~*k3 T\"ƚ>q5 @ OI1t\=E_X5dtX?5wxFn埧?k*B\61Q}Iz7k9[`5c9lL2(E*Ulȣ+ķz䫅Gi#pϱU',|h{[th1w]E.||K++SЖ(yк%+\τ\ݣ& 7P4=Y'fvyy)qb{1`ic חc|F=E=}Ҩ@"LH UICYCRLh֘9j܄ѕ^xā1A5;KF5\FMrQ9y4rp|5ljeIuOX+2/~_fckw#4L|-0|aKW1?ܓIZv:wK]EBlHr5şV{vsa+ fE$j^6XY1CI,q1mѶ_8_1Gr ̹VCm5kakO1kƞ98kp&b(-8>ŏi}]5b<ǖ "x܊mԔFbe~s6vwk<`и3nBa|`c>g׳mh%(k="fB"`*;걣cg_A^|]R/IC~9~D_3j= ^c[Nko6V@1EN2넫)>Wx >˴xs9ZޕT|B]y6$ιm*ֳ6r$urޥuH.ǻ/׆_9Mc^S.=U=3j.fb sa{_Dwe!H_z/{2sIsG|Dvְ)/SO!'%{[JJӷN5=8U쭍|z=;z/Žru_cuU+`r =bb,㝾F%xȯgc#y\yNwp!wF̜W;w3W5b|>h؆= x:fqLZ&Ṕ3t9Ԝ;7qf7ώ3cU$q`o!ܡ G+cG`:זw hۃ9 ؈"9,ą{#?_g6upm ar{~^8;Q637[Z xdG[y|: eJwFWnkJ෴@hfW6J`\<\Bf.s~jdtsFYT Y?c o$"\p_^MϤp0CArФh|49|E*+ے8^:Ww%qms-XlW }w?3x|ޫCldߔx"vꃎ y+ջOɞELQ{q5QʐeC?羵nLi}y>Z4?iًpeG֍7Fse%l<0#֟?51rf$;Ö! 
ϰ-|ɧ)n6<\Cr (Jq;)>G=Z+aNYr5qçyzq]G[l7?_-8NFaOKs5R0!^W-r`~rl{k}15'퍰n Z̧@IDAT 6y|2O6gn<%Zۦ}}n;E=xrct߄ ?`|%"b,)-冬9p&n84`Y{qx!w;6g֭ *gY1n2S#LY|dBu<߲{?_?›#w>Ic-՗sujמWil965Wb J!`\gM=;>ƝG`ZpG#su9ͫ6s8x8GC#/2֥^vv6>^Iy y&_g_}8F$BBwT}s17ۑkv>%66Om pY\Gw0O;3/YzG,v#A}{{ S%SN2v.Ve1q?{ z h~kG_ \]|,?d _ L;TSpԧ)MPU_e9t h6bO!`8$5M'}O%5ϹVKyh@p;?>ۧ޳!,^[B[We4:b;ۖӏOB+}latĿإky>C h  00m]yP Ph9dë$ڸx;y=E4+5i:G3oNz1ato~ /zyICOiL=QxBNڦ]j{9ĀsqpL8|?g'{\Z5\fG/tYQㅸ`!O-&2f=1zDB_t5ň z"`nS=68Nk"mYﴸ&7a "]@qg?ُ'BtT?^@ O<ȟE`~~*}^N%̜X3_ǩ^mh^r|+iy47/#->޽ '>/\ Ѳ?x1=oS,t{_bu1g|4=xLֺmIwt 0^u)gi}4=i->2dkeAvԽ= b~f$=yݳplӾދ6bG<״b~yTJK{=5*axvcs/&EZd75{$bn>a1a1R1_9g^qo\< yjU3zwZDy+k9ͧ98F`+ڲ%ܙǜ9n3Gd8G:y>ш'@ `?M! XǮ\(.:Yc  Lz^Ku-)<^P p)!bfzǩY@K.=B<=ܒKW:Ͽ_jNsLޅ*,l{f! <L#؏ zD+GL|QZ떹8ۼ;V> zhϱIIګ:5صEi[JY '9̯^z#ܦfWE/;Bny)o-8䢆vdSL-ڱ,sIs٣O EuP @{Rkدojڏc.ϳZұy㌿c=-mǞ ,x7g |Ou6=/!3x́@lFvû8_iq v|Bg\+;b?jO۽bǶueR˭e8> US'苯9>z@<gyՁ"yZk9GϙljGreN hnC1Ŗ78Gr7ŊtK# \3/#R| cfY{9U33cG~m.|XOR/j'~?'=c>C<07s^mJs<|m )8)w" 6Q {O/N=><ߌc}8Iklf_pps/(X#@\ʎ\{f:te:w$G⬔:ѝúHۿ%l >+c<+ɱ}>NZoO'VW08?z`xrp/cĂ:Ձ u}FO|Kp-Q-S\cƵ|$0Oc~C_#h/K$%y':O˟6d__ѱEbg os ]Љ?^:+a3ߍ+vi3Ŷ[uĂC poCrzy\DnQ7ڠg"߉n!-]pJAFgj+/#A)͌'rDL#X #|ÅX)|}#rG> (o[;#@AWȹ>c.5+mo k^ssw?ԙ{0<GګTU=by-&1q^#jla(6]Ťxpſ`Bf30[hE^wy=O7mOĠV4|'}Ku̳<|9њ:@khGxңp*V!J;-ٿ ^ / ㎶FAHz?#c/lVKN&w!O[icN.|low='j !kz-}x3G<49_h+LڨPWN2ЌD5ql0X8ɻ~ >p\[`0kGl{|*=>v?ŇrZϫĕ2i0G{Vn_05_PRN}ǯ^!'x ]ONAF VBAU4ZBBTqGI;G)K#@~yNeGZĠ:Q aCpL{˗]78@o;= 5QL?b-$j"\ŇK'1M/pFz,-ci=0&Y&N_G>8Acgy<<=wέ;f>F\O#)Q5' ~O/֮?9#ߑ"a8  η-(] 2B<|5Α~9Q,S(UC3|7"e:)ja|3!zu3XŘzĊ9;?5AC\i &]67K j隫) 3#X*e՝1js5xjw75*^xlQ?^l#@}09_wCnY+Ub*ƧYjEŢg  $K,ǽ0.KM1Xor!Au066{wI m j-Md=Z\ D@5'k+ޭ=xqEشco|< }+צ=xLg e)iF˟,v_m'R9z萨!/W2oxJ9%Tq+6?Zšd "xL_"8mVw$lqa>^ϖ68?e5u :Śe#@F娱~;a_f)}[qeFaYrF >Iķd3}fͩ2xąx[Gjnc rl~Gyg>i yu΄lrq&ԕ9;|3~-ΟW=*d r}"as>-9P8ZM#hAtdql?Y #6S73&mɬ)ɼ%x[|w8ϚϐՂ6ՐQFqi_g}s|9sX̓r<ϝsGb#:Z+|w%*Oq xGG>,<xLNw$ ܑcjA\ϟGĐ1ezG>ƹ{p5z#*C7cOcQ4h,I6UkjqTtlUb| 0 k*Qt =@1]s <]8 ajLm,c3-(4^f:j,?~rvqmxVOA\IVs1J7/hS ǽ@ \6~wcY<s#9f\ԩ{IMaƀ#@;ao9e=l$?f.p/b;;s_>՗w=3Dq%A oZD,9X 
G5vd]C S $eQ寿qkR~ 5F5 G Ib/g;*ݴuVmx p#$p[^}KWrt|̘~6~h%?۷o?jxG-#=jFa38 7B@`Ljתv(/ڇ"cP2a~sGcRw_75 Kj*9Cmn~+/k,3'O:Vsx<~|9پ.nʳ~sdn'g[^ d̗_Oӝl'W#O.,<+_F@5Ng7̛z(: -qGp`E"kLZط]նXC9N~;I:gnaS׸7J^Ǩk,"x3"@|-3:@lwv̂KK=͏2{˪Y Ɯk.C@Psr|rYN)WGDc0Wyacb?6GtC17h8$Ex#KLT]bKB{ Dbsg^0>2lQ7[\A{mKSc=zڛb:C W#FٯnOkx;~ $ȋXk<}m-ѝ)> tea!\Bdc'GE:ĵߏkh5*tfZn>\㉣'Q-{0%I:ky:~[4G5ׯ/9jMSӎ=OL$cc5~ȅ݂RA=8Y#M^? Cc.50#aܧEh\d4ab!8ggHp?% ~gk RLYCp}~@>~#(@FxşIiږ22}Wz x)f#ш0<W͜{xpO*HS ,qW¶ڱ;Rf Y`mQ7~G\<~z;@WZp%羞n5J$e3bKL>v5 F 6+.[,\H*3m)6N+UEϡ@ ep%Wj2fȷZ\oZ=Z>3ǏC _Wc;QO߀1욐GۺxuWk!ZآNuP |G 7Ҝ lc T$}Hzo$L[RdЕ#((qEN/s O䗾Hk,5K?~?MV[lNfM!θhg@K bh3e}5F$$寿B?s.&:~߾}{=v_~Yf;s ZqZf+zʡQh9zwg4ԍ-R֐ھ?cYS7ǧPv@%5/A =T^Oe>5~l8v΀/0GPR-Nn%}IF2}d$xiI_u/.<ɯ֓KphWfPws>FέcWu׹C HkS3nБMX,h  '˄*9},@R_%FAR?N [Yg9?ь6GkXZ3׭|Ag;՞۽[-ͧ⸅M{ 95x5m$?9x8OFuN ?ZDl- ⌃pr8h&v1HV/!ާelh/eyRڱA,F5'ģ6J F%f#"wPk+jrqw࣮D3"? 0F-<< M(Q#"sEY[bu$mOݡ-ҹE聞#Ĩ6Y{^߭p^'vmk)CgL䝉zԨsҖ}b=pҫ< FZR;=1iN®8Stxb'Gd^@5Lк}lc;Pt5]O_p8Qb=w/3: rZZg}%O5~Jtϱo[n^}=s?Yfʠy ܥHΣth:~y"~!0 % vg`|чkuR"1Q))bUd}?0 O=LJ?3_PP=ݳi kF?;Ohdf9kd=ir @lAT,R95zq4'#eg*=G پ.NrzƏ:$EAM?(6;bQѱα>֣U?1gc=C6j{NL:fV2gı&W׮?C{~Lq*Cnx4fF[L un [cKcЗ3EPAgӞݣ~}YK7d7A'ݧXϧk-G`1k?l/?^6mA1խ>q/@T+.">395{Ƥ [/{z-mP F cH\F q]BXa.Cn3(,", nl xq}{8h^ɏ/՗&}W_S]L@t?^FfQ`11^ǖN_Ɛ^A(6(+ U 9zRR[[ڧ^zwMw:r,yf]o6rB6֡,, =A@/ xYQ=_F&t^8.}A>~Q2uЯͯw]Ө?4KO sڂu!?%ZbqCq5(3;}]j5و3N ,8 ƐAe2EψAf25q_}e)}8_8~G@?P㼜ml\k`%Gͩu-F]䋩آ_\_( =l%|x~k^9b{b=6dp\HYbI?~8PL1Gc͊'{I7vbꇚ w}#oGsfvC1ha\1 oQs(ǘdnzR97P'sw@Rx)X&)=OibfyxȓH=TߠyvnIy͛!@eR}#O3-內>!jR!gSGOEO1s΃>nW|##d\l( ,;R|g&lqi0SmZ6z]z+[Gߙzd<1kyu{Zn@K>,Rk7BV73,z(Dna mmƎ; #H{k6CW&εmŸ833ifB`i&Bp8b8y~"@<czrŗU1G{镋V?쩙nFl:{&guj1o^ N3.^߱FDϳ4gm|8<Ӽt`3ucpascP'V&W76:V4El'A7_5Б厖ٞ9? FFSyZaXEshz wx+^CGKOQ7Β>!XQ$b*Ҹ=v/(8/Ƥ 1ƽ=9[=HvԮ/ӈ4䌴ك.c Wc$|xqqfl1OOė.6s=*Omվ:ԑ1r]zp#_1j)A_矗ӻer}\&/ba_~}_t>Rg:uS;cF?eb>oт7T+> ka|Ec96{6KW7ڻq}:k/V55JNb#>F6ީ{{Fx 2ѿ~MAY{Oe݂\u=zy~Jbcyye;]?lf-y5WO%ʧ=qFcƄmkK"_CL#`>O}m3q6j Iu>r]son=ǥjˋEKoV1,᷸,P?-cP.Z%e{YDi2 >^?CڟZ?,J> ?Z#্rnaѶġk~쉽z1>pޭ>9? 
7mQj] ;IƓ@`NEpsoZ#AQ(꠵pyAQ9'F`-ϮX;kb{`pw"3sO䴒I4GWH:O!lg&NWzi=ɧ5}2oc$;g'6]hHuрC-Fg@ׅJ t7ĶiI8@7|1[T Kw%gő8Gt1{`6NG[GY<+Il-ήM8<(@l<^E03.^1,FtCg_ci1 ?rע~ks<]1we2vSt{}&F:[V hG!sg|uf,cY1n\Y|⛒cjع%qj YkwO鏭(k 9S9ܥ_=F o2z#$ɣ?2dz`_X.rs`#xٓJx ch!9}8Dq2:q_ASI`/OA?Zs`X>Y(י>=W.=WrY;Jզ߂%ntG 'lwGDRq[&nk@j؊O=>Šϗu˨&Ls'^g\C_X.>S%>:p./[ɗ6;b/VX hwXɱ`l(pr>"OELŒ5W^]G2_5ew=VM cA0X@[oLhxcE_l(]ޜ9[oϜuKZC7x<y_A)׍ ?|Uġ[N=5:ڏ-xw._6Ij鑋Ą}FZG[7c9$` X RCR8mlkwGF 6GNVQcU M`&h}IW5u !ӆ G6ZC`n k mmncxZpX@IDAT'>g<9l\ָfcXOu!6O7h-ԑO5_C]3_[SK ܯ/at$Tup^"k!;4!"Fդ%^`];:^e;|13W-;>O#zkz))ńoE!x6^gc^h5WO<.r3$e4 BXbOԊ7muB%Y~o!/h[V+q'h%gܾi$giSYgyȩ t{׎=gB݃hU/(~PGI9倃ֱ"I9[kOõAH{!j% l&@ 6'pRX'e/@\TƢafº+1#ucʼn/ߵ6;T|B"kg<'!<%6gI)kDžil9o֭u+Շ<Xeg{Y'N羟^}yo?1"!{ 0!6oGx6ؠmm|y_y:E]i`LXcSyl?]l<Я .clwIp-ʋ&/m0$i&5^:Yk+Ӵ趁@Oaҥ[D2x s~9ժvcg<a b΋VRomft3#7b0ktC漭gjQSY}yxǸ7Ƀ!@Qu;jpgg~cx1N%7җ|vpEk`M+n !GkO)Ͻy=ߺ5[#SŤC7J@eb='͑8}]Kl;bXݽlj"8#t X8Gнpn{5K2lH p^Dzx{]4ˋs^K4~\>.bj-i5==ѡV c|>ؕ=.nFoͱ؁쨩ȼ5o; Gװ)xG\ "'8ݹlr10wAG `._/^.r}|:_`Ύq756zȜL LwŻT&d`ךjZVzOA:mqnh~N 0CǨ$7^bdn։4N2q#QO?Y @EA@v=OYǟpE5lsK^;eï}m-KϝGSZ~,/M- qퟶ6}}y^>>'GQ]N=;"'/vo:~ؐ~ 6xv^sؔ'y[|7瑯#&^Otj'h"`+} 3"3z-t/f" 9Du,D ]Z+<&hMXŵ8H _7UϫuyM-}7n&%S jS3}#5k^ȓ<n=766l\g9+1b ֞򱿼ҽM&0H8`Kֲcb02N"Z/ޔo 1Q޲F֪-{^/7%qs[7~=s7A}.};zT3 x`wHثsZ mѵ~6ӎ;YwXkb+>Grx3̅RzOc `M#lZl }"@e=2 7U0!#X/X־ P=Rւ6QhӚ=N@jk^qzXc-5vz霤?UY9^H=ʹx&v% x7',KI%Wc!3j3z-tnO/M a@.]Y0H.s8.6pNHNv#?;V\{ms<}ƹ2>!M7J9%luߌ{xΆx)Ogشʋ2oWcU?h_pmmqh<`3nGX;31_<b8/G& Tt ]{""z8H.#Ưł 3#/g._alPKUKm[+}qGSG5r9֜_kj꠬QZ}/zYz_yB'v/ry<IWt$t47bBۢjc]hRxp aPV6n}!O=3qN AD`h;],7]8M}dVz9 LGykENd7&'sq#CA♺z:fH>Ky!tn35`뱟!zOw7)#O4b 檵 rO iBa1 -td1b?Zl/¢*e |ak4|i(D~`Z 37 4ȑ{;Ʃ[obvѾ[:zÌ;Gq]sfj˔K^ I)>o:rƸC9)Mƣ|Qy0"iꭟ9Xjs~A}_;w"q8:>!6Dyx1^z1ɛ9yG{c5<'7ORፌ~%Nwn?qKn0PPn0ˢ3(",]&I_΃@ᷧ k2 bwtIu{[:,7f"qi鋙EW㴖>_Yo,ۺV47s?yIJ/}Z0>;}>}u9d0TPkA ܺ\փrq*ݎ긘1f,/Zw~g#_iΧ#ذWyyu䴟>< (7WJD7F+s[Af (>'b5Fw)?>IB^_^^ э~WH9v"K[sڭ>Wtn9V[RWb-;O}BC-;aܥw[wsnwo8&~xUky]?pʑK}_TS1 "ihctc>`$ ƾ\ /; 1e;{ޟ9^UN3}icƐ= `}dq~'8ͭ)s޹`ıw%H},q^!ƙ9''9<1{k5bt $}t .X,=}Dbˁ~/.h- 
Lyrk;bq=ӈ]'UWx-#{yXzm1~ty=z>';gc"S\ͷ#eryOϝlɛumx\*eP pQ8_[WY$cYd=g3}/ >UνeD|tz@l<&` ?/v ,̆6(S9lVĢ{Cԧ\R<=V,cDsK;G3^* Vı3δk5" ;gT&XE8Ưu3d\T8>|0 X&ן 'OٲH94_d˟OcҖssHoڟgo+Vqb:_:\CGƴzmKnKldhir{niR?|ҕ gCg cs> O|!1x|c>]IS|6菿ȷ=9>ܣ} ↽As 8jn>dbKKg-O݈l{wuY[G_jK >g9>9\E ,AKN`"B| .Ο@.)lN_.i9q2g]pXEOֹ Gj~82VLۺ>Z'yΑK5c#!у5]{yJ؏rmsF @\Eǟ,x\$(ywˢY|ߖSAc#>ү w/usbw٤7r+~,,FV5tK])nɩ˂;09pSo k!򽷮gt/[W#g| =!fj԰} )X_ rϤg"zCkh@ %)e"~&#ײ0_f2 ~r8w^C~{~薫_ %"`ՒBLãm)#>ſ:Y+lgͫ3ȟV9co}z3~h!].>O;rxr?k1`x\FI2`gA /蓾q' 9P?[sa@g!@ "cdִv7(kZGkc8-:)gA^u3th3u,[O=[\"B˧<#@1I:88 B|_ _,IG0mꕞ\9}ӕV6A P"YK _G#/>-6Xֶu`lT_y+׻mmLR'[k.搼"rкka|!A@ i=v`ne>"|,|δ~ߑ_tzLZ ^3Cq= a3c=ˁ+EܹK͟ qפG(Ym=ӟwPs֘U;p/ŝk#>3&|B>捹G\s0zԚ>8B "&ބDXlK"4\D!bSu.}K>=7 #<|{/b Н;ɫq/o0o*]C79-C{ZƵk׍q͑;Rؚ6_H֘>Ɏ!@̝ܰGƳ 4@l}=vb'Β|vFs5=ܼ6-=7,M~]l<@=d>bxP " ё^\5wXr{/):@¼$|wΟsW{>C1ouw~az|9;8bUza]G8^_0Yzk߯ĝ@ SeF z!4 K{ 4Y~, ۬?'[$tHn$~Hcqm-Z#?3Dŏ*_9km@V̈ /A3b0es6;UcCc-ݵ/H>/b,3}3[zm"[>]wLnR,o;H-.!Jˑ9R ˔wsGZ*wdJa=3|H3}\k>=~E 6#nzfێ|#` &nqKҾp[F~Q@-)y~YJYI]).W89FHv* ̨%/% ?W^CL5^RpnqsQrCWW[}_tC&y|g+IJgs==Y G 6 4^H}#Kt5Ȃ!/%-_ՊkE-}?K&J7*gj5u9Yzy#osn I~U&6Hb <Яz괶qkk0b,&O S޾--ژ[/1^/@?b!9 \\}!qܨk] /PmŸh4<7GE$~&g|)׵6,խ3{9Z?tMA1s֘h=f2j ~ ~;>Þ3Qt"T׍ ?9Zi`_; @l<>a83x ʋ=5_|/䀙Be\̾[>+}ΔzY~u?mE?t,i*{r2K厧<8g<zqx3ܢ wɺ_B?ˋoУ;i>? #Ll<>2<@x E /|uqqe)}K[w`8RW^h!t=~@2ʤ_^J[FϚ2e-V ë9t0J㽫Vu)2c;rƙq\oZtK;kDwMO98@l<-Fpdb1hlRs[r!.'xm!6\s=답a AKf+\s6Ossj7kkI<ճ\-@#ΘZă-=x>;[񪯷m]h=@ 'XeֹBB`–W^ ?5:Κɞ?3hl|˵f37NyryR,>i{C_0H W9n-1Q+dȪ=w5w32i̮cǸ*9͍Zh{:,&vchn^\/P؁n2 ZjH0sqJ=6qR{%۴^}g (M\vEǍc9!\-mOQ}r?iNo@  xc#ؚ F+^f~b1Bm=O>j͓w6ǦSZn*݀ɑժ(콻s/%豦tgIk=s{6u ^imz.x:1Rn׎˂uc +ߒʵQ%=ׇ RÌ/xN}@ Pxj * x]ď! 
@{~KylC-_s6n\ۙ_͙7Os~_?sH9xαsHާlV8En^lR/G["j2[l<74'ⓘYkϜց@ @l<`O7BS/I`R-y<4]¶֋V{Q"w{umxk\p~57x<; 9NJ |2i9huعg9Z~tM⦑JL- >y֭43ψSj1k '#O~>%16h;BܗaٕE'̂V^z<`a6F=;#@|BǞc3z3N7o=d9?_3Ⱥ=qV_sSdLG6tj5\wmZK}um~\i <x|ipW3x|avp9/v^R;iy96;!/6y YnMinc+H F:;StpMSlxssp2~ңf7c^䅹[i5i.B@ E 6s~@HeA /$'^(8f O_J J&W95f;×RObYkٖ"0[:tزݽKmZΦCuma`:Az5#1>nncZUb*1>>T7LX7{fza%g-fi mcgk__\Djm_B6=̛nP$6hK7[7֠--;q f<5&&13sbҏNZ!6f<]U.fdOB 6vbql׺Gˑ,2yqe%c #Ne.gE,ZcnG1jإ5ZEX|k`˹-\aSBJ}^Lust稕⑶_%k?3)6q%nޭxl@ e X.fb p{iY g01dy~DIl!A!rm٪zԋVS;oC2tA^|M<>{o[O>GwkOV쮶k=e`mꇒךvz+w+fGC wdhY>ifܨSpdQؼ |ttX7BZ17PۯϺf:Gt$Os?sRosދC]zqU.1]<>19bΟǥz ϐѓKlxwU_}~ʿR[?FJDGs΍)Yڿƹ oG)ZN}C>o ;G}l{MRK=l42?"g)59zxzD`! ,v]<./*?~_{p)~Yf>*3Z6% ~x*;GKgwrEl紽>J{8Һ*ᙱJm]vB O%lƗR ^Q}kmz/@ h(@pA"skp?XzLsgdb^T#?R>ɶ;-j qdj@ qIWC_y<Ŷ^:԰Qj $;UL<~b/̳ @ F z;!?hӒF;Ӳ@'t ȟ'grq(r! :eԄI9O3k f"ܮEĕZ !{^ki%@뾯p|_ӦTnz~Le>76jTWC4X9kqv޽=@*,|yA C#.ȋn)Wu^Gl ֕3X1wq 7b5YkYYH@ pn0{P`i;WLv{׌> Om9=?KV𫃀/u5Eq3rǧ2C O<ܱ9}K1'\J|hsSqN{3cʊc!ωrr@ x.\߇7DIA(pxCoc/{/11{KL3c=`Smk؇ g}YCY|~I#'״.X9l>i'|wOOM?c;;0k=ٷn)k+5zS i a9jkLm[".4~AE/aLD!@6E?|>|/yQ ӣ|{wS3 zڴgk=7u >ȼʏya.;O$c9 GxmGumتRNLwQN6@ !3vpbw1KzQxe"/.L 7cR昝:f) !@SY׬>Z9%KyOq6i+\b nzG;f_< 9Nց9X?:w]rJ˽H;Rg @s JiKUO~%x0P^M9v_@Z>:A@ <ڬJkXTG Sa,ӗ>)8>ԏ,|Y?^d_81cKg{fՁ<-[sx+UQ2eXjֹ?k9Ek|)-g݂VыcKλJoXMi.)qn%s!<0/cRȕWLX @P Le,סAǫ}ǽo߾܃W? daO?A"I:k {'?QB|\@}<'!E1^;^q?A܊tB@`J=Q_ʱ--m`}E {T֕wd1JmZ1:q}5c91F4h# cǹ>'E{I߉օ->?x>g/scN?}J#׳郳vs,7'ԣ=ߧZi{|y~ʅ^i9Gbu&{F%yN@`qQ"N!=ڠ@ /x_߆e!DQbb? 
:ƒ@O%0_NJB4~9;,F¡g܌SuIcM  &A bL.2/_f[TE>/ qէmKRW7n ;[8%nSЉCk( '#GiCNO#|W眧amv#8BX!/R5#y^ 078B p=YЖ&*G!tO[Gp3te$|&[U uڪ #ĀֻV}EϱGg#b؞)<؂)x$)*qXwc7muz?@ 1Z/4`vf԰ Ir5!{V/]gKג: gDf]oFS=>Mk~~kTfq<+I?Ӿ.ܫg*[B˽}42%>k)yWgbxDVP  ^؄5@1Hӗ.\ #ðS?3H[~3j>tE}K'#"yO8S]R/_Q^~GyF|RӹiM{]q]^OB$wz^RV ~Ɂ0qeBEk/W?l;+ח2M[x[ۮ\ObC SWTe|b̃UΎkhK=8X:/q-6`)F\V.?%j|»@ 8@l<;dIme Wٽ+ H S T* NLjWH]8-__tX~/ŐSYlT;iKK|o8D~O }` ;sw9 _c|9Ǝ:y* ^u.uR[g}[ '.gI4ڇ&@/I,~|^ʻAm5#S?W36酭ڍMڽI´?c`Jm԰ `3a<+6[f c6Z{`s77w>2?>^tL跲)sٸe賵>B7r<|x"kMd K $sD[`!"+Sr1$RWT 6p[n^ $6Tiiw-߆|%֕e^r*稺- h5?`GZ><#|k]H{ЖY.cgqbul ey6U,kOK@IDAT=WO1X\{ "t bCg!PefacO ~:Y=S4zg/[ŶE{^c{Nβ~U')6k3l@or2GRY}sDߧԅu޾^|眽VyOxRN{޻ 0Zx^RsY.)jBt u< I¾,^k=gZ Fuv[ 5Vos9, V=TvlU =ۭ|+ѝ|]w{N?|h>Zgg -!cmllgvkg?EY}`Z>wMkc skijr {s} $x߾(Y$!X&7+5B,Hl`L,s?ZfO;h 9C>q1Gʶ+~,ҕ07h8',o-YaO? qMzk_O|Nr~ůj%`֭,۩F@\H9_ތ%!p~CFZig1򓫌?Sk':pSזu. _&9|s?]Gpw{i>M=kl0o8[{sO6u~J\[lG.}IqN|[$e(Žcr%amR.)j/v8|^5w'c "ӟMqM,to> +7 e1WNu ˛o_jb{p9tzfSS71ƵRНo`\|ldWOt15멹вK϶=C}Uγ swm5uG5!p=Ϣ7'\ > ɑ+7#ݦCYYrizx``u){GK-Y_(pe;a^_[]ۢۑE)þa.e)ȣNR{`~x6?.t_T|g{{7ImF #3B;3JLg1Bņ !z?kt7yYGHB6+v^s:^O=3ڣ nj=S keig67syϱt-om)-#ɱ?"CD^]KgYOܗ'G.fg􆟨M1:ϴL}`@nֲ;X۴џpqpl-PlKY,Cyl|M,t7@.;}߭_eX{ZGP}7S!1I,B ,X=>/ vbvss-חipMAߐ*Ӻ~oK%mԣ8C=c;聽q,=2vLb'c|'zgdr-KIcyy%B!RCB1;Y5K?5t>-)Z/ϟϿhJHB_(KxF!IN@~f1Q~kMmҟgb?0:DO~@TK*׋/Oh;y$?JVEY;YM8{)\7+Ka%V-.8O 5efOi^Tι؇_J;lV㼕!0M i. *\}Ui$!l ~\S& )#/?XΜb)7nʟg,LewF[[f+pa21nHKcŲO>~bSgѯm7}N)ySyw~/ԡ-Ȱ.gG.M<8o\ [mVό߃;"r,N O$|Ns~[Sm3osں)!'h,\] [8LC NE?d˙M$kҘ3΀u/YhCޙrh&Ϸc,>BqpR?w|74fEc63u_cܣ/k]_[eb>pSFE$Y܍S?.ƂmRK{\Wd9[ryN}Reh4}hڇA;j(%x,Fcq]|>^j|F_Zaʲ*}|?7?Jbܕ>ϹvbEc !ahJ(^>//ܓA= ǿhYЕ\P+/75-ȼϭucv G7yK2%XҘf|.ǧIwt˹J<+cٵgdjj\+h9 I;cx'AoD$0ǡP|A+tkdr^Lc2r^;N] g:^<IT'Kt,\4z ~ERY/ePࢺjkw%PnPh>qr=Oܳs89ڊOֺgHZozIiZkٍ)4.CcT|}C~ wioYl'teN}}9GX uw̤=t828['泃R-1oŗo6Q<~H'\@{DI. 膂32[w ۋy(ξ!$Z|=-~]Ӽ{*Z3k9ƫNw9jXQy} `^Fgese>ocKe^&S'k)]. *I^+w|ϭEi;)BxG?Eˇp3ER%BrS\p|I^r΢\`N-(ͩ6|7]]_nݗ}m BNbnZrcfZC X g 5M➄=&چ36ijh@j+2)_=>:I!pcX[$}:t#3N&I!' 
q|^:YxOH8 PG=]mݐr<p~J-> 绲71;s|ԑkƀs7[)_$wc2Hi5罁GdɚZ Ɣ|)}~}}Xh,r'~ǹn57#W~2@> q$A1Kdg7nT˷WOI޲N \'#oZֿڿϼ>{,~ogxo3swϷrS#ν$^%:j>_چLpK 34K?+ V ˍ.P M77<#~Lܥ^j~pCd8{'>z/=jp]}V[3{F~:}~^RdGy I!К@郴U`˶me!О@\<{^Y-*osFn{Jic!cl'/G+ZגVl&G}-в[[A~nآ= <)˗\'Yʵ~o⟭RN6"7B?i&@:.=R57ť 5h%ޕ[1#6藍`_qzS.O _}~xn%{Tg΍QK}s%NC \#փvhO 72,)B/,ZH!q=fvl:OLu=1W9>WyGC?WnQ D>>o ^i^ykgK;ɣg;g9@+EΉ\㣭NΤx.5C p͂&=,AE&wa6L9g`/aB 6NyGc >F?D9@sƶ'`{PdЎk}~|NSI=2i|N;9Z'Q}\ᏼ/Ze{7C <9@$"}gT[Ilm\|*օA uY{fU_aK9ei=o嶷sA5Z%胿 R6o_zu{}S'y?d^^!/շ?{Kȸ*&gh;FT x+EC !ƅ54gK5r!p?oB,x  I ^.>Y"c}vAgf7؀l֖xJ/ Fgcg-\ס2k!P\‡k|^6jȊk ֟|R}hܿd:ڥ,}O>П?֟dY8./%{ȼ?Rv6㪾?gZvڤ={LzdR['לgG1يߒ_3GM>L;}$@/rBGERcA|@K3$@m|HGj}D"}YxvOi۷rGu?ρ\A@b r}uԣN&6v׶:l>`MsՔƦrJl>)z":9[1dgoNF^\G:r@l&f0psǸglR}mi,svng>M<+c9ګ)}y>Jk+}Z֟B?->[}rlŽrݪ`"Sۻmų'e:.z]4-e'e{{ppWF}'̫ߵ_qߪ6`s暣]! X&b @aVk3ſ?6wwJT,-Ù# ׽&͞>wT_ퟚעl2ZG&b9%leqķz1&5<בJך2k9!$[j@tF ⢱&on3g--^WcM寓җi{ZNӔZK쿵,^{+e]|qxGi% _{7tqI#iUbrΙI5e+2B H@EOG&5UŅd-nsr?cs5VSrZ]s}$G>ԇXЧ8O;JjGj☬-w<}ʹ~ qɵc _&FB&8B C]s%!/'Pk.k0[#Å䚲{Ům2{\sƆ{p\ݵտO[{9VWm͖-|LΌ}ջL{ֳ 83gNb}\%̌$S?"c_mB Be=@wk̗\˝k{F=VL{,OZxBjnI6f=ehm jZW=1;c̩r̸"?߇ B`DKwn|[E غb>nY:![UK~OoפN>9|ۀum/9V5zS-\GʄԼ/>jk)!/z;~mf!/$b(i@lYtRYd{V_ڹL[ȁ9ggu2qKVٷ4PZG9$9KuV+ˑԞVI_m%\ؚ)=eө_KVC $o x0GAA1kF֍f E>kڦ܂{Idu.2PuJmע5b擲wJ0X=`qLWneDv3S|QFƕrʶ@#0=ԓI!!p60)qs3U>{?&s,Tg.4&-gA@z-KhY|y˖xƢ3 _ʶko'X&{9/ˬvLkI=ԗ!!p)B/UQ{=]p)mL/2-7YSgSϒ/}8_&m~=c󚄬;rߗX?e֕_ ?Orv)fM\+0AD} 8@pN+!p!BUi# {ًNt%pM ƃK:!HZi=B$,heN'72WY&&%!'8n}jL}HO|T_i1'Sƪkcy`S2 2ƅ[ |{R@}νگ)*OiS^@-I{/Om[J 签k2vA|1Tlc.){0txM{Jܣ}@\FcM4˅#tbvk[6ʼoˮiWYkʦ5 hW-s<ʔ1.գg2[ܧ]0F2qϱr>]Sl'!'0@$0^@sj.Z("l69y3.[r{g"gmqܷ%`?GsV~1W3G{uǘ=j'@!ڊMp·'ޟ̧{&B>{߇I4 x7X<.-{5%clnH3luseѝ7 ԇwq%p'uo=ծڲ#;؟ZɴՂp~ٔ<| "3BofB ^Jm !Es6اec^e.dl.YO}%|:_kZas$05Vk{w;Y}-9w:'nl;hD\ijcl'! 
0[2B `AqX hZ-0vNw9se^.cb,s|lg->}$C@>3xOuW0;_XcV%} I!pwxd~!u$Xc@ܜK77gQ+>g3^m\H#45Nee9'A>>4V ^[c9wN؀w}O=:jqÏ8BƣUlo,B Bl2x~*~g#4F`"!?Yfp?א7ŏg.P.Z>yܧjG5ڌ}-E0jdzI1xǛͳѩo?2R/z /\:9_fSRu@I ǒFC ^OΛfm:@͓am~E/sl O8z=zz!~Byn 9;Xa32TZ߫Ge,j  0ycO47#:,@'>j P \Hr#-AsEtA񷺬-}?*0'~_Osc9cO#QK9lno=^{K.^ cG<3#>{/X!0KL  L; >/FaY\S6!W=ִ!`kN_fKɲ!k-3`Oҵ)&|ݓB)ir\^5N6vNl x=mgq6'|mtL_G}w=8z^vRmߥS>L!c $Ox <.h{ ;/\\t7f9f.؋ڲT&>^Gu54XVG?O1zk!SֽEj&/B :'d<0W#GM]r@/]i3:|[LO$q=<=ޠGYQ_YV}\{srNꇀJw쥴:ǝ'"2I!!0E )* xC.Xl]$)EWW.nw =7gmM=Q  xݲAz9Ϛ>O[S)_ӭ7ݦ9Չ:15Œ< 8e&EL͏KOgX~>X!b,Xbl-{C9DũWﳭ2,vbꄿ/<=_8;[:>3o9 >HR@p }O?ϟ){_ 6~T>לIkl|܃C )Gҏ ?x6_y5>#g8gQ{1pYmV }Ბ>!GC>IU!}w~9v0J{r}oy>\x, qo>B:I!!g!/Ÿɖ? %lgsw}KRg!TmzJbn ][o,:$J3+27|,Ź@Ӷ#ssz[ |{ᨏAT"h~B M xI!!P/~gEʬgN,:k_h'$FW7=r/Y$7R>^Mq0f,$M@˄e]{ZHvDƝx<[!@zztxKr%pt'֗JC ~xv\5օ@1o}o78]_Pѝ5qχ?chǖ5{sN0Qy|=S;cNbO=تc{H Wr*ϧN`d~kk[Sy0E&y!,I!!_přV /ẑЅC`.3=ltOl(\'RK}ڹӯͳ^zw8mN)Kw|Wx9rޞ.Sϒ!gL  ?')g34~O} 5$UGxO.b? n6iKv~ȇ }\s$kYAcɶ 3SW+G+;[ロIO]s8&'?ϝjCRa-<:<)BWKER@b̠/Vu#F[A$I\zֹ6#ҳP6ku3t_sL-gA3軞OI{lַ]{SΡst{̔㙴V 0*7&5 gqG[]PTosVr˶u@H oJI! +.^ȟZiu3Gd:z~v{Gv,+]꫚m᰷.P+J7Y}vk'>j|TmL:+;15]I!!0E @@.X̵Y+u:[M=ZoIggF2Y.e}'& Ghgk5=F OmՖ)Zc~wu_~6)~8\'\\@@X!И 1jmBy^GE #rֽ"3}S(S#}t܁WAv*z,5sŴe^Itλ,$'NۤG=7WA9;|wm#C I ogk `.82\{\"0힙h$Lꦁs@\[p3tsĔ?mCX杭ϖx|EN1q͵|j9y'r0's:!"3ք@tNM 3< k'-d/zv{KYXұTY{O;R%}~VEk]"_/jǾνsպ}kN~%M1rUy}ϖkֱOO<^eι8?;LK!!8.Z9Lbln: 9-0zKek~T{WN+6uޫ㔬yꆯpT[z< x]1fr \1^8NR#=ε<7i͹ZSf0E%y!H1~!; `Erᢖo`ϕ!߅~ޓ3֙-?iܷgS֍}Yԩ ,w9w_[7MootQ贝}|߯MΣ:z };۳p}$=M^yֿE)-@@\rv#<\M.;Uy.hk:SrS-:ڒG-הaM]IlKY-rp(~i}N=%?3zKMt> g=c:X?K״7\w}gaL>ɩ!dagN !gQn_?yy3biY ю)GtVo ? 
2p8~MOk~@~^IyC|"@1XW)ʒ᳤rK?mQxo,kN>guUv'>}ы6!b!)B B`D÷x5gڼgdG"_*t\#k[k̵Nںkنei-來51+5~r;Mz#iϓ+G?L=I$@܉@?+;Q!$B͌I.<mPxэ-%:{q֮%Yۖ}F6e0s5v=} uϼwm}>knGK׿WƹTn>)޹2wԳ@@~jB K3Hp_3)/\> ݽ2nSz:վno봥M[Zٔ4z=OeozP|ҫIk9mg+^|^m@ܹEmgחu|3rL)*m1~hK!C Hq+=F1ep2l=h;H/[يnWN499VǭRvRa>1(<r:IkƇ~}ּ1(6!繱5Q%Y#϶Ν[5/e{5Xi>\[K2@tG_!o#Y$ ?{l ?|F%@@I x̜I!%b+g4X˷㾼 {p_&X`Z~ОmloeYكzߪ<<ߒJ#DMA|؏q?l"c ;/Y3x |mFq *s4c Va@IDAT|cz9cN-P/29#-E&(dC B >}8m`tݫ1zLi[9IJ\cE_ej/I7}ygW[+m:5[Z~V;1OҷG7c9x>^GO9қ|@L1&7B %Ő(sF,(BHΖ9Εu_zot+'.mig)Jok۾߾ 6zf7Ny'VL拭cOJf{1KIm%.?{S:Sϒ!^ sdr}}ߎҾ6~4KK/ ms> 0k&|Uҗ9%9wL|'Bȯ=.s} ?̌I!}'}ù?q,0@{MIJpTGGѻ9}ñ.}6eL[Un8l3e}6MuħIMkP8"oϹ Oeڡ |}z|ycLC{DyĞ}M;^ٽBcFl4~П]%zM {[Y\k2bu@I x̎I!%YgPY1*z!$B Btϟ}{Gt )km'HwI!vqϳLe<{yϣñ_/?km"O5gZjyX%rZG;Kly+C I!!М rQg͕HO}X,FפdQl}țjv?y&K,cٵR:)}~!rTlsN 5%vxP|Ϲ\f#|<wF85oٳL{!Z(q"%>/΅̳5}̘ Q} ƾS|$;֥J~ F5d>rlO|C@>m+x<|!廲=zo< ޜGG|IKN92{蒺$ݣ=w~+G/uU}\@KXe^#;B^|fǤc9>,L~p>xmmmc}(;+yM_:?P.?3߱ejA~3ý-;lar` t()T8~g Y/>ss>kt]ûP8!p"vZ6,B&'!=,(-_I7Bpu7Gd>.ADopO&ܧ)H?̣5| Y%{,[Eȭz\ _Ӷ?籵QA3ۤ@%B +g1ͭm-Qܷ^Soigg#5\s[W|c۷A9ךC>hD&lW;9ݜTvJ֙<ۯ.C>Yzsl"o ymGvI;g.y- 0ʔy $(C pYte_ulZ -Y.,fo9uLlZ>NaCK?ǾA/ʢמxOxrs<ԤkQ;eǘ;l9A!?;3y=\ϑq/+w6C~x_E8#A8̍q%1}\Og9[&u0{J^2ƘDrx ғ#*p]XhTD5&0i*|C:R܈ϽySAn(jgɵ筁GtrTvIi]dp>֋mΉkv]1/ϼxVaij5i77L@W@NRRF%Z-~m]8:c}*py\q l4ndf|t-2O{fPൕ]yh.?ʅd]ݲ\~#_LKc_5Hs m;XfPvy'(#@H -eAk3&"ɍۜuLYK-og>IA;åF0aDKywrLAo*Z3i3+R 3Evx0,pwsQc\-+-)Gv@xkR~-&/͖#J`[b]P#6ׇMB /8z'=8M] \&Ǒ ߠ.XeOhc ء]2~ <5} :AE#6gll J>~[k)DYsckS3਍o09k ba|Oꏀsqab-osJc94GpXnk<[5mhm{Y!Ƿ3ezTEG\$~Twk| S>[H1tebp8#X=Cte@29=6a]mt 8@pN+!Mލ2<~)A*!THvY&7n=t<{C R2KKg^~ݞLSmg} 6}{1~%wD>j ?C꽧gK/B B`@sd(7,6I=.]a dMmZI[e7p ?uLqv 8ی^}`҇Ѝ7T<|x?e/=<.~4(5G_ps 67Ĥv澻X ( $Xut@E26ru!Q4?7$sF<t nQ.s⯴sFБs_V~|v̫0?0ύmc<7Ƕ>B Bx|~›`3Gr|F&x1|0ZmgnņMH>g'7s}sf^2S_I,.E=1`l .zyx8~Rqdp,_:79IssQ!!$؊lJFA6q+)ؼ >R߷M6@/7nNC+ǽcۗ0z>'-P\wljqyxo0x{n̢o@xK &x|]X7(o WvCϙ`{f\޴ġ}W9rstOK{ʦoHg=U>ۋѷ^y36wUrn$ODק!!$Xe$*nٸY0c "7G[6Ecxv]7x,m@'Z&'8q{CIukB^[TV7fxc)9xYcqIGimĶ7z!!n h`Wh J\EX,ʾ`<ʼq'@ܠutiΝ#9RB B B>3pSƂ7;ƒMws*XV9!7P{cs[e|}9ws8[,AC<3k^Y+)K'=2k 0.h׵ow~81!!!p@v 
Xؠ!3!0l6>(`6;l[,%'}_sq@.CrMB7myd2ELcٹ}f?Un^H׾w!!!@٥f XI#mֆp&ַw;ʐԁ6j }wk_sZN|$̨tas_5kyԷm~לGdp7军|l\!`?)T|#>}MC B I gk:@^h8A4 $p_OѰl`nЖr\6Q_)ŵv3[7ګ^3%O ڐqC ^ʠ!] Җs g'3y-Kz.|ȹ (|ǞK#@@@x_U\emǝժOl > 5LpoXc֠zF{VxoaCߐ53?"_!LK<#q_F8ok _$}jxĿk8@'N3',sM 8!!6td tT]7}9אݖu1;R @ahX~i;Ϙ73g)%ip ǚ`uA%bמ!'A1#ox>9~7y!!m $ؖo?@p,7\0-&Wn(& \e QG⺺Ŷ%ʭ teDԜ8ebs9Csnl6S!aߑchGxk}.!!!0O y6yr,hYriΦBi双''ltnV).pÏ=|_!a#8'#SR F^7v=ϷYrbJ3QSe!!!Ж@mF Y6Yް#!eAk6e@VFq\.@C >#0\Ѿ)>G63yOٻʱ%9Eѥb.0"$B B %c}. 0ޘ \j fq8(2O:9|k NlL[}u͘,65qo}8#|?&Xu%v~MJ!!!7 !!")@X hܝs-v 20!b;)ʍ`-Z2QZogT#{焕ͤX,3#嘓,bhJ Ǧx#\k6wؒ8Qsshw c0c>"ɯ9Mz36y-@ݚo,R ٟC?<||97+Kq ;{^@@'qpSzoԾ! $6sK?][kc Y[Ҟ:[Po.`@>Oz D9'c\d9`Ǚ8IGLyGLN^%8q^\G$r$x|!1x6slΎ&!{˸Z~^e͜ ee=o BQΝD-o >j'y^eHߚ('l̷67o5_`l}g}cOk#?B B N Ƿ{/7c΢o a}G $Z=68[W] <6:{ ˥"v2r&  ;ޣ.׾C: (ZnLeF]tX꟥)?%#y!p>ïзſcvS>B B B83`ɢM#I!6$[iwMRyhٰ>lę\Q";>~˷ͨמo)eո/)a'a#g$؏.[زRvD*ZhcZȼʶ 7H rM(<9{9lHW>c1xlc568J~$'ș@iO@k5h04"UpѶVNʫ}V-PǠc nmF>~;8B B <Ѣ!4f MGttE 86mW&t 9sϊb$'d,[d(k;Rq4XQ|{`dȂ)F{!}  ۷wh6蓲#j".ikr8F cRslRYe9Q|OIt^ Zgl[m~a ;)vs_)75MER2X6=ƁE꒏=̧qMᱶetȿn~|\?x|~ $oD  437GD܂@GS{yJ׵ynl׶A91(U&7@R3zs.۠M8(if}VU؆܇:n*朶F @Glf[~+c\4$0loJsr=861r.)ٴWGdL}cY>)goD9|hofI>A~cj <1nڛ!!!D <>WcOՍχ\9$XDBVU2e`yDpTýR7pEʐ f*T;F?$hHZ7By!!m $ؖo_HZ6NfBtT@jTn s@^\PQy{hw:_̠#U_ ?7ޗC|Tʉ!:!!!p@v# ,4;﨨1x[ly@#jN)?9ۗx7G.2r?+"xB/ekW+ȷ{糿qM<7H RY`LoGg)Bdffq G6sl3Ǒ hc>E#EB'&>TŐ?u*ߟM騮9@mC 8Zm9D^@@<@O0I\307!C{7~Cwg~ Q|)787%o\nv xp*N0fs|ΰͶJ;C^ל1^ڱ<B B H=p6,2󳚋: ͆Yw 0'GI~cQ ucpIKo͡v,V u6Eʾl}>p5mP'di~!PsU"!u!!!!pOC&5 ̳ɧGG#֘GcM\_9Ξ{.B B ~%LnZx|HƌG n| Ĕum ϶"mg\ qm~  ~WA_8ME SyD'vÙT"?k֜9DT'ZLG:N:;j@@@<ߓMSiTylfK=hFzNRl] n菴s66`M[wOؤY6җ wFp>A?d,{D90!ybD@@tN ;('Taj@ko2HM523*R|y ,>BC3 tJEåt=0$aםB`Gaӕ':+uK!5-&侇1^9@@$^~.2x| l jl >ƗAEi<̳Z0b,4puD;eJ!?J=eLLe28TDz='uL9Z[Q?G>)–!!!+epC:4f<-FM!T3x(S\ ,]ZEwca8W&d~뗻 f-=]eݞvS'J3{0Rmx+oDxpDKkAEM<[X;+(_؆$$?T-q[r(x}[C>kl-V V}R>B B B Ƿ 4&]C` `PC ,OFô7Єl[PbS z[P@F׃\D :Wԛs ̓}xn, ?x< a!pqPe{LMa}6]"稌5 2 k]2I3i|lo}C9fPvξ( iM  o=>3|QkD@@@7x+Hm~&d#/ Al8=͞2L) u 2Ԡ"K>@=.'uzWıd9бu৕-{_Yc<%C B B 
s(<'<t%iCd;:]cd(=$l%w{6|:[8l06s+g2vR@@@[ <[P]ilSJ)+7gn,[֛z^#6HKz 6|(\d <tkq =\ |>4Pg-)OXGi6ZJ-0ڠ-˙yn9Q<B B *H"̈7QF=0P<'sa:,)e*̶&'E}X\?- 60~iy[s)k\ֽEN{^1GO}=`lc $mx;܈fs~y0i*HϹ&LO8.>Zflὶ;q妎[o&a,[:ˇs}ڞ/\%I&A9Φ2iޑs_em՗'=vĮԽ+%|&@@ܗ@hC6Zn1 `8 Ye`òSy>v6~):e׌5>Sgs?qdqmm`sOKc}Ȅd!wJ&}~5:+K b|vOϥN@@#>nu3'3l673+?60:6l9vν\SYolz$cVgzkSYF9=탌z& .E0P ~+ Τ5]ֳ?)~mQƶʤE+\w2K!!!p? <ޯϢ.4W):=mO}멅Nɤz TdgAci)}w\K{ #x 3=> u=[h:%踅^&?2ossm#/B B N Ƿ{[2/skS A q=6řu(A^$|Izet^Z*} p\<ʖJ֧.N]Sgg߯=^|D .zgk1V 0Jq?ԬG_{!!!c|U <%fsUblLIguAI j <䣖/| A:~ߦR~[Se+fi+06 xӗd%n.ob&@`XAA#8|M&6^wI*I [=ϠsgAUSz/C B B; B` @cF9?;'>S>'f)6Tg+vDߝrid%|ҹqe x֡1g;rJm6Y$o#Ԡ=_ (Yn,V~C_^3}<1sj 2-}ؕj)ۚgjܴn;C B B !c?}M.$ fCFBt\Fq;tXM}i ]44ԟv7#}~~N}s. /&\ux1!!]H౫2W`Fbu6q|65AV"xn_趃sYր-2}/eA5}n :!2ggB B B '} :#|sNso,~9[!DO^ |_~ʾC50} #߄ʾF'ڻqxfi+B B B? <'Ѩ,P0y)o bݮ6h%`ueQڇg-G@XE艀sQ|^.!!!pco@ёjm\o`rT|)*o :n ܙ>+QOnǖ>~U|qjj~@@~ 8(o64+BAz3Ux\)LEogڮ =cܴ7D@`}ggIj-WǾ`<?\, H}*pc˦ gEH UY#(Dy.:8wCc̙m˱ sO>et<+9@qXL  $(C ~ R|!JwJ6TscFk7D2(Єâ  h$C8h7vg H!gy K@5ul'ܩ!=B)Xm oxk[A@[ 9jqnVjcAv2>.& @++܅#Gr۸3q1 nyp˗[Xn>l> ۫=[@66oXۈ2ֳ=>nwvAL# x:z@cۉ¶e:6-щu~`*Sh Zm+r11qqQWW l9j @ 7rB?l91Cw̬, 2F^*IV9~3/:mWG3Oynr׎3> -P_{ @6mE:=ۉyguR|b~&Yl ,-2,G/?ekd}Je񑯏r}I޷Xs,].$@/hyr?}= @ x*|ɷt@?9ß{:M0J8 ^oRKc0ckQ~Bڌ2zَ2&Т@s^& О౽h ;q6܍ 2F .8J@MYZkۭQ.V q09 Z.Y޲@n#-S @6mA+f,Onn] q%,kn"Mzz}Y7=ʹg`-s r{;ݗS @)SdvdI';,91vӋ+Y^fsMZ}zQ/ζcx:ze`!U#m8 @McuѪ~HYTX g, c_sLx^iGu8:NK~*<9v܏2?~F,z>9/C' x<ڞ&5SIag<%:KaW19&<.^3?ha /y{-7:{9m s޳;^C@𸇢mX)O'+7?YgM'_}NYskܮeDXF˚}c6'f;k @`dշ&' MFedd%{Z[4qIDATcc9{щxie,8?=WK{l;!tYnvdcz;Ϩ}'P㱎lO @@+] h5~hzNIaTD !mI(|\+~{  @9s]X +R3^ cRq[xn1@Sٚлi-mU c.z͇""@<^on>/%IjϘ̓5+J2}}5ڣ63l״\=hE,Ǖ`Y cfwoވ @S .η*?PZ* !Ku:7C,;sTX5G+pc6O+j8O! 
xOCrt @AVl:=-z>aB- [XҦ @|:&Pa;'N+ma|Іe|eqۆ^>kLzLwP e칪i2hH@P14R ' zj5ϲNݳ-Լ•:lM7 /e Rr;S캎tvz~Nؘ60WS3qE3zyN# xzڡ@,W)鰈+Р{uXJt!mpee[;<'U!T넃Ylfz@ך~u-ӦWkʋ5/¹; P@ōݴd[٣5oW8 g$r׭](.{꠹1ÛS;;z5|\#Vr1#v,s d\g?s]o  @Qǣdm9 Ȓ';^Գf^QBZr :/βe!5jv{S8:_c%mypf';YCA@M8KLONRN~f\cN2cuzU>Blzױ>9NR,nh"m繥66}S$ q7J"p@N r)8=Uj "I-944>^'I͎:!C쟦&YX,j9ݷN p'P:s @=m˓B4*Lpt}eɾjBu8Gs=YgPM=>:fzײ?~-8y4> 0qg!^ҽ 3{*3jލdGxڮ|;>jM/a֣r[uYW`˫?>yv|?kSw<{ض["@:7Ɇ+4~5zG \8W6c׷ tQc:n2~{*h4t, xxN^N~s{O'dh,WY' FjIhlOKhu,Wg eߖcZ;1xZ|(?M ~\7 @ !t$!t>dL&*:_T1't"p,cꪓra~\n)o;y|g0/N}l)y~{WA  @G.d7 מԥv rIasǠcٟKR{ƁP`/0ݻ֪=dKgTp} @\'0b SPu^,{šX4zMS ܸwX-Viw= z{nzH<^nNI''ĮBzX۲>񓽵p XɸlHоZ1cm8Q[/fI& x|˽ t)P'GFKu?eNS|??Vjڤ.-,# m|GZ}zgjxzD \q=];&άZJ%qfks xH㱐103z R,֭֬]\T2H @/A@o [rR$7L&P')WGIjV.ׯ_Yj>dW]ʺW @k@DN",F,P|ZaLmҀQI{Z{yϲ[,O л j?jFD6$ dmR c?'潟>. QB:F 3FMhq*G@O%u$~;@` kH4BX=6ں>XݣC @+v>h]T`zvJVAբQCcR!c*{z'xZ@f,c*㶥dq#[ DGm*i6P,W.tbE6ZqˀW MwmlB  @tvH`|$ :VϮeku>F c1cxx}G@ُ_~ԫQXA  @ǃmfMf /?"7|1ư"Y.L @FF8#@u'>y97} {mGGXfL[Fƿu|lH @,YCMN~t΀hI 1yLEb^htV=b~%>Y) pW[%@Dfq*NwÝGF 3rf\uܛ:3"@oY*:NoxкG<6Z"0@p8Sk uxg3k㳟ۃ{)0z1;_@f<#m[v㼕DU> 0q$Ҭ|d1֕jQG{yu5ُY2&g1I=݆  @vI::a2ke{.PcIz%̩8>۵p1c6xlK,l_ ЎౝZh O*=rf-'msꗰf4Wuf  @dzM53M|@ KfEFTvA{;^fk3:V5SX#ݔc]ڮ @g ?huҐ{A6ۤ@N'D2,QK?.O qe[;be(HGw㈡f @ ۯ B cNf8i^.THgG9r~bg ?><α\ @wy, T - K,kI蘯|t\?jn @1WX$DB'm+ݭ _Z %ǛN?f/=W>[U?X @@+V*\*`&0_BntAS =2[Xc x ;ֱ{"v=~u|^V k @^L+ĜbNOִag,h <ܮm+:fcWq_r3 1W @穵 RfݝTu/ ZU,y'$eT=n Vqu{,_κm @@+V*4%Ii<@O^0tǏ:ނ UW*z- {'_˥5ab>]>u @ǫ+`4-{?2;qC$XPܒ_KmILpU[XR/gWU~  @ ]US`hLm_ѽ"|<9F,?ɐ J Аౡbh m į^& ۮ,k8V+RJ W-Hg# @G > f@sJMF xrl-Cn 66`c7;֔2L3sLN4F9 @JJ(w#@R 'v:B QCQuƘhm?ִS9gYv.rJÚ8C@x} 0@N31KNeVɍAsYFgǃK~x25qҶLlsp3KŚ8Z@x 02|%Y='S ;9KX=ZxLe+L|ucc遭 @ G p@V֪'xު۬ xؚqRڑRCs~ giK @1c\m jI_rmɽ?Heq#|h ߧv#q @8qL=9hG?*Ȟ9t!pCi1=ni=Gϣ @s8  d^vn'm_>m2Vdd~we eyp[M\fQ @y !R)9Zg29ƜK1_/֞jf>T9m%@>'>._^69fЩuBE" xlABȩ Ԭ׹5>sQc\A֖+xn' @@6)P'FN u/3}Ծ#XHrsO9:xJl3g_ @=m,* -PN@* OM[m,, н"` <^KU |X\ y8O͛Ǐ  @<,j&V^ *Z\s=؟:沿:[]C% xKv @`Z sN7z6jhvhV8h]ˬw46A* x< @`4̪1N G~֙1(!]#V  @)ʬ @Zːc}> ly+ QǼcBQ_F6 @={~M 
T`Ǒy_]yT-sS-!@^< 0rG <[?u{UNo_?  @9s]  @E *SPGm uf,2򏎴5ko0@ @ @R 'YX_zPfUZB0ִ99ǽF @g Զ/N '5K.:vW?\in1:2H۪5v  @@'N  p@k0;\&Q׾n+?K6x @ IM KȖ:l\,c+&siGϴGcm8U@x* @@B6u^ۗp'"ܹ1L{Q~r=U!/_~߸Q @:=z>,51ϏWܗh]@zXØ2`JjxMG5e݀;L#Pc%>NSv%@Y@3 @@ L6kZ|I'+ @V WS#*P/,,f:\|"|uNf>Z @X/ x\o ЙY.PgcE$ @f͖F @ k;9j,^?k @mmnE ZE%a=%@U@8k ,jC]Z/tPa4 @Yc0X+ko2|~[= @`&LW ( z7إ?awBnt`4 @c3xE o%t- ze;KU [azO 0qj+Xr̬| )n'>vR($@T@x) @+1K&u 3=)o_ @{v @`beS ^ZW ,ǥxU @ec6L. v"3:[\!+*` @- [ @`RA|u,+|s5V  @I^  вrYœ3첒Z`9^3>>Z @/Q@dcBkI cMJev+a @- [ @`b&Ac޿1ytq˭mXG p4j;"@gc1n'В^-UG[ @<^%o pFh@,Sk1$  @f<\}}'@:ao ܇mR @\@y5 =ލ޿ j@, 3}  @f<l @ / @geˬ{X+۷]k܏ @`DU'4"1H49 ^n} @) xl.ZEdɇX&}Mg @<<>Rq% t|σ1 @ I@̓ @@߿ [} :Y%I @(nG ']^bIS( 3G). @@+qO lpo*x~1\t @%$ A 0@~=NU @^Y|̍ @Dlj @yGKš @`,O @TvF Рh п@^b;Voܺ$  @fr# @ @V/(  @L+ x:Nx:F} @f<>x((p}C.7 @C@ @8fIxՙh!@ @sFAL ,pL~ ]$@U@+ @= ,Oٍ驪J ЊౕJh p2l31_uacX @& xQ @@Ge31_ƥ @< @@߾}ٺQƟ< @]@;  @W Ac'AcxcpA @0a6L%poƄyƯ_ j @]@8 Щf4 ;-f @ # x:B2V_  @}I+  @@ C]e2es|=[*8>r; @>}I+  @@:&4LQ,SA2lG @**&w"@ dFcBƏen4 @' xzDۇAcfLYB @|j @-eXbrƷ= @@Ǯ˧ @<:/Β2φ~B @:L*8t+IENDB`HINGE-0.5.0/misc/High_level_overview.png000066400000000000000000001435311314415550300200130ustar00rootroot00000000000000PNG  IHDR%k iCCPICC ProfileHXSBB D@JM^t*vdQ"+"*EE"Ƃ.lI躯||˙3g3w , U /a&&%3IbE ' Fwa4+\@ r9k|o0+_ ᷐UP De%T[Kcb}!@Xt$tGAٚ!o`q !Oə Y 4<˙:JcXF Ysߖ-Cj0(Z2f8ofJjGU q! g9 PaBs2DYq#lJx4;©™#~vDHeQnjƤ!Õ*̈MD; x weń_1#EK4B~& `9y¬,i_ꐽ3bdmDn^bب_pq#0|GږFnv`ly1m/&a&+$J{'ȏiq|`,`&~KVX@XxF[$Hk o ˘Wvii,r{nxzb;.혊~ blLΆEx \8:#oϝ/y!|g yLoE2l L[kG$w CFrp)o>G@g^8 d>\r! P@S8&[܀! Ă$0zȁgy`1(e`5X{ p\F/x;0  !tDE qF<$ F$#"d)Cʑ*dRAN C^#P 6jNDQo4Eh.Z+J݃6' 5T@1c LĜ1_,K0!+*Zl Lcq"NǙ%\Axs ߅7>%Z +!HH'"*; KxG$D|7ĹFb;8H"4H$wR$E'6.zId]-9L擋c!9%9#9WH9Ur;Z. 
Q)&wJ,%RIG9EKy#///"?Y'HR~Y*Ts/u*UD]ISoQh4c-O[Iݧ}P+X)+p*T+4)\Vx(h8]PB%~%9%c%_%j#J76ʑ9+w+S~BR1VWlW9 t6} }WjZWKu@ME^-^mZQ51c3ٌUOy[>n߸ޫWR窗7_SXѬqO4ל9KFfxnK[ 2׊֚]֠v@{I~N:c:}t]]:ϙjLof6 m7я/oԿg@1p6H3Xga0`kn8ϰ𶑜Q3FM7?3Q7 6)4i0kJ3455jF4s62lm;gW_@--x-z&&LOpÒjmY``afUdlrk&:z"V׶lj۫v4v-v-57K:8:9 99:8mr eKGWG|Yev{6dwҎIYLbO=OgC//Nfޙ{_X}uuWHhtD Zt#X;\<2?3Z0̊upls,0\܎yz{0{Ȃ /]hbſY]XxQj(Q(Xt2|oYr-唞/.(6?W<2me*U5\\MJ׽]?c -(Dĕa- 7*ZOu&M7|ƫf-e[>mm-p[SqmvOv8/;5wRǯYT_[kAзg~{[Yh,?5Btt>ѡMK9M▤#!G:ZZf[]^[QQ>^x|]"ģwN&9T詳N<}Ym\9|ㅦ.ǮKNZ][{&y+_p-Z7oL!ɹVW nYtpҽZk0Q(>Ň1̧OOf}&}bk׻9% `iiGQΌReg4Kq{X S]X4;[Y.*<>  /;[}#=V+ u?ڿjmv>B* pHYs%%IR$iTXtXML:com.adobe.xmp 1796 276 6$ iDOT(bCA/@IDATxkŕ] 6lb%E &!! !H*0DZ,n?,ΚG+mh7*\'opV1Az7'&"!l7+O1}Lmg{W]էNNMUwS'! @ @ @ $p + @ @ @@A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @ @ @& 6.U @ @ @A @8zhذaCp"'p|%~aΜ9D@ @h 1@Oo ַ_j@`Ho~sjLU!@ Lgsj @YmE0 {CP@ @hͰZ@&)2ǧ @T!8U榲 !8\H 0,8E @A`3 Cp"H% )$Cp N!@ "CpMe!@%dyܑ@,鲤@ @`,{R@X 0u`gzOֲjyÆ y5@K@C@WnΜ9Sqj @#Cu&Ca@@s 0 <3K/݉6!WZ8ġgCQo8-O7xcַe:7 @@c liP  >L47ftM7;3+nꫯ~a֭=ٲزt^ }YS%̟?5֭ =XG__{キK..ԐE2!@ @e JN>@ 9r  L.&a[,rǎaʕ]&u0sOX~}aɕI?f٩ڣ% %0ME9 "CpIm @c%5ɍh2CP|TNĆG_~9,]Aazf٩R@uྦྷnȃ : & @SN k6:{ ˗/O/ӟ~m۶[ou=êUe]TpNСCg v z'Ns ;BVb~%K:uYeWpȑ3gN;wnW,wd9?xgH|a|..yaM?=z[䕘esH `<  fC  @ dM`rc\pLS:e6m 7pCe_χ|3$1O/q^q# ;묳e˖IyL:DLYʖ;† bOF~m^{mxJ$;1D~zH` @C%Cpx@.7u8֭[6oޜV(q{ŋȇ>{Ղ7n v[W^yexGe:Vf'eέ{G?{l6]⋝2N=_:ilݺK_ٲeKtO`r: !2`CP VB7Ͱ):Pj%\bEWe3Ɏiv}P3VʕⴺKÇ; URT|pghR[ç?8u]q2Vˮ,kleر#|k_NW9"C^K/ᄏDeOȕU|9&A^g_z"$@?=LND aUdB 6aC70wL#LV^o|Y{MȊipA'!,=Pki|[#H׫@#;Da֬Y,!#iES)|O?o޼>R]=yҥKt[=>*\;}{u g[8'8N 0"#M1 P+D @` pc {ԠZw*K3g7 ,:;G zv"Cی|oѢEiz՘ib_rNl5աNQ%d9^o1mEJlYy'9/)w?i4)@%y 0N8I!@Ff;jk^}֮].Hߴ{ꩧUC _Bzgy&\z饩XĶ7u5X19rlPY3`zZҽ^1`Ymցߍ-18v @<:A@S le  B7hTU+qL˿KXfM֭ 7oN|ί4:vX;_WNZF8WiOo, );IY'@?]?S$B >cJ L najG>*w 
ٓ:ֺFvGwFr<ꫯ[obItՃZx':o߾=]9|NԥH'-/ue|?pܹ=Y-kT2D ևp Oo]ұS]$@`8觇 0\8 "q3@sOX~}T;v+WNKrm۶-|aΝ%% ݻww}0KjRC0!o7*`>YiʻH߬9>ӣ:}Xխ/СCaUtw]A|֥UrUɲiV&;՞Eں=3D5΁ 0:ӣcMI PD @` pc܌&Џc k׮_ޮcc>r쩧Ҥ3<3X_w3fhꪫ*rM7;ShsUzkظqcGu"ɁcǎBԗm۶իWwd?[FzD|}!ybOkKANtlc&<{~򓟄~8׿Nk{gO|+ׇk ٷo_8zhxwgcj @?# T&C22N @ 7YdF{|h*ӥ__)\sMyoʴN$;D_*i2dAO~şdxpС0|U:imv*+҅cҨ&28oᆮyYo-T嬳 jQnPA @Ƹ6 !sމB6mLXZee58f̘~_K;Ieڳ>%GrL]V:9/Oq|J E\e={v>:^cs8J~zKJ;v+W^_}{:-'1Ǝm='ؒ@&cs+6ly'QW@M ܯ|+9sIutn;CP{ʱ,P~3|SݞoycNCIk%\q`eR![v#|CPV:|y:Ϙ(=OI~v'[z1@'ШT Co ַ1U ~glE pc|8;v,ȫ/w+@TmI%M?,bfc\'S'|R_~yؾ}{9;nv j+d)W~zՇ R8[j8Ԇ 0jLx;ƸvCk@`zOةvC7tq{a]*mٲ%\wuc.̜9P<S&ȃ!DN;-adyxöm[o9OV ]vYcȑ#ѣi~i|;9]tQ:Eg}6$0\ùDy睗`LL?<<A,p;evN_ O|(@`j ZSq@kZssc<@[ORةvCKoo~{ _җR: +%o'8y!y]gxt7nڴ)p ~,"B\~}O])+Dew:*+-|K8Oܼ zLf/A%2[ɵ-5 0dyu@6X:~k4 S;42_)+͛חhY'+mHCy={Ӥ{McUv+VHʪŋrI7:u;gZZjG_cӇ7nݗ}ў<Ĝ\tiNN~D@ItڝZC$M&JD@c$@?=FN`8[hQZ/=X1X։ە81[]'1gr7|swÃ>(U}شcAyeիʓYx%t^*:W[=.Gy я~]Xc]~;}Fh @@4X:B@ CӃE~QT;eV޲e:k1}D>:L3τK/4Zʾ}’%KҴn)y~l^5^9ĩQ^cxSsz'C=ToP?Y9c<=&=䓺ʷO=ԮcYW";'@?zR@SITJC&M6R ~8 O~SyVm)ꪫ*XGYzbG:7c--NVu]ثIxŜez;ܲS!W\z7gíi-O2m^yٳpرve`[k: 0pN< ^2Hy @`Azة:6q}kצRibɈ_g";}0k֬ Wz}=’~uSO=>O ;!/|!kR嵣y+ U'yly_m;#lذAvB4/vmj+C-  48f 0f،-/r> 0ab; _!X+CwV\u>]f͚4Dpk7׭[6oޜ_zycb+[c+ my1WC䍕ñ !Xg@hLy^9S Ood'ckA9W+C 5uiEeҌ3t7Ĝj ܺukǯ"N2$ ȫ%Sb9/K}>dkx≎۷oY,lye{éVnP!8M pc̓jh?x Oxa,B-~e_s2~ov7o^}+2JFW矕UW6mۂ|qΝՊoˋ _g,MքWZ!@ip6"@`Z^<D}@` OةvTKYhѢTL!&X/vVӽ#:đz4 /?t_"Ē;M+L:t(̟?b:l/_]*{m%@?Vˡ7 &CpO!Ʀ *TEV@c @?=}S.\ OO*h?Uh Oy?%b~7n-U*Wdu]in ҟ@͹$PXvR=hs?}zNoc[CIM#znwV\iuYl>\|PGš5k}++ -X_1\*H^~&ẏa%Kte<8bm۶իW2ˬ(L3iXJ@h,dKH7er M>ݣcLc2yJ?Yv"Nx4+.Xl8\x ]yؙ,6$ &``2ZM6uVs)\ŋW>Z鶾;k`T^r>Ouv;wjR׶ 6R4hF'G{ kcWΜ93EOK}+aٲe]ى1<ΝLuuk;i~mS?\?5[qpmu%]{:ۑ'+Fmg>|A{8۝Wˁ}q^RK.$ Y:rr]h<ͣ[?9ro<^巜ec@ًN_&}Zal#UV":;ebl{._5@+RRٵnonmF|N Ø6Po 24iy24E^)]iEpX~}Wv&@L+D'|2/ۻ&Ì3}ǾE~g{#>=I*m;6l ÿ24q[n%7WJ##K6'duܵj[?=n^*;|3}7k; Ν;9lR8tPH ɛ(A^>.=n]DϮt5-۱>s:7&2<c; ɋ-ʼFyȄ 0LrM塯"p`!<|LK) g}6$+BZw..Bz}*\=z7gΜ R~wd$w[o??|2)ziYf"}URKK| 
zY}q;/~4YFg;|.&"xöm:$jժpeUA0b{o~ge˖77_.X+O֎NgA9YE]1f+u^T?)K~OױA&k\(߹~k"Ym.nQ~۽R^jk׼f~7ebz`fm@X'yI,-wWda^k.=廡Pw;1c޺?)mk*cV"WCl ö-} :kTmJ_yxIzלe1hUo\^^((E^mNy0]H>v# >yS4MxArgo~q-;ݣV+Jk7xtC^Fsp m]m+wope.VfPlĺ̏/]bU>t?+W&cA8Yg-z!]D\/&Ol5朓dK ,ζ.U~)r-/_/u 6oݜl]l-A҇{Q]t+B~t\g;:6!h C vM;bc;s%bcl]O.6VqӸ>*jM[qH*cMC=[wmusl/*OXb%2Fe9b+ڭ8kӥ tW봼zh{_"1#2qBml< R acJlFl7x;>-l|HPc@mle֣{_)}g>D  UOnN<;"2Q 2q'bE6]u8[]:g=uYYfgX}E_)L}*ҿU%m'àGa⏉6+׬77b=H{y~1rǵ,v\PnvMXŏ^_ߧg=cϓ"7PUrn]O=aC3Ifׯp2|@߯pv<|MߴY۲vcpO#:D:Zyq֦ݔc4[#zW?lzڠ qeM N9A `  _[Mg,N,-@/7g#8 8(}_&b2A#*TF~Ƨ{S2IჽXMlt/Vݗsh]iFׯ^{}Lrlث6cOk=c󓢚7o+O[bVyw_",d+,bj0ߋ˴I۶0ڲqE o/CׯMOE7ַ)+z>315+~]d[VxU;YK=s'[y'v=`dQ\yT+ώNMvbM N9A X1)]ms5nͼtP{@ ~u!X@;=*2ed>Kϳ.F&6DMY^'l'$ݯ:TI;V_h#`–L O[EnQ bE+ˍ9pdtdr"rS*`8ybʳ}{EVXbm2v̞cywܶ^xӴ9J`V_Xg?s@M|,$idz끖+[{lybo>_-v_h>ߜE(M^9zJ=ڴyiGի)[Nl) 66@`jFi&`f^'`md ϲ~5!X@;<,*q+(6k2CĆV&Mlf}eĭL?%<{9eՂLEy`zVP~rOPj_!_wnV={.xj!}'}OQ}[v+ef[nOEG?r}޼1M2hE:wrbȵWQ>UGnlrȫyiGի)[ak~q@Fr@ @? ;{%E>@iN &BGN9hcR'ܴiS2ӋI8ìYvҥZx'֭[{O&.B>sαd$21@'v:Qv6o'>@ 9+VdXyLL9&8E?Z7֙A%k&bѿFQEŠ+eR^mYdeRtnV-WfկHMuQHjab&,lls2Կ(0/eRa4h;oǩeo7ߤx`߿cY=(H[խaeپʗ)u啲bNJ3v\[z>̶[dkS%^˴2^4-,mϤꍝ&ղ $j8IW&Deh B)BJ^tWUV̾H}m(J׼^7}5_ԾbѾT^z7T>WU;)U]c_=}-C&+7b;3Ê+r-IO&5ҥKcY :t(̟?+_꠫c?0"iIDAT ɱNVf=6L0U`ov3ڗ2=rmh??M ez k׮VȮ>S [YE#zgڴ-V=fy1?Vk2[++S*ekusiZ-/MQjʶ tSXS4N @t1&^M@U%bc I"(S_!hWdV*IVhu++^H2֕?Y ]enT7yJ[^ejX?_V͖%qJL[2WYe,ڲeK YWpN0 NVXz,//"(ߠmgYmY=8tM-*2}kd׮]Q>o4SɃVVQ[EV-6~Kd̶2-?hGyUG2^:XGꔡ}Y'KF A@eC62'D@o->J>D a:$?kP+; +t-KOl'Aty=,'g29&rET[0[w!p+ 2'NL䏽HK,9&׿~4Ko#`nba!/M˪Y6L1c2A]W;;۬[/om/~4_E_uhuö~O˔m]yl偈f=h9{|˫*s!cQۈNEM.U6o^,[^:z5emoz  f~oZxN@i~!X@OԹBNBA-:>6gen'oh/˫[1=,}eT˒ml5ЭoܼzhY9YqOҎbNE[?_ROϯe_-o4OݿY4ikJ?)[zܗƩe@ȱߧtVg*BPd3 >C]y;hی<l<-KG9Rfy:EF(?hGylZfYiGի)['X8|wdL?y^Km`̛I8H,be:b^&=P+/W7_W+yH-oc6OLah\tc]H1q ind/CIX_ԱLS}>y{Dv_M~کg2}oǡ eK}׏| P8T7ߵǸ2+/~8?zՙ7K9K*m)MbcYvraq?QW6R~QAmdU͛ū봲ɗW;ZMk{&Vd\mO㪷Qz;:aae*W:]"3'=~yOͣ3C]gꇧ%!0~!X@n7U:]a肶(=;db&bo^('kEEAv,O]t[cж^m]ZHא_2c~+2+Iq%O6jc[Pǔw 8P˾![fK]4/1}\X:xNrCXvz[JyRQT-/k+:f0կNekuP^HrcYO}MhLiQQO9'7Z1hیVKIoO䍅yx9(km~ 
!}`^W\y\HYy̳h[츬p No'V]t[*mJAڽWTϪ$ v򚲵}7E6Wt Zxԡfud<%L6{>ƛ[[u59l;6/)-R̙gαi!`,ȉSN9%Zԉ2ws-\0}aƌV }#Gtt\`AXdI7o^e: 䐬˖- }7 ZZ'ŋGGVWcXعsgGtrCN\޽{;6{{izl1>FHE])=QcΜ9=Qt!CL.k(^zi'pMVt=/N>iSWNgG{~ӟ=r۶6HRiAmϣñ֭ 7oJj#*A%:Y7SUƉgnT~mk?~'l!I%ж~ v4t bGUW]ObF\rI8x`[,|)y睱1eL^c֯_Kt$W=X S\LX^vlMkꬎsb啹;ӟ;LGvN1EfǜHI~{c<ӰO?]+뤽b+;Vyp`'&>l05xG! p0ꧺ ROiVEړumW26?e(J|C#nv6m$͠>s.ntɜ{l &l|<5|cƇ]&OVۥfZM+ AbkX<|1qBugd-sك. X$M/*æ-YVYMoޞmrиg;/o+f Z Og{Ҏ~OP!`N'k]''EkO^zuˬlkz'73a͚5CJn(. W3V bob<)ꩧJm2A/8{VYf_d7ynx5)[?#W^ye?+lݺ'd5ombS6]N:PʓJY۷ob;JPYQ "AVa+N'wM;ZAsPV|k_ vJyKQ^K/ᄏRq8 vQ2>-ܹ+]vu=a+ *3gvjAVF#qL_}sYǎ~ v|ydŋZݻwGꪟWW=^3"zƲKB@)h4T2%LQhbYAj50<Ǡ_ W^0_˗/OE7]TQ"*~q"!f1vN'52ģ:Lf8R4|~- J6qCKƍmݦ";[_olHVP{2]9bb6]u'Kot;![Mojo?*tҞl}517M 4Kg[c]'|GirSI?k6}+RP'ApO;YueJ-n=FS[޴uCГd2/N EZ'ӔӁ)1ՌpibPr/OSEȆ 0LO3r:gQI6>~wmñ3DNDf+r}w/Z3bT$Z=od5ߓO>%ʋɲrbL ,*:XU{=9^Mo޶yq,t9y0wܨ蘳8FLAOgP;^xVGAvU/ؾϳt"o ׆:4|,ҫx=<|V~M\I(!U:K9HU`=p饗F+V7o^kkצuzꩧ=JD<Ϭײ(/d?6 h~:L˓%_{MT{ɷ5\+ޟS li>on'>.ʾ_H7]wo!AQg\pOS;UOgZlXC%㍯o8=I!L9hG@1)hXO[#NR |NS-+kGnݺy4)o5f]^"N~MVIگ|J1˳mhu%!OgoK-: ۲u[eo{M9eTl!@C@4F?_支hʎFEq~T/gI(! :U~;QbE[n%|_~NyL nݚ^r->Hy-%lWl~[ΰ ۲1Ae*$lONe}s=yfS~Ǔ"~Q񰁗Y =2C=ZaA%:tƆQ@U,.|aً{A@|Fv"+NJEPrw3YrbB c۰aCׅJkeUqXy2<nux<ʮH7^_?,ܢ=x`K)7E:MR:i}j|8+>$ʘewȴU֠hMl9V=VfSꪧȳz)G-oN}|Ӱo»Cj@` @Cp\S E ;R"Pr؇ h?8?aWo6iOR'P։$Em">>aժU]k'$}eQeRO-?hzY|[W>}k$fY{ x@Va靥?lWe:{&y>56~`Sxʰ6dǪn={+36fyE먧YTˏ{Z4m~C]! 
P@SHO?kRnTGI LeqTӉJ3fP$ؾ}’%K*f~?'1y0w\-sСC駟wd% ^r;c喍WEMz[|yʳL[EduytPv9|9%_*ooY{x9mߧ߂B\^'>Faҥ'3=/Nsa=lP*_tPŋ;@Xe*ԭJ9~|A'a}NH_KN{Ƣ B@J 71}M[ $.E sy2ϵuQP;(8KNXz-}}}@v~Ӷ͟yF9y~Տu{SvuwUVWw82B8:T[o!Cᄀ)Vm_~ezs=7?^yU;x׆ E("#ߊ Sq?cA]g|[G|#~igmk裏=KOs4ΫOйƲxgjy:.н??#E{BU.Q^\ҹ2KS+[ɦ_~y˿EF{:rO+_%ϏI~u)G?ZLRW׫7wj۠~Mi~1i&ѸLy iۼ/GvO|~q-uĥ˒RX?Ƨ>&ʍK_ֳk׮m?U}WT^炨ley܎;DRu7a_T);%%umJ)9I}Ru}m6?a'=*o~AEQFhAe XgM|,X?cYL@'-I$`}d!oF=le oL67ʓO6}fg4Ϋ76?;+gSijPrW(`Ե^,16(S.Ɉ~98Vt|B\X.=R$kM<HOpOJXWjYV~vڸG*6Mbmf=|E:18dh/h0w t1'0FGڨ)PفsS(Tet"SS4-˞O:G5A曐v7ZMe|ϲQUѾ|8g&m>g9:iWol~td0L({\ݼ'uZ 8ƁU+~HW Nl yU'?j}{HU˺;[gY7k[Kqd/$O|~q.SAȦR}LQ6Vu(n_[UaGzƕU.X84e_G354*?q_OoY\A5+m~9.ݸX?M eӮo;XunqZVW%C*yOO Kuogu7KՠE6cE-^w˒vjF\Lm~~4Ϋ76?ePaOI}eg,׵b^}נSeIT 7i:U'g+(b Z+SQ}Eq}M뭳T) <ڔ~/Ej8o],qEiOm?~ҏyFEU71U?c"Yq{}zqۢM=*cM~Q,7˪MYV>kȎ4guǵ8xccӅaQ?x3{"s{'i/#Y.b{R'RX|Dm۪/zF4INh#_R67|76?KYdhm׏jNA?)Ш I{Q_o9Y/$`Mfez#Ic#Z/KSYK=Z&lvw>#=M>AsU"7jvWTV^ql.POԡ{eMfvmS>őԤ>Vn_kQ:0騣J9#u1dɒEHꪫuپ}{زeK)(BQUC+Wn† kݻæM٠eغuk3w^x饗I}M˗/+V㏇}O>dk:]ʳ{!f\|'.(}yЊp7 .`)Q)~_>W\qEXf͂÷m|;E+2ye'z'E?9ǒ6>Z2$&iNO~f裏:ߘ DKdO@@Ùg{k`N]{9u>ny Çûy |1P<'EOtI3=|ZJWoVZլoh4 [.6F6y_kM#Uo2駻Q-- bi&)3ii>e4I@_g3CPؙg^lj<6)T|~}^9zKG[X.zQR39Sdh&t,nY~QEnuI?R;X|;Cmۮ8~yԙiY|LxS˲s:6r/'Yutҹw3[﯅u%{O]A:E zm/~G[Gߪxv='Ŷs eY=Kf];/S\G t@r\+ߡ8yzC)H8˲[45Ner|ضh֛e ԺG]Ⲫ8/3L &))siPQ( )R rI -NvEPcٔ-u~Ȟh/T/շ>Ex?l2uU7AXڞbnoYWW&ˮԵMyܦTj"7-}':%В4_|yuUz_W>@̮6.e~pjΥMuOj'er|;2էv75UϘ`oY|SU}*[4إze5l+_ !yZ%;vr8eH>*#eLM`QO}R35sֺ{ڧ ۇ|t>R˪~-u]#|uiC̯ fg-OooԱli6ksR+MȆ:mJ.}þom}'qmo^irpP.u9-z_ۨ+:eήS vKM9n>iC&T زƽ,hmG75ڛ*zmVYW';ୗ-{, /OS~?iӿflFI^""m曑mheEE3Bu>Lm,zBA*@ol$3C~U(C˪iqeSl#93;4%Oe&e-PHǧu+__HNdJR߉ok?i׏W~cO\YVzH4tNl^<>V2]ջ䬒vI H[,J^nzS^VoYԴh^GR< :7H>u- g-Ͼ'>'el~PEy~(5U,QJC;R6s֭uLQh FM-eWЬ{?Xy^VSmwbg Iw FS{+}qY@K|׻d<[!nV\ך(ø+ea^$~ X̀OH44(S xޢ^@h>udu+խ aٟ}m}uK1) bʳmq~9/NueJoA٫HQғ떉SS}Eu/WlSVGJfbX9-K ԫ6SmMqr_+J~{77-[z eT5_:lJXzk)5{?{db/L^z@6ziXzEY6?%RA}c8xjHƲ,_ـ+Y6[Y}VeȖ|M(ϾXx8j6c}`0٪AQLRncmC.e? Z+L詒]VoS8Xݗ>TM:[Uxe.n~f:uh_EںmΫ^76?<@ؾۦ'@ >( ZtQG%뜑:I]ydɒA O?tl߾=lٲ%ߕC6p>|8,[lh6|{9Sþ}ʍ x ? 
/`]zK{Vgxɼ5J}Ia"T+:j{682SK2M(Ϯ1 [CI,#0OaY$ݻ-B:y晅;wdu|g};z{ECʆ(u:tPXre,`zeÕW^N80[[>r->Y-l۶6Xy~_J$m]HtK՗L+機ORzAh&%@m$tE@^nNUC)mK >ᢋ.ŧZvm^fhGz+`yGjw̍7|1ECٲv^;7tlayQ+3^$ӄ@'1ఎ3Jfˎڢݻíޚ?RG7xcU$˳ʾ3%ªUG8.by>hoq'{A馛¥z=@觻A-- /ffDIil`Uh0 W^y%s9CdLC‚WG2C܊Ѿڛ x~ej= WV6QG/%و:6OXn]JLM5k줓N K.՗*x_zKeה 4ɔE|SeIgj>Է"?aظqc7x??C:zcmIhܫ|Voٷ}G~h5C6 _[ԙ!(aXy\m:evXTd45Cp[/yꧻ3e; 0觻e-- F~&)3)g & {Yd|=d[P??TN:ROgko6@lliH 4¶Q볲bL?ʹu^,*XٜV'$ͷC;h;flz(-+.&GK;3Slxޱzm6{%<ـ˶S*MqX0)+_iQJΨ`yMJ=efZNչߛS򿽸'`K_6l }6Dem&@_|ǵr1k8l  vecSh]<>W8D|RWV/S;ԗ>ɲL?اŽ("v{6#`q|io}\o$@l/~GGe:g/bn}}#wzνV۶Xxk=S[YQc8XVf7Yyk6byv$02εl(-חe}/vx؇N ~u:p m;m`8рdY8 i j4TɤC}5YAvPx@q7{`CqvW#=4+3@ױ ^Ah<#קW#D 쓍і~7RADV%`m\ߗ{پدк?ـDdDP$NV_IVDzG|a+&i+>6}{UVulߝw͎k}c} !|hs%9xc^6u< qk㒛,`ϸ?آk^~Rzjj/j'2>STOJ']trpN}nW]h&m&`یͷYquXrS瞢󝎫mg,Jz B?kR_Tu?ϔQTEۚ`[ut*+4_gQ#i֜_*\agZyݧm>ɱe_u-hƇךYl2:Ӄf-b#R3ǑcU5f466<Ϻy?>| xg6 _5C?5^Vh4hYqeDE2v{A-?Gu84u_Y/"auyȴuHKovhv|GQaܙڶnqzjD|L)tM cY:&i< QSbVfz ˪Mu->|Sz>Yyݧ>n;ڐ:"u=ehoߛۇ|l~dO\R==S_ZKΛ=+?n?%ƕbm{^[f͂;ֻgl^ۀqqqew5bU?H *n A*"I^|[j *_oj[(IQˏu (>PG&!`ہFjflkv>PRt|Q߫>< LUV2+e>~mαx_(yXY9QZ|q{ҵm=_f4ˆT-^XRErFU1~fGۍ} hR£e읕1y짬6?*?/oyZ'D t O7MU")@ f4$kC7u\}m|} g|ۗd,//zh"=m@mv@QOwV|* Iki׸IaT)$*NQQz*_땦zm#e,([ׇe>'S9- 7{'k]u~UQeu滢yС#@g {%O{ZPL}ڸ8$v}ŦZu z\ceY E K BEd.-ѦؙJʏL~LX&uXVKҬB`lTR+<z&RF([I7=cY(Wc+ 6ڦkbw ~oQ^ח |767;0 ϨkQby7|mAg@Gĵ騣J9#uO%K8y|L'|r"{%fظqc3_~yعsg9 ׯ[=301 賹nkҴ\q |abŊ:dO}$H}o1!9:{'V^=8-]tyYzC`xaժUy2MshJ//I^y^zl/d3Ay0)9/ciþ}rԆoܾt4C? ni1:?[n1IfL g]/7>)4D?gb  _6j٣>\& oZQpIݞ(yPߕÇòeru)݁ 76pRU96 hsf$t7>~fW_}5pI'K/4}١9?3޽{ C=\pA}/bu\W< z墋.xgË/{K.dF#ߙe#{iӦ|>U[/+:P[՟ڛSRk† yկv|X^{߁ O~gom9c׹Nۚ9,Ϧ^x!תˇq{UoԫױN-?qi뗮z=oݢdTV~TXX=^Kj]!@`> s#Vfb|.k}{2)9馅c!Lm36?: |E ؾ*A ol 55W"| "uߦoG1eޮ^ﳋz8[ջaT`-|YA[=nj(ͦ)WIӲzRkOWMTW^UPQdKJ.n|mAg@׍X'N)F*((2*Z[yJy?<_$@[/~9[=5{JP>+6 lfE^_ibG[ʧNS:h*{#c瓂CL6c TJ7k- Y=|/eOSk/l>ԺG^qlvL|uH-LSs}Z:kۢokv/ dnvZw9BE& a37-` "L5]!@? OOY)}CfիWm+zk{jnaƍ_gqF], VX1-8p ]6;/Ky,~|fߪߍ)뮻m7aɒ%\|aϞ=&_vO_W; {b-_uа? 
X[沚ֵ7<2\=wL2e|};+++Yq8,m?-|)˖-oW+jCy۞BUW]*m};  4Y?3m~ӳ_IoN6f0YZq&V4 X~TQ9i$ڧ %͊|e5J͢Tyf%[>]T5O_"]X%1S<,/-}=g ?5jU3ڠ'nZVY}M^ ~``v&AWx+F'@G@XLӋI{'~;r@~^d4H3.]o,VA3ko,]a! FߤuT@? }o𭿪`:R~./ X%!nlS335 F0N]Htu ͚S[>a|CF9jy٣c=\OM,QgMS;ͭ͌sEժcFu]w ^ 2;vW\qEXb<{M_ 0eve2G)oWz]^/Ğ@h+m zAnlf'~zH̀zK~呷z+[9|8p@A駟&g4% ^*3τaÆ V1iӦ|x}|͠2> jݖ Ȟ;.RMSԵ7W&iɫWNӦÇe˖S,cj/֯_o :4%7[C=6oޜR["uuetW 0"d=Gfm6@HɂJ;G9CPxd|lriy#+ʎhIo}+ok-uL*,05o/:.6e|̰y_>zV%3 ӹ62v-,4Ԕ?%~Cp кoG*ly[|B^|SLq*Kjw]k3+/U]>T-Ef! KzL=Y'0J fr(-woZ^:\Fr?U}mm! -I"`}cr2m]g  >J> 9ZRyɰn#*,x^*3>@Vd_Jcji˗JLSګe룠[*]Rl[Vz6ffߌ~uϩyXe[^kSvujzC|ٝ=Yo #m`>NX9)bR~vHIS&էkv} (wN&`ہWI$`}côdgj#=h(orS? LǤ[3n$e[ bZ}K:)@e/.nM}}-U+Ks[qY\тL*ЧYsv@LFR~-+ZO,>yM)V^^+S}^?/뱽emBw@?}Cpڵwg'N ْfZ^-6xa6:L]횖W>{GW_y2C~?oFdA7,Flɒ%HsNy iOQ"fAxVe?<(_ǐ0rp1ǘ-_eF@,JY@0lݺugwA 7xcx{7(|_yp9,Dn#,o7d`ېߖS> օ:+)3 x2Ô[Yy[N2|{rncN Qىj&o^4Zm0 h-lIC\^Xt@'ZـW\֥V]F̍) |l 76'-E1e'9濙I-u*Jϊʧ^n̵T*bdLEzej'u3R3"],,g kojiuۼf7V3[GY^߾ԟ-D5+Ж+ˋcQt|uʫZ}@q|sC@`]^mlIIԞP, hG>t"!~] 3PTOI@A@;f]n'oXkC18(T&G<XT*W* %}*K#-ļ)Q&['<0MeiRֱ~OTN3:T^(8| VӮ ']>!\s1yӿ5movb ^,I CG0Ǿ"*JmqVfz@o˧TbH~^YӼI\zu79o-pOtIaҥ /Җ?d jm'xbXjUm ԇg!]X|௨wbUN>LF4vYM_oFXbO\oԮ?v%rʠobuϢflj$@@[ lg  Awi \4c|K/ g}зq/5A.~ɍmO?xoi.VO暰aÆA>~?{ݻC#}7n.5φ_|1KA m۶pw{QE~.M6.y/=qL켠seRuV[tTf}Oѫ@X@X4/Ud4AK)r,tM]Y ~`_::t}ܟ,`IfƼm޼9& `^xy@1U({ xvӎ۷-[DBb~77n[=F){X8*˞S7$s٫,i;ww1e>A[o s@Bu[y(t+݂R PA` vC P7YŶ}7'ݻwH?b; [nM,M6%m>|8p (S{zؒ%K| /??_$F&1CҘ}?h]> (Ko%#mRvּ땬s=v~tuϲeP͕k0C? LDA h Eځ*_F.,<COr}QwlR_vqv,E=g+v׫>7AuW>q_\Z4D>zzk_Zկ~yxxuʕ~ed??s9'g?قٌeA(A6-cX@}뇵ݵ?uwh:AZ|d:ѢkgyfvX{Upm INL?=}@ :n,Rv}^^av͆,++Y~F;UW]]S7&nVlo.Eef^{YG68-觧E 0MI 9#9ܘtrCL lYyU%|5^W`~_믿>V4ؾ}{زeK ZIDAT~}U>aÆ yYNJX~UhfWAO+~ꆟв;t~<8PZoO+Vh!z`HJzXlC}o}[|4yWpO:!@`R'% 1Q]d閂aǀ-7~W5f͚>}cJލ7|X$|畃q8cl^~O~|;o.YlFϤPvut7'!t.{!@-#q^o`('VYe3%@? 
n -!%@?=r@]&@@Cw@@pc2,:Oߛ^$~KoZ+KZQGf%w6nܘg71et7|'!t.{!@-#q\*?~oz<к313x5*׿'IW_}5{C*L~*_#j6d_觻O ?% 0X@v{@hn[ETgRUI{za߾}CR:$L<{ӟ4?Wg׿4a?)~ꆟ_{, e=t 7-s"cKujU'M/g鸏>(^ڋ>-[G{A|뭷ºur;w k֬ 2tIaҥzUi?lذ!%bmaӦMSeRʻY0n8?uOh /=C2]CZF9dձA:{gE]r 7x _+vmhp̺nm۶%K~~:_Oe/*m +Wwr-m'ihu&Q, Oϒ>uCK @X@HfCY-}U|}衇͛co~֝ ر#\uUCfwqqu kI:([N\\Cɏi׮]aƍqe M% IcYvh PDiC@l E%fTU'?j}@f=Ãx8p ^ m߾=lٲefy{W93T?P /P<߿?|2o_>ͮx=^j%m_=ZB_̯@삗 EwāYOT(Z; ) sr  Lcv_|qسg@M̈́{KU*?~_y<[^A[o5jҌ?:thpիÉ'Uerm0J29&v-~_w@(%kǃoF/_,;!@$@@p 3 4̡sbl ifg9dɒBbömj/rX~}uFf O7i @B`W< `NB4c/]v7/<ܹ3ޥ^z= tkFFYƹOύ1 @C8X @` 4OBcgEǛo:MJ;cg e3 mY>-[Х@f4 OO @:I`'݆ v`~Ajz?O}'f26B@; Y&|T @ @@Do/ @ݿ/:1B @J /֭[ÿۿ1S; f?wygX|y;B @ @ SP@ @ @ v- @ @ @ 0 @ @ @hZ@ @ @ @`*N+B!@ @ @  @ @ @T VB @ @ @h@ @ @  8 @ @ @@;l @ @ @S!@@p*X @ @ @ v ? @ @ @BT" @ @ @ @@~@ @ @ @ L`E( @ @ @A`; @ @ @ SP@ @ @ v- @ @ @ 0 @ @ @hZ@ @ @ @`*N+B!@ @ @  @ @ @T VB @ @ @h@ @ @  8 @ @ @@;l @ @ @S!@@p*X @ @ @ v ? @ @ @BT" @ @ @ @@~@ @ @ @ L`E( @ @ @A`; @ @ @ SP@ @ @ v- @ @ @ 0 @ @ @hZ@ @ @ @`*N+B!@ @ @  @ @ @T VB @ @ @h@ @ @  8 @ @ @@;l @ @ @S!@@p*X @ @ @ v ? @ @ @BT" @ @ @ @@~@ @ @ @ L`E( @ @ @A`; @ @ @ d ;IENDB`HINGE-0.5.0/misc/ecoli_shortened.png000066400000000000000000001717761314415550300172010ustar00rootroot00000000000000PNG  IHDRJ{ IDATx Wu7iF}e[Y^YB%KHH xؖeK־k;ؒG5Tѓ:A5=` HC/hP7vڊl6jЪIsJ&9Cxߠ@0襻ӃS61>'/O{EDڕ{}'۷V&` \|[$xwr>:| ' M/]L7BD<Ӈ2ud2΢(CO suE. 
v6MU/5˦9q㔽,Q85PmKʷ~Ue:k:)sbuyn M&0 qa~H.no;ƳZ|<'h% *O5Ӱ4t\{M&`/9ғC4{ed$\u7^1Z"շP A19h1hl6姙$[Lũ!8ExqkkeK3-w8lj2{.Jl6s%`;s%6?nD8 GJ<+8g (NT{B!wβl6@c-s$~WhI<\%>ZWጳT]Gvs@©Sg?k {)$$1q8_o*5f,"4ҩ}?l[Hm Y |ho6yU\;%xE)\nTCH*jZ2 of-\nLn{p8VU6mtyh麥V54-Fv5v4#چ-d3I{K~-, vXs!M NQ!ho~Y!e;=u \lX睞>ϝ!9-q{jTX ڛA +dUm6slg}l C(U<+E:๽N:BqH&pu'Sfޠ}M$PK\0Y-0>H..fSUլ5$߭d+tOU&pIcwiB2~+.flBX@㲚X$yz9XZذTio@X`ۣ^_kiЫĮVnzشa3_UR{?E0\ 2ӣOO7+Fy@U^2`V[ֿVj(GgTrU<A}{NOQ*9T`^_ Fy;7ȓ .Dl/!`;ǗVF$I{[`[M̈ttk@J@ 5iJ'wSzYޫ05U`&z7~`Փ=+>Fj7zV^ '`;:&Ǐ`,]gT Y>$3Ap*8F]`; 39>fJ<,HJ3"nw=^?a>qPlale.s.p<&-͍Ι,t`o??azt'E}y8'-2"[/r?Mcsh /'lٴΤ|kl6_Yݫr)ciR)2D6fXA?'α@Z9]l"ͤH:^a"⌾ūc兕hɪ-aE;l6_Y+]$0dM;:]j Xɪ^k70Z5e%CҎ22 #Q]IfSj@'uILl/!`;ǗL.in3x| 8 \&ly/!jbg\EoDgbM/9_Z,r٦:;ILwLn.fwLTbpLSwZ'4ښeyDMۻԅ6(Lh{g=: :\u=/19:z 9ycma%rH&RU4]P^e;WLf!r>DGp -=UiAOYwl&U- αNrKXR i]# zd$UakhM+}T w8 c4p$s#h,זQlX}h݌O$s.ҳOr䘆 7$|NarET7T|+Q.gd GFY[lX5ȶ&`iYQnB$W@je"ټ%&zeY>6`OQ9,M=5f-\!I A$l%&dwc`eXU*1q|dy5""(!C@SK}iaB)!\/qݍ(`E;YȪE8$ E1M;Z|m9O~r>4*j/s$઩5|}G :2:QXM3A X аVYu LB~r*d[X]LׁS~a( rvHqȢIPNmO&juo!M1XWە@Uf59֚p + q5ej/Ś%V-hIDN׸3w'&4<iW [ުYCQv k(9?ڛM&~lJF"Uf`XcѺ.(TJ PVx|}`pCֈd02)Kdr ;jo#֟ p*H4l.09^`jJ'J8CF`%rB}L$ Z:*1?, #QtDeUJ$?:Q UohWv@N\e)j`:k[%#>5IST!"b"+uJoM q,lqqy?hqJ"Im@ܥnkfx%L\B@g0DVKC> <Z6wIʎO#pJ"LZ{g>$jwL'yG{$lOKv2?9ƢAz:N=2S%8fNEapLWx3 '?X3Ru [%Pju+"54_h` Ԍ=Z3l K2ɴ^)[-6Lra2imݰ=Hm3X- $3`kM(LOkSֶ$`?9gUӱEI #joIe3/a܁1KgVP5 $(擑" oN+͎!P`b$S:*/?~oT αN{rVŲ ׁ J Fn ܌ k ǭjm6C׈(JQD;K4:sYT6@~t8BW;t v[Qr.9^5jàQ"^b-KEqt vdSf.iy xr9AWT0GYTVlH~I @ n/o~_+đ2ɔΉ~ѱBZJN)ahdƇh?iTucpDƉ" 6OTaqG 4Ϊ flx ת Q+A٬+XDU4JHt_`p\Gt5Ā:* E/w"KVp8PaR@bHCoU \8p_-YαIFrYv6QrNgqJV R@I M+bL%L\ybُo^מm+ L@OˌpES\f%s4L~DrҾn8ՋV2pP$wMTeYcbRA%[^kZaպ:EHOg+PU9Vw$ͻ\61̅UW8UdڠPq51 ?F4BxY6hn!yUkZlX]g0:"R,$禪nVS"4rB0l'6\z+[4$pX?Jj2'2Яrpd ^_ Զ:&`qYN'JkX/l?Ve㨥In\责 %]2"Ɍ) N: T)Um-&PlX]XI!W AE{4]dpj3VuHA^5 Ԁ" JEb^Qb>7h)v[Ui XCkC'2]-1=]BWX-!N$_geد>(ՑGs8sI_f0#x۬#6W+ h F$:۝4ّ<(CeVaգkItW Mc8NtD44VwXU4| t1M'GOȄcvڸv-l+N7h+-ӘFjVe'jEGg%ojmA&+@,$Lr_«b>K "qinPɶ#`?9΃>=xD!1'p`!XI g tE c!b*l_.$ 
PL˔&MQVlGI%o݊D/]+%9/M"`?9u`m[1 kdӃU*3uLJ4m7V %۝T +rXB4FN˺~k<ږijtiX&ZdUUFɉZD:.]$}a3DF #M"Po;M%&sRuR0\Fm9g](x΢Nq9Zq=_T]t._?lj.ⱯZ&'D@kZumD?^gC'UN Hݸ<ي?@&mx@Sg8?.]1MkBl04©D]}M!`?9΃xY:BSWEQP(n!4MQFQy@lnPd29]L' RL< Jt :ƪɷO$36@@A%R C0݆?JRhuߧ@3)Je112!] +W5-]݃^nb!_/GL$+,L8aj*qjgzڧ  Cav<;36Re||lf Dr rI$Oq(tu*lcAG Ӷf]VK(d J">(R fQ메vq[y\nD$0LqlC8hB HP]Krr) #,rxgp/QY]h;RW<14ʔuSsӍM7ӻ1sּDKKZ6ӳn ٶvDK&l! fvnMp}tanzz5&8\C `2<Ց8hgpxe'\M3."e&R^d~8YpSLd\.A |Yhlm6ɢʟpddB 8M+rw!|xn[U*ihjqJP{M>O!`tJ&Av^Zsǡpr!֨$gl2~s\'P} Oq機P ihnf;Ɲ h1 FXֶ0>p@Is b{90Pf:}O24_՟݉7yYu}!P58OwTϤ1M"15[.g`3:>W3OԤwq8u#O'`L[ѼYI|zhZUa~Rەg=Y=Y5A}Gx>6]&E U/92<ɏ {I$LXCn`4&ܺ+厍lfuD.Wr6v%d{w=F4 ?߭|^s@-_Us='RS|ASFɊZ[nAFJ)2^Ol?'9߳7/Eg=YG2]] a=q BW*4\>읯 {Idu'QKܳ䘦`HZIkݡ{7r͍Ū`]ϝlP` us٦E#&SN98gF:G d!)T]B4 '=@&,!Oa/\y9Nܼv''y?Qj!£L 7/7AҐ˥myQ*`\e/?~?b-0a"!oҿ~&$IDzH}y]Ăd,I.2(Ød3'f<9=B9} LJ3r6.y]4L"aF8uhVI He{nٱO]1(TdWnwv#b[ǜL`_v|u vVTtόP.:I|sOY6Wp-Y_}<oϠɩ*P;Nm :x/WݹyN Z: !9q˒G3{em^FƳ-<ό^4t$J) iZ M=e6\5w,kFmG: ,ŢF*#װd\rvY'w+9}ϭce +7+/gn6~㘺D̡OG+/r9zٻP6ZۮQw b>N{}U CYa_bIl@~SX4M8]#[:*Kq&[<Ľ>IÏe7g_ p*?zеB ^tw@*~| E25}U<^ou\u2>0 `k%(zn;Vйw˵ K3%NMbXrɤ?eIsX*+n\ Qʺ9-T`48z6G8&S*kP\3H%2tjq7m w6Z9'X%8[nƜss]ݩ4kvp 59"39bYݡ8m |7Nq)CfdR{_ؾ,+il>vH86>̢i$iFO1i o㌠W au4:pHtuǎ޽qF-Az:e<>B'pHMtϜbptmx uɕqZ^A>v30Wl5uWJX(9ٶd᝿}r]MҹI"v607ħ~v3:tv,02fV(gmӪA,$s VhY`:+}nF&XѫL:><U)I#GKɄw})LzVjC+ׇ_m߶TV4cF{Xr>t?_/fh[R b!8Cc,f鋗.OݻO;qKloVmsVZ Y98e\q`IB~@ki"}BJen2^oZ{a[9-+(M0`靘>T(KzDnԢʮkVy&?<4Maj@AFy0ux҉A8%])4f2T'OowakZZ搙QEn}=_Nת$2x\~ ?C=U@BlV-u `}ٟ:a|l&OѼ%.Dwu O%x"`0OA&ubU/m振\aKBc&xPJ@#Ț9a8OUX8F$wQt<<)s qv\;_vZHLGbfG8yNFX9Dy%*U*t/bcgg`~3Ϟ2k$;kK ڼGÓI F3ْϲ}g%ߥRT^`i>wCK IdIeG;Xҧ*1GO2q}(VZ,Z0UStT }e!!a̐ݐ(||7sM,aP9sO<] Z5+N/TA/QT<(/pj7#4bh!9݄[$q9&GVfs6xEI@7~ǻa%V kw&/Nګ?\&ih9w7Ak3@WT96¼q"HgAR3<:Y`WiX?nT2$DqI7|?mUz6run6ch=obޏ(x:]+CӚ3o>t"m=҃j_czrgvchT*7Զo?=察(HdZY_};W^._>0;V:|.nw&Lɬɱa _MWuI`%`rFNi:Q`Jmdz41hDC,pZ k N <1h8Q,qH:W^#z[;DZ|1ZcW]15N[KT96B @rb:D$~p7_{_2x pr$3$r,hY ?jg01*+TCEk:5 ]V-rQ(s΀=ZΘ0_vpXV϶岃Ã*Қ%x=|O!9'n 
kC-8n~{mM8Tsߑ}Q":y4M%KһNᣟnވPA7/jUUod@)5BzzNoy2\~T9*.m4{\?>Į'L5mj>נ# +pAao~1 ¬[$Do Wq*#c*+#V'wP'6/፮={&Y+g 1Y)HeʔTȐ8pu@Se+~3>;~sBw՘e<+nǪl3_g ;|bxۯ%m;@j)s zi'8=|U0-o LO,BTYޟ>|7Ǐ~ɇCtUֹ%˗ɕ'tHϪE^6t!H!_z"E%#*MAȶ3je-Iď1>n21^ffBg<\wl_?& FPd"^u̫ܹz~ݸQ0bnR|'PhA{k7;zɍnb<ҏرRb}rjh?{5 xANVn߉s:R,HV85R"kg5,b 2= -47Lɕ1Ӱ`~^4 OT$i)ږ|3r+8edl}TT(-n+G 9ZzkooO#2(/|@ww_ͧVV  ְG| G?q~ 0ز5"[)~|tFY%Lp!i-mXIJ#Vʼz$S:R{#<㱑izp9] Nb|׳h\)F|:*5>ن?V .̌=׷iϽ%L27nCuQΌS%Zu# +w͝~\VM/^u 8shNN-Cz$n=܌z˹ F@Q$QY1QhHBX8 ݨe7N[^!?0|ϒDaOw]7}u.c~чDvJܴǧ?zD+.5r|oݶ ;2Eʙ |7rӻb$ ȂQrux;$vrIXYk {噊 Q"aKE 8ݡӉ?b]rx$ĉLC1V2P|C H~HDk~ [ВHS14~W|bŚ5h_*?˽LSizuH~)cu}`qu^lR 7OL[w_Ѭ#[߷&Xl>Dh;x_vgIƭ"g,6;hoszD!R!yDX褠trSYc{ϙJd[*s,&ep@}Qs˩jGcw bnr8.Ǹ}+D0|Lv͕O _>r Ip=!F?b\飋V}a\rfgw#)n OO9B4tvX)BQOpb4QT3$( xv󬸼7RiL]W!stJ8eʻP\g__xְC;x8,yXR&8Er(. (3.U5pj?&h)IYQ*㳗b1߶#'_rhKT(YGnwA|3ٶg.sr6oXBKP%H9C'ϒj E0ʍm4u;9m\DD<7n`8RYB'H䦜N$W)EimXEm9܏0q h_n9OҞhrHeJoz _DO6`8El?[DzU f$2}Opw12c\:Ū@m,hk/lmJ8,Xxo1 Lb[1 q@I cRIN$pRC¬9 -H&S IQ->уԊ<[*9`M4pO:|Lg XOKsʗf#[X0A'(#yai{Uv WDz|!i ^wC96oO^KGsS {݅ZCUx o}V/Wh.9qq+Yb|'F+ކt;1 i^,RƖ07G8|RcihnS gףA%VV1", !QW~1 IDAT_&k@dF?E8\ 4T:4x7AJ%+]L0gTͤs|;FL]%xp/+#ѯ"؁Y4QLk{_<6 \t]-1nf%17N0Z+=(5wLq~Y>7ߥs!;5Wo68~8ep7Q?ѳC"O ](ZRTN`d"˴|3/vp%812(w`rݹ{ߝ}UŒl^@@ &R0BSBwɒmYzvN}~eKgn}߻sQHyܝ7SiF$Rc܆^x7_D0Xv}4x/hD B:/t<>~EJ.?k<rzgS +/c}AeGN< `U۸@r^ Yc0`XJ# ^;9AsQh7f,6 =s hoPc H˨j^v]TΝ(U: Lo e4^ l6N"Qoοy[vQvz6ǰZ(*z/g)gO\$9wɗO?U=q: BExh7moa\yܕuG{R$ٹye+O 7@28jAm2ZI@k58^.i,s''U.esx^R)F0>FGI&!1 ̻AJ™}zs_!F"8a7~ v_LAәa7Ao^_|5;zL.Bqȭ? 
O^JŲ ?®H)B̧ѹ)i\^~7YR2k2% R$ %jj͚8OߎG*1thryC?׏HŦ7S/0c~T:$HC[eË{ 8l*"m5M\ >^Q( \=’s6^JWiEb"#~5:4ikSQ(搄S[ v3OBZ?ՋUWCT`uz>/˰Z-g]6ĮaK|W->b= ֺ>*wՖˑtc  LьZ"BAR, ndLkmd&fwSFm\?LFx(vuJ^ :-W~Tcv)$r VLJZEV5:?̢Ek\ڼˤ~?2h%t ӧ2sCKS0Jancؿ"??+n\|(J˱i_?{a /6 AJVjc|yt\,&6y0r<T*ŮzU0@gSHFѻ(J(t&ҡIt EfwM.;6B6 4nkJm)7~o:-Q@,1+(HJ$5HKtM`+g Ţā՟ŕ{NФv(26V⼛g0΁h%1TgXv.zŐݰ_}Cd Z#y(:ޏ2粹.RU~w=y*ǃ)gM>VbtK6Bkavסh)峤ca$R,lAg,5ND[C U;sKqbwu3=MA5Je?`N}5h r[&ih4vkʵp1*~Ζ- <{U N$/bn/Q( jx!ݝE=lu?wL9|2#18屗j>zR:4bl"N$E5_]mGjr[8/ Ye7AqIE*/=)JT:=FO ԠY,X QtQvPk5TSdb6MZҾY=twiS2gZ,1ktDIC,^Á^ff-6W^VGZ]gӰ* t*гRATe0KǥvVxBt*Jt8 xj p{xzvLId!4&;*E5N'9#98}] boFbo0ݼWɾ "R6B0XlR1 Z2<BaaR8j%f)Ǵ:ꪱ4VWa3 (DND=4ar5"\栺ZWWݰ)FS/Y<,{EhtchJv֠U"<\Zv_!.bS)ٔF[/н@Ւ-aUO%J3f7~s,/8ɲiw7m S (UeO?I6GC-|KQQ{* N9_rn7s]}lL3D]ӁيrrH!h-*!ESj(s>d<ܪz, OyvfG_R.jRV4$:LP,SrΥ`"1kTQ咨EpTUSWc1UNfAdDT?NNN?I{:~22 H(*J<\Hʟ) Dz$Ћ{AؿYfT]:Qk ƽ %lfǓ?$MOPU`vn3-ޱ'Isr^<{z~/zHkqR$1dxf+-/y{;+wr| <"pQÌ„$5XP(|6K&Gӡ9P $zT3a jPͤaґp9N*d6@&C%("2*n\PDq8-f&47S,((E|XDPiFƾEkH_U(Ƨc*B (Z8E~J/_a[8|q/GicOᙢ+R;ZۨR~M*bcxv+Mrс|4UW^&\.WNhTәތ<,NO8ݪv"N7{Jb1(c\˱XxGNm$%,Yr(rCjP32r䬨V)i$.H(&fkP # `)) xILz1ƇP)rZlr 6YV:F@N]hn5Y:Z/ H#a{t1LweE%Pzu5ʗ GKv#D^S74S[s3mmנӝB@.gpPm03DaAgu-*ˤF5%cYb׺jpٲ,j=/7+Yr<@h2M$l~od&C4_btdRENg$IDBh zVl|:Z@)bo*SC(겔IkR eSv&&' ܎"b@n4#80٭/Jt%ʬ.z-TQW^kd3a(Ȣ  ꍔgbI4׃R_yͽ/bpTCP*^l"!Ϥ=D{p2k 2ɀV[=Ab9\UT{yeԺOA.RV. z:a:U{LNZC ,뵠Oe٢Oװ :&р *eѪU5wJa}Fz(C^B6Ecfz7%)c``TW׽E.gU=_s]5&u5<# Q.0pPJ)B1|#X)&w?Jd0EK@8*JZ FM#%ǥMл;hn43RCׂkz1S- KOY2iNeK}p)[6W2Q+FF) WNɧ/A7|t2D:1ڿ 4w08+Dҙ4y\6-* ݲr1\fšͺ 4f*VL8{'ʖ4O57_9]]iD{I!`.ehWFiI H N:x1( u 70ex3,j&)z1+.w4rde PAMT8vo}[a*l&vz? :Gk\Vb?K~B]m~`d? ?ZApdZ/AZ|$NDȊ3:mTz5HD9P*="' Wu'X]|>֋/m|_Hu{GمF*騭ŋX2g[2w%4x, *dǬ/Q=B. i('Q1FblTGk̙7oV*rUx(w So$/Œx}ǘ@6f7kAT7Qfh4biӴԵ@־3[H(D$vMW^v["ZI2BsRY9a/R9|a18ɧKx$J%stsBQ9WA ! 
yw*ݜF עhHb|s:3L԰g;$T*(Bpɾ'׋T}Ԛ-4T4foSkTյu1$N bIdtzhR<8ꦣў5'_ںoz)$Hpxq\y|M;'AfX2k0E)| 3K+>Y=kQ#"@jsY hx8++@E9ƹSX{0T¬qLj]5-5$MF|ї%Uz|M#BX60bUNWxW PQ4xXCSTTO//FvnSu v&V7f`̻Nj6kkYݻQBPOgJyv݋jY堋cBak``t *uO.ϛv1+4Ys zGCYF''~ HktFj]JNty:/ž-k[qwvM]'-1]cB:MWO(ծ#}CجFk" 5(=Ot#+ғղ`JΗD"DRmU -]+0ێOi?~RXv8Th3Yχk-?ꏃ]>@tWHc)ũNjUd:Af~9X7VE,b1$bI/6L[O x$Tj2,u KPqSJo&$1@:c#~A:/E:vAAv^tXEt [.Ak;/zL40ҽy7UǍʼn<0867~+B&jx$KT7Œ#63Lm23e(q5QYEYƼoL=b2BA,7>m΍Yog;KꬸW`r<~t`|T_^9-'r~ E觰' b5Duh8CSݜ[t1?@@.oעR"39}ڟBfEr.?hDcm۴sw= I֚I$ɤ3̩CswX|jpJbtGkt4ke:V[JwQv[U$&@i:G@br0T*&%#_v&9x7{8`4"^/j?z-+/̭j6R ">G$ބB,!䦲h?fNdsFG{T[Z+ ҾRqg{*X$]k&QKhz2 O<.71D*|a lcj^UP[ژ^SϾ!p91KҖMW5hˑYۼ["M1H2u{OK/%\R!]7gw|ַ<W1 Cm.!*Ux&fKDM(=us:T{!eDK~]ڌ|U */X{R{ZŮ'4 e?Lf"qXضc ~cdTCC & N-32@g6Ȥٸw'RHgCIC5:T: ^|y=:B}.tEGtW}R_K>Bc4rz5uCL7;_yf ($S9vg8u O[UjW[medd Zb .@yrB)]<{ Z0vO5} ֭ N =[<ʼf=&5?pJ'$FM"W_+.:4zI_2 ԹpJDy&Wr7hZR[^b"jm ~ﰊ>m`ӛI34ػ jc;K*V9C9B4R5YL,?SraU,&7[hЗhbhDSr}Nmnevs;gQH *{QbEpkN;B J?u5fۉ[r_d;O fS9(&4<q/m aiKIS%sKo7E^-؜ZY4vܧjsGku:QſY>m j*,q~w1ջ)}L'H[ƸRS"0 ZA[M*b "5Q_{伽?qOɥ䴠ZB]|!Za߮}@`d? 
>f UAewʼ_ُAZ _z!_rۉO/1Alf(Me-?ak,$zҨɵ$gvPk}(PHgɍxQkaվZ]xBv dsu*,!+nC=qXW=al;hh ĕSfq۵+ߢ1v=}Tߠ@hþ4&5:>ʊ[m;{T*xhKIZ^mef}?(mB̺ٳ'3VԣVW9_AF80/Ifaؠb7+tߕ7f{ZCӄ7cj'owrZZ3m~%1L려Y} dچ#Gf,bp#$y|zNhukq7֡iI⬘2Ot\@[󯄻fN\R&Tu􃿤sѭo:T(yrOIc(UjeKn<%8jt),#=PR`*:*,h4Ū; v]Jm'咿k#Rj)<56u29â鈾 "*ŒlG-zdrmZ/ %#1}xDGO 3 4l|񖏼x?vܚ(RWƊh;orXv]/:tXH^wKK9jmL p`D9zU0O(quN9ܕ[*,X8ɯ";g:}8㛙Ѱ 8Nt29.W֡Չix{DT ^z)zR6KذeQIKK`Iw̱q[Tʶ0ُ= 6J_!U0E|ۨz=&89U?!iՔ]Ub}2S̿è5_Kb<8 eR!^͌)gS WVcnCz''c4mW]txOt+U8(wxd?zs8/G{Ϋ= 龜29@[z7Q$cEc!SϝmJS+] !)ѡ\澽4ykߞMa7h"uVYN.|?5r֪I#|nfɜ{/ynR  ?Ŭ뾋l-ټ~jկHxBTeK-yr~Zv~aIAg'Gp*ߍJ[$x&>Toe5FpjBs{oa!͜C}j3Nŗ1OCt6B_.:.>9v.B(شc{9^וIw[wVS|+ ,$3F?^1Z eۻ}Hͨ*,֫ekO- "Rw<+Aǂ y M?`Vb4iY] OځØBS;Uz^1NZ=jd&ψ?M&.KV36_kPU?8Zh$8k.I#]EPܓ0;amo;)DGrF@q;:a#;OBG#`l\|1NdξejAE,g#l̉4wL,/I;񏌣6V918,<5ߥwpJ(J2;B8l igk/G+Lf,;jLSnG)j j_eǟf+dpb=P(X#t:D"Z;~FO4ca?髗F{)wcq#*n~KΐZCIzΣx|MjF;CX7"ʕe$,R}//4M]8 $-*N $TXj&%>ןDݳ`2> I\_RDEk,S*BA*O BHP" Rc[UڍK*hl"YPmӃ{F53c><I]+ۙ@vfC,᤟˭ߞ%VN wi`ԒΤݯ~OUiX(p'Yo+Pz8PRM\nBR0S8+ӄg':mW\ߟcH+[7u.<.顝+=?֒sͲϱ`֕JE Zg;]ңo7qzLgA!?@jRXKP"s)y6Ι@3q(.Eň ?ߋ C]8:4No@iQa`tPDdK͓<=}Eb{#S("ЩXEa;+ؑ rŋi;zhrk 7<9Ux7V,HCi;u_զyHkКhdm1K R)eG~ 9DMAD9^`BxOoKo%PTy%Ǖ^HcrJLR0?tV-ݸ[OUv.7>G:pڐ?L7{*bȒnbC)zvvz-F5HQT&E6V*D0VBw1if8>jahB=nQ8A' l:y4|s dtu0%˹j\DcYm%+0-Gxq<*x5ɻ^şuaY 'k IDAT,8cY JU vm E5L^]<5q,N 0ЙO*{W׭Pbdg'?6KZشe+6IS.Qm<z2Z;t: Xh4Fd.ӓ1O8]1@/݌F#HS=Io Cvơ޵yǨv1IYt sG͌  G ,%PNLQhlt/I!6܅eօX4y~x1;Elvl1 xp%]pXZL:geǶ 4/.lU䑘P(ĮyKCsND.M`!N'tk'hX#%ϏPئ.8@0dXahT3T7eï*gH/׽Jۃb"܂53,og0 WrkQh!h29\ZJGqwr:-d.aR 9L"֝ېTZ0?2` ]vO{>C0JCUJ3nZp ?$ ` "bhq_bg(x40]} DY NDvk_h/%#- v"?u+3N}c$IzQ=JhlCƌ3macĩ_{L TLg`,]|̪j5dr) Biko>K!")V+Ԣ\nDgHDYڀuɇD÷14 ET)J^[0ABe_.K'R:X;֡AbZU=$k@czF&(l3m(,}d3 =3D Vѡ9gެS]|.0jY\I CdsIL: uϴSlfrYN5ؑpw+[IE)1PNvDuHL N|fULjd7|,b2fR1u%u*fAQVH$6D=罹yb5) @u4ty7+׼@2AVIѳ c⢬R)5ZI@Ņh}?[wҗ G,A $ts7 s9 Rc`3")): }L[|1&;V ~+?֞g(z}1I郜so6w|) g/i`pQZ%Աfmy`xq;}'/:aHy ol=Zw433)/ꐋ8 Nkf#ҡ*CݵiX6)/TI>vJ!v3Ve%3:6d(p`tlAwQ➘<#c=+,Bw FDﰗ>'td$F|,\X1 az˫pSNl]PvJ-z溲QѼ7c+~i*4pY?!QQXA?oM?݋^1ӷ2?;H)@ZpѻXǦ6ڃi<$da[?kv\~d.&gSGEIK9< SսU'm mVr)6 
g,9LO:B(Yb1ng:q=3 uRF<ݩ.\5G6*2ۯ=ɎT[Ym;g} ?qz7w͵'zݫNUO؊%̠[܈Ѡ >[N*V恗N^#`2S.-Ƃ5h.Ɩm$ JKq$Ͼ D1r(`POgr<%N$l-pyJxc\e6f]谏8ᬯ`J~1ki,+۬@݃;l}t~u%c$iX~CX !g4漄fq8GS1e҄R;VSK1$d:ĈBOW7F1ThG?X%c5ȒBDC\fNx{^~7A(YY{_\e q|-ƒF uak=}qU6~Gֻh\‡หTI)(`!2Y[F"GӉ"HX9Wp򞱸SM)@ƋIk’,gײN7JJۤvX?vP랧[(R\)w82MS)^2}zؽ̆өՔ!uq!9U9nZhj\%q|臨e|!qlIv[A":z7:S-#X=='lgZ Sa0y0X<#L fh8Bk/#XzރC*9CSGfA )!\k6]{KLxS6WҰAnz 4zL:3jv7}TB~-si*_#=2@ aFA@<***}mZ@Z9ږJZϫf3>k%#$kTZuzڷtg#OX0W;Y}f6= ^U"dTL&AafW}4`Le c8;Z_|`h ucUU9gWvV"(d22 <^tšé+aۖTMsSIb(0zw;ΏfyO$=}mwi91/>s<+lẓ9HM Ȃ^v\D v3m,g=YfuD~`M"`[t%#Nxlh`2㨲"Hyo$_P0($#ei"C14c^?8B`H!! Q'OǛЈC.XN 'DI'ShuZJuemXBg#7È_ϜrhZv?Μ%4΃;r;Kg_٧ 12&I[ꊓ5d3Y$D>x l2y X]`ljˑ%y|fμk6y IOakf%d%&Fֲy uoCl}f d$FFj`z@˥t?mrӓ{hT@X&BE'p%'t#dfZR*UIoFׄ9R=ĈvL.ktZ"^GkF[B#}KѶm#9VA_HߟUXBڜd0IVP= OjYcR|u;K܇/=ˍ .noQ24=5Vzycx?y8dn\t6v .:?+(4b$mm/A%J] QЇ#T[P<^2.DZO%p 3Va}!ue -gZJ!&nQiƗv5͋z)'O{^jz%e%>oD,(MeI Ӯt-gUs4_9lR x T*sm TC鼐L4Mr ?B9nzYAj *Cp/;51f65!! yI_!,b@e(Mi1p=ߌ,,,c >6htHS˜R. V5gaT+&@%sպ͹i,԰u͔VXK IDATLBP0.lR/bɵs!MT3SV!1;@{J"B*AgXf{y Kr8 QfbVwhО"  fIA Tĩʬ8n 8T^4tşd3eP{?$;LJ h4[^Cx\rf3ꦗNd۩lRŴV73u;F(䳿Akr zYNR+M1Q˩=]8A/sxQ-nD@lY{Z)W?Ǯ@l@4f4.8\rX6A B*bkF*kKώ5ՍRLg7ihIEi^7.X/ʢE˿iY| qʢfr$}*@ys5Ter߲f0c!KY˖oޓG҈I$X|K |Du>`Wc;cSnj0mV%(t/ً.gYW}Ofqsv?ۃu6cC"3„SzmT*#K@g2091K+[Nscq&u& y#Atr?2GQ'LG ꦲmǖ^1+ fO ^Rs>IaJX5 < v98 t*IF)2GJQ ^8\g5eGfC)L[=N 8{}3'el("dE]_˕7١&=zZM"r;,1; j_ ,rf&¶:ќ5%x7{~? 
147#aW fp/4J(V~kV#[r1\AI3kn<~CL ;J{`+m\'sڧ ֨5Sp!ilY@WO\%pܥLءX$A݈3>@۝Ȟl#ѡ&Saӡwg=J@%p O!Me:ic1ͭ/w6bY,ij%.qzfѧx bpշ)76r«ߢd)oh/uDRYzc#쯱\..äJ14TǒiQ/9.A8LnZA_ ӊ*Ze+b7;i*?l}7Af>7{EW尒7s% (䱓^7spJ!"YfD.v9RO&O}9_2PH\t+QQ^1c5qkЧ,,Ezdf2VzCbA͹Z.z0>OB"zM [!o䨷jp94DY;^QJ@%0 V/_'oEB)\%Tq߷ې9AgHN6rgrҥIKxtjA8DA`|hSeik~6?y?٬WŜaI "3*fvًzB%0-mf3DH/ɯ% F9.ZJO~i}Vd xܹI֐ FaT%0 U3ٕ/7FsHGM9zGxSPc!i %%Ru|:=gK-p ȼ?oI`H= cJqYT*!\PGG%j4v~cw<*5£|6 sFWsgO Oȏ fZ:fDJƉHmE%8Lvf4a0zF"#{ĕ_>8v36tDk,\xi j6ΐz{LۉE+1B1d:4P{g8fT `/u΄K6{I Õc慯A݊V/ D{t FTL*/_C_͗7=%e͓J^UJ`| eg0 $(爭ãه.c7:^֋UP=; d4d!yEҠb͍ಋO"Nu#1@9& }m-u+jXv`bk\}%ԭf<QF1y29Mdr-Fcto{=P0^u *㕀FP AwCc,#BT)c#r6Zr3^T2,m8 Z2J,R<։-P/n4ޝ&ܪ0*; lTWTB u:¾}Lr\1HHũ>K|ιRMJ&.Ҫ"!:q^u B4a0̺}_#RET ,n;C!2o U[_A&ϠI )XSonl73zJ$QAD+q;,?%,` zcw}=xUU+BLē3W_7Nr 59a''eQ29RK~~&:>P/&3 Q|Zzg_Fʀ`cHJG߹&TT',9-XD"fA֮}W&A];vy3j}W/Lzcѧ"HYиފtXvIwr>˞뤟\U@gm|\Xh [9 p^k =Kh 1d~7cJ`*Pt(X"<\w`ԭMar4M4+ôi[e*%YFi `rrY;@6ju 蹝D>;U/%p'u}Q96LX stCKϾH&O12gL`. Pp8,7ɔ!K24`i4[GgT]6:JIJ\f^(Di=TV5t،z w'T*O@1ItBTٜ FI'S rX;X%$HdyBUq;SOt"2 F!Lj4zZJy~}S/TGԉC$%YpHEv~ʱ}MCiHln${UՏۑzR%0$B~*u9Z&Ȭ:tm\z)߹M!iĜ 1S *J"Pb 6Q4*l&͚^Oe. 4" oIP3 S2'SބѸ`-ֺK_wSj5b _s+cSGL.b*F$±q>$M/u,0D5)y%,,n\)I RRg 4f @.S3J%;E~מ/0?v2*%QtQ+٩ME'cd,;ET2cf_S6T!->LȇUTpe*J^EH^(3o~ꁃjK-P >Iy\yk.!NAWb -VK 4[x㊡T L.9 صxh:GORD[V{B6u)n|B&M3a?t{jJ4is~vF6xPqO2X.åR_۱WT۸zR%0U _ ݟf )n8.߲:Af!8oͦ'PA4UJ XMfE3eAz"9ng;сaٰLgևgjM$%PpnlCk;3?'l)c/ˤǥm}WQAN@#sHPFH`{0͡J;Ocs[?zNxd9ƢQh4bnٗ՟ iSpSCʆp3WNT0rTflr`HBef̾Vz4&S S*gQc4 -H͢3oR<BWP O@ȁP )@ :wtT`OL7[(UѸ9PNa OȅpŇTeHOL,ĤÐ-zχ߁ڂJ@%O\mara6)pJ/bZ]a|uTS@:Ǡ(3$W8OZœߏqá(%f _~O=t MeT*C `[ :䊑r۪D-A\F:"sm6T B]Teȉ88Q㫚O__ URgg_.y(2Q]T$ b1G!ݢ\_xO8ej'ӌ-T 5 ..)J Ɔd8 :̞#s/:_zBIi5l+lPu*ID!mq7(S|󸷀<Ջ Ssv9%unҾ"d$_*ˎh YLj'F<<ÉL;&ԒdSj}& 1#.Vuf$t m;r>:s~bCV>=FLTAPC! q؃L)Bijh4j"K6YҢeKGɿRXժJ@%p4N,2=k}S[NRLgpA͡KT LQ%bD^aq/*1E% bYvT+֣5j?#!\jpN/R#p?[KdcZra,N vݖO?xhX*-JnxS*J1!!M,/>'>c^e=ȭynxmTt;96Yg\5T(W ]'Saѳ/|\1W8ZǞQt} fݍHJs']wȺݮ*ȃ=! 
a@YHVxe1rЙiO#N&:neJx#hG"T˃Xer?{utdrV>S6sjz,&EsHc4"thqW9;Ô%dcz$9nI& %3%*u"ن Y7^G=Ie2𻟰J@%)P7$ڨnjlFk8@ T3S;ib'OZ;5^='TS@"g,ds(ʑ8nwruT!HқP(5h= ^zuXN%ً#Txe(Q@v۞`c\+ҘK̓v`*$ "ֈXbc*wL614_bbKmG%p<HǓx1$ iK9)B;ّΰŋ6R0l(dd9wWpf@4FAƮ"S6>2(4l%?N }XmI54If'\e.4c]K9V#ev̟^H׽&EyA 40z[Tňkd.Gň }I2avϤ#zC`SgֈBK$NUXM82#wCg)$i+s}_T< C@ ;*J\l;CA3fB*Yt$, Cao->k-d?ǙyxL<5͝X JwP\O+] IDATnDA'~c8uZJ#⨩r9r6c&v]F( $EBLȷ0v9{Le/Ϝ>3cQ$ .w!5[> uT OD]A#VCa6jE@ WTqɊA+ԡ:S83ַ15'Ulݰ6 ?$Y^{c;TVF_b;ͷe䪤"Z:em-*7F e7D](ڮ0&{Ph42w?TtV;Xv݅4j!x1 >6IDRV#v cq4V Uc ĢQ2ъЇ 9#U,%LO:}c18/y5ԇb"Hղ^@B 80E16k>kmd~HAivT`V"~W4E&y5AHD;9)t?{r" 6wPt TYMf&'渆{hYÎ MV ,ıW4E&`zҿNْy}Rw#b ło͋?QFHHk6-Q̸$QPҮ!IPaۆ pJK֘7q;~z;lwm9)4*2mH7V""n v?w@\Q*eG6"(2dy6*ENF^@$$5?CU܄YU Fvo+UU{kJ@zV`$W5͈Cfگҙed1b$UN!)ړ1VEAA ?.)/g7N 3/`_A6 SQ~_Yt(B ^ˆĺhJ8wGђg݊7ڛ{E@hi)9;Ͻ8'Cd{EI0%Rl~Q=3SZ@$F~wc idJJ%9Gяc;n6D*5 ') <+Q}Gw;%3nLE_3j\(vȱ[%&WVvɕPՒ~m_v>rs%qDv܈{hF)JևRUU}v Pb73f"Sߑ_/Q?r| !,yCLz??a?9 }/3-eoZn:)Rvd*} @2HBDU5}]fq8|uԅ`42Jl&V?tGɉ7@K]DAb&wT1;~Iqԭ<cy,.*b!9gYgzI  3zn _8B{LcFOCn D#}0g/Ýc"JsJ1&2HU::~hC IOiԨ1nfHQn!?ٯ>фC!*S}w)u@| 0T:*#Bޥ'qS;lT^KGm;qc qd%pMHKXٱ)8 %Ħss8p[κc͂Cp!D)_%J4!yd>#YT6 4FL4qQZ!Eq|h Zt3?ȓihl&`wA? lzsb6aW4&_y:Tu9GIRh/ mq r9SPMDt3!}*~K~OmDB&3b[yEɄ!T<P7T!8yr۫v) [v6eh{ը,$ζ6:!JgLEr& dޯ~KQTb*6//NqڄA8zoc%Tŀ/@w)o; x͏GEïi](JvU|lx #& ՞l!$/{h )FVbRS814l四ĩ$Lǐp⨷0luk%ӚRn􅁄!okb0k8Fqt@ej'"i*&qѨ#?'_G1'qlSE`8Σ {|aaT1Zw嵗ѲzO͋z|Ac!]3B"8/#8˫ϟC{XH:2LF2 2QHj~aőh;\)̻ o|pB,mћuI{(:. 
xY{h t4R"N8igZ'9цo_"DZ4P氒n粓;<7:>`9tf<͏OE#:}}0$ U[QQ4h)E>8h> xH8.N+'睳OcoMalh`1o8ɉT`獝8Fr=-YEƗ̾LHx^4EUj16GU{Az7;x IY'se Oo`^ٳbyăux{ttvKFf~B4d#T?Z!HU' qR1v3V#RΤ!Ypfq}34GMf&+ ZjddӁ3%oc#Nb!DaӷnKgcȅPx+픿$qsH E[(,IJؕAd(wxWЦHcqr,m_Obk/: e|#/3潵K3Ku~ d143)*hKͣ:Ez~!a (~J,r_ ɕlU7x}6q43>tcPrmV&Z,:|X܋܎pzYWT^*r4eHf{־C{$9_EWPj-jB]ǂ@u{mV,=C r0`1AՁ5Eb"^?;EW_N9z}ס4Vl!9FEs,Bw'o$mD E=GiGCf iRȆg{፨* @K3j$$IT+eMd^z}g-'㴡"Kx XKs8C)T7]Ǧ~wZ4 %:l G Ҽ:<2u7ёAJM<Ģc ϋYE.9Ml<:?[d[OW,H1c2DGɑylê#Ru]@rk5#Ҍh!ˤ!59/As Ca"4,+,= 6l#W ޟ?^cf<2~VLsDSG2aX~j/iqݫGˆV?e4`3߽MTvF@_[92&$K`U8rggH`QdfndIs/ȟ0aMO_1W3ޜQ1/UQwߌqVrLʦŸלhl6>y!QVp1oYO4!5kjڀ!=Î';ir@U MB}7dd3PqXʫLfZձn<(¨q leU|=i'_9gp7j/ ~hY͌er0 M/]Ŷ#p9|E"~$`,Fq8LCX|lW[Lpշއ0[h4cS4~6_~ ~LM3ELk:,C]1|j-L>(J|nj -ᬜ񣎉7{E s>zYըp9ZxY#^qBM-,<헄o☁-c!@쏗0[ULucs0‰dKHD=["1ylhWF\Y=3$j  1e > ֻgkX̱H$QroS3+jn4{^Y@ÉWҌfk^~5؏>G$='ix]ha oX4؝-oL4Sk3S6Y٬JFw!q4fO> cGq) q9_~ 4{?6v۶%M|UWD?f-v*·)w&?oZVa6goub0(fQ{;K^!,qٗܓr H΢@2'- " P:;!E>2Wd*`EFB(OMqv;5 J,Lkf!/G~X!dǍ@,r~I5boD7 6}D#Ne3)O@Gm-?aU)?ZC04ү$_zn= 2N$V秊s`ԹPzq<{gO'[{;`N[`^R0U"(Hs!?P0]W%1&z j7KXf‘0~H/LL"«aCt_y1np8LpOcJ!K =neDHmjJO$}ii~8>}bۇ, ,coy&ۊ&EX;O`ڏ{1E<*՟UgM!F h?\eK&<,l$.4 גt h~罉O q. =w闌̢?0t:x(YgE)̠OA􁱮&j^y^E^-cSh!#3SK)#.:7N*S[ A` YT?piNH[ogE}cFh[O=E맟QLXQ,ks;cJo6M98ǎIe!k6t<i5 cisjCaŴQ] O?~@-h$7Y Vf@#o|AFQw Dc1:0 6WЬID$9/FbJ^/[|=G*Һj/D%*N#ufuߩdϛKG6qԡGc vtR"<~J_`Ìh"J$#JtbӎbD~s@QGZLd utmp"P#ĔaTŤ c/;WfƞO:&z[WP{WnC1Tjؚ_ˉ_JniiBN ~– aɀ6Pf=/ 5HKi{M}zJШhC_4bZ; k0ҕ}hSwaXFZ4gbbĂ=:Vo`Gd-j0'FQ4(aFi2#MB/~Έ'cAoPq]nXgH%b(WODZAVD% ţגr`D\r]w]4{R&J'F0\)V뻼h*2 EhG•˜Ih0d͞JJn.HFCd^X7T1(p K^Tb@3a֧;3eԩa;3 ǀmKGH1`%[fgN"@VF^r1ZϗyS(;ay'³"C:\FG q5 ԫ%[gM3$dl46;lDF*M⑘8Yv,FSFĻ6VG{e*زVfz| HfjlJ=aɟ4ۑs>&FEMo#~è蜫qK7Bځ3Cd-F3|bsX7zG E}1m/VDic+#0$H=C.R.+Lj4€AB28 [Lӱ$&6.' 
BȂD,`nk/ao76qo7Ǝ.J; RGXqaf,̬SCT\dDZ(8CǓ1e_ػR";h7~jL&vSH&zi'^駬= &MJc9E{AUT~Uln!X^A%LlľH;sAb *b*цJB10GV3#zH5Zh@3y #UdjQ&b`qm+*f $#]J5Vj&IVLJ#UF[hFtɛ=g#Q(Mf+r$ËLp 9$ެ%ax2B g~*0[0Øg8}"Է:rezばOˆ54Lp]s}4{S`Ƅd.)I6d4xjz1^ &`CS"Q"{8uyܒ ϼKEůȍKt _,-e&Ǐes8c2i%E1$L{s2 8f,\qIDATVq Q (JH| ?u6f4z2cOru kh^pVj!4O/LcjU0%c~O'B:q R18`G lIY-tlKKP SyL ,833f˨[a] z|p3`bBSTr8s1Ï@[u5?1(m8228O{f ?"D$G;3ȜN %e0X,=Sz[̐+rЇJSzveX2UE. Qyx&)B`e'mV C p"y 7ds4_hR3gۥa$İ4oidYo^oBiHFjη8H+پx-@kM _tTeepg56DAA@{ܹeL~8B,@a}ΛKgCEų []Ș:**GC*#xl‚ 0<$8s 8kqEi cE;">#WU:YgIR3 0$83F҉l=9z_Rn{㖥-J bgݵH[rKI}Yܪ0/J aQè+f$Wo Yf)Pd.`O?CRL5Q?؞j{A4'$Gi琻 I40h%<7Y},~-<6_y}je\䓐ŐhTH /:D=;x~丄dqT\X4 @(hk#h$K0Rr1hR@Rh2qSplr:nqOd. .?73 ;hF@҈WO;y;rL:NM|rctT6&c?@Sb9yf?& P!T=qLqmjS\#<#a|_T<c殫'V-;`^OraE. U6{w)N3Pǭ91MdYsͫ, B@/"C@{0u)\}Տ D\ gVpSGdq4|bA*fTwղgi32VR4Fk*9dzdOq A``$8,&3n? S,} j8tS %\>}1l4=ELˑ6@A`$8Ǟ8?iCA$$qjYk޹wD Q[Ǧ`fD" 'A` $ܮ{օ4?1FIf19Ā#fm3~Xr%~2lPSZH`k Hĕ@S3~=q`9<{&k: ޻I=WM|(FXCe a"+r\**o~&yhuh3ݤt S}: ^}y_-ʼnB4̙ǴoHMMȄ낀 080c}[D&˚Bbs-Bw'>E{}H@nx)LN-N̜E 9g>lI@`ȋމ*PFH1Cka?q'AZ.)g1FAX -p IpZwvR~BdNvϛƘk"w\Usغk֜t:-#YkQOp$AEA@H4J5 `.ډiq+ q 81lxNPpŇyEi vz-D'0,QG#8ri2YIen \u,Nǐқge/ G"l=VfbTJ1F?r.@ (&;=uYwxǃ&&C+yKT`4rB.n>qW @@4&Lw\)'D?4 GA {q;/ >@e#E Y$l R|sqdI+?F֭w!KljM7Ñq A`q܎k`6JEaDIssx83ĺ KUOo Z P,~y\9hH.!?[}\ >B Fi3xu:.Ŵn8/ ȼ$ 5ζYeմ+Ìǟp7 1N1EFz2gX-$ H52GJjyIʎ>/6A@! q'6rK8*56FɓTh3P+!8R] 폣(^qiO1aF& '6Uo|ADWn4[&>4)NvCM{ˆN 궷>`)FJ%]>-0X1#uNHQI=¶JX/|^cHde҆- ِ= sxK 0hDZ~b44HfKm'*/t%A qc?faշ}'?&[/M1MzrSGLB@.H5uBƾq*d:ym1Oor0Z{Iqoup1=A@~Ge Y{>Ȝ6;)j!R KFFd29n2p2O4ss5=OۂZJ F6haf5?:TR32NA` q`7VUG_Q}ۅT#g͡ zJw,R)f7yV:S6o&2;x;N|~6ԅ_n ;)BK2Xt~qWKVgA@AC]z׎T~%]/e:/ V $#M) *MZ2Ʌ?aS̃ #fR6NIΜ)Rӝ1:]uD6lT92"n056Ҕ8zsț:IlNZcC[)dR'Nr$+۴Z8!-HJPc +%jh01D3_P&;2IM Q&Y J =6)I?|.JJAX! 
q@0}#Pz"uĸzVDG "YmL(ٲ$U P"b&%N#BdbdV 1Q"(!0R&zFGL3)cK(w2惦S0} Ծ^XA qgi> Y]GSC'*5hjJ$JY*C VPK^f8Q&ucIG?x]fRH)(88rpeedȢ-A@~xF#nİGUjFd_65#M2Q8جVTˁ13 [Z Nw]x,<>" ı@ 3 C@:lE$ IJ@cvp[Aq?² @☤'A@?B, ?S,3IENDB`HINGE-0.5.0/misc/param_description1.png000066400000000000000000000253541314415550300176050ustar00rootroot00000000000000PNG  IHDRF|fsRGB pHYsgR*IDATx TCR PYE"l-R`0(. 82l- #"`;l*P6el.,7r˽?9'g$' @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @@ .qQVFJYN$it4U1ru--/{\_W(iZ&LQϛHk-}תӾAr's.)jt䰏Jю}]+}OLZDy*9+%usR#z~DN!y`{4O;{.}$~S2Ӱ] @@5y-rVq~>N?i_4E궎i߄{D$\[JHĝ4v%9+s7YJ2?ϑKIiz̤ ~'ħK~F N4_hn֕> U&f~[` {+!lc5ݭ;L+H?ϥVئJ9G{r$j'i()ڙ3%qkU<*xmYi!/|n#6%o~pVCFhi?Wy1٤+*˙^l!E}+Y.}-Ery@p-GG-*jd1tSH7KyI~N;HG=M-n,=)ϊu' 6jJ IͻYtҷ$?(-lViwR\ي |Q B lN<sJmJuى%gXbZ1ZM&Hߐ[r?λs4Ǧ|h~H`N6Q72?[O'D}Qk;)~oIIz^ZI?v/%;^ 0Q_Ht(˓_c\t,om pU/Cy jQG*YC/K| +')s'!n.7Eɝ")eaSd U.GwidE/ Y{X_跱;q,w$= x @a|:Q-(m,SWJa]-$tyq-/G;Q~f+{F,-g,,(_H~7~ri7 9Ytg$K}% Jeo:L~iˮJ,z¾_"$yFץ3{}W6s˒~semm%0[DvYJ܍ki;iϐ.K>+]%͝%?8UHoGHMpOѱH:G J>z&lyttfQ4D|J+"qYޝʹ5'$?O5?l#'m(}zleKnU%}q_"~c)ɚr,T]ںm>p_<cIvVKiu.9;QGjA[ fw|OK\nRb~FZ[:XZE=~.'$ɝ[ܭ಑+~{h98O7Hve, ~od~jw|n`xiݙ( < ȏ%lRm_`~H;I}4bqJoW;DLǨ\1OE $v"I ֗cߨ5y M-ҳl=@.yw%)먅i~iuR͝4oPZ`N ,m,Ff ôϖ~/5%o[jy뷳8B(3ϕYReQe~* %V2 P=ʞ䑂~c% OK3MtgIքW;P $*1RLJcԕZJ SIN|3m%=_VxT+'7S=䱓tA?',%w_6c%;8zdSjzp;l')˱Mf{gɤ6P5mƙ;y{Y9/(C%O.,m%?vȑ_l'-C^g_.v&JF:=GeǴX]Jh.PYϑ֖,~9.IWl!P_viR(`<;(6 P:fӌyZދ 6J;OHKפǥ.DžrG}oOv=åRI^8VZxﴜ7ZK`?=[ @"3U)Th? ;F?7Pfa& @BNcZ\|]ʶ͗#  lI㥻%wBhRm - @`?l޸<؃ `c.~x5ݽ:+]M <7Y"hgI`?;j.,Y7 $Y %k @.YI?e_.qjϑv֚u;vsRKF Jwρ)5 <uHx<'vZXƳ;o}W=v%Y"hgI5a^ߵw!R<NI}Tɫ%RZQ,a P0Ko>̓=@Q:n_Zۑ[-# F;kYD,?iw.:RưқR5%h1TDU+}D֫ E7KåKJM_%.  yo'bc[c8M=,6NЎ%tԞn:Cʿ~}.˹֒qI>v`σWůbN0TE{]eM~#$@ 0 z+NZ0?j?~Z0yܼHBt8 @@'8ΡuѸЗ4B.jiAyWpj;>@(  @~6o`Q^]A6=X|åKD 1 @FyゞGh4enŻ!X&A tR @/l oODR]oGI7  @ #płZP!Uy<Zx`QV62@HgS5@ P>=̅s0|4x5yGWN& 5@5 @@`ͱ)5M o#Am >#IQNp@rp% :\i'5ZpM=yFpKS~#$@ jTl-ݳ\Q5}?t:Nӵ8xC-&@Ǩ@I  >z{ G-X%/C4i}qFQВ0 @'0s ;NYҥ'{y*=!@; 1j']҆ @z.tp0dI@Kn՞ jSG@C ɅL @@e#c4]n}@#D? 
?`Fo6G P F MV t!O h1C!ZZ6uF]8x@Bр`'S@:̅UMV],>涔8xC% @Ǩ@ @ W,,|Ic>{S48oCtA(:F%j @@ ̜C3XKnۥV[)ugoyJ0ygS5  @45$toQOϦռ }95@i 1*mP0@ώ FJ-GvM=zqpSu(  @X~Y~h|kZXnZ0qq:j. @@!z{wjҠ6#XdZwR2 #TP @EʅhkڞoL\ю2`(3*"B P-K j ~Z0yܼj՝Bz(,Z>IENDB`HINGE-0.5.0/parameter_description.md000066400000000000000000000115171314415550300172610ustar00rootroot00000000000000## Parameters used by HINGE All the parameters below can be set using the .ini file read by the HINGE programs. ###[filter] - length_threshold = 1000; // Minimum read length - aln_threshold = 2500; // Minimum alignment length between two reads to be considered when building graph - min_cov = 5; // Minimum coverage depth for a segment on a read to not be considered erroneous/chimeric - cut_off = 300; // When looking for chimeric segments, we look for coverage gaps on a read, after reducing all matches by cut_off in the beginning and in the end - theta = 300; // When classifying a match between two reads as a right/left overlap, internal match, etc., overhangs of length up to theta are ignored - use_qv = true; // Use qv scores provided by DAligner when creating the read masks (i.e., the part of the read that will actually be used for assembly) - coverage = true; // Use coverage values when creating the read masks. If both use_qv and coverage are set to true, an intersection of the two masks is taken. 
- coverage_frac_repeat_annotation = 3; - min_repeat_annotation_threshold = 10; - max_repeat_annotation_threshold = 20; // A repeat annotation is placed on the read at position i+reso if ``` |coverage[i]-coverage[i+reso]| > min( max( coverage[i+reso]/coverage_frac_repeat_annotation, min_repeat_annotation_threshold), max_repeat_annotation_threshold) ``` - repeat_annotation_gap_threshold = 300; // How far two hinges of the same type can be on a read - no_hinge_region = 500; // Hinges cannot be placed within no_hinge_region of the start and end of the read - hinge_min_support = 7; // Minimum number of reads that have to start in a `reso` (default 40) length interval to be considered in hinge calling - hinge_unbridged = 6; // Number of reads that one has to see before a pileup to declare a potential hinge unbridged - hinge_bin = 100; // Physical length of the bins considered - hinge_tolerance_length = 100; // Matches starting within hinge_tolerance_length of a hinge are considered to be starting at the hinge ###[running] - n_proc = 12; // number of CPUs for layout step ###[layout] - hinge_tolerance = 150; // This is how far an overlap must start from a hinge to be considered an internal overlap. - hinge_slack = 1000; // This is the amount by which a forward overlap must be longer than a forward internal overlap to be preferred while building a graph. 
- matching_hinge_slack = 200; // We identify two in-hinges (out-hinges) on two different reads as corresponding to the same repeat event, if the reads match in the repeat part, and the two hinges are within matching_hinge_slack of each other - min_connected_component_size = 8; // In order to actually add a hinge to a read, we require that at least min_connected_component_size reads have a repeat annotation and they are all identified as the beginning (or end) of the same repeat - kill_hinge_overlap = 300; - kill_hinge_internal = 40; // When filtering hinges (so that only one in-hinge and one out-hinge are left for each reapeat), we kill an in-hinge (out-hinge) if there is a forward (backward) extension read that starts at least kill_hinge_overlap before (after) the hinge, or if there is a forward_internal (backward_internal) extension read that starts at most kill_hinge_internal after (before) the hinge, as illustrated below. - num_events_telomere = 7; - del_telomeres = 0; // If set to 1, any read with more than num_events_telomere repeat annotations will be classified as a telomere read and will be deleted. - aggressive_pruning = 0; //If set to 1, the pruning will be more aggressive. We recommend it be set to 1 for large genome. - use_two_matches = 1; // Allow the HINGE algorithm to consider the top two matches between a pair of reads (as opposed to just the longest match) ###[draft] - tspace = 900; //space between new "trace points" - step = 50; ###[consensus] - min_length = 4000; // Minimal length of reads used for final consensus - trim_end = 200; // Trim ends for alignments for final consensus - best_n = 1; // If one read has multiple alignments with the bacbone assembly, choose the longest n segments for consensus. 
- quality_threshold = 0.23; // alignment quality threshold HINGE-0.5.0/scripts/000077500000000000000000000000001314415550300140365ustar00rootroot00000000000000HINGE-0.5.0/scripts/Visualise_graph.py000077500000000000000000000032651314415550300175460ustar00rootroot00000000000000#!/usr/bin/env python # In[1]: import networkx as nx import sys # In[2]: if len(sys.argv) >2: print "wrong usage.\n python Visualise_graph.py graph_edge_file [list_of_hinges]" vertices=set() with open (sys.argv[1]) as f: for lines in f: lines1=lines.split() #print lines1 if len(lines1) < 5: continue #vertices.add(lines1[0]) #vertices.add(str(lines1[1])) #vertices.add(str(lines1[0])+"_" + lines1[3]) #vertices.add(str(lines1[1])+"_" + lines1[4]) # In[3]: len(vertices) # In[4]: G = nx.DiGraph() for vertex in vertices: G.add_node(vertex) # In[5]: with open (sys.argv[1]) as f: for lines in f: lines1=lines.split() print lines1 if len(lines1) < 5: continue #print lines1 G.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4], hinge_edge=int(lines1[5])) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=int(lines1[5])) try: in_hinges = set() out_hinges = set() with open (sys.argv[2]) as f: for lines in f: lines1=lines.split() if lines1[2] == '1': in_hinges.add(lines1[0]+'_0') out_hinges.add(lines1[0]+'_1') elif lines1[2] == '-1': in_hinges.add(lines1[0]+'_1') out_hinges.add(lines1[0]+'_0') for node in G.nodes(): if node in in_hinges and node in out_hinges: G.node[node]['hinge']=100 elif node in in_hinges: G.node[node]['hinge']=10 elif node in out_hinges: G.node[node]['hinge']=-10 else: G.node[node]['hinge']=0 except: pass nx.write_graphml(G, './out.graphml') # In[ ]: HINGE-0.5.0/scripts/add_groundtruth.py000077500000000000000000000034201314415550300176070ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys graphml_file = sys.argv[1] groundtruth_file = sys.argv[2] graphml_file_w_groundtruth = sys.argv[3] try: 
chromosome_to_consider= int(sys.argv[4]) except: chromosome_to_consider=None g = nx.read_graphml(graphml_file) print nx.info(g) mapping_dict = {} with open(groundtruth_file,'r') as f: for num, line in enumerate(f.readlines()): m = map(int, line.strip().split()) # mapping_dict[num] = [min(m), max(m), int(m[0]>m[1])] mapping_dict[num] = [m[2],m[3],m[1]] #print mapping_dict max_len=0 for num in mapping_dict.keys(): max_len=max(max_len,len(str(m[3]))) pow_mov=10**(max_len+1) for node in g.nodes(): #print node try: nodeid = int(node.split('_')[0]) #print nodeid rev = int(node.split('_')[1]) if chromosome_to_consider != None: g.node[node]['chromosome'] = 0 if mapping_dict[nodeid][2]==chromosome_to_consider: g.node[node]['chromosome'] = mapping_dict[nodeid][2]+1 else: g.node[node]['chromosome'] = mapping_dict[nodeid][2]+1 if rev == 0: g.node[node]['aln_end'] = mapping_dict[nodeid][2]*pow_mov+ mapping_dict[nodeid][1] g.node[node]['aln_start'] = mapping_dict[nodeid][2]*pow_mov + mapping_dict[nodeid][0] # g.node[node]['aln_strand'] = mapping_dict[nodeid][2] else: g.node[node]['aln_end'] = mapping_dict[nodeid][2]*pow_mov + mapping_dict[nodeid][1] g.node[node]['aln_start'] = mapping_dict[nodeid][2]*pow_mov+ mapping_dict[nodeid][0] # g.node[node]['aln_strand'] = 1-mapping_dict[nodeid][2] except: pass nx.write_graphml(g, graphml_file_w_groundtruth) HINGE-0.5.0/scripts/add_groundtruth_json.py000066400000000000000000000027251314415550300206440ustar00rootroot00000000000000import networkx as nx import sys import json graphml_file = sys.argv[1] groundtruth_file = sys.argv[2] graphml_file_w_groundtruth = sys.argv[3] g = nx.read_graphml(graphml_file) print nx.info(g) with open(groundtruth_file) as f: read_dict=json.load(f) max_len=0 for read in read_dict: for aln_info in read_dict[read]: try: max_len=max(max_len,len(str(aln_info[0]))) max_len=max(max_len,len(str(aln_info[1]))) except: print raise pow_mov=10**(max_len+1) for node in g.nodes(): #print node nodeid = node.split('_')[0] 
#print nodeid rev = int(node.split('_')[1]) if rev==1: nodeid+="'" if nodeid in read_dict: g.node[node]['chr'] = read_dict[nodeid][0][2] g.node[node]['aln_end'] = pow_mov*read_dict[nodeid][0][2]+max(read_dict[nodeid][0][0],read_dict[nodeid][0][1]) # g.node[node]['aln_start'] = pow_mov*read_dict[nodeid][0][2]+min(read_dict[nodeid][0][0],read_dict[nodeid][0][1]) # g.node[node]['repeat']=0 # if len (read_dict[nodeid]) >1 : # g.node[node]['repeat']=1 # chrom_maps=set([aln[3] for aln in read_dict[nodeid]]) # if len (chrom_maps) > 1: # g.node[node]['repeat']=10 else: g.node[node]['chr'] = -1 g.node[node]['aln_end'] = -1 # g.node[node]['aln_start'] = -1 # g.node[node]['repeat']=-1 nx.write_graphml(g, graphml_file_w_groundtruth) HINGE-0.5.0/scripts/clip_ends.py000066400000000000000000000016461314415550300163570ustar00rootroot00000000000000import sys chr_lengths={} ground_truth=sys.argv[1] graph_file=sys.argv[2] out_file=sys.argv[2]+'.clipped' with open(ground_truth) as f: for line in f: m = map(int, line.strip().split()) chr_lengths.setdefault(m[1],0) chr_lengths[m[1]]= max(chr_lengths[m[1]], max(m[2],m[3])) CHR_THR=20000 reads_to_kill=set() with open(ground_truth) as f: for line in f: m = map(int, line.strip().split()) read_left=min(m[2],m[3]) read_right=max(m[2],m[3]) read_chr=m[1] if read_left < CHR_THR: reads_to_kill.add(m[0]) if read_right > chr_lengths[read_chr] - CHR_THR: reads_to_kill.add(m[0]) with open(graph_file) as f: with open(out_file, 'w') as g: for line in f: line1=line.split() if int(line1[0])in reads_to_kill or int(line1[1]) in reads_to_kill: continue g.write(line) HINGE-0.5.0/scripts/compute_n50_from_draft.py000066400000000000000000000050631314415550300207550ustar00rootroot00000000000000import sys import os import networkx as nx from Bio import SeqIO def comp_n50(contig_vec): if len(contig_vec) == 0: return 0 sorted_lengths = sorted(contig_vec) total_length = sum(contig_vec) half_length = 0.5*total_length min_n50 = sorted_lengths[-1] max_n50 = 0 for i in 
range(len(sorted_lengths)): sum_1 = sum(sorted_lengths[0:i+1]) sum_2 = sum(sorted_lengths[i:]) if sum_1 >= half_length and sum_2 >= half_length: min_n50 = min(sorted_lengths[i],min_n50) max_n50 = max(sorted_lengths[i],max_n50) return 0.5*(min_n50+max_n50) hinging_n50 = -1 hinging_comp_n50 = -1 hgap_n50 = -1 count = 0 count1 = 0 count2 = 0 data_dict = {} fullpath = '/data/pacbio_assembly/pb_data/NCTC/' for nctc_name in os.listdir(fullpath): if 'NCTC' not in nctc_name: continue mypath = fullpath+nctc_name if not os.path.isdir(mypath): continue mypath = mypath+'/' count += 1 hinging_n50 = -1 hinging_comp_n50 = -1 hgap_n50 = -1 data_dict[nctc_name] = [] draft_file = [x for x in os.listdir(mypath) if 'draft.graphml' in x] try: # flname = sys.argv[1] g = nx.read_graphml(mypath+draft_file[0]) contig_lengths = [] component_lengths = [] for u in g.nodes(): contig_lengths.append(len(g.node[u]['segment'])) for c in nx.weakly_connected_components(g): # we use set() so that we cannot double-count a two reverse complementary contigs # in the same component component_lengths.append(sum(set([len(g.node[u]['segment']) for u in c]))) component_lengths = set(component_lengths) hinging_n50 = comp_n50(contig_lengths) hinging_comp_n50 = comp_n50(component_lengths) count1+=1 except: pass # print "contig n50: "+str(comp_n50(contig_lengths)) # print "component n50: "+str(comp_n50(component_lengths)) hgap_file = [x for x in os.listdir(mypath) if 'hgap.fasta' in x] try: hgap_file = hgap_file[0] hgap_contigs = [len(x) for x in SeqIO.parse(open(mypath+hgap_file),'fasta')] hgap_n50 = comp_n50(hgap_contigs) count2+=1 except: pass with open(mypath+nctc_name+'.n50','w') as f: f.write('hinging'+'\t'+str(hinging_n50)+'\n') f.write('hinging_comp'+'\t'+str(hinging_comp_n50)+'\n') f.write('hgap'+'\t'+str(hgap_n50)+'\n') data_dict[nctc_name] = [hinging_n50,hinging_comp_n50,hgap_n50] print count print count1 print count2 with open(fullpath+'computed.n50','w') as f: for nctc_name in data_dict: vec = 
data_dict[nctc_name] f.write(nctc_name+'\t'+str(vec[0])+'\t'+str(vec[1])+'\t'+str(vec[2])+'\n') HINGE-0.5.0/scripts/condense_graph.py000077500000000000000000000125211314415550300173730ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter def merge_simple_path(g): for node in g.nodes(): #print g.in_degree(node), g.out_degree(node) if g.in_degree(node) == 1 and g.out_degree(node) == 1: in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: #print in_node, node, out_node merge_path(g,in_node,node,out_node) def merge_two_nodes(g): for node in g.nodes(): if g.in_degree(node) == 1 and g.out_degree(node) == 0: in_node = g.in_edges(node)[0][0] if g.out_degree(in_node) == 1: if in_node != node: node_id = g.graph['aval'] g.graph['aval'] += 1 g.add_node(str(node_id), count = g.node[in_node]['count'] + g.node[node]['count'], read = g.node[in_node]['read'] + '_' + g.node[node]['read'], #aln_chr = g.node[node]['aln_chr'] ) g.remove_node(in_node) g.remove_node(node) def merge_path(g,in_node,node,out_node): #ov1 = find_overlap(g.node[in_node]['bases'], g.node[node]['bases']) #ov2 = find_overlap(g.node[node]['bases'], g.node[out_node]['bases']) node_id = g.graph['aval'] g.graph['aval'] += 1 #length = g.node[node]['length'] + g.node[in_node]['length'] + g.node[out_node]['length'] - ov1 - ov2 #cov = (g.node[in_node]['cov'] * g.node[in_node]['length'] + g.node[node]['cov'] * g.node[node]['length'] + \ #g.node[out_node]['cov'] * g.node[out_node]['length'])/float(length) #bases = g.node[in_node]['bases'][:-ov1] + g.node[node]['bases'] + g.node[out_node]['bases'][ov2:] g.add_node(str(node_id), count = g.node[in_node]['count'] + g.node[node]['count'] + g.node[out_node]['count'], read = g.node[in_node]['read'] + '_' + g.node[node]['read'] + '_' +g.node[out_node]['read'], #aln_chr = 
g.node[node]['aln_chr'] ) #g.add_node(str(node_id)+'-', bases = reverse_comp_bases(bases), length = length, cov = cov) for edge in g.in_edges(in_node): g.add_edge(edge[0],str(node_id)) for edge in g.out_edges(out_node): g.add_edge(str(node_id),edge[1]) g.remove_node(in_node) g.remove_node(node) g.remove_node(out_node) def input1(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 if len(lines1) < 5: continue #print lines1 g.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4], hinge_edge=int(lines1[5])) g.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=int(lines1[5])) return g def input2(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 g.add_edge(lines1[0], lines1[1]) return g def run(filename, n_iter): f=open(filename) line1=f.readline() print line1 f.close() if len(line1.split()) !=2: g=input1(filename) else: g=input2(filename) print nx.info(g) for node in g.nodes(): g.node[node]['count'] = 1 g.node[node]['read'] = node degree_sequence=sorted(g.degree().values(),reverse=True) print Counter(degree_sequence) for i in range(n_iter): for node in g.nodes(): if g.in_degree(node) == 0: g.remove_node(node) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) g.graph['aval'] = 1000000000 for i in range(5): merge_simple_path(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) try: import ujson mapping = ujson.load(open(filename.split('.')[0]+'.mapping.json')) print 'get mapping' for node in g.nodes(): #print node if mapping.has_key(node): g.node[node]['aln_start'] = mapping[node][0] g.node[node]['aln_end'] = mapping[node][1] g.node[node]['aln_strand'] = mapping[node][2] else: g.node[node]['aln_start'] = 0 
g.node[node]['aln_end'] = 0 g.node[node]['aln_strand'] = 0 except: pass nx.write_graphml(g, filename.split('.')[0]+'_condensed.graphml') print nx.number_weakly_connected_components(g) print nx.number_strongly_connected_components(g) filename = sys.argv[1] run(filename, 5) HINGE-0.5.0/scripts/condense_graph_and_annotate.py000077500000000000000000000153041314415550300221100ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter import json LENGTH_THRESHOLD=10 #Connected components with less than LENGTH_THRESHOLD reads are thrown away def merge_simple_path(g): for node in g.nodes(): #print g.in_degree(node), g.out_degree(node) if g.in_degree(node) == 1 and g.out_degree(node) == 1: in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: merge_path(g,in_node,node,out_node) def merge_two_nodes(g): for node in g.nodes(): if g.in_degree(node) == 1 and g.out_degree(node) == 0: in_node = g.in_edges(node)[0][0] if g.out_degree(in_node) == 1: if in_node != node: node_id = g.graph['aval'] g.graph['aval'] += 1 g.add_node(str(node_id), count = g.node[in_node]['count'] + g.node[node]['count'], read = g.node[in_node]['read'] + ':' + g.node[node]['read'], #aln_chr = g.node[node]['aln_chr'] ) g.remove_node(in_node) g.remove_node(node) def merge_path(g,in_node,node,out_node): #ov1 = find_overlap(g.node[in_node]['bases'], g.node[node]['bases']) #ov2 = find_overlap(g.node[node]['bases'], g.node[out_node]['bases']) node_id = g.graph['aval'] g.graph['aval'] += 1 #length = g.node[node]['length'] + g.node[in_node]['length'] + g.node[out_node]['length'] - ov1 - ov2 #cov = (g.node[in_node]['cov'] * g.node[in_node]['length'] + g.node[node]['cov'] * g.node[node]['length'] + \ #g.node[out_node]['cov'] * g.node[out_node]['length'])/float(length) #bases = g.node[in_node]['bases'][:-ov1] + 
g.node[node]['bases'] + g.node[out_node]['bases'][ov2:] g.add_node(str(node_id), count = g.node[in_node]['count'] + g.node[node]['count'] + g.node[out_node]['count'], read = g.node[in_node]['read'] + ':' + g.node[node]['read'] + ':' +g.node[out_node]['read'], #aln_chr = g.node[node]['aln_chr'] ) #g.add_node(str(node_id)+'-', bases = reverse_comp_bases(bases), length = length, cov = cov) #print g.node[str(node_id)]['chr'] for edge in g.in_edges(in_node): g.add_edge(edge[0],str(node_id)) for edge in g.out_edges(out_node): g.add_edge(str(node_id),edge[1]) g.remove_node(in_node) g.remove_node(node) g.remove_node(out_node) def input1(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 if len(lines1) < 5: continue #print lines1 g.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4], hinge_edge=int(lines1[5])) g.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=int(lines1[5])) return g def input2(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 g.add_edge(lines1[0], lines1[1]) return g def run(filename, gt_file, n_iter): f=open(filename) line1=f.readline() print line1 f.close() if len(line1.split()) !=2: g=input1(filename) else: g=input2(filename) print str(len(g.nodes())) + " vertices in graph to begin with." connected_components=[x for x in nx.weakly_connected_components(g)] for component in connected_components: if len(component) < 10: g.remove_nodes_from(component) print str(len(g.nodes())) + " vertices in graph after removing components of at most "+str(LENGTH_THRESHOLD)+ " nodes." 
read_to_chr_map={} if gt_file.split('.')[-1]=='json': with open(gt_file,'r') as f: tmp_map=json.load(f) for read in tmp_map: readid=int(read.strip("'")) read_to_chr_map[readid] = int(tmp_map[read][0][2]) else: with open(gt_file,'r') as f: for num, line in enumerate(f.readlines()): m = map(int, line.strip().split()) read_to_chr_map[m[0]]=m[1] nodes_seen=set([x.split("_")[0] for x in g.nodes()]) for node in nodes_seen: read_to_chr_map.setdefault(int(node),-1) #print nx.info(g) print "Num reads read : "+str(len(read_to_chr_map)) for node in g.nodes(): nodeid=int(node.split('_')[0]) g.node[node]['count'] = 1 g.node[node]['read'] = node #print str(nodeid), node,g.node[node]['chr'] degree_sequence=sorted(g.degree().values(),reverse=True) print Counter(degree_sequence) for i in range(n_iter): for node in g.nodes(): if g.in_degree(node) == 0: g.remove_node(node) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) g.graph['aval'] = 1000000000 for i in range(5): merge_simple_path(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) h=nx.DiGraph() h.add_nodes_from(g) h.add_edges_from(g.edges()) for node in g.nodes(): reads_in_node=[int(x.split('_')[0]) for x in g.node[node]['read'].split(':')] try: chr_in_node=map(lambda x: read_to_chr_map[x], reads_in_node) except: print reads_in_node,g.node[node]['read'] return chr_in_node_set=set(chr_in_node) if len(chr_in_node_set) ==1: h.node[node]['chr']=chr_in_node[0] else: h.node[node]['chr']=':'.join(map(str,chr_in_node)) h.node[node]['count']=g.node[node]['count'] try: h.node[node]['read']=g.node[node]['read'] except: pass nx.write_graphml(h, filename.split('.')[0]+'_condensed_annotated.graphml') print nx.number_weakly_connected_components(h) print nx.number_strongly_connected_components(h) # filename = sys.argv[1] gt_file=sys.argv[2] run(filename, 
gt_file,5) HINGE-0.5.0/scripts/condense_graph_annotate_clip_ends.py000077500000000000000000000211521314415550300233040ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter def merge_simple_path(g): for node in g.nodes(): #print g.in_degree(node), g.out_degree(node) if g.in_degree(node) == 1 and g.out_degree(node) == 1: in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: merge_path(g,in_node,node,out_node) def merge_two_nodes(g): for node in g.nodes(): if g.in_degree(node) == 1 and g.out_degree(node) == 0: in_node = g.in_edges(node)[0][0] if g.out_degree(in_node) == 1: if in_node != node: node_id = g.graph['aval'] g.graph['aval'] += 1 g.add_node(str(node_id), count = g.node[in_node]['count'] + g.node[node]['count'], read = g.node[in_node]['read'] + ':' + g.node[node]['read'], #aln_chr = g.node[node]['aln_chr'] ) g.remove_node(in_node) g.remove_node(node) def merge_path(g,in_node,node,out_node): #ov1 = find_overlap(g.node[in_node]['bases'], g.node[node]['bases']) #ov2 = find_overlap(g.node[node]['bases'], g.node[out_node]['bases']) node_id = g.graph['aval'] g.graph['aval'] += 1 #length = g.node[node]['length'] + g.node[in_node]['length'] + g.node[out_node]['length'] - ov1 - ov2 #cov = (g.node[in_node]['cov'] * g.node[in_node]['length'] + g.node[node]['cov'] * g.node[node]['length'] + \ #g.node[out_node]['cov'] * g.node[out_node]['length'])/float(length) #bases = g.node[in_node]['bases'][:-ov1] + g.node[node]['bases'] + g.node[out_node]['bases'][ov2:] g.add_node(str(node_id), count = g.node[in_node]['count'] + g.node[node]['count'] + g.node[out_node]['count'], read = g.node[in_node]['read'] + ':' + g.node[node]['read'] + ':' +g.node[out_node]['read'], #aln_chr = g.node[node]['aln_chr'] ) #g.add_node(str(node_id)+'-', bases = reverse_comp_bases(bases), length = 
length, cov = cov) #print g.node[str(node_id)]['chr'] for edge in g.in_edges(in_node): g.add_edge(edge[0],str(node_id),st_pc=g.edge[edge[0]][edge[1]]['st_pc'],end_pc=g.edge[edge[0]][edge[1]]['end_pc']) for edge in g.out_edges(out_node): g.add_edge(str(node_id),edge[1],st_pc=g.edge[edge[0]][edge[1]]['st_pc'],end_pc=g.edge[edge[0]][edge[1]]['end_pc']) g.remove_node(in_node) g.remove_node(node) g.remove_node(out_node) def input1(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 if len(lines1) < 5: continue #print lines1 g.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4], hinge_edge=int(lines1[5])) g.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=int(lines1[5])) return g def input2(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 g.add_edge(lines1[0], lines1[1]) return g def run(filename, gt_file, n_iter): f=open(filename) line1=f.readline() print line1 f.close() if len(line1.split()) !=2: g=input1(filename) else: g=input2(filename) read_to_chr_map={} pos_dict = {} mapping_dict = {} chr_lengths = {} for chr in range(14): chr_lengths[chr] = 1000 with open(gt_file,'r') as f: for num, line in enumerate(f.readlines()): m = map(int, line.strip().split()) # mapping_dict[num] = [min(m), max(m), int(m[0]>m[1])] read_to_chr_map[m[0]]= str(m[1]) mapping_dict[num] = m[1] pos_dict[num] = [min(m[2],m[3]),max(m[2],m[3])] # pos_dict[num] = [m[2],m[3],int(m[2]>m[3])] chr_lengths[m[1]] = max(chr_lengths[m[1]],max(m[2],m[3])) print nx.info(g) print "Chromosome lenghts:" print chr_lengths margin = 10000 del_count = 0 #print nx.info(g) print "Num reads read : "+str(len(read_to_chr_map)) for cur_edge in g.edges(): node0=int(cur_edge[0].split('_')[0]) node1=int(cur_edge[1].split('_')[0]) # g.edge[cur_edge[0]][cur_edge[1]]['st_pc'] = "{0:.2f}".format(1.0*pos_dict[node0][1]/chr_lengths[mapping_dict[node0]]) # 
g.edge[cur_edge[0]][cur_edge[1]]['end_pc'] = "{0:.2f}".format(1.0*pos_dict[node1][0]/chr_lengths[mapping_dict[node1]]) # st_pc is the "start percentage"; i.e., the percent location of edge[0] on its original chromosome # end_pc is the "end percentage"; i.e., the percent location of edge[1] on its original chromosome g.edge[cur_edge[0]][cur_edge[1]]['st_pc'] = 1.0*pos_dict[node0][1]/chr_lengths[mapping_dict[node0]] g.edge[cur_edge[0]][cur_edge[1]]['end_pc'] = 1.0*pos_dict[node1][0]/chr_lengths[mapping_dict[node1]] for node in g.nodes(): nodeid=int(node.split('_')[0]) if pos_dict[nodeid][0] < margin: g.remove_node(node) del_count += 1 continue if pos_dict[nodeid][1] > chr_lengths[mapping_dict[nodeid]] - margin: g.remove_node(node) del_count += 1 continue g.node[node]['count'] = 1 g.node[node]['read'] = node #print str(nodeid), node,g.node[node]['chr'] print "Deleted nodes: "+str(del_count) degree_sequence=sorted(g.degree().values(),reverse=True) print Counter(degree_sequence) for i in range(n_iter): for node in g.nodes(): if g.in_degree(node) == 0: g.remove_node(node) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) g.graph['aval'] = 1000000000 for i in range(5): merge_simple_path(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) h=nx.DiGraph() h.add_nodes_from(g) h.add_edges_from(g.edges()) for cur_edge in h.edges(): h.edge[cur_edge[0]][cur_edge[1]]['st_pc'] = g.edge[cur_edge[0]][cur_edge[1]]['st_pc'] h.edge[cur_edge[0]][cur_edge[1]]['end_pc'] = g.edge[cur_edge[0]][cur_edge[1]]['end_pc'] # h = g.copy() for node in g.nodes(): reads_in_node=[int(x.split('_')[0]) for x in g.node[node]['read'].split(':')] try: chr_in_node=map(lambda x: read_to_chr_map[x], reads_in_node) except: print reads_in_node,g.node[node]['read'] return chr_in_node_set=set(chr_in_node) if len(chr_in_node_set) 
==1: h.node[node]['chr']=chr_in_node[0] else: h.node[node]['chr']= ':'.join(chr_in_node) h.node[node]['count']=g.node[node]['count'] try: h.node[node]['read']=g.node[node]['read'] except: pass try: import ujson mapping = ujson.load(open(filename.split('.')[0]+'.mapping.json')) print 'get mapping' for node in h.nodes(): #print node if mapping.has_key(node): h.node[node]['aln_start'] = mapping[node][0] h.node[node]['aln_end'] = mapping[node][1] h.node[node]['aln_strand'] = mapping[node][2] else: h.node[node]['aln_start'] = 0 h.node[node]['aln_end'] = 0 h.node[node]['aln_strand'] = 0 except: pass nx.write_graphml(h, filename.split('.')[0]+'_condensed_annotated.graphml') nx.write_graphml(g, filename.split('.')[0]+'_G_condensed_annotated.graphml') print nx.number_weakly_connected_components(h) print nx.number_strongly_connected_components(h) # filename = sys.argv[1] gt_file=sys.argv[2] run(filename, gt_file,5) HINGE-0.5.0/scripts/condense_graph_create_gfa_compute_n50.py000077500000000000000000000157741314415550300237660ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter # This script condenses the graph down, creates a gfa with for the condensed graph, and computes the contig N50 # python condense_graph_create_gfa_compute_n50.py ecoli.edges # The conditions in lines 23 and 24 are meant to prevent nodes corresponding to different strands to be merged # (and should be commented out if this is not desired, or if a json is not available) def merge_simple_path(g): for node in g.nodes(): if g.in_degree(node) == 1 and g.out_degree(node) == 1: in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: if g.node[in_node]['aln_strand']==g.node[node]['aln_strand'] or max(g.node[in_node]['aln_strand'],g.node[node]['aln_strand']) == 5: if 
g.node[out_node]['aln_strand']==g.node[node]['aln_strand'] or max(g.node[out_node]['aln_strand'],g.node[node]['aln_strand']) == 5: #print in_node, node, out_node merge_path(g,in_node,node,out_node) def merge_path(g,in_node,node,out_node): #ov1 = find_overlap(g.node[in_node]['bases'], g.node[node]['bases']) #ov2 = find_overlap(g.node[node]['bases'], g.node[out_node]['bases']) node_id = g.graph['aval'] g.graph['aval'] += 1 #length = g.node[node]['length'] + g.node[in_node]['length'] + g.node[out_node]['length'] - ov1 - ov2 #cov = (g.node[in_node]['cov'] * g.node[in_node]['length'] + g.node[node]['cov'] * g.node[node]['length'] + \ #g.node[out_node]['cov'] * g.node[out_node]['length'])/float(length) #bases = g.node[in_node]['bases'][:-ov1] + g.node[node]['bases'] + g.node[out_node]['bases'][ov2:] overlap1 = g.edge[in_node][node][0]['overlap'] overlap2 = g.edge[node][out_node][0]['overlap'] length0 = g.node[in_node]['length'] length1 = g.node[node]['length'] length2 = g.node[out_node]['length'] if overlap1 > min(length0,length1): print "problem here:" print overlap1, length0, length1 g.add_node(str(node_id),length = length0+length1+length2 - overlap1 - overlap2, aln_strand = g.node[node]['aln_strand']) #g.add_node(str(node_id)+'-', bases = reverse_comp_bases(bases), length = length, cov = cov) for cur_edge in g.in_edges(in_node): # print g.edge[cur_edge[0]][cur_edge[1]][0]['overlap'] g.add_edge(cur_edge[0],str(node_id),overlap = g.edge[cur_edge[0]][cur_edge[1]][0]['overlap']) for cur_edge in g.out_edges(out_node): g.add_edge(str(node_id),cur_edge[1],overlap = g.edge[cur_edge[0]][cur_edge[1]][0]['overlap']) g.remove_node(in_node) g.remove_node(node) g.remove_node(out_node) def comp_n50(contig_vec): if len(contig_vec) == 0: return 0 sorted_lengths = sorted(contig_vec) total_length = sum(contig_vec) half_length = 0.5*total_length min_n50 = sorted_lengths[-1] max_n50 = 0 for i in range(len(sorted_lengths)): #if len(sorted_lengths) % 2 == 0: # sum_1 = 
sum(sorted_lengths[0:i]) # sum_2 = sum(sorted_lengths[i:]) #else: # sum_1 = sum(sorted_lengths[0:i+1]) # sum_2 = sum(sorted_lengths[i:]) sum_1 = sum(sorted_lengths[0:i+1]) sum_2 = sum(sorted_lengths[i:]) if sum_1 >= half_length and sum_2 >= half_length: min_n50 = min(sorted_lengths[i],min_n50) max_n50 = max(sorted_lengths[i],max_n50) # print "Min N50: "+str(min_n50) # print "Max N50: "+str(max_n50) return 0.5*(min_n50+max_n50) def de_clip(filename, n_iter): g = nx.MultiDiGraph() # count = 0 with open(filename,'r') as f: for line in f.xreadlines(): l = line.strip().split() #print l2 g.add_edge(l[0],l[1],overlap=int(l[2])/2) # if count < 10: # print l[0], l[1], l[2] # count += 1 node0start = int(l[7][1:]) node0end = int(l[8][:-1]) g.node[l[0]]['length'] = node0end - node0start node1start = int(l[9][1:]) node1end = int(l[10][:-1]) g.node[l[1]]['length'] = node1end - node1start print nx.info(g) try: import ujson mapping = ujson.load(open(filename.split('.')[0]+'.mapping.json')) # print mapping print 'get mapping' for node in g.nodes(): #print node if mapping.has_key(node): # alnstart = int(mapping[node][0]) # alnend = int(mapping[node][1]) # g.node[node]['length'] = abs(alnend-alnstart) # print abs(alnend-alnstart) g.node[node]['aln_strand'] = mapping[node][3] # g.node[node]['aln_start'] = mapping[node][0] # g.node[node]['aln_end'] = mapping[node][1] # g.node[node]['aln_strand'] = mapping[node][2] else: # g.node[node]['length'] = 5000 g.node[node]['aln_strand'] = 5 # print "this happened" # g.node[node]['aln_start'] = 0 # g.node[node]['aln_end'] = 0 # g.node[node]['aln_strand'] = 0 except: pass degree_sequence=sorted(g.degree().values(),reverse=True) print Counter(degree_sequence) for i in range(n_iter): for node in g.nodes(): if g.degree(node) < 2: g.remove_node(node) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) 
g.graph['aval'] = 1000000000 for i in range(5): merge_simple_path(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) nx.write_graphml(g, filename.split('.')[0]+'.graphml') print nx.number_weakly_connected_components(g) print nx.number_strongly_connected_components(g) # Next we create the gfa file outputfile = filename.split('.')[0]+'.gfa' with open(outputfile, 'w') as fout: for cur_node in g.nodes(): node_length = g.node[cur_node]['length'] node_str = 'A'*node_length node_str = node_str + '\n' fout.write("NODE "+str(cur_node)+' 0 0 0 0 0\n') fout.write(node_str) fout.write(node_str) # print "NODE "+str(node) for arc in g.edges(): fout.write("ARC "+str(arc[0])+' '+str(arc[1])+' 0\n') # Compute N50 contig_lengths = [] for cur_node in g.nodes(): contig_lengths.append(g.node[cur_node]['length']) print "N50 = "+str(comp_n50(contig_lengths)) filename = sys.argv[1] de_clip(filename, 5) HINGE-0.5.0/scripts/condense_graph_with_gt.py000077500000000000000000000145721314415550300211300ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter def merge_simple_path(g): for node in g.nodes(): #print g.in_degree(node), g.out_degree(node) if g.in_degree(node) == 1 and g.out_degree(node) == 1: in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: if g.node[in_node]['chr']==g.node[node]['chr'] and g.node[out_node]['chr']==g.node[node]['chr']: #print g.node[in_node]['chr'],g.node[node]['chr'],g.node[out_node]['chr'] merge_path(g,in_node,node,out_node) def merge_two_nodes(g): for node in g.nodes(): if g.in_degree(node) == 1 and g.out_degree(node) == 0: in_node = g.in_edges(node)[0][0] if g.out_degree(in_node) == 1: if in_node != node: node_id = g.graph['aval'] g.graph['aval'] += 1 g.add_node(str(node_id), chr=g.node[node]['chr'], count = 
g.node[in_node]['count'] + g.node[node]['count'], read = g.node[in_node]['read'] + '_' + g.node[node]['read'], #aln_chr = g.node[node]['aln_chr'] ) g.remove_node(in_node) g.remove_node(node) def merge_path(g,in_node,node,out_node): #ov1 = find_overlap(g.node[in_node]['bases'], g.node[node]['bases']) #ov2 = find_overlap(g.node[node]['bases'], g.node[out_node]['bases']) node_id = g.graph['aval'] g.graph['aval'] += 1 #length = g.node[node]['length'] + g.node[in_node]['length'] + g.node[out_node]['length'] - ov1 - ov2 #cov = (g.node[in_node]['cov'] * g.node[in_node]['length'] + g.node[node]['cov'] * g.node[node]['length'] + \ #g.node[out_node]['cov'] * g.node[out_node]['length'])/float(length) #bases = g.node[in_node]['bases'][:-ov1] + g.node[node]['bases'] + g.node[out_node]['bases'][ov2:] g.add_node(str(node_id), chr=g.node[node]['chr'], count = g.node[in_node]['count'] + g.node[node]['count'] + g.node[out_node]['count'], read = g.node[in_node]['read'] + '_' + g.node[node]['read'] + '_' +g.node[out_node]['read'], #aln_chr = g.node[node]['aln_chr'] ) #g.add_node(str(node_id)+'-', bases = reverse_comp_bases(bases), length = length, cov = cov) #print g.node[str(node_id)]['chr'] for edge in g.in_edges(in_node): g.add_edge(edge[0],str(node_id)) for edge in g.out_edges(out_node): g.add_edge(str(node_id),edge[1]) g.remove_node(in_node) g.remove_node(node) g.remove_node(out_node) def input1(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 if len(lines1) < 5: continue #print lines1 g.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4], hinge_edge=int(lines1[5])) g.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=int(lines1[5])) return g def input2(flname): g = nx.DiGraph() with open (flname) as f: for lines in f: lines1=lines.split() #print lines1 g.add_edge(lines1[0], lines1[1]) return g def run(filename, gt_file, n_iter): f=open(filename) line1=f.readline() 
print line1 f.close() if len(line1.split()) !=2: g=input1(filename) else: g=input2(filename) mapping_dict = {} with open(gt_file,'r') as f: for num, line in enumerate(f.readlines()): m = map(int, line.strip().split()) # mapping_dict[num] = [min(m), max(m), int(m[0]>m[1])] mapping_dict[num] = m[1] print nx.info(g) for node in g.nodes(): nodeid=int(node.split('_')[0]) g.node[node]['count'] = 1 g.node[node]['chr']=mapping_dict[nodeid] g.node[node]['read'] = node #print str(nodeid), node,g.node[node]['chr'] degree_sequence=sorted(g.degree().values(),reverse=True) print Counter(degree_sequence) for i in range(n_iter): for node in g.nodes(): if g.in_degree(node) == 0: g.remove_node(node) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) g.graph['aval'] = 1000000000 for i in range(5): merge_simple_path(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) h=nx.DiGraph() h.add_nodes_from(g) h.add_edges_from(g.edges()) for node in g.nodes(): h.node[node]['count']=g.node[node]['count'] h.node[node]['chr']=g.node[node]['chr'] try: h.node[node]['read']=g.node[node]['read'] except: pass try: import ujson mapping = ujson.load(open(filename.split('.')[0]+'.mapping.json')) print 'get mapping' for node in h.nodes(): #print node if mapping.has_key(node): h.node[node]['aln_start'] = mapping[node][0] h.node[node]['aln_end'] = mapping[node][1] h.node[node]['aln_strand'] = mapping[node][2] else: h.node[node]['aln_start'] = 0 h.node[node]['aln_end'] = 0 h.node[node]['aln_strand'] = 0 except: pass nx.write_graphml(h, filename.split('.')[0]+'_condensed.graphml') print nx.number_weakly_connected_components(h) print nx.number_strongly_connected_components(h) # filename = sys.argv[1] gt_file=sys.argv[2] run(filename, gt_file,5) 
HINGE-0.5.0/scripts/connected.py000077500000000000000000000032211314415550300163530ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter def longest_path(G): dist = {} # stores [node, distance] pair for node in nx.topological_sort(G): # pairs of dist,node for all incoming edges pairs = [(dist[v][0]+1,v) for v in G.pred[node]] if pairs: dist[node] = max(pairs) else: dist[node] = (0, node) node,(length,_) = max(dist.items(), key=lambda x:x[1]) path = [] while length > 0: path.append(node) length,node = dist[node] return list(reversed(path)) filename = sys.argv[1] g = nx.DiGraph() with open(filename,'r') as f: for line in f.xreadlines(): g.add_edge(*(line.strip().split('->'))) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) for i in range(15): for node in g.nodes(): if g.in_degree(node) == 0: g.remove_node(node) print nx.info(g) #print nx.is_directed_acyclic_graph(g) #print list(nx.simple_cycles(g)) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) #print nx.diameter(g) def rev(string): if string[-1] == '\'': return string[:-1] else: return string+'\'' #for edge in g.edges(): # g.add_edge(rev(edge[1]), rev(edge[0])) #print edge #print rev(edge[1]), rev(edge[0]) print nx.info(g) print [len(item) for item in nx.weakly_connected_components(g)] nx.write_graphml(g, filename.split('.')[0]+'.graphml') with open(sys.argv[2],'w') as f: for edge in nx.dfs_edges(g): f.write('{} {}\n'.format(edge[0],edge[1])) f.close() HINGE-0.5.0/scripts/correct_head.py000077500000000000000000000023301314415550300170330ustar00rootroot00000000000000#!/usr/bin/env python import sys, os from pbcore.io import FastaIO def run(reader, writer, lookupfile): with open (lookupfile,'w') as f: for i,record in enumerate(reader): seq_length = len(record.sequence) zmw = i+1 old_header=record.header if seq_length < 30: new_header = 'Deleted' 
f.write(old_header+'\t'+new_header+'\n') continue #bounds = record.header.split('/')[-1] #start, end = [int(k) for k in bounds.split('_')] start = 0 new_end = start + seq_length new_header = "m000_000/{zmw}/{start}_{end}".format(zmw=zmw, start=start, end=new_end) f.write(old_header+'\t'+new_header+'\n') writer.writeRecord(new_header, record.sequence) def main(iname, ofile, lookupfile): reader = FastaIO.FastaReader(iname) writer = FastaIO.FastaWriter(ofile) run(reader, writer,lookupfile) if __name__ == '__main__': iname, oname, lookupfile = sys.argv[1:4] ofile = open(oname, 'w') try: main(iname, ofile, lookupfile) except: # clean up (for make) ofile.close() os.unlink(oname) raise HINGE-0.5.0/scripts/create_bandage_file.py000077500000000000000000000014531314415550300203210ustar00rootroot00000000000000#!/usr/bin/env python import sys import os def run(inputfile,outputfile): nodes = {} arcs = {} with open(inputfile) as file: for line in file: line_str = line[:-1] split_str = line_str.split(' ') node0 = int(split_str[0]) node1 = int(split_str[1]) # print node0,node1 nodes[node0] = 1 nodes[node1] = 1 if node0 < node1: arcs[tuple([node0,node1])] = 1 else: arcs[tuple([node1,node0])] = 1 with open(outputfile, 'w') as fout: for node in nodes: fout.write("NODE "+str(node)+' 0 0 0 0 0\n') fout.write('AAA\n') fout.write('AAA\n') # print "NODE "+str(node) for arc in arcs: fout.write("ARC "+str(arc[0])+' '+str(arc[1])+' 0\n') def main(): run(sys.argv[1],sys.argv[2]) return if __name__ == '__main__': main() HINGE-0.5.0/scripts/create_hgraph.py000077500000000000000000000034541314415550300172150ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import random import sys from collections import Counter import json # This script creates a graphml file from the hgraph file def read_graph(filename,gt_file): with open(gt_file) as f: read_dict=json.load(f) g = nx.DiGraph() with open (filename) as f: for lines in f: lines1=lines.split() g.add_node(lines1[0] + "_" + 
lines1[2]) g.add_node(lines1[1] + "_" + lines1[3]) if lines1[0] in read_dict: g.node[lines1[0] + "_" + lines1[2]]['aln_start']=min(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) g.node[lines1[0] + "_" + lines1[2]]['aln_end']=max(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) else: g.node[lines1[0] + "_" + lines1[2]]['aln_start']=0 g.node[lines1[0] + "_" + lines1[2]]['aln_end']=0 if lines1[1] in read_dict: g.node[lines1[1] + "_" + lines1[3]]['aln_start']=min(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) g.node[lines1[1] + "_" + lines1[3]]['aln_end']=max(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) else: g.node[lines1[1] + "_" + lines1[3]]['aln_start']=0 g.node[lines1[1] + "_" + lines1[3]]['aln_end']=0 g.node[lines1[0] + "_" + lines1[2]]['active']=1 g.node[lines1[1] + "_" + lines1[3]]['active']=int(lines1[4]) g.add_edge(lines1[0] + "_" + lines1[2], lines1[1] + "_" + lines1[3]) nx.write_graphml(g, filename.split('.')[0]+'_hgraph.graphml') print nx.number_weakly_connected_components(g) print nx.number_strongly_connected_components(g) if __name__ == "__main__": read_graph(sys.argv[1],sys.argv[2]) HINGE-0.5.0/scripts/create_hgraph_nogt.py000066400000000000000000000015471314415550300202420ustar00rootroot00000000000000#!/usr/bin/python import networkx as nx import random import sys from collections import Counter # This script creates a graphml file from the hgraph file def read_graph(filename): g = nx.DiGraph() with open (filename) as f: for lines in f: lines1=lines.split() g.add_node(lines1[0] + "_" + lines1[2]) g.add_node(lines1[1] + "_" + lines1[3]) g.node[lines1[0] + "_" + lines1[2]]['active']=1 g.node[lines1[1] + "_" + lines1[3]]['active']=int(lines1[4]) g.add_edge(lines1[0] + "_" + lines1[2], lines1[1] + "_" + lines1[3]) nx.write_graphml(g, filename.split('.')[0]+'_hgraph.graphml') print nx.number_weakly_connected_components(g) print nx.number_strongly_connected_components(g) if __name__ == "__main__": read_graph(sys.argv[1]) 
HINGE-0.5.0/scripts/download_NCTC_pipeline.py000066400000000000000000000023311314415550300207120ustar00rootroot00000000000000import json import os import sys import subprocess base_dir = '/data/pacbio_assembly/pb_data/NCTC/' bact_dict = json.load(open(base_dir+'NCTC.json')) #bacterium_of_interest='NCTC7972' bacterium_of_interest=sys.argv[1] if len(sys.argv) > 2: bact_dict=sys.argv[2] bact_name="_".join(bact_dict[bacterium_of_interest]['Species'][0].split()) cmd_base = 'ascp -QT -l 1000m -i /data/pacbio_assembly/pb_data/asperaweb_id_dsa.openssh era-fasp@fasp.ega.ebi.ac.uk:vol1/' dest_dir = base_dir+bacterium_of_interest+'/' os.system('mkdir -p '+dest_dir) for run, file_list in bact_dict[bacterium_of_interest]['file_paths'].items(): for file_path in file_list: cmd = cmd_base+file_path+' '+dest_dir print cmd os.system(cmd) dest_fasta_name = dest_dir+bact_name dextract_cmd = 'dextract -o'+dest_fasta_name bax_files = [x for x in os.listdir(dest_dir) if x.endswith('.bax.h5')] for bax_file in bax_files: dextract_cmd += " " + dest_dir+bax_file print dextract_cmd try: subprocess.check_output(dextract_cmd.split()) print 'dextract done. 
deleting .bax.h5 files' os.system('rm '+dest_dir+'*.bax.h5') print 'removing .quiva files' os.system('rm '+dest_dir+'*.quiva') except: print 'error' HINGE-0.5.0/scripts/draft_assembly.py000077500000000000000000000015401314415550300174120ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter def linearize(filename): graph_name = filename.split('.')[0]+'.graphml' g = nx.read_graphml(graph_name) print nx.info(g) # get first strong connected component con = list(nx.strongly_connected_component_subgraphs(g)) con.sort(key = lambda x:len(x), reverse = True) print [len(item) for item in con] print nx.info(con[0]) dfs_edges = list(nx.dfs_edges(con[0])) dfs_edges.append((dfs_edges[-1][-1], dfs_edges[0][0])) #print dfs_edges with open(filename.split('.')[0]+'.linear.edges', 'w') as f: for item in dfs_edges: f.write(item[0] + ' ' + item[1] + ' ' + str(con[0].edge[item[0]][item[1]]['ew'])+'\n') filename = sys.argv[1] linearize(filename) HINGE-0.5.0/scripts/draft_assembly_not_perfect.py000077500000000000000000000015401314415550300220020ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter def linearize(filename): graph_name = filename.split('.')[0]+'.graphml' g = nx.read_graphml(graph_name) print nx.info(g) # get first strong connected component con = list(nx.strongly_connected_component_subgraphs(g)) con.sort(key = lambda x:len(x), reverse = True) print [len(item) for item in con] print nx.info(con[0]) dfs_edges = list(nx.dfs_edges(con[0])) dfs_edges.append((dfs_edges[-1][-1], dfs_edges[0][0])) #print dfs_edges with open(filename.split('.')[0]+'.linear.edges', 'w') as f: for item in dfs_edges: f.write(item[0] + ' ' + item[1] + ' ' + str(con[0].edge[item[0]][item[1]]['ew'])+'\n') filename = sys.argv[1] linearize(filename) HINGE-0.5.0/scripts/draw2.py000077500000000000000000000133721314415550300154400ustar00rootroot00000000000000#!/usr/bin/env python import 
numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt #from ipywidgets.widgets import interact import interface_utils as util import sys import os import linereader os.environ['PATH'] += ':/data/pacbio_assembly/AwesomeAssembler/DALIGNER' #print os.popen("export").read() Qvd = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y'] Qvv = range(len(Qvd))[::-1] QVdict = dict(zip(Qvd,Qvv)) dbname = sys.argv[1] lasname = sys.argv[2] n = int(sys.argv[3]) path = os.getcwd()+'/' coveragename = path + dbname + '.coverage.txt' aln = [] coveragefile = linereader.copen(coveragename) coverage = coveragefile.getline(n) cov = coverage.split()[2:] covx = [] covy = [] for item in cov: data = item.split(',') covx.append(int(data[0])) covy.append(int(data[1])) qv = list(util.get_QV(path+dbname, [n]))[0] qx = [] qy = [] ts = int(sys.argv[5]) if len(sys.argv) < 7: rev = 0 else: rev = int(sys.argv[6]) print 'rev', rev for i in range(len(qv)): qx.append(i*ts) qy.append(QVdict[qv[i]]) for item in util.get_alignments2(path+dbname,path+lasname,[n]): aln.append(item) if (len(aln) == 0): sys.exit() #print aln[0:5] aln.sort(key = lambda x:x[2]) alns = [] current_b = aln[0][2] aln_group = [] for item in aln: if current_b != item[2]: alns.append(aln_group) aln_group = [] aln_group.append(item) current_b = item[2] else: aln_group.append(item) num = len(alns) print len(aln), len(alns) #print [len(item) for item in alns] #print [item[0:3] for item in aln] alns.sort(key = lambda x:min([item[3] for item in x])) #size_chunk = num/grid_size #for i in range(grid_size): # aln[i*size_chunk:min((i+1)*size_chunk, num)] = sorted(aln[i*size_chunk:min((i+1)*size_chunk, num)],key = lambda x: x[4]-x[3] ,reverse=True) fig = plt.figure(figsize = (15,10)) plt.axes() ax1 = 
plt.subplot2grid((6,6), (0, 0), colspan=6, rowspan=4) ax2 = plt.subplot2grid((6,6), (4, 0), colspan=6, rowspan=1, sharex = ax1) ax3 = plt.subplot2grid((6,6), (5, 0), colspan=6, rowspan=1, sharex = ax1) #plt.gca().axes.get_yaxis().set_visible(False) l = aln[0][5] tip = l/200 ed = l/50 grid_size = 1.0 ax1.set_xlim(-2000,l+2000) ax1.set_ylim(-5,num*grid_size) if rev == 0: points = [[0,0], [l,0], [l+tip,grid_size/4], [l,grid_size/2], [0,grid_size/2]] else: points = [[0,0], [-tip,grid_size/4], [0,grid_size/2], [l,grid_size/2], [l,0]] #rectangle = plt.Rectangle((0, 0), l, 5, fc='r',ec = 'none') polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.6) ax1.add_patch(polygon) dotted_line = plt.Line2D((0, 0), (0, num*grid_size ),ls='-.') ax1.add_line(dotted_line) dotted_line2 = plt.Line2D((l, l), (0, num*grid_size ),ls='-.') ax1.add_line(dotted_line2) for i,aln_group in enumerate(alns): for item in aln_group: if rev == 0: abpos = item[3] aepos = item[4] bbpos = item[6] bepos = item[7] blen = item[8] strand = item[0] else: aepos = l - item[3] abpos = l - item[4] blen = item[8] bbpos = blen - item[7] bepos = blen - item[6] strand = item[0] if strand == 'n': strand = 'c' else: strand = 'n' points_start = [] points_end = [] if strand == 'n': points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos + ed+ tip, (i+1)*grid_size + grid_size/4], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size]] else: points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [abpos, 
(i+1)*grid_size+grid_size/2], [abpos - tip, (i+1)*grid_size + grid_size/4]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size],[abpos-tip, (i+1)*grid_size+grid_size/4], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2],[abpos-ed-tip, (i+1)*grid_size+grid_size/4], [abpos-ed, (i+1)*grid_size]] polygon = plt.Polygon(points,fc = 'b', ec = 'none', alpha = 0.6) polygon.set_url('aln_svg' + str(item[2])+'.svg') ax1.add_patch(polygon) if points_end != []: polygon2 = plt.Polygon(points_end,fc = 'g', ec = 'none', alpha = 0.6) ax1.add_patch(polygon2) if points_start != []: polygon2 = plt.Polygon(points_start,fc = 'g', ec = 'none', alpha = 0.6) ax1.add_patch(polygon2) if rev == 1: covx = [l -item for item in covx] qx = [l - item for item in qx] ax2.plot(covx, covy) ax3.plot(qx, qy) plt.xlabel('position') ax1.set_ylabel('pile-o-gram') ax2.set_ylabel('coverage') ax3.set_ylabel('i-qv') plt.savefig(path + sys.argv[4] + '/aln_svg' + str(n) + '_' + str(rev)+ '.svg') HINGE-0.5.0/scripts/draw2_pileup.py000077500000000000000000000106311314415550300170110ustar00rootroot00000000000000#!/usr/bin/env python import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from ipywidgets.widgets import interact import interface_utils as util import sys import os os.environ['PATH'] += ':/data/pacbio_assembly/AwesomeAssembler/DALIGNER' #print os.popen("export").read() n = (sys.argv[1]) rst = [] with open(n) as f: for line in f: tmp = line.strip().split() t1 = tmp[0] if t1[-1] == '\'': t1 = t1[:-1] t2 = tmp[1] if t2[-1] == '\'': t2 = t2[:-1] rst.append((int(t1)+1, int(tmp[2]))) #rst.append(int(t2)+1) #rst = range(1,1399) path = '/data/pacbio_assembly/AwesomeAssembler/data/' aln = [] for i,e in enumerate(rst): n = e[0] print i,n li = 
list(util.get_alignments_mapping(path+'ecoli', path + 'ecoli.ref', path +'ecoli.ecoli.ref.las', [n])) if (len(li) > 0): item = sorted(li, key=lambda x:x[4] - x[3], reverse = True)[0] aln.append(item) print aln[0:20] #aln.sort(key = lambda x:x[2]) alns = [] current_b = aln[0][2] aln_group = [] for item in aln: if current_b != item[2]: alns.append(aln_group) aln_group = [] aln_group.append(item) current_b = item[2] else: aln_group.append(item) num = len(alns) print len(aln), len(alns) #print [len(item) for item in alns] #print [item[0:3] for item in aln] #alns.sort(key = lambda x:min([item[3] for item in x])) #size_chunk = num/grid_size #for i in range(grid_size): # aln[i*size_chunk:min((i+1)*size_chunk, num)] = sorted(aln[i*size_chunk:min((i+1)*size_chunk, num)],key = lambda x: x[4]-x[3] ,reverse=True) plt.figure(figsize = (15,10)) plt.axes() #plt.gca().axes.get_yaxis().set_visible(False) l = aln[0][5] tip = l/5000 ed = l/2000 grid_size = 1.0 plt.xlim(-2000,l+2000) plt.ylim(-5,num*grid_size) points = [[0,0], [l,0], [l+tip,grid_size/4], [l,grid_size/2], [0,grid_size/2]] #rectangle = plt.Rectangle((0, 0), l, 5, fc='r',ec = 'none') polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.6) plt.gca().add_patch(polygon) dotted_line = plt.Line2D((0, 0), (0, num*grid_size ),ls='-.') plt.gca().add_line(dotted_line) dotted_line2 = plt.Line2D((l, l), (0, num*grid_size ),ls='-.') plt.gca().add_line(dotted_line2) for i,aln_group in enumerate(alns): for item in aln_group: abpos = item[3] aepos = item[4] bbpos = item[6] bepos = item[7] blen = item[8] strand = item[0] points_start = [] points_end = [] if strand == 'n': points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, 
(i+1)*grid_size+grid_size/2], [aepos + ed+ tip, (i+1)*grid_size + grid_size/4], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size]] else: points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2], [abpos - tip, (i+1)*grid_size + grid_size/4]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size],[abpos-tip, (i+1)*grid_size+grid_size/4], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2],[abpos-ed-tip, (i+1)*grid_size+grid_size/4], [abpos-ed, (i+1)*grid_size]] polygon = plt.Polygon(points,fc = 'b', ec = 'none', alpha = 0.6) polygon.set_url("http://shannon.stanford.edu:5000/aln" + str(item[2]+1) + ".pdf") plt.gca().add_patch(polygon) if points_end != []: polygon2 = plt.Polygon(points_end,fc = 'g', ec = 'none', alpha = 0.6) plt.gca().add_patch(polygon2) if points_start != []: polygon2 = plt.Polygon(points_start,fc = 'g', ec = 'none', alpha = 0.6) plt.gca().add_patch(polygon2) plt.savefig('mapping/map.' 
+ str(n)+ '.svg') HINGE-0.5.0/scripts/draw2_pileup_region.py000077500000000000000000000103451314415550300203560ustar00rootroot00000000000000#!/usr/bin/env python import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from ipywidgets.widgets import interact import interface_utils as util import sys import os os.environ['PATH'] += ':/data/pacbio_assembly/AwesomeAssembler/DALIGNER' #print os.popen("export").read() left = int(sys.argv[1]) right = int(sys.argv[2]) #rst = range(1,1399) path = '/data/pacbio_assembly/AwesomeAssembler/data/ecoli/' aln = [] bb = [] with open('ecoli.linear.edges') as f: for line in f: e = line.split(" ")[0] if e[-1] == '\'': e = e[:-1] bb.append(int(e)) print bb bb = set(bb) for i,item in enumerate(util.get_alignments_mapping2(path+'draft', path +'ecoli', path +'draft.ecoli.las')): if i%2000 == 0: print i, item if item[3] >= left and item[4] <= right: aln.append(item) print 'number:',len(aln) aln.sort(key = lambda x:x[2]) alns = [] current_b = aln[0][2] aln_group = [] for item in aln: if current_b != item[2]: alns.append(aln_group) aln_group = [] aln_group.append(item) current_b = item[2] else: aln_group.append(item) num = len(alns) print len(aln), len(alns) alns.sort(key = lambda x:min([item[3] for item in x])) plt.figure(figsize = (15,10)) plt.axes() #plt.gca().axes.get_yaxis().set_visible(False) #l = aln[0][5] tip = (right-left)/5000 ed = (right-left)/2000 grid_size = 1.0 plt.xlim(left-2000,right+2000) plt.ylim(-5,num*grid_size) points = [[left,0], [right,0], [right+tip,grid_size/4], [right,grid_size/2], [left,grid_size/2]] #rectangle = plt.Rectangle((0, 0), l, 5, fc='r',ec = 'none') polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.6) plt.gca().add_patch(polygon) dotted_line = plt.Line2D((left, left), (0, num*grid_size ),ls='-.') plt.gca().add_line(dotted_line) dotted_line2 = plt.Line2D((right, right), (0, num*grid_size ),ls='-.') plt.gca().add_line(dotted_line2) for i,aln_group in 
enumerate(alns): for item in aln_group: abpos = item[3] aepos = item[4] bbpos = item[6] bepos = item[7] blen = item[8] strand = item[0] points_start = [] points_end = [] if strand == 'n': points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos + ed+ tip, (i+1)*grid_size + grid_size/4], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size]] else: points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2], [abpos - tip, (i+1)*grid_size + grid_size/4]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size],[abpos-tip, (i+1)*grid_size+grid_size/4], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2],[abpos-ed-tip, (i+1)*grid_size+grid_size/4], [abpos-ed, (i+1)*grid_size]] if item[2] in bb: polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.8) else: polygon = plt.Polygon(points,fc = 'b', ec = 'none', alpha = 0.6) polygon.set_url("http://shannon.stanford.edu:5000/aln" + str(item[2]+1) + ".pdf") plt.gca().add_patch(polygon) if points_end != []: polygon2 = plt.Polygon(points_end,fc = 'g', ec = 'none', alpha = 0.6) plt.gca().add_patch(polygon2) if points_start != []: polygon2 = plt.Polygon(points_start,fc = 'g', ec = 'none', alpha = 0.6) plt.gca().add_patch(polygon2) plt.savefig('mapping/map.' 
+ str(left) +'_'+ str(right)+ '.svg') HINGE-0.5.0/scripts/draw2_pileup_w_repeat.py000077500000000000000000000127511314415550300207040ustar00rootroot00000000000000#!/usr/bin/env python import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from ipywidgets.widgets import interact import interface_utils as util import sys import os os.environ['PATH'] += ':/data/pacbio_assembly/AwesomeAssembler/DALIGNER' #print os.popen("export").read() path = os.environ['PWD'] + '/' #/data/pacbio_assembly/AwesomeAssembler/data/' n = (sys.argv[1]) rst = [] with open(n) as f: for line in f: rst.append(int(line.strip())) rep = {} with open(path + 'ecoli.repeat.txt') as f: for line in f: l = map(int, line.strip().split()) if len(l) > 1: for i in range((len(l) - 1) / 2): if not rep.has_key(l[0]): rep[l[0]] = [] rep[l[0]].append((l[2*i+1], l[2*i+2])) #rst = range(1,1399) aln = [] for i,e in enumerate(rst): n = e print i,n li = list(util.get_alignments_mapping(path+'ecoli', path + 'ecoli.ref', path +'ecoli.ecoli.ref.las', [n])) if (len(li) > 0): item = sorted(li, key=lambda x:x[4] - x[3], reverse = True) for l in item: aln.append(l) print aln[0:20] #aln.sort(key = lambda x:x[2]) alns = [] current_b = aln[0][2] aln_group = [] for item in aln: if current_b != item[2]: aln_group.sort(key = lambda x:x[4]-x[3], reverse = True) alns.append(aln_group) aln_group = [] aln_group.append(item) current_b = item[2] else: aln_group.append(item) num = len(alns) print len(aln), len(alns) #print [len(item) for item in alns] #print [item[0:3] for item in aln] alns.sort(key = lambda x:x[0][3]) #size_chunk = num/grid_size #for i in range(grid_size): # aln[i*size_chunk:min((i+1)*size_chunk, num)] = sorted(aln[i*size_chunk:min((i+1)*size_chunk, num)],key = lambda x: x[4]-x[3] ,reverse=True) plt.figure(figsize = (15,10)) plt.axes() #plt.gca().axes.get_yaxis().set_visible(False) l = aln[0][5] tip = l/5000 ed = l/2000 grid_size = 1.0 plt.xlim(-2000,l+2000) 
plt.ylim(-5,num*grid_size) points = [[0,0], [l,0], [l+tip,grid_size/4], [l,grid_size/2], [0,grid_size/2]] #rectangle = plt.Rectangle((0, 0), l, 5, fc='r',ec = 'none') polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.6) plt.gca().add_patch(polygon) dotted_line = plt.Line2D((0, 0), (0, num*grid_size ),ls='-.') plt.gca().add_line(dotted_line) dotted_line2 = plt.Line2D((l, l), (0, num*grid_size ),ls='-.') plt.gca().add_line(dotted_line2) for i,aln_group in enumerate(alns): for item in aln_group: abpos = item[3] aepos = item[4] bbpos = item[6] bepos = item[7] blen = item[8] strand = item[0] points_start = [] points_end = [] rid = item[2] abpos = abpos - bbpos aepos = aepos + (blen - bepos) if strand == 'n': points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos + ed+ tip, (i+1)*grid_size + grid_size/4], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size]] else: points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2], [abpos - tip, (i+1)*grid_size + grid_size/4]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size],[abpos-tip, (i+1)*grid_size+grid_size/4], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2],[abpos-ed-tip, (i+1)*grid_size+grid_size/4], [abpos-ed, (i+1)*grid_size]] polygon = plt.Polygon(points,fc = 'b', ec = 'none', 
alpha = 0.6) polygon.set_url("http://shannon.stanford.edu:5000/aln" + str(item[2]+1) + ".pdf") plt.gca().add_patch(polygon) #if points_end != []: # polygon2 = plt.Polygon(points_end,fc = 'g', ec = 'none', alpha = 0.6) # plt.gca().add_patch(polygon2) # #if points_start != []: # polygon2 = plt.Polygon(points_start,fc = 'g', ec = 'none', alpha = 0.6) # plt.gca().add_patch(polygon2) if rep.has_key(rid): for item in rep[rid]: s = item[0] e = item[1] if item[0] == -1: s = 0 if item[1] == -1: e = blen if strand != 'n': s = blen - s e = blen - e points = [[abpos + s, (i+1)*grid_size], [abpos + e, (i+1)*grid_size], [abpos + e, (i+1)*grid_size+grid_size/2], [abpos + s, (i+1)*grid_size+grid_size/2]] polygon2 = plt.Polygon(points,fc = 'y', ec = 'none', alpha = 0.8) plt.gca().add_patch(polygon2) plt.savefig('mapping/map.svg') HINGE-0.5.0/scripts/draw_pileup_region.py000077500000000000000000000115101314415550300202670ustar00rootroot00000000000000#!/usr/bin/env python import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from ipywidgets.widgets import interact import interface_utils as util import sys import os os.environ['PATH'] += ':~/AwesomeAssembler/DALIGNER' #print os.popen("export").read() left = int(sys.argv[1]) right = int(sys.argv[2]) ref = sys.argv[3] read = sys.argv[4] las = sys.argv[5] contig = sys.argv[6] length_th = int(sys.argv[7]) #path = '/data/pacbio_assembly/AwesomeAssembler/data/ecoli/' aln = [] #bb = [] #with open('ecoli.linear.edges') as f: # for line in f: # e = line.split(" ")[0] # if e[-1] == '\'': # e = e[:-1] # # bb.append(int(e)) # #print bb # #bb = set(bb) for i,item in enumerate(util.get_alignments_mapping3(ref, read, las, contig)): if i%2000 == 0: print i, item if item[3] >= left and item[4] <= right and item[4] - item[3] > length_th: aln.append(item) covy = np.zeros((right - left, )) for item in aln: covy[item[3] - left : item[4] - left] += 1 covx = np.arange(left, right) print 'number:',len(aln) aln.sort(key = 
lambda x:x[2]) alns = [] current_b = aln[0][2] aln_group = [] for item in aln: if current_b != item[2]: alns.append(aln_group) aln_group = [] aln_group.append(item) current_b = item[2] else: aln_group.append(item) #num = len(alns) num = len(aln) print len(aln), len(alns) alns.sort(key = lambda x:min([item[3] for item in x])) fig = plt.figure(figsize = (15,10)) plt.axes() ax1 = plt.subplot2grid((6,6), (0, 0), colspan=6, rowspan=4) ax2 = plt.subplot2grid((6,6), (4, 0), colspan=6, rowspan=1, sharex = ax1) #plt.gca().axes.get_yaxis().set_visible(False) #l = aln[0][5] tip = (right-left)/5000 ed = (right-left)/2000 grid_size = 1.0 ax1.set_xlim(left-2000,right+2000) ax1.set_ylim(-5,num*grid_size) points = [[left,0], [right,0], [right+tip,grid_size/4], [right,grid_size/2], [left,grid_size/2]] #rectangle = plt.Rectangle((0, 0), l, 5, fc='r',ec = 'none') polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.6) ax1.add_patch(polygon) dotted_line = plt.Line2D((left, left), (0, num*grid_size ),ls='-.') ax1.add_line(dotted_line) dotted_line2 = plt.Line2D((right, right), (0, num*grid_size ),ls='-.') ax1.add_line(dotted_line2) alns_all = [] for item in alns: for aln in item: alns_all.append([aln]) alns_all.sort(key = lambda x:min([item[3] for item in x])) for i,aln_group in enumerate(alns_all): for item in aln_group: abpos = item[3] aepos = item[4] bbpos = item[6] bepos = item[7] blen = item[8] strand = item[0] points_start = [] points_end = [] if strand == 'n': points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos + ed+ tip, (i+1)*grid_size + grid_size/4], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size], [abpos, 
(i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size]] else: points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2], [abpos - tip, (i+1)*grid_size + grid_size/4]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size],[abpos-tip, (i+1)*grid_size+grid_size/4], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2],[abpos-ed-tip, (i+1)*grid_size+grid_size/4], [abpos-ed, (i+1)*grid_size]] #if item[2] in bb: # polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.8) #else: polygon = plt.Polygon(points,fc = 'b', ec = 'none', alpha = 0.6) polygon.set_url("http://shannon.stanford.edu:5000/aln" + str(item[2]+1) + ".pdf") ax1.add_patch(polygon) if points_end != []: polygon2 = plt.Polygon(points_end,fc = 'g', ec = 'none', alpha = 0.6) ax1.add_patch(polygon2) if points_start != []: polygon2 = plt.Polygon(points_start,fc = 'g', ec = 'none', alpha = 0.6) ax1.add_patch(polygon2) ax2.plot(covx, covy) plt.xlabel('position') ax1.set_ylabel('pile-o-gram') ax2.set_ylabel('coverage') plt.savefig('mapping/map.' 
+ str(contig) + '_' + str(left) +'_'+ str(right)+ '.svg') HINGE-0.5.0/scripts/draw_pileup_region_find_bridges.py000077500000000000000000000115701314415550300227740ustar00rootroot00000000000000#!/usr/bin/env python import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from ipywidgets.widgets import interact import interface_utils as util import sys import os os.environ['PATH'] += ':~/AwesomeAssembler/DALIGNER' #print os.popen("export").read() left = int(sys.argv[1]) right = int(sys.argv[2]) ref = sys.argv[3] read = sys.argv[4] las = sys.argv[5] contig = sys.argv[6] length_th = int(sys.argv[7]) bridge_begin = int(sys.argv[8]) bridge_end = int(sys.argv[9]) #path = '/data/pacbio_assembly/AwesomeAssembler/data/ecoli/' aln = [] #bb = [] #with open('ecoli.linear.edges') as f: # for line in f: # e = line.split(" ")[0] # if e[-1] == '\'': # e = e[:-1] # # bb.append(int(e)) # #print bb # #bb = set(bb) for i,item in enumerate(util.get_alignments_mapping3(ref, read, las, contig)): #if i%2000 == 0: # print i, item if item[3] >= left and item[4] <= right and item[4] - item[3] > length_th: aln.append(item) covy = np.zeros((right - left, )) for item in aln: covy[item[3] - left : item[4] - left] += 1 covx = np.arange(left, right) #for i in range(0, len(covx), 10): # print covx[i], covy[i] print 'number:',len(aln) aln.sort(key = lambda x:x[2]) alns = [] current_b = aln[0][2] aln_group = [] for item in aln: if current_b != item[2]: alns.append(aln_group) aln_group = [] aln_group.append(item) current_b = item[2] else: aln_group.append(item) num = len(alns) print len(aln), len(alns) alns.sort(key = lambda x:min([item[3] for item in x])) fig = plt.figure(figsize = (15,10)) plt.axes() ax1 = plt.subplot2grid((6,6), (0, 0), colspan=6, rowspan=4) ax2 = plt.subplot2grid((6,6), (4, 0), colspan=6, rowspan=1, sharex = ax1) #plt.gca().axes.get_yaxis().set_visible(False) #l = aln[0][5] tip = (right-left)/5000 ed = (right-left)/2000 grid_size = 1.0 
ax1.set_xlim(left-2000,right+2000) ax1.set_ylim(-5,num*grid_size) points = [[left,0], [right,0], [right+tip,grid_size/4], [right,grid_size/2], [left,grid_size/2]] #rectangle = plt.Rectangle((0, 0), l, 5, fc='r',ec = 'none') polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.6) ax1.add_patch(polygon) dotted_line = plt.Line2D((left, left), (0, num*grid_size ),ls='-.') ax1.add_line(dotted_line) dotted_line2 = plt.Line2D((right, right), (0, num*grid_size ),ls='-.') ax1.add_line(dotted_line2) for i,aln_group in enumerate(alns): for item in aln_group: abpos = item[3] aepos = item[4] if abpos < bridge_begin+200 and aepos > bridge_end-200: print item bbpos = item[6] bepos = item[7] blen = item[8] strand = item[0] points_start = [] points_end = [] if strand == 'n': points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos + tip, (i+1)*grid_size + grid_size/4], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos + ed+ tip, (i+1)*grid_size + grid_size/4], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size]] else: points = [[abpos, (i+1)*grid_size], [aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [abpos, (i+1)*grid_size+grid_size/2], [abpos - tip, (i+1)*grid_size + grid_size/4]] if (bepos < blen): points_end = [[aepos, (i+1)*grid_size], [aepos, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size+grid_size/2], [aepos+ed, (i+1)*grid_size]] if (bbpos > 0): points_start = [[abpos, (i+1)*grid_size],[abpos-tip, (i+1)*grid_size+grid_size/4], [abpos, (i+1)*grid_size+grid_size/2], [abpos-ed, (i+1)*grid_size+grid_size/2],[abpos-ed-tip, (i+1)*grid_size+grid_size/4], [abpos-ed, (i+1)*grid_size]] 
#if item[2] in bb: # polygon = plt.Polygon(points,fc = 'r', ec = 'none', alpha = 0.8) #else: polygon = plt.Polygon(points,fc = 'b', ec = 'none', alpha = 0.6) polygon.set_url("http://shannon.stanford.edu:5000/aln" + str(item[2]+1) + ".pdf") ax1.add_patch(polygon) if points_end != []: polygon2 = plt.Polygon(points_end,fc = 'g', ec = 'none', alpha = 0.6) ax1.add_patch(polygon2) if points_start != []: polygon2 = plt.Polygon(points_start,fc = 'g', ec = 'none', alpha = 0.6) ax1.add_patch(polygon2) ax2.plot(covx, covy) plt.xlabel('position') ax1.set_ylabel('pile-o-gram') ax2.set_ylabel('coverage') plt.savefig('mapping/map.' + str(contig) + '_' + str(left) +'_'+ str(right)+ '.svg') HINGE-0.5.0/scripts/fasta_to_fastq.py000077500000000000000000000007111314415550300174100ustar00rootroot00000000000000#!/usr/bin/env python """ Convert FASTA to FASTQ file with a static Usage: $ ./fasta_to_fastq NAME.fasta NAME.fastq """ import sys, os from Bio import SeqIO # Get inputs fa_path = sys.argv[1] fq_path = sys.argv[2] # make fastq with open(fa_path, "r") as fasta, open(fq_path, "w") as fastq: for record in SeqIO.parse(fasta, "fasta"): record.letter_annotations["phred_quality"] = [40] * len(record) SeqIO.write(record, fastq, "fastq") HINGE-0.5.0/scripts/get_NCTC_json.py000066400000000000000000000034761314415550300170410ustar00rootroot00000000000000import urllib2 from bs4 import BeautifulSoup import json response = urllib2.urlopen('http://www.sanger.ac.uk/resources/downloads/bacteria/nctc/') html = response.read() soup=BeautifulSoup(html) table = soup.find("table") headings = [th.get_text() for th in table.find("tr").find_all("th")] dataset={} for row in table.find_all("tr")[1:]: # print row row1= [td.get_text() for td in row.find_all("td")] print row1 metadata={} cellname='' for i, td in enumerate(row.find_all("td")): #print metadata link=td.find('a') # print i, td if i==1: cellname=td.get_text() print cellname if i==3: # print td # ERR_soup=BeautifulSoup(td) ERR_links=[] 
potential_links = td.findAll('a') # print potential_links for potential_link in td.findAll('a'): ERR_links.append((potential_link.text, potential_link['href'])) metadata[headings[i]]=ERR_links continue if link != None: link=link.get('href') metadata[headings[i]]=(td.get_text(),link) list_of_files={} for run in metadata[headings[3]]: link_to_go=run[1] response1 = urllib2.urlopen(link_to_go+"&display=xml") xml = response1.read() xmlsoup = BeautifulSoup(xml) fllist=[] for data_block in xmlsoup.findAll('data_block'): for files in data_block.findAll('files'): for fle in files.findAll('file'): fllist.append(fle['filename']) list_of_files[run[0]]=fllist metadata['file_paths']=list_of_files # print xml dataset[cellname]=metadata with open('NCTC.json', 'w') as outfile: json.dump(dataset, outfile) HINGE-0.5.0/scripts/get_consensus_gfa.py000077500000000000000000000053311314415550300201110ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * import numpy as np import networkx as nx import itertools filedir = sys.argv[1] filename = sys.argv[2] consensus_name = sys.argv[3] in_graphml_name = filedir + '/' + filename +'_draft.graphml' map_filename = filedir + '/draft_map.txt' g = nx.read_graphml(in_graphml_name) gfaname = filedir + '/' + filename +'_consensus.gfa' cols = np.loadtxt(map_filename, dtype=str,usecols=(1,)) del_contigs = np.nonzero(cols == 'Deleted')[0] # consensus_contigs = [] # i = 0 # try: # with open(consensus_name) as f: # for line in f: # if line[0] != '>': # consensus_contigs.append(line.strip()) # i += 1 # while i in set(del_contigs): # consensus_contigs.append('') # print len() # i += 1 # except: # pass del_contig_ptr = 0 cols = np.loadtxt(map_filename, dtype=str,usecols=(1,)) del_contigs = np.nonzero(cols == 'Deleted')[0] consensus_contigs = [] i = 0 with open(consensus_name) as f: for line in f: if line[0] != '>': while del_contig_ptr < len(del_contigs) : if len(consensus_contigs) == 
del_contigs[del_contig_ptr]: consensus_contigs.append('') del_contig_ptr += 1 else: break consensus_contigs.append(line.strip()) i += 1 nodes_to_keep = [x for x in g.nodes() if consensus_contigs[g.node[x]['contig_id']] != '' ] h = g.subgraph(nodes_to_keep) # for i, vert in enumerate(h.nodes()): # print i, vert # try: # print i,len(h.node[vert]['path']), len(h.node[vert]['segment']), len(consensus_contigs[i]) # except: # print len(h.nodes()), len(consensus_contigs) # raise print 'Number of contigs' print len(consensus_contigs), len(h.nodes()) # print [len(x) for x in consensus_contigs] with open(gfaname,'w') as f: f.write("H\tVN:Z:1.0\n") for j,vert in enumerate(h.nodes()): i = h.node[vert]['contig_id'] # print j, i seg = consensus_contigs[i] print(len(seg)) seg_line = "S\t"+vert+"\t"+seg + '\n' f.write(seg_line) for edge in h.edges(): edge_line = "L\t"+edge[0]+"\t+\t"+edge[1]+"\t+\t0M\n" f.write(edge_line) #last = h.nodes()[-1] #print h.node[last] #path_last = h.node[last]['path'] #for i in range(len(path_last)-1): # read_a = path_last[i] # read_b = path_last[i+1] # print read_a, read_b, in_graph.edge[read_a][read_b] # for i,node in enumerate(h.nodes()): # h.node[node]['path'] = ';'.join(h.node[node]['path']) # nx.write_graphml(h,out_graphml_name) HINGE-0.5.0/scripts/get_draft_annotation.py000077500000000000000000000327111314415550300206100ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * import numpy as np import networkx as nx import itertools NCTCname = sys.argv[1] filename = '/data/pacbio_assembly/pb_data/NCTC/'+NCTCname+'/'+NCTCname graphml_path = sys.argv[2] in_graph = nx.read_graphml(graphml_path) reads = sorted(list(set([int(x.split("_")[0].lstrip("B")) for x in in_graph.nodes()]))) dbshow_reads = ' '.join([str(x+1) for x in reads]) DBshow_cmd = "DBshow "+filename+' '+dbshow_reads stream = subprocess.Popen(DBshow_cmd.split(), stdout=subprocess.PIPE,bufsize=1) reads_queried = 
parse_read(stream.stdout) read_dict = {} for read_id,read in itertools.izip(reads,reads_queried): rdlen = len(read[1]) # print read read_dict[read_id] = read complement = {'A':'T','C': 'G','T':'A', 'G':'C','a':'t','t':'a','c':'g','g':'c'} def reverse_complement(string): return "".join(map(lambda x:complement[x],reversed(string))) def get_string(path): #print path ret_str = '' for itm in path: # print itm read_id,rd_orientation = itm[0].split("_") if rd_orientation == '1': assert itm[1][0] >= itm[1][1] # print itm str_st = itm[1][1] str_end = itm[1][0] read_str = read_dict[int(read_id.lstrip("B"))][1][str_st:str_end] else: assert itm[1][0] <= itm[1][1] str_st = itm[1][0] str_end = itm[1][1] read_str = reverse_complement(read_dict[int(read_id.lstrip("B"))][1][str_st:str_end]) # print str_st,str_end # print read_id # print read_dict[int(read_id)][str_st:str_end] # print read_str print 'read len',len(read_str) ret_str += read_str print len(path), len(ret_str) return ret_str vertices_of_interest = set([x for x in in_graph if in_graph.in_degree(x) != 1 or in_graph.out_degree(x) != 1]) read_tuples = {} for vert in vertices_of_interest: vert_id, vert_or = vert.split("_") if vert_or == '1': continue vert_len = len(read_dict[int(vert_id)][1]) # print vert_len read_starts = [(in_graph.edge[x][vert]['read_b_start']) for x in in_graph.predecessors(vert)] read_ends = [(in_graph.edge[vert][x]['read_a_start']) for x in in_graph.successors(vert)] if read_starts: read_start = max(read_starts) else: read_start = 0 if read_ends: read_end = min(read_ends) else: read_end = vert_len read_tuples[vert] = (read_start,read_end) print read_starts, read_ends, vert for vert in vertices_of_interest: vert_id, vert_or = vert.split("_") if vert_or == '1': read_tuples[vert] = read_tuples[vert_id+"_0"] start_vertices = [x for x in vertices_of_interest if in_graph.in_degree(x) == 0 or in_graph.out_degree(x) > 1] h = nx.DiGraph() read_tuples_raw = {} for vertex in vertices_of_interest: successors = 
in_graph.successors(vertex) if successors: succ = successors[0] d = in_graph.get_edge_data(vertex,succ) read_tuples_raw[vertex] = (d['read_a_start_raw'], d['read_a_end_raw']) else: predecessors = in_graph.predecessors(vertex) if not len(predecessors) == 0: pred = predecessors[0] d = in_graph.get_edge_data(pred,vertex) read_tuples_raw[vertex] = (d['read_b_start_raw'], d['read_b_end_raw']) else: read_tuples_raw[vertex] = (0,0) for vertex in vertices_of_interest: h.add_node(vertex) if vertex.split("_")[1] == '0': path_var = [(vertex,(read_tuples[vertex][0], read_tuples[vertex][1]))] else: path_var = [(vertex,(read_tuples[vertex][1], read_tuples[vertex][0]))] #print path_var segment = get_string(path_var) h.node[vertex]['start_read'] = path_var[0][1][0] h.node[vertex]['end_read'] = path_var[0][1][1] h.node[vertex]['path'] = [vertex] h.node[vertex]['segment'] = segment vertices_used = set([x for x in h.nodes()]) contig_no = 1 for start_vertex in vertices_of_interest: first_out_vertices = in_graph.successors(start_vertex) print start_vertex, first_out_vertices for vertex in first_out_vertices: predecessor = start_vertex start_vertex_id,start_vertex_or = start_vertex.split("_") cur_vertex = vertex if start_vertex_or == '0': cur_path = [(start_vertex,(read_tuples[start_vertex][1], in_graph.edge[start_vertex][cur_vertex]['read_a_start']))] elif start_vertex_or == '1': cur_path = [(start_vertex,(read_tuples[start_vertex][0], in_graph.edge[start_vertex][cur_vertex]['read_a_start']))] while cur_vertex not in vertices_of_interest: successor = in_graph.successors(cur_vertex)[0] start_point = in_graph.edge[predecessor][cur_vertex]['read_b_start'] end_point = in_graph.edge[cur_vertex][successor]['read_a_start'] cur_path.append((cur_vertex,(start_point,end_point))) vertices_used.add(cur_vertex) predecessor = cur_vertex cur_vertex = successor stop_vertex_id, stop_vertex_or = cur_vertex.split("_") if stop_vertex_or == '0': 
cur_path.append((cur_vertex,(in_graph.edge[predecessor][cur_vertex]['read_b_start'], read_tuples[cur_vertex][0]))) elif stop_vertex_or == '1': cur_path.append((cur_vertex,(in_graph.edge[predecessor][cur_vertex]['read_b_start'], read_tuples[cur_vertex][1]))) node_name = str(contig_no) h.add_node(node_name) contig_no += 1 # print cur_path node_path = [x[0] for x in cur_path] h.node[node_name]['path'] = node_path h.node[node_name]['start_read'] = path_var[0][1][0] h.node[node_name]['end_read'] = path_var[-1][1][1] h.node[node_name]['segment'] = get_string(cur_path) h.add_edges_from([(start_vertex,node_name),(node_name,cur_vertex)]) # paths.append(cur_path) #print read_tuples while set(in_graph.nodes())-vertices_used: vert = list(set(in_graph.nodes())-vertices_used)[0] vert_id,vert_or = vert.split("_") if vert_or == '0': read_start = min( min([(in_graph.edge[x][vert]['read_b_start']) for x in in_graph.predecessors(vert)]), max([(in_graph.edge[vert][x]['read_a_start']) for x in in_graph.successors(vert)])) read_end = max( min([(in_graph.edge[x][vert]['read_b_start']) for x in in_graph.predecessors(vert)]), max([(in_graph.edge[vert][x]['read_a_start']) for x in in_graph.successors(vert)])) vertRC = vert_id+"_1" else: read_start = max( min([(in_graph.edge[x][vert]['read_b_start']) for x in in_graph.predecessors(vert)]), max([(in_graph.edge[vert][x]['read_a_start']) for x in in_graph.successors(vert)])) read_end = min( min([(in_graph.edge[x][vert]['read_b_start']) for x in in_graph.predecessors(vert)]), max([(in_graph.edge[vert][x]['read_a_start']) for x in in_graph.successors(vert)])) vertRC = vert_id+"_0" successor_start = in_graph.successors(vert)[0] d = in_graph.get_edge_data(vert,successor_start) read_tuples_raw[vert] = (d['read_a_start_raw'], d['read_a_end_raw']) successor_start = in_graph.successors(vertRC)[0] d = in_graph.get_edge_data(vertRC,successor_start) read_tuples_raw[vertRC] = (d['read_a_start_raw'], d['read_a_end_raw']) h.add_node(vert) node_path = [vert] 
h.node[vert]['path'] = node_path h.node[vert]['start_read'] = read_start h.node[vert]['end_read'] = read_end h.node[vert]['segment'] = get_string([(vert,(read_start, read_end))]) vertices_used.add(vert) first_out_vertices = in_graph.successors(vert) for vertex in first_out_vertices: predecessor = vert cur_vertex = vertex cur_path = [] while cur_vertex != vert: successor = in_graph.successors(cur_vertex)[0] start_point = in_graph.edge[predecessor][cur_vertex]['read_b_start'] end_point = in_graph.edge[cur_vertex][successor]['read_a_start'] cur_path.append((cur_vertex,(start_point,end_point))) vertices_used.add(cur_vertex) predecessor = cur_vertex cur_vertex = successor node_name = str(contig_no) h.add_node(node_name) contig_no += 1 # print cur_path node_path = [x[0] for x in cur_path] h.node[node_name]['path'] = node_path h.node[node_name]['start_read'] = path_var[0][1][0] h.node[node_name]['end_read'] = path_var[-1][1][1] h.node[node_name]['segment'] = get_string(cur_path) h.add_edges_from([(vert,node_name),(node_name,vert)]) if vertRC not in vertices_used: h.add_node(vertRC) h.node[vertRC]['segment'] = get_string([(vertRC,(read_end, read_start))]) h.node[vertRC]['path'] = [vertRC] h.node[vertRC]['start_read'] = read_end h.node[vertRC]['end_read'] = read_start vertices_used.add(vertRC) first_out_vertices = in_graph.successors(vertRC) for vertex in first_out_vertices: predecessor = vertRC cur_vertex = vertex cur_path = [] while cur_vertex != vertRC: successor = in_graph.successors(cur_vertex)[0] start_point = in_graph.edge[predecessor][cur_vertex]['read_b_start'] end_point = in_graph.edge[cur_vertex][successor]['read_a_start'] cur_path.append((cur_vertex,(start_point,end_point))) vertices_used.add(cur_vertex) predecessor = cur_vertex cur_vertex = successor node_name = str(contig_no) h.add_node(node_name) contig_no += 1 # print cur_path node_path = [x[0] for x in cur_path] h.node[node_name]['path'] = node_path h.node[node_name]['start_read'] = path_var[0][1][0] 
h.node[node_name]['end_read'] = path_var[-1][1][1] h.node[node_name]['segment'] = get_string(cur_path) print len(cur_path) h.add_edges_from([(vertRC,node_name),(node_name,vertRC)]) outfile = '/data/pacbio_assembly/pb_data/NCTC/'+NCTCname+'/'+NCTCname + ".edges.list" vert_to_merge = [x for x in h.nodes() if len(h.successors(x)) == 1 and len(h.predecessors(h.successors(x)[0])) == 1 and len(nx.node_connected_component(h.to_undirected(), x)) > 2] while True: vert_to_merge = [x for x in h.nodes() if len(h.successors(x)) == 1 and len(h.predecessors(h.successors(x)[0])) == 1 and len(nx.node_connected_component(h.to_undirected(), x)) > 2] if not vert_to_merge: break vert = vert_to_merge[0] #print vert, succ = h.successors(vert)[0] preds = h.predecessors(vert) h.node[succ]['segment'] = h.node[vert]['segment'] + h.node[succ]['segment'] h.node[succ]['path'] = h.node[vert]['path'] + h.node[succ]['path'][1:] for pred in preds: #print pred, succ h.add_edges_from([(pred,succ)]) h.remove_edge(pred,vert) h.remove_edge(vert,succ) h.remove_node(vert) for i, vert in enumerate(h.nodes()): print i,len(h.node[vert]['path']) with open(outfile, 'w') as f: for i,node in enumerate(h.nodes()): #print node #print h.node[node] path = h.node[node]['path'] f.write('>Unitig%d\n'%(i)) if len(path) == 1: #print path[0] f.write(' '.join([path[0].split('_')[0], path[0].split('_')[1], str(read_tuples_raw[path[0]][0]), str(read_tuples_raw[path[0]][1])]) + '\n') for j in range(len(path)-1): nodeA = path[j].lstrip("B") nodeB = path[j+1].lstrip("B") d = in_graph.get_edge_data(path[j],path[j+1]) f.write('%s %s %s %s %d %d %d %d %d\n'%(nodeA.split('_')[0],nodeA.split('_')[1] , nodeB.split('_')[0], nodeB.split('_')[1], -d['read_a_start_raw'] + d['read_a_end_raw'] - d['read_b_start_raw'] + d['read_b_end_raw'], d['read_a_start_raw'], d['read_a_end_raw'], d['read_b_start_raw'], d['read_b_end_raw'])) out_graphml_name = '/data/pacbio_assembly/pb_data/NCTC/'+NCTCname+'/'+NCTCname+'_draft.graphml' gfaname = 
'/data/pacbio_assembly/pb_data/NCTC/'+NCTCname+'/'+NCTCname+'_draft_python.gfa' consensus_name = sys.argv[3] consensus_contigs = [] try: with open(consensus_name) as f: for line in f: if line[0] != '>': consensus_contigs.append(line.strip()) except: pass for i, vert in enumerate(h.nodes()): print i,len(h.node[vert]['path']), len(h.node[vert]['segment']), len(consensus_contigs[i]) with open(gfaname,'w') as f: f.write("H\tVN:Z:1.0\n") for i,vert in enumerate(h.nodes()): if len(consensus_contigs) > 0: seg = consensus_contigs[i] else: seg = h.node[vert]['segment'] seg_line = "S\t"+vert+"\t"+seg + '\n' f.write(seg_line) for edge in h.edges(): edge_line = "L\t"+edge[0]+"\t+\t"+edge[1]+"\t+\t0M\n" f.write(edge_line) #last = h.nodes()[-1] #print h.node[last] #path_last = h.node[last]['path'] #for i in range(len(path_last)-1): # read_a = path_last[i] # read_b = path_last[i+1] # print read_a, read_b, in_graph.edge[read_a][read_b] for i,node in enumerate(h.nodes()): h.node[node]['path'] = ';'.join(h.node[node]['path']) nx.write_graphml(h,out_graphml_name) HINGE-0.5.0/scripts/get_draft_path.py000077500000000000000000000355211314415550300173740ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * import numpy as np import networkx as nx import itertools from pbcore.io import FastaIO def rev_node(node): node_id = node.split('_')[0] return node_id + '_' + str(1-int(node.split('_')[1])) def merge_nodes(g,in_node,out_node): weight = str(g.edge[in_node][out_node]['length']) if 'path' in g.node[in_node]: path1 = g.node[in_node]['path'] weightspath1 = g.node[in_node]['weightspath'] else: path1 = in_node weightspath1 = '' if 'path' in g.node[out_node]: path2 = g.node[out_node]['path'] weightspath2 = ';' + g.node[out_node]['weightspath'] else: path2 = out_node weightspath2 = '' g.node[in_node]['path'] = path1 + ';' + path2 if weightspath1 == '': g.node[in_node]['weightspath'] = weight + weightspath2 else: 
g.node[in_node]['weightspath'] = weightspath1 + ';' + weight + weightspath2 for nodeB in g.successors(out_node): g.add_edge(in_node,nodeB,length=g.edge[out_node][nodeB]['length']) g.node[in_node]['cut_end'] = g.node[out_node]['cut_end'] g.remove_node(out_node) filedir = sys.argv[1] filename = sys.argv[2] graphml_path = sys.argv[3] in_graph = nx.read_graphml(graphml_path) # debug output #for node in in_graph.nodes(): # print node #for edge in in_graph.edges(): # print len(in_graph.edge[edge[0]][edge[1]]) reads = sorted(list(set([int(x.split("_")[0].lstrip("B")) for x in in_graph.nodes()]))) dbshow_reads = ' '.join([str(x+1) for x in reads]) DBshow_cmd = "DBshow "+ filedir+'/'+ filename+' '+dbshow_reads stream = subprocess.Popen(DBshow_cmd.split(), stdout=subprocess.PIPE,bufsize=1) reads_queried = parse_read(stream.stdout) read_dict = {} for read_id,read in itertools.izip(reads,reads_queried): rdlen = len(read[1]) # print read read_dict[read_id] = read # to simulate reads # read_dict = {} # for vertex in in_graph.nodes(): # read_dict[int(vertex.split('_')[0])] = ['A','A'*50000] complement = {'A':'T','C': 'G','T':'A', 'G':'C','a':'t','t':'a','c':'g','g':'c'} # out_graphml_name = 'test.graphml' out_graphml_name = filedir + '/' + filename +'_draft.graphml' # outfile = 'test.edges.list' outfile = filedir + '/' + filename + ".edges.list" rev_comp_contig = True out_graph = in_graph.copy() # first we add some info to the graph for the cutting of contigs for vert in out_graph.nodes(): vert_id, vert_or = vert.split("_") vert_id = vert_id.lstrip("B") vert_len = len(read_dict[int(vert_id)][1]) out_graph.node[vert]['cut_start'] = 0 out_graph.node[vert]['cut_end'] = vert_len # SHOULD THIS USE THE RAW MATCHES? 
if out_graph.in_degree(vert) > 1: if vert_or == '0': out_graph.node[vert]['cut_start'] = max([out_graph.edge[x][vert]['read_b_match_start'] for x in out_graph.predecessors(vert)]) else: out_graph.node[vert]['cut_start'] = vert_len - min([out_graph.edge[vert_id+'_0'][x]['read_a_match_start'] for x in out_graph.successors(vert_id+'_0')]) if out_graph.out_degree(vert) > 1: if vert_or == '0': out_graph.node[vert]['cut_end'] = min([out_graph.edge[vert][x]['read_a_match_start'] for x in out_graph.successors(vert)]) else: out_graph.node[vert]['cut_end'] = vert_len - max([out_graph.edge[x][vert_id+'_0']['read_b_match_start'] for x in out_graph.predecessors(vert_id+'_0')]) # next we merge the nodes in out_graph to form the contigs nodes_to_merge = [x for x in out_graph.nodes() if out_graph.in_degree(x) == 1 and out_graph.out_degree(out_graph.predecessors(x)[0]) == 1] # print len(read_dict[41260][1]) # print len(read_dict[4697][1]) while nodes_to_merge: cur_node = nodes_to_merge[0] prev_node = out_graph.predecessors(cur_node)[0] if prev_node != cur_node: merge_nodes(out_graph,prev_node,cur_node) else: out_graph.node[cur_node]['path'] = out_graph.node[cur_node]['path'] + ';' + cur_node out_graph.node[cur_node]['weightspath'] = out_graph.node[cur_node]['weightspath'] + ';' + str(out_graph.edge[prev_node][cur_node]['length']) out_graph.node[cur_node]['cut_end'] = len(read_dict[int(cur_node.split('_')[0])][1]) nodes_to_merge.pop(0) # print len(nodes_to_merge) # next we print the contigs out to the .edges.list file contig_no = 0 # print "Writing out_graph with "+str(len(out_graph.nodes()))+" contigs/nodes" # we keep track of the already printed nodes so that reverse complement pairs are printed together # we don't add to printed_nodes the "border" nodes so that we still have a partition of the nodes into contigs # printed_nodes = set() printed_nodes = {} # debug output # for node in out_graph.nodes(): # print node # print out_graph.node[node] # for edge in out_graph.edges(): # 
print edge # print out_graph.edge[edge[0]][edge[1]] with open(outfile, 'w') as f: for vertex in out_graph.nodes(): if rev_node(vertex) in printed_nodes: out_graph.node[vertex]['contig_id'] = printed_nodes[rev_node(vertex)] + 1 continue # single-node contig if 'path' not in out_graph.node[vertex]: out_graph.node[vertex]['contig_id'] = contig_no + 1 f.write('>Unitig%d\n'%(contig_no)) # printed_nodes = printed_nodes | set([vertex]) printed_nodes[vertex] = contig_no contig_no += 1 # we repeat the same node twice so that the line is easily distinguishable (6 numbers) f.write('O %s %s %s %s %d %d\n'%(vertex.split('_')[0].lstrip('B'), vertex.split('_')[1] , vertex.split('_')[0].lstrip('B'), vertex.split('_')[1], out_graph.node[vertex]['cut_start'], out_graph.node[vertex]['cut_end']) ) f.write('>Unitig%d\n'%(contig_no)) contig_no += 1 vertex_rc = rev_node(vertex) f.write('O %s %s %s %s %d %d\n'%(vertex_rc.split('_')[0].lstrip('B'), vertex_rc.split('_')[1] , vertex_rc.split('_')[0].lstrip('B'), vertex_rc.split('_')[1], out_graph.node[vertex_rc]['cut_start'], out_graph.node[vertex_rc]['cut_end']) ) continue node_list = out_graph.node[vertex]['path'].split(';') weights_list = out_graph.node[vertex]['weightspath'].split(';') # double-node contig if out_graph.in_degree(vertex) != 1 and out_graph.out_degree(vertex) != 1 and len(node_list) == 2: out_graph.node[vertex]['contig_id'] = contig_no f.write('>Unitig%d\n'%(contig_no)) # printed_nodes = printed_nodes | set(node_list) printed_nodes[node_list[0]] = contig_no printed_nodes[node_list[1]] = contig_no contig_no += 1 nodeA = node_list[0] nodeB = node_list[1] f.write('D %s %s %s %s %s %d %d\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[0], out_graph.node[vertex]['cut_start'], out_graph.node[vertex]['cut_end']) ) f.write('>Unitig%d\n'%(contig_no)) contig_no += 1 nodeA = rev_node(node_list[1]) nodeB = rev_node(node_list[0]) f.write('D %s %s %s %s %s %d 
%d\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[0], len(read_dict[int(nodeA.split('_')[0])][1]) - out_graph.node[vertex]['cut_end'], len(read_dict[int(nodeB.split('_')[0])][1]) - out_graph.node[vertex]['cut_start'] ) ) continue # print out_graph.node[vertex]['path'] # print node_list # print out_graph.node[vertex]['weightspath'] # print weights_list # print len(node_list),len(weights_list) if len(node_list) != len(weights_list)+1: print 'Something went wrong with contig '+str(contig_no) continue # printed_nodes = printed_nodes | set(node_list) for curnode in node_list: printed_nodes[curnode] = contig_no # print 'Unitig ' +str(contig_no) + ' ('+str(len(node_list))+' nodes)' out_graph.node[vertex]['contig_id'] = contig_no f.write('>Unitig%d\n'%(contig_no)) contig_no += 1 # prev_vert = out_graph.node[node_list[0]]['prev_node'] # if prev_vert != '': if out_graph.in_degree(vertex) == 1 and out_graph.predecessors(vertex)[0] != vertex: prev_contig = out_graph.predecessors(vertex)[0] cut_start = out_graph.node[prev_contig]['cut_end'] if out_graph.node[prev_contig].has_key('path'): nodeA = out_graph.node[prev_contig]['path'].split(';')[-1] else: nodeA = prev_contig nodeB = node_list[0] f.write('S %s %s %s %s %s %d\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], out_graph.edge[prev_contig][vertex]['length'], cut_start) ) if len(node_list) > 2: nodeA = node_list[0] nodeB = node_list[1] f.write('T %s %s %s %s %s\n'%(nodeA.split('_')[0].lstrip('B'),nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[0]) ) else: nodeA = node_list[0] nodeB = node_list[1] f.write('S %s %s %s %s %s %d\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[0], out_graph.node[vertex]['cut_start']) ) for i in range(1,len(weights_list)-1): nodeA = 
node_list[i] nodeB = node_list[i+1] f.write('T %s %s %s %s %s\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[i]) ) if out_graph.out_degree(vertex) == 1 and out_graph.successors(vertex)[0] != vertex: if len(node_list) > 2: nodeA = node_list[len(weights_list)-1] nodeB = node_list[len(weights_list)] f.write('T %s %s %s %s %s\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[-1]) ) next_contig = out_graph.successors(vertex)[0] # we end this contig where the next one begins cut_end = out_graph.node[next_contig]['cut_start'] nodeA = node_list[len(weights_list)] if out_graph.node[next_contig].has_key('path'): nodeB = out_graph.node[next_contig]['path'].split(';')[0] else: nodeB = next_contig f.write('E %s %s %s %s %s %d\n'%(nodeA.split('_')[0].lstrip('B'),nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], out_graph.edge[vertex][next_contig]['length'], cut_end) ) else: nodeA = node_list[len(weights_list)-1] nodeB = node_list[len(weights_list)] f.write('E %s %s %s %s %s %d\n'%(nodeA.split('_')[0].lstrip('B'),nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[-1], out_graph.node[vertex]['cut_end']) ) # if we want reverse complement contigs, we print them next to each other if rev_comp_contig == False: continue f.write('>Unitig%d\n'%(contig_no)) contig_no += 1 if out_graph.out_degree(vertex) == 1 and out_graph.successors(vertex)[0] != vertex: next_contig = out_graph.successors(vertex)[0] nodeB = rev_node(node_list[len(weights_list)]) if out_graph.node[next_contig].has_key('path'): nodeA = rev_node(out_graph.node[next_contig]['path'].split(';')[0]) else: nodeA = rev_node(next_contig) # we start this contig where the previous (rc: next) one ended cut_start = len(read_dict[int(nodeA.split('_')[0])][1]) - out_graph.node[next_contig]['cut_start'] f.write('S %s %s %s 
%s %s %d\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], out_graph.edge[vertex][next_contig]['length'], cut_start) ) if len(node_list) > 2: nodeA = rev_node(node_list[len(weights_list)]) nodeB = rev_node(node_list[len(weights_list)-1]) f.write('T %s %s %s %s %s\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[-1]) ) else: nodeA = rev_node(node_list[len(weights_list)]) nodeB = rev_node(node_list[len(weights_list)-1]) f.write('S %s %s %s %s %s %d\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[-1], len(read_dict[int(nodeA.split('_')[0])][1]) - out_graph.node[vertex]['cut_end']) ) for i in range(len(weights_list)-1,1,-1): nodeA = rev_node(node_list[i]) nodeB = rev_node(node_list[i-1]) f.write('T %s %s %s %s %s\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[i-1]) ) if out_graph.in_degree(vertex) == 1 and out_graph.predecessors(vertex)[0] != vertex: if len(node_list) > 2: nodeA = rev_node(node_list[1]) nodeB = rev_node(node_list[0]) f.write('T %s %s %s %s %s\n'%(nodeA.split('_')[0].lstrip('B'), nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[0]) ) prev_contig = out_graph.predecessors(vertex)[0] nodeA = rev_node(node_list[0]) if out_graph.node[prev_contig].has_key('path'): nodeB = rev_node(out_graph.node[prev_contig]['path'].split(';')[-1]) else: nodeB = rev_node(prev_contig) cut_end = len(read_dict[int(nodeB.split('_')[0])][1]) - out_graph.node[prev_contig]['cut_end'] f.write('E %s %s %s %s %s %d\n'%(nodeA.split('_')[0].lstrip('B'),nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], out_graph.edge[prev_contig][vertex]['length'], cut_end) ) else: nodeB = rev_node(node_list[0]) nodeA = rev_node(node_list[1]) 
f.write('E %s %s %s %s %s %d\n'%(nodeA.split('_')[0].lstrip('B'),nodeA.split('_')[1] , nodeB.split('_')[0].lstrip('B'), nodeB.split('_')[1], weights_list[0], len(read_dict[int(nodeB.split('_')[0])][1]) - out_graph.node[vertex]['cut_start']) ) print "Number of contigs: "+str(contig_no) nx.write_graphml(out_graph,out_graphml_name) HINGE-0.5.0/scripts/get_draft_path_norevcomp.py000077500000000000000000000006061314415550300214600ustar00rootroot00000000000000#!/usr/bin/env python import sys import os from pbcore.io import FastaIO def run(reader, writer): for i,record in enumerate(reader): if i%2 == 0: writer.writeRecord(record.header, record.sequence) if __name__ == '__main__': iname, oname = sys.argv[1:3] reader = FastaIO.FastaReader(iname) writer = FastaIO.FastaWriter(oname) run(reader, writer) HINGE-0.5.0/scripts/get_single_strand.py000066400000000000000000000005701314415550300201050ustar00rootroot00000000000000#!/usr/bin/env python #usage python get_single_strand.py from pbcore.io import FastaIO import sys flpath = sys.argv[1] outpath = sys.argv[2] writer = FastaIO.FastaWriter(outpath) reader = FastaIO.FastaReader(flpath) j = 0 for i,record in enumerate(reader): if j%2 == 0: writer.writeRecord('Consensus'+str(j), record.sequence) j+=1HINGE-0.5.0/scripts/interface_utils.py000077500000000000000000000072261314415550300176020ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * from parse_alignment import * from parse_qv import * #filename = sys.argv[1] #readarg = sys.argv[2] def get_reads(filename, readlist): stream = subprocess.Popen(["DBshow", filename] + map(str,readlist), stdout=subprocess.PIPE,bufsize=1) reads = parse_read(stream.stdout) # generator return reads def get_QV(filename, readlist): stream = subprocess.Popen(["DBdump", filename, '-i'] + map(str,readlist), stdout=subprocess.PIPE,bufsize=1) qv = parse_qv(stream.stdout) # generator return qv def get_alignments(filename, readlist): stream = 
subprocess.Popen(["LAshow", filename,filename]+ map(str,readlist), stdout=subprocess.PIPE,bufsize=1) alignments = parse_alignment(stream.stdout) # generator return alignments def get_alignments2(filename, alignmentname, readlist): stream = subprocess.Popen(["LA4Awesome", filename, filename, alignmentname]+ map(str,readlist), stdout=subprocess.PIPE,bufsize=1) alignments = parse_alignment2(stream.stdout) # generator return alignments def get_alignments_mapping(filename, ref, alignmentname, readlist): stream = subprocess.Popen(["LA4Awesome", filename, ref, alignmentname]+ map(str,readlist)+ ['-F'], stdout=subprocess.PIPE,bufsize=1) alignments = parse_alignment2(stream.stdout) # generator return alignments def get_alignments_mapping2(ref, filename, alignmentname): print ref,filename,alignmentname stream = subprocess.Popen(["LA4Awesome", ref, filename, alignmentname], stdout=subprocess.PIPE,bufsize=1) alignments = parse_alignment2(stream.stdout) # generator return alignments def get_alignments_mapping3(ref, filename, alignmentname, contig_no): print ref,filename,alignmentname stream = subprocess.Popen(["LA4Awesome", ref, filename, alignmentname, contig_no], stdout=subprocess.PIPE,bufsize=1) alignments = parse_alignment2(stream.stdout) # generator return alignments def get_all_reads(filename): stream = subprocess.Popen(["DBshow", filename], stdout=subprocess.PIPE,bufsize=1) reads = parse_read(stream.stdout) # generator return reads def get_all_alignments(filename): stream = subprocess.Popen(["LAshow", filename, filename ], stdout=subprocess.PIPE,bufsize=1) alignments = parse_alignment(stream.stdout) # generator return alignments def get_all_alignments2(filename, alignmentname): stream = subprocess.Popen(["LA4Awesome", filename, filename, alignmentname ], stdout=subprocess.PIPE,bufsize=1) alignments = parse_alignment2(stream.stdout) # generator return alignments def get_all_reads_in_alignment_with_one(filename,read): this_read = get_reads(filename,[read]) alignments = 
list(get_alignments(filename,[read])) readlist = map(lambda x:x[2],alignments) print readlist other_reads = get_reads(filename,readlist) return [list(this_read), list(other_reads), alignments] # note that this is not a generator # test #for item in get_reads('G',[1]): # print item #for item in get_alignments('G',[1]): # print item #for item in get_alignments2('G','G.1.las',[1]): # print item #for item in get_all_reads_in_alignment_with_one('G',1): # print item #for item in get_reads('G', [1,2,3]): # print item HINGE-0.5.0/scripts/longest_path.py000077500000000000000000000027031314415550300171040ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys from collections import Counter def longest_path(G): dist = {} # stores [node, distance] pair for node in nx.topological_sort(G): # pairs of dist,node for all incoming edges pairs = [(dist[v][0]+1,v) for v in G.pred[node]] if pairs: dist[node] = max(pairs) else: dist[node] = (0, node) node,(length,_) = max(dist.items(), key=lambda x:x[1]) path = [] while length > 0: path.append(node) length,node = dist[node] return list(reversed(path)) filename = sys.argv[1] g = nx.DiGraph() with open(filename,'r') as f: for line in f.xreadlines(): g.add_edge(*(line.strip().split('->'))) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) for i in range(7): for node in g.nodes(): if g.in_degree(node) == 0: g.remove_node(node) print nx.info(g) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) def rev(string): if string[-1] == '\'': return string[:-1] else: return string+'\'' for edge in g.edges(): g.add_edge(rev(edge[1]), rev(edge[0])) #print edge #print rev(edge[1]), rev(edge[0]) print nx.info(g) nx.write_graphml(g, filename.split('.')[0]+'.graphml') #print(list(nx.dfs_edges(g,sys.argv[2]))) #p=nx.shortest_path(g) 
HINGE-0.5.0/scripts/merge_hinges.py000077500000000000000000000504541314415550300170570ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import random import sys from collections import Counter import ujson def dead_end_clipping(G,threshold): # H=nx.DiGraph() H = G.copy() start_nodes = set([x for x in H.nodes() if H.in_degree(x) ==0]) for st_node in start_nodes: cur_path = [st_node] if len(H.successors(st_node)) == 1: cur_node = H.successors(st_node)[0] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1 and len(cur_path) < threshold + 2: cur_path.append(cur_node) cur_node = H.successors(cur_node)[0] if len(cur_path) <= threshold: for vertex in cur_path: H.remove_node(vertex) end_nodes = set([x for x in H.nodes() if H.out_degree(x) ==0]) for end_node in end_nodes: cur_path = [end_node] if len(H.predecessors(end_node)) == 1: cur_node = H.predecessors(end_node)[0] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1 and len(cur_path) < threshold + 2: cur_path.append(cur_node) cur_node = H.predecessors(cur_node)[0] if len(cur_path) <= threshold: for vertex in cur_path: H.remove_node(vertex) return H # In[9]: def z_clipping(G,threshold,in_hinges,out_hinges,print_z = False): H = G.copy() start_nodes = set([x for x in H.nodes() if H.out_degree(x) > 1 and x not in out_hinges]) for st_node in start_nodes: for sec_node in H.successors(st_node): if H.out_degree(st_node) == 1: break cur_node = sec_node cur_path = [[st_node,cur_node]] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1: cur_path.append([cur_node,H.successors(cur_node)[0]]) cur_node = H.successors(cur_node)[0] if len(cur_path) > threshold + 1: break if len(cur_path) <= threshold and H.in_degree(cur_node) > 1 and H.out_degree(st_node) > 1 and cur_node not in in_hinges: if print_z: print cur_path for edge in cur_path: H.remove_edge(edge[0],edge[1]) for j in range(len(cur_path)-1): H.remove_node(cur_path[j][1]) end_nodes = set([x for x in H.nodes() if 
H.in_degree(x) > 1 and x not in in_hinges]) for end_node in end_nodes: for sec_node in H.predecessors(end_node): if H.in_degree(end_node) == 1: break cur_node = sec_node cur_path = [[cur_node,end_node]] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1: cur_path.append([H.predecessors(cur_node)[0],cur_node]) cur_node = H.predecessors(cur_node)[0] if len(cur_path) > threshold + 1: break if len(cur_path) <= threshold and H.out_degree(cur_node) > 1 and H.in_degree(end_node) > 1 and cur_node not in out_hinges: if print_z: print cur_path for edge in cur_path: H.remove_edge(edge[0],edge[1]) for j in range(len(cur_path)-1): H.remove_node(cur_path[j][0]) return H def merge_path(g,in_node,node,out_node): g.add_edge(in_node,out_node,hinge_edge = -1,false_positive = 0) g.remove_node(node) def merge_a_to_b(g,node_a,node_b): if node_a not in g.nodes() or node_b not in g.nodes(): return for node in g.predecessors(node_a): if node != node_b: g.add_edge(node,node_b,hinge_edge = 1,false_positive = 0) for node in g.successors(node_a): if node != node_b: g.add_edge(node_b,node,hinge_edge = 1,false_positive = 0) g.remove_node(node_a) def random_condensation(G,n_nodes): g = G.copy() max_iter = 20000 iter_cnt = 0 while len(g.nodes()) > n_nodes and iter_cnt < max_iter: iter_cnt += 1 node = g.nodes()[random.randrange(len(g.nodes()))] if g.in_degree(node) == 1 and g.out_degree(node) == 1: in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: #print in_node, node, out_node # merge_path(g,in_node,node,out_node) bad_node=False for in_edge in g.in_edges(node): if g.edge[in_edge[0]][in_edge[1]]['false_positive']==1: bad_node=True for out_edge in g.out_edges(node): if g.edge[out_edge[0]][out_edge[1]]['false_positive']==1: bad_node=True if not bad_node: #print in_node, node, out_node merge_path(g,in_node,node,out_node) if iter_cnt >= max_iter: 
print "couldn't finish sparsification"+str(len(g.nodes())) return g def add_groundtruth(g,json_file,in_hinges,out_hinges): mapping = ujson.load(json_file) print 'getting mapping' mapped_nodes=0 print str(len(mapping)) print str(len(g.nodes())) slack = 500 for node in g.nodes(): # print node node_base=node.split("_")[0] # print node_base #print node if mapping.has_key(node_base): g.node[node]['aln_start'] = min (mapping[node_base][0][0],mapping[node_base][0][1]) g.node[node]['aln_end'] = max(mapping[node_base][0][1],mapping[node_base][0][0]) # g.node[node]['chr'] = mapping[node_base][0][2] mapped_nodes+=1 else: # pass g.node[node]['aln_start'] = 0 g.node[node]['aln_end'] = 0 # g.node[node]['aln_strand'] = 0 if node in in_hinges or node in out_hinges: g.node[node]['hinge'] = 1 else: g.node[node]['hinge'] = 0 for edge in g.edges_iter(): in_node=edge[0] out_node=edge[1] # if ((g.node[in_node]['aln_start'] < g.node[out_node]['aln_start'] and # g.node[out_node]['aln_start'] < g.node[in_node]['aln_end']) or # (g.node[in_node]['aln_start'] < g.node[out_node]['aln_end'] and # g.node[out_node]['aln_end'] < g.node[in_node]['aln_end'])): # g.edge[in_node][out_node]['false_positive']=0 # else: # g.edge[in_node][out_node]['false_positive']=1 if ((g.node[in_node]['aln_start'] < g.node[out_node]['aln_start'] and g.node[out_node]['aln_start'] < g.node[in_node]['aln_end']) or (g.node[in_node]['aln_start'] < g.node[out_node]['aln_end'] and g.node[out_node]['aln_end'] < g.node[in_node]['aln_end'])): g.edge[in_node][out_node]['false_positive']=0 else: g.edge[in_node][out_node]['false_positive']=1 return g def read_graph(edges_file,hg_file,gt_file, hinge_file): prefix = edges_file.split('.')[0] with open(gt_file) as f: read_dict = ujson.load(f) g = nx.DiGraph() hinge_nodes = [] hinge_pos = {} with open (hinge_file) as f: for lines in f: lines1=lines.split() hinge_nodes.append(lines1[0] + "_0_" + lines1[1]) hinge_nodes.append(lines1[0] + "_1_" + lines1[1]) # if lines1[0] not in 
hinge_pos: # hinge_pos[lines] # hinge_pos[lines1[0]] = lines1[1] # with open (hg_file) as f: # for lines in f: # lines1=lines.split() # g.add_node(lines1[0] + "_" + lines1[2]) # g.add_node(lines1[1] + "_" + lines1[3]) # if lines1[0] in read_dict: # g.node[lines1[0] + "_" + lines1[2]]['aln_start']=min(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) # g.node[lines1[0] + "_" + lines1[2]]['aln_end']=max(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) # else: # g.node[lines1[0] + "_" + lines1[2]]['aln_start']=0 # g.node[lines1[0] + "_" + lines1[2]]['aln_end']=0 # if lines1[1] in read_dict: # g.node[lines1[1] + "_" + lines1[3]]['aln_start']=min(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) # g.node[lines1[1] + "_" + lines1[3]]['aln_end']=max(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) # else: # g.node[lines1[1] + "_" + lines1[3]]['aln_start']=0 # g.node[lines1[1] + "_" + lines1[3]]['aln_end']=0 # if lines1[0] in hinge_nodes: # g.node[lines1[0] + "_" + lines1[2]]['active']=2 # else: # g.node[lines1[0] + "_" + lines1[2]]['active']=1 # if lines1[1] in hinge_nodes: # g.node[lines1[1] + "_" + lines1[3]]['active']=2 # else: # g.node[lines1[1] + "_" + lines1[3]]['active']=int(lines1[4]) # g.add_edge(lines1[0] + "_" + lines1[2], lines1[1] + "_" + lines1[3], rev = int(lines1[5])) # need to construct double stranded hinge graph, so that proper mapping can be found with open (hg_file) as f: for lines in f: lines1=lines.split() nodeA0 = lines1[0] + "_0_"+ lines1[2] nodeA1 = lines1[0] + "_1_"+ lines1[2] nodeB0 = lines1[1] + "_0_"+ lines1[3] nodeB1 = lines1[1] + "_1_"+ lines1[3] nodeA0short = lines1[0] + "_0" nodeA1short = lines1[0] + "_1" nodeB0short = lines1[1] + "_0" nodeB1short = lines1[1] + "_1" g.add_node(nodeA0) g.add_node(nodeA1) g.add_node(nodeB0) g.add_node(nodeB1) if nodeA0short not in hinge_pos: hinge_pos[nodeA0short] = [int(lines1[2])] hinge_pos[nodeA1short] = [int(lines1[2])] elif lines1[2] not in hinge_pos[nodeA0short]: 
hinge_pos[nodeA0short].append(int(lines1[2])) hinge_pos[nodeA1short].append(int(lines1[2])) if nodeB0 not in hinge_pos: hinge_pos[nodeB0short] = [int(lines1[3])] hinge_pos[nodeB1short] = [int(lines1[3])] elif lines1[3] not in hinge_pos[nodeB0short]: hinge_pos[nodeB0short].append(int(lines1[3])) hinge_pos[nodeB1short].append(int(lines1[3])) if lines1[0] in read_dict: g.node[lines1[0] + "_0_"+ lines1[2]]['aln_start']=min(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) g.node[lines1[0] + "_0_"+ lines1[2]]['aln_end']=max(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) g.node[lines1[0] + "_1_"+ lines1[2]]['aln_start']=min(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) g.node[lines1[0] + "_1_"+ lines1[2]]['aln_end']=max(read_dict[lines1[0]][0][0],read_dict[lines1[0]][0][1]) else: g.node[lines1[0] + "_0_"+ lines1[2]]['aln_start']=0 g.node[lines1[0] + "_0_"+ lines1[2]]['aln_end']=0 g.node[lines1[0] + "_1_"+ lines1[2]]['aln_start']=0 g.node[lines1[0] + "_1_"+ lines1[2]]['aln_end']=0 if lines1[1] in read_dict: g.node[lines1[1] + "_0_"+ lines1[3]]['aln_start']=min(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) g.node[lines1[1] + "_0_"+ lines1[3]]['aln_end']=max(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) g.node[lines1[1] + "_1_"+ lines1[3]]['aln_start']=min(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) g.node[lines1[1] + "_1_"+ lines1[3]]['aln_end']=max(read_dict[lines1[1]][0][0],read_dict[lines1[1]][0][1]) else: g.node[lines1[1] + "_0_"+ lines1[3]]['aln_start']=0 g.node[lines1[1] + "_0_"+ lines1[3]]['aln_end']=0 g.node[lines1[1] + "_1_"+ lines1[3]]['aln_start']=0 g.node[lines1[1] + "_1_"+ lines1[3]]['aln_end']=0 if nodeA0 in hinge_nodes: g.node[nodeA0]['active']=2 g.node[nodeA1]['active']=2 else: g.node[nodeA0]['active']=1 g.node[nodeA1]['active']=1 if nodeB0 in hinge_nodes: g.node[nodeB0]['active']=2 g.node[nodeB1]['active']=2 else: g.node[nodeB0]['active']=int(lines1[4]) g.node[nodeB1]['active']=int(lines1[4]) if 
int(lines1[5]) == 1: # reverse match g.add_edge(nodeA0, nodeB1) g.add_edge(nodeA1, nodeB0) else: g.add_edge(nodeA0,nodeB0) g.add_edge(nodeA1,nodeB1) # nx.write_graphml(g, filename.split('.')[0]+'_hgraph.graphml') # for c in nx.connected_components(g): # print len(c) hinge_mapping = {} for c in nx.weakly_connected_components(g): if len(c) > 10: component_sink = -1 for node in c: if g.out_degree(node) == 0 and g.node[node]['active']== 2 and component_sink == -1: component_sink = node elif g.out_degree(node) == 0 and g.node[node]['active']== 2 and g.in_degree(node) > g.in_degree(component_sink): component_sink = node if component_sink != -1: g.node[component_sink]['active'] = 3 else: component_sink = list(c)[0] # sink_shortname = component_sink.split('_')[0] for node in c: hinge_mapping[node] = component_sink else: for node in c: g.node[node]['active'] = -1 nx.write_graphml(g, hg_file.split('.')[0]+'_hgraph2.graphml') # print nx.number_weakly_connected_components(g) # print nx.number_strongly_connected_components(g) G = nx.DiGraph() merging = 1 if merging == 0: with open (edges_file) as f: for lines in f: lines1=lines.split() if len(lines1) < 6: continue if int(lines1[5]) != 0: if int(lines1[5]) == 1: # nodeB_id = lines1[1]+"_"+lines1[4] # hingepos = int(lines1[6]) nodeA_id = lines1[0] + "_" + lines1[3] hinge_node = lines1[1]+"_"+lines1[4] + '_' + lines1[6] print hinge_node eff_hinge = hinge_mapping[hinge_node] eff_b = eff_hinge.split('_') if eff_b[0] + "_" + eff_b[1] != lines1[0] + "_" + lines1[3]: G.add_edge(lines1[0] + "_" + lines1[3], eff_b[0] + "_" + eff_b[1],hinge_edge=1) G.add_edge(eff_b[0] + "_" + str(1-int(eff_b[1])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=1) else: G.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4],hinge_edge=1) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=1) elif int(lines1[5]) == -1: hinge_node = lines1[0]+"_"+lines1[3] + '_' + lines1[6] eff_hinge = 
hinge_mapping[hinge_node] eff_b = eff_hinge.split('_') if eff_b[0] + "_" + eff_b[1] != lines1[1] + "_" + lines1[4] : G.add_edge(eff_b[0] + "_" + eff_b[1], lines1[1] + "_" + lines1[4] ,hinge_edge=1) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), eff_b[0] + "_" + str(1-int(eff_b[1])),hinge_edge=1) else: G.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4],hinge_edge=1) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=1) # if nodeA_id in hinge_pos: # print "found A" # else: # print "didnt find A" # if nodeB_id in hinge_pos: # print "Node B of hinged match IS in hinge_pos" # hinge_found = False # hingepos = hinge_pos[nodeB_id][0] # for candidate_pos in hinge_pos[nodeB_id]: # if (abs(candidate_pos - int(lines1[8][1:])) < 200): # hingepos = candidate_pos # print "Matching hinge found" # hinge_found = True # if not hinge_found: # print "not found" # print lines1[8][1:], hinge_pos[nodeB_id] # hinge_node = nodeB_id + '_' + str(hingepos) # eff_hinge = hinge_mapping[hinge_node] # eff_b = eff_hinge.split('_') # G.add_edge(lines1[0] + "_" + lines1[3], eff_b[0] + "_" + eff_b[1],hinge_edge=1) # G.add_edge(eff_b[0] + "_" + str(1-int(eff_b[1])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=1) else: G.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4],hinge_edge=int(lines1[5])) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=int(lines1[5])) else: to_be_merged = [] with open (edges_file) as f: for lines in f: lines1=lines.split() if len(lines1) < 6: continue G.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4],hinge_edge=int(lines1[5])) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])),hinge_edge=int(lines1[5])) if int(lines1[5]) != 0: if int(lines1[5]) == 1: to_be_merged.append([lines1[1],lines1[6]]) elif int(lines1[5]) == -1: to_be_merged.append([lines1[0],lines1[6]]) for pair 
in to_be_merged: sink_node_long = hinge_mapping[pair[0]+'_0_'+pair[1]] sink_node = sink_node_long.split('_')[0]+'_'+sink_node_long.split('_')[1] if pair[0]+'_0' != sink_node: merge_a_to_b(G,pair[0]+'_0',sink_node) sink_node_long = hinge_mapping[pair[0]+'_1_'+pair[1]] sink_node = sink_node_long.split('_')[0]+'_'+sink_node_long.split('_')[1] if pair[0]+'_1' != sink_node: merge_a_to_b(G,pair[0]+'_1',sink_node) in_hinges = set() out_hinges = set() with open (hinge_file) as f: for lines in f: lines1=lines.split() if lines1[2] == '1': in_hinges.add(lines1[0]+'_0') out_hinges.add(lines1[0]+'_1') elif lines1[2] == '-1': in_hinges.add(lines1[0]+'_1') out_hinges.add(lines1[0]+'_0') json_file = open(gt_file) add_groundtruth(G,json_file,in_hinges,out_hinges) G0 = G.copy() nx.write_graphml(G0, prefix+'.'+'G0_merged'+'.graphml') G0s = random_condensation(G0,3500) nx.write_graphml(G0s, prefix+'.'+'G0s_merged'+'.graphml') G1=dead_end_clipping(G0,10) G1=z_clipping(G1,5,in_hinges,out_hinges) nx.write_graphml(G1, prefix+'.'+'G1_merged'+'.graphml') Gs = random_condensation(G1,2500) nx.write_graphml(Gs, prefix+'.'+'Gs_merged'+'.graphml') if __name__ == "__main__": read_graph(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4]) HINGE-0.5.0/scripts/parallel_draw.sh000077500000000000000000000011021314415550300172000ustar00rootroot00000000000000#!/bin/bash echo "Bash version ${BASH_VERSION}..." for i in $(seq 4000 1 20000) do echo drawing read $i num1=$(ps -ef | grep 'python draw.py' | wc -l) num2=$(ps -ef | grep 'LA4Awesome' | wc -l) num=$(( $num1 + $num2 )) echo $num running while [ $num -gt 60 ] do sleep 5 echo waiting, $num running num1=$(ps -ef | grep 'python draw.py' | wc -l) num2=$(ps -ef | grep 'LA4Awesome' | wc -l) num=$(( $num1 + $num2 )) done python draw2.py $i & done HINGE-0.5.0/scripts/parallel_draw_large.sh000077500000000000000000000006061314415550300203620ustar00rootroot00000000000000#!/bin/bash echo "Bash version ${BASH_VERSION}..." 
for i in $(seq 1 1 100) do echo drawing read $i num1=$(ps -ef | grep 'python draw.py' | wc -l) num2=$(ps -ef | grep 'LA4Awesome' | wc -l) num=$(( $num1 + $num2 )) echo $num running while [ $num -gt 12 ] do sleep 5 echo waiting done python draw.py $i & doneHINGE-0.5.0/scripts/parse.py000077500000000000000000000024061314415550300155270ustar00rootroot00000000000000#!/usr/bin/env python import sys min_len_aln = 1000 with sys.stdin as f: for l in f: l = l.strip().split() if len(l) != 2: continue read_id = l[0] seq = l[1] print read_id,seq #if len(seq) > max_len: # seq = seq[:max_len-1] if read_id not in ("+", "-", "*"): if len(seq) >= min_len_aln: if len(seqs) == 0: seqs.append(seq) #the "seed" seed_id = l[0] if read_id not in read_ids: #avoidng using the same read twice. seed is used again here by design seqs.append(seq) read_ids.add(read_id) elif l[0] == "+": if len(seqs) >= min_cov_aln: seqs = seqs[:1] + sorted(seqs[1:], key=lambda x: -len(x)) yield (seqs[:max_n_read], seed_id, config) #seqs_data.append( (seqs, seed_id) ) seqs = [] read_ids = set() seed_id = None elif l[0] == "*": seqs = [] read_ids = set() seed_id = None elif l[0] == "-": #yield (seqs, seed_id) #seqs_data.append( (seqs, seed_id) ) break HINGE-0.5.0/scripts/parse_alignment.py000077500000000000000000000013571314415550300175710ustar00rootroot00000000000000#!/usr/bin/env python import sys import re def parse_alignment(stream = sys.stdin): with stream as f: for l in f: sub = re.sub('[\[\].x:': if sid == '': sid = l[1:].strip() else: tsid = sid tseq = seq seq = '' sid = l[1:].strip() yield (tsid,tseq) else: seq += l.strip() yield(sid,seq) #for read in parse_read(): # print read # do whatever you want to do with the reads HINGE-0.5.0/scripts/pileup.ipynb000066400000000000000000000416751314415550300164140ustar00rootroot00000000000000{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "%load_ext autoreload\n", 
"%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from ipywidgets.widgets import interact " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import interface_utils as util" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "os.environ['PATH'] += ':/data/pacbio_assembly/AwesomeAssembler/DALIGNER'\n", "#print os.popen(\"export\").read()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "path = '/data/pacbio_assembly/AwesomeAssembler/data/'\n", "aln = []\n", "for item in util.get_alignments2(path+'ecoli',path+'ecoli.las',[2]):\n", " aln.append(item)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['n', 0, 104, 170, 7424, 7424, 4749, 11779, 13726, 1294, 7],\n", " ['c', 0, 757, 206, 7424, 7424, 3568, 10592, 15876, 1242, 7],\n", " ['n', 0, 978, 214, 1850, 7424, 9996, 11675, 11675, 392, 1],\n", " ['n', 0, 1183, 1183, 3057, 7424, 156, 2065, 5115, 359, 2],\n", " ['n', 0, 1183, 3057, 6052, 7424, 2151, 5115, 5115, 517, 3]]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aln[0:5]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA28AAAJMCAYAAABtgJ7QAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH9lJREFUeJzt3X2sJfV5H/DvLLt4AVO2BGdZXpy1ApZZlCoEFdK6ESdt\nSpaoxq5UYVdyldaoUoQbrCZtvPiPzkWqGoNUpaDI7ouTAlYhQYpiJ4JgXtKD1D/CKhUk2LAGJON6\nb8xisPxShTi83P4xs9rD+m737O6ZM/Ob8/lIV3fO3HP2/nbEiOe788wzCQAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAMEAvJvnzJE8m2d/uOzfJI0meS/Jwkh0z778lyfNJDiS5dmb/lUmebn92R6crBgAA\nWEFfSxPWZt2e5Nfa7U8m+XS7vSfJU0m2Jdmd5IUkVfuz/UmuarcfTLK3m+UCAACMy9YTeG911Ovr\nk1zTbt+dZJpkX5IPJrkvyetprti9kOTqJF9PcnaOXLm7J8mHkjw0+4dec801G48//vgJLAsAAGBU\nHk8yOXrnvOFtI8mjSd5M8l+S/LckO5Mcan9+qH2dJBck+ZOZzx5McmGaMHdwZv96u//tq3z88Wxs\nbMy5LDaztraWtbW1vpcBRXC+wPycLzA/5wunoqqqazbbv2XOz78/yRVJrkvy8SQ/c9TPN9ovBmA6\n7XsFAACsInm1W/Neeftm+/1bSX4/zX1rh5Kcn+SlJLuSvNy+Zz3JxTOfvSjNFbf1dnt2//pmv2z2\nXykmk0kmk8mcywQAACjLdDrNdI4rMEffx7aZM5OcluT7Sc5KM1ny1iQ/l+TVJLeluddtR/t9T5J7\n0wS8C9O0W16S5srcE0luTnPf2wNJ7sxR97wl2dA2eWqm06nAC3NyvsD8nC8wP+cLp6KqqmSTrDZP\neHtPmqttSXOl7n8k+fU00yfvT/LuNINJbkjynfZ9n0rysSRvJPlEki+1+69McleSM9JMm7x5k98n\nvAEAACvrVMLbsglvAADAyjpWeJt3YAkAAAA9Et5GyJQfAAD6oA7tlrbJEaqqxCEEAGDZ1KGLoW0S\nAACgYMLbCNV13ysAAGAVqUO7pW0SAABgQLRNAgAAFEx4AwAAKIDwBgAAUADhbYQ8XwMAgD6oQ7sl\nvAEAABTAtEkAAIABMW0SAACgYMIbAABAAYQ3AACAAghvI2TKDwAAfVCHdsvAkhGqqsQhBABg2dSh\ni2FgCQAAQMGEtxGq675XAADAKlKHdkvbJAAAwIBomwQAACiY8AYAAFAA4Q0AAKAAwtsIeb4GAAB9\nUId2S3gDAAAogGmTAAAAA2LaJAAAQMGENwAAgAIIbwAAAAUQ3kbIlB8AAPqgDu2WgSUjVFWJQwgA\nwLKpQxfDwBIAAICCCW8jVNd9rwAAgFWkDu2WtkkAAIAB0TYJAABQMOENAACgAMIbAABAAYS3EfJ8\nDQAA+qAO7ZbwBgAAUADTJgEAAAbEtEkAAICCCW8AAAAFEN4AAAAKILyNkCk/AAD0QR3aLQNLRqiq\nEocQAIBlU4cuhoElAAAABRPeRqiu+14BAACrSB3aLW2TAAAAA6JtEgAAoGDCGwAAQAGENwAAgAII\nbyPk+RoAAPRBHdot4Q0AAKAApk0CAAAMiGmTAAAABRPeAAAACiC8AQAAFEB4GyFTfgAA6IM6tFsG\nloxQVSUOIQAAy6YOXQwDSwAAAAomvI1QXfe9AgAAVpE6tFvaJgEAAAZE2yQAAEDBhDcAAIACCG8A\nAAAFEN5GyPM1AADogzq0W8IbAABAAUybBAAAGBDTJgEAAAomvAEAABRAeAMAACiA8DZCpvwAANAH\ndWi3DCwZoapKHEIAAJZNHboYBpYAAAAUTHgbobruewUAAKwidWi3tE0CAAAMiLZJAACAgglvAAAA\nBRDeAAAACiC8jZDnawAA0Ad1aLeENwAAgAKYNgkAADAgpk0CA
AAUTHgDAAAogPAGAABQAOFthEz5\nAQCgD+rQbhlYMkJVlTiEAAAsmzp0MQwsAQAAKJjwNkJ13fcKAABYRerQbmmbBAAAGBBtkwAAAAUT\n3gAAAAogvAEAABRAeBshz9cAAKAP6tBuCW8AAAAFMG0SAABgQEybBAAAKJjwBgAAUADhDQAAoADC\n2wiZ8gMAQB/Uod0ysGSEqipxCAEAWDZ16GIYWAIAAFCwecPbaUmeTPKH7etzkzyS5LkkDyfZMfPe\nW5I8n+RAkmtn9l+Z5On2Z3ec/JI5nrruewUAAKwidWi35m2b/JU04evsJNcnuT3JK+33Tyb5m0n2\nJdmT5N4kfzvJhUkeTXJpko0k+5P8q/b7g0nuTPLQJr9L2yQAALCyTqVt8qIkv5DkczN/wPVJ7m63\n707yoXb7g0nuS/J6kheTvJDk6iS70gS//e377pn5DAAAAMcxT3j7jST/NslbM/t2JjnUbh9qXyfJ\nBUkOzrzvYJorcEfvX2/3AwAAMIfjhbd/lOTlNPe7HavFcqP9AgAAoCNbj/Pzv5umRfIXkmxP8jeS\nfD7N1bbzk7yUpiXy5fb960kunvn8RWmuuK2327P714/1S9dmHhAxmUwymUyO9/dgxtqaZ2wAALB8\n6tCTM51OM51Oj/u+E3nO2zVJ/k2SD6QZVPJqktvSDCrZkbcPLLkqRwaWXJLmytwTSW5Oc9/bAzGw\npDNOGgAA+qAOXYxjDSw50fD2q2muxJ2b5P4k704zmOSGJN9p3/epJB9L8kaSTyT5Urv/yiR3JTkj\nzbTJm4/xe4Q3AABgZS0ivC2L8AYAAKysU3lUAAAAAD0T3gAAAAogvI2Qm0QBAOiDOrRb7nkboapK\nHEIAAJZNHboY7nkDAAAomPA2QnXd9woAAFhF6tBuaZsEAAAYEG2TAAAABRPeAAAACiC8AQAAFEB4\nGyHP1wAAoA/q0G4JbwAAAAUwbRIAAGBATJsEAAAomPAGAABQAOENAACgAMLbCJnyAwBAH9Sh3TKw\nZISqKnEIAQBYNnXoYhhYAgAAUDDhbYTquu8VAACwitSh3dI2CQAAMCDaJgEAAAomvAEAABRAeAMA\nACiA8DZCnq8BAEAf1KHdEt4AAAAKYNokAADAgJg2CQAAUDDhDQAAoADCGwAAQAGEtxEy5QcAgD6o\nQ7tlYMkIVVXiEAIAsGzq0MUwsAQAAKBgwtsI1XXfKwAAYBWpQ7ulbRIAAGBAtE0CAAAUTHgDAAAo\ngPAGAABQAOFthDxfAwCAPqhDuyW8AQAAFMC0SQAAgAExbRIAAKBgwhsAAEABhDcAAIACCG8jZMoP\nAAB9UId2y8CSEaqqxCEEAGDZ1KGLYWAJAABAwYS3EarrvlcAAMAqUod2S9skAADAgGibBAAAKJjw\nBgAAUADhDQAAoADC2wh5vgYAAH1Qh3ZLeAMAACiAaZMAAAADYtokAABAwYQ3AACAAghvAAAABRDe\nRsiUHwAA+qAO7ZaBJSNUVYlDCADAsqlDF8PAEgAAgIIJbyNU132vAACAVaQO7Za2SQAAgAHRNgkA\nAFAw4Q0AAKAAwhsAAEABhLcR8nwNAAD6oA7tlvAGAABQANMmAQAABsS0SQAAgIIJbwAAAAUQ3gAA\nAAogvI2QKT8AAPRBHdotA0tGqKoShxAAgGVThy6GgSUAAAAFE95GqK77XgEAAKtIHdotbZMAAAAD\nom0SAACgYMIbAABAAYQ3AACAAghvI+T5GgAA9EEd2i3hDQAAoACmTQIAAAyIaZMAAAAFE94AAAAK\nILwBAAAUQHgbIVN+AADogzq0WwaWjFBVJQ4hAADLpg5dDANLAAAACia8jVBd970CAABWkTq0W9om\nAQAABkTbJAAAQMGENwAAgAIIbwAAAAUQ3kbI8zUAAOiDOrRbwhsAAEABTJsEAAAYENMmAQAACia8\nAQAAFEB4AwAAKIDwNkKm/
AAA0Ad1aLcMLBmhqkocQgAAlk0duhgnO7Bke5InkjyV5Jkkv97uPzfJ\nI0meS/Jwkh0zn7klyfNJDiS5dmb/lUmebn92x4n+BQAAAFbZ8cLbXyX52SQ/meRvtdt/L8m+NOHt\nvUkea18nyZ4kH26/703ymRxJjJ9NcmOSS9uvvYv6S/B2dd33CgAAWEXq0G6dSNvkmUkeT/LPk/xe\nkmuSHEpyfpJpkveluer2VpLb2s88lGQtydeT/HGSy9r9H0kySfJLm/webZMAAMDKOpXnvG1J0zZ5\nKMn/TPKVJDvb12m/72y3L0hycOazB5NcuMn+9XY/AAAAc9g6x3veStM2eU6SL6VpnZy10X4BAADQ\nkXnC22HfTfJAmsEjh9slX0qyK8nL7XvWk1w885mL0lxxW2+3Z/evH+sXrc3MGJ1MJplMJiewTAAA\ngHJMp9NMp9Pjvu9497ydl+SNJN9JckaaK2+3Jvn5JK+mubdtX5ppk/vSDCq5N8lVadoiH01ySZor\nc08kuTnJ/jQh8M4098QdzT1vp2htzTM2AABYPnXoYhzrnrfjXXnbleTuNPe9bUny+TTTJZ9Mcn+a\n6ZEvJrmhff8z7f5n0oS+m3KkpfKmJHelCYEPZvPgBgAAwCY8pBsAAGBATmXaJAAAAD0T3gAAAAog\nvAEAABRAeBshE34AAOiDOrRbBpaMUFUlDiEAAMumDl0MA0sAAAAKJryNUF33vQIAAFaROrRb2iYB\nAAAGRNskAABAwYQ3AACAAghvAAAABRDeRsjzNQAA6IM6tFvCGwAAQAFMmwQAABgQ0yYBAAAKJrwB\nAAAUQHgDAAAogPA2Qqb8AADQB3VotwwsGaGqShxCAACWTR26GAaWAAAAFEx4G6G67nsFAACsInVo\nt7RNAgAADIi2SQAAgIIJbwAAAAUQ3gAAAAogvI2Q52sAANAHdWi3hDcAAIACmDYJAAAwIKZNAgAA\nFEx4AwAAKIDwBgAAUADhbYRM+QEAoA/q0G4ZWDJCVZU4hAAALJs6dDEMLAEAACiY8DZCdd33CgAA\nWEXq0G5pmwQAABgQbZMAAAAFE94AAAAKILwBAAAUQHgbIc/XAACgD+rQbglvAAAABTBtEgAAYEBM\nmwQAACiY8AYAAFAA4Q0AAKAAwtsImfIDAEAf1KHdMrBkhKoqcQgBAFg2dehiGFgCAABQMOFthOq6\n7xUAALCK1KHd0jYJAAAwINomAQAACia8AQAAFEB4AwAAKIDwNkKerwEAQB/Uod0S3gAAAApg2iQA\nAMCAmDYJAABQMOENAACgAMIbAABAAYS3ETLlBwCAPqhDu2VgyQhVVeIQAgCwbOrQxTCwBAAAoGDC\n2wjVdd8rAABgFalDu6VtEgAAYEC0TQIAABRMeAMAACiA8AYAAFAA4W2EPF8DAIA+qEO7JbwBAAAU\nwLRJAACAATFtEgAAoGDCGwAAQAGENwAAgAIIbyNkyg8AAH1Qh3bLwJIRqqrEIQQAYNnUoYthYAkA\nAEDBhLcRquu+VwAAwCpSh3ZL2yQAAMCAaJsEAAAomPAGAABQAOENAACgAMLbCHm+BgAAfVCHdkt4\nAwAAKIBpkwAAAANi2iQAAEDBhDcAAIACCG8AAAAFEN5GyJQfAAD6oA7tloElI1RViUMIAMCyqUMX\nw8ASAACAgglvI1TXfa8AAIBVpA7tlrZJAACAAdE2CQAAUDDhDQAAoADCGwAAQAGEtxHyfA0AAPqg\nDu2W8AYAAFAA0yYBAAAGxLRJAACAgglvAAAABRDeAAAACiC8jZApPwAA9EEd2q15BpZcnOSeJD+a\nZCPJf01yZ5Jzk/xukh9L8mKSG5J8p/3MLUk+luTNJDcnebjdf2WSu5JsT/Jgkk9s8vsMLDlFVZU4\nhAAALJs6dDFOZWDJ60n+dZLLk/x0ko8nuSzJviSPJHlvksfa10myJ8mH2+97k3xm5hd/Nsm
NSS5t\nv/aezF8GAABg1cwT3l5K8lS7/X+TPJvkwiTXJ7m73X93kg+12x9Mcl+a0PdikheSXJ1kV5Kzk+xv\n33fPzGdYoLruewUAAKwidWi3TvSet91JrkjyRJKdSQ61+w+1r5PkgiQHZz5zME3YO3r/erufBdNr\nDABAH9Sh3TqR8PbOJL+X5j617x/1s432CwAAgA5snfN929IEt88n+UK771CS89O0Ve5K8nK7fz3N\nkJPDLkpzxW293Z7dv77ZL1ubieyTySSTyWTOZQIAAJRlOp1mOp0e933zTJus0tzT9mqawSWH3d7u\nuy3NsJId7fc9Se5NclWatshHk1yS5srcE2mmT+5P8kCaqZUPHfX7TJsEAABW1qlMm3x/ko8m+dkk\nT7Zfe5N8Osk/TPJckr/fvk6SZ5Lc337/oyQ35UhL5U1JPpfk+TSDTI4ObiyAXmMAAPqgDu3WPG2T\n/yvHDnk/d4z9/6H9Otr/TvITc/xOAAAAZszTNrls2iYBAICVdSptkwAAAPRMeAMAACiA8AYAAFAA\n4W2ETPkBAKAP6tBuGVgyQlWVOIQAACybOnQxDCwBAAAomPA2QnXd9woAAFhF6tBuaZsEAAAYEG2T\nAAAABRPeAAAACiC8AQAAFEB4GyHP1wAAoA/q0G4JbwAAAAUwbRIAAGBATJsEAAAomPAGAABQAOEN\nAACgAMLbCJnyAwBAH9Sh3TKwZISqKnEIAQBYNnXoYhhYAgAAUDDhbYTquu8VAACwitSh3dI2CQAA\nMCDaJgEAAAomvAEAABRAeAMAACiA8DZCnq8BAEAf1KHdEt4AAAAKYNokAADAgJg2CQAAUDDhDQAA\noADCGwAAQAGEtxEy5QcAgD6oQ7tlYMkIVVXiEAIAsGzq0MUwsAQAAKBgwtsI1XXfKwAAYBWpQ7ul\nbRIAAGBAtE0CAAAUTHgDAAAogPAGAABQAOFthDxfAwCAPqhDuyW8AQAAFMC0SQAAgAExbRIAAKBg\nwhsAAEABhDcAAIACCG8jZMoPAAB9UId2y8CSEaqqxCEEAGDZ1KGLYWAJAABAwYS3EarrvlcAAMAq\nUod2S9skAADAgGibBAAAKJjwBgAAUADhDQAAoADC2wh5vgYAAH1Qh3ZLeAMAACiAaZMAAAADYtok\nAABAwYQ3AACAAghvAAAABRDeRsiUHwAA+qAO7ZaBJSNUVYlDCADAsqlDF8PAEgAAgIIJbyNU132v\nAACAVaQO7Za2SQAAgAHRNgkAAFAw4Q0AAKAAwhsAAEABhLcR8nwNAAD6oA7tlvAGAABQANMmAQAA\nBsS0SQAAgIIJbwAAAAUQ3gAAAAogvI2QKT8AAPRBHdotA0tGqKoShxAAgGVThy6GgSUAAAAFE95G\nqK77XgEAAKtIHdotbZMAAAADom0SAACgYMIbAABAAYQ3AACAAghvI+T5GgAA9EEd2i3hDQAAoACm\nTQIAAAyIaZMAAAAFE94AAAAKILwBAAAUQHgbIVN+AADogzq0WwaWjFBVJQ4hAADLpg5dDANLAAAA\nCia8jVBd970CAABWkTq0W9omAQAABkTbJAAAQMGENwAAgAIIbwAAAAUQ3kbI8zUAAOiDOrRbwhsA\nAEABTJsEAAAYENMmAQAACia8AQAAFGCe8PbbSQ4leXpm37lJHknyXJKHk+yY+dktSZ5PciDJtTP7\nr2z/jOeT3HHySwYAAFg984S3/55k71H79qUJb+9N8lj7Okn2JPlw+31vks/kSK/mZ5PcmOTS9uvo\nP5MFMeUHAIA+qEO7Ne/Akt1J/jDJT7SvDyS5Js0VufOTTJO8L81Vt7eS3Na+76Eka0m+nuSPk1zW\n7v9IkkmSX9rkdxlYcoqqKnEIAQBYNnXoYix6YMnONMEt7fed7fYFSQ7OvO9gkgs32b/e7gcAAGAO\nixhYstF+MRB13fcKAABYRerQbm09yc8dbpd8KcmuJC+
3+9eTXDzzvovSXHFbb7dn968f6w9fm2mW\nnUwmmUwmJ7nM1aTXGACAPqhDT850Os10Oj3u+072nrfbk7ya5t62fWmmTe5LM6jk3iRXpWmLfDTJ\nJWmuzD2R5OYk+5M8kOTONPfEHc09bwAAwMo61j1v81x5uy/NcJLzknwjyb9L8ukk96eZHvlikhva\n9z7T7n8myRtJbsqRlsqbktyV5IwkD2bz4AYAAMAm5r3ytkyuvAEAACtr0dMmGTC9xgAA9EEd2i3h\nDQAAoADaJgEAAAZE2yQAAEDBhDcAAIACCG8AAAAFEN5GyJQfAAD6oA7tloElI1RViUMIAMCyqUMX\nw8ASAACAgglvI1TXfa8AAIBVpA7tlrZJAACAAdE2CQAAUDDhDQAAoADCGwAAQAGEtxHyfA0AAPqg\nDu2W8AYAAFAA0yYBAAAGxLRJAACAgglvAAAABRDeAAAACiC8jZApPwAA9EEd2i0DS0aoqhKHEACA\nZVOHLoaBJQAAAAUT3kaorvteAQAAq0gd2i1tkwAAAAOibRIAAKBgwhsAAEABhDcAAIACCG8j5Pka\nAAD0QR3aLeENAACgAKZNAgAADIhpkwAAAAUT3gAAAAogvAEAABRAeBshU34AAOiDOrRbBpaMUFUl\nDiEAAMumDl0MA0sAAAAKJryNUF33vQIAAFaROrRb2iYBAAAGRNskAABAwYQ3AACAAghvAAAABRDe\nRsjzNQAA6IM6tFvCGwAAQAFMmwQAABgQ0yYBAAAKJrwBAAAUQHgDAAAogPA2Qqb8AADQB3Votwws\nGaGqShxCAACWTR26GAaWAAAAFEx4G6G67nsFAACsInVot7RNAgAADIi2SQAAgIIJbwAAAAUQ3gAA\nAAogvI2Q52sAANAHdWi3hDcAAIACmDYJAAAwIKZNAgAAFEx4AwAAKIDwBgAAUADhbYRM+QEAoA/q\n0G4ZWDJCVZU4hAAALJs6dDEMLAEAACiY8DZCdd33CgAAWEXq0G5pmwQAABgQbZMAAAAFE94AAAAK\nILwBAAAUQHgbIc/XAACgD+rQbglvAAAABTBtEgAAYEBMmwQAACiY8AYAAFAA4Q0AAKAAwtsImfID\nAEAf1KHdMrBkhKoqcQgBAFg2dehiGFgCAABQMOFthOq67xUAALCK1KHd0jYJAAAwINomAQAACia8\nAQAAFEB4AwAAKIDwNkKerwEAQB/Uod0S3gAAAApg2iQAAMCAmDYJAABQMOENAACgAMIbAABAAYS3\nETLlBwCAPqhDu2VgyQhVVeIQAgCwbOrQxTCwBAAAoGDC2wjVdd8rAABgFalDu6VtEgAAYEC0TQIA\nABRMeAMAACiA8AYAAFAA4W2EPF8DAIA+qEO7JbwBAAAUwLRJAACAATnWtMmty18KJ6K69cTz9b/f\nupFvf7uDxYzc6Wd/P+/c/Ww2znsm3zv9q3lz46+P+5kt1Zbs3rE7e961J5f/6OV515nvOnyyAQDA\nQvVRZe5N8p+SnJbkc0luO+rnK33l7bXXX8uzrzybr7z8lRx45UBee+O1436mypa8862Lcvp3L89f\n/Z/L85eHLsgKH8KT9hvnnNzpsFE72AAALM5QnvN2WpLfTBPg9iT5p0kuW/IaYFNfvEIIAwBguJbd\nNnlVkheSvNi+/p0kH0zy7JLXUYRtp23LRo4fKKpsyTve3J5tW7bnrS3b84WHp/nABybdL3Bktm3r\newX0YTqdZjKZ9L0MKILzBea3qufL2pqJk11adni7MMk3Zl4fTHL1ktdQhLNOPys7z9qZH7zxg+O+\nt6q2ZPsP3pVq27k580c28uUvT3PjjZPuFzkyzz+f/OZ58199++hHk3PO6XBBLMWq/s8VTobzBea3\nqufLrbcKb11adnjTl3YcZ2w7Iz+166fyvvPel1f+8pW8/ubrc3+2ymmpXjs9SfKB7Y90tURaZz32\nWnKa/6SLd+BA8sU
v9r0KKIPzBea3iudLVSW5PnntteSMM/pezSgtO7ytJ7l45vXFaa6+vc3aTFyf\nTCYr+a8WZ247M+8+590n9dlrrvhefvyy0xe8In6YYzwK27cnO3b0vQoog/MF5reK50tVpf7lbyfv\nWLG/9wJMp9NMp9Pjvm/Z0ya3Jvlqkn+Q5C+S7E8ztGT2nrdpkmuWvC4AAICheDzJpO9FJMl1aQLc\nC0lu6XktAAAAAAAAAAAAABRrb5IDSZ5P8sme1wJ9eTHJnyd5Ms29tUlybpJHkjyX5OEks3dT35Lm\nnDmQ5NqZ/Vcmebr92R2drhiW57eTHErz3/Zhizw/3pHkd9v9f5Lkxxa7fFiqzc6XtTQD955sv66b\n+ZnzBZjbaWnuJdydZFuSp5Jc1ueCoCdfS1OMzro9ya+1259M8ul2e0+ac2VbmnPnhRwZ5rQ/yVXt\n9oNp/nEESvczSa7I24vRRZ4fNyX5TLv94SS/s9DVw3Jtdr7USX5lk/c6X4AT8neSPDTzel/7Bavm\na0l+5Kh9B5LsbLfPb18nzb+Szl6lfijJTyfZlbdPwv1Ikv+88JVCP3bn7cXoIs+Ph5Jc3W5vTfKt\nRS0aerI7PxzefnWT9zlf6NyWvhfAQl2Y5Bszrw+2+2DVbCR5NMmfJvmX7b6daVpf0n4/XKhekLc/\nb/LweXP0/vU4nxivRZ4fs/8veiPJd/PDV8KhdL+c5M+S/FaOtBk7X+ic8DYuG30vAAbi/WnaXK5L\n8vE0bS+zNuJ8gWNxfsD/32eTvCfJTyb5ZpL/2O9yWCXC27isJ7l45vXFefu/9MCq+Gb7/VtJfj/N\nfQaH0rSDJU0Ly8vt9tHnzUVpzpv1dnt2/3pH64W+LeL8ODjzmXe321uTnJPk24tfMvTm5Rz5R47P\n5ci9bM4XOie8jcufJrk0TW/26WlufP2DPhcEPTgzydnt9llppn09neZc+MV2/y8m+UK7/Qdp7j84\nPc2/pF6a5sbyl5J8L829CFWSfzbzGRibRZwfX9zkz/onSR7reO2wbLtmtv9xjtwP53wBTth1Sb6a\nZsLRLT2vBfrwnjTTvp5K8uUcOQ/OTXMf3Gaj0D+V5pw5kOTnZ/YfHu38QpI7O101LM99Sf4iyV+n\nudfmX2Sx58c7ktyfI6PPd3fwd4BlOfp8+ViSe9I8jubP0vxDx86Z9ztfAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAGb9P+ohqNZZpTi4AAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "aln.sort(key = lambda x:x[3])\n", "num = len(aln)\n", "#size_chunk = num/10\n", "#for i in range(10):\n", "# aln[i*size_chunk:min((i+1)*size_chunk, num)] = sorted(aln[i*size_chunk:min((i+1)*size_chunk, num)],key = lambda x: x[4]-x[3] ,reverse=True)\n", "\n", "plt.figure(figsize = (15,10))\n", "plt.axes()\n", "l = aln[0][5]\n", "plt.xlim(-2000,l+2000)\n", "plt.ylim(-5,num*10)\n", "points = [[0,0], [l,0], [l+40,2.5], [l,5], [0,5]]\n", "#rectangle = plt.Rectangle((0, 0), l, 5, fc='r',ec = 'none')\n", "polygon = plt.Polygon(points,fc = 'r', ec = 'none')\n", "plt.gca().add_patch(polygon)\n", "\n", "dotted_line = 
plt.Line2D((0, 0), (0, num*10 ),ls='-.') \n", "plt.gca().add_line(dotted_line)\n", "\n", "dotted_line2 = plt.Line2D((l, l), (0, num*10 ),ls='-.') \n", "plt.gca().add_line(dotted_line2)\n", "\n", "for i,item in enumerate(aln):\n", " abpos = item[3]\n", " aepos = item[4]\n", " bbpos = item[6]\n", " bepos = item[7]\n", " blen = item[8]\n", " strand = item[0]\n", " points_begin = []\n", " points_end = []\n", " tip = l/200\n", " ed = l/50\n", " \n", " if strand == 'n':\n", " points = [[abpos, (i+1)*10], [aepos, (i+1)*10], [aepos + tip, (i+1)*10 + 2.5], [aepos, (i+1)*10+5], [abpos, (i+1)*10+5]]\n", " if (bepos < blen):\n", " points_end = [[aepos, (i+1)*10], [aepos + tip, (i+1)*10 + 2.5], [aepos, (i+1)*10+5], [aepos+ed, (i+1)*10+5], [aepos + ed+ tip, (i+1)*10 + 2.5], [aepos+ed, (i+1)*10]]\n", " if (bbpos > 0):\n", " points_start = [[abpos, (i+1)*10], [abpos, (i+1)*10+5], [abpos-ed, (i+1)*10+5], [abpos-ed, (i+1)*10]]\n", " else:\n", " points = [[abpos, (i+1)*10], [aepos, (i+1)*10], [aepos, (i+1)*10+5], [abpos, (i+1)*10+5], [abpos - tip, (i+1)*10 + 2.5]]\n", " if (bepos < blen):\n", " points_end = [[aepos, (i+1)*10], [aepos, (i+1)*10+5], [aepos+ed, (i+1)*10+5], [aepos+ed, (i+1)*10]]\n", " if (bbpos > 0):\n", " points_start = [[abpos, (i+1)*10],[abpos-tip, (i+1)*10+2.5], [abpos, (i+1)*10+5], [abpos-ed, (i+1)*10+5],[abpos-ed-tip, (i+1)*10+2.5], [abpos-ed, (i+1)*10]]\n", " \n", " polygon = plt.Polygon(points,fc = 'b', ec = 'none')\n", " plt.gca().add_patch(polygon)\n", " \n", " if points_end != []:\n", " polygon2 = plt.Polygon(points_end,fc = 'g', ec = 'none')\n", " plt.gca().add_patch(polygon2)\n", " \n", "\n", " if points_start != []:\n", " polygon2 = plt.Polygon(points_start,fc = 'g', ec = 'none')\n", " plt.gca().add_patch(polygon2)\n", " \n", " plt.savefig('test.pdf')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "233" ] }, "execution_count": 12, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "len(aln)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 } HINGE-0.5.0/scripts/pipeline_consensus.py000077500000000000000000000064461314415550300203320ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess if len(sys.argv) >= 2: bact_id = sys.argv[1] ini_path = 'nominal.ini' if len(sys.argv) >= 3: ini_path = sys.argv[2] run_identifier = 'A' if len(sys.argv) >= 4: run_identifier = sys.argv[3] graphml_file = bact_id+run_identifier+'.G2.graphml' # This is used to start the pipeline in the middle st_point = 0 if len(sys.argv) >= 5: st_point = int(sys.argv[4]) # This is used to stop the pipeline in the middle end_point = 20 if len(sys.argv) >= 6: end_point = int(sys.argv[5]) base_path = './' if st_point <= 1 and end_point >= 1: draft_path_cmd = 'get_draft_path.py '+base_path+' '+ bact_id+' '+graphml_file print '1: '+draft_path_cmd subprocess.check_output(draft_path_cmd,cwd=base_path, shell=True) if st_point <= 2 and end_point >= 2: draft_assembly_cmd = 'draft_assembly --db '+bact_id+' --las '+bact_id+'.las --prefix '+bact_id+' --config '+ini_path+' --out '+bact_id+'.draft' print '2: '+draft_assembly_cmd subprocess.check_output(draft_assembly_cmd,cwd=base_path, shell=True) if st_point <= 3 and end_point >= 3: corr_head_cmd = 'correct_head.py '+bact_id+'.draft.fasta '+bact_id+'.draft.pb.fasta draft_map.txt' print '3: '+corr_head_cmd subprocess.check_output(corr_head_cmd,cwd=base_path, shell=True) if st_point <= 4 and end_point >= 4: subprocess.call("rm -f 
draft.db",shell=True,cwd=base_path) fasta2DB_cmd = "fasta2DB draft "+base_path+bact_id+'.draft.pb.fasta' print '4: '+fasta2DB_cmd subprocess.check_output(fasta2DB_cmd.split(),cwd=base_path) if st_point <= 5 and end_point >= 5: subprocess.call("rm -f draft.*.las",shell=True,cwd=base_path) mapper_cmd = "HPCmapper draft "+bact_id print '5: '+mapper_cmd subprocess.call(mapper_cmd.split(),stdout=open(base_path+'draft_consensus.sh','w') , cwd=base_path) if st_point <= 6 and end_point >= 6: # modify_cmd = """awk '{gsub("daligner -A -k20 -h50 -e.85","daligner -A",$0); print $0}' draft_consensus.sh""" modify_cmd = ['awk','{gsub("daligner -A -k20 -h50 -e.85","daligner -A",$0); print $0}','draft_consensus.sh'] print '6: '+"""awk '{gsub("daligner -A -k20 -h50 -e.85","daligner -A",$0); print $0}' draft_consensus.sh""" subprocess.call(modify_cmd,stdout=open(base_path+'draft_consensus2.sh','w') , cwd=base_path) if st_point <= 7 and end_point >= 7: mapper_shell_cmd = "csh -v draft_consensus.sh" print '7: '+mapper_shell_cmd subprocess.check_output(mapper_shell_cmd.split(), cwd=base_path) if st_point <= 8 and end_point >= 8: # remove_cmd = 'rm -f nonrevcompdraft.'+bact_id+'.*.las' # subprocess.call(remove_cmd,shell=True,cwd=base_path) LAmerge_cmd = "LAmerge draft."+bact_id+".las "+'draft.'+bact_id+'.[0-9].las' print '8: '+LAmerge_cmd subprocess.check_output(LAmerge_cmd,cwd=base_path,shell=True) if st_point <= 9 and end_point >= 9: consensus_cmd = 'consensus draft '+bact_id+' draft.'+bact_id+'.las '+bact_id+'.consensus.fasta '+ini_path print '9: '+consensus_cmd subprocess.check_output(consensus_cmd,cwd=base_path,shell=True) if st_point <= 10 and end_point >= 10: gfa_cmd = 'get_consensus_gfa.py '+base_path+ ' '+ bact_id+ ' '+bact_id+'.consensus.fasta' print '10: '+gfa_cmd subprocess.check_output(gfa_cmd,cwd=base_path,shell=True) HINGE-0.5.0/scripts/pipeline_consensus_norevcomp.py000077500000000000000000000061411314415550300224120ustar00rootroot00000000000000#!/usr/bin/env python 
import sys import os import subprocess if len(sys.argv) >= 2: bact_id = sys.argv[1] ini_path = 'nominal.ini' if len(sys.argv) >= 3: ini_path = sys.argv[2] run_identifier = 'A' if len(sys.argv) >= 4: run_identifier = sys.argv[3] graphml_file = bact_id+run_identifier+'.G2.graphml' # This is used to start the pipeline in the middle st_point = 0 if len(sys.argv) >= 5: st_point = int(sys.argv[4]) # This is used to stop the pipeline in the middle end_point = 20 if len(sys.argv) >= 6: end_point = int(sys.argv[5]) base_path = './' if st_point <= 1 and end_point >= 1: draft_path_cmd = 'get_draft_path_norevcomp.py '+base_path+' '+ bact_id+' '+graphml_file print '1: '+draft_path_cmd subprocess.check_output(draft_path_cmd,cwd=base_path, shell=True) if st_point <= 2 and end_point >= 2: draft_assembly_cmd = 'draft_assembly --db '+bact_id+' --las '+bact_id+'.las --prefix '+bact_id+' --config '+ini_path+' --out '+bact_id+'.draft' print '2: '+draft_assembly_cmd subprocess.check_output(draft_assembly_cmd,cwd=base_path, shell=True) if st_point <= 3 and end_point >= 3: corr_head_cmd = 'correct_head.py '+bact_id+'.draft.fasta '+bact_id+'.draft.pb.fasta draft_map.txt' print '3: '+corr_head_cmd subprocess.check_output(corr_head_cmd,cwd=base_path, shell=True) if st_point <= 4 and end_point >= 4: subprocess.call("rm -f draft.db",shell=True,cwd=base_path) fasta2DB_cmd = "fasta2DB draft "+base_path+bact_id+'.draft.pb.fasta' print '4: '+fasta2DB_cmd subprocess.check_output(fasta2DB_cmd.split(),cwd=base_path) if st_point <= 5 and end_point >= 5: subprocess.call("rm -f draft.*.las",shell=True,cwd=base_path) mapper_cmd = "HPCmapper draft "+bact_id print '5: '+mapper_cmd subprocess.call(mapper_cmd.split(),stdout=open(base_path+'draft_consensus.sh','w') , cwd=base_path) if st_point <= 6 and end_point >= 6: # modify_cmd = """awk '{gsub("daligner -A -k20 -h50 -e.85","daligner -A",$0); print $0}' draft_consensus.sh""" modify_cmd = ['awk','{gsub("daligner -A -k20 -h50 -e.85","daligner -A",$0); print 
$0}','draft_consensus.sh'] print '6: '+"""awk '{gsub("daligner -A -k20 -h50 -e.85","daligner -A",$0); print $0}' draft_consensus.sh""" subprocess.call(modify_cmd,stdout=open(base_path+'draft_consensus2.sh','w') , cwd=base_path) if st_point <= 7 and end_point >= 7: mapper_shell_cmd = "csh -v draft_consensus2.sh" print '7: '+mapper_shell_cmd subprocess.check_output(mapper_shell_cmd.split(), cwd=base_path) if st_point <= 8 and end_point >= 8: # remove_cmd = 'rm -f nonrevcompdraft.'+bact_id+'.*.las' # subprocess.call(remove_cmd,shell=True,cwd=base_path) LAmerge_cmd = "LAmerge draft."+bact_id+".las "+'draft.'+bact_id+'.[0-9].las' print '8: '+LAmerge_cmd subprocess.check_output(LAmerge_cmd,cwd=base_path,shell=True) if st_point <= 9 and end_point >= 9: consensus_cmd = 'consensus draft '+bact_id+' draft.'+bact_id+'.las '+bact_id+'.norevcomp_consensus.fasta '+ini_path print '9: '+consensus_cmd subprocess.check_output(consensus_cmd,cwd=base_path,shell=True) HINGE-0.5.0/scripts/pipeline_nctc.py000077500000000000000000000047701314415550300172370ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess bact_name = "ecoli" if len(sys.argv) >= 2: bact_id = sys.argv[1] st_point = 0 if len(sys.argv) >= 3: st_point = int(sys.argv[2]) base_path='/data/pacbio_assembly/pb_data/NCTC/'+bact_id+"/" if len(sys.argv) >= 4: base_path = sys.argv[3] fasta_names = [x for x in os.listdir(base_path) if x.endswith('.fasta')] assert len(fasta_names)==1 fasta_name = fasta_names[0] bact_name = fasta_name.split('.fasta')[0] print bact_name if st_point <= 1: subprocess.call("rm -f *.db",shell=True,cwd=base_path) fasta2DB_cmd = "fasta2DB "+bact_name+' '+base_path+fasta_name print fasta2DB_cmd subprocess.check_output(fasta2DB_cmd.split(),cwd=base_path) if st_point <= 2: DBsplit_cmd = "DBsplit -x500 -s100 "+bact_name print DBsplit_cmd subprocess.check_output(DBsplit_cmd.split(),cwd=base_path) if st_point <= 3: subprocess.call("rm -f *.las",shell=True,cwd=base_path) 
daligner_cmd = "HPCdaligner -t5 "+bact_name daligner_shell_cmd = "csh -v daligner_cmd.sh" print daligner_cmd p = subprocess.call(daligner_cmd.split(),stdout=open(base_path+'daligner_cmd.sh','w') , cwd=base_path) p2 = subprocess.check_output(daligner_shell_cmd.split(), cwd=base_path) if st_point <= 4: remove_cmd = "rm "+base_path+bact_name+".*."+bact_name+".*" print remove_cmd os.system(remove_cmd) if st_point <= 5: LAmerge_cmd = "LAmerge "+bact_name+".las "+bact_name+".*.las" print LAmerge_cmd subprocess.check_output(LAmerge_cmd,cwd=base_path,shell=True) if st_point <= 6: remove_cmd2 = "rm "+base_path+bact_name+".*.las" os.system(remove_cmd2) if st_point <= 7: os.system("mkdir -p "+base_path+"log") if st_point <= 8: DASqv_cmd = "DASqv -c100 "+bact_name+" "+bact_name+".las" subprocess.check_output(DASqv_cmd.split(),cwd=base_path) if st_point <= 9: Reads_filter_cmd = "Reads_filter --db "+bact_name+" --las "+bact_name+".las -x "+bact_name+" --config ~/AwesomeAssembler/utils/nominal.ini" print Reads_filter_cmd subprocess.check_output(Reads_filter_cmd,cwd=base_path, shell=True) if st_point <= 10: hinging_cmd = "hinging --db "+bact_name+" --las "+bact_name+".las -x "+bact_name+" --config ~/AwesomeAssembler/utils/nominal.ini -o "+bact_name print hinging_cmd subprocess.check_output(hinging_cmd, cwd=base_path, shell=True) if st_point <= 11: pruning_cmd = "python ~/AwesomeAssembler/scripts/pruning_and_clipping.py "+bact_name+".edges.hinges "+bact_name+".hinge.list A" print pruning_cmd subprocess.check_output(pruning_cmd, cwd=base_path, shell=True) HINGE-0.5.0/scripts/pruning_and_clipping.py000077500000000000000000001351631314415550300206150ustar00rootroot00000000000000#!/usr/bin/env python # coding: utf-8 # In[115]: import networkx as nx import random import sys import numpy as np import ujson from colormap import rgb2hex import operator import matplotlib.colors import configparser # print G.number_of_edges(),G.number_of_nodes() # In[3]: def write_graph(G,flname): with 
open(flname,'w') as f: for edge in G.edges_iter(): f.write(str(edge[0])+'\t'+str(edge[1])+'\n') # In[4]: def write_graph2(G,Ginfo,flname): count_no = 0 count_yes = 0 with open(flname,'w') as f: for edge in G.edges_iter(): if (edge[0],edge[1]) not in Ginfo: count_no += 1 print "not found" continue else: count_yes += 1 # line = Ginfo[(edge[0],edge[1])] # line_sp = line.split(' ') # f.write(str(edge[0])+' '+str(edge[1])) # for j in range(2,len(line_sp)): # f.write(' '+line_sp[j]) f.write(Ginfo[(edge[0],edge[1])]+'\n') print count_no, count_yes # In[7]: def prune_graph(graph,in_hinges,out_hinges,reverse=False): H=nx.DiGraph() if reverse: G=nx.reverse(graph,copy=True) else: G=graph start_nodes = [x for x in G.nodes() if G.in_degree(x) ==0] in_hinges = list(in_hinges.intersection(set(G.nodes()))) out_hinges = list(out_hinges.intersection(set(G.nodes()))) if reverse: for node in in_hinges: for successor in G.successors(node): # H.add_edge(node,successor) H.add_node(successor) for node in out_hinges: H.add_node(node) else: for node in out_hinges: for successor in G.successors(node): # H.add_edge(node,successor) H.add_node(successor) for node in in_hinges: H.add_node(node) map(H.add_node,start_nodes) all_vertices=set(G.nodes()) current_vertices=set(H.nodes()) undiscovered_vertices=all_vertices-current_vertices last_discovered_vertices=current_vertices while undiscovered_vertices: discovered_vertices_set=set([x for node in last_discovered_vertices for x in G.successors(node) if x not in current_vertices]) for vertex in discovered_vertices_set: for v_predecessor in G.predecessors(vertex): if v_predecessor in current_vertices: H.add_edge(v_predecessor,vertex) break current_vertices=current_vertices.union(discovered_vertices_set) # print len(undiscovered_vertices) if len(discovered_vertices_set)==0: print last_discovered_vertices print 'did not reach all nodes' print 'size of G: '+str(len(G.nodes())) print 'size of H: '+str(len(H.nodes())) # return H rand_node = 
list(undiscovered_vertices)[0] discovered_vertices_set.add(rand_node) last_discovered_vertices=discovered_vertices_set undiscovered_vertices=all_vertices-current_vertices # if reverse: # for vertex in out_hinges: # for v_predecessor in G.predecessors(vertex): # H.add_edge(v_predecessor,vertex) # else: # for vertex in in_hinges: # for v_predecessor in G.predecessors(vertex): # H.add_edge(v_predecessor,vertex) if reverse: for node in in_hinges: for successor in G.successors(node): H.add_edge(node,successor) for node in out_hinges: for predecessor in G.predecessors(node): H.add_edge(predecessor,node) else: for node in out_hinges: for successor in G.successors(node): H.add_edge(node,successor) for node in in_hinges: for predecessor in G.predecessors(node): H.add_edge(predecessor,node) if reverse: return nx.reverse(H) return H # In[8]: def dead_end_clipping(G,threshold): # H=nx.DiGraph() H = G.copy() start_nodes = set([x for x in H.nodes() if H.in_degree(x) ==0]) for st_node in start_nodes: cur_path = [st_node] if len(H.successors(st_node)) == 1: cur_node = H.successors(st_node)[0] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1 and len(cur_path) < threshold + 2: cur_path.append(cur_node) cur_node = H.successors(cur_node)[0] if len(cur_path) <= threshold: for vertex in cur_path: H.remove_node(vertex) end_nodes = set([x for x in H.nodes() if H.out_degree(x) ==0]) for end_node in end_nodes: cur_path = [end_node] if len(H.predecessors(end_node)) == 1: cur_node = H.predecessors(end_node)[0] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1 and len(cur_path) < threshold + 2: cur_path.append(cur_node) cur_node = H.predecessors(cur_node)[0] if len(cur_path) <= threshold: for vertex in cur_path: H.remove_node(vertex) return H def rev_node(node): node_id = node.split('_')[0] return node_id + '_' + str(1-int(node.split('_')[1])) def dead_end_clipping_sym(G,threshold,print_debug = False): # H=nx.DiGraph() H = G.copy() start_nodes = set([x for x in 
H.nodes() if H.in_degree(x) ==0]) for st_node in start_nodes: if st_node not in H.nodes(): continue cur_path = [st_node] cur_node = st_node if print_debug: print '----0' print st_node if len(H.successors(st_node)) == 1: cur_node = H.successors(st_node)[0] if print_debug: print '----1' while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1 and len(cur_path) < threshold + 2: cur_path.append(cur_node) if print_debug: print cur_node cur_node = H.successors(cur_node)[0] if len(cur_path) > threshold + 1: break if print_debug: print '----2' print cur_path if len(cur_path) <= threshold and (H.in_degree(cur_node) > 1 or H.out_degree(cur_node) == 0): for vertex in cur_path: # try: if print_debug: print 'about to delete ',vertex,rev_node(vertex) H.remove_node(vertex) H.remove_node(rev_node(vertex)) # except: # pass if print_debug: print 'deleted ',vertex,rev_node(vertex) return H # In[9]: # This function is no longer used. See z_clipping_sym def z_clipping(G,threshold,in_hinges,out_hinges,print_z = False): H = G.copy() start_nodes = set([x for x in H.nodes() if H.out_degree(x) > 1 and x not in out_hinges]) for st_node in start_nodes: for sec_node in H.successors(st_node): if H.out_degree(st_node) == 1: break cur_node = sec_node cur_path = [[st_node,cur_node]] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1: cur_path.append([cur_node,H.successors(cur_node)[0]]) cur_node = H.successors(cur_node)[0] if len(cur_path) > threshold + 1: break if len(cur_path) <= threshold and H.in_degree(cur_node) > 1 and H.out_degree(st_node) > 1 and cur_node not in in_hinges: if print_z: print cur_path for edge in cur_path: H.remove_edge(edge[0],edge[1]) for j in range(len(cur_path)-1): H.remove_node(cur_path[j][1]) end_nodes = set([x for x in H.nodes() if H.in_degree(x) > 1 and x not in in_hinges]) for end_node in end_nodes: for sec_node in H.predecessors(end_node): if H.in_degree(end_node) == 1: break cur_node = sec_node cur_path = [[cur_node,end_node]] while 
H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1: cur_path.append([H.predecessors(cur_node)[0],cur_node]) cur_node = H.predecessors(cur_node)[0] if len(cur_path) > threshold + 1: break if len(cur_path) <= threshold and H.out_degree(cur_node) > 1 and H.in_degree(end_node) > 1 and cur_node not in out_hinges: if print_z: print cur_path for edge in cur_path: H.remove_edge(edge[0],edge[1]) for j in range(len(cur_path)-1): H.remove_node(cur_path[j][0]) return H def z_clipping_sym(G,threshold,in_hinges,out_hinges,print_z = False): H = G.copy() G0 = G.copy() start_nodes = set([x for x in H.nodes() if H.out_degree(x) > 1 and x not in out_hinges]) for st_node in start_nodes: try: # need this because we are deleting nodes inside loop H.successors(st_node) except: continue for sec_node in H.successors(st_node): if H.out_degree(st_node) == 1: break cur_node = sec_node cur_path = [[st_node,cur_node]] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1: cur_path.append([cur_node,H.successors(cur_node)[0]]) cur_node = H.successors(cur_node)[0] if len(cur_path) > threshold + 1: break if len(cur_path) <= threshold and H.in_degree(cur_node) > 1 and H.out_degree(st_node) > 1 and cur_node not in in_hinges: if print_z: print cur_path for edge in cur_path: G0.edge[edge[0]][edge[1]]['z'] = 1 G0.edge[rev_node(edge[1])][rev_node(edge[0])]['z'] = 1 try: H.remove_edge(edge[0],edge[1]) H.remove_edge(rev_node(edge[1]),rev_node(edge[0])) except: pass for j in range(len(cur_path)-1): G0.node[cur_path[j][1]]['z'] = 1 G0.node[rev_node(cur_path[j][1])]['z'] = 1 try: H.remove_node(cur_path[j][1]) H.remove_node(rev_node(cur_path[j][1])) except: pass return H, G0 # In[48]: def merge_path(g,in_node,node,out_node): # g.add_edge(in_node,out_node,hinge_edge = -1,false_positive = 0) if g.edge[in_node][node]['intersection'] == 1 and g.edge[node][out_node]['intersection'] == 1: g.add_edge(in_node,out_node,hinge_edge = -1,intersection = 1,z=0) else: g.add_edge(in_node,out_node,hinge_edge 
def random_condensation(G,n_nodes,check_gt = False):
    """Return a copy of ``G`` sparsified by contracting random chain nodes.

    Up to 20000 times, while the copy has more than ``n_nodes`` nodes, a
    node is drawn at random.  It is merged away with ``merge_path`` when it
    has exactly one predecessor and one successor, that predecessor has
    out-degree 1, that successor has in-degree 1, and the three nodes are
    pairwise distinct.  With ``check_gt`` set, a node touching an edge whose
    ``false_positive`` attribute is 1 is left alone.  A message is printed
    when the iteration budget is used up.
    """
    g = G.copy()
    max_iter = 20000
    attempts = 0

    while len(g.nodes()) > n_nodes and attempts < max_iter:
        attempts += 1
        candidates = g.nodes()
        node = candidates[random.randrange(len(candidates))]

        if not (g.in_degree(node) == 1 and g.out_degree(node) == 1):
            continue

        in_node = g.in_edges(node)[0][0]
        out_node = g.out_edges(node)[0][1]

        if g.out_degree(in_node) != 1 or g.in_degree(out_node) != 1:
            continue
        if in_node == node or out_node == node or in_node == out_node:
            continue

        if check_gt:
            # Look at every incident edge (no short-circuit), as before.
            incident = list(g.in_edges(node)) + list(g.out_edges(node))
            flags = [g.edge[u][v]['false_positive'] == 1 for (u, v) in incident]
            if any(flags):
                continue

        merge_path(g, in_node, node, out_node)

    if attempts >= max_iter:
        print("couldn't finish sparsification"+str(len(g.nodes())))
    return g
def random_condensation2(g,n_nodes):
    """Return a sparsified copy of ``g`` that keeps hinge nodes and
    false-positive edges intact.

    Up to 20000 times, while the copy has more than ``n_nodes`` nodes, a
    random node is drawn and contracted with ``merge_path`` when all of the
    following hold:

    * it has exactly one predecessor and one successor;
    * the node, its predecessor and its successor all have ``hinge == 0``;
    * the predecessor has out-degree 1 and the successor in-degree 1;
    * the three nodes are pairwise distinct;
    * neither incident edge has ``false_positive == 1``.

    A message is printed when the iteration budget is used up.
    """
    # Bug fix: this used to read ``G.copy()``, i.e. it ignored the argument
    # and silently condensed the module-level graph ``G`` instead.
    g = g.copy()
    max_iter = 20000
    iter_cnt = 0

    while len(g.nodes()) > n_nodes and iter_cnt < max_iter:
        iter_cnt += 1
        nodes = g.nodes()
        node = nodes[random.randrange(len(nodes))]

        if g.in_degree(node) != 1 or g.out_degree(node) != 1:
            continue

        in_node = g.in_edges(node)[0][0]
        out_node = g.out_edges(node)[0][1]

        if not (g.node[node]['hinge'] == 0
                and g.node[in_node]['hinge'] == 0
                and g.node[out_node]['hinge'] == 0):
            continue
        if g.out_degree(in_node) != 1 or g.in_degree(out_node) != 1:
            continue
        if in_node == node or out_node == node or in_node == out_node:
            continue

        bad_node = False
        for (u, v) in list(g.in_edges(node)) + list(g.out_edges(node)):
            if g.edge[u][v]['false_positive'] == 1:
                bad_node = True
        if not bad_node:
            merge_path(g, in_node, node, out_node)

    if iter_cnt >= max_iter:
        print("couldn't finish sparsification: "+str(len(g.nodes())))
    return g
def resolve_rep(g,rep_path,in_node,out_node):
    """Give the traversal ``in_node -> rep_path -> out_node`` its own copy
    of the nodes in ``rep_path``, on both strands.

    The copies are named by prepending ``'B'`` to the original node names.

    * The edge entering the path (``in_node -> rep_path[0]``) and the edge
      leaving it (``rep_path[-1] -> out_node``) are re-pointed to the copies:
      the new edge is added with the attributes of the old one, then the old
      edge is removed.  The same is done for the two ``rev_node`` mirror
      edges.
    * Every edge between consecutive nodes of ``rep_path`` (and its mirror)
      is duplicated between the corresponding copies; the original edges
      stay in the graph.

    Only the attributes listed in ``copied_keys`` are carried over.  ``g`` is
    modified in place; nothing is returned.
    """
    prefix = 'B'
    copied_keys = ('length',
                   'read_a_match_start', 'read_a_match_end',
                   'read_b_match_start', 'read_b_match_end',
                   'read_a_match_start_raw', 'read_a_match_end_raw',
                   'read_b_match_start_raw', 'read_b_match_end_raw')

    def clone_edge(src_u, src_v, dst_u, dst_v, drop_source):
        # Read all attributes first so a missing key fails before the graph
        # is touched, exactly like the former inline keyword arguments.
        attrs = dict((key, g.edge[src_u][src_v][key]) for key in copied_keys)
        g.add_edge(dst_u, dst_v, **attrs)
        if drop_source:
            g.remove_edge(src_u, src_v)

    first = rep_path[0]
    last = rep_path[-1]

    # Re-point the entry and exit edges (forward strand, then mirror strand).
    clone_edge(in_node, first,
               in_node, prefix + first, True)
    clone_edge(last, out_node,
               prefix + last, out_node, True)
    clone_edge(rev_node(first), rev_node(in_node),
               rev_node(prefix + first), rev_node(in_node), True)
    clone_edge(rev_node(out_node), rev_node(last),
               rev_node(out_node), rev_node(prefix + last), True)

    # Duplicate the interior of the path between the 'B' copies.
    for i in range(0, len(rep_path) - 1):
        here = rep_path[i]
        there = rep_path[i+1]
        clone_edge(here, there,
                   prefix + here, prefix + there, False)
        clone_edge(rev_node(there), rev_node(here),
                   rev_node(prefix + there), rev_node(prefix + here), False)
g.out_degree(st_node) != 2: continue if print_debug: print '----' print first_node other_successor = [x for x in g.successors(st_node) if x != first_node][0] next_node = first_node if print_debug: print 'going on loop' loop_len = 0 prev_edge = g[st_node][next_node] node_cnt = 0 while g.in_degree(next_node) == 1 and g.out_degree(next_node) == 1 and node_cnt < max_nodes: node_cnt += 1 in_node = next_node next_node = g.successors(next_node)[0] loop_len += abs(g[in_node][next_node]['read_a_match_start'] - prev_edge['read_b_match_start']) prev_edge = g[in_node][next_node] if node_cnt >= max_nodes: continue if print_debug: print "length in loop " + str(loop_len) len_in_loop = loop_len first_node_of_repeat = next_node if g.in_degree(next_node) == 2: prev_node = [x for x in g.predecessors(next_node) if x != in_node][0] node_cnt = 0 while g.in_degree(prev_node) == 1 and g.out_degree(prev_node) == 1: node_cnt += 1 prev_node = g.predecessors(prev_node)[0] if node_cnt >= flank: break if node_cnt < flank: # and prev_node != st_node: continue next_node = other_successor node_cnt = 0 while g.in_degree(next_node) == 1 and g.out_degree(next_node) == 1: node_cnt += 1 next_node = g.successors(next_node)[0] if node_cnt >= flank: break if node_cnt < flank: # and next_node != first_node_of_repeat: continue rep = [first_node_of_repeat] next_node = first_node_of_repeat node_cnt = 0 if g.in_degree(next_node) == 2 and g.out_degree(next_node) == 1: next_double_node = g.successors(next_node)[0] rep.append(next_double_node) prev_edge = g[next_node][next_double_node] else: next_double_node = next_node try: assert not (g.in_degree(next_double_node) == 1 and g.out_degree(next_double_node) == 1) except: print str(g.in_degree(next_node)) print str(g.out_degree(next_node)) raise while g.in_degree(next_double_node) == 1 and g.out_degree(next_double_node) == 1 and node_cnt < max_nodes: node_cnt += 1 loop_len += abs(g[next_double_node][g.successors(next_double_node)[0]]['read_a_match_start'] - 
def y_pruning(G,flank):
    """Return a copy of ``G`` with flagged branches of "Y" junctions cut.

    A Y node has in-degree 1 and out-degree > 1.  For each such node the
    unbranched chain leading into it (nodes with in-degree 1 and
    out-degree 1) is walked backwards; the node is only processed when that
    chain is at least ``flank`` nodes long.  Then every outgoing edge whose
    target node has ``CFLAG == True`` is removed together with its
    ``rev_node`` mirror edge.
    """
    H = G.copy()
    y_nodes = set([x for x in H.nodes()
                   if H.out_degree(x) > 1 and H.in_degree(x) == 1])

    for st_node in y_nodes:
        if st_node not in H:
            continue

        # Bug fix: edges removed for an earlier Y node may have taken away
        # this node's only predecessor; indexing [0] then raised IndexError.
        preds = list(H.predecessors(st_node))
        if len(preds) == 0:
            continue

        prev_node = preds[0]
        node_cnt = 0
        while H.in_degree(prev_node) == 1 and H.out_degree(prev_node) == 1:
            node_cnt += 1
            prev_node = list(H.predecessors(prev_node))[0]
            if node_cnt >= flank:
                break
        if node_cnt < flank:
            continue

        # If we got here, we probably have a Y, and not a collapsed repeat.
        for vert in list(H.successors(st_node)):
            if H.node[vert]['CFLAG'] == True:
                try:
                    H.remove_edge(st_node, vert)
                    H.remove_edge(rev_node(vert), rev_node(st_node))
                except Exception:
                    # Best effort, as before: a missing mirror edge must not
                    # abort the pruning pass.
                    pass

    return H
g.node[node]['aln_end'] = max(mapping[node_base][0][1],mapping[node_base][0][0]) # max_chr = max(g.node[node]['chr'],max_chr) # mapped_nodes+=1 else: # pass g.node[node]['chr'] = 0 g.node[node]['aln_start'] = 1 g.node[node]['aln_end'] = 1 # g.node[node]['aln_strand'] = 0 if node in in_hinges or node in out_hinges: g.node[node]['hinge'] = 1 else: g.node[node]['hinge'] = 0 if g.node[node]['chr'] in chr_length_dict: chr_length_dict[g.node[node]['chr']] = max(g.node[node]['aln_end'], chr_length_dict[g.node[node]['chr']]) else: chr_length_dict[g.node[node]['chr']] = max(g.node[node]['aln_end'], 1) chr_list = sorted(chr_length_dict.items(), key=operator.itemgetter(1), reverse=True) max_chr_len1 = max([g.node[x]['aln_end'] for x in g.nodes()]) max_chr_multiplier = 10**len(str(max_chr_len1)) print [x for x in chr_list] chr_set =[x [0] for x in chr_list] print chr_set # red_bk = 102 # green_bk = 102 # blue_bk = 102 colour_list = ['red', 'lawngreen', 'deepskyblue', 'deeppink', 'darkorange', 'purple', 'gold', 'mediumblue', 'saddlebrown', 'darkgreen'] for colour in colour_list: print matplotlib.colors.colorConverter.to_rgb(colour) for index, chrom in enumerate(chr_set): node_set = set([x for x in g.nodes() if g.node[x]['chr'] == chrom]) print chrom max_chr_len = max([g.node[x]['aln_end'] for x in g.nodes() if g.node[x]['chr'] == chrom]) # max_chr_multiplier = 10**len(str(max_chr_len)) if index < 10: rgb_tuple = matplotlib.colors.colorConverter.to_rgb(colour_list[index]) red = int(255*rgb_tuple[0]) green = int(255*rgb_tuple[1]) blue = int(255*rgb_tuple[2]) else: red = random.randint(0,255) # green = random.randint(0,255) blue = random.randint(0,255) brightness = 200 green = max(0,min( 255,brightness - int((0.2126 *red + 0.0722 *blue)/0.7152 ))) red_bk = max(red-100,0) blue_bk = max(blue-100,0) green_bk = max(green-100,0) print red,blue,green for node in node_set: g.node[node]['normpos'] = g.node[node]['chr'] * max_chr_multiplier + 
def mark_skipped_edges(G,skipped_name):
    """Set ``skipped = 1`` on the edges of ``G`` listed in ``skipped_name``.

    Each line of the file is split on whitespace; lines with fewer than five
    fields are ignored.  Fields 0 and 1 are the two read ids, fields 3 and 4
    their orientations, so the edge is
    ``<f0>_<f3> -> <f1>_<f4>``.  When that edge is present in ``G`` it is
    marked, and so is the opposite-strand edge
    ``<f1>_<1-f4> -> <f0>_<1-f3>`` (a ``KeyError`` is raised if that mirror
    edge does not exist).
    """
    with open(skipped_name) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 5:
                continue

            tail = fields[0] + "_" + fields[3]
            head = fields[1] + "_" + fields[4]
            # has_edge is a dictionary lookup; the former
            # ``(tail, head) in G.edges()`` rebuilt the whole edge list for
            # every line of the file.
            if not G.has_edge(tail, head):
                continue

            G[tail][head]['skipped'] = 1
            rev_tail = fields[1] + "_" + str(1-int(fields[4]))
            rev_head = fields[0] + "_" + str(1-int(fields[3]))
            G[rev_tail][rev_head]['skipped'] = 1
g.node[node]['hinge'] = 0 return g def add_chimera_flags(g,prefix): cov_flags = prefix + '.cov.flag' slf_flags = None for node in g.nodes(): g.node[node]['CFLAG'] = False if slf_flags != None: g.node[node]['SFLAG'] = False node_set = set(g.nodes()) num_bad_cov_reads = 0 if cov_flags != None: with open(cov_flags,'r') as f: for line in f: node_name = line.strip() try: assert not ((node_name+'_0' in node_set and node_name+'_1' not in node_set) or (node_name+'_0' not in node_set and node_name+'_1' in node_set)) except: print node_name + ' is not symmetrically present in the graph input.' raise if node_name+'_0' in node_set: g.node[node_name+'_0']['CFLAG'] = True g.node[node_name+'_1']['CFLAG'] = True num_bad_cov_reads += 1 print str(num_bad_cov_reads) + ' bad coverage reads.' num_bad_slf_reads = 0 if slf_flags != None: with open(slf_flags,'r') as f: for line in f: node_name = line.strip() try: assert not ((node_name+'_0' in node_set and node_name+'_1' not in node_set) or (node_name+'_0' not in node_set and node_name+'_1' in node_set)) except: print node_name + ' is not symmetrically present in the graph input.' raise if node_name+'_0' in node_set: g.node[node_name+'_0']['SFLAG'] = True g.node[node_name+'_1']['SFLAG'] = True num_bad_slf_reads += 1 print str(num_bad_slf_reads) + ' bad self aligned reads.' 
def connect_strands(g):
    """Link every node of ``g`` with its ``rev_node`` counterpart by an edge
    in each direction.  ``g`` is modified in place and also returned."""
    for strand in g.nodes():
        twin = rev_node(strand)
        for tail, head in ((strand, twin), (twin, strand)):
            g.add_edge(tail, head)
    return g
def write_graphml(g,prefix,suffix,suffix1):
    """Write a copy of ``g`` with its strands connected (see
    ``connect_strands``) to ``<prefix><suffix>.<suffix1>.graphml``."""
    h = g.copy()
    connect_strands(h)
    # Bug fix: the file name used the literal string 'suffix1' instead of
    # the value of the ``suffix1`` argument.
    nx.write_graphml(h, prefix+suffix+'.'+suffix1+'.graphml')


# ---------------------------------------------------------------------------
# Driver script.
#
#   argv[1]  edges file; its name up to the first '.' is the output prefix
#   argv[2]  hinge list file
#   argv[3]  suffix appended to the prefix in every output file name
#   argv[4]  (optional) ini file; section [layout] may set
#            max_plasmid_length, del_telomere and aggressive_pruning
#   argv[5]  (optional) json mapping handed to add_groundtruth
# ---------------------------------------------------------------------------

flname = sys.argv[1]
prefix = flname.split('.')[0]
hingesname = sys.argv[2]
suffix = sys.argv[3]

MAX_PLASMID_LENGTH = 500000
DEL_TELOMERE = False
AGGRESSIVE_PRUNING = False

if len(sys.argv) >= 5:
    ini_file_path = sys.argv[4]
    config = configparser.ConfigParser()
    config.read(ini_file_path)

    try:
        MAX_PLASMID_LENGTH = config.getint('layout', 'max_plasmid_length')
    except (configparser.Error, ValueError):
        MAX_PLASMID_LENGTH = 500000

    # Bug fix: the parser method is getboolean(); the former getbool() call
    # raised AttributeError, which the bare except turned into "always
    # False", so neither option could ever be switched on.
    try:
        DEL_TELOMERE = config.getboolean('layout', 'del_telomere')
    except (configparser.Error, ValueError):
        DEL_TELOMERE = False

    try:
        AGGRESSIVE_PRUNING = config.getboolean('layout', 'aggressive_pruning')
    except (configparser.Error, ValueError):
        AGGRESSIVE_PRUNING = False

if len(sys.argv) >= 6:
    json_file = open(sys.argv[5])
else:
    json_file = None


def graphml_name(stage):
    """File name used for the graph written at pipeline stage ``stage``."""
    return prefix+suffix+'.'+stage+'.graphml'


# --- Build the read graph ---------------------------------------------------
# Every input line yields one edge plus its opposite-strand mirror (the two
# reads swapped, both orientations flipped, read a/b match data exchanged).

G = nx.DiGraph()
Ginfo = {}

with open(flname) as f:
    for lines in f:
        lines1 = lines.split()
        if len(lines1) < 5:
            continue

        fwd_tail = lines1[0] + "_" + lines1[3]
        fwd_head = lines1[1] + "_" + lines1[4]
        e1 = (fwd_tail, fwd_head)

        e1_match_len = int(lines1[2])
        ra_match_start = int(lines1[6].lstrip('['))
        ra_match_end = int(lines1[7].rstrip(']'))
        rb_match_start = int(lines1[8].lstrip('['))
        rb_match_end = int(lines1[9].rstrip(']'))

        ra_match_start_raw = int(lines1[-4].lstrip('['))
        ra_match_end_raw = int(lines1[-3].rstrip(']'))
        rb_match_start_raw = int(lines1[-2].lstrip('['))
        rb_match_end_raw = int(lines1[-1].rstrip(']'))

        # An edge that was already present when this line is read gets
        # intersection=1.  has_edge is a dictionary lookup; the former
        # ``e1 in G.edges()`` rebuilt the full edge list for every line.
        if G.has_edge(fwd_tail, fwd_head):
            intersection = 1
        else:
            intersection = 0

        rev_tail = lines1[1] + "_" + str(1-int(lines1[4]))
        rev_head = lines1[0] + "_" + str(1-int(lines1[3]))
        hinge_edge = int(lines1[5])

        G.add_edge(fwd_tail, fwd_head,
                   hinge_edge=hinge_edge,intersection=intersection,length=e1_match_len,z=0,
                   read_a_match_start=ra_match_start,read_a_match_end=ra_match_end,
                   read_b_match_start=rb_match_start,read_b_match_end=rb_match_end,
                   read_a_match_start_raw=ra_match_start_raw,read_a_match_end_raw=ra_match_end_raw,
                   read_b_match_start_raw=rb_match_start_raw,read_b_match_end_raw=rb_match_end_raw)
        G.add_edge(rev_tail, rev_head,
                   hinge_edge=hinge_edge,intersection=intersection,length=e1_match_len,z=0,
                   read_a_match_start=rb_match_start,read_a_match_end=rb_match_end,
                   read_b_match_start=ra_match_start,read_b_match_end=ra_match_end,
                   read_a_match_start_raw=rb_match_start_raw,read_a_match_end_raw=rb_match_end_raw,
                   read_b_match_start_raw=ra_match_start_raw,read_b_match_end_raw=ra_match_end_raw)

        # Text record kept per edge: "<tail> <head> <field 2> <span> <span>",
        # the two spans being field11-field10 and field13-field12 (brackets
        # stripped), exchanged for the mirror edge.
        span_10_11 = str(int(lines1[11][:-1])-int(lines1[10][1:]))
        span_12_13 = str(int(lines1[13][:-1])-int(lines1[12][1:]))

        towrite = fwd_tail +' '+ fwd_head +' '+ lines1[2]+' '+span_10_11+' '+span_12_13
        Ginfo[(fwd_tail, fwd_head)] = towrite

        towrite = rev_tail +' '+ rev_head +' '+ lines1[2]+' '+span_12_13+' '+span_10_11
        Ginfo[(rev_tail, rev_head)] = towrite

nx.write_graphml(G, graphml_name('G00'))


# --- Hinges and node annotations --------------------------------------------

vertices=set()
in_hinges = set()
out_hinges = set()

with open(hingesname) as f:
    for lines in f:
        lines1 = lines.split()
        if lines1[2] == '1':
            in_hinges.add(lines1[0]+'_0')
            out_hinges.add(lines1[0]+'_1')
        elif lines1[2] == '-1':
            in_hinges.add(lines1[0]+'_1')
            out_hinges.add(lines1[0]+'_0')

add_annotation(G,in_hinges,out_hinges)
add_chimera_flags(G,prefix)
mark_skipped_edges(G,flname.split('.')[0] + '.edges.skipped')

if json_file != None:
    add_groundtruth(G,json_file,in_hinges,out_hinges)
    json_file.close()  # the mapping is not read again


# --- Actual pruning, clipping and z deletion --------------------------------

G0 = G.copy()
G0 = dead_end_clipping_sym(G0,10)
G1,G0 = z_clipping_sym(G0,6,set(),set())

if DEL_TELOMERE:
    G1 = bubble_bursting_sym(G1,20)
    G1 = dead_end_clipping_sym(G1,20)
else:
    G1 = bubble_bursting_sym(G1,10)
    G1 = dead_end_clipping_sym(G1,5)

nx.write_graphml(G0, graphml_name('G0'))
nx.write_graphml(G1, graphml_name('G1'))

G2 = G1.copy()
Gs = random_condensation_sym(G1,1000)
loop_resolution(G2,500,50)   # modifies G2 in place
G2s = random_condensation_sym(G2,1000)

nx.write_graphml(G2, graphml_name('G2'))
# Gs / G2s are written before connect_strands adds the strand links to them.
nx.write_graphml(Gs, graphml_name('Gs'))
nx.write_graphml(G2s, graphml_name('G2s'))

Gc = connect_strands(Gs)
nx.write_graphml(Gc, graphml_name('Gc'))

G2c = connect_strands(G2s)
nx.write_graphml(G2c, graphml_name('G2c'))

if AGGRESSIVE_PRUNING:
    G3 = y_pruning(G2,10)
    G3 = dead_end_clipping_sym(G3,10)
    G3s = random_condensation_sym(G3,1000)
    G3c = connect_strands(G3s)

    # NOTE(review): G3 is written under the 'G2' name and therefore replaces
    # the G2 file written above - confirm that this is intended.
    nx.write_graphml(G3, graphml_name('G2'))
    nx.write_graphml(G3s, graphml_name('G3s'))
    nx.write_graphml(G3c, graphml_name('G3c'))
def write_graph2(G,Ginfo,flname):
    """Write the ``Ginfo`` record of every edge of ``G`` to ``flname``.

    ``Ginfo`` maps ``(tail, head)`` tuples to text lines.  One line is
    written per edge that has a record; for every edge without one,
    "not found" is printed.  Finally the number of edges without and with a
    record is printed.
    """
    missing = 0
    written = 0
    with open(flname,'w') as out:
        for edge in G.edges_iter():
            key = (edge[0], edge[1])
            if key in Ginfo:
                written += 1
                out.write(Ginfo[key]+'\n')
            else:
                missing += 1
                print("not found")
    print('%s %s' % (missing, written))
def rev_node(node):
    """Return the name of the same read on the opposite strand.

    Node names have the form ``<id>_<orientation>`` with orientation 0 or 1;
    the result is ``<id>_<1 - orientation>``.  The name is split at its
    last underscore so that ids which themselves contain underscores are
    handled (splitting at the first one used to truncate such ids).
    """
    node_id, orientation = node.rsplit('_', 1)
    return node_id + '_' + str(1-int(orientation))
set([x for x in H.nodes() if H.in_degree(x) ==0]) for st_node in start_nodes: if st_node not in H.nodes(): continue cur_path = [st_node] cur_node = st_node if print_debug: print '----0' print st_node if len(H.successors(st_node)) == 1: cur_node = H.successors(st_node)[0] if print_debug: print '----1' while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1 and len(cur_path) < threshold + 2: cur_path.append(cur_node) if print_debug: print cur_node cur_node = H.successors(cur_node)[0] if len(cur_path) > threshold + 1: break if print_debug: print '----2' print cur_path if len(cur_path) <= threshold and (H.in_degree(cur_node) > 1 or H.out_degree(cur_node) == 0): for vertex in cur_path: # try: if print_debug: print 'about to delete ',vertex,rev_node(vertex) H.remove_node(vertex) H.remove_node(rev_node(vertex)) # except: # pass if print_debug: print 'deleted ',vertex,rev_node(vertex) return H # In[9]: # This function is no longer used. See z_clipping_sym def z_clipping(G,threshold,in_hinges,out_hinges,print_z = False): H = G.copy() start_nodes = set([x for x in H.nodes() if H.out_degree(x) > 1 and x not in out_hinges]) for st_node in start_nodes: for sec_node in H.successors(st_node): if H.out_degree(st_node) == 1: break cur_node = sec_node cur_path = [[st_node,cur_node]] while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1: cur_path.append([cur_node,H.successors(cur_node)[0]]) cur_node = H.successors(cur_node)[0] if len(cur_path) > threshold + 1: break if len(cur_path) <= threshold and H.in_degree(cur_node) > 1 and H.out_degree(st_node) > 1 and cur_node not in in_hinges: if print_z: print cur_path for edge in cur_path: H.remove_edge(edge[0],edge[1]) for j in range(len(cur_path)-1): H.remove_node(cur_path[j][1]) end_nodes = set([x for x in H.nodes() if H.in_degree(x) > 1 and x not in in_hinges]) for end_node in end_nodes: for sec_node in H.predecessors(end_node): if H.in_degree(end_node) == 1: break cur_node = sec_node cur_path = [[cur_node,end_node]] 
def z_clipping_sym(G,threshold,in_hinges,out_hinges,print_z = False):
    """Remove short "Z" cross-links between branches, on both strands.

    For every node with out-degree > 1 that is not in `out_hinges`, each
    outgoing branch is followed through nodes that have in-degree 1 and
    out-degree 1.  A branch of at most `threshold` edges that ends at a node
    with in-degree > 1 (and not in `in_hinges`) is clipped: its edges and
    interior nodes, together with their reverse-complement twins, are
    removed.

    Returns a pair (H, G0):
      H  -- copy of G with the clipped edges/nodes removed.
      G0 -- copy of G with nothing removed, where the clipped edges and
            nodes carry the attribute z = 1.
    """
    H = G.copy()
    G0 = G.copy()

    start_nodes = set([x for x in H.nodes() if H.out_degree(x) > 1 and x not in out_hinges])

    for st_node in start_nodes:
        # Nodes are deleted inside this loop, so a start node collected
        # above may already be gone.
        if st_node not in H:
            continue

        for sec_node in H.successors(st_node):
            if H.out_degree(st_node) == 1:
                break

            cur_node = sec_node
            cur_path = [[st_node,cur_node]]

            # Walk the unbranched run; give up once it is clearly too long.
            while H.in_degree(cur_node) == 1 and H.out_degree(cur_node) == 1:
                nxt = H.successors(cur_node)[0]
                cur_path.append([cur_node,nxt])
                cur_node = nxt
                if len(cur_path) > threshold + 1:
                    break

            if not (len(cur_path) <= threshold and H.in_degree(cur_node) > 1
                    and H.out_degree(st_node) > 1 and cur_node not in in_hinges):
                continue

            if print_z:
                print(cur_path)

            for edge in cur_path:
                rev_edge = (rev_node(edge[1]), rev_node(edge[0]))
                G0.edge[edge[0]][edge[1]]['z'] = 1
                G0.edge[rev_edge[0]][rev_edge[1]]['z'] = 1
                # An edge may already be gone (e.g. removed as the twin of
                # an earlier edge); check explicitly instead of swallowing
                # every exception.
                for u, v in ((edge[0], edge[1]), rev_edge):
                    if H.has_edge(u, v):
                        H.remove_edge(u, v)

            for j in range(len(cur_path)-1):
                inner = cur_path[j][1]
                inner_rev = rev_node(inner)
                G0.node[inner]['z'] = 1
                G0.node[inner_rev]['z'] = 1
                for vertex in (inner, inner_rev):
                    if vertex in H:
                        H.remove_node(vertex)

    return H, G0
def random_condensation(G,n_nodes,check_gt = False):
    """Randomly contract simple path nodes of a copy of G.

    Repeatedly picks a random node and, when it sits in the middle of an
    unbranched in_node -> node -> out_node stretch (three distinct nodes),
    merges it away with merge_path.  Stops when at most `n_nodes` nodes
    remain or after 20000 picks.

    check_gt -- when true, a node is left alone if any of its incident
                edges has false_positive == 1.

    Returns the condensed copy; G itself is not modified.
    """
    g = G.copy()
    max_iter = 20000
    iter_cnt = 0

    while iter_cnt < max_iter and len(g.nodes()) > n_nodes:
        iter_cnt += 1
        candidates = g.nodes()
        node = candidates[random.randrange(len(candidates))]

        if g.in_degree(node) != 1 or g.out_degree(node) != 1:
            continue

        in_node = g.in_edges(node)[0][0]
        out_node = g.out_edges(node)[0][1]

        if g.out_degree(in_node) != 1 or g.in_degree(out_node) != 1:
            continue
        if node in (in_node, out_node) or in_node == out_node:
            continue

        if check_gt:
            flanking = g.in_edges(node) + g.out_edges(node)
            verdicts = [g.edge[u][v]['false_positive'] == 1 for u, v in flanking]
            if True in verdicts:
                continue

        merge_path(g,in_node,node,out_node)

    if iter_cnt >= max_iter:
        print("couldn't finish sparsification"+str(len(g.nodes())))

    return g
def random_condensation2(g,n_nodes):
    """Randomly contract simple path nodes, sparing hinges and false positives.

    Works on a copy of the graph passed in as `g`.  A randomly picked node is
    merged away with merge_path when all of the following hold:
      * it has in-degree 1 and out-degree 1;
      * it, its predecessor and its successor all have hinge == 0;
      * the predecessor has out-degree 1 and the successor in-degree 1;
      * the three nodes are distinct;
      * none of its incident edges has false_positive == 1.

    Stops when at most `n_nodes` nodes remain or after 20000 picks.
    Returns the condensed copy.
    """
    # Copy the argument.  The previous code copied the module-level graph
    # `G` here and silently ignored the graph it was called with.
    g = g.copy()

    max_iter = 20000
    iter_cnt = 0

    while len(g.nodes()) > n_nodes and iter_cnt < max_iter:
        iter_cnt += 1
        nodes = g.nodes()
        node = nodes[random.randrange(len(nodes))]

        if g.in_degree(node) != 1 or g.out_degree(node) != 1:
            continue

        in_node = g.in_edges(node)[0][0]
        out_node = g.out_edges(node)[0][1]

        if not (g.node[node]['hinge']==0 and g.node[in_node]['hinge']==0 and g.node[out_node]['hinge']==0):
            continue
        if g.out_degree(in_node) != 1 or g.in_degree(out_node) != 1:
            continue
        if in_node == node or out_node == node or in_node == out_node:
            continue

        bad_node = False
        for in_edge in g.in_edges(node):
            if g.edge[in_edge[0]][in_edge[1]]['false_positive']==1:
                bad_node = True
        for out_edge in g.out_edges(node):
            if g.edge[out_edge[0]][out_edge[1]]['false_positive']==1:
                bad_node = True

        if not bad_node:
            merge_path(g,in_node,node,out_node)

    if iter_cnt >= max_iter:
        print("couldn't finish sparsification: "+str(len(g.nodes())))

    return g
# Alignment attributes carried over to every edge created by resolve_rep.
_REP_EDGE_ATTRS = (
    'length',
    'read_a_match_start', 'read_a_match_end',
    'read_b_match_start', 'read_b_match_end',
    'read_a_match_start_raw', 'read_a_match_end_raw',
    'read_b_match_start_raw', 'read_b_match_end_raw',
)


def _copy_edge_attrs(g, src, dst):
    """Add edge `dst` = (u, v), copying _REP_EDGE_ATTRS from existing edge `src`."""
    data = g.edge[src[0]][src[1]]
    g.add_edge(dst[0], dst[1], **dict((key, data[key]) for key in _REP_EDGE_ATTRS))


def resolve_rep(g,rep_path,in_node,out_node):
    """Route in_node -> rep_path -> out_node through a private copy of rep_path.

    A copy of every node of `rep_path` is created, named with the prefix 'B',
    and chained in the same order.  The edge entering the repeat from
    `in_node` and the edge leaving it towards `out_node` are re-attached to
    the copy and removed from the original path; the edges inside the
    original path are left in place.  Every operation is mirrored on the
    reverse-complement nodes (see rev_node).  New edges carry the attributes
    listed in _REP_EDGE_ATTRS, copied from the edge they duplicate.

    g is modified in place; nothing is returned.
    """
    prefix = 'B'
    first = rep_path[0]
    last = rep_path[-1]

    # Entry into the repeat: in_node -> B<first>.
    _copy_edge_attrs(g, (in_node, first), (in_node, prefix + first))
    g.remove_edge(in_node, first)

    # Exit from the repeat: B<last> -> out_node.
    _copy_edge_attrs(g, (last, out_node), (prefix + last, out_node))
    g.remove_edge(last, out_node)

    # Reverse strand of the entry edge.
    _copy_edge_attrs(g, (rev_node(first), rev_node(in_node)),
                     (rev_node(prefix + first), rev_node(in_node)))
    g.remove_edge(rev_node(first), rev_node(in_node))

    # Reverse strand of the exit edge.
    _copy_edge_attrs(g, (rev_node(out_node), rev_node(last)),
                     (rev_node(out_node), rev_node(prefix + last)))
    g.remove_edge(rev_node(out_node), rev_node(last))

    # Duplicate the interior of the repeat on both strands.
    for i in range(len(rep_path) - 1):
        tail = rep_path[i]
        head = rep_path[i + 1]
        _copy_edge_attrs(g, (tail, head), (prefix + tail, prefix + head))
        _copy_edge_attrs(g, (rev_node(head), rev_node(tail)),
                         (rev_node(prefix + head), rev_node(prefix + tail)))
st_node for first_node in g.successors(st_node): if g.out_degree(st_node) != 2: continue if print_debug: print '----' print first_node other_successor = [x for x in g.successors(st_node) if x != first_node][0] next_node = first_node if print_debug: print 'going on loop' node_cnt = 0 while g.in_degree(next_node) == 1 and g.out_degree(next_node) == 1 and node_cnt < max_nodes: node_cnt += 1 in_node = next_node next_node = g.successors(next_node)[0] first_node_of_repeat = next_node if g.in_degree(next_node) == 2: prev_node = [x for x in g.predecessors(next_node) if x != in_node][0] node_cnt = 0 while g.in_degree(prev_node) == 1 and g.out_degree(prev_node) == 1: node_cnt += 1 prev_node = g.predecessors(prev_node)[0] if node_cnt >= flank: break if node_cnt < flank: # and prev_node != st_node: continue next_node = other_successor node_cnt = 0 while g.in_degree(next_node) == 1 and g.out_degree(next_node) == 1: node_cnt += 1 next_node = g.successors(next_node)[0] if node_cnt >= flank: break if node_cnt < flank: # and next_node != first_node_of_repeat: continue rep = [first_node_of_repeat] next_node = first_node_of_repeat node_cnt = 0 if g.in_degree(next_node) == 2 and g.out_degree(next_node) == 1: next_double_node = g.successors(next_node)[0] rep.append(next_double_node) else: next_double_node = next_node while g.in_degree(next_double_node) == 1 and g.out_degree(next_double_node) == 1 and node_cnt < max_nodes: node_cnt += 1 next_double_node = g.successors(next_double_node)[0] rep.append(next_double_node) if next_double_node == st_node: if print_debug: print 'success!' 
def add_groundtruth(g,json_file,in_hinges,out_hinges):
    """Annotate g in place with ground-truth positions and colours; return g.

    json_file  -- open file holding a JSON object keyed by read id (the part
                  of a node name before '_').  Only the first record of each
                  entry is used: elements 0 and 1 are taken as the two
                  alignment coordinates, element 2 (+1) as the chromosome.
    in_hinges, out_hinges -- collections of node names; membership in either
                  sets hinge = 1 on the node, otherwise hinge = 0.

    Node attributes written: normpos, chr, aln_start, aln_end, hinge, color,
    color_r, color_g, color_b.  Nodes whose read is not in the mapping get
    chr = 0 and aln_start = aln_end = 1.
    Edge attribute written: false_positive, 0 when the start or the end of
    the head node's alignment lies strictly inside the tail node's alignment
    interval, 1 otherwise.
    """
    mapping = ujson.load(json_file)

    print('getting mapping')
    print(str(len(mapping)))
    print(str(len(g.nodes())))

    chr_length_dict = {}   # chromosome -> largest aln_end seen (never below 1)
    nodes_by_chr = {}      # chromosome -> nodes placed on it

    for node in g.nodes():
        attrs = g.node[node]
        node_base = node.split("_")[0]
        attrs['normpos'] = 0

        if node_base in mapping:
            hit = mapping[node_base][0]
            attrs['chr'] = hit[2]+1
            attrs['aln_start'] = min(hit[0], hit[1])
            attrs['aln_end'] = max(hit[1], hit[0])
        else:
            attrs['chr'] = 0
            attrs['aln_start'] = 1
            attrs['aln_end'] = 1

        if node in in_hinges or node in out_hinges:
            attrs['hinge'] = 1
        else:
            attrs['hinge'] = 0

        chrom = attrs['chr']
        chr_length_dict[chrom] = max(attrs['aln_end'], chr_length_dict.get(chrom, 1))
        nodes_by_chr.setdefault(chrom, []).append(node)

    # Chromosomes ordered from longest to shortest.
    chr_list = sorted(chr_length_dict.items(), key=operator.itemgetter(1), reverse=True)

    # Power of ten with one more digit than the largest coordinate, so that
    # normpos of different chromosomes cannot overlap.
    longest = max(chr_length_dict.values()) if chr_length_dict else 1
    max_chr_multiplier = 10**len(str(longest))

    print(list(chr_list))
    chr_set = [x[0] for x in chr_list]
    print(chr_set)

    colour_list = ['red', 'lawngreen', 'deepskyblue', 'deeppink', 'darkorange',
                   'purple', 'gold', 'mediumblue', 'saddlebrown', 'darkgreen']
    for colour in colour_list:
        print(matplotlib.colors.colorConverter.to_rgb(colour))

    for index, chrom in enumerate(chr_set):
        print(chrom)
        # Taken from chr_length_dict (>= 1) so the divisions below cannot
        # hit zero.
        max_chr_len = chr_length_dict[chrom]

        if index < len(colour_list):
            rgb_tuple = matplotlib.colors.colorConverter.to_rgb(colour_list[index])
            red = int(255*rgb_tuple[0])
            green = int(255*rgb_tuple[1])
            blue = int(255*rgb_tuple[2])
        else:
            # Out of named colours: random red/blue, green derived from them.
            red = random.randint(0,255)
            blue = random.randint(0,255)
            brightness = 200
            green = max(0,min( 255,brightness - int((0.2126 *red + 0.0722 *blue)/0.7152 )))

        # Darker shade of the same colour, used at the far end of the chromosome.
        red_bk = max(red-100,0)
        blue_bk = max(blue-100,0)
        green_bk = max(green-100,0)

        print('%s %s %s' % (red, blue, green))

        for node in nodes_by_chr[chrom]:
            attrs = g.node[node]
            # True division: with integer operands the old '/' truncated
            # this fraction to 0 or 1, so no gradient was ever produced.
            lamda = attrs['aln_end'] / float(max_chr_len)
            attrs['normpos'] = attrs['chr'] * max_chr_multiplier + lamda * max_chr_multiplier

            # Rounded back to int so the channels keep the integer type
            # they had before.
            nd_red = int(round((1-lamda)*red + lamda*red_bk))
            nd_green = int(round((1-lamda)*green + lamda*green_bk))
            nd_blue = int(round((1-lamda)*blue + lamda*blue_bk))

            # rgb2hex is not defined in the visible part of this file --
            # presumably a helper taking three channel values; TODO confirm.
            attrs['color'] = rgb2hex(nd_red, nd_green, nd_blue)
            attrs['color_r'] = nd_red
            attrs['color_g'] = nd_green
            attrs['color_b'] = nd_blue

    for in_node, out_node in g.edges_iter():
        tail = g.node[in_node]
        head = g.node[out_node]
        start_inside = tail['aln_start'] < head['aln_start'] < tail['aln_end']
        end_inside = tail['aln_start'] < head['aln_end'] < tail['aln_end']
        if start_inside or end_inside:
            g.edge[in_node][out_node]['false_positive']=0
        else:
            g.edge[in_node][out_node]['false_positive']=1

    return g
def create_bidirected2(g):
    """Build an attribute-free read-level graph from the strand-level graph g.

    Every edge '<a>_<x>' -> '<b>_<y>' of g becomes an edge a -> b in the new
    graph.  Then, starting from every node of g that has in-degree != 1 or
    out-degree > 1, each outgoing unbranched path is followed, and whenever
    both a -> b and b -> a exist for a step of that path, the backward edge
    b -> a is removed.

    Returns the new graph; g is not modified.
    """
    h = nx.DiGraph()

    for u in g.nodes():
        for successor in g.successors(u):
            # Unpacking also checks that names have exactly two fields.
            tail_id, _tail_orientation = u.split('_')
            head_id, _head_orientation = successor.split('_')
            h.add_edge(tail_id,head_id)

    st_nodes = [x for x in g if g.in_degree(x) != 1 or g.out_degree(x) > 1]

    for st_node in st_nodes:
        for sec_node in g.successors(st_node):
            cur_id = st_node.split('_')[0]
            next_node = sec_node
            next_id = next_node.split('_')[0]

            if h.has_edge(cur_id, next_id) and h.has_edge(next_id, cur_id):
                h.remove_edge(next_id,cur_id)

            while g.in_degree(next_node) == 1 and g.out_degree(next_node) == 1:
                cur_id = next_node.split('_')[0]
                next_node = g.successors(next_node)[0]
                next_id = next_node.split('_')[0]

                if h.has_edge(cur_id, next_id) and h.has_edge(next_id, cur_id):
                    h.remove_edge(next_id,cur_id)
                else:
                    break

    # Return the graph that was built; the input graph was returned before,
    # which discarded all of the work above.
    return h


def write_graphml(g,prefix,suffix,suffix1):
    """Write a strand-connected copy of g to '<prefix><suffix>.<suffix1>.graphml'.

    The copy gets an edge in both directions between every node and its
    reverse-complement twin (connect_strands); g itself is left untouched.
    """
    h = g.copy()
    connect_strands(h)
    # Use the suffix1 argument; the literal string 'suffix1' was written into
    # the file name before.
    nx.write_graphml(h, prefix+suffix+'.'+suffix1+'.graphml')
Ginfo = {} with open (flname) as f: for lines in f: lines1=lines.split() if len(lines1) < 5: continue e1 = (lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4]) # print lines1 # e1_match1 = abs(int(lines1[6].lstrip('['))-int(lines1[7].rstrip(']'))) # e1_match2 = abs(int(lines1[8].lstrip('['))-int(lines1[9].rstrip(']'))) e1_match_len = int(lines1[2]) ra_match_start = int(lines1[6].lstrip('[')) ra_match_end = int(lines1[7].rstrip(']')) rb_match_start = int(lines1[8].lstrip('[')) rb_match_end = int(lines1[9].rstrip(']')) ra_match_start_raw = int(lines1[-4].lstrip('[')) ra_match_end_raw = int(lines1[-3].rstrip(']')) rb_match_start_raw = int(lines1[-2].lstrip('[')) rb_match_end_raw = int(lines1[-1].rstrip(']')) if e1 in G.edges(): G.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4], hinge_edge=int(lines1[5]),intersection=1,length=e1_match_len,z=0, read_a_match_start=ra_match_start,read_a_match_end=ra_match_end, read_b_match_start=rb_match_start,read_b_match_end=rb_match_end, read_a_match_start_raw=ra_match_start_raw,read_a_match_end_raw=ra_match_end_raw, read_b_match_start_raw=rb_match_start_raw,read_b_match_end_raw=rb_match_end_raw) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])), hinge_edge=int(lines1[5]),intersection=1,length=e1_match_len,z=0, read_a_match_start=rb_match_start,read_a_match_end=rb_match_end, read_b_match_start=ra_match_start,read_b_match_end=ra_match_end, read_a_match_start_raw=rb_match_start_raw,read_a_match_end_raw=rb_match_end_raw, read_b_match_start_raw=ra_match_start_raw,read_b_match_end_raw=ra_match_end_raw) else: G.add_edge(lines1[0] + "_" + lines1[3], lines1[1] + "_" + lines1[4], hinge_edge=int(lines1[5]),intersection=0,length=e1_match_len,z=0, read_a_match_start=ra_match_start,read_a_match_end=ra_match_end, read_b_match_start=rb_match_start,read_b_match_end=rb_match_end, read_a_match_start_raw=ra_match_start_raw,read_a_match_end_raw=ra_match_end_raw, 
read_b_match_start_raw=rb_match_start_raw,read_b_match_end_raw=rb_match_end_raw) G.add_edge(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])), hinge_edge=int(lines1[5]),intersection=0,length=e1_match_len,z=0, read_a_match_start=rb_match_start,read_a_match_end=rb_match_end, read_b_match_start=ra_match_start,read_b_match_end=ra_match_end, read_a_match_start_raw=rb_match_start_raw,read_a_match_end_raw=rb_match_end_raw, read_b_match_start_raw=ra_match_start_raw,read_b_match_end_raw=ra_match_end_raw) towrite = lines1[0] + "_" + lines1[3] +' '+ lines1[1] + "_" + lines1[4] +' '+ lines1[2]+' '+str(int(lines1[11][:-1])-int(lines1[10][1:]))+' '+str(int(lines1[13][:-1])-int(lines1[12][1:])) Ginfo[(lines1[0] + "_" + lines1[3],lines1[1] + "_" + lines1[4])] = towrite towrite= lines1[1] + "_" + str(1-int(lines1[4])) +' '+ lines1[0] + "_" + str(1-int(lines1[3])) +' '+ lines1[2]+' '+str(int(lines1[13][:-1])-int(lines1[12][1:]))+' '+str(int(lines1[11][:-1])-int(lines1[10][1:])) Ginfo[(lines1[1] + "_" + str(1-int(lines1[4])), lines1[0] + "_" + str(1-int(lines1[3])))] = towrite nx.write_graphml(G, prefix+suffix+'.'+'G00'+'.graphml') vertices=set() in_hinges = set() out_hinges = set() with open (hingesname) as f: for lines in f: lines1=lines.split() if lines1[2] == '1': in_hinges.add(lines1[0]+'_0') out_hinges.add(lines1[0]+'_1') elif lines1[2] == '-1': in_hinges.add(lines1[0]+'_1') out_hinges.add(lines1[0]+'_0') add_annotation(G,in_hinges,out_hinges) # try: mark_skipped_edges(G,flname.split('.')[0] + '.edges.skipped') # except: # print "some error here" # pass # json_file = open('../pb_data/ecoli_shortened/ecoli4/ecoli.mapping.1.json') if json_file!= None: add_groundtruth(G,json_file,in_hinges,out_hinges) # In[ ]: G0 = G.copy() # Actual pruning, clipping and z deletion occurs below G0 = dead_end_clipping_sym(G0,10) # G1=z_clipping_sym(G1,5,in_hinges,out_hinges) G1,G0 = z_clipping_sym(G0,6,set(),set()) # G1=z_clipping_sym(G1,5,in_hinges,out_hinges) # 
G1=z_clipping_sym(G1,5,in_hinges,out_hinges) # G1=z_clipping_sym(G1,5,in_hinges,out_hinges) G1 = bubble_bursting_sym(G1,20) G1 = dead_end_clipping_sym(G1,20) nx.write_graphml(G0, prefix+suffix+'.'+'G0'+'.graphml') nx.write_graphml(G1, prefix+suffix+'.'+'G1'+'.graphml') G2 = G1.copy() Gs = random_condensation_sym(G1,1000) loop_resolution(G2,500,50) G2s = random_condensation_sym(G2,1000) nx.write_graphml(G2, prefix+suffix+'.'+'G2'+'.graphml') nx.write_graphml(Gs, prefix+suffix+'.'+'Gs'+'.graphml') nx.write_graphml(G2s, prefix+suffix+'.'+'G2s'+'.graphml') Gc = connect_strands(Gs) nx.write_graphml(Gc, prefix+suffix+'.'+'Gc'+'.graphml') G2c = connect_strands(G2s) nx.write_graphml(G2c, prefix+suffix+'.'+'G2c'+'.graphml') # G2b = create_bidirected2(G2) # nx.write_graphml(G2b, prefix+suffix+'.'+'G2b'+'.graphml') # H=prune_graph(G1,in_hinges,out_hinges) # H=dead_end_clipping(H,5) # I=prune_graph(H,in_hinges,out_hinges,True) # I=dead_end_clipping(I,5) # Gs = random_condensation(G1,2000) # nx.write_graphml(Gs, path+'G'+suffix+'.graphml') # write_graph(Gs,path+'G'+suffix+'.txt') # Hs = random_condensation(H,2500) # nx.write_graphml(Hs, path+'H'+suffix+'.graphml') # write_graph(Hs,path+'H'+suffix+'.txt') # Is = random_condensation(I,2500) # nx.write_graphml(Is, path+'I'+suffix+'.graphml') # write_graph(Is,path+'I'+suffix+'.txt') HINGE-0.5.0/scripts/random_condensation.py000077500000000000000000000221311314415550300204360ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import random import sys from collections import Counter # This script does a random condensation of the graph down to 2000 nodes # python random_condensation.py ecoli.edges 2000 # It also keeps the ground truth on the graph through the condensation steps (if a json file is available) def merge_path(g,in_node,node,out_node): g.add_edge(in_node,out_node,hinge_edge = -1,false_positive = 0) g.remove_node(node) def input1(flname): print "input1" g = nx.DiGraph() with open (flname) as f: for 
def input3(flname):
    """Load and return the graph stored in the GraphML file `flname`."""
    print("input3")
    # The graph must be returned: the caller assigns the result of this
    # function to its working graph, and previously received None.
    return nx.read_graphml(flname)
g.edge[in_node][out_node]['false_positive']=0 else: g.edge[in_node][out_node]['false_positive']=1 except: raise # print "json "+filename.split('.')[0]+'.mapping.json'+" not found. exiting." print hinge_list print str(mapped_nodes)+" out of " +str(len(g.nodes()))+" nodes mapped." # for i in range(5): # merge_simple_path(g) # degree_sequence=sorted(nx.degree(g).values(),reverse=True) # print Counter(degree_sequence) in_hinges = set() out_hinges = set() num_iter=10000 iter_done=0 if hinge_list != None: print "Found hinge list." with open(hinge_list,'r') as f: for lines in f: lines1=lines.split() if lines1[2] == '1': in_hinges.add(lines1[0]+'_0') out_hinges.add(lines1[0]+'_1') elif lines1[2] == '-1': in_hinges.add(lines1[0]+'_1') out_hinges.add(lines1[0]+'_0') print str(len(in_hinges))+' hinges found.' for node in g.nodes(): if node in in_hinges and node in out_hinges: g.node[node]['hinge']=100 elif node in in_hinges: g.node[node]['hinge']=10 elif node in out_hinges: g.node[node]['hinge']=-10 else: g.node[node]['hinge']=0 while len(g.nodes()) > n_nodes and iter_done < num_iter : node = g.nodes()[random.randrange(len(g.nodes()))] iter_done+=1 # print iter_done if g.in_degree(node) == 1 and g.out_degree(node) == 1: base_node=node.split("_")[0] orintation = node.split("_")[1] # if orintation=='1': # node2=base_node+'_0' # else: # node2=base_node+'_1' # print node,node2 in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.node[node]['hinge']==0 and g.node[in_node]['hinge']==0 and g.node[out_node]['hinge']==0: if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: bad_node=False # print g.in_edges(node) # print g.edge[g.in_edges(node)[0][0]][g.in_edges(node)[0][1]] # print g.out_edges(node) for in_edge in g.in_edges(node): if g.edge[in_edge[0]][in_edge[1]]['false_positive']==1: bad_node=True for out_edge in g.out_edges(node): if g.edge[out_edge[0]][out_edge[1]]['false_positive']==1: 
bad_node=True if not bad_node: #print in_node, node, out_node merge_path(g,in_node,node,out_node) # print g.edge[edge1[0]][edge1[1]]['hinge_edge'] for nd in g.nodes(): if len(nd.split("_"))==1: print nd + " in trouble" # in_node = g.in_edges(node2)[0][0] # out_node = g.out_edges(node2)[0][1] # if g.node[node2]['hinge']==0 and g.node[in_node]['hinge']==0 and g.node[out_node]['hinge']==0: # if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: # if in_node != node2 and out_node != node2 and in_node != out_node: # bad_node=False # for in_edge in g.in_edges(node2): # if g.edge[in_edge]==1: # bad_node=True # for out_edge in g.out_edges(node2): # if g.edge[out_edge]==1: # bad_node=True # if not bad_node: # #print in_node, node, out_node # merge_path(g,in_node,node2,out_node) # for nd in g.nodes(): # print nd else: while len(g.nodes()) > n_nodes: node = g.nodes()[random.randrange(len(g.nodes()))] if g.in_degree(node) == 1 and g.out_degree(node) == 1: # assert g.in_degree(node2) == 1 and g.out_degree(node2) == 1 # edge_1 = g.out_edges(node)[0] # edge_2 = g.in_edges(node)[0] edge1 = g.out_edges(node)[0] edge2 = g.in_edges(node)[0] # print g.edge[edge1[0]][edge1[1]]['hinge_edge'] if (g.edge[edge1[0]][edge1[1]]['hinge_edge'] == -1 and g.edge[edge2[0]][edge2[1]]['hinge_edge'] == -1): in_node = g.in_edges(node)[0][0] out_node = g.out_edges(node)[0][1] if g.out_degree(in_node) == 1 and g.in_degree(out_node) == 1: if in_node != node and out_node != node and in_node != out_node: #print in_node, node, out_node merge_path(g,in_node,node,out_node) degree_sequence=sorted(nx.degree(g).values(),reverse=True) print Counter(degree_sequence) nx.write_graphml(g, filename.split('.')[0]+'.sparse3.graphml') print nx.number_weakly_connected_components(g) print nx.number_strongly_connected_components(g) if __name__ == "__main__": filename = sys.argv[1] try : hinge_list=sys.argv[3] print "Found hinge list." 
except: hinge_list=None print "in except "+hinge_list de_clip(filename, int(sys.argv[2]),hinge_list, sys.argv[4]) HINGE-0.5.0/scripts/repeat_annotate_reads.py000077500000000000000000000067041314415550300207510ustar00rootroot00000000000000#!/usr/bin/env python import sys import os def reverse_complement(bases): rev_comp={'A':'T','C':'G','G':'C','T':'A','N':'N'} return ''.join(map(lambda x :rev_comp[x],bases[::-1])) def run(multifasta_path,intermediate_repeat_file_path, gt_file_path, gt_annotated_file_path): ##Read chromosomes from multifasta file chrom={} cur_chrom='' start=True chr_num=0 with open(multifasta_path,'r') as f: for lines in f: if lines[0]=='>': if start: start=False chr_num=int(lines.split()[0][1:])-1 print 'detect chr '+ str(chr_num) else: chrom[chr_num]=cur_chrom print len(cur_chrom) chr_num=int(lines.split()[0][1:])-1 print 'detect chr '+ str(chr_num) cur_chrom='' else: cur_chrom+=lines.strip() print len(cur_chrom) chrom[chr_num]=cur_chrom ##Run mummer to get repeats mummer_cmd='mummer -maxmatch -b -c -l 1000 -L '+multifasta_path+' '+multifasta_path +' > '+intermediate_repeat_file_path os.system(mummer_cmd) #Put repeats discovered by mummer in right form chr_num=0 chr_repeats={} rev_com=False with open (intermediate_repeat_file_path) as f: for line in f: if line[0]=='>': line1=line.strip().split() chr_num=int(line1[1])-1 #print len(line1) if len(line1)==6: rev_com=True chr_len=int(line1[5]) else: rev_com=False chr_len=int(line1[4]) else: line1=line.strip().split() chr2_num=int(line1[0])-1 chr2_start=int(line1[1])-1 chr1_start=int(line1[2])-1 rep_len=int(line1[3]) if not rev_com: if not (chrom[chr_num][chr1_start:chr1_start+rep_len] ==chrom[chr2_num][chr2_start:chr2_start+rep_len]): print chr_num+1,line if chr1_start==0 and rep_len==chr_len: continue chr_repeats.setdefault(chr_num,[]).append((chr1_start,chr1_start+rep_len)) else: if not (chrom[chr_num][chr1_start-rep_len+1:chr1_start+1] == 
reverse_complement(chrom[chr2_num][chr2_start:chr2_start+rep_len])): print chr_num+1,line,rev_com chr_repeats.setdefault(chr_num,[]).append((chr1_start-rep_len+1,chr1_start+1)) #Go through gt file and annotate reads that intersect with repeats. with open(gt_file_path) as f: with open(gt_annotated_file_path,'w') as g: for line in f: line1=line.split() cr=int(line1[1]) rd_st=int(line1[2]) rd_end=int(line1[3]) is_repeat=0 for tup in chr_repeats[cr]: if ((rd_st >= tup[0] and rd_st <= tup[1]) or (rd_end >= tup[0] and rd_end <= tup[1])): is_repeat=1 line2=line.strip()+"\t"+str(is_repeat)+"\n" g.write(line2) if __name__ == '__main__': multifasta_path=sys.argv[1] gt_file_path=sys.argv[2] gt_annotated_file_path=sys.argv[3] intermediate_repeat_file_path='./repeats_discovered.txt' if len(sys.argv) > 4: intermediate_repeat_file_path=sys.argv[4] run(multifasta_path,intermediate_repeat_file_path, gt_file_path, gt_annotated_file_path)ß HINGE-0.5.0/scripts/run_mapping.py000077500000000000000000000021521314415550300167320ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * from parse_alignment import * filename,filename2 = sys.argv[1:3] alignmentname = sys.argv[3] readarg = sys.argv[4] stream = subprocess.Popen(["LA4Awesome", filename, filename2 , alignmentname ,readarg, '-F'], stdout=subprocess.PIPE, bufsize=1) alignments = parse_alignment2(stream.stdout) # generator d = {} for alignment in alignments: if not d.has_key(alignment[2]): d[alignment[2]] = [] d[alignment[2]].append([alignment[0],alignment[3],alignment[4], alignment[6], alignment[7], alignment[1]]) #print d mapping = {} for key,value in d.items(): value.sort(key = lambda x:x[2]-x[1], reverse=True) aln = value[0] if aln[0] == 'n': mapping[str(key)] = (aln[1], aln[2],aln[-1], 0) mapping[str(key)+'\''] = (aln[2], aln[1],aln[-1], 1) else: mapping[str(key)] = (aln[2], aln[1], aln[-1], 1) mapping[str(key)+'\''] = (aln[1], aln[2], aln[-1], 0) #print mapping import 
ujson ujson.dump(mapping,open(filename+'.mapping.json','w')) HINGE-0.5.0/scripts/run_mapping2.py000077500000000000000000000026021314415550300170140ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * from parse_alignment import * filename,filename2 = sys.argv[1:3] alignmentname = sys.argv[3] readarg = sys.argv[4] k = int(sys.argv[5]) stream = subprocess.Popen(["LA4Awesome", filename, filename2 , alignmentname ,readarg], stdout=subprocess.PIPE, bufsize=1) alignments = parse_alignment2(stream.stdout) # generator d = {} for alignment in alignments: if not d.has_key(alignment[2]): d[alignment[2]] = [] d[alignment[2]].append([alignment[0],alignment[3],alignment[4], alignment[6], alignment[7], alignment[1]]) #print d mapping = {} for key,value in d.items(): value.sort(key = lambda x:x[2]-x[1], reverse=True) alns = value[:k] max_val=alns[0][2]-alns[0][1] for aln in alns: if aln[2]-aln[1] > max_val/2.: if not mapping.has_key(str(key)): mapping[str(key)] = [(aln[1], aln[2],aln[-1], 1-int(aln[0] == 'n'))] # mapping[str(key)+'\''] = [(aln[2], aln[1],aln[-1], int(aln[0] == 'n'))] else: mapping[str(key)].append((aln[1], aln[2],aln[-1], 1-int(aln[0] == 'n'))) # mapping[str(key)+'\''].append((aln[2], aln[1],aln[-1], int(aln[0] == 'n'))) #print mapping import ujson ujson.dump(mapping,open(filename2+'.mapping.'+str(k)+'.json','w')) HINGE-0.5.0/scripts/run_mapping3.py000077500000000000000000000026051314415550300170200ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * from parse_alignment import * filename,filename2 = sys.argv[1:3] alignmentname = sys.argv[3] readarg = sys.argv[4] k = int(sys.argv[5]) stream = subprocess.Popen(["LA4Awesome", filename, filename2 , alignmentname ,readarg], stdout=subprocess.PIPE, bufsize=1) alignments = parse_alignment2(stream.stdout) # generator d = {} for alignment in alignments: if not d.has_key(alignment[2]): d[alignment[2]] 
= [] d[alignment[2]].append([alignment[0],alignment[3],alignment[4], alignment[6], alignment[7], alignment[1]]) #print d mapping = {} for key,value in d.items(): value.sort(key = lambda x:x[2]-x[1], reverse=True) #alns = value[:k] if len(alns) > 0: alns = [item for item in alns if (item[2] - item[1]) > (alns[0][2] - alns[0][1])/2] for aln in alns: if not mapping.has_key(str(key)): mapping[str(key)] = [(aln[1], aln[2],aln[-1], 1-int(aln[0] == 'n'))] mapping[str(key)+'\''] = [(aln[2], aln[1],aln[-1], int(aln[0] == 'n'))] else: mapping[str(key)].append((aln[1], aln[2],aln[-1], 1-int(aln[0] == 'n'))) mapping[str(key)+'\''].append((aln[2], aln[1],aln[-1], int(aln[0] == 'n'))) #print mapping import ujson ujson.dump(mapping,open(filename2+'.mapping.json','w')) HINGE-0.5.0/scripts/run_parse_alignment.py000077500000000000000000000006301314415550300204460ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * from parse_alignment import * filename = sys.argv[1] readarg = sys.argv[2] stream = subprocess.Popen(["LAshow", filename , filename ,readarg], stdout=subprocess.PIPE, bufsize=1) alignments = parse_alignment(stream.stdout) # generator for alignment in alignments: print alignment HINGE-0.5.0/scripts/run_parse_read.py000077500000000000000000000005451314415550300174100ustar00rootroot00000000000000#!/usr/bin/env python import sys import os import subprocess from parse_read import * filename = sys.argv[1] readarg = sys.argv[2] stream = subprocess.Popen(["DBshow", filename ,readarg], stdout=subprocess.PIPE,bufsize=1) reads = parse_read(stream.stdout) # generator for read in reads: print read #print result HINGE-0.5.0/scripts/split_las.py000077500000000000000000000011751314415550300164110ustar00rootroot00000000000000#!/usr/bin/env python import os import argparse ap = argparse.ArgumentParser(description="run LAsplit by splitting las into sizes of less than specified length") ap.add_argument("las", help="path to las file 
to be split. assumed to be sorted.") ap.add_argument("max_size", help="max size of any split file.", type=int, default=4, nargs='?') args = ap.parse_args() laspath = args.las max_las_size = args.max_size x = os.path.getsize(laspath) num_divisions = (x/10**9)/max_las_size + 1 out_las_name = laspath.split('.las')[0]+'.# ' LAsplit_cmd = 'LAsplit -v '+out_las_name+ str(num_divisions) +' < ' + laspath os.system(LAsplit_cmd)HINGE-0.5.0/scripts/unitig.py000077500000000000000000000100041314415550300157050ustar00rootroot00000000000000#!/usr/bin/env python import networkx as nx import sys import itertools filename = sys.argv[1] outfile = filename.split('.')[0] + ".edges.list" g = nx.read_graphml(filename) print nx.info(g) def get_circle(g,node,vertices_of_interest): cur_path = [node] cur_vertex = g.successors(node)[0] i = 0 while cur_vertex != node: cur_path.append(cur_vertex) try: assert len(g.successors(cur_vertex)) == 1 except: print g.successors(cur_vertex), cur_vertex, node print cur_vertex in vertices_of_interest raise successor = g.successors(cur_vertex)[0] cur_vertex = successor cur_path.append(cur_vertex) return cur_path def get_unitigs(g): paths = [] num_paths = 0 node_set = set(g.nodes()) vertices_of_interest = set([x for x in g if g.in_degree(x) != 1 or g.out_degree(x) != 1]) vertices_used = set(vertices_of_interest) for start_vertex in vertices_of_interest: first_out_vertices = g.successors(start_vertex) print first_out_vertices for vertex in first_out_vertices: cur_path = [start_vertex] cur_vertex = vertex while cur_vertex not in vertices_of_interest: successor = g.successors(cur_vertex)[0] cur_path.append(cur_vertex) predecessor = cur_vertex cur_vertex = successor cur_path.append(cur_vertex) vertices_used = vertices_used.union(set(cur_path)) paths.append(cur_path) print len(node_set) print len(vertices_used) while len(node_set-vertices_used) > 0: node = list(node_set-vertices_used)[0] # print list(node_set-vertices_used) # # print vertices_of_interest # # 
print len(node_set-vertices_used) # break path = get_circle(g, node, vertices_of_interest) vertices_used = vertices_used.union(set(path)) if len(path) > 1: paths.append(path) print len(paths) # print paths print "paths" return paths paths = get_unitigs(g) print len(paths) h = nx.DiGraph() for i, path in enumerate(paths): h.add_node(i) h.node[i]['path'] = path vertices_of_interest = set([x for x in g if g.in_degree(x) != 1 or g.out_degree(x) != 1]) for vertex in vertices_of_interest: successors = [x for x in h.nodes() if h.node[x]['path'][0] == vertex] predecessors = [x for x in h.nodes() if h.node[x]['path'][-1] == vertex] print successors,predecessors assert len(successors)==1 or len(predecessors)==1 for succ, pred in itertools.product(successors,predecessors): h.add_edge(pred,succ) # if vertex.split('_') == '0': # if len(predecessors) == 1: # for succ in successors: # rel_suc = h.node[succ]['path'][1] # d = g.get_edge_data(vertex,rel_suc) # h.edge[predecessors[0]][succ]['start_pos'] = d['read_a_end'] # h.edge[predecessors[0]][succ]['weight'] = d['read_a_end_raw'] - d['read_a_start'] # if len(successors) == 1: # for pred in predecessors: # rel_pred = h.node[pred]['path'][-2] # d = g.get_edge_data(rel_pred,vertex) # h.edge[predecessors[0]][succ][''] = d['read_a_end_raw'] - d['read_a_start'] with open(outfile, 'w') as f: for i,path in enumerate(paths): f.write('>Unitig%d\n'%(i)) for j in range(len(path)-1): nodeA = path[j].lstrip("B") nodeB = path[j+1].lstrip("B") d = g.get_edge_data(path[j],path[j+1]) f.write('%s %s %s %s %d %d %d %d %d\n'%(nodeA.split('_')[0],nodeA.split('_')[1] , nodeB.split('_')[0], nodeB.split('_')[1], -d['read_a_start_raw'] + d['read_a_end_raw'] - d['read_b_start_raw'] + d['read_b_end_raw'], d['read_a_start_raw'], d['read_a_end_raw'], d['read_b_start_raw'], d['read_b_end_raw'])) f.close() 
HINGE-0.5.0/src/000077500000000000000000000000001314415550300131365ustar00rootroot00000000000000HINGE-0.5.0/src/CMakeLists.txt000066400000000000000000000030471314415550300157020ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.2) #if (${CMAKE_SYSTEM_NAME} MATCHES "Windows") # set(WINDOWS TRUE) #elseif (${CMAKE_SYSTEM_NAME} MATCHES "Linux") # set(LINUX TRUE) # message( "Linux Detected, using gcc48") # set(CMAKE_C_COMPILER /usr/bin/gcc-4.8) # set(CMAKE_CXX_COMPILER /usr/bin/g++-4.8) #elseif (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # set(MACOSX TRUE) # set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build") # message( "OS X Detected, using gcc49") # set(CMAKE_C_COMPILER /usr/local/bin/gcc-4.9) # set(CMAKE_CXX_COMPILER /usr/local/bin/g++-4.9) # set(CMAKE_INCLUDE_CURRENT_DIR ON) # set(CMAKE_AUTOMOC ON) #endif() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") include_directories(include) ## Libraries add_subdirectory(lib lib) add_subdirectory(spdlog) #add_subdirectory(ogdf) find_program(PANDOC pandoc) if(PANDOC) add_custom_command( OUTPUT hinge.1 COMMAND ${PANDOC} --from markdown --to man -s ${CMAKE_CURRENT_SOURCE_DIR}/hinge.1.md -o hinge.1 DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hinge.1.md VERBATIM ) add_custom_target(man ALL DEPENDS hinge.1) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/hinge.1 DESTINATION share/man/man1) endif() install(PROGRAMS hinge DESTINATION bin) ## Executables add_subdirectory(filter filter) add_subdirectory(maximal maximal) add_subdirectory(consensus consensus) add_subdirectory(layout layout) ## Tests add_subdirectory(test) HINGE-0.5.0/src/consensus/000077500000000000000000000000001314415550300151565ustar00rootroot00000000000000HINGE-0.5.0/src/consensus/CMakeLists.txt000066400000000000000000000004611314415550300177170ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.2) add_executable(draft_assembly draft) 
target_link_libraries(draft_assembly LAInterface ini falcon spdlog) add_executable(consensus consensus.cpp) target_link_libraries(consensus LAInterface falcon ini) install(TARGETS draft_assembly consensus DESTINATION ${libexec}) HINGE-0.5.0/src/consensus/consensus.cpp000066400000000000000000000173461314415550300177150ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #include "LAInterface.h" #include #include #include #include #include #include #include extern "C" { #include "common.h" } #include "INIReader.h" static char ToU[4] = { 'A', 'C', 'G', 'T' }; int chop_end(std::pair * alignment, int chop) { int len = alignment->first.size(); if (len < chop*2 + 10) return 0; int start = chop; while (alignment->first[start] == '-') start++; int offset = 0; for (int i =0; i < start; i++) if (alignment->first[i]!= '-') offset++; alignment->first = alignment->first.substr(start, len-start-chop); alignment->second = alignment->second.substr(start, len-start-chop); return offset; } char toLower(char c) { char base = c; switch (c) { case 'A': base = 'a'; break; case 'C': base = 'c'; break; case 'G': base = 'g'; break; case 'T': base = 't'; break; } return base; } int remove_multialign(std::vector idx, int idx_size, int LENGTH_THRESHOLD) { int i, j, r=0; for(i = 0; i < idx_size; i ++) { if (idx[i]->aepos - idx[i]->abpos >= LENGTH_THRESHOLD) { for(j = 0; j < r; j ++) if(idx[j]->read_B_id_ == idx[i]->read_B_id_) break; if(j == r) idx[r++] = idx[i]; } } return r; } int main(int argc, char *argv[]) { std::string name_db1 = std::string(argv[1]); std::string name_db2 = std::string(argv[2]); std::string name_las = std::string(argv[3]); char * name_out = argv[4]; char * name_config = argv[5]; std::ofstream out(name_out); INIReader reader(name_config); if (reader.ParseError() < 0) { std::cout << "Can't load "< res; la.resetAlignment(); la.getAlignment(res, 0, n_alns); // get all alignments std::vector> 
idx; printf("%lu\n", res.size()); for (int i = 0; i < n_contigs; i++) idx.push_back(std::vector()); for (int i = 0; i < n_alns; i++) { idx[res[i]->read_A_id_].push_back(res[i]); } for (int i = 0; i < n_contigs; i++) { std::sort(idx[i].begin(), idx[i].end(), compare_overlap_aln); printf("%d %lu\n", i, idx[i].size()); } std::cout << "Getting read lengths" << std::endl; std::vector reads_vec; la.getRead(reads_vec, 0, n_contigs); for (int i = 0; i < n_contigs; i++){ std::cout << i << "\t" << (reads_vec[i]->bases).size() << std::endl; } std::cout << "Building consensus sequences..." << std::endl; for (int i = 0; i < n_contigs; i++) { int seq_count = remove_multialign(idx[i], idx[i].size(),LENGTH_THRESHOLD); std::cout << "Contig " << i << ": " << seq_count << " reads" << std::endl; if (seq_count == 0) { out << ">Consensus" << i << std::endl; out << reads_vec[i]->bases << std::endl; continue; } std::vector> contig_base_scores; std::vector insertion_score (idx[i][0]->alen,0); std::vector> insertion_base_scores; // handling single insertions only std::vector cov_depth (idx[i][0]->alen,0); std::vector zero_scores (5,0); // scores for A,C,G,T,- are initialized at 0 for (int j = 0; j < idx[i][0]->alen; j++) { contig_base_scores.push_back(zero_scores); insertion_base_scores.push_back(zero_scores); } for (int j = 0; j < seq_count ; j ++) { la.recoverAlignment(idx[i][j]); std::pair alignment = la.getAlignmentTags(idx[i][j]); //std::cout<<"before:" << alignment.first.substr(0,200) << std::endl; int offset = chop_end(&alignment,100); std::cout << offset<abpos + offset; for (int m = 0; m < alignment.first.length(); m++) { int base = -1; switch (alignment.second[m]) { case 'A': base = 0; break; case 'C': base = 1; break; case 'G': base = 2; break; case 'T': base = 3; break; case '-': base = 4; break; } if (alignment.first[m] != '-') { if (base != -1) { contig_base_scores[pos_in_contig][base]++; cov_depth[pos_in_contig]++; } pos_in_contig++; } else if (base != -1) { 
insertion_score[pos_in_contig]++; insertion_base_scores[pos_in_contig][base]++; } } } int good_bases = 0; int insertions = 0; // insertion here means that a base is inserted in the consensus int deletions = 0; // deletion here means that the base from the draft is deleted in the consensus int consensus_length = 0; int low_coverage_bases = 0; long int sum_coverage = 0; out << ">Consensus" << i << std::endl; for (int j=0; j < idx[i][0]->alen ; j++) { sum_coverage += cov_depth[j]; if (cov_depth[j] < 3) { // std::cout << "Low coverage." << std::endl; low_coverage_bases++; out << toLower(reads_vec[i]->bases[j]); continue; } if (insertion_score[j] > cov_depth[j]/2) { int max_insertion_base = 0; for (int b=1; b<4; b++) { if (insertion_base_scores[j][b] > insertion_base_scores[j][max_insertion_base]) max_insertion_base = b; } out << ToU[max_insertion_base]; consensus_length++; insertions++; } int max_base = 0; for (int b=1; b<5; b++) { if (contig_base_scores[j][b] > contig_base_scores[j][max_base]) max_base = b; } if (max_base < 4) { out << ToU[max_base]; good_bases++; consensus_length++; } else { deletions++; } } out << std::endl; printf("Average coverage: %f\n",(1.0*sum_coverage)/idx[i][0]->alen); printf("Good bases: %d/%d\n",good_bases,idx[i][0]->alen); printf("Insertions: %d/%d\n",insertions,idx[i][0]->alen); printf("Deletions: %d/%d\n",deletions,idx[i][0]->alen); printf("Low coverage bases: %d/%d\n",low_coverage_bases,idx[i][0]->alen); printf("Consensus length: %d\n",consensus_length); } la.closeDB(); //close database*/ return 0; } HINGE-0.5.0/src/consensus/draft.cpp000066400000000000000000001235341314415550300167720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "spdlog/spdlog.h" #include "cmdline.h" #include "INIReader.h" #include "DB.h" #include "align.h" #include "LAInterface.h" #include #include #include extern "C" { #include "common.h" } #define LAST_READ_SYMBOL '$' 
#define HINGED_EDGE 1 #define UNHINGED_EDGE -1 #define REVERSE_COMPLEMENT_MATCH 1 #define SAME_DIRECTION_MATCH 0 using namespace boost; typedef adjacency_list Graph; typedef std::tuple Edge_w; inline std::vector glob(const std::string& pat){ using namespace std; glob_t glob_result; int i = 1; std::string search_name; search_name = pat + "."+std::to_string(i)+".las"; std::cout << search_name << endl; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); vector ret; while (glob_result.gl_pathc != 0){ ret.push_back(string(glob_result.gl_pathv[0])); i ++; search_name = pat + "."+std::to_string(i)+".las"; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); // std::cout << "Number of files " << glob_result.gl_pathc << std::endl; } std::cout << "-------------------------"<< std::endl; std::cout << "Number of files " << i-1 << std::endl; std::cout << "Input string " << pat.c_str() << std::endl; std::cout << "-------------------------"<< std::endl; globfree(&glob_result); return ret; } std::vector get_mapping(std::string aln_tag1, std::string aln_tag2) { int pos = 0; int count = 0; int count2 = 0; std::vector ret; while (pos < aln_tag1.size()) { if (aln_tag1[pos] != '-') { ret.push_back(count2); count ++; } if (aln_tag2[pos] != '-') { count2 ++; } pos++; } return ret; } std::string reverse_complement(std::string seq) { static std::map m = {{'a','t'}, {'c','g'}, {'g','c'}, {'t','a'}, {'A','T'}, {'C','G'}, {'T','A'}, {'G','C'}, {'n','n'}, {'N', 'N'}, {'-', '-'}}; std::reverse(seq.begin(), seq.end()); for (int i = 0; i < seq.size(); i++) { seq[i] = m[seq[i]]; } return seq; } std::vector &split(const std::string &s, char delim, std::vector &elems) { std::stringstream ss(s); std::string item; while (std::getline(ss, item, delim)) { elems.push_back(item); } return elems; } std::vector split(const std::string &s, char delim) { std::vector elems; split(s, delim, elems); return elems; } int draft_assembly_ctg(std::vector & edgelist, LAInterface & la, std::vector & full_aln, 
std::unordered_map > &idx3, std::unordered_map > > & idx, std::vector & reads, int TSPACE, int EDGE_SAFE, int MIN_COV2, int cut_start, int cut_end, bool one_read_contig, bool two_read_contig, std::string& contig) { std::cout << "list size:" << edgelist.size() << std::endl; if (edgelist.size() == 0) return -1; //error std::string draft_assembly = ""; if (one_read_contig) { if (std::get<0>(edgelist[0]).strand == 0) draft_assembly = reads[std::get<0>(edgelist[0]).id]->bases; else draft_assembly = reverse_complement(reads[std::get<0>(edgelist[0]).id]->bases); std::cout << cut_start << " " << cut_end << " " << reads[std::get<0>(edgelist[0]).id]->len << std::endl; if ((cut_start <= draft_assembly.size()) and (cut_end <= draft_assembly.size())) contig = draft_assembly.substr(cut_start, cut_end-cut_start); return 1; } //std::vector full_alns; std::vector selected; std::unordered_map> idx_aln; //la.resetAlignment(); std::vector range; for (int i = 0; i < edgelist.size(); i++) { range.push_back(std::get<0>(edgelist[i]).id); idx_aln[std::get<0>(edgelist[i]).id] = std::vector(); } std::sort(range.begin(), range.end()); //la.getAlignment(full_alns, range); for (auto i:full_aln) { idx_aln[i->read_A_id_].push_back(i); } for (int i = 0; i < edgelist.size(); i++) { int aid = std::get<0>(edgelist[i]).id; int bid = std::get<1>(edgelist[i]).id; bool found = false; for (int j = 0; j < idx_aln[std::get<0>(edgelist[i]).id].size(); j++) { //printf("%d %d %d %d\n",bid, idx_aln[aid][j]->read_B_id_, idx_aln[aid][j]->aepos - idx_aln[aid][j]->abpos + idx_aln[aid][j]->bepos - idx_aln[aid][j]->bbpos, std::get<2>(edgelist[i])); if ((idx_aln[aid][j]->read_B_id_ == bid) and \ (idx_aln[aid][j]->aepos - idx_aln[aid][j]->abpos + idx_aln[aid][j]->bepos - idx_aln[aid][j]->bbpos == std::get<2>(edgelist[i]))) { selected.push_back(idx_aln[aid][j]); found = true; break; } if (found) continue; } } std::cout << "selected:" << selected.size() << std::endl; if (two_read_contig) { if 
(std::get<0>(edgelist[0]).strand == 0) draft_assembly = reads[std::get<0>(edgelist[0]).id]->bases; else draft_assembly = reverse_complement(reads[std::get<0>(edgelist[0]).id]->bases); int aend = selected[0]->aepos; int bstart = selected[0]->bbpos; std::string readB; if (std::get<1>(edgelist[0]).strand == 0) readB = reads[std::get<1>(edgelist[0]).id]->bases; else readB = reverse_complement(reads[std::get<1>(edgelist[0]).id]->bases); std::cout << "alen blen aend bstart" << reads[std::get<0>(edgelist[0]).id]->len << " " << reads[std::get<1>(edgelist[0]).id]->len << " " << aend << " " << bstart << std::endl; draft_assembly = draft_assembly.substr(0, aend); draft_assembly += readB.substr(bstart); std::cout << cut_start << " " << cut_end << " " << reads[std::get<0>(edgelist[0]).id]->len << std::endl; if ((cut_start <= draft_assembly.size()) and (cut_end <= draft_assembly.size())) contig = draft_assembly.substr(cut_start, cut_end-cut_start); return 2; } std::unordered_map > > aln_tags_map; std::vector > aln_tags_list; std::vector > aln_tags_list_true_strand; for (int i = 0; i < selected.size(); i++) { la.recoverAlignment(selected[i]); //printf("%d %d %d %d %d\n", selected[i]->read_A_id_, selected[i]->read_B_id_, // selected[i]->alen, selected[i]->blen, selected[i]->tlen); //printf("%d %d\n",selected[i]->tlen, selected[i]->trace_pts_len); std::pair res = la.getAlignmentTags(selected[i]); aln_tags_map[selected[i]->read_A_id_][selected[i]->read_B_id_] = res; aln_tags_list.push_back(res); } std::string sequence = ""; std::vector bedges; std::vector breads; std::vector > > pitfalls; range.clear(); for (int i = 0; i < edgelist.size(); i++) { range.push_back(std::get<0>(edgelist[i]).id); } std::vector *> coverages; for (int i = 0; i < range.size(); i++) { int aread = range[i]; if (idx3[aread].size() > 0) { std::vector *res = la.getCoverage(idx3[aread]); std::vector > *res2 = la.lowCoverageRegions(*res, MIN_COV2); //delete res; coverages.push_back(res); //printf("%d %d: (%d %d) 
", i, aread, 0, idx3[aread][0]->alen); //for (int j = 0; j < res2->size(); j++) { // printf("[%d %d] ", res2->at(j).first, res2->at(j).second); //} //printf("\n"); pitfalls.push_back(*res2); delete res2; } } /*** * Prepare the data */ std::string overhang; int len_overhang = 0; for (int i = 0; i < edgelist.size(); i++) { std::vector currentalns = idx[std::get<0>(edgelist[i]).id][std::get<1>(edgelist[i]).id]; LOverlap *currentaln = NULL; for (int j = 0; j < currentalns.size(); j++) { //std::cout << std::get<0>(edgelist[i]).id << " " << std::get<1>(edgelist[i]).id << " " << currentalns[j]->match_type_ << std::endl; if (currentalns[j]->read_A_match_end_ - currentalns[j]->read_A_match_start_ + currentalns[j]->read_B_match_end_ - currentalns[j]->read_B_match_start_ == std::get<2>(edgelist[i])) currentaln = currentalns[j]; } if (currentaln == NULL) exit(1); //currentaln->show(); std::string current_seq; std::string next_seq; std::string aln_tags1; std::string aln_tags2; if (std::get<0>(edgelist[i]).strand == 0) current_seq = reads[std::get<0>(edgelist[i]).id]->bases; else current_seq = reverse_complement(reads[std::get<0>(edgelist[i]).id]->bases); if (std::get<0>(edgelist[i]).strand == 0) { aln_tags1 = aln_tags_list[i].first; aln_tags2 = aln_tags_list[i].second; } else { aln_tags1 = reverse_complement(aln_tags_list[i].first); aln_tags2 = reverse_complement(aln_tags_list[i].second); } aln_tags_list_true_strand.push_back(std::pair(aln_tags1, aln_tags2)); if (std::get<1>(edgelist[i]).strand == 0) next_seq = reads[std::get<1>(edgelist[i]).id]->bases; else next_seq = reverse_complement(reads[std::get<1>(edgelist[i]).id]->bases); int abpos, aepos, alen, bbpos, bepos, blen, aes, aee, bes, bee; alen = currentaln->alen; blen = currentaln->blen; if (std::get<0>(edgelist[i]).strand == 0) { abpos = currentaln->read_A_match_start_; aepos = currentaln->read_A_match_end_; aes = currentaln->eff_read_A_read_start_; aee = currentaln->eff_read_A_read_end_; } else { abpos = alen - 
currentaln->read_A_match_end_; aepos = alen - currentaln->read_A_match_start_; aes = alen - currentaln->eff_read_A_read_end_; aee = alen - currentaln->eff_read_A_read_start_; } if (((std::get<1>(edgelist[i]).strand == 0))) { bbpos = currentaln->read_B_match_start_; bepos = currentaln->read_B_match_end_; bes = currentaln->eff_read_B_read_start_; bee = currentaln->eff_read_B_read_end_; } else { bbpos = blen - currentaln->read_B_match_end_; bepos = blen - currentaln->read_B_match_start_; bes = blen - currentaln->eff_read_B_read_end_; bee = blen - currentaln->eff_read_B_read_start_; } aes = 0; bes = 0; aee = alen; bee = blen; // printf("%d %d [[%d %d] << [%d %d]] x [[%d %d] << [%d %d]]\n", std::get<0>(edgelist[i]).id, std::get<1>(edgelist[i]).id, abpos, aepos, aes, aee, bbpos, bepos, bes, bee); LOverlap *new_ovl = new LOverlap(); new_ovl->read_A_match_start_ = abpos; new_ovl->read_A_match_end_ = aepos; new_ovl->read_B_match_start_ = bbpos; new_ovl->read_B_match_end_ = bepos; new_ovl->eff_read_A_read_end_ = aee; new_ovl->eff_read_A_read_start_ = aes; new_ovl->eff_read_B_read_end_ = bee; new_ovl->eff_read_B_read_start_ = bes; new_ovl->alen = currentaln->alen; new_ovl->blen = currentaln->blen; new_ovl->read_A_id_ = std::get<0>(edgelist[i]).id; new_ovl->read_B_id_ = std::get<1>(edgelist[i]).id; bedges.push_back(new_ovl); breads.push_back(current_seq); overhang = next_seq; len_overhang = new_ovl->blen - new_ovl->read_B_match_end_ - (new_ovl->alen - new_ovl->read_A_match_end_); } //need to trim the end if ((len_overhang > 0) and (len_overhang < overhang.size())) { overhang = overhang.substr(overhang.size()-len_overhang); } else overhang = ""; std::vector > mappings; for (int i = 0; i < range.size(); i++) { mappings.push_back(get_mapping(aln_tags_list_true_strand[i].first, aln_tags_list_true_strand[i].second)); } std::cout << bedges.size() << " " << breads.size() << " " << selected.size() << " " << aln_tags_list.size() << " " << pitfalls.size() << " " << 
aln_tags_list_true_strand.size() << " " << mappings.size() << " " << coverages.size() << std::endl; /*for (int i = 0; i < bedges.size() - 1; i++) { printf("%d %d %d %d %d\n", bedges[i]->read_B_match_start_, bedges[i]->read_B_match_end_, bedges[i+1]->read_A_match_start_, bedges[i+1]->read_A_match_end_, bedges[i]->read_B_match_end_ - bedges[i+1]->read_A_match_start_); }*/ int tspace = TSPACE; // set lane length to be 500 int nlane = 0; std::vector>> lanes; int currentlane = 0; int current_starting_read = 0; int current_starting_space = 1; int current_starting_offset = 0; int n_bb_reads = range.size(); std::vector> trace_pts(n_bb_reads); bool revert = false; int rmax = -1; /** * Move forward and put "trace points" */ while (current_starting_read < n_bb_reads - 1) { int currentread = current_starting_read; int additional_offset = 0; while (bedges[current_starting_read]->read_A_match_start_ + current_starting_space * tspace + current_starting_offset + additional_offset < bedges[current_starting_read]->read_A_match_end_ - EDGE_SAFE) { int waypoint = bedges[current_starting_read]->read_A_match_start_ + tspace * current_starting_space + current_starting_offset + additional_offset; //if ((waypoint - bedges[current_starting_read]->read_A_match_start_) < EDGE_SAFE) // waypoint += EDGE_SAFE; //int next_waypoint = mappings[currentread][waypoint - bedges[current_starting_read]->read_A_match_start_] + bedges[current_starting_read]->read_B_match_start_; std::vector > lane; while ((waypoint > bedges[currentread]->read_A_match_start_) and (waypoint < bedges[currentread]->read_A_match_end_)) { //printf("%d %d\n", currentread, waypoint); trace_pts[currentread].push_back(waypoint); /*if (waypoint > bedges[currentread]->read_A_match_end_ - EDGE_SAFE) { printf("Reaching the end, neglect low coverage\n"); } if ((coverages[currentread]->at(waypoint) < MIN_COV2) and (waypoint < bedges[currentread]->read_A_match_end_ - EDGE_SAFE)) { revert = true; printf("Low coverage, revert\n"); break; }*/ 
lane.push_back(std::pair(currentread, waypoint)); if (currentread > rmax) rmax = currentread; //int previous_wp = waypoint; waypoint = mappings[currentread][waypoint - bedges[currentread]->read_A_match_start_] + bedges[currentread]->read_B_match_start_; currentread++; if (currentread >= n_bb_reads) break; } if (currentread < n_bb_reads) if (waypoint < bedges[currentread]->alen) { lane.push_back(std::pair(currentread, waypoint)); if (currentread > rmax) rmax = currentread; } /*if (revert) { printf("revert\n"); revert = false; while (currentread >= current_starting_read) { trace_pts[currentread].pop_back(); currentread --; additional_offset += STEP; } currentread = current_starting_read; } else*/ { if (currentread >= rmax) lanes.push_back(lane); current_starting_space++; currentread = current_starting_read; } } current_starting_read++; current_starting_space = 1;//get next space; if (trace_pts[current_starting_read].size() == 0) current_starting_offset = 0; else current_starting_offset = trace_pts[current_starting_read].back() - bedges[current_starting_read]->read_A_match_start_; } /** * Show trace points on reads */ for (int i = 0; i < n_bb_reads; i++) { printf("Read %d:", i); for (int j = 0; j < trace_pts[i].size(); j++) { printf("%d ", trace_pts[i][j]); } printf("\n"); } /** * Show lanes */ for (int i = 0; i < lanes.size(); i++) { printf("Lane %d\n", i); for (int j = 0; j < lanes[i].size(); j++) { printf("[%d %d] ", lanes[i][j].first, lanes[i][j].second); } printf("\n"); } printf("In total %lu lanes\n", lanes.size()); //if (lanes.size() < 2) { // draft_assembly = breads[0]; // out_fa << ">DraftAssemblyContig" << num_contig << std::endl; // out_fa << draft_assembly << std::endl; // num_contig++; // continue; //} int first_start = lanes[0][0].second; int last_end = lanes.back().back().second; std::cout << "first " << first_start << " last " << last_end << std::endl; std::cout << "len " << reads[std::get<0>(edgelist[0]).id]->len << " " << 
reads[std::get<1>(edgelist.back()).id]->len << std::endl; assert(first_start <= reads[std::get<0>(edgelist[0]).id]->len); assert(last_end <= reads[std::get<0>(edgelist.back()).id]->len); std::string prefix = reads[std::get<0>(edgelist[0]).id]->bases.substr(0,first_start); std::string suffix = reads[std::get<0>(edgelist.back()).id]->bases.substr(last_end); printf("last read %d length %d, cut %d\n",std::get<1>(edgelist.back()).id, reads[std::get<1>(edgelist.back()).id]->len, cut_end); cut_end = reads[std::get<1>(edgelist.back()).id]->len - cut_end; /** * Consequtive lanes form a column (ladder) */ std::vector > > ladders; for (int i = 0; i < lanes.size() - 1; i++) { std::vector > lane1 = lanes[i]; std::vector > lane2 = lanes[i + 1]; std::vector > ladder; int pos = 0; for (int j = 0; j < lane2.size(); j++) { while ((lane1[pos].first != lane2[j].first) and (pos < lane1.size() - 1)) pos++; if ((lane1[pos].first == lane2[j].first)) ladder.push_back(std::make_tuple(lane2[j].first, lane1[pos].second, lane2[j].second)); } ladders.push_back(ladder); } /** * show ladders */ for (int i = 0; i < ladders.size(); i++) { // printf("Ladder %d\n", i); // for (int j = 0; j < ladders[i].size(); j++) { // //printf("[%d %d-%d] ", std::get<0>(ladders[i][j]), std::get<1>(ladders[i][j]), std::get<2>(ladders[i][j]) ); // //printf("%s\n", breads[std::get<0>(ladders[i][j])].substr(std::get<1>(ladders[i][j]),std::get<2>(ladders[i][j])-std::get<1>(ladders[i][j])).c_str()); // // } if (ladders[i].size() == 0) { printf("low coverage!\n"); continue; } if (ladders[i].size() > 1) { int mx = 0; int maxcoverage = 0; for (int j = 0; j < ladders[i].size(); j++) { int mincoverage = 10000; int read = std::get<0>(ladders[i][j]); int start = std::get<1>(ladders[i][j]); int end = std::get<2>(ladders[i][j]); for (int pos = start; pos < end; pos++) { if (coverages[read]->at(pos) < mincoverage) mincoverage = coverages[read]->at(pos); } if (mincoverage > maxcoverage) { maxcoverage = mincoverage; mx = j; } } // 
std::cout << "ladder " << i << " num reads " << ladders[i].size() << " possibly error here " << // maxcoverage << "\n!"; //if (ladders[i].size() == 2) { // draft_assembly += breads[std::get<0>(ladders[i][mx])].substr(std::get<1>(ladders[i][mx]), // std::get<2>(ladders[i][mx]) - // std::get<1>(ladders[i][mx])); // continue; // } std::string base = breads[std::get<0>(ladders[i][mx])].substr(std::get<1>(ladders[i][mx]), std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx]));; int seq_count = ladders[i].size(); // printf("seq_count:%d, max %d\n", seq_count, mx); align_tags_t **tags_list; tags_list = (align_tags_t **) calloc(seq_count, sizeof(align_tags_t *)); consensus_data *consensus; int alen = (std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx])); for (int j = 0; j < ladders[i].size(); j++) { int blen = (std::get<2>(ladders[i][j]) - std::get<1>(ladders[i][j])); char *aseq = (char *) malloc( (20 + (std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx]))) * sizeof(char)); char *bseq = (char *) malloc( (20 + (std::get<2>(ladders[i][j]) - std::get<1>(ladders[i][j]))) * sizeof(char)); strcpy(aseq, breads[std::get<0>(ladders[i][mx])].substr(std::get<1>(ladders[i][mx]), std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx])).c_str()); strcpy(bseq, breads[std::get<0>(ladders[i][j])].substr(std::get<1>(ladders[i][j]), std::get<2>(ladders[i][j]) - std::get<1>(ladders[i][j])).c_str()); aln_range *arange = (aln_range *) calloc(1, sizeof(aln_range)); arange->s1 = 0; arange->e1 = strlen(bseq); arange->s2 = 0; arange->e2 = strlen(aseq); arange->score = 5; //printf("blen %d alen%d\n",strlen(bseq), strlen(aseq)); //printf("before get tags\n"); alignment *alng = _align(bseq, blen, aseq, alen, 150, 1); char *q_aln_str = (char *) malloc((5 + strlen(alng->q_aln_str)) * sizeof(char)); char *t_aln_str = (char *) malloc((5 + strlen(alng->t_aln_str)) * sizeof(char)); strcpy(q_aln_str + 1, alng->q_aln_str); strcpy(t_aln_str + 1, alng->t_aln_str); q_aln_str[0] = 'T'; 
t_aln_str[0] = 'T'; for (int pos = 0; pos < strlen(q_aln_str); pos++) q_aln_str[pos] = toupper(q_aln_str[pos]); for (int pos = 0; pos < strlen(t_aln_str); pos++) t_aln_str[pos] = toupper(t_aln_str[pos]); //printf("Q:%s\nT:%s\n", q_aln_str, t_aln_str); tags_list[j] = get_align_tags(q_aln_str, t_aln_str, strlen(alng->q_aln_str) + 1, arange, (unsigned int) j, 0); //free(aseq); //free(bseq); /*for (int k = 0; k < tags_list[j]->len; k++) { printf("%d %d %ld %d %c %c\n",j, k, tags_list[j]->align_tags[k].t_pos, tags_list[j]->align_tags[k].delta, //tags_list[j]->align_tags[k].p_q_base, aseq[tags_list[j]->align_tags[k].t_pos], tags_list[j]->align_tags[k].q_base); }*/ free(q_aln_str); free(t_aln_str); free(aseq); free(bseq); free_alignment(alng); } //printf("%d %d\n%s\n",seq_count, strlen(seq), seq); consensus = get_cns_from_align_tags(tags_list, seq_count, alen + 1, 1); // printf("Consensus len :%d\n",strlen(consensus->sequence)); draft_assembly += std::string(consensus->sequence); free_consensus_data(consensus); for (int j = 0; j < seq_count; j++) free_align_tags(tags_list[j]); } else { draft_assembly += breads[std::get<0>(ladders[i][0])].substr(std::get<1>(ladders[i][0]), std::get<2>(ladders[i][0]) - std::get<1>(ladders[i][0])); } // printf("\n"); } /*for (int i = 0; i < mapping.size(); i++) printf("%d %d\n", i, mapping[i]); printf("[%d %d], [%d %d]\n", bedges[0]->read_A_match_start_, bedges[0]->read_A_match_end_, bedges[0]->read_B_match_start_, bedges[0]->read_B_match_end_);*/ std::cout << sequence.size() << std::endl; std::cout << draft_assembly.size() << std::endl; //if (draft_assembly.size() > 0) { // out_fa << ">Draft_assembly" << num_contig << std::endl; // out_fa << draft_assembly << std::endl; //} //num_contig++; contig = prefix + draft_assembly + suffix + overhang; std::cout << "ctg size:" << contig.size() << "cut_start:" << cut_start << "cut_end:" << cut_end << std::endl; if ((cut_start <= contig.size()) and (cut_end <= contig.size())) contig = 
contig.substr(cut_start, contig.size() - cut_end - cut_start); return 0; } int main(int argc, char *argv[]) { cmdline::parser cmdp; cmdp.add("db", 'b', "db file name", false, ""); cmdp.add("las", 'l', "las file name", false, ""); cmdp.add("paf", 'p', "paf file name", false, ""); cmdp.add("config", 'c', "configuration file name", false, ""); cmdp.add("fasta", 'f', "fasta file name", false, ""); cmdp.add("prefix", 'x', "(intermediate output) input file prefix", true, ""); cmdp.add("out", 'o', "final output file name", true, ""); cmdp.add("log", 'g', "log folder name", false, "log"); cmdp.add("path", 0, "path file name", false, "path"); cmdp.add("debug", '\0', "debug mode"); cmdp.add("mlas", '\0', "multiple las files"); // cmdp.add("restrictreads",'r',"restrict to reads in the file",false,""); cmdp.parse_check(argc, argv); LAInterface la; const char *name_db = cmdp.get("db").c_str(); //.db file of reads to load const char *name_las = cmdp.get("las").c_str();//.las file of alignments const char *name_paf = cmdp.get("paf").c_str(); const char *name_fasta = cmdp.get("fasta").c_str(); const char *name_config = cmdp.get("config").c_str();//name of the configuration file, in INI format std::string out = cmdp.get("prefix"); std::string out_name = cmdp.get("out"); std::string path_name = cmdp.get("path"); // const char * name_restrict = cmdp.get("restrictreads").c_str(); std::string name_mask = out + ".mas"; std::string name_max = out + ".max"; std::string name_homo = out + ".homologous.txt"; std::string name_rep = out + ".repeat.txt"; std::string name_hg = out + ".hinges.txt"; std::string name_cov = out + ".coverage.txt"; std::string name_garbage = out + ".garbage.txt"; std::string name_contained = out + ".contained.txt"; std::string name_deadend = out_name + ".deadends.txt"; std::ofstream deadend_out(name_deadend); std::ofstream garbage_out(name_garbage); std::ofstream contained_out(name_contained); std::ifstream homo(name_homo); std::vector homo_reads; bool delete_telomere 
= false; // TODO: command line option to set this true int read_id; while (homo >> read_id) homo_reads.push_back(read_id); namespace spd = spdlog; //auto console = spd::stdout_logger_mt("console"); std::vector sinks; sinks.push_back(std::make_shared()); sinks.push_back( std::make_shared(cmdp.get("log") + "/log", "txt", 23, 59)); auto console = std::make_shared("log", std::begin(sinks), std::end(sinks)); spdlog::register_logger(console); console->info("draft consensus"); console->info("name of db: {}, name of .las file {}", name_db, name_las); console->info("name of fasta: {}, name of .paf file {}", name_fasta, name_paf); console->info("filter files prefix: {}", out); console->info("output prefix: {}", out_name); std::ifstream ini_file(name_config); std::string str((std::istreambuf_iterator(ini_file)), std::istreambuf_iterator()); console->info("Parameters passed in \n{}", str); if (strlen(name_db) > 0) la.openDB(name_db); std::vector name_las_list; std::string name_las_str(name_las); if (cmdp.exist("mlas")) { name_las_list = glob(name_las_str); console->info("calling glob."); } else name_las_list.push_back(name_las_str); //if (strlen(name_las) > 0) // la.openAlignmentFile(name_las); int64 n_aln = 0; //if (strlen(name_las) > 0) { // n_aln = la.getAlignmentNumber(); // console->info("Load alignments from {}", name_las); // console->info("# Alignments: {}", n_aln); //} int n_read; if (strlen(name_db) > 0) { n_read = la.getReadNumber(); } std::vector reads; //Vector of pointers to all reads if (strlen(name_fasta) > 0) { n_read = la.loadFASTA(name_fasta, reads); } console->info("# Reads: {}", n_read); // output some statistics if (strlen(name_db) > 0) { la.getRead(reads, 0, n_read); } std::ifstream max_reads_file(name_max); std::vector maximal_read; maximal_read.resize(n_read, false); std::string read_line; int num_active_reads = 0; while(std::getline(max_reads_file, read_line)) { int read_number; read_number = atoi(read_line.c_str()); maximal_read[read_number] = true; 
num_active_reads++; } console->info("Total number of active reads: {}/{}", num_active_reads, n_read); for (int i = 0; i < n_read; i++){ reads[i]->active = maximal_read[i]; } // start loading and cleaning int number_of_parts; number_of_parts = name_las_list.size(); std::vector range; for (int i = 0; i < n_read; i++) { if (reads[i]->active) range.push_back(i); } std::sort(range.begin(), range.end()); std::vector aln;//Vector of pointers to all alignments std::vector full_aln;//Vector of pointers to all alignments std::vector aln_to_remove;//Vector of pointers to all alignments std::vector full_aln_to_remove;//Vector of pointers to all alignments for (int part = 0; part < number_of_parts; part++) { console->info("part:{}", part); console->info("name of las {}", name_las_list[part]); la.openAlignmentFile(name_las_list[part]); int64 n_aln_part = 0; n_aln_part = la.getAlignmentNumber(); n_aln += n_aln_part; console->info("Load alignment from {}", name_las_list[part]); console->info("# Alignments: {}", n_aln_part); la.resetAlignment(); la.getOverlap(aln_to_remove, range); la.resetAlignment(); la.getAlignment(full_aln_to_remove, range); for (int j = 0; j < aln_to_remove.size(); j++) { if ((reads[aln_to_remove[j]->read_A_id_]->active) && (reads[aln_to_remove[j]->read_B_id_]->active)) { aln.push_back(aln_to_remove[j]); } else delete aln_to_remove[j]; } for (int j = 0; j < full_aln_to_remove.size(); j++) { if ((reads[full_aln_to_remove[j]->read_A_id_]->active) && (reads[full_aln_to_remove[j]->read_B_id_]->active)) { full_aln.push_back(full_aln_to_remove[j]); } else delete full_aln_to_remove[j]; } //need to do some cleaning here aln_to_remove.clear(); full_aln_to_remove.clear(); } //if (strlen(name_las) > 0) { // la.resetAlignment(); // la.getOverlap(aln, range); // la.resetAlignment(); // la.getAlignment(full_aln, range); //} if (strlen(name_paf) > 0) { n_aln = la.loadPAF(std::string(name_paf), aln); console->info("Load alignments from {}", name_paf); console->info("# 
Alignments: {}", n_aln); } if (n_aln == 0) { console->error("No alignments!"); return 1; } console->info("Input data finished"); INIReader reader(name_config); if (reader.ParseError() < 0) { console->warn("Can't load {}", name_config); return 1; } int LENGTH_THRESHOLD = int(reader.GetInteger("filter", "length_threshold", -1)); double QUALITY_THRESHOLD = reader.GetReal("filter", "quality_threshold", 0.0); int N_ITER = (int) reader.GetInteger("filter", "n_iter", -1); int ALN_THRESHOLD = (int) reader.GetInteger("filter", "aln_threshold", -1); int MIN_COV = (int) reader.GetInteger("filter", "min_cov", -1); int CUT_OFF = (int) reader.GetInteger("filter", "cut_off", -1); int THETA = (int) reader.GetInteger("filter", "theta", -1); int THETA2 = (int) reader.GetInteger("filter", "theta2", 0); int N_PROC = (int) reader.GetInteger("running", "n_proc", 4); int HINGE_SLACK = (int) reader.GetInteger("layout", "hinge_slack", 1000); //This is the amount by which a forward overlap //must be longer than a forward internal overlap to be preferred while //building a graph. int HINGE_TOLERANCE = (int) reader.GetInteger("layout", "hinge_tolerance", 150); //This is how far an overlap must start from a hinge to be considered an internal //overlap. 
int KILL_HINGE_OVERLAP_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_overlap", 300); int KILL_HINGE_INTERNAL_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_internal", 40); int MATCHING_HINGE_SLACK = (int) reader.GetInteger("layout", "matching_hinge_slack", 200); int NUM_EVENTS_TELOMERE = (int) reader.GetInteger("layout", "num_events_telomere", 7); int MIN_CONNECTED_COMPONENT_SIZE = (int) reader.GetInteger("layout", "min_connected_component_size", 8); int MIN_COV2 = reader.GetInteger("draft", "min_cov", -1); int EDGE_TRIM = reader.GetInteger("draft", "trim", -1); int EDGE_SAFE = reader.GetInteger("draft", "edge_safe", -1); int TSPACE = reader.GetInteger("draft", "tspace", -1); int STEP = reader.GetInteger("draft", "step", -1); console->info("LENGTH_THRESHOLD = {}", LENGTH_THRESHOLD); console->info("QUALITY_THRESHOLD = {}", QUALITY_THRESHOLD); console->info("ALN_THRESHOLD = {}", ALN_THRESHOLD); console->info("MIN_COV = {}", MIN_COV); console->info("CUT_OFF = {}", CUT_OFF); console->info("THETA = {}", THETA); console->info("N_ITER = {}", N_ITER); console->info("THETA2 = {}", THETA2); console->info("N_PROC = {}", N_PROC); console->info("HINGE_SLACK = {}", HINGE_SLACK); console->info("HINGE_TOLERANCE = {}", HINGE_TOLERANCE); console->info("KILL_HINGE_OVERLAP_ALLOWANCE = {}", KILL_HINGE_OVERLAP_ALLOWANCE); console->info("KILL_HINGE_INTERNAL_ALLOWANCE = {}", KILL_HINGE_INTERNAL_ALLOWANCE); console->info("MATCHING_HINGE_SLACK = {}", MATCHING_HINGE_SLACK); console->info("MIN_CONNECTED_COMPONENT_SIZE = {}", MIN_CONNECTED_COMPONENT_SIZE); omp_set_num_threads(N_PROC); std::vector edgelist, edgelist_ms; // save output to edgelist std::vector > > idx_ab; for (int i = 0; i < n_read; i++) { //An initialisation for loop //TODO Preallocate memory. Much more efficient. 
idx_ab.push_back(std::unordered_map >()); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_] = std::vector(); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_].push_back(aln[i]); } std::unordered_map > idx3; // this is the pileup std::vector > has_overlap(n_read); std::unordered_map > > idx; for (int i = 0; i < n_read; i++) { //has_overlap[i] = std::set(); idx3[i] = std::vector(); } //for (int i = 0; i < aln.size(); i++) // if (aln[i]->active) // idx[std::pair(aln[i]->aid, aln[i]->bid)] = std::vector(); for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { idx[aln[i]->read_A_id_][aln[i]->read_B_id_] = std::vector(); } } for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { has_overlap[aln[i]->read_A_id_].insert(aln[i]->read_B_id_); } } for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { idx3[aln[i]->read_A_id_].push_back(aln[i]); } } std::cout << "add data" << std::endl; for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { idx[aln[i]->read_A_id_][aln[i]->read_B_id_].push_back(aln[i]); } } std::cout << "add data" << std::endl; std::string name_input= out + ".edges.list"; std::ifstream edges_file(name_input); std::string name_output = out_name + ".fasta"; std::ofstream out_fa(name_output); int num_contig = 0; int num_one_read_contig = 0; std::string current_name; std::string edge_line; std::string contig; bool one_read_contig = false; bool two_read_contig = false; int cut_start = 0, cut_end = 0; while (!edges_file.eof()) { std::getline(edges_file, edge_line); std::cout << edge_line << std::endl; if (edge_line.size() == 0) continue; if (edge_line[0] == '>') continue; std::vector tokens = split(edge_line, ' '); if (tokens.size() < 6) std::cout << "Error! Wrong format." 
<< std::endl; Node node0; Node node1; node0.id = std::stoi(tokens[1]); node1.id = std::stoi(tokens[3]); } edges_file.clear(); edges_file.seekg(0, std::ios::beg); while (!edges_file.eof()) { std::getline(edges_file, edge_line); if (edge_line[0] == '>') { std::cout << current_name << std::endl; if (edgelist.size() > 0) { draft_assembly_ctg(edgelist, la, full_aln, idx3, idx, reads, TSPACE, EDGE_SAFE, MIN_COV2, cut_start, cut_end, one_read_contig, two_read_contig, contig); out_fa << current_name << std::endl; out_fa << contig << std::endl; } edgelist.clear(); current_name = edge_line; one_read_contig = false; two_read_contig = false; cut_start = 0; cut_end = 0; continue; } if (edges_file.eof()) { // process edges list std::cout << current_name << std::endl; draft_assembly_ctg(edgelist, la, full_aln, idx3, idx, reads, TSPACE, EDGE_SAFE, MIN_COV2, cut_start, cut_end, one_read_contig, two_read_contig, contig); out_fa << current_name << std::endl; out_fa << contig << std::endl; edgelist.clear(); one_read_contig = false; two_read_contig = false; continue; } std::vector tokens = split(edge_line, ' '); if (tokens.size() < 6) std::cout << "Error! Wrong format." 
<< std::endl; std::cout << edge_line << std::endl; Node node0; Node node1; int w; node0.id = std::stoi(tokens[1]); node0.strand = std::stoi(tokens[2]); node1.id = std::stoi(tokens[3]); node1.strand = std::stoi(tokens[4]);; if (tokens[0] == "O") { w = 0; one_read_contig = true; } else if (tokens[0] == "D") { w = std::stoi(tokens[5]); two_read_contig = true; } else w = std::stoi(tokens[5]); edgelist.push_back(std::make_tuple(node0, node1, w)); if (tokens[0] == "O") { cut_start = std::stoi(tokens[5]); cut_end = std::stoi(tokens[6]); } else if (tokens[0] == "S") { cut_start = std::stoi(tokens[6]); } else if (tokens[0] == "E") { cut_end = std::stoi(tokens[6]); } else if (tokens[0] == "D") { cut_start = std::stoi(tokens[6]); cut_end = std::stoi(tokens[7]); } } if (strlen(name_db) > 0) la.closeDB(); //close database return 0; } HINGE-0.5.0/src/consensus/draft_chopper.cpp000066400000000000000000001115421314415550300205060ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "spdlog/spdlog.h" #include "cmdline.h" #include "INIReader.h" #include "DB.h" #include "align.h" #include "LAInterface.h" #include #include #include extern "C" { #include "common.h" } #define LAST_READ_SYMBOL '$' #define HINGED_EDGE 1 #define UNHINGED_EDGE -1 #define REVERSE_COMPLEMENT_MATCH 1 #define SAME_DIRECTION_MATCH 0 using namespace boost; typedef adjacency_list Graph; typedef std::tuple Edge_w; typedef std::pair Edge_nw; static int ORDER(const void *l, const void *r) { int x = *((int32 *) l); int y = *((int32 *) r); return (x - y); } std::vector get_mapping(std::string aln_tag1, std::string aln_tag2) { int pos = 0; int count = 0; int count2 = 0; std::vector ret; while (pos < aln_tag1.size()) { if (aln_tag1[pos] != '-') { ret.push_back(count2); count ++; } if (aln_tag2[pos] != '-') { count2 ++; } pos++; } return ret; } std::string reverse_complement(std::string seq) { static std::map m = {{'a','t'}, {'c','g'}, 
{'g','c'}, {'t','a'}, {'A','T'}, {'C','G'}, {'T','A'}, {'G','C'}, {'n','n'}, {'N', 'N'}, {'-', '-'}}; std::reverse(seq.begin(), seq.end()); for (int i = 0; i < seq.size(); i++) { seq[i] = m[seq[i]]; } return seq; } std::ostream& operator<<(std::ostream& out, const MatchType value){ static std::map strings; if (strings.size() == 0){ #define INSERT_ELEMENT(p) strings[p] = #p INSERT_ELEMENT(FORWARD); INSERT_ELEMENT(BACKWARD); INSERT_ELEMENT(ACOVERB); INSERT_ELEMENT(BCOVERA); INSERT_ELEMENT(INTERNAL); INSERT_ELEMENT(UNDEFINED); INSERT_ELEMENT(NOT_ACTIVE); #undef INSERT_ELEMENT } return out << strings[value]; } std::vector &split(const std::string &s, char delim, std::vector &elems) { std::stringstream ss(s); std::string item; while (std::getline(ss, item, delim)) { elems.push_back(item); } return elems; } std::vector split(const std::string &s, char delim) { std::vector elems; split(s, delim, elems); return elems; } bool compare_overlap(LOverlap * ovl1, LOverlap * ovl2) { return ((ovl1->read_A_match_end_ - ovl1->read_A_match_start_ + ovl1->read_B_match_end_ - ovl1->read_B_match_start_) > (ovl2->read_A_match_end_ - ovl2->read_A_match_start_ + ovl2->read_B_match_end_ - ovl2->read_B_match_start_)); } bool compare_overlap_weight(LOverlap * ovl1, LOverlap * ovl2) { return (ovl1->weight > ovl2->weight); } bool compare_overlap_abpos(LOverlap * ovl1, LOverlap * ovl2) { return ovl1->read_A_match_start_ < ovl2->read_A_match_start_; } bool compare_overlap_aepos(LOverlap * ovl1, LOverlap * ovl2) { return ovl1->read_A_match_start_ > ovl2->read_A_match_start_; } int main(int argc, char *argv[]) { cmdline::parser cmdp; cmdp.add("db", 'b', "db file name", false, ""); cmdp.add("las", 'l', "las file name", false, ""); cmdp.add("paf", 'p', "paf file name", false, ""); cmdp.add("config", 'c', "configuration file name", false, ""); cmdp.add("fasta", 'f', "fasta file name", false, ""); cmdp.add("prefix", 'x', "(intermediate output) input file prefix", true, ""); cmdp.add("out", 'o', "final 
output file name", true, ""); cmdp.add("log", 'g', "log folder name", false, "log"); cmdp.add("path", 0, "path file name", false, "path"); cmdp.add("debug", '\0', "debug mode"); // cmdp.add("restrictreads",'r',"restrict to reads in the file",false,""); cmdp.parse_check(argc, argv); LAInterface la; const char *name_db = cmdp.get("db").c_str(); //.db file of reads to load const char *name_las = cmdp.get("las").c_str();//.las file of alignments const char *name_paf = cmdp.get("paf").c_str(); const char *name_fasta = cmdp.get("fasta").c_str(); const char *name_config = cmdp.get("config").c_str();//name of the configuration file, in INI format std::string out = cmdp.get("prefix"); std::string out_name = cmdp.get("out"); std::string path_name = cmdp.get("path"); // const char * name_restrict = cmdp.get("restrictreads").c_str(); std::string name_mask = out + ".mas"; std::string name_max = out + ".max"; std::string name_homo = out + ".homologous.txt"; std::string name_rep = out + ".repeat.txt"; std::string name_hg = out + ".hinges.txt"; std::string name_cov = out + ".coverage.txt"; std::string name_garbage = out + ".garbage.txt"; std::string name_contained = out + ".contained.txt"; std::string name_deadend = out_name + ".deadends.txt"; std::ofstream deadend_out(name_deadend); std::ofstream maximal_reads(name_max); std::ofstream garbage_out(name_garbage); std::ofstream contained_out(name_contained); std::ifstream homo(name_homo); std::vector homo_reads; bool delete_telomere = false; // TODO: command line option to set this true int read_id; while (homo >> read_id) homo_reads.push_back(read_id); namespace spd = spdlog; //auto console = spd::stdout_logger_mt("console"); std::vector sinks; sinks.push_back(std::make_shared()); sinks.push_back( std::make_shared(cmdp.get("log") + "/log", "txt", 23, 59)); auto console = std::make_shared("log", std::begin(sinks), std::end(sinks)); spdlog::register_logger(console); console->info("draft consensus"); if (cmdp.exist("debug")) { char 
*buff = (char *) malloc(sizeof(char) * 2000); getwd(buff); console->info("current user {}, current working directory {}", getlogin(), buff); free(buff); } console->info("name of db: {}, name of .las file {}", name_db, name_las); console->info("name of fasta: {}, name of .paf file {}", name_fasta, name_paf); console->info("filter files prefix: {}", out); console->info("output prefix: {}", out_name); std::ifstream ini_file(name_config); std::string str((std::istreambuf_iterator(ini_file)), std::istreambuf_iterator()); console->info("Parameters passed in \n{}", str); if (strlen(name_db) > 0) la.openDB(name_db); if (strlen(name_las) > 0) la.openAlignmentFile(name_las); int64 n_aln = 0; if (strlen(name_las) > 0) { n_aln = la.getAlignmentNumber(); console->info("Load alignments from {}", name_las); console->info("# Alignments: {}", n_aln); } int n_read; if (strlen(name_db) > 0) n_read = la.getReadNumber(); std::vector reads; //Vector of pointers to all reads if (strlen(name_fasta) > 0) { n_read = la.loadFASTA(name_fasta, reads); } console->info("# Reads: {}", n_read); // output some statistics std::vector aln;//Vector of pointers to all alignments if (strlen(name_las) > 0) { la.resetAlignment(); la.getOverlap(aln, 0, n_aln); } if (strlen(name_paf) > 0) { n_aln = la.loadPAF(std::string(name_paf), aln); console->info("Load alignments from {}", name_paf); console->info("# Alignments: {}", n_aln); } if (n_aln == 0) { console->error("No alignments!"); return 1; } if (strlen(name_db) > 0) { la.getRead(reads, 0, n_read); } console->info("Input data finished"); INIReader reader(name_config); if (reader.ParseError() < 0) { console->warn("Can't load {}", name_config); return 1; } int LENGTH_THRESHOLD = int(reader.GetInteger("filter", "length_threshold", -1)); double QUALITY_THRESHOLD = reader.GetReal("filter", "quality_threshold", 0.0); int N_ITER = (int) reader.GetInteger("filter", "n_iter", -1); int ALN_THRESHOLD = (int) reader.GetInteger("filter", "aln_threshold", -1); int 
MIN_COV = (int) reader.GetInteger("filter", "min_cov", -1); int CUT_OFF = (int) reader.GetInteger("filter", "cut_off", -1); int THETA = (int) reader.GetInteger("filter", "theta", -1); int THETA2 = (int) reader.GetInteger("filter", "theta2", 0); int N_PROC = (int) reader.GetInteger("running", "n_proc", 4); int HINGE_SLACK = (int) reader.GetInteger("layout", "hinge_slack", 1000); //This is the amount by which a forward overlap //must be longer than a forward internal overlap to be preferred while //building a graph. int HINGE_TOLERANCE = (int) reader.GetInteger("layout", "hinge_tolerance", 150); //This is how far an overlap must start from a hinge to be considered an internal //overlap. int KILL_HINGE_OVERLAP_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_overlap", 300); int KILL_HINGE_INTERNAL_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_internal", 40); int MATCHING_HINGE_SLACK = (int) reader.GetInteger("layout", "matching_hinge_slack", 200); int NUM_EVENTS_TELOMERE = (int) reader.GetInteger("layout", "num_events_telomere", 7); int MIN_CONNECTED_COMPONENT_SIZE = (int) reader.GetInteger("layout", "min_connected_component_size", 8); int MIN_COV2 = reader.GetInteger("draft", "min_cov", -1); int EDGE_TRIM = reader.GetInteger("draft", "trim", -1); int EDGE_SAFE = reader.GetInteger("draft", "edge_safe", -1); int TSPACE = reader.GetInteger("draft", "tspace", -1); int STEP = reader.GetInteger("draft", "step", -1); console->info("LENGTH_THRESHOLD = {}", LENGTH_THRESHOLD); console->info("QUALITY_THRESHOLD = {}", QUALITY_THRESHOLD); console->info("ALN_THRESHOLD = {}", ALN_THRESHOLD); console->info("MIN_COV = {}", MIN_COV); console->info("CUT_OFF = {}", CUT_OFF); console->info("THETA = {}", THETA); console->info("N_ITER = {}", N_ITER); console->info("THETA2 = {}", THETA2); console->info("N_PROC = {}", N_PROC); console->info("HINGE_SLACK = {}", HINGE_SLACK); console->info("HINGE_TOLERANCE = {}", HINGE_TOLERANCE); 
console->info("KILL_HINGE_OVERLAP_ALLOWANCE = {}", KILL_HINGE_OVERLAP_ALLOWANCE); console->info("KILL_HINGE_INTERNAL_ALLOWANCE = {}", KILL_HINGE_INTERNAL_ALLOWANCE); console->info("MATCHING_HINGE_SLACK = {}", MATCHING_HINGE_SLACK); console->info("MIN_CONNECTED_COMPONENT_SIZE = {}", MIN_CONNECTED_COMPONENT_SIZE); omp_set_num_threads(N_PROC); std::vector edgelist, edgelist_ms; // save output to edgelist std::vector > > idx_ab; for (int i = 0; i < n_read; i++) { //An initialisation for loop //TODO Preallocate memory. Much more efficient. idx_ab.push_back(std::unordered_map >()); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_] = std::vector(); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_].push_back(aln[i]); } std::unordered_map > idx3; // this is the pileup std::vector > has_overlap(n_read); std::unordered_map > > idx; for (int i = 0; i < n_read; i++) { //has_overlap[i] = std::set(); idx3[i] = std::vector(); } //for (int i = 0; i < aln.size(); i++) // if (aln[i]->active) // idx[std::pair(aln[i]->aid, aln[i]->bid)] = std::vector(); for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { idx[aln[i]->read_A_id_][aln[i]->read_B_id_] = std::vector(); } } for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { has_overlap[aln[i]->read_A_id_].insert(aln[i]->read_B_id_); } } for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { idx3[aln[i]->read_A_id_].push_back(aln[i]); } } std::cout << "add data" << std::endl; for (int i = 0; i < aln.size(); i++) { if (aln[i]->active) { idx[aln[i]->read_A_id_][aln[i]->read_B_id_].push_back(aln[i]); } } std::cout << "add data" << std::endl; std::string name_input= out + ".edges.list"; std::ifstream edges_file(name_input); std::string name_output = out_name + ".mega.fasta"; std::ofstream out_fa(name_output); std::string name_output_orig = out_name + ".fasta"; std::ofstream out_fa_orig(name_output_orig); int num_contig = 0; int num_one_read_contig 
= 0; while (true) { if (edges_file.eof()) break; edgelist.clear(); std::string edge_line; while (!edges_file.eof()) { std::getline(edges_file, edge_line); //std::cout << edge_line << std::endl; std::vector tokens = split(edge_line, ' '); if (tokens.size() == 1) { break; } //std::cout << tokens.size() << std::endl; Node node0; Node node1; int w; if (tokens.size() > 5 ) { node0.id = std::stoi(tokens[0]); node0.strand = std::stoi(tokens[1]); node1.id = std::stoi(tokens[2]); node1.strand = std::stoi(tokens[3]);; w = std::stoi(tokens[4]); edgelist.push_back(std::make_tuple(node0, node1, w)); } if (tokens.size() == 4) { out_fa << ">OneReadContig" << num_one_read_contig << std::endl; out_fa_orig << ">OneReadContig" << num_one_read_contig << std::endl; int node_id = std::stoi(tokens[0]); int node_strand = std::stoi(tokens[1]); int from = std::stoi(tokens[2]); int to = std::stoi(tokens[3]); std::string current_seq; if (node_strand == 0) current_seq = reads[node_id]->bases; else current_seq = reverse_complement(reads[node_id]->bases); out_fa << current_seq.substr(from, to-from) << std::endl; out_fa_orig << current_seq.substr(from, to-from) << std::endl; num_one_read_contig++; } } std::cout << "list size:" << edgelist.size() << std::endl; if (edgelist.size() == 0) continue; std::vector full_alns; std::vector selected; std::unordered_map> idx_aln; la.resetAlignment(); std::vector range; for (int i = 0; i < edgelist.size(); i++) { range.push_back(std::get<0>(edgelist[i]).id); idx_aln[std::get<0>(edgelist[i]).id] = std::vector(); } std::sort(range.begin(), range.end()); la.getAlignment(full_alns, range); for (auto i:full_alns) { idx_aln[i->read_A_id_].push_back(i); } for (int i = 0; i < edgelist.size(); i++) { int aid = std::get<0>(edgelist[i]).id; int bid = std::get<1>(edgelist[i]).id; bool found = false; for (int j = 0; j < idx_aln[std::get<0>(edgelist[i]).id].size(); j++) { //printf("%d %d %d %d\n",bid, idx_aln[aid][j]->bid, idx_aln[aid][j]->read_A_match_end_ - 
idx_aln[aid][j]->read_A_match_start_, std::get<2>(edgelist[i])); if ((idx_aln[aid][j]->read_B_id_ == bid) and \ (idx_aln[aid][j]->aepos - idx_aln[aid][j]->abpos + idx_aln[aid][j]->bepos - idx_aln[aid][j]->bbpos == std::get<2>(edgelist[i]))) { selected.push_back(idx_aln[aid][j]); found = true; break; } if (found) continue; } } std::cout << "selected:" << selected.size() << std::endl; std::unordered_map > > aln_tags_map; std::vector > aln_tags_list; std::vector > aln_tags_list_true_strand; for (int i = 0; i < selected.size(); i++) { la.recoverAlignment(selected[i]); //printf("%d %d\n",selected[i]->tlen, selected[i]->trace_pts_len); std::pair res = la.getAlignmentTags(selected[i]); aln_tags_map[selected[i]->read_A_id_][selected[i]->read_B_id_] = res; aln_tags_list.push_back(res); } std::string sequence = ""; std::vector bedges; std::vector breads; std::vector > > pitfalls; range.clear(); for (int i = 0; i < edgelist.size(); i++) { range.push_back(std::get<0>(edgelist[i]).id); } std::vector *> coverages; for (int i = 0; i < range.size(); i++) { int aread = range[i]; if (idx3[aread].size() > 0) { std::vector *res = la.getCoverage(idx3[aread]); std::vector > *res2 = la.lowCoverageRegions(*res, MIN_COV2); //delete res; coverages.push_back(res); //printf("%d %d: (%d %d) ", i, aread, 0, idx3[aread][0]->alen); //for (int j = 0; j < res2->size(); j++) { // printf("[%d %d] ", res2->at(j).first, res2->at(j).second); //} //printf("\n"); pitfalls.push_back(*res2); delete res2; } } /*** * Prepare the data */ for (int i = 0; i < edgelist.size(); i++) { std::vector currentalns = idx[std::get<0>(edgelist[i]).id][std::get<1>(edgelist[i]).id]; LOverlap *currentaln = NULL; for (int j = 0; j < currentalns.size(); j++) { //std::cout << std::get<0>(edgelist[i]).id << " " << std::get<1>(edgelist[i]).id << " " << currentalns[j]->match_type_ << std::endl; if (currentalns[j]->read_A_match_end_ - currentalns[j]->read_A_match_start_ + currentalns[j]->read_B_match_end_ - 
currentalns[j]->read_B_match_start_ == std::get<2>(edgelist[i])) currentaln = currentalns[j]; } if (currentaln == NULL) exit(1); //currentaln->show(); std::string current_seq; std::string next_seq; std::string aln_tags1; std::string aln_tags2; if (std::get<0>(edgelist[i]).strand == 0) current_seq = reads[std::get<0>(edgelist[i]).id]->bases; else current_seq = reverse_complement(reads[std::get<0>(edgelist[i]).id]->bases); if (std::get<0>(edgelist[i]).strand == 0) { aln_tags1 = aln_tags_list[i].first; aln_tags2 = aln_tags_list[i].second; } else { aln_tags1 = reverse_complement(aln_tags_list[i].first); aln_tags2 = reverse_complement(aln_tags_list[i].second); } aln_tags_list_true_strand.push_back(std::pair(aln_tags1, aln_tags2)); if (std::get<1>(edgelist[i]).strand == 0) next_seq = reads[std::get<1>(edgelist[i]).id]->bases; else next_seq = reverse_complement(reads[std::get<1>(edgelist[i]).id]->bases); int abpos, aepos, alen, bbpos, bepos, blen, aes, aee, bes, bee; alen = currentaln->alen; blen = currentaln->blen; if (std::get<0>(edgelist[i]).strand == 0) { abpos = currentaln->read_A_match_start_; aepos = currentaln->read_A_match_end_; aes = currentaln->eff_read_A_read_start_; aee = currentaln->eff_read_A_read_end_; } else { abpos = alen - currentaln->read_A_match_end_; aepos = alen - currentaln->read_A_match_start_; aes = alen - currentaln->eff_read_A_read_end_; aee = alen - currentaln->eff_read_A_read_start_; } if (((std::get<1>(edgelist[i]).strand == 0))) { bbpos = currentaln->read_B_match_start_; bepos = currentaln->read_B_match_end_; bes = currentaln->eff_read_B_read_start_; bee = currentaln->eff_read_B_read_end_; } else { bbpos = blen - currentaln->read_B_match_end_; bepos = blen - currentaln->read_B_match_start_; bes = blen - currentaln->eff_read_B_read_end_; bee = blen - currentaln->eff_read_B_read_start_; } aes = 0; bes = 0; aee = alen; bee = blen; printf("%d %d [[%d %d] << [%d %d]] x [[%d %d] << [%d %d]]\n", std::get<0>(edgelist[i]).id, 
std::get<1>(edgelist[i]).id, abpos, aepos, aes, aee, bbpos, bepos, bes, bee); LOverlap *new_ovl = new LOverlap(); new_ovl->read_A_match_start_ = abpos; new_ovl->read_A_match_end_ = aepos; new_ovl->read_B_match_start_ = bbpos; new_ovl->read_B_match_end_ = bepos; new_ovl->eff_read_A_read_end_ = aee; new_ovl->eff_read_A_read_start_ = aes; new_ovl->eff_read_B_read_end_ = bee; new_ovl->eff_read_B_read_start_ = bes; new_ovl->alen = currentaln->alen; new_ovl->blen = currentaln->blen; new_ovl->read_A_id_ = std::get<0>(edgelist[i]).id; new_ovl->read_B_id_ = std::get<1>(edgelist[i]).id; bedges.push_back(new_ovl); breads.push_back(current_seq); } //need to trim the end std::vector > mappings; for (int i = 0; i < range.size(); i++) { mappings.push_back(get_mapping(aln_tags_list_true_strand[i].first, aln_tags_list_true_strand[i].second)); } std::cout << bedges.size() << " " << breads.size() << " " << selected.size() << " " << aln_tags_list.size() << " " << pitfalls.size() << " " << aln_tags_list_true_strand.size() << " " << mappings.size() << " " << coverages.size() << std::endl; /*for (int i = 0; i < bedges.size() - 1; i++) { printf("%d %d %d %d %d\n", bedges[i]->read_B_match_start_, bedges[i]->read_B_match_end_, bedges[i+1]->read_A_match_start_, bedges[i+1]->read_A_match_end_, bedges[i]->read_B_match_end_ - bedges[i+1]->read_A_match_start_); }*/ int tspace = TSPACE; // set lane length to be 500 int nlane = 0; //printf("%d %d\n", mappings[0][800], mappings[0][1000]); // debug output //printf("%s\n%s\n", breads[0].substr(bedges[0]->read_A_match_start_ + 800, 50).c_str(), // breads[1].substr(bedges[0]->read_B_match_start_ + mappings[0][800], 50).c_str()); //debug output std::vector>> lanes; std::string draft_assembly = ""; int currentlane = 0; int current_starting_read = 0; int current_starting_space = 1; int current_starting_offset = 0; int n_bb_reads = range.size(); std::vector> trace_pts(n_bb_reads); bool revert = false; int rmax = -1; /** * Move forward and put "trace 
points" */ while (current_starting_read < n_bb_reads - 1) { int currentread = current_starting_read; int additional_offset = 0; while (bedges[current_starting_read]->read_A_match_start_ + current_starting_space * tspace + current_starting_offset + additional_offset < bedges[current_starting_read]->read_A_match_end_ - EDGE_SAFE) { int waypoint = bedges[current_starting_read]->read_A_match_start_ + tspace * current_starting_space + current_starting_offset + additional_offset; //if ((waypoint - bedges[current_starting_read]->read_A_match_start_) < EDGE_SAFE) // waypoint += EDGE_SAFE; //int next_waypoint = mappings[currentread][waypoint - bedges[current_starting_read]->read_A_match_start_] + bedges[current_starting_read]->read_B_match_start_; std::vector > lane; while ((waypoint > bedges[currentread]->read_A_match_start_) and (waypoint < bedges[currentread]->read_A_match_end_)) { printf("%d %d\n", currentread, waypoint); trace_pts[currentread].push_back(waypoint); /*if (waypoint > bedges[currentread]->read_A_match_end_ - EDGE_SAFE) { printf("Reaching the end, neglect low coverage\n"); } if ((coverages[currentread]->at(waypoint) < MIN_COV2) and (waypoint < bedges[currentread]->read_A_match_end_ - EDGE_SAFE)) { revert = true; printf("Low coverage, revert\n"); break; }*/ lane.push_back(std::pair(currentread, waypoint)); if (currentread > rmax) rmax = currentread; //int previous_wp = waypoint; waypoint = mappings[currentread][waypoint - bedges[currentread]->read_A_match_start_] + bedges[currentread]->read_B_match_start_; //printf("%s\n%s\n", breads[currentread].substr(previous_wp,50).c_str(), breads[currentread+1].substr(waypoint,50).c_str()); currentread++; if (currentread >= n_bb_reads) break; } if (currentread < n_bb_reads) if (waypoint < bedges[currentread]->alen) { lane.push_back(std::pair(currentread, waypoint)); if (currentread > rmax) rmax = currentread; } /*if (revert) { printf("revert\n"); revert = false; while (currentread >= current_starting_read) { 
trace_pts[currentread].pop_back(); currentread --; additional_offset += STEP; } currentread = current_starting_read; } else*/ { if (currentread >= rmax) lanes.push_back(lane); current_starting_space++; currentread = current_starting_read; } } current_starting_read++; current_starting_space = 1;//get next space; if (trace_pts[current_starting_read].size() == 0) current_starting_offset = 0; else current_starting_offset = trace_pts[current_starting_read].back() - bedges[current_starting_read]->read_A_match_start_; } /** * Show trace points on reads */ for (int i = 0; i < n_bb_reads; i++) { printf("Read %d:", i); for (int j = 0; j < trace_pts[i].size(); j++) { printf("%d ", trace_pts[i][j]); } printf("\n"); } /** * Show lanes */ for (int i = 0; i < lanes.size(); i++) { printf("Lane %d\n", i); for (int j = 0; j < lanes[i].size(); j++) { printf("[%d %d] ", lanes[i][j].first, lanes[i][j].second); } printf("\n"); } printf("In total %d lanes\n", lanes.size()); if (lanes.size() == 0) { draft_assembly = breads[0]; out_fa << ">DraftAssemblyContig" << num_contig << std::endl; out_fa << draft_assembly << std::endl; num_contig++; continue; } /** * Consequtive lanes form a column (ladder) */ std::vector > > ladders; for (int i = 0; i < lanes.size() - 1; i++) { std::vector > lane1 = lanes[i]; std::vector > lane2 = lanes[i + 1]; std::vector > ladder; int pos = 0; for (int j = 0; j < lane2.size(); j++) { while ((lane1[pos].first != lane2[j].first) and (pos < lane1.size() - 1)) pos++; if ((lane1[pos].first == lane2[j].first)) ladder.push_back(std::make_tuple(lane2[j].first, lane1[pos].second, lane2[j].second)); } ladders.push_back(ladder); } /** * show ladders */ for (int i = 0; i < ladders.size(); i++) { printf("Ladder %d\n", i); for (int j = 0; j < ladders[i].size(); j++) { //printf("[%d %d-%d] ", std::get<0>(ladders[i][j]), std::get<1>(ladders[i][j]), std::get<2>(ladders[i][j]) ); //printf("%s\n", 
breads[std::get<0>(ladders[i][j])].substr(std::get<1>(ladders[i][j]),std::get<2>(ladders[i][j])-std::get<1>(ladders[i][j])).c_str()); } if (ladders[i].size() == 0) { printf("low coverage!\n"); continue; } if (ladders[i].size() > 1) { int mx = 0; int maxcoverage = 0; for (int j = 0; j < ladders[i].size(); j++) { int mincoverage = 10000; int read = std::get<0>(ladders[i][j]); int start = std::get<1>(ladders[i][j]); int end = std::get<2>(ladders[i][j]); for (int pos = start; pos < end; pos++) { if (coverages[read]->at(pos) < mincoverage) mincoverage = coverages[read]->at(pos); } if (mincoverage > maxcoverage) { maxcoverage = mincoverage; mx = j; } } std::cout << "ladder " << i << " num reads " << ladders[i].size() << " possibly error here " << maxcoverage << "\n!"; //if (ladders[i].size() == 2) { // draft_assembly += breads[std::get<0>(ladders[i][mx])].substr(std::get<1>(ladders[i][mx]), // std::get<2>(ladders[i][mx]) - // std::get<1>(ladders[i][mx])); // continue; // } std::string base = breads[std::get<0>(ladders[i][mx])].substr(std::get<1>(ladders[i][mx]), std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx]));; int seq_count = ladders[i].size(); printf("seq_count:%d, max %d\n", seq_count, mx); align_tags_t **tags_list; tags_list = (align_tags_t **) calloc(seq_count, sizeof(align_tags_t *)); consensus_data *consensus; int alen = (std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx])); for (int j = 0; j < ladders[i].size(); j++) { int blen = (std::get<2>(ladders[i][j]) - std::get<1>(ladders[i][j])); char *aseq = (char *) malloc( (20 + (std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx]))) * sizeof(char)); char *bseq = (char *) malloc( (20 + (std::get<2>(ladders[i][j]) - std::get<1>(ladders[i][j]))) * sizeof(char)); strcpy(aseq, breads[std::get<0>(ladders[i][mx])].substr(std::get<1>(ladders[i][mx]), std::get<2>(ladders[i][mx]) - std::get<1>(ladders[i][mx])).c_str()); strcpy(bseq, breads[std::get<0>(ladders[i][j])].substr(std::get<1>(ladders[i][j]), 
std::get<2>(ladders[i][j]) - std::get<1>(ladders[i][j])).c_str()); aln_range *arange = (aln_range *) calloc(1, sizeof(aln_range)); arange->s1 = 0; arange->e1 = strlen(bseq); arange->s2 = 0; arange->e2 = strlen(aseq); arange->score = 5; //printf("blen %d alen%d\n",strlen(bseq), strlen(aseq)); //printf("before get tags\n"); alignment *alng = _align(bseq, blen, aseq, alen, 150, 1); char *q_aln_str = (char *) malloc((5 + strlen(alng->q_aln_str)) * sizeof(char)); char *t_aln_str = (char *) malloc((5 + strlen(alng->t_aln_str)) * sizeof(char)); strcpy(q_aln_str + 1, alng->q_aln_str); strcpy(t_aln_str + 1, alng->t_aln_str); q_aln_str[0] = 'T'; t_aln_str[0] = 'T'; for (int pos = 0; pos < strlen(q_aln_str); pos++) q_aln_str[pos] = toupper(q_aln_str[pos]); for (int pos = 0; pos < strlen(t_aln_str); pos++) t_aln_str[pos] = toupper(t_aln_str[pos]); //printf("Q:%s\nT:%s\n", q_aln_str, t_aln_str); tags_list[j] = get_align_tags(q_aln_str, t_aln_str, strlen(alng->q_aln_str) + 1, arange, (unsigned int) j, 0); //free(aseq); //free(bseq); /*for (int k = 0; k < tags_list[j]->len; k++) { printf("%d %d %ld %d %c %c\n",j, k, tags_list[j]->align_tags[k].t_pos, tags_list[j]->align_tags[k].delta, //tags_list[j]->align_tags[k].p_q_base, aseq[tags_list[j]->align_tags[k].t_pos], tags_list[j]->align_tags[k].q_base); }*/ free(q_aln_str); free(t_aln_str); free(aseq); free(bseq); free_alignment(alng); } //printf("%d %d\n%s\n",seq_count, strlen(seq), seq); consensus = get_cns_from_align_tags(tags_list, seq_count, alen + 1, 1); printf("Consensus len :%d\n",strlen(consensus->sequence)); draft_assembly += std::string(consensus->sequence); free_consensus_data(consensus); for (int j = 0; j < seq_count; j++) free_align_tags(tags_list[j]); } else { draft_assembly += breads[std::get<0>(ladders[i][0])].substr(std::get<1>(ladders[i][0]), std::get<2>(ladders[i][0]) - std::get<1>(ladders[i][0])); } printf("\n"); } /*for (int i = 0; i < mapping.size(); i++) printf("%d %d\n", i, mapping[i]); printf("[%d %d], [%d 
%d]\n", bedges[0]->read_A_match_start_, bedges[0]->read_A_match_end_, bedges[0]->read_B_match_start_, bedges[0]->read_B_match_end_);*/ std::cout << sequence.size() << std::endl; std::cout << draft_assembly.size() << std::endl; for (int i = 0; i < draft_assembly.size()/25000; i++) { int len = 50000; if (i*25000 + 50000 > draft_assembly.size()) len = draft_assembly.size()-25000*i; out_fa << ">Draft_assembly_" << num_contig << "_" << i << std::endl; out_fa << draft_assembly.substr(i*25000, len) << std::endl; } out_fa_orig << ">Draft_assembly_" << num_contig << std::endl; out_fa_orig << draft_assembly << std::endl; num_contig++; } if (strlen(name_db) > 0) la.closeDB(); //close database return 0; }HINGE-0.5.0/src/consensus/io_base.cpp000066400000000000000000000225051314415550300172670ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include "spdlog/spdlog.h" #include "cmdline.h" #include "INIReader.h" #include "DB.h" #include "align.h" #include "LAInterface.h" #include #include #include #define LAST_READ_SYMBOL '$' #define HINGED_EDGE 1 #define UNHINGED_EDGE -1 #define REVERSE_COMPLEMENT_MATCH 1 #define SAME_DIRECTION_MATCH 0 using namespace boost; typedef adjacency_list Graph; typedef std::tuple Edge_w; typedef std::pair Edge_nw; std::ostream& operator<<(std::ostream& out, const MatchType value){ static std::map strings; if (strings.size() == 0){ #define INSERT_ELEMENT(p) strings[p] = #p INSERT_ELEMENT(FORWARD); INSERT_ELEMENT(BACKWARD); INSERT_ELEMENT(ACOVERB); INSERT_ELEMENT(BCOVERA); INSERT_ELEMENT(INTERNAL); INSERT_ELEMENT(UNDEFINED); INSERT_ELEMENT(NOT_ACTIVE); #undef INSERT_ELEMENT } return out << strings[value]; } bool compare_overlap(LOverlap * ovl1, LOverlap * ovl2) { return ((ovl1->read_A_match_end_ - ovl1->read_A_match_start_ + ovl1->read_B_match_end_ - ovl1->read_B_match_start_) > (ovl2->read_A_match_end_ - ovl2->read_A_match_start_ + ovl2->read_B_match_end_ - 
ovl2->read_B_match_start_)); } bool compare_overlap_weight(LOverlap * ovl1, LOverlap * ovl2) { return (ovl1->weight > ovl2->weight); } bool compare_overlap_abpos(LOverlap * ovl1, LOverlap * ovl2) { return ovl1->read_A_match_start_ < ovl2->read_A_match_start_; } bool compare_overlap_aepos(LOverlap * ovl1, LOverlap * ovl2) { return ovl1->read_A_match_start_ > ovl2->read_A_match_start_; } int main(int argc, char *argv[]) { cmdline::parser cmdp; cmdp.add("db", 'b', "db file name", false, ""); cmdp.add("las", 'l', "las file name", false, ""); cmdp.add("paf", 'p', "paf file name", false, ""); cmdp.add("config", 'c', "configuration file name", false, ""); cmdp.add("fasta", 'f', "fasta file name", false, ""); cmdp.add("prefix", 'x', "(intermediate output) input file prefix", true, ""); cmdp.add("out", 'o', "final output file name", true, ""); cmdp.add("log", 'g', "log folder name", false, "log"); // cmdp.add("restrictreads",'r',"restrict to reads in the file",false,""); cmdp.parse_check(argc, argv); LAInterface la; const char *name_db = cmdp.get("db").c_str(); //.db file of reads to load const char *name_las = cmdp.get("las").c_str();//.las file of alignments const char *name_paf = cmdp.get("paf").c_str(); const char *name_fasta = cmdp.get("fasta").c_str(); const char *name_config = cmdp.get("config").c_str();//name of the configuration file, in INI format std::string out = cmdp.get("prefix"); std::string out_name = cmdp.get("out"); // const char * name_restrict = cmdp.get("restrictreads").c_str(); std::string name_mask = out + ".mas"; std::string name_max = out + ".max"; std::string name_homo = out + ".homologous.txt"; std::string name_rep = out + ".repeat.txt"; std::string name_hg = out + ".hinges.txt"; std::string name_cov = out + ".coverage.txt"; std::string name_garbage = out + ".garbage.txt"; std::string name_contained = out + ".contained.txt"; std::string name_deadend = out_name + ".deadends.txt"; std::ofstream deadend_out(name_deadend); std::ofstream 
maximal_reads(name_max); std::ofstream garbage_out(name_garbage); std::ofstream contained_out(name_contained); std::ifstream homo(name_homo); std::vector homo_reads; bool delete_telomere = false; // TODO: command line option to set this true int read_id; while (homo >> read_id) homo_reads.push_back(read_id); namespace spd = spdlog; //auto console = spd::stdout_logger_mt("console"); std::vector sinks; sinks.push_back(std::make_shared()); sinks.push_back( std::make_shared(cmdp.get("log") + "/log", "txt", 23, 59)); auto console = std::make_shared("log", std::begin(sinks), std::end(sinks)); spdlog::register_logger(console); console->info("Hinging layout"); char *buff = (char *) malloc(sizeof(char) * 2000); getwd(buff); console->info("current user {}, current working directory {}", getlogin(), buff); free(buff); console->info("name of db: {}, name of .las file {}", name_db, name_las); console->info("name of fasta: {}, name of .paf file {}", name_fasta, name_paf); console->info("filter files prefix: {}", out); console->info("output prefix: {}", out_name); std::ifstream ini_file(name_config); std::string str((std::istreambuf_iterator(ini_file)), std::istreambuf_iterator()); console->info("Parameters passed in \n{}", str); if (strlen(name_db) > 0) la.openDB(name_db); if (strlen(name_las) > 0) la.openAlignmentFile(name_las); int64 n_aln = 0; if (strlen(name_las) > 0) { n_aln = la.getAlignmentNumber(); console->info("Load alignments from {}", name_las); console->info("# Alignments: {}", n_aln); } int n_read; if (strlen(name_db) > 0) n_read = la.getReadNumber(); std::vector reads; //Vector of pointers to all reads if (strlen(name_fasta) > 0) { n_read = la.loadFASTA(name_fasta, reads); } console->info("# Reads: {}", n_read); // output some statistics std::vector aln;//Vector of pointers to all alignments if (strlen(name_las) > 0) { la.resetAlignment(); la.getOverlap(aln, 0, n_aln); } if (strlen(name_paf) > 0) { n_aln = la.loadPAF(std::string(name_paf), aln); 
console->info("Load alignments from {}", name_paf); console->info("# Alignments: {}", n_aln); } if (n_aln == 0) { console->error("No alignments!"); return 1; } if (strlen(name_db) > 0) { la.getRead(reads, 0, n_read); } console->info("Input data finished"); INIReader reader(name_config); if (reader.ParseError() < 0) { console->warn("Can't load {}", name_config); return 1; } int LENGTH_THRESHOLD = int(reader.GetInteger("filter", "length_threshold", -1)); double QUALITY_THRESHOLD = reader.GetReal("filter", "quality_threshold", 0.0); int N_ITER = (int) reader.GetInteger("filter", "n_iter", -1); int ALN_THRESHOLD = (int) reader.GetInteger("filter", "aln_threshold", -1); int MIN_COV = (int) reader.GetInteger("filter", "min_cov", -1); int CUT_OFF = (int) reader.GetInteger("filter", "cut_off", -1); int THETA = (int) reader.GetInteger("filter", "theta", -1); int THETA2 = (int) reader.GetInteger("filter", "theta2", 0); int N_PROC = (int) reader.GetInteger("running", "n_proc", 4); int HINGE_SLACK = (int) reader.GetInteger("layout", "hinge_slack", 1000); //This is the amount by which a forward overlap //must be longer than a forward internal overlap to be preferred while //building a graph. int HINGE_TOLERANCE = (int) reader.GetInteger("layout", "hinge_tolerance", 150); //This is how far an overlap must start from a hinge to be considered an internal //overlap. 
int KILL_HINGE_OVERLAP_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_overlap", 300); int KILL_HINGE_INTERNAL_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_internal", 40); int MATCHING_HINGE_SLACK = (int) reader.GetInteger("layout", "matching_hinge_slack", 200); int NUM_EVENTS_TELOMERE = (int) reader.GetInteger("layout", "num_events_telomere", 7); int MIN_CONNECTED_COMPONENT_SIZE = (int) reader.GetInteger("layout", "min_connected_component_size", 8); console->info("LENGTH_THRESHOLD = {}", LENGTH_THRESHOLD); console->info("QUALITY_THRESHOLD = {}", QUALITY_THRESHOLD); console->info("ALN_THRESHOLD = {}", ALN_THRESHOLD); console->info("MIN_COV = {}", MIN_COV); console->info("CUT_OFF = {}", CUT_OFF); console->info("THETA = {}", THETA); console->info("N_ITER = {}", N_ITER); console->info("THETA2 = {}", THETA2); console->info("N_PROC = {}", N_PROC); console->info("HINGE_SLACK = {}", HINGE_SLACK); console->info("HINGE_TOLERANCE = {}", HINGE_TOLERANCE); console->info("KILL_HINGE_OVERLAP_ALLOWANCE = {}", KILL_HINGE_OVERLAP_ALLOWANCE); console->info("KILL_HINGE_INTERNAL_ALLOWANCE = {}", KILL_HINGE_INTERNAL_ALLOWANCE); console->info("MATCHING_HINGE_SLACK = {}", MATCHING_HINGE_SLACK); console->info("MIN_CONNECTED_COMPONENT_SIZE = {}", MIN_CONNECTED_COMPONENT_SIZE); omp_set_num_threads(N_PROC); std::vector edgelist, edgelist_ms; // save output to edgelist std::vector > > idx_ab; if (strlen(name_db) > 0) la.closeDB(); //close database return 0; } HINGE-0.5.0/src/filter/000077500000000000000000000000001314415550300144235ustar00rootroot00000000000000HINGE-0.5.0/src/filter/CMakeLists.txt000066400000000000000000000002721314415550300171640ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.2) add_executable(Reads_filter filter) target_link_libraries(Reads_filter LAInterface ini spdlog) install(TARGETS Reads_filter DESTINATION ${libexec}) 
HINGE-0.5.0/src/filter/filter.cpp000066400000000000000000001316631314415550300164260ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "INIReader.h" #include "spdlog/spdlog.h" #include "DB.h" #include "align.h" #include "LAInterface.h" #include "cmdline.h" std::string lastN(std::string input, int n) { return input.substr(input.size() - n); } inline std::vector glob(const std::string& pat){ using namespace std; glob_t glob_result; int i = 1; std::string search_name; search_name = pat + "."+std::to_string(i)+".las"; std::cout << search_name << endl; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); // std::cout << "Number of files " << glob_result.gl_pathc << std::endl; vector ret; while (glob_result.gl_pathc != 0){ ret.push_back(string(glob_result.gl_pathv[0])); i ++; search_name = pat + "."+std::to_string(i)+".las"; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); // std::cout << "Number of files " << glob_result.gl_pathc << std::endl; } std::cout << "-------------------------"<< std::endl; std::cout << "Number of files " << i-1 << std::endl; std::cout << "Input string " << pat.c_str() << std::endl; std::cout << "-------------------------"<< std::endl; globfree(&glob_result); return ret; } std::vector> Merge(std::vector & intervals, int cutoff) //Returns sections of read a which are covered by overlaps. Each overlap is considered as // . { //std::cout<<"Merge"< > ret; int n = intervals.size(); // Length of the vector intervals if (n == 0) return ret; if(n == 1) { ret.push_back(std::pair(intervals[0]->read_A_match_start_, intervals[0]->read_A_match_end_)); return ret; } //Where is sort defined ? Is this std::sort? 
sort(intervals.begin(),intervals.end(),compare_overlap_abpos); //sort according to left (start position of // overlap beginning on a) int left= intervals[0]->read_A_match_start_ + cutoff, right = intervals[0]->read_A_match_end_ - cutoff; //left, right means maximal possible interval now for(int i = 1; i < n; i++) { //Ovl1 ~ Ovl2 if Ovl1 and Ovl2 have a nonzero intersection. (that is both the b read maps // to the same position on the a read) //This defines a chain of connected overlaps. This for loop returns a a vector ret which // is a pair of if(intervals[i]->read_A_match_start_ + cutoff <= right) { right=std::max(right, intervals[i]->read_A_match_end_ - cutoff); } else { ret.push_back(std::pair(left,right)); left = intervals[i]->read_A_match_start_ + cutoff; right = intervals[i]->read_A_match_end_ - cutoff; } } ret.push_back(std::pair(left,right)); return ret; } //Interval = pair. Defined in LAInterface.h Interval Effective_length(std::vector & intervals, int min_cov) { //Returns //start_pos : the first position at which Read a of the overlaps have at least min_cov matches on it. //end_pos : the last position that the (#overlaps- min_cov)th read (in order of start positions ends). //Should compare_overlap_aepos actually compare read_A_match_end_? If that is done, then the end_pos // will be the last position // on the a read so that all positions beyond have less than min_cov matches on them Interval ret; sort(intervals.begin(),intervals.end(),compare_overlap_abpos); //sort according to left if (intervals.size() > min_cov) { ret.first = intervals[min_cov]->read_A_match_start_; } else ret.first = 0; sort(intervals.begin(),intervals.end(),compare_overlap_aepos); //sort according to left if (intervals.size() > min_cov) { ret.second = intervals[min_cov]->read_A_match_end_; } else ret.second = 0; return ret; } bool bridge(LOverlap* ovl, int s, int e){ //Returns True if [s e] on read a is bridged by ovl. False else. //Put 500 in a typedef perhaps? 
return ((ovl->read_A_match_start_ < s - 500) and (ovl->read_A_match_end_ > e + 500)); } float number_of_bridging_reads(std::vector ovl_reads, int hinge_location, int hinge_type,int threshold){ int num_bridging_reads=0; //int threshold=100; std::vector read_ends; if (hinge_type==1){ for (int i=0; i < ovl_reads.size(); i++){ if ((ovl_reads[i]->read_A_match_start_ > hinge_location-threshold ) and (ovl_reads[i]->read_A_match_start_ < hinge_location+threshold )) read_ends.push_back(ovl_reads[i]->read_A_match_end_); } } else if (hinge_type==-1){ for (int i=0; i < ovl_reads.size(); i++){ if ((ovl_reads[i]->read_A_match_end_ > hinge_location-threshold ) and (ovl_reads[i]->read_A_match_end_ < hinge_location+threshold )) read_ends.push_back(ovl_reads[i]->read_A_match_start_); } } std::sort(read_ends.begin(),read_ends.end(), std::greater()); int start_point=0; int num_bins=0; for (int i=0; i 2 * threshold) { num_bins++; start_point = i; } } return num_bins/((float)1); } int main(int argc, char *argv[]) { mkdir("log",S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); cmdline::parser cmdp; cmdp.add("db", 'b', "db file name", false, ""); cmdp.add("las", 'l', "las file name", false, ""); cmdp.add("paf", 'p', "paf file name", false, ""); cmdp.add("config", 'c', "configuration file name", false, ""); cmdp.add("fasta", 'f', "fasta file name", false, ""); cmdp.add("prefix", 'x', "prefix of (intermediate) output", false, "out"); cmdp.add("restrictreads",'r',"restrict to reads in the file",false,""); cmdp.add("log", 'g', "log folder name", false, "log"); cmdp.add("mlas", '\0', "multiple las files"); cmdp.add("debug", '\0', "debug mode"); cmdp.parse_check(argc, argv); LAInterface la; const char * name_db = cmdp.get("db").c_str(); //.db file of reads to load const char * name_las_base = cmdp.get("las").c_str();//.las file of alignments const char * name_paf = cmdp.get("paf").c_str(); const char * name_fasta = cmdp.get("fasta").c_str(); const char * name_config = cmdp.get("config").c_str();//name of 
the configuration file, in INI format std::string out = cmdp.get("prefix"); bool has_qv = true; const char * name_restrict = cmdp.get("restrictreads").c_str(); namespace spd = spdlog; //auto console = spd::stdout_logger_mt("console",true); std::vector sinks; sinks.push_back(std::make_shared()); sinks.push_back(std::make_shared(cmdp.get("log") + "/log", "txt", 23, 59)); auto console = std::make_shared("log", begin(sinks), end(sinks)); spdlog::register_logger(console); //auto console = std::make_shared("name", begin(sinks), end(sinks)); console->info("Reads filtering"); bool db_and_las, db_or_las, fa_and_paf, fa_or_paf; db_and_las = (strlen(name_db) > 0) and (strlen(name_las_base) > 0); db_or_las = (strlen(name_db) > 0) or (strlen(name_las_base) > 0); fa_and_paf = (strlen(name_fasta) > 0) and (strlen(name_paf) > 0); fa_or_paf = (strlen(name_fasta) > 0) or (strlen(name_paf) > 0); if (db_or_las and fa_or_paf){ console->error("Pass in either a db and a las or a fasta and a paf"); return 1; } if (( not fa_and_paf) and (not db_and_las)){ console->error("Pass in at least one of the following two combinations: a db and a las or a fasta and a paf"); return 1; } std::string name_las_string; if (cmdp.exist("mlas")) { if (not db_and_las){ console->error("--mlas works only with db and las"); return 1; } name_las_string = std::string(name_las_base); } else if (strlen(name_las_base) > 0) { if (lastN(std::string(name_las_base), 4) == ".las") name_las_string = std::string(name_las_base); else name_las_string = std::string(name_las_base) + ".las"; } const char * name_las = name_las_string.c_str(); /** * There are two sets of input, the first is db+las, which corresponds to daligner as an overlapper, * the other is fasta + paf, which corresponds to minimap as an overlapper. 
*/ // std::cout << "here now " << std::endl; console->info("name of db: {}, name of .las file {}", name_db, name_las); console->info("name of fasta: {}, name of .paf file {}", name_fasta, name_paf); std::ifstream ini_file(name_config); std::string str((std::istreambuf_iterator(ini_file)), std::istreambuf_iterator()); console->info("Parameters passed in \n{}", str); if (strlen(name_db) > 0) la.openDB(name_db); std::vector name_las_list; std::string name_las_str(name_las); console->info("Las files: {}", name_las_str); if (cmdp.exist("mlas")) { console->info("Calling glob."); name_las_list = glob(name_las_str); } else name_las_list.push_back(name_las_str); int n_read; if (strlen(name_db) > 0) n_read = la.getReadNumber(); std::vector reads; //Vector of pointers to all reads if (strlen(name_fasta) > 0) { n_read = la.loadFASTA(name_fasta,reads); has_qv = false; } console->info("# Reads: {}", n_read); // output some statistics std::vector> QV; if (strlen(name_db) > 0) { la.getRead(reads,0,n_read); if (la.getQV(QV,0,n_read) != 0) // load QV track from .db file has_qv = false; } if (has_qv) for (int i = 0; i < n_read; i++) { for (int j = 0; j < QV[i].size(); j++) QV[i][j] = int(QV[i][j] < 40); } //Binarize QV vector, 40 is the threshold std::set reads_to_keep, reads_to_keep_initial; char * line = NULL; size_t len = 0; if (strlen(name_restrict) > 0){ FILE * restrict_reads; restrict_reads = fopen(name_restrict, "r"); while (getline(&line, &len, restrict_reads) != -1){ std::stringstream ss; ss.clear(); ss << line; int num; ss >> num; reads_to_keep.insert(num); } fclose(restrict_reads); console->info("Reads to debug loaded from: {}", name_restrict); console->info("Number of reads to debug loaded: {}", reads_to_keep.size()); } else console->info("No debug restrictions."); if (strlen(name_las_list[0].c_str()) > 0) la.openAlignmentFile(name_las_list[0]); // get tspace std::vector > QV_mask(n_read); // QV_mask is the mask based on QV for reads, for each read, it has one pair 
[start, end] if (has_qv) { for (int i = 0; i < n_read; i++) { int s = 0, e = 0; int max = 0, maxs = s, maxe = e; for (int j = 0; j < QV[i].size(); j++) { if ((QV[i][j] == 1) and (j max) { maxe = e ; maxs = s; max = e - s; } s = j+1; e = j+1; } } // get the longest consecutive region that has good QV //printf("maxs %d maxe %d size%d\n",maxs, maxe,QV[i].size()); QV_mask[i] = (std::pair(maxs*la.tspace, maxe*la.tspace)); // tspace the the interval of trace points // create mask by QV } } INIReader reader(name_config); if (reader.ParseError() < 0) { console->warn("Can't load {}", name_config); return 1; } int LENGTH_THRESHOLD = reader.GetInteger("filter", "length_threshold", -1); double QUALITY_THRESHOLD = reader.GetReal("filter", "quality_threshold", 0.0); int N_ITER = reader.GetInteger("filter", "n_iter", -1); int ALN_THRESHOLD = reader.GetInteger("filter", "aln_threshold", -1); int MIN_COV = reader.GetInteger("filter", "min_cov", -1); int CUT_OFF = reader.GetInteger("filter", "cut_off", -1); int THETA = reader.GetInteger("filter", "theta", -1); int N_PROC = reader.GetInteger("running", "n_proc", 4); int EST_COV = reader.GetInteger("filter", "ec", 0); // load the estimated coverage (probably from other programs) from ini file, if it is zero, then estimate it int reso = 40; // resolution of masks, repeat annotation, coverage, etc = 40 basepairs bool use_qv_mask = reader.GetBoolean("filter", "use_qv", true); bool use_coverage_mask = reader.GetBoolean("filter", "coverage", true); int COVERAGE_FRACTION = (int) reader.GetInteger("filter", "coverage_frac_repeat_annotation", 3); const int MIN_REPEAT_ANNOTATION_THRESHOLD = (int) reader.GetInteger("filter", "min_repeat_annotation_threshold", 10); const int MAX_REPEAT_ANNOTATION_THRESHOLD = (int) reader.GetInteger("filter", "max_repeat_annotation_threshold", 20); const int REPEAT_ANNOTATION_GAP_THRESHOLD = (int) reader.GetInteger("filter", "repeat_annotation_gap_threshold",300); //How far two hinges of the same type can be 
const int NO_HINGE_REGION = (int) reader.GetInteger("filter", "no_hinge_region",500); const int HINGE_MIN_SUPPORT = (int) reader.GetInteger("filter", "hinge_min_support", 7); //Minimum number of reads that have to start in a reso length interval to be considered in hinge calling const int HINGE_BIN_PILEUP_THRESHOLD = (int) reader.GetInteger("filter", "hinge_min_pileup", 7); //Minimum number of reads to have in a pileup to consider a hinge bridged const int HINGE_READ_UNBRIDGED_THRESHOLD = (int) reader.GetInteger("filter", "hinge_unbridged", 6); //Number of reads that one has to see before a pileup to declare a potential hinge unbridged int HINGE_BIN_LENGTH = (int) reader.GetInteger("filter", "hinge_bin", 100); //Physical length of the bins considered const int HINGE_TOLERANCE_LENGTH = (int) reader.GetInteger("filter", "hinge_tolerance_length", 100); //Reads starting at +/- HINGE_TOLERANCE_LENGTH are considered reads starting at hinges HINGE_BIN_LENGTH=2*HINGE_TOLERANCE_LENGTH; bool delete_telomere = (int) reader.GetInteger("layout", "del_telomere", 0); console->info("use_qv_mask set to {}",use_qv_mask); use_qv_mask = use_qv_mask and has_qv; console->info("use_qv_mask set to {}",use_qv_mask); omp_set_num_threads(N_PROC); console->info("number processes set to {}", N_PROC); console->info("LENGTH_THRESHOLD = {}",LENGTH_THRESHOLD); console->info("QUALITY_THRESHOLD = {}",QUALITY_THRESHOLD); console->info("N_ITER = {}",N_ITER); console->info("ALN_THRESHOLD = {}",ALN_THRESHOLD); console->info("MIN_COV = {}",MIN_COV); console->info("CUT_OFF = {}",CUT_OFF); console->info("THETA = {}",THETA); console->info("EST_COV = {}",EST_COV); console->info("reso = {}",reso); console->info("use_coverage_mask = {}",use_coverage_mask); console->info("COVERAGE_FRACTION = {}",COVERAGE_FRACTION); console->info("MIN_REPEAT_ANNOTATION_THRESHOLD = {}",MIN_REPEAT_ANNOTATION_THRESHOLD); console->info("MAX_REPEAT_ANNOTATION_THRESHOLD = {}",MAX_REPEAT_ANNOTATION_THRESHOLD); 
console->info("REPEAT_ANNOTATION_GAP_THRESHOLD = {}",REPEAT_ANNOTATION_GAP_THRESHOLD); console->info("NO_HINGE_REGION = {}",NO_HINGE_REGION); console->info("HINGE_MIN_SUPPORT = {}",HINGE_MIN_SUPPORT); console->info("HINGE_BIN_PILEUP_THRESHOLD = {}",HINGE_BIN_PILEUP_THRESHOLD); console->info("HINGE_READ_UNBRIDGED_THRESHOLD = {}",HINGE_READ_UNBRIDGED_THRESHOLD); console->info("HINGE_BIN_LENGTH = {}",HINGE_BIN_LENGTH); console->info("HINGE_TOLERANCE_LENGTH = {}",HINGE_TOLERANCE_LENGTH); std::vector aln;//Vector of pointers to all alignments std::vector< std::vector > > coverages(n_read); std::vector< std::vector > > cutoff_coverages(n_read); std::vector< std::vector > > cgs(n_read); //coverage gradient; std::vector> maskvec; std::vector > > repeat_annotation; std::unordered_map> > hinges; std::ofstream cov(out + ".coverage.txt"); std::ofstream homo(out + ".homologous.txt"); std::ofstream rep(out + ".repeat.txt"); std::ofstream filtered(out + ".filtered.fasta"); std::ofstream hg(out + ".hinges.txt"); std::ofstream mask(out + ".mas"); std::ofstream comask(out + ".cmas"); std::ofstream covflag(out + ".cov.flag"); std::ofstream selfflag(out + ".self.flag"); // std::cout << "LAS list length "<< name_las_list.size() << std::endl; // for (int ind = 0; ind < name_las_list.size() ; ind ++) // std::cout << "name of las: "<< name_las_list[ind] << std::endl; int number_of_parts; if (strlen(name_las) > 0) number_of_parts = name_las_list.size(); else if(strlen(name_paf) > 0) number_of_parts = 1; else { console->error("Need to provide either las and db or paf and fasta"); return 1; } for (int part = 0; part < number_of_parts; part++) { console->info("part: {}", part); if (strlen(name_las) > 0) { console->info("name of las: {}", name_las_list[part]); if (strlen(name_las_list[part].c_str()) > 0) la.openAlignmentFile(name_las_list[part]); } int64 n_aln = 0; if (strlen(name_las) > 0) { n_aln = la.getAlignmentNumber(); console->info("Load alignments from {}", name_las_list[part]); 
console->info("# Alignments: {}", n_aln); } if (strlen(name_las) > 0) { la.resetAlignment(); la.getOverlap(aln, 0, n_read); } if (strlen(name_paf) > 0) { n_aln = la.loadPAF(std::string(name_paf), aln); console->info("Load alignments from {}", name_paf); console->info("# Alignments: {}", n_aln); } if (n_aln == 0) { console->error("No alignments!"); return 1; } console->info("Input data finished, part {}/{}", part + 1, number_of_parts); console->info("length of alignments {}", aln.size()); //if (aln.size() == 0) continue; int r_begin = aln.front()->read_A_id_; int r_end = aln.back()->read_A_id_; console->info("begin {} end {}", r_begin, r_end); std::vector > idx_pileup; // this is the pileup std::vector > idx_pileup_dedup; // this is the deduplicated pileup std::vector > > idx_ab; //unordered_map from (aid, bid) to alignments in a vector std::unordered_map > > self_aln_list; for (int i = 0; i< n_read; i++) { idx_pileup.push_back(std::vector()); idx_pileup_dedup.push_back(std::vector()); idx_ab.push_back(std::unordered_map> ()); repeat_annotation.push_back(std::vector >()); maskvec.push_back(std::pair()); } for (int i = 0; i < aln.size(); i++) { if (aln[i]->read_A_id_ == aln[i]->read_B_id_) { aln[i]->active = false; if (self_aln_list.find(aln[i]->read_A_id_) == self_aln_list.end()) self_aln_list[aln[i]->read_A_id_] = std::vector>(); self_aln_list[aln[i]->read_A_id_].push_back(std::pair(aln[i]->read_A_match_start_, aln[i]->read_A_match_end_)); self_aln_list[aln[i]->read_A_id_].push_back(std::pair(aln[i]->read_B_match_start_, aln[i]->read_B_match_end_)); } if (aln[i]->active) { idx_pileup[aln[i]->read_A_id_].push_back(aln[i]); } } std::set self_match_reads; for (auto it : self_aln_list) { float cov = 0.0; for (int i = 0; i < it.second.size(); i++) cov += it.second[i].second - it.second[i].first; cov /= float(reads[it.first]->len); // std::cout << "selfcov: " << it.first << " " << cov << " " << reads[it.first]->len << std::endl; if ((cov > 4.5) and (reads[it.first]->len 
> 10000)) self_match_reads.insert(it.first); } for (int i = 0; i < n_read; i++) {// sort overlaps of a reads std::sort(idx_pileup[i].begin(), idx_pileup[i].end(), compare_overlap); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_] = std::vector(); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_].push_back(aln[i]); } for (int i = 0; i < n_read; i++) { for (std::unordered_map >::iterator it = idx_ab[i].begin(); it!= idx_ab[i].end(); it++) { std::sort(it->second.begin(), it->second.end(), compare_overlap); if (it->second.size() > 0) idx_pileup_dedup[i].push_back(it->second[0]); } } console->info("profile coverage (with and without CUT_OFF)"); //std::vector< std::vector > > his; for (int i = r_begin; i <= r_end; i ++) { std::vector > coverage; std::vector > cutoff_coverage; //TODO : Implement set based gradient std::vector > cg; //profileCoverage: get the coverage based on pile-o-gram la.profileCoverage(idx_pileup[i], cutoff_coverage, reso, CUT_OFF); la.profileCoverage(idx_pileup[i], coverage, reso, 0); cov << "read " << i <<" "; for (int j = 0; j < coverage.size(); j++) cov << coverage[j].first << "," << coverage[j].second << " "; cov << std::endl; //Computes coverage gradients. 
if (coverage.size() >= 2) for (int j = 0; j < coverage.size() - 1; j++) { cg.push_back(std::pair(coverage[j].first, coverage[j+1].second - coverage[j].second)); } else cg.push_back(std::pair (0,0)); coverages[i] = (coverage); cutoff_coverages[i] = (cutoff_coverage); cgs[i] = (cg); } console->info("profile coverage done part {}/{}", part + 1, number_of_parts); std::set rand_reads; srand(time(NULL)); rand_reads.insert(0); int temp_index(0); while (rand_reads.size() < (r_end - r_begin)/500){ temp_index ++; int rd_id=rand()%(r_end - r_begin) + r_begin; if (reads[rd_id]->len > 5000) rand_reads.insert(rd_id); if (temp_index > 20000) break; } int num_slot = 0; long int total_cov = 0; std::vector read_coverage; long int read_cov=0; int read_slot =0; //Finding the average coverage, probing a small proportion of reads // for (std::set::iterator it=rand_reads.begin();it!=rand_reads.end(); ++it) { for (int i =r_begin; i <= r_end; i++){ if (reads[i]->len < 5000) continue; read_cov=0; read_slot=0; for (int j = 0; j < coverages[i].size(); j++) { //printf("%d\n", coverages[i][j].second); read_cov+=coverages[i][j].second; read_slot++; } total_cov += read_cov; num_slot += read_slot; int mean_read_cov=read_cov / std::max(1,read_slot); read_coverage.push_back(mean_read_cov); } size_t median_id = read_coverage.size() / 2; if (median_id > 0) std::nth_element(read_coverage.begin(), read_coverage.begin()+median_id, read_coverage.end()); int cov_est= read_coverage[median_id]; int mean_cov_est = total_cov / num_slot; //get estimated coverage if (EST_COV != 0) cov_est = EST_COV; console->info("Estimated mean coverage: {}", mean_cov_est); //if the coverage is specified by ini file, cover the estimated one console->info("Estimated median coverage: {}", cov_est); // mask vector, same format as mask_QV if (MIN_COV < cov_est/3) MIN_COV = cov_est/3; if (reads_to_keep.size()>0) { reads_to_keep_initial = reads_to_keep; for (std::set::iterator iter = reads_to_keep_initial.begin(); iter != 
reads_to_keep_initial.end(); ++iter) { int i = *iter; for (std::unordered_map >::iterator it = idx_ab[i].begin(); it != idx_ab[i].end(); it++) { if (it->second.size() > 0) { LOverlap *ovl = it->second[0]; reads_to_keep.insert(ovl->read_B_id_); } } } console->info("After accounting for neighbours of reads selected, have {} reads", reads_to_keep.size()); } for (int i = r_begin; i <= r_end; i++) { for (int j = 0; j < cutoff_coverages[i].size(); j++) { cutoff_coverages[i][j].second -= MIN_COV; if (cutoff_coverages[i][j].second < 0) cutoff_coverages[i][j].second = 0; } // std::cout << "in here " << i << std::endl; //get the longest consecutive region that has decent coverage, decent coverage = estimated coverage / 3 int start = 0; int end = start; int maxlen = 0, maxstart = 0, maxend = 0; int start_coord = 0, end_coord = 0; int max_start_coord = 0, max_end_coord = 0; for (int j = 0; j < cutoff_coverages[i].size(); j++) { if (cutoff_coverages[i][j].second > 0) { end = cutoff_coverages[i][j].first; end_coord = j; } else { if (end > start) { //std::cout<<"read" << i << " "<" << end << std::endl; if (end - start - reso > maxlen) { maxlen = end - start - reso; maxstart = start + reso; maxend = end; max_start_coord = start_coord + 1; max_end_coord = end_coord; } } start = cutoff_coverages[i][j].first; start_coord =j; end_coord = start_coord; end = start; } } int start_coverage = 0, end_coverage = 0; if (max_end_coord - max_start_coord + 1 > 20){ for (int dummy_index = 0; dummy_index < 10; dummy_index ++){ start_coverage += cutoff_coverages[i][max_start_coord + dummy_index].second + MIN_COV; end_coverage += cutoff_coverages[i][max_end_coord - dummy_index].second + MIN_COV; } start_coverage = start_coverage/10; end_coverage = end_coverage/10; } else{ int limit = (max_end_coord - max_start_coord)/2; for (int dummy_index = 0; dummy_index < limit; dummy_index ++){ start_coverage += cutoff_coverages[i][max_start_coord + dummy_index].second + MIN_COV; end_coverage += 
cutoff_coverages[i][max_end_coord - dummy_index].second + MIN_COV; } if (limit == 0){ start_coverage = 0; end_coverage = 0; } else { start_coverage = start_coverage / limit; end_coverage = end_coverage / limit; } } if (delete_telomere) { if ((start_coverage >= 10 * end_coverage) or (end_coverage >= 10 * start_coverage)) { covflag << i << std::endl; } if (self_match_reads.find(i) != self_match_reads.end()) { selfflag << i << std::endl; } } if (reads_to_keep.size()>0) { if (reads_to_keep.find(i) == reads_to_keep.end()) { // std::cout<<"setting masks equal"; maxend=maxstart; QV_mask[i].second=QV_mask[i].first; } } comask << i << " " << max_start_coord << " " << max_end_coord << std::endl; if ((use_qv_mask) and (use_coverage_mask)) { maskvec[i] = ( std::pair(std::max(maxstart, QV_mask[i].first), std::min(maxend, QV_mask[i].second))); //get the interestion of two masks mask << i << " " << std::max(maxstart, QV_mask[i].first) << " " << std::min(maxend, QV_mask[i].second) << std::endl; } else if ((use_coverage_mask) and (not use_qv_mask)) { maskvec[i] = (std::pair(maxstart, maxend)); mask << i << " " << maxstart << " " << maxend << std::endl; } else { maskvec[i] = (std::pair(QV_mask[i].first, QV_mask[i].second)); mask << i << " " << QV_mask[i].first << " " << QV_mask[i].second << std::endl; } } //binarize coverage gradient; //detect repeats based on coverage gradient, mark it has rising (1) or falling (-1) for (int i = r_begin; i <= r_end; i++) { std::vector > anno; for (int j = 0; j < cgs[i].size()-1; j++) { // changed, remove the last one //std::cout<< i << " " << cgs[i][j].first << " " << cgs[i][j].second << std::endl; if ((cgs[i][j].first >= maskvec[i].first + NO_HINGE_REGION) and (cgs[i][j].first <= maskvec[i].second - NO_HINGE_REGION)) { if (cgs[i][j].second > std::min( std::max((coverages[i][j].second+MIN_COV)/COVERAGE_FRACTION, MIN_REPEAT_ANNOTATION_THRESHOLD), MAX_REPEAT_ANNOTATION_THRESHOLD)) anno.push_back(std::pair(cgs[i][j].first, 1)); else if 
(cgs[i][j].second < - std::min( std::max((coverages[i][j].second+MIN_COV)/COVERAGE_FRACTION, MIN_REPEAT_ANNOTATION_THRESHOLD), MAX_REPEAT_ANNOTATION_THRESHOLD)) anno.push_back(std::pair(cgs[i][j].first, -1)); } } repeat_annotation[i] = (anno); } // clean it a bit, merge consecutive 1, or consecutive -1, or adjacent 1 and -1 if their position is within gap_threshold (could be bursty error) for (int i = r_begin; i <= r_end; i++) { for (std::vector >::iterator iter = repeat_annotation[i].begin(); iter < repeat_annotation[i].end(); ) { if (iter+1 < repeat_annotation[i].end()){ if (((iter->second == 1) and ((iter + 1)->second == 1)) and ((iter+1)->first - iter->first < REPEAT_ANNOTATION_GAP_THRESHOLD)) { repeat_annotation[i].erase((iter + 1)); } else if (((iter->second == -1) and ((iter + 1)->second == -1)) and ((iter+1)->first - iter->first < REPEAT_ANNOTATION_GAP_THRESHOLD)) { iter = repeat_annotation[i].erase(iter); } else iter++; } else iter ++; } } // need a better hinge detection // get hinges from repeat annotation information // n_read pos -1 = in_hinge 1 = out_hinge std::ofstream debug_file("debug.txt"); for (int i = r_begin; i <= r_end; i++) { //std::cout << i <>(); int coverage_at_start(0); int num_at_start(0); int num_at_end(0); int coverage_at_end(0); float avg_coverage_at_start; float avg_coverage_at_end; for (int j = 0; j < coverages[i].size(); j++){ if ((coverages[i][j].first <= maskvec[i].first + NO_HINGE_REGION) and (coverages[i][j].first >= maskvec[i].first )){ coverage_at_start += coverages[i][j].second; num_at_start++; } if ((coverages[i][j].first <= maskvec[i].second ) and (coverages[i][j].first >= maskvec[i].second - NO_HINGE_REGION )){ coverage_at_end += coverages[i][j].second; num_at_end++; } } avg_coverage_at_end = (float)coverage_at_end/num_at_end; avg_coverage_at_start = (float)coverage_at_start/num_at_start; if (std::abs(avg_coverage_at_end-avg_coverage_at_start) < 10){ continue; } for (int j = 0; j < repeat_annotation[i].size(); j++) { if 
(repeat_annotation[i][j].second == -1) { // look for out hinges, negative gradient bool bridged = true; int support = 0; int num_reads_at_end=1; std::vector > read_other_ends; for (int k = 0; k < idx_pileup[i].size(); k++) { int left_overhang, right_overhang; int temp_id; temp_id=idx_pileup[i][k]->read_B_id_; if (idx_pileup[i][k]->reverse_complement_match_==0){ right_overhang= std::max(maskvec[temp_id].second-idx_pileup[i][k]->read_B_match_end_,0); left_overhang= std::max(idx_pileup[i][k]->read_B_match_start_- maskvec[temp_id].first,0); } else if (idx_pileup[i][k]->reverse_complement_match_==1) { right_overhang= std::max(idx_pileup[i][k]->read_B_match_start_- maskvec[temp_id].first,0); left_overhang= std::max(maskvec[temp_id].second-idx_pileup[i][k]->read_B_match_end_,0); } if (right_overhang > THETA) { if ((idx_pileup[i][k]->read_A_match_end_ > repeat_annotation[i][j].first - HINGE_TOLERANCE_LENGTH) and (idx_pileup[i][k]->read_A_match_end_ < repeat_annotation[i][j].first + HINGE_TOLERANCE_LENGTH)) { std::pair other_end; other_end.first=idx_pileup[i][k]->read_A_match_start_; other_end.second=left_overhang; read_other_ends.push_back(other_end); support++; } } } if (support < HINGE_MIN_SUPPORT){ continue; } std::sort(read_other_ends.begin(),read_other_ends.end(), pairAscend); int num_reads_considered=0; int num_reads_extending_to_end=0; int num_reads_with_internal_overlaps=0; for (int id = 0; id < read_other_ends.size() ; ++id) { if (read_other_ends[id].first -maskvec[i].first < HINGE_BIN_LENGTH){ num_reads_considered++; num_reads_extending_to_end++; if ((num_reads_extending_to_end > HINGE_READ_UNBRIDGED_THRESHOLD) or ((num_reads_considered > HINGE_READ_UNBRIDGED_THRESHOLD) and (read_other_ends[id].first - read_other_ends[0].first > HINGE_BIN_LENGTH))) { bridged=false; break; } } else if (read_other_ends[id].second < THETA){ num_reads_considered++; if ((num_reads_extending_to_end > HINGE_READ_UNBRIDGED_THRESHOLD) or ((num_reads_considered > 
HINGE_READ_UNBRIDGED_THRESHOLD) and (read_other_ends[id].first - read_other_ends[0].first > HINGE_BIN_LENGTH))) { bridged=false; break; } } else if (read_other_ends[id].second > THETA) { num_reads_with_internal_overlaps++; num_reads_considered++; int id1=id+1; int pileup_length=1; while (id1 < read_other_ends.size()){ if (read_other_ends[id1].first - read_other_ends[id].first < HINGE_BIN_LENGTH){ pileup_length++; id1++; } else{ break; } } if (pileup_length > HINGE_BIN_PILEUP_THRESHOLD){ bridged=true; break; } } } if ((not bridged) and (support > HINGE_MIN_SUPPORT)) hinges[i].push_back(std::pair(repeat_annotation[i][j].first,-1)); } else { // look for in_hinges, positive gradient bool bridged = true; int support = 0; int num_reads_at_end=1; std::vector > read_other_ends; for (int k = 0; k < idx_pileup[i].size(); k++) { int left_overhang, right_overhang; int temp_id; temp_id=idx_pileup[i][k]->read_B_id_; if (idx_pileup[i][k]->reverse_complement_match_==0){ right_overhang= std::max(maskvec[temp_id].second-idx_pileup[i][k]->read_B_match_end_,0); left_overhang= std::max(idx_pileup[i][k]->read_B_match_start_- maskvec[temp_id].first,0); } else if (idx_pileup[i][k]->reverse_complement_match_==1) { right_overhang= std::max(idx_pileup[i][k]->read_B_match_start_- maskvec[temp_id].first,0); left_overhang= std::max(maskvec[temp_id].second-idx_pileup[i][k]->read_B_match_end_,0); } if (left_overhang > THETA) { if ((idx_pileup[i][k]->read_A_match_start_ > repeat_annotation[i][j].first - HINGE_TOLERANCE_LENGTH) and (idx_pileup[i][k]->read_A_match_start_ < repeat_annotation[i][j].first + HINGE_TOLERANCE_LENGTH)) { std::pair other_end; other_end.first=idx_pileup[i][k]->read_A_match_end_; other_end.second=right_overhang; read_other_ends.push_back(other_end); support++; } } } if (support < HINGE_MIN_SUPPORT){ continue; } std::sort(read_other_ends.begin(),read_other_ends.end(),pairDescend);//Sort in descending order int num_reads_considered=0; int num_reads_extending_to_end=0; int 
num_reads_with_internal_overlaps=0; for (int id = 0; id < read_other_ends.size() ; ++id) { if (maskvec[i].second-read_other_ends[id].first < HINGE_BIN_LENGTH){ num_reads_considered++; num_reads_extending_to_end++; if ((num_reads_extending_to_end > HINGE_READ_UNBRIDGED_THRESHOLD) or ((num_reads_considered > HINGE_READ_UNBRIDGED_THRESHOLD) and (read_other_ends[0].first - read_other_ends[id].first > HINGE_BIN_LENGTH))) { bridged=false; break; } } else if (read_other_ends[id].second < THETA){ num_reads_considered++; if ((num_reads_extending_to_end > HINGE_READ_UNBRIDGED_THRESHOLD) or ((num_reads_considered > HINGE_READ_UNBRIDGED_THRESHOLD) and (read_other_ends[0].first - read_other_ends[id].first > HINGE_BIN_LENGTH))) { bridged=false; break; } } else if (read_other_ends[id].second > THETA) { num_reads_with_internal_overlaps++; num_reads_considered++; int id1=id+1; int pileup_length=1; while (id1 < read_other_ends.size()){ if (read_other_ends[id].first - read_other_ends[id1].first < HINGE_BIN_LENGTH){ pileup_length++; id1++; } else{ break; } } if (pileup_length > HINGE_BIN_PILEUP_THRESHOLD){ bridged=true; break; } } } if ((not bridged) and (support > HINGE_MIN_SUPPORT)) hinges[i].push_back(std::pair(repeat_annotation[i][j].first, 1)); } } } console->info("reached end of loop"); //output hinges int ra_cnt = 0; for (int i = r_begin; i <= r_end; i++) { rep << i << " "; for (int j = 0; j < repeat_annotation[i].size(); j++) { rep << repeat_annotation[i][j].first << " " << repeat_annotation[i][j].second << " "; } ra_cnt += repeat_annotation[i].size(); rep << std::endl; } rep.close(); console->info("Number of hinges before filtering: {}", ra_cnt); int hg_cnt = 0; for (int i = r_begin; i < r_end; i++) { hg << i << " "; for (int j = 0; j < hinges[i].size(); j++) { hg << hinges[i][j].first << " " << hinges[i][j].second << " "; } hg_cnt += hinges[i].size(); hg << std::endl; } console->info("Number of hinges: {}", hg_cnt); if (strlen(name_las) > 0) { for (int i = 0; i < aln.size(); 
i++) { delete aln[i]; } aln.clear(); } console->info("part: {}", part); // console->info("going through: {}", part+1 < name_las_list.size()); } hg.close(); if (strlen(name_db)>0) la.closeDB(); //close database return 0; } HINGE-0.5.0/src/hinge000077500000000000000000000015101314415550300141530ustar00rootroot00000000000000#!/bin/bash export PATH="`dirname $BASH_SOURCE`/../lib/hinge:$PATH" subcommand="$1" shift case "$subcommand" in filter) Reads_filter "$@" ;; layout) hinging "$@" ;; maximal) get_maximal_reads "$@" ;; clip) pruning_and_clipping.py "$@" ;; clip-nanopore) pruning_and_clipping_nanopore.py "$@" ;; draft-path) get_draft_path.py "$@" ;; draft) draft_assembly "$@" ;; correct-head) correct_head.py "$@" ;; consensus) consensus "$@" ;; fasta2q) fasta_to_fastq.py "$@" ;; gfa) get_consensus_gfa.py "$@" ;; visualize|visualise) Visualise_graph.py "$@" ;; condense) condense_graph.py "$@" ;; correct_head) correct_head.py "$@" ;; split_las) split_las.py "$@" ;; *) echo "See hinge(1) for usage information." exit 1 ;; esac HINGE-0.5.0/src/hinge.1.md000066400000000000000000000014301314415550300147070ustar00rootroot00000000000000% HINGE(1) % % October 2016 # NAME hinge - assembler for long-read sequencing data # SYNOPSIS **hinge** {**subcommand**} *options* *files* # OPTIONS Subcommands are described below. Run each subcommand without arguments for usage information. **filter** : filter out short reads and long chimeric reads. **maximal** : get maximal reads. 
**layout** : generate a layout for assembly **clip** : prune and clip output of the **layout** command **draft-path** : get assembly graph as list of nodes **draft** : construct draft assembly **correct-head** : convert fasta file to daligner-specific format **consensus** : construct consensus sequence **gfa** : Create a graphical fragment assembly file from **consensus** output **visualize**, **visualise** HINGE-0.5.0/src/include/000077500000000000000000000000001314415550300145615ustar00rootroot00000000000000HINGE-0.5.0/src/include/DB.h000077500000000000000000000556131314415550300152340ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _HITS_DB #define _HITS_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" // Undefine if you don't want this // For interactive applications where it is inappropriate to simply exit with an error // message to standard error, define the constant INTERACTIVE. If set, then error // messages are put in the global variable Ebuffer and the caller of a DB routine // can decide how to deal with the error. 
// // DB, QV, or alignment routines that can encounter errors function as before in // non-INTERACTIVE mode by exiting after printing an error message to stderr. In // INTERACTIVE mode the routines place a message at EPLACE and return an error // value. For such routines that were previously void, they are now int, and // return 1 if an error occured, 0 otherwise. #undef INTERACTIVE #ifdef INTERACTIVE #define EPRINTF sprintf #define EPLACE Ebuffer #define EXIT(x) return (x) #else // BATCH #define EPRINTF fprintf #define EPLACE stderr #define EXIT(x) exit (1) #endif typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; typedef unsigned long long uint64; typedef signed char int8; typedef signed short int16; typedef signed int int32; typedef signed long long int64; typedef float float32; typedef double float64; /******************************************************************************************* * * COMMAND LINE INTERPRETATION MACROS * ********************************************************************************************/ extern char *Prog_Name; // Name of program #ifdef INTERACTIVE extern char Ebuffer[]; #endif #define SYSTEM_ERROR \ { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } #define ARG_INIT(name) \ Prog_Name = Strdup(name,""); \ for (i = 0; i < 128; i++) \ flags[i] = 0; #define ARG_FLAGS(set) \ for (k = 1; argv[i][k] != '\0'; k++) \ { if (index(set,argv[i][k]) == NULL) \ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \ exit (1); \ } \ flags[(int) argv[i][k]] = 1; \ } #define ARG_POSITIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \ exit (1); \ } \ if (var <= 0) \ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_NON_NEGATIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr 
!= '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not an integer\n",Prog_Name,argv[i][1]); \ exit (1); \ } \ if (var < 0) \ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_REAL(var) \ var = strtod(argv[i]+2,&eptr); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c argument is not a real number\n",Prog_Name,argv[i][1]); \ exit (1); \ } /******************************************************************************************* * * UTILITIES * ********************************************************************************************/ // The following general utilities return NULL if any of their input pointers are NULL, or if they // could not perform their function (in which case they also print an error to stderr). void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to char *Strdup(char *string, char *mesg); // stderr if out of memory FILE *Fopen(char *path, char *mode); // Open file path for "mode" char *PathTo(char *path); // Return path portion of file name "path" char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path" // Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer // Numbered_Suffix returns concatenation of left..right in a *temporary* buffer char *Catenate(char *path, char *sep, char *root, char *suffix); char *Numbered_Suffix(char *left, int num, char *right); // DB-related utilities void Print_Number(int64 num, int width, FILE *out); // Print readable big integer int Number_Digits(int64 num); // Return # of digits in printed number #define COMPRESSED_LEN(len) (((len)+3) >> 2) void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form void Print_Read(char *s, int width); void 
Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt) void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT) void Number_Read(char *s); // Convert read from letters to numbers /******************************************************************************************* * * DB IN-CORE DATA STRUCTURES * ********************************************************************************************/ #define DB_QV 0x03ff // Mask for 3-digit quality value #define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert #define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1) typedef struct { int origin; // Well # int rlen; // Length of the sequence (Last pulse = fpulse + rlen) int fpulse; // First pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + reverse_complement_match_ above } HITS_READ; // A track can be of 3 types: // data == NULL: there are nreads 'anno' records of size 'size'. // data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1]) // contains the variable length data // data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1]) // contains the variable length data typedef struct _track { struct _track *next; // Link to next track char *name; // Symbolic name of track int size; // Size in bytes of anno records void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL } HITS_TRACK; // The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track" // named ".@qvs" and is always the first track record in the list (if present). Since normal // track names cannot begin with a . 
(this is enforced), this pseudo-track is never confused // with a normal track. typedef struct { struct _track *next; char *name; int ncodes; // # of coding tables QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h) uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with // scheme coding[table[i]] FILE *quiva; // the open file pointer to the .qvs file } HITS_QV; // The DB record holds all information about the current state of an active DB including an // array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which // is always a HITS_QV pseudo-track (if the QVs have been loaded). typedef struct { int ureads; // Total number of reads in untrimmed DB int treads; // Total number of reads in trimmed DB int cutoff; // Minimum read length in block (-1 if not yet set) int all; // Consider multiple reads from a given well float freq[4]; // frequency of A, C, G, T, respectively // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) int maxlen; // length of maximum read (initially over all DB) int64 totlen; // total # of bases (initially over all DB) int nreads; // # of reads in actively loaded portion of DB int trimmed; // DB has been trimmed by cutoff/all int part; // DB block (if > 0), total DB (if == 0) int ufirst; // Index of first read in block (without trimming) int tfirst; // Index of first read in block (with trimming) // In order to avoid forcing users to have to rebuild all thier DBs to accommodate // the addition of fields for the size of the actively loaded trimmed and untrimmed // blocks, an additional read record is allocated in "reads" when a DB is loaded into // memory (reads[-1]) and the two desired fields are crammed into the first two // integer spaces of the record. char *path; // Root name of DB for .bps, .qvs, and tracks int loaded; // Are reads loaded in memory? 
void *bases; // file pointer for bases file (to fetch reads from), // or memory pointer to uncompressed block of all sequences. HITS_READ *reads; // Array [-1..nreads] of HITS_READ HITS_TRACK *tracks; // Linked list of loaded tracks } HITS_DB; /******************************************************************************************* * * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock * ********************************************************************************************/ #define MAX_NAME 10000 // Longest file name or fasta header line #define DB_NFILE "files = %9d\n" // number of files #define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name #define DB_NBLOCK "blocks = %9d\n" // number of blocks #define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well #define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) /******************************************************************************************* * * DB ROUTINES * ********************************************************************************************/ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, // .DB.qvs, and files .DB..anno and DB..data where is a track name // (not containing a . !). // A DAM is basically a DB except that: // 1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read // in the file ..hdr file // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences // contain N-separated contigs), and .fpulse the first base of the contig in the // fasta entry // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char *path, HITS_DB *db); // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db); // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). void Close_DB(HITS_DB *db); // If QV pseudo track is not already in db's track list, then load it and set it up. // The database must not have been trimmed yet. -1 is returned if a .qvs file is not // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE // is defined. Otherwise a 0 is returned. int Load_QVs(HITS_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. void Close_QVs(HITS_DB *db); // Look up the file and header in the file of the indicated track. Return: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track // In addition, if opened (0 or 1 returned), then kind points at an integer indicating // the type of track as follows: // CUSTOM 0 => a custom track // MASK 1 => a mask track #define CUSTOM_TRACK 0 #define MASK_TRACK 1 int Check_Track(HITS_DB *db, char *track, int *kind); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. 
If the track does not exist or cannot be // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise // the routine prints an error message to stderr and exits if an error occurs, and returns // with NULL only if the track does not exist. HITS_TRACK *Load_Track(HITS_DB *db, char *track); // If track is on the db's track list, then it is removed and all storage associated with it // is freed. void Close_Track(HITS_DB *db, char *track); // Allocate and return a buffer big enough for the largest read in 'db'. // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte // are needed by the alignment algorithms. If cannot allocate memory then return NULL // if INTERACTIVE is defined, or print error to stderr and exit otherwise. char *New_Read_Buffer(HITS_DB *db); // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimeter // for traversals in either direction. A non-zero value is returned if an error occured // and INTERACTIVE is defined. int Load_Read(HITS_DB *db, int i, char *read, int ascii); // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to // the string holding the substring so it has a delimeter for traversals in either direction. // A NULL pointer is returned if an error occured and INTERACTIVE is defined. char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur // in the database. 
If cannot allocate memory then return NULL if INTERACTIVE is defined, // or print error to stderr and exit otherwise. #define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer #define DEL_TAG 1 // The deleted characters #define INS_QV 2 // The insertion QVs #define SUB_QV 3 // The substitution QVs #define MRG_QV 4 // The merge QVs char **New_QV_Buffer(HITS_DB *db); // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters // are converted to a numeric or upper/lower case ascii string as per ascii. Return with // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. If ascii is // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T). // Return with a zero, except when an error occurs and INTERACTIVE is defined in which // case return wtih 1. int Read_All_Sequences(HITS_DB *db, int ascii); // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. 
// -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _HITS_DB HINGE-0.5.0/src/include/INIReader.h000066400000000000000000000040721314415550300164770ustar00rootroot00000000000000// Read an INI file into easy-to-access name/value pairs. // inih and INIReader are released under the New BSD license (see LICENSE.txt). // Go to the project home page for more info: // // https://github.com/benhoyt/inih #ifndef __INIREADER_H__ #define __INIREADER_H__ #include #include // Read an INI file into easy-to-access name/value pairs. (Note that I've gone // for simplicity here rather than speed, but it should be pretty decent.) class INIReader { public: // Construct INIReader and parse given filename. See ini.h for more info // about the parsing. INIReader(std::string filename); // Return the result of ini_parse(), i.e., 0 on success, line number of // first error on parse error, or -1 on file open error. int ParseError(); // Get a string value from INI file, returning default_value if not found. std::string Get(std::string section, std::string name, std::string default_value); // Get an integer (long) value from INI file, returning default_value if // not found or not a valid integer (decimal "1234", "-1234", or hex "0x4d2"). long GetInteger(std::string section, std::string name, long default_value); // Get a real (floating point double) value from INI file, returning // default_value if not found or not a valid floating point value // according to strtod(). double GetReal(std::string section, std::string name, double default_value); // Get a boolean value from INI file, returning default_value if not found or if // not a valid true/false value. Valid true values are "true", "yes", "on", "1", // and valid false values are "false", "no", "off", "0" (not case sensitive). 
bool GetBoolean(std::string section, std::string name, bool default_value); private: int _error; std::map _values; static std::string MakeKey(std::string section, std::string name); static int ValueHandler(void* user, const char* section, const char* name, const char* value); }; #endif // __INIREADER_H__ HINGE-0.5.0/src/include/LAInterface.h000066400000000000000000000212311314415550300170460ustar00rootroot00000000000000#ifndef LAINTERFACE #define LAINTERFACE #include #include #include extern "C" { #include "DB.h" #include "align.h" } typedef std::pair Interval; class Read { // read class public: int id; // id, start from 0 std::string name; // read name std::string bases; // read bases std::string qv; // qv currently not available std::vector intervals; int effective_start,effective_end; int len; Read(int id, int length, std::string name, std::string bases) : id(id), bases(bases), name(name), len(length) { }; Read(int id, std::string name, std::string bases) : id(id), bases(bases), name(name) { }; bool active = true; void showRead(); }; enum MatchType { FORWARD, BACKWARD, ACOVERB, BCOVERA, UNDEFINED, INTERNAL, NOT_ACTIVE, COVERING, COVERED, MIDDLE, MISMATCH_LEFT, MISMATCH_RIGHT, FORWARD_INTERNAL, BACKWARD_INTERNAL // different type of alignment /** * FORWARD: Alignment and extend to the right * BACKWARD: extend to the left * COVERING: read a covering read b * COVERED: read a covered by read b * MISMATCH_LEFT: read a has a chimeric section on the left, and read b align with the rest of read a and extend it to the left * MISMATCH_RIGHT: read a has a chimeric section on the right, read b align with the rest of read a and extend it to the right * UNDEFINED: any other exceptions * FORWARD_INTERNAL : forward on read A internal on B * BACKWARD_INTERNAL : reverse on read A internal on B **/ } ; class LAlignment { // because class Alignment is taken public: LAlignment() { }; //std::string aseq; //std::string bseq; char * aseq; char * bseq; bool recovered = false; void 
show() {printf("%d %d %d [%d...%d] x [%d...%d] %d diffs\n", read_A_id_, read_B_id_,flags,abpos,aepos,bbpos,bepos,diffs); }; int read_A_id_; // id of read a int read_B_id_; // id of read b int alen; // length of read a int blen; // length of read b int *trace; // trace uint16 *trace_pts; int trace_pts_len; int tlen; int diffs; int abpos, bbpos; // begin position of read a and b int aepos, bepos; // end position of read a and b int flags; // flag = 1 : 'c', flag = 0 : 'n' int tps; MatchType aln_type; bool active = true; }; class LOverlap { // LOverlap is a simplified version of LAlignment, no trace public: LOverlap() { }; ~LOverlap() {free(trace_pts); }; void show() {printf("%d %d %d [%d...%d]/%d x [%d...%d]/%d %d diffs, %d type\n", read_A_id_, read_B_id_, reverse_complement_match_, read_A_match_start_, read_A_match_end_, alen, read_B_match_start_, read_B_match_end_, blen, diffs, match_type_); }; int read_A_id_, read_B_id_; int alen; // length of read a int blen; // length of read b int tlen; int diffs; //differences int read_A_match_start_, read_B_match_start_; // starting position and ending position of alignment in read a int read_A_match_end_, read_B_match_end_; // starting position and ending position of alignment in read b int eff_read_A_match_start_, eff_read_B_match_start_, eff_read_A_match_end_, eff_read_B_match_end_; int tps; int reverse_complement_match_; //reverse_complement_match_, reverse complement = 1, same direction = 0 int eff_read_A_read_start_, eff_read_A_read_end_, eff_read_B_read_start_, eff_read_B_read_end_; MatchType match_type_ = UNDEFINED; void addtype(int max_overhang); //classify overlaps void AddTypesAsymmetric(int max_overhang, int min_overhang); int GetMatchingPosition(int pos_A); static const int CHI_THRESHOLD = 500; // threshold for chimeric/adaptor at the begining bool active = true; uint16 *trace_pts; int trace_pts_len; void trim_overlap(); void TrimOverlapNaive(); int eff_start_trace_point_index_, eff_end_trace_point_index_; int 
weight; int length; }; class LAInterface { public: HITS_DB _db1, *db1 = &_db1; // data base 1 HITS_DB _db2, *db2 = &_db2; // data base 2 Overlap _ovl, *ovl = &_ovl; // overlaps Alignment _aln, *aln = &_aln; // alignments, those are data structures required to read the data base char **flist = NULL; int *findx = NULL; int nfiles = 0; // n blocks of the read database char ** flist2 = NULL; int *findx2 = NULL; int nfiles2 = 0; // n blocks of read database 2 FILE *input; int64 novl; int tspace, tbytes, small; int reps, *pts; int input_pts; LAInterface() { }; int openDB2(std::string filename, std::string filename2); // open 2 databases int openDB(std::string filename); // open database int openAlignmentFile(std::string filename); // open .las Alignment file void showRead(int from, int to); // show reads in a range void showRead2(int from, int to); // show reads in a range void showAlignment(int from, int to); // show alignment with 'A read' in a range void showOverlap(int from, int to); // show alignment with 'A read' in a range void resetAlignment(); // rewind the file, need to be called every time before obtaining alignments Read *getRead(int number); //get one read Read *getRead2(int number); //get one read void getRead(std::vector &reads, int from, int to); // get reads within a range int getQV(std::vector > & QV, int from, int to); void getRead2(std::vector &reads, int from, int to); // get reads within a range void getAlignmentB(std::vector &, int n); // get all b reads aligned with a read void getOverlap(std::vector &, std::vector &range); // get overlap(simplified version of alignment) with a read in a range void getOverlap(std::vector &, int from, int64 to); // get overlap(simplified version of alignment) with a read in a range void getOverlapw(std::vector &, int from, int to); // get overlap(simplified version of alignment) with a read in a range void getOverlap(std::vector &, int n); void getAlignment(std::vector &, int from, int to); // get alignment with 'A 
read' in a range void getAlignment(std::vector &result_vec, std::vector &range); void getAlignment(std::vector &, int n); int closeDB(); // close database int getReadNumber(); // get total number of reads int getReadNumber2(); // get total number of reads from database 2 int64 getAlignmentNumber(); // get total number of alignments int closeDB2(); int printAlignment(FILE *file, Alignment *align, Work_Data *ework, int indent, int width, int border, int upper, int coord); int printAlignment_exp(FILE *file, LAlignment *align, Work_Data *ework, int indent, int width, int border, int upper, int coord); int computeTracePTS(Alignment *align, Work_Data *ework, int trace_spacing); int showAlignmentTags(LAlignment *); int generateConsensus(std::vector &); int recoverAlignment(LAlignment *); std::vector * getCoverage(std::vector alns); std::vector * getCoverage(std::vector alns); std::pair getAlignmentTags(LAlignment *alignment); std::vector > * lowCoverageRegions(std::vector & cov, int min_cov); void profileCoverage(std::vector &alignments, std::vector > & coverage,int reso, int cutoff); void profileCoveragefine(std::vector &alignments, std::vector > & coverage,int reso, int cutoff, int est_coverage); void repeatDetect(std::vector > & coverage, std::vector > & repeat); int loadPAF(std::string filename, std::vector &); int loadFASTA(std::string filename, std::vector & reads); }; class Node { public: int id; int strand; bool pseudo = false; Node(int id, int strand): id(id), strand(strand) {}; Node() {}; void show() { std::cout<& firstElem, const std::pair& secondElem); bool pairDescend(const std::pair& firstElem, const std::pair& secondElem); bool compare_overlap(LOverlap * ovl1, LOverlap * ovl2); bool compare_sum_overlaps(const std::vector * ovl1, const std::vector * ovl2); bool compare_pos(LOverlap * ovl1, LOverlap * ovl2); bool compare_overlap_abpos(LOverlap * ovl1, LOverlap * ovl2); bool compare_overlap_aepos(LOverlap * ovl1, LOverlap * ovl2); bool 
compare_overlap_weight(LOverlap * ovl1, LOverlap * ovl2); bool compare_overlap_aln(LAlignment * ovl1, LAlignment * ovl2); #endif HINGE-0.5.0/src/include/QV.h000077500000000000000000000165041314415550300152710ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #ifndef _QV_COMPRESSOR #define _QV_COMPRESSOR // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or // batch version of the routines in this library are compiled. In batch mode, routines // print an error message and exit. In interactive mode, the routines place the error // message in EPLACE (also defined in DB.h) and return an error value, typically NULL // if the routine returns a pointer, and an unusual integer value if the routine returns // an integer. // Below when an error return is described, one should understand that this value is returned // only if the routine was compiled in INTERACTIVE mode. 
// A PacBio compression scheme typedef struct { void *delScheme; // Huffman scheme for deletion QVs void *insScheme; // Huffman scheme for insertion QVs void *mrgScheme; // Huffman scheme for merge QVs void *subScheme; // Huffman scheme for substitution QVs void *dRunScheme; // Huffman scheme for deletion run lengths (if delChar > 0) void *sRunScheme; // Huffman scheme for substitution run lengths (if subChar > 0) int delChar; // If > 0, run-encoded deletion value int subChar; // If > 0, run-encoded substitution value int flip; // Need to flip multi-byte integers char *prefix; // Header line prefix } QVcoding; // Read the next nlines of input, and QVentry returns a pointer to the first line if needed. // If end-of-input is encountered before any further input, -1 is returned. If there is // an error than -2 is returned. Otherwise the length of the line(s) read is returned. int Read_Lines(FILE *input, int nlines); char *QVentry(); // Read the .quiva file on input and record frequency statistics. If there is an error // then 1 is returned, otherwise 0. int QVcoding_Scan(FILE *input); // Given QVcoding_Scan has been called at least once, create an encoding scheme based on // the accumulated statistics and return a pointer to it. The returned encoding object // is *statically allocated within the routine. If lossy is set then use a lossy scaling // for the insertion and merge streams. If there is an error, then NULL is returned. QVcoding *Create_QVcoding(int lossy); // Read/write a coding scheme to input/output. The encoding object returned by the reader // is *statically* allocated within the routine. If an error occurs while reading then // NULL is returned. QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); // Free all the auxiliary storage associated with coding (but not the object itself!) 
void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy // is set then the scheme is a lossy one. A non-zero value is return only if an // error occured. int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); // Assuming the input is position just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceeding header line, critically // provides the length of each of the 5 vectors. A non-zero value is return only if an // error occured. int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR HINGE-0.5.0/src/include/align.h000077500000000000000000000514001314415550300160270ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Local alignment module. Routines for finding local alignments given a seed position, * representing such an l.a. with its interval and a set of pass-thru points, so that * a detailed alignment can be efficiently computed on demand. * * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C, * 2 for G, and 3 for T. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #ifndef _A_MODULE #define _A_MODULE #include "DB.h" #define TRACE_XOVR 125 // If the trace spacing is not more than this value, then can // and do compress traces pts to 8-bit unsigned ints /*** INTERACTIVE vs BATCH version The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or batch version of the routines in this library are compiled. 
In batch mode, routines print an error message and exit.  In interactive mode, the routines
    place the error message in EPLACE (also defined in DB.h) and return an error value,
    typically NULL if the routine returns a pointer, and an unusual integer value if the
    routine returns an integer.
  Below when an error return is described, one should understand that this value is returned
    only if the routine was compiled in INTERACTIVE mode.
***/

/*** PATH ABSTRACTION:

     Coordinates are *between* characters where 0 is the tick just before the first char,
     1 is the tick between the first and second character, and so on.  Our data structure
     is called a Path referring to its conceptualization in an edit graph.

     A local alignment is specified by the point '(abpos,bbpos)' at which its path in
     the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends.
     In other words A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is
     the *first* character of X).

     There are 'diffs' differences in an optimal local alignment between the beginning and
     end points of the alignment (if computed by Compute_Trace), or nearly so (if computed
     by Local_Alignment).

     Optionally, a Path can have additional information about the exact nature of the
     aligned substrings if the field 'trace' is not NULL.  Trace points to either an
     array of integers (if computed by a Compute_Trace routine), or an array of unsigned
     short integers (if computed by Local_Alignment).

     If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short
     values:

            d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n

     to be interpreted as follows.  The alignment from (abpos,bbpos) to (aepos,bepos)
     passes through the n trace points for i in [1,n]:

            (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS
                        and b_i = bbpos + (b_0 + b_1 + ... + b_(i-1))

     (the b_j inside the sum are the trace values listed above), where also let
     a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos.  That is, the interior
     (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of the aread
     where TS is the "trace spacing" employed when finding the alignment (see
     New_Align_Spec).  Typically TS is 100.  Then d_i is the number of differences in the
     portion of the alignment between (a_i,b_i) and (a_(i+1),b_(i+1)).  These trace points
     allow the Compute_Trace routines to efficiently compute the exact alignment between the
     two reads by efficiently computing exact alignments between consecutive pairs of trace
     points.  Moreover, the diff values give one an idea of the quality of the alignment
     along every segment of TS symbols of the aread.

     If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers
     < i1, i2, ... in > that encodes an exact alignment as follows.  A negative number j
     indicates that a dash should be placed before A[-j] and a positive number k indicates
     that a dash should be placed before B[k], where A and B are the two sequences of the
     overlap.  The indels occur in the trace in the order in which they occur along the
     alignment.  For a good example of how to "decode" a trace into an alignment, see the
     code for the routine Print_Alignment.

***/

typedef struct
  { void     *trace;
    int       tlen;
    int       diffs;
    int       abpos, bbpos;
    int       aepos, bepos;
  } Path;

/*** ALIGNMENT ABSTRACTION:

     An alignment is modeled by an Alignment record, which in addition to a *pointer* to a
     'path', gives pointers to the A and B sequences, their lengths, and indicates whether
     the B-sequence needs to be complemented ('comp' non-zero if so).
The 'trace' pointer of the 'path' subrecord can be either NULL, a list of pass-through points, or an exact trace depending on what routines have been called on the record. One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' = NULL, or using the sequence of pass-through points in trace, (2) print an ASCII representation of an alignment, or (3) reverse the roles of A and B, and (4) complement a sequence (which is a reversible process). If the alignment record shows the B sequence as complemented, *** THEN IT IS THE RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements the sequence a of length n. The operation does the complementation/reversal in place. Calling it a second time on a given fragment restores it to its original state. ***/ #define COMP(x) ((x) & 0x1) #define COMP_FLAG 0x1 typedef struct { Path *path; uint32 flags; /* Pipeline status and complementation reverse_complement_match_ */ char *aseq; /* Pointer to A sequence */ char *bseq; /* Pointer to B sequence */ int alen; /* Length of A sequence */ int blen; /* Length of B sequence */ } Alignment; void Complement_Seq(char *a, int n); /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working storage that is more efficiently reused with each call, rather than being allocated anew with each call. Each *thread* can create a Work_Data object with New_Work_Data and this object holds and retains the working storage for routines of this module between calls to the routines. If enough memory for a Work_Data is not available then NULL is returned. Free_Work_Data frees a Work_Data object and all working storage held by it. */ typedef void Work_Data; Work_Data *New_Work_Data(); void Free_Work_Data(Work_Data *work); /* Local_Alignment seeks local alignments of a quality determined by a number of parameters. 
These are coded in an Align_Spec object that can be created with New_Align_Spec and freed with Free_Align_Spec when no longer needed. There are 4 essential parameters: ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio data we set this to .70 assuming an average of 15% error in each read. trace_space: the spacing interval for keeping trace points and segment differences (see description of 'trace' for Paths above) freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C), freq[2] = f(G), and freq[3] = f(T). This vector is part of the header of every HITS database (see db.h). If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. overlap), then the last/first 30 columns of the alignment are guaranteed to be suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically measured function that increases from 1 as the entropy of freq decreases. If memory is unavailable or the freq distribution is too skewed then NULL is returned. You can get back the original parameters used to create an Align_Spec with the simple utility functions below. */ typedef void Align_Spec; Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq); void Free_Align_Spec(Align_Spec *spec); int Trace_Spacing (Align_Spec *spec); double Average_Correlation(Align_Spec *spec); float *Base_Frequencies (Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in 'align' subject to: (a) the alignment criterion given by the Align_Spec 'spec', (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within the underlying dynamic programming matrix (i.e. 
the points on diagonals low to hgh on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. The path record of 'align' has its 'trace' filled from the point of view of an overlap between the aread and the bread. In addition a Path record from the point of view of the bread versus the aread is returned by the function, with this Path's 'trace' filled in appropriately. The space for the returned path and the two 'trace's are in the working storage supplied by the Work_Data packet and this space is reused with each call, so if one wants to retain the bread-path and the two trace point sequences, then they must be copied to user-allocated storage before calling the routine again. NULL is returned in the event of an error. Find_Extension is a variant of Local_Alignment that simply finds a local alignment that either ends (if prefix is non-zero) or begins (if prefix is zero) at the point (anti+diag)/2,(anti-diag)/2). All other parameters are as before. It returns a non-zero value only when INTERACTIVE is on and it cannot allocate the memory it needs. Only the path and trace with respect to the aread is returned. This routine is experimental and may not persist in later versions of the code. */ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, int low, int hgh, int anti, int lbord, int hbord); int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !! int diag, int anti, int lbord, int hbord, int prefix); /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment. If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points and diff levels computed by Local_Alignment. 
In either case 'path.trace' is set to point at an integer array within the storage of the Work_Data packet encoding an exact optimal trace from the start to end points. If the trace is needed beyond the next call to a routine that sets it, then it should be copied to an array allocated and managed by the caller. Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the best alignment between (path->read_A_match_start_,path->read_B_match_start_) and (path->read_A_match_end_,path->read_B_match_end_) in the edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the trace between successive pass through points. It is much, much faster than Compute_Trace_ALL but at the tradeoff of not necessarily being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points of alignments between two adjacent pairs of pass through points. It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines return 1 if an error occurred and 0 otherwise. */ #define LOWERMOST -1 // Possible modes for "mode" parameter below) #define GREEDIEST 0 #define UPPERMOST 1 int Compute_Trace_ALL(Alignment *align, Work_Data *work); int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode); int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode); /* Compute_Trace_IRR (IRR for IRRegular) computes a trace for the given alignment where it assumes the spacing between trace points between both the A and B read varies, and futher assumes that the A-spacing is given in the short integers normally occupied by the differences in the alignment between the trace points. This routine is experimental and may not persist in later versions of the code. */ int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !! 
/* Alignment_Cartoon prints an ASCII representation of the overlap relationhip between the two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls the display width of numbers, it must be not less than the width of any number to be displayed. If the alignment trace is an exact trace, then one can ask Print_Alignment to print an ASCII representation of the alignment 'align' to the file 'file'. Indent the display by "indent" spaces and put "width" columns per line in the display. Show "border" characters of sequence on each side of the aligned region. If upper is non-zero then display bases in upper case. If coord is greater than 0, then the positions of the first character in A and B in the given row is displayed with a field width given by coord's value. Print_Reference is like Print_Alignment but rather than printing exaclty "width" columns per segment, it prints "block" characters of the A sequence in each segment. This results in segments of different lengths, but is convenient when looking at two alignments involving A as segments are guaranteed to cover the same interval of A in a segment. Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then the trace is ignored, otherwise the trace must be to a full alignment trace and this trace is also appropriately inverted. 
*/ void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, int indent, int width, int border, int upper, int coord); int Print_Reference(FILE *file, Alignment *align, Work_Data *work, int indent, int block, int border, int upper, int coord); void Flip_Alignment(Alignment *align, int full); /*** OVERLAP ABSTRACTION: Externally, between modules an Alignment is modeled by an "Overlap" record, which (a) replaces the pointers to the two sequences with their ID's in the HITS data bases, (b) does not contain the length of the 2 sequences (must fetch from DB), and (c) contains its path as a subrecord rather than as a pointer (indeed, typically the corresponding Alignment record points at the Overlap's path sub-record). The trace pointer is always to a sequence of trace points and can be either compressed (uint8) or uncompressed (uint16). One can read and write binary records of an "Overlap". ***/ typedef struct { Path path; /* Path: begin- and end-point of alignment + diffs */ uint32 flags; /* Pipeline status and complementation reverse_complement_match_ */ int aread; /* Id # of A sequence */ int bread; /* Id # of B sequence */ } Overlap; /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace into the memory pointed at by the trace field of 'ovl'. It is assumed to be big enough to accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16). Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that occupies 'tbytes' bytes per value. Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output' where the trace occupes 'tbytes' per value and the print out is indented from the left margin by 'indent' spaces. 
Compress_TraceTo8 converts a trace fo 16-bit values to 8-bit values in place, and Decompress_TraceTo16 does the reverse conversion. Check_Trace_Points checks that the number of trace points is correct and that the sum of the b-read displacements equals the b-read alignment interval, assuming the trace spacing is 'tspace'. It reports an error message if there is a problem and 'verbose' is non-zero. The 'ovl' came from the file names 'fname'. */ int Read_Overlap(FILE *input, Overlap *ovl); int Read_Trace(FILE *innput, Overlap *ovl, int tbytes); void Write_Overlap(FILE *output, Overlap *ovl, int tbytes); void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent); void Compress_TraceTo8(Overlap *ovl); void Decompress_TraceTo16(Overlap *ovl); int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname); #endif // _A_MODULE HINGE-0.5.0/src/include/cmdline.h000066400000000000000000000444571314415550300163630ustar00rootroot00000000000000/* Copyright (c) 2009, Hideyuki Tanaka All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include namespace cmdline{ namespace detail{ template class lexical_cast_t{ public: static Target cast(const Source &arg){ Target ret; std::stringstream ss; if (!(ss<>ret && ss.eof())) throw std::bad_cast(); return ret; } }; template class lexical_cast_t{ public: static Target cast(const Source &arg){ return arg; } }; template class lexical_cast_t{ public: static std::string cast(const Source &arg){ std::ostringstream ss; ss< class lexical_cast_t{ public: static Target cast(const std::string &arg){ Target ret; std::istringstream ss(arg); if (!(ss>>ret && ss.eof())) throw std::bad_cast(); return ret; } }; template struct is_same { static const bool value = false; }; template struct is_same{ static const bool value = true; }; template Target lexical_cast(const Source &arg) { return lexical_cast_t::value>::cast(arg); } static inline std::string demangle(const std::string &name) { int status=0; char *p=abi::__cxa_demangle(name.c_str(), 0, 0, &status); std::string ret(p); free(p); return ret; } template std::string readable_typename() { return demangle(typeid(T).name()); } template std::string default_value(T def) { return detail::lexical_cast(def); } template <> inline std::string readable_typename() { return "string"; } } // detail //----- class cmdline_error : public std::exception { public: cmdline_error(const std::string &msg): msg(msg){} ~cmdline_error() throw() {} const char *what() const 
throw() { return msg.c_str(); } private: std::string msg; }; template struct default_reader{ T operator()(const std::string &str){ return detail::lexical_cast(str); } }; template struct range_reader{ range_reader(const T &low, const T &high): low(low), high(high) {} T operator()(const std::string &s) const { T ret=default_reader()(s); if (!(ret>=low && ret<=high)) throw cmdline::cmdline_error("range_error"); return ret; } private: T low, high; }; template range_reader range(const T &low, const T &high) { return range_reader(low, high); } template struct oneof_reader{ T operator()(const std::string &s){ T ret=default_reader()(s); if (std::find(alt.begin(), alt.end(), ret)==alt.end()) throw cmdline_error(""); return ret; } void add(const T &v){ alt.push_back(v); } private: std::vector alt; }; template oneof_reader oneof(T a1) { oneof_reader ret; ret.add(a1); return ret; } template oneof_reader oneof(T a1, T a2) { oneof_reader ret; ret.add(a1); ret.add(a2); return ret; } template oneof_reader oneof(T a1, T a2, T a3) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); return ret; } template oneof_reader oneof(T a1, T a2, T a3, T a4) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); ret.add(a4); return ret; } template oneof_reader oneof(T a1, T a2, T a3, T a4, T a5) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); ret.add(a4); ret.add(a5); return ret; } template oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); ret.add(a4); ret.add(a5); ret.add(a6); return ret; } template oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); ret.add(a4); ret.add(a5); ret.add(a6); ret.add(a7); return ret; } template oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); ret.add(a4); ret.add(a5); ret.add(a6); ret.add(a7); ret.add(a8); return ret; } template 
oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); ret.add(a4); ret.add(a5); ret.add(a6); ret.add(a7); ret.add(a8); ret.add(a9); return ret; } template oneof_reader oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10) { oneof_reader ret; ret.add(a1); ret.add(a2); ret.add(a3); ret.add(a4); ret.add(a5); ret.add(a6); ret.add(a7); ret.add(a8); ret.add(a9); ret.add(a10); return ret; } //----- class parser{ public: parser(){ } ~parser(){ for (std::map::iterator p=options.begin(); p!=options.end(); p++) delete p->second; } void add(const std::string &name, char short_name=0, const std::string &desc=""){ if (options.count(name)) throw cmdline_error("multiple definition: "+name); options[name]=new option_without_value(name, short_name, desc); ordered.push_back(options[name]); } template void add(const std::string &name, char short_name=0, const std::string &desc="", bool need=true, const T def=T()){ add(name, short_name, desc, need, def, default_reader()); } template void add(const std::string &name, char short_name=0, const std::string &desc="", bool need=true, const T def=T(), F reader=F()){ if (options.count(name)) throw cmdline_error("multiple definition: "+name); options[name]=new option_with_value_with_reader(name, short_name, need, def, desc, reader); ordered.push_back(options[name]); } void footer(const std::string &f){ ftr=f; } void set_program_name(const std::string &name){ prog_name=name; } bool exist(const std::string &name) const { if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name); return options.find(name)->second->has_set(); } template const T &get(const std::string &name) const { if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name); const option_with_value *p=dynamic_cast*>(options.find(name)->second); if (p==NULL) throw cmdline_error("type mismatch flag '"+name+"'"); return p->get(); } const std::vector &rest() 
const { return others; } bool parse(const std::string &arg){ std::vector args; std::string buf; bool in_quote=false; for (std::string::size_type i=0; i=arg.length()){ errors.push_back("unexpected occurrence of '\\' at end of string"); return false; } } buf+=arg[i]; } if (in_quote){ errors.push_back("quote is not closed"); return false; } if (buf.length()>0) args.push_back(buf); for (size_t i=0; i &args){ int argc=static_cast(args.size()); std::vector argv(argc); for (int i=0; i lookup; for (std::map::iterator p=options.begin(); p!=options.end(); p++){ if (p->first.length()==0) continue; char initial=p->second->short_name(); if (initial){ if (lookup.count(initial)>0){ lookup[initial]=""; errors.push_back(std::string("short option '")+initial+"' is ambiguous"); return false; } else lookup[initial]=p->first; } } for (int i=1; i &args){ if (!options.count("help")) add("help", '?', "print this message"); check(args.size(), parse(args)); } void parse_check(int argc, char *argv[]){ if (!options.count("help")) add("help", '?', "print this message"); check(argc, parse(argc, argv)); } std::string error() const{ return errors.size()>0?errors[0]:""; } std::string error_full() const{ std::ostringstream oss; for (size_t i=0; imust()) oss<short_description()<<" "; } oss<<"[options] ... 
"<name().length()); } for (size_t i=0; ishort_name()){ oss<<" -"<short_name()<<", "; } else{ oss<<" "; } oss<<"--"<name(); for (size_t j=ordered[i]->name().length(); jdescription()<set()){ errors.push_back("option needs value: --"+name); return; } } void set_option(const std::string &name, const std::string &value){ if (options.count(name)==0){ errors.push_back("undefined option: --"+name); return; } if (!options[name]->set(value)){ errors.push_back("option value is invalid: --"+name+"="+value); return; } } class option_base{ public: virtual ~option_base(){} virtual bool has_value() const=0; virtual bool set()=0; virtual bool set(const std::string &value)=0; virtual bool has_set() const=0; virtual bool valid() const=0; virtual bool must() const=0; virtual const std::string &name() const=0; virtual char short_name() const=0; virtual const std::string &description() const=0; virtual std::string short_description() const=0; }; class option_without_value : public option_base { public: option_without_value(const std::string &name, char short_name, const std::string &desc) :nam(name), snam(short_name), desc(desc), has(false){ } ~option_without_value(){} bool has_value() const { return false; } bool set(){ has=true; return true; } bool set(const std::string &){ return false; } bool has_set() const { return has; } bool valid() const{ return true; } bool must() const{ return false; } const std::string &name() const{ return nam; } char short_name() const{ return snam; } const std::string &description() const { return desc; } std::string short_description() const{ return "--"+nam; } private: std::string nam; char snam; std::string desc; bool has; }; template class option_with_value : public option_base { public: option_with_value(const std::string &name, char short_name, bool need, const T &def, const std::string &desc) : nam(name), snam(short_name), need(need), has(false) , def(def), actual(def) { this->desc=full_description(desc); } ~option_with_value(){} const T &get() 
const { return actual; } bool has_value() const { return true; } bool set(){ return false; } bool set(const std::string &value){ try{ actual=read(value); has=true; } catch(const std::exception &e){ return false; } return true; } bool has_set() const{ return has; } bool valid() const{ if (need && !has) return false; return true; } bool must() const{ return need; } const std::string &name() const{ return nam; } char short_name() const{ return snam; } const std::string &description() const { return desc; } std::string short_description() const{ return "--"+nam+"="+detail::readable_typename(); } protected: std::string full_description(const std::string &desc){ return desc+" ("+detail::readable_typename()+ (need?"":" [="+detail::default_value(def)+"]") +")"; } virtual T read(const std::string &s)=0; std::string nam; char snam; bool need; std::string desc; bool has; T def; T actual; }; template class option_with_value_with_reader : public option_with_value { public: option_with_value_with_reader(const std::string &name, char short_name, bool need, const T def, const std::string &desc, F reader) : option_with_value(name, short_name, need, def, desc), reader(reader){ } private: T read(const std::string &s){ return reader(s); } F reader; }; std::map options; std::vector ordered; std::string ftr; std::string prog_name; std::vector others; std::vector errors; }; } // cmdline HINGE-0.5.0/src/include/common.h000077500000000000000000000206641314415550300162350ustar00rootroot00000000000000 /* * ===================================================================================== * * Filename: common.h * * Description: Common delclaration for the code base * * Version: 0.1 * Created: 07/16/2013 07:46:23 AM * Revision: none * Compiler: gcc * * Author: Jason Chin, * Company: * * ===================================================================================== #################################################################################$$ # Copyright (c) 2011-2014, Pacific 
Biosciences of California, Inc. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted (subject to the limitations in the # disclaimer below) provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # * Neither the name of Pacific Biosciences nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
#################################################################################$$ */ #ifndef COMMON_H #define COMMON_H #include typedef int seq_coor_t; typedef struct { seq_coor_t aln_str_size ; seq_coor_t dist ; seq_coor_t aln_q_s; seq_coor_t aln_q_e; seq_coor_t aln_t_s; seq_coor_t aln_t_e; char * q_aln_str; char * t_aln_str; } alignment; typedef struct { seq_coor_t pre_k; seq_coor_t x1; seq_coor_t y1; seq_coor_t x2; seq_coor_t y2; } d_path_data; typedef struct { seq_coor_t d; seq_coor_t k; seq_coor_t pre_k; seq_coor_t x1; seq_coor_t y1; seq_coor_t x2; seq_coor_t y2; } d_path_data2; typedef struct { seq_coor_t x; seq_coor_t y; } path_point; typedef struct { seq_coor_t start; seq_coor_t last; seq_coor_t count; } kmer_lookup; typedef unsigned char base; typedef base * seq_array; typedef seq_coor_t seq_addr; typedef seq_addr * seq_addr_array; typedef struct { seq_coor_t count; seq_coor_t * query_pos; seq_coor_t * target_pos; } kmer_match; typedef struct { seq_coor_t s1; seq_coor_t e1; seq_coor_t s2; seq_coor_t e2; long int score; } aln_range; typedef struct { char * sequence; int * eqv; } consensus_data; kmer_lookup * allocate_kmer_lookup (seq_coor_t); void init_kmer_lookup ( kmer_lookup *, seq_coor_t ); void free_kmer_lookup(kmer_lookup *); seq_array allocate_seq(seq_coor_t); void init_seq_array( seq_array, seq_coor_t); void free_seq_array(seq_array); seq_addr_array allocate_seq_addr(seq_coor_t size); void free_seq_addr_array(seq_addr_array); aln_range * find_best_aln_range(kmer_match *, seq_coor_t, seq_coor_t, seq_coor_t); void free_aln_range( aln_range *); kmer_match * find_kmer_pos_for_seq( char *, seq_coor_t, unsigned int K, seq_addr_array, kmer_lookup * ); void free_kmer_match( kmer_match * ptr); void free_kmer_lookup(kmer_lookup * ); void add_sequence ( seq_coor_t, unsigned int, char *, seq_coor_t, seq_addr_array, seq_array, kmer_lookup *); void mask_k_mer(seq_coor_t, kmer_lookup *, seq_coor_t); alignment *_align(char *aseq, seq_coor_t aseq_pos, char *bseq, 
seq_coor_t bseq_pos, seq_coor_t t, int t2); void free_alignment(alignment *); void free_consensus_data(consensus_data *); void print_d_path( d_path_data2 * base, unsigned long max_idx); void d_path_sort( d_path_data2 * base, unsigned long max_idx); int compare_d_path(const void * a, const void * b); typedef struct { seq_coor_t t_pos; uint8_t delta; char q_base; seq_coor_t p_t_pos; // the tag position of the previous base uint8_t p_delta; // the tag delta of the previous base char p_q_base; // the previous base unsigned q_id; } align_tag_t; typedef struct { seq_coor_t len; align_tag_t * align_tags; } align_tags_t; typedef struct { uint16_t size; uint16_t n_link; seq_coor_t * p_t_pos; // the tag position of the previous base uint8_t * p_delta; // the tag delta of the previous base char * p_q_base; // the previous base uint16_t * link_count; uint16_t count; seq_coor_t best_p_t_pos; uint8_t best_p_delta; uint8_t best_p_q_base; // encoded base double score; } align_tag_col_t; typedef struct { align_tag_col_t * base; } msa_base_group_t; typedef struct { uint8_t size; uint8_t max_delta; msa_base_group_t * delta; } msa_delta_group_t; typedef msa_delta_group_t * msa_pos_t; align_tags_t * get_align_tags( char * aln_q_seq, char * aln_t_seq, seq_coor_t aln_seq_len, aln_range * range, unsigned q_id, seq_coor_t t_offset); align_tags_t * get_align_tags2( char * aln_q_seq, char * aln_t_seq, seq_coor_t aln_seq_len, aln_range * range, unsigned q_id, seq_coor_t t_offset); void free_align_tags( align_tags_t * tags); void allocate_aln_col( align_tag_col_t * col); void realloc_aln_col( align_tag_col_t * col ); void free_aln_col( align_tag_col_t * col); void allocate_delta_group( msa_delta_group_t * g); void realloc_delta_group( msa_delta_group_t * g, uint16_t new_size ); void free_delta_group( msa_delta_group_t * g); void update_col( align_tag_col_t * col, seq_coor_t p_t_pos, uint8_t p_delta, char p_q_base); msa_pos_t * get_msa_working_sapce(unsigned int max_t_len); void 
clean_msa_working_space( msa_pos_t * msa_array, unsigned int max_t_len); consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs, unsigned n_tag_seqs, unsigned t_len, unsigned min_cov ); consensus_data * get_cns_from_align_tags_large( align_tags_t ** tag_seqs, unsigned n_tag_seqs, unsigned t_len, unsigned min_cov ); consensus_data * generate_consensus( char ** input_seq, unsigned int n_seq, unsigned min_cov, unsigned K, double min_idt); consensus_data * generate_utg_consensus( char ** input_seq, seq_coor_t *offset, unsigned int n_seq, unsigned min_cov, unsigned K, double min_idt); void free_consensus_data( consensus_data * consensus ); #endif HINGE-0.5.0/src/include/ini.h000066400000000000000000000053001314415550300155070ustar00rootroot00000000000000/* inih -- simple .INI file parser inih is released under the New BSD license (see LICENSE.txt). Go to the project home page for more info: https://github.com/benhoyt/inih */ #ifndef __INI_H__ #define __INI_H__ /* Make this header file easier to include in C++ code */ #ifdef __cplusplus extern "C" { #endif #include /* Typedef for prototype of handler function. */ typedef int (*ini_handler)(void* user, const char* section, const char* name, const char* value); /* Typedef for prototype of fgets-style reader function. */ typedef char* (*ini_reader)(char* str, int num, void* stream); /* Parse given INI-style file. May have [section]s, name=value pairs (whitespace stripped), and comments starting with ';' (semicolon). Section is "" if name=value pair parsed before any section heading. name:value pairs are also supported as a concession to Python's ConfigParser. For each name=value pair parsed, call handler function with given user pointer as well as section, name, and value (data only valid for duration of handler call). Handler should return nonzero on success, zero on error. 
Returns 0 on success, line number of first error on parse error (doesn't stop on first error), -1 on file open error, or -2 on memory allocation error (only when INI_USE_STACK is zero). */ int ini_parse(const char* filename, ini_handler handler, void* user); /* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't close the file when it's finished -- the caller must do that. */ int ini_parse_file(FILE* file, ini_handler handler, void* user); /* Same as ini_parse(), but takes an ini_reader function pointer instead of filename. Used for implementing custom or string-based I/O. */ int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, void* user); /* Nonzero to allow multi-line value parsing, in the style of Python's ConfigParser. If allowed, ini_parse() will call the handler with the same name for each subsequent line parsed. */ #ifndef INI_ALLOW_MULTILINE #define INI_ALLOW_MULTILINE 1 #endif /* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of the file. See http://code.google.com/p/inih/issues/detail?id=21 */ #ifndef INI_ALLOW_BOM #define INI_ALLOW_BOM 1 #endif /* Nonzero to use stack, zero to use heap (malloc/free). */ #ifndef INI_USE_STACK #define INI_USE_STACK 1 #endif /* Stop parsing on first error (default is to keep parsing). */ #ifndef INI_STOP_ON_FIRST_ERROR #define INI_STOP_ON_FIRST_ERROR 0 #endif /* Maximum line length for any line in INI file. 
*/ #ifndef INI_MAX_LINE #define INI_MAX_LINE 200 #endif #ifdef __cplusplus } #endif #endif /* __INI_H__ */ HINGE-0.5.0/src/include/kseq.h000066400000000000000000000214221314415550300156760ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Last Modified: 05MAR2012 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H #include #include #include #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) #define klib_unused __attribute__ ((__unused__)) #else #define klib_unused #endif #endif /* klib_unused */ #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r #define KS_SEP_TAB 1 // isspace() && !' 
' #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) #define KS_SEP_MAX 2 #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ int begin, end; \ int is_eof:2, bufsize:30; \ type_t f; \ unsigned char *buf; \ } kstream_t; #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) #define __KS_BASIC(SCOPE, type_t, __bufsize) \ SCOPE kstream_t *ks_init(type_t f) \ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; ks->bufsize = __bufsize; \ ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ SCOPE void ks_destroy(kstream_t *ks) \ { \ if (!ks) return; \ free(ks->buf); \ free(ks); \ } #define __KS_INLINED(__read) \ static inline klib_unused int ks_getc(kstream_t *ks) \ { \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end < ks->bufsize) ks->is_eof = 1; \ if (ks->end == 0) return -1; \ } \ return (int)ks->buf[ks->begin++]; \ } \ static inline klib_unused int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ { return ks_getuntil2(ks, delimiter, str, dret, 0); } #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #define __KS_GETUNTIL(SCOPE, __read) \ SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ str->l = append? 
str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end < ks->bufsize) ks->is_eof = 1; \ if (ks->end == 0) break; \ } else break; \ } \ if (delimiter == KS_SEP_LINE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == '\n') break; \ } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ } else if (delimiter == KS_SEP_TAB) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ } else i = 0; /* never come to here! */ \ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ ks->begin = i + 1; \ if (i < ks->end) { \ if (dret) *dret = ks->buf[i]; \ break; \ } \ } \ if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ } #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ __KS_BASIC(SCOPE, type_t, __bufsize) \ __KS_GETUNTIL(SCOPE, __read) \ __KS_INLINED(__read) #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) #define KSTREAM_DECLARE(type_t, __read) \ __KS_TYPE(type_t) \ extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ extern kstream_t *ks_init(type_t f); \ extern void ks_destroy(kstream_t *ks); \ __KS_INLINED(__read) /****************** * FASTA/Q parser * ******************/ #define kseq_rewind(ks) ((ks)->last_char = 
(ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) #define __KSEQ_BASIC(SCOPE, type_t) \ SCOPE kseq_t *kseq_init(type_t fd) \ { \ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ SCOPE void kseq_destroy(kseq_t *ks) \ { \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ ks_destroy(ks->f); \ free(ks); \ } /* Return value: >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string */ #define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ int c; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ if (c == -1) return -1; /* end of file */ \ seq->last_char = c; \ } /* else: the first header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ if (c != '+') return seq->seq.l; /* FASTA */ 
\ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ typedef struct { \ kstring_t name, comment, seq, qual; \ int last_char; \ kstream_t *f; \ } kseq_t; #define KSEQ_INIT2(SCOPE, type_t, __read) \ KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(SCOPE, type_t) \ __KSEQ_READ(SCOPE) #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) #define KSEQ_DECLARE(type_t) \ __KS_TYPE(type_t) \ __KSEQ_TYPE(type_t) \ extern kseq_t *kseq_init(type_t fd); \ void kseq_destroy(kseq_t *ks); \ int kseq_read(kseq_t *seq); #endif HINGE-0.5.0/src/include/paf.h000066400000000000000000000033741314415550300155070ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef PAF_PAF_H #define PAF_PAF_H #include #include #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif typedef struct { void *fp; kstring_t buf; } paf_file_t; typedef struct { const char *qn, *tn; // these point to the input string; NOT allocated uint32_t ql, qs, qe, tl, ts, te; uint32_t ml:31, rev:1, bl; } paf_rec_t; #ifdef __cplusplus extern "C" { #endif paf_file_t *paf_open(const char *fn); int paf_close(paf_file_t *pf); int paf_read(paf_file_t *pf, paf_rec_t *r); #ifdef __cplusplus } #endif #endif HINGE-0.5.0/src/layout/000077500000000000000000000000001314415550300144535ustar00rootroot00000000000000HINGE-0.5.0/src/layout/CMakeLists.txt000066400000000000000000000004751314415550300172210ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.2) set(Boost_USE_STATIC_LIBS ON) FIND_PACKAGE( Boost COMPONENTS graph REQUIRED ) INCLUDE_DIRECTORIES( ${Boost_INCLUDE_DIR} ) add_executable(hinging hinging) target_link_libraries(hinging LAInterface ini spdlog ${Boost_LIBRARIES}) install(TARGETS hinging DESTINATION ${libexec}) HINGE-0.5.0/src/layout/hinging.cpp000066400000000000000000002673411314415550300166170ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include "spdlog/spdlog.h" #include "cmdline.h" #include "INIReader.h" #include "DB.h" #include "align.h" #include "LAInterface.h" #include #include #include #include #include #define LAST_READ_SYMBOL '$' 
#define HINGED_EDGE 1 #define UNHINGED_EDGE -1 #define REVERSE_COMPLEMENT_MATCH 1 #define SAME_DIRECTION_MATCH 0 using namespace boost; typedef adjacency_list Graph; typedef std::tuple Edge_w; std::string lastN(std::string input, int n) { return input.substr(input.size() - n); } inline std::vector glob(const std::string& pat){ using namespace std; glob_t glob_result; int i = 1; std::string search_name; search_name = pat + "."+std::to_string(i)+".las"; std::cout << search_name << endl; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); // std::cout << "Number of files " << glob_result.gl_pathc << std::endl; vector ret; while (glob_result.gl_pathc != 0){ ret.push_back(string(glob_result.gl_pathv[0])); i ++; search_name = pat + "."+std::to_string(i)+".las"; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); // std::cout << "Number of files " << glob_result.gl_pathc << std::endl; } std::cout << "-------------------------"<< std::endl; std::cout << "Number of files " << i-1 << std::endl; std::cout << "Input string " << pat.c_str() << std::endl; std::cout << "-------------------------"<< std::endl; globfree(&glob_result); return ret; } bool ProcessAlignment(LOverlap * match, Read * read_A, Read * read_B, int ALN_THRESHOLD, int THETA, int THETA2, bool trim){ //Function takes as input pointers to a match, and the read_A and read_B of that match, set constants //ALN_THRESHOLD and THETA //It inputs the effective read start and end into the match class object //Next it trims match //Finally it figures out the type of match we have here by calling AddTypesAsymmetric() on the //class object //std::cout<<" In ProcessAlignment"<eff_read_A_read_start_ = read_A->effective_start; match->eff_read_A_read_end_ = read_A->effective_end; // removed the following if, so that things agree with the convention for reverse complement matches match->eff_read_B_read_start_ = read_B->effective_start; match->eff_read_B_read_end_ = read_B->effective_end; // if 
(match->reverse_complement_match_ == 0) { // match->eff_read_B_read_start_ = read_B->effective_start; // match->eff_read_B_read_end_ = read_B->effective_end; // } else { // match->eff_read_B_read_start_ = read_B->len - read_B->effective_end; // match->eff_read_B_read_end_ = read_B->len - read_B->effective_start; // } /*printf("bef %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_A_id_, match->read_B_id_, * match->reverse_complement_match_, match->read_A_match_start_, match->read_A_match_end_, match->read_B_match_start_, match->read_B_match_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_ );*/ if (trim) match->trim_overlap(); else { match->eff_read_B_match_start_ = match->read_B_match_start_; match->eff_read_B_match_end_ = match->read_B_match_end_; match->eff_read_A_match_start_ = match->read_A_match_start_; match->eff_read_A_match_end_ = match->read_A_match_end_; } /*printf("aft %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_A_id_, match->read_B_id_, * match->reverse_complement_match_, match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_ );*/ //std::cout<< contained<eff_read_B_match_end_ - match->eff_read_B_match_start_) < ALN_THRESHOLD) or ((match->eff_read_A_match_end_ - match->eff_read_A_match_start_) < ALN_THRESHOLD) or (!match->active)) { match->active = false; match->match_type_ = NOT_ACTIVE; } else { match->AddTypesAsymmetric(THETA,THETA2); if (match->match_type_ == BCOVERA) { contained = true; } //std::cout<< contained<< std::endl; } match->weight = match->eff_read_A_match_end_ - match->eff_read_A_match_start_ + match->eff_read_B_match_end_ - match->eff_read_B_match_start_; match->length = match->read_A_match_end_ - match->read_A_match_start_ + match->read_B_match_end_ - 
match->read_B_match_start_; return contained; } class Hinge { public: int pos; int type; // 1, -1 bool active; Hinge(int pos, int t, bool active):pos(pos),type(t), active(active) {}; Hinge():pos(0),type(1), active(true) {}; }; // if we uncomment this, we need to make sure it works with the new convention of B_match_start and // B_match_end for reverse complement matches //bool isValidHinge(LOverlap *match, std::vector &read_hinges){ // //Returns true if read_hinges (a vector of all hinges corresponding to a read ) // // has a hinge of appropriate type within tolerance from positions of start of the // // overlap on read_B of the overlap given. // int tolerance=100;//TODO put as #define // int position=match->eff_read_B_match_start_; // parei aqui // int type; //TODO : Make enum // if (match->match_type_==FORWARD_INTERNAL) // type=1; // else if (match->match_type_==BACKWARD_INTERNAL) // type=-1; // // if (match->reverse_complement_match_==1){ // type=-type; // position=match->eff_read_B_match_end_; // } // // bool valid=false; // for (int index=0; index < read_hinges.size(); index++) { // if ((abs(position - read_hinges[index].pos) < tolerance) and (type == read_hinges[index].type)) // valid = true; // return valid; // } //} void PrintOverlapToFile(FILE * file_pointer, LOverlap * match) { int direction = match->reverse_complement_match_; int hinged; if ((match->match_type_ == FORWARD) or (match->match_type_ == BACKWARD)) hinged = UNHINGED_EDGE; else if ((match->match_type_ == FORWARD_INTERNAL) or (match->match_type_ == BACKWARD_INTERNAL)) hinged = HINGED_EDGE; if ((match->match_type_ == FORWARD_INTERNAL) or (match->match_type_ == FORWARD)) { fprintf(file_pointer, "%d %d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_A_id_, match->read_B_id_, match->length, 0, direction, hinged, match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_read_start_, 
match->eff_read_A_read_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_, match->read_A_match_start_, match->read_A_match_end_, match->read_B_match_start_, match->read_B_match_end_ ); } else if ((match->match_type_ == BACKWARD_INTERNAL) or (match->match_type_ == BACKWARD)){ fprintf(file_pointer, "%d %d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_B_id_, match->read_A_id_, match->length, direction, 0, hinged, match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_, match->read_A_match_start_, match->read_A_match_end_, match->read_B_match_start_, match->read_B_match_end_ ); } } void PrintOverlapToFile2(FILE * file_pointer, LOverlap * match, int hinge_pos) { int direction = match->reverse_complement_match_; int hinged; // if ((match->match_type_ == FORWARD) or (match->match_type_ == BACKWARD)) // hinged = UNHINGED_EDGE; // // else if ((match->match_type_ == FORWARD_INTERNAL) or (match->match_type_ == BACKWARD_INTERNAL)) // hinged = HINGED_EDGE; // if ((match->match_type_ == FORWARD) or (match->match_type_ == BACKWARD)) // hinged = 0; // else if (match->match_type_ == FORWARD_INTERNAL) // hinged = 1; // else if (match->match_type_ == BACKWARD_INTERNAL) // hinged = -1; if (match->match_type_ == FORWARD) { fprintf(file_pointer, "%d %d %d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_A_id_, match->read_B_id_, match->length, 0, direction, 0, -1, // hinge pos match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_); } else if (match->match_type_ == BACKWARD) { fprintf(file_pointer, "%d %d %d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", 
match->read_B_id_, match->read_A_id_, match->length, direction, 0, 0, -1, // hinge pos match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_); } else if (match->match_type_ == FORWARD_INTERNAL) { fprintf(file_pointer, "%d %d %d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_A_id_, match->read_B_id_, match->length, 0, direction, 1, // hinged forward hinge_pos, match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_); } else if (match->match_type_ == BACKWARD_INTERNAL) { fprintf(file_pointer, "%d %d %d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_B_id_, match->read_A_id_, match->length, direction, 0, -1, // hinged backward hinge_pos, match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_); } } void GetAlignment ( LAInterface &la, std::vector & reads, std::vector > > & idx_ab, std::vector> & matches_forward, std::vector>& matches_backward, int n_read, const char *name_db, const char *name_las_base, const char *name_paf, bool mult_las, int ALN_THRESHOLD, int THETA, int THETA2, bool USE_TWO_MATCHES, int64 n_aln_full, const std::shared_ptr console, std::string name_maximal_reads, bool KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS ){ std::ifstream max_reads_file(name_maximal_reads); n_aln_full = 0; int num_active_reads(0); int64 n_aln_kept_full(0); int64 n_rev_aln_full(0); int64 n_rev_aln_kept_full(0); std::string name_las_string; console->info("Multiple las files: {}", mult_las); if 
(strlen(name_paf) > 0) console->info("Loading from paf: {}", name_paf); if (strlen(name_las_base) > 0) { if (mult_las) name_las_string = std::string(name_las_base); else { if (lastN(std::string(name_las_base), 4) == ".las") name_las_string = std::string(name_las_base); else name_las_string = std::string(name_las_base) + ".las"; } } n_aln_full = 0; const char * name_las = name_las_string.c_str(); std::vector name_las_list; std::string name_las_str(name_las); console->info("Las files: {}", name_las_str); if (mult_las and strlen(name_las_base) > 0) { console->info("Calling glob."); name_las_list = glob(name_las_str); } else if (strlen(name_las_base) > 0) name_las_list.push_back(name_las_str); else{ name_las_str = std::string(name_paf); name_las_list.push_back(name_las_str); } console->info("number of las files: {}", name_las_list.size()); std::vector maximal_read; maximal_read.resize(n_read, false); std::string read_line; while(std::getline(max_reads_file, read_line)) { int read_number; read_number = atoi(read_line.c_str()); maximal_read[read_number] = true; num_active_reads++; } console->info("Total number of active reads: {}/{}", num_active_reads, n_read); for (int i = 0; i < n_read; i++){ reads[i]->active = (reads[i]->active) and (maximal_read[i]); } int number_of_parts; if (strlen(name_las) > 0) number_of_parts = name_las_list.size(); else if(strlen(name_paf) > 0) number_of_parts = 1; else { console->error("Need to provide either las and db or paf and fasta"); } for (int part = 0; part < number_of_parts; part++) { if (strlen(name_las_base) > 0) { console->info("name of las: {}", name_las_list[part]); if (strlen(name_las_list[part].c_str()) > 0) la.openAlignmentFile(name_las_list[part]); } int64 n_aln = 0; int64 n_aln_accept = 0; int64 n_aln_rcomp_accept = 0; std::vector aln;//Vector of pointers to all alignments if (strlen(name_las_base) > 0) { if (strlen(name_las_list[part].c_str()) > 0) { n_aln = la.getAlignmentNumber(); console->info("Load alignments from {}", 
name_las_list[part]); console->info("# Alignments: {}", n_aln); } if (strlen(name_las_list[part].c_str()) > 0) { la.resetAlignment(); la.getOverlap(aln, 0, n_read); } } if (strlen(name_paf) > 0){ n_aln = la.loadPAF(std::string(name_paf), aln); console->info("Load alignments from {}", name_paf); console->info("# Alignments: {}", n_aln); } int r_begin = aln.front()->read_A_id_; int r_end = aln.back()->read_A_id_; int num_active_reads_part (0); for (int i = r_begin; i <= r_end; i++) { if (reads[i]->active) num_active_reads_part++; } console->info("# reads: {}", r_end-r_begin+1); console->info("# active reads: {}/{}",num_active_reads_part, r_end-r_begin+1); console->info("Input data finished, part {}/{}", part + 1, name_las_list.size()); for (int i = 0; i < aln.size(); i++) { if (aln[i]->read_A_id_ == aln[i]->read_B_id_) { aln[i]->active = false; } if ((reads[aln[i]->read_A_id_]->active) and ((reads[aln[i]->read_B_id_]->active) and KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS)) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_] = std::vector(); n_aln_accept++; n_aln_rcomp_accept += aln[i]->reverse_complement_match_; } } for (int i = 0; i < aln.size(); i++) { if ((reads[aln[i]->read_A_id_]->active) and ((reads[aln[i]->read_B_id_]->active) and KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS)) idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_].push_back(aln[i]); } int n_overlaps = 0; int n_rev_overlaps = 0; for (int i = 0; i < aln.size(); i++) { n_overlaps++; n_rev_overlaps += aln[i]->reverse_complement_match_; } for (int i = 0; i < aln.size(); i++) { if ( not ((reads[aln[i]->read_A_id_]->active) and ((reads[aln[i]->read_B_id_]->active) and KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS))) if (strlen(name_las_base) > 0) delete aln[i]; } console->info("kept {}/{} overlaps, {}/{} rev_overlaps in part {}/{}",n_aln_accept, n_overlaps, n_aln_rcomp_accept, n_rev_overlaps, part + 1, name_las_list.size()); n_aln_full += n_aln; n_aln_kept_full += n_aln_accept; n_rev_aln_full += n_rev_overlaps; 
n_rev_aln_kept_full += n_aln_rcomp_accept; console->info("index finished"); for (int i = r_begin; i <= r_end; i++) { bool contained = false; //std::cout<< "Testing opt " << i << std::endl; if (reads[i]->active == false) { continue; } int containing_read; for (std::unordered_map >::iterator it = idx_ab[i].begin(); it != idx_ab[i].end(); it++) { std::sort(it->second.begin(), it->second.end(), compare_overlap);//Sort overlaps by lengths //std::cout<<"Giving input to ProcessAlignment "<second.size() <second.size() > 0) { //Figure out if read is contained LOverlap *ovl = it->second[0]; bool contained_alignment; if (strlen(name_db) > 0) contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, true); else contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, false); if (contained_alignment == true) { containing_read = ovl->read_B_id_; } if (reads[ovl->read_B_id_]->active == true) contained = contained or contained_alignment; //Filter matches that matter. 
//TODO Figure out a way to do this more efficiently if ((ovl->match_type_ == FORWARD) or (ovl->match_type_ == FORWARD_INTERNAL)) matches_forward[i].push_back(it->second[0]); else if ((ovl->match_type_ == BACKWARD) or (ovl->match_type_ == BACKWARD_INTERNAL)) matches_backward[i].push_back(it->second[0]); } if ((it->second.size() > 1) and (USE_TWO_MATCHES)) { //Figure out if read is contained LOverlap *ovl = it->second[1]; bool contained_alignment; if (strlen(name_db) > 0) contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, true); else contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, false); if (contained_alignment == true) { containing_read = ovl->read_B_id_; } if (reads[ovl->read_B_id_]->active == true) contained = contained or contained_alignment; //Filter matches that matter. //TODO Figure out a way to do this more efficiently if ((ovl->match_type_ == FORWARD) or (ovl->match_type_ == FORWARD_INTERNAL)) matches_forward[i].push_back(it->second[1]); else if ((ovl->match_type_ == BACKWARD) or (ovl->match_type_ == BACKWARD_INTERNAL)) matches_backward[i].push_back(it->second[1]); } } if (contained) { std::cout << "[contained] Should not happen" << std::endl; reads[i]->active = false; } } } console->info("kept {}/{} overlaps, {}/{} rev_overlaps in {} part(s)", n_aln_kept_full, n_aln_full, n_rev_aln_kept_full, n_rev_aln_full, name_las_list.size()); } int main(int argc, char *argv[]) { mkdir("log",S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); cmdline::parser cmdp; cmdp.add("db", 'b', "db file name", false, ""); cmdp.add("las", 'l', "las file name", false, ""); cmdp.add("paf", 'p', "paf file name", false, ""); cmdp.add("config", 'c', "configuration file name", false, ""); cmdp.add("fasta", 'f', "fasta file name", false, ""); cmdp.add("prefix", 'x', "(intermediate output) input file prefix", true, ""); cmdp.add("out", 'o', "final output 
file name", true, ""); cmdp.add("log", 'g', "log folder name", false, "log"); cmdp.add("debug", '\0', "debug mode"); cmdp.add("mlas", '\0', "multiple las files"); // cmdp.add("restrictreads",'r',"restrict to reads in the file",false,""); cmdp.parse_check(argc, argv); LAInterface la; const char *name_db = cmdp.get("db").c_str(); //.db file of reads to load const char *name_las = cmdp.get("las").c_str();//.las file of alignments const char *name_paf = cmdp.get("paf").c_str(); const char *name_fasta = cmdp.get("fasta").c_str(); const char *name_config = cmdp.get("config").c_str();//name of the configuration file, in INI format std::string out = cmdp.get("prefix"); std::string out_name = cmdp.get("out"); // const char * name_restrict = cmdp.get("restrictreads").c_str(); std::string name_mask = out + ".mas"; std::string name_max = out + ".max"; std::string name_homo = out + ".homologous.txt"; std::string name_rep = out + ".repeat.txt"; std::string name_hg = out + ".hinges.txt"; std::string name_cov = out + ".coverage.txt"; std::string name_garbage = out + ".garbage.txt"; std::string name_contained = out + ".contained.txt"; std::string name_deadend = out_name + ".deadends.txt"; std::ofstream deadend_out(name_deadend); std::ofstream garbage_out(name_garbage); std::ifstream homo(name_homo); std::vector homo_reads; // bool delete_telomere = false; // TODO: command line option to set this true int read_id; while (homo >> read_id) homo_reads.push_back(read_id); namespace spd = spdlog; //auto console = spd::stdout_logger_mt("console"); std::vector sinks; sinks.push_back(std::make_shared()); sinks.push_back( std::make_shared(cmdp.get("log") + "/log", "txt", 23, 59)); auto console = std::make_shared("log", std::begin(sinks), std::end(sinks)); spdlog::register_logger(console); console->info("Hinging layout"); bool mult_las; mult_las = cmdp.exist("mlas"); console->info("name of db: {}, name of .las file {}", name_db, name_las); console->info("name of fasta: {}, name of .paf file 
{}", name_fasta, name_paf); console->info("filter files prefix: {}", out); console->info("output prefix: {}", out_name); console->info("Multiple las files: {}", mult_las); console->info("Multiple las files: {}", cmdp.exist("mlas")); bool db_and_las, db_or_las, fa_and_paf, fa_or_paf; db_and_las = (strlen(name_db) > 0) and (strlen(name_las) > 0); db_or_las = (strlen(name_db) > 0) or (strlen(name_las) > 0); fa_and_paf = (strlen(name_fasta) > 0) and (strlen(name_paf) > 0); fa_or_paf = (strlen(name_fasta) > 0) or (strlen(name_paf) > 0); if (db_or_las and fa_or_paf){ console->error("Pass in either a db and a las or a fasta and a paf"); return 1; } if (( not fa_and_paf) and (not db_and_las)){ console->error("Pass in at least one of the following two combinations: a db and a las or a fasta and a paf"); return 1; } if (cmdp.exist("mlas")) { if (not db_and_las) { console->error("--mlas works only with db and las"); return 1; } } std::ifstream ini_file(name_config); std::string str((std::istreambuf_iterator(ini_file)), std::istreambuf_iterator()); console->info("Parameters passed in \n{}", str); if (strlen(name_db) > 0) la.openDB(name_db); int64 n_aln = 0; int n_read; if (strlen(name_db) > 0) n_read = la.getReadNumber(); std::vector reads; //Vector of pointers to all reads if (strlen(name_fasta) > 0) { n_read = la.loadFASTA(name_fasta, reads); } console->info("# Reads: {}", n_read); // output some statistics //// if (strlen(name_paf) > 0) { // n_aln = la.loadPAF(std::string(name_paf), aln); // console->info("Load alignments from {}", name_paf); // console->info("# Alignments: {}", n_aln); // } // if (n_aln == 0) { // console->error("No alignments!"); // return 1; // } if (strlen(name_db) > 0) { la.getRead(reads, 0, n_read); } console->info("Input data finished"); INIReader reader(name_config); if (reader.ParseError() < 0) { console->warn("Can't load {}", name_config); return 1; } int LENGTH_THRESHOLD = int(reader.GetInteger("filter", "length_threshold", -1)); double 
QUALITY_THRESHOLD = reader.GetReal("filter", "quality_threshold", 0.0); int N_ITER = (int) reader.GetInteger("filter", "n_iter", -1); int ALN_THRESHOLD = (int) reader.GetInteger("filter", "aln_threshold", -1); int MIN_COV = (int) reader.GetInteger("filter", "min_cov", -1); int CUT_OFF = (int) reader.GetInteger("filter", "cut_off", -1); int THETA = (int) reader.GetInteger("filter", "theta", -1); int THETA2 = (int) reader.GetInteger("filter", "theta2", 0); int N_PROC = (int) reader.GetInteger("running", "n_proc", 4); int HINGE_SLACK = (int) reader.GetInteger("layout", "hinge_slack", 1000); //This is the amount by which a forward overlap //must be longer than a forward internal overlap to be preferred while //building a graph. int HINGE_TOLERANCE = (int) reader.GetInteger("layout", "hinge_tolerance", 150); //This is how far an overlap must start from a hinge to be considered an internal //overlap. int KILL_HINGE_OVERLAP_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_overlap", 300); int KILL_HINGE_INTERNAL_ALLOWANCE = (int) reader.GetInteger("layout", "kill_hinge_internal", 40); int MATCHING_HINGE_SLACK = (int) reader.GetInteger("layout", "matching_hinge_slack", 200); int NUM_EVENTS_TELOMERE = (int) reader.GetInteger("layout", "num_events_telomere", 7); int MIN_CONNECTED_COMPONENT_SIZE = (int) reader.GetInteger("layout", "min_connected_component_size", 8); bool USE_TWO_MATCHES = (int) reader.GetInteger("layout", "use_two_matches", 1); bool KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS = (int) reader.GetInteger("layout", "keep_only_matches_between_maximal_reads", 1); bool delete_telomere = (int) reader.GetInteger("layout", "del_telomere", 0); console->info("LENGTH_THRESHOLD = {}", LENGTH_THRESHOLD); console->info("QUALITY_THRESHOLD = {}", QUALITY_THRESHOLD); console->info("ALN_THRESHOLD = {}", ALN_THRESHOLD); console->info("MIN_COV = {}", MIN_COV); console->info("CUT_OFF = {}", CUT_OFF); console->info("THETA = {}", THETA); console->info("N_ITER = {}", N_ITER); 
console->info("THETA2 = {}", THETA2); console->info("N_PROC = {}", N_PROC); console->info("HINGE_SLACK = {}", HINGE_SLACK); console->info("HINGE_TOLERANCE = {}", HINGE_TOLERANCE); console->info("KILL_HINGE_OVERLAP_ALLOWANCE = {}", KILL_HINGE_OVERLAP_ALLOWANCE); console->info("KILL_HINGE_INTERNAL_ALLOWANCE = {}", KILL_HINGE_INTERNAL_ALLOWANCE); console->info("MATCHING_HINGE_SLACK = {}", MATCHING_HINGE_SLACK); console->info("MIN_CONNECTED_COMPONENT_SIZE = {}", MIN_CONNECTED_COMPONENT_SIZE); console->info("USE_TWO_MATCHES = {}", USE_TWO_MATCHES); console->info("del_telomeres = {}", delete_telomere); omp_set_num_threads(N_PROC); //std::vector< std::vector* > > idx2(n_read); // unordered_map from (aid) to alignments in a vector std::vector edgelist, edgelist_ms; // save output to edgelist //std::unordered_map >idx3,idx4; // this is the pileup std::vector > > idx_ab; /* idx is a vector of length n_read, each element idx3[read A id] is a map, from read B id to a vector of overlaps */ //std::vector> idx2; /* idx2 is a vector of length n_read, each element idx2[read A id] is a vector, for each read B, we put the best overlap into that vector */ //std::vector> idx3; /* idx3 is a vector of length n_read, each element idx3[read A id] is a map, from read read B id to the best overlap of read A and read B */ std::vector> matches_forward, matches_backward; //matches_forward is the vector of vectors where matches_forward[read_id] is a vector of matches of read_id //of type FORWARD, and FORWARD_INTERNAL //matches_backward is the vector of vectors where matches_backward[read_id] is a vector of matches of read_id //of type BACKWARD, and BACKWARD_INTERNAL std::vector> edges_forward, edges_backward; // edges_forward is a "filtered" version of matches_forward, where every (active) read has at exactly // one outgoing match // edges_backward is a "filtered" version of matches_backward, where every (active) read has at exactly // one incoming match std::vector> intersection_edges_forward, 
intersection_edges_backward; //Stores the intersection of edges constructing the intersection list of edges FILE *mask_file; mask_file = fopen(name_mask.c_str(), "r"); int read, rs, re; while (fscanf(mask_file, "%d %d %d", &read, &rs, &re) != EOF) { reads[read]->effective_start = rs; reads[read]->effective_end = re; } console->info("read mask finished"); FILE *repeat_file; repeat_file = fopen(name_rep.c_str(), "r"); FILE *hinge_file; hinge_file = fopen(name_hg.c_str(), "r"); char *line = NULL; size_t len = 0; std::unordered_map>> marked_repeats; int telomere_cnt = 0; while (getline(&line, &len, repeat_file) != -1) { std::stringstream ss; ss.clear(); ss << line; int num; ss >> num; //printf("%d\n",num); marked_repeats[num] = std::vector>(); int r1 = 0, r2 = 0; while (!ss.eof()) { r1 = 0; r2 = 0; ss >> r1 >> r2; if ((r1 != 0) and (r2 != 0)) { //printf("[%d %d]\n", r1, r2); marked_repeats[num].push_back(std::pair(r1, r2)); } } ss.clear(); if ((delete_telomere) and (marked_repeats[num].size() > NUM_EVENTS_TELOMERE)) { reads[num]->active = false; telomere_cnt++; } } fclose(repeat_file); console->info("read marked repeats"); console->info("killed {} reads with many repeats",telomere_cnt); std::unordered_map>> marked_hinges; while (getline(&line, &len, hinge_file) != -1) { std::stringstream ss; ss << line; int num; ss >> num; //printf("%d\n",num); marked_hinges[num] = std::vector>(); int r1 = 0, r2 = 0; while (!ss.eof()) { r1 = 0; r2 = 0; ss >> r1 >> r2; if ((r1 != 0) and (r2 != 0)) { //printf("[%d %d]\n", r1, r2); marked_hinges[num].push_back(std::pair(r1, r2)); } } ss.clear(); } fclose(hinge_file); console->info("read marked hinges"); if (line) free(line); int num_active_read = 0; //This seems to be an unnecessary stub for (int i = 0; i < n_read; i++) { if (reads[i]->active) num_active_read++; } console->info("active reads: {}", num_active_read); num_active_read = 0; for (int i = 0; i < n_read; i++) { if (reads[i]->effective_end - reads[i]->effective_start < 
LENGTH_THRESHOLD) { reads[i]->active = false; garbage_out << i << std::endl; } else num_active_read++; } console->info("active reads: {}", num_active_read); for (int i = 0; i < n_read; i++) { //An initialisation for loop //TODO Preallocate memory. Much more efficient. idx_ab.push_back(std::unordered_map >()); //idx2.push_back(std::vector()); matches_forward.push_back(std::vector()); matches_backward.push_back(std::vector()); edges_forward.push_back(std::vector()); edges_backward.push_back(std::vector()); intersection_edges_forward.push_back(std::vector()); intersection_edges_backward.push_back(std::vector()); } //int num_finished = 0; int num_overlaps = 0; int num_forward_overlaps(0), num_forward_internal_overlaps(0), num_reverse_overlaps(0), num_reverse_internal_overlaps(0), rev_complemented_matches(0); //# pragma omp parallel for GetAlignment ( la, reads, idx_ab, matches_forward, matches_backward, n_read, name_db, name_las, name_paf, mult_las, ALN_THRESHOLD, THETA, THETA2, USE_TWO_MATCHES, n_aln, console, name_max, KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS); for (int i = 0; i < n_read; i++) {//Isn't this just 0 or 1? 
num_overlaps += matches_forward[i].size() + matches_backward[i].size(); for (int j = 0; j < matches_forward[i].size(); j++) rev_complemented_matches += matches_forward[i][j]->reverse_complement_match_; for (int j = 0; j < matches_backward[i].size(); j++) rev_complemented_matches += matches_backward[i][j]->reverse_complement_match_; } console->info("{} overlaps", num_overlaps); console->info("{} rev overlaps", rev_complemented_matches); num_active_read = 0; for (int i = 0; i < n_read; i++) { if (reads[i]->active) { num_active_read++; } } console->info("removed contained reads, active reads: {}", num_active_read); num_active_read = 0; for (int i = 0; i < n_read; i++) { if (reads[i]->active) num_active_read++; } console->info("active reads: {}", num_active_read); num_overlaps = 0; num_forward_overlaps = 0; num_forward_internal_overlaps = 0; num_reverse_overlaps = 0; num_reverse_internal_overlaps = 0; rev_complemented_matches = 0; int rev_complemented_fwd_matches(0), rev_complemented_bck_matches(0), rev_complemented_fwd_int_matches(0), rev_complemented_bck_int_matches(0); for (int i = 0; i < n_read; i++) { if (reads[i]->active) { for (int j = 0; j < matches_forward[i].size(); j++) { if (reads[matches_forward[i][j]->read_B_id_]->active) { num_overlaps++; if (matches_forward[i][j]->match_type_ == FORWARD) { num_forward_overlaps++; rev_complemented_fwd_matches += matches_forward[i][j]->reverse_complement_match_; } else if (matches_forward[i][j]->match_type_ == FORWARD_INTERNAL) { num_forward_internal_overlaps++; rev_complemented_fwd_int_matches += matches_forward[i][j]->reverse_complement_match_; } if (matches_forward[i][j]->reverse_complement_match_ == 1) rev_complemented_matches++; } } //std::cout <<"First for done "<read_B_id_]->active) { num_overlaps++; if (matches_backward[i][j]->match_type_ == BACKWARD) { num_reverse_overlaps++; rev_complemented_bck_matches += matches_backward[i][j]->reverse_complement_match_; } else if (matches_backward[i][j]->match_type_ == 
BACKWARD_INTERNAL) { num_reverse_internal_overlaps++; rev_complemented_bck_int_matches += matches_backward[i][j]->reverse_complement_match_; } if (matches_backward[i][j]->reverse_complement_match_ == 1) rev_complemented_matches++; } } } } /*std::cout<active) { std::sort(matches_forward[i].begin(), matches_forward[i].end(), compare_overlap_weight); std::sort(matches_backward[i].begin(), matches_backward[i].end(), compare_overlap_weight); } } // temporary FILE *G_out; G_out = fopen("edges.g_out.txt", "w"); for (int i = 0; i < n_read; i++) { if (reads[i]->active) { for (int j = 0; j < matches_forward[i].size(); j++) { if (reads[matches_forward[i][j]->read_B_id_]->active) { fprintf(G_out, "%d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d] \n", matches_forward[i][j]->read_A_id_, matches_forward[i][j]->read_B_id_, matches_forward[i][j]->length, matches_forward[i][j]->reverse_complement_match_, matches_forward[i][j]->match_type_, matches_forward[i][j]->eff_read_A_match_start_, matches_forward[i][j]->eff_read_A_match_end_, matches_forward[i][j]->eff_read_B_match_start_, matches_forward[i][j]->eff_read_B_match_end_, matches_forward[i][j]->eff_read_A_read_start_, matches_forward[i][j]->eff_read_A_read_end_, matches_forward[i][j]->eff_read_B_read_start_, matches_forward[i][j]->eff_read_B_read_end_); break; } } } } fprintf(G_out, "bkw\n"); for (int i = 0; i < n_read; i++) { if (reads[i]->active) { for (int j = 0; j < matches_backward[i].size(); j++) { if (reads[matches_backward[i][j]->read_B_id_]->active) { fprintf(G_out, "%d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d] \n", matches_backward[i][j]->read_A_id_, matches_backward[i][j]->read_B_id_, matches_backward[i][j]->length, matches_backward[i][j]->reverse_complement_match_, matches_backward[i][j]->match_type_, matches_backward[i][j]->eff_read_A_match_start_, matches_backward[i][j]->eff_read_A_match_end_, matches_backward[i][j]->eff_read_B_match_start_, matches_backward[i][j]->eff_read_B_match_end_, 
matches_backward[i][j]->eff_read_A_read_start_, matches_backward[i][j]->eff_read_A_read_end_, matches_backward[i][j]->eff_read_B_read_start_, matches_backward[i][j]->eff_read_B_read_end_); break; } } } } FILE *out_backup; out_backup = fopen("edges.fwd.backup.txt", "w"); for (int i = 0; i < n_read; i++) { if (reads[i]->active) for (int j = 0; j < matches_forward[i].size(); j++) { if (reads[matches_forward[i][j]->read_B_id_]->active) fprintf(out_backup, "%d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d] \n", matches_forward[i][j]->read_A_id_, matches_forward[i][j]->read_B_id_, matches_forward[i][j]->length, matches_forward[i][j]->reverse_complement_match_, matches_forward[i][j]->match_type_, matches_forward[i][j]->eff_read_A_match_start_, matches_forward[i][j]->eff_read_A_match_end_, matches_forward[i][j]->eff_read_B_match_start_, matches_forward[i][j]->eff_read_B_match_end_, matches_forward[i][j]->eff_read_A_read_start_, matches_forward[i][j]->eff_read_A_read_end_, matches_forward[i][j]->eff_read_B_read_start_, matches_forward[i][j]->eff_read_B_read_end_); } } fclose(out_backup); out_backup = fopen("edges.bkw.backup.txt", "w"); for (int i = 0; i < n_read; i++) { if (reads[i]->active) for (int j = 0; j < matches_backward[i].size(); j++) { if (reads[matches_backward[i][j]->read_B_id_]->active) fprintf(out_backup, "%d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d] \n", matches_backward[i][j]->read_A_id_, matches_backward[i][j]->read_B_id_, matches_backward[i][j]->length, matches_backward[i][j]->reverse_complement_match_, matches_backward[i][j]->match_type_, matches_backward[i][j]->eff_read_A_match_start_, matches_backward[i][j]->eff_read_A_match_end_, matches_backward[i][j]->eff_read_B_match_start_, matches_backward[i][j]->eff_read_B_match_end_, matches_backward[i][j]->eff_read_A_read_start_, matches_backward[i][j]->eff_read_A_read_end_, matches_backward[i][j]->eff_read_B_read_start_, matches_backward[i][j]->eff_read_B_read_end_); } } fclose(out_backup); FILE *out_g1; FILE 
*out_g2; FILE *out_hg; FILE *out_hg2; FILE *out_greedy; FILE *out_skipped; out_g1 = fopen((std::string(out_name) + ".edges.1").c_str(), "w"); out_g2 = fopen((std::string(out_name) + ".edges.2").c_str(), "w"); // Output files for edges out_hg = fopen((std::string(out_name) + ".edges.hinges").c_str(), "w"); out_hg2 = fopen((std::string(out_name) + ".edges.hinges2").c_str(), "w"); out_greedy = fopen((std::string(out_name) + ".edges.greedy").c_str(), "w"); out_skipped = fopen((std::string(out_name) + ".edges.skipped").c_str(), "w"); // All hinges ikmported from the hinges.txt file std::unordered_map > hinges_vec; // Hinges that we were previously killed in filter.cpp due to bridging std::unordered_map > killed_hinges_vec; // Hinges that will be killed for being matched with a hinge in killed_hinges_vec std::unordered_map > new_killed_hinges_vec; int n = 0; int kh = 0; for (int i = 0; i < n_read; i++) { hinges_vec[i] = std::vector(); std::set > surviving_hinges(marked_hinges[i].begin(), marked_hinges[i].end()); for (int j = 0; j < marked_hinges[i].size(); j++) { hinges_vec[i].push_back(Hinge(marked_hinges[i][j].first, marked_hinges[i][j].second, true)); if (reads[i]->active) { n++; } } for (int j = 0; j < marked_repeats[i].size(); j++) { if (surviving_hinges.find(marked_repeats[i][j]) == surviving_hinges.end()) { killed_hinges_vec[i].push_back(Hinge(marked_repeats[i][j].first, marked_repeats[i][j].second, false)); if (reads[i]->active) { kh++; } } } } console->info("{} killed hinges", kh); console->info("{} hinges", n); std::ofstream killed_out(out + ".killed.hinges"); for (int i = 0; i < n_read; i++) { killed_out << i << " "; for (int j = 0; j < killed_hinges_vec[i].size(); j++) { killed_out << killed_hinges_vec[i][j].type << " " << killed_hinges_vec[i][j].pos << " "; } killed_out << std::endl; } n = 0; for (int i = 0; i < n_read; i++) { for (int j = 0; j < hinges_vec[i].size(); j++) { if ((reads[i]->active) and (hinges_vec[i][j].active)) n++; } } console->info("{} 
active hinges", n); /** * Switch to naive hinge filtering * Keep the hinge only if there are HINGE_READS reads that start near the hinge and continue to the end of the read */ /*int HINGE_READS = 1; for (int i = 0; i < n_read; i++) { for (int j = 0; j < hinges_vec[i].size(); j++) { int num_near_hinge_reads = 0; if ((reads[i]->active) and (hinges_vec[i][j].active) and (hinges_vec[i][j].type == 1)) { // count reads that start near the hinge and continue to the end of the read printf("read %d hinge %d type %d pos %d ", i, j, 1, hinges_vec[i][j].pos); num_near_hinge_reads = 0; for (int k = 0; k < matches_forward[i].size(); k ++ ) { if ((matches_forward[i][k]->match_type_ == FORWARD) and (reads[matches_forward[i][k]->read_B_id_]->active) and abs((matches_forward[i][k]->eff_read_A_match_start_ - hinges_vec[i][j].pos ) < 300)) num_near_hinge_reads ++; } printf("num %d\n", num_near_hinge_reads); } else if ((reads[i]->active) and (hinges_vec[i][j].active) and (hinges_vec[i][j].type == -1)) { printf("read %d hinge %d type %d pos %d ", i, j, -1, hinges_vec[i][j].pos); num_near_hinge_reads = 0; for (int k = 0; k < matches_backward[i].size(); k ++ ) { if ((matches_backward[i][k]->match_type_ == BACKWARD) and (reads[matches_backward[i][k]->read_B_id_]->active) and (abs(matches_backward[i][k]->eff_read_A_match_end_ - hinges_vec[i][j].pos ) < 300)) num_near_hinge_reads ++; } printf("num %d\n", num_near_hinge_reads); } //if (num_near_hinge_reads != HINGE_READS) hinges_vec[i][j].active = false; } }*/ // TODO: Technically we dont need this filtering, as we can use the hinge graph // construction to do the filtering as well for (int i = 0; i < n_read; i++) { //This is in essence the filtering step //For each read find the best forward match, and remove all incoming hinges starting after the start //of the match corresponding to this. 
//Update 2/19: Now, we remove any in-hinge (out-hinge) if there is a FORWARD or FORWARD_INTERNAL match // (BACKWARD or BACKWARD_INTERNAL) that starts on or before (after) the hinge. 40 is error margin. if (reads[i]->active) { int forward = 0; int backward = 0; for (int j = 0; j < matches_forward[i].size(); j++) { if (matches_forward[i][j]->active) { if (((matches_forward[i][j]->match_type_ == FORWARD) or (matches_forward[i][j]->match_type_ == FORWARD_INTERNAL)) and (reads[matches_forward[i][j]->read_B_id_]->active)) { for (int k = 0; k < hinges_vec[i].size(); k++) { if ((((matches_forward[i][j]->eff_read_A_match_start_ < hinges_vec[i][k].pos + KILL_HINGE_INTERNAL_ALLOWANCE) and (matches_forward[i][j]->match_type_ == FORWARD_INTERNAL)) or ((matches_forward[i][j]->eff_read_A_match_start_ < hinges_vec[i][k].pos - KILL_HINGE_OVERLAP_ALLOWANCE) and (matches_forward[i][j]->match_type_ == FORWARD))) and (hinges_vec[i][k].type == 1)) { hinges_vec[i][k].active = false; } } //} //forward++; } } } for (int j = 0; j < matches_backward[i].size(); j++) { if (matches_backward[i][j]->active) { if (((matches_backward[i][j]->match_type_ == BACKWARD) or (matches_backward[i][j]->match_type_ == BACKWARD_INTERNAL)) and (reads[matches_backward[i][j]->read_B_id_]->active)) { // if (backward < 1) { //remove certain hinges for (int k = 0; k < hinges_vec[i].size(); k++) { if ((((matches_backward[i][j]->eff_read_A_match_end_ > hinges_vec[i][k].pos - KILL_HINGE_INTERNAL_ALLOWANCE) and (matches_backward[i][j]->match_type_ == BACKWARD_INTERNAL)) or ((matches_backward[i][j]->eff_read_A_match_end_ > hinges_vec[i][k].pos + KILL_HINGE_OVERLAP_ALLOWANCE) and (matches_backward[i][j]->match_type_ == BACKWARD))) and (hinges_vec[i][k].type == -1)) { hinges_vec[i][k].active = false; } } //} //backward++; } } } } } console->info("Building hinge graph"); //ogdf::Graph hinge_graph; //ogdf::HashArray hinge_graph_node_list; int num_hinges(0); for (int i = 0; i < n_read; i++) { num_hinges+=hinges_vec[i].size(); 
} //ogdf::Graph hinge_graph; //ogdf::HashArray hinge_graph_node_list; console->info("num hinges {}", num_hinges); Graph hinge_graph (num_hinges); int hg(0); std::map< std::pair , int> hinge_graph_node_map; std::map > hinge_graph_node_map_rev; for (int i=0; i< hinges_vec.size(); i++){ for(int j=0; j < hinges_vec[i].size(); j++){ hinge_graph_node_map[std::make_pair(i,j)]=hg; hinge_graph_node_map_rev[hg]= std::make_pair(i,j); hg++; } } // Hinge graph construction // En passant, we identify the new_killed_hinges FILE *out_hgraph; out_hgraph = fopen((std::string(out_name) + ".hgraph").c_str(), "w"); FILE *out_debug; out_debug = fopen((std::string(out_name) + ".debug").c_str(), "w"); FILE * OverlapDebugFile; OverlapDebugFile = fopen("overlap_debug.txt", "w"); int pos_B; for (int i = 0; i < n_read; i++) { if (reads[i]->active) { for (int k = 0; k < hinges_vec[i].size(); k++) { for (int j = 0; j < matches_forward[i].size(); j++) { if (matches_forward[i][j]->active) { if (((matches_forward[i][j]->match_type_ == FORWARD) or (matches_forward[i][j]->match_type_ == FORWARD_INTERNAL)) and (reads[matches_forward[i][j]->read_B_id_]->active)) { // Here we check whether read B has a hinge matching hinges_vec[i][k] // Should we also check whether hinges are active? 
pos_B = matches_forward[i][j]->GetMatchingPosition(hinges_vec[i][k].pos); // console->info("Matching position is {}", pos_B); // for debugging int req_hinge_type; int rev_int = 0; if (matches_forward[i][j]->reverse_complement_match_ == true) { req_hinge_type = -1 * hinges_vec[i][k].type; rev_int = 1; } else { req_hinge_type = hinges_vec[i][k].type; } // std::cout << req_hinge_type << std::endl; int b_id = matches_forward[i][j]->read_B_id_; for (int l = 0; l < hinges_vec[b_id].size(); l++) { if ((hinges_vec[b_id][l].pos < pos_B + MATCHING_HINGE_SLACK) and (hinges_vec[b_id][l].pos > pos_B - MATCHING_HINGE_SLACK)) { // found a matching hinge if (req_hinge_type == hinges_vec[b_id][l].type) { std::pair first_coord, second_coord; first_coord=std::make_pair(i,k); second_coord=std::make_pair(b_id,l); if (hinges_vec[i][k].type == 1) { add_edge(hinge_graph_node_map[first_coord], hinge_graph_node_map[second_coord], hinge_graph); fprintf(out_hgraph, "%d %d %d %d %d %d\n", i, b_id, hinges_vec[i][k].pos, hinges_vec[b_id][l].pos, 1, rev_int); } else { add_edge(hinge_graph_node_map[second_coord], hinge_graph_node_map[first_coord], hinge_graph); fprintf(out_hgraph, "%d %d %d %d %d %d\n", b_id, i, hinges_vec[b_id][l].pos, hinges_vec[i][k].pos, 1, rev_int); } } } } for (int l = 0; l < killed_hinges_vec[b_id].size(); l++) { // std::cout << i <<"\t" << b_id <<"\t" << k << "\t" << l < pos_B - MATCHING_HINGE_SLACK)) { // found a matching hinge if (req_hinge_type == killed_hinges_vec[b_id][l].type) { if (hinges_vec[i][k].type == 1) { fprintf(out_hgraph, "%d %d %d %d %d %d\n", i, b_id, hinges_vec[i][k].pos, killed_hinges_vec[b_id][l].pos, 0, rev_int); } else { fprintf(out_hgraph, "%d %d %d %d %d %d\n", b_id, i, killed_hinges_vec[b_id][l].pos, hinges_vec[i][k].pos, 0, rev_int); } if (matches_forward[i][j]->match_type_ == FORWARD) { new_killed_hinges_vec[i].push_back(Hinge(hinges_vec[i][k].pos,hinges_vec[i][k].type,false)); if (hinges_vec[i][k].type == -1) { //console->info("This should not 
have happened."); // If this is a -1 hinge, read i should also bridge the repeat, // and hinges_vec[i][k] would have been killed in filter fprintf(out_debug,"%d %d %d %d %d [%d %d] [%d %d] [%d %d] [%d %d] \n", matches_forward[i][j]->read_A_id_, matches_forward[i][j]->read_B_id_, matches_forward[i][j]->length, matches_forward[i][j]->reverse_complement_match_, matches_forward[i][j]->match_type_, matches_forward[i][j]->eff_read_A_match_start_, matches_forward[i][j]->eff_read_A_match_end_, matches_forward[i][j]->eff_read_B_match_start_, matches_forward[i][j]->eff_read_B_match_end_, matches_forward[i][j]->eff_read_A_read_start_, matches_forward[i][j]->eff_read_A_read_end_, matches_forward[i][j]->eff_read_B_read_start_, matches_forward[i][j]->eff_read_B_read_end_); fprintf(out_debug, "%d %d %d %d\n", hinges_vec[i][k].pos, hinges_vec[i][k].type, killed_hinges_vec[b_id][l].pos, killed_hinges_vec[b_id][l].type); } } } } } } } } for (int j = 0; j < matches_backward[i].size(); j++) { if (matches_backward[i][j]->active) { if (((matches_backward[i][j]->match_type_ == BACKWARD) or (matches_backward[i][j]->match_type_ == BACKWARD_INTERNAL)) and (reads[matches_backward[i][j]->read_B_id_]->active)) { // Need to check whether read B has a hinge matching hinges_vec[i][k] pos_B = matches_backward[i][j]->GetMatchingPosition(hinges_vec[i][k].pos); // console->info("Matching position is {}", pos_B); // for debugging int req_hinge_type; int rev_int = 0; if (matches_backward[i][j]->reverse_complement_match_ == true) { req_hinge_type = -1 * hinges_vec[i][k].type; rev_int = 1; } else { req_hinge_type = hinges_vec[i][k].type; } // std::cout << req_hinge_type << std::endl; int b_id = matches_backward[i][j]->read_B_id_; for (int l = 0; l < hinges_vec[b_id].size(); l++) { if ((hinges_vec[b_id][l].pos < pos_B + MATCHING_HINGE_SLACK) and (hinges_vec[b_id][l].pos > pos_B - MATCHING_HINGE_SLACK)) { // found a matching hinge std::pair first_coord, second_coord; first_coord=std::make_pair(i,k); 
second_coord=std::make_pair(b_id,l); if (req_hinge_type == hinges_vec[b_id][l].type) { if (hinges_vec[i][k].type == -1) { add_edge(hinge_graph_node_map[first_coord], hinge_graph_node_map[second_coord], hinge_graph); fprintf(out_hgraph, "%d %d %d %d %d %d\n", i, b_id, hinges_vec[i][k].pos, hinges_vec[b_id][l].pos, 1, rev_int); } else { add_edge(hinge_graph_node_map[second_coord], hinge_graph_node_map[first_coord], hinge_graph); fprintf(out_hgraph, "%d %d %d %d %d %d\n", b_id, i, hinges_vec[b_id][l].pos, hinges_vec[i][k].pos, 1, rev_int); } } } } for (int l = 0; l < killed_hinges_vec[b_id].size(); l++) { if ((killed_hinges_vec[b_id][l].pos < pos_B + MATCHING_HINGE_SLACK) and (killed_hinges_vec[b_id][l].pos > pos_B - MATCHING_HINGE_SLACK)) { // found a matching hinge if (req_hinge_type == killed_hinges_vec[b_id][l].type) { if (hinges_vec[i][k].type == -1) { fprintf(out_hgraph, "%d %d %d %d %d %d\n", i, b_id, hinges_vec[i][k].pos, killed_hinges_vec[b_id][l].pos, 0, rev_int); } else { fprintf(out_hgraph, "%d %d %d %d %d %d\n", b_id, i, killed_hinges_vec[b_id][l].pos, hinges_vec[i][k].pos, 0, rev_int); } } if (matches_backward[i][j]->match_type_ == BACKWARD) { new_killed_hinges_vec[i].push_back(Hinge(hinges_vec[i][k].pos,hinges_vec[i][k].type,false)); if (hinges_vec[i][k].type != -1) { //console->info("This should not have happened 2."); // If this is a +1 hinge, read i should also bridge the repeat, // and hinges_vec[i][k] would have been killed in filter } } } } } } } } } } console->info("Hinge graph built"); std::vector component(num_vertices(hinge_graph)); int num = connected_components(hinge_graph, &component[0]); std::vector::size_type i; std::cout << "Total number of components: " << num << std::endl; std::map component_size; for (i = 0; i != component.size(); ++i){ // are we skipping i=0? 
if ( component_size.find(component[i]) == component_size.end() ){ component_size[component[i]]=1; } else component_size[component[i]]+=1; } // std::unordered_map > filtered_hinges_vec; // for (int i = 0; i < n_read; i++) { // filtered_hinges_vec[i] = std::vector(); // } for (int i = 0; i != component.size(); ++i) { if (component_size[component[i]] < MIN_CONNECTED_COMPONENT_SIZE) { int ind1, ind2; ind1 = hinge_graph_node_map_rev[i].first; ind2 = hinge_graph_node_map_rev[i].second; hinges_vec[ind1][ind2].active=false; // filtered_hinges_vec[ind1].push_back(hinges_vec[ind1][ind2]); } } // std::map< std::pair , int> hinge_graph_node_map; std::map> component_sink; for (i = 0; i != component.size(); ++i){ int ind1, ind2; ind1 = hinge_graph_node_map_rev[i].first; ind2 = hinge_graph_node_map_rev[i].second; // for now let us just pick an arbitrary active hinge as the component main sink if ( hinges_vec[ind1][ind2].active == true ) component_sink[component[i]]= std::make_pair(ind1,ind2); } n = 0; FILE *out_hglist; out_hglist = fopen((std::string(out_name) + ".hinge.list").c_str(), "w"); for (int i = 0; i < n_read; i++) { for (int j = 0; j < hinges_vec[i].size(); j++) { if ((reads[i]->active) and ((hinges_vec[i][j].active))) { fprintf(out_hglist, "%d %d %d\n", i, marked_hinges[i][j].first, marked_hinges[i][j].second); n++; } } } fclose(out_hglist); console->info("after filter {} active hinges", n); // filter hinges std::vector repeat_status_front; std::vector repeat_status_back; for (int i = 0; i < n_read; i++) { bool in = false; bool out = false; for (int j = 0; j < hinges_vec[i].size(); j++) { if ((hinges_vec[i][j].active) and (hinges_vec[i][j].type == 1)) in = true; if ((hinges_vec[i][j].active) and (hinges_vec[i][j].type == -1)) out = true; } repeat_status_front.push_back(out); repeat_status_back.push_back(in); } //Perform greedy graph construction and write outputs out and out2 for (int i = 0; i < n_read; i++) { if (reads[i]->active) { int forward = 0; int backward = 0; 
for (int j = 0; j < matches_forward[i].size(); j++) { if (matches_forward[i][j]->active) { if ((matches_forward[i][j]->match_type_ == FORWARD) and (reads[matches_forward[i][j]->read_B_id_]->active)) { /*if (not repeat_status_back[i])*/ { if (forward < 1) { PrintOverlapToFile(out_greedy, matches_forward[i][j]); if (matches_forward[i][j]->reverse_complement_match_ == 0) fprintf(out_g1, "%d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_forward[i][j]->read_A_id_, matches_forward[i][j]->read_B_id_, matches_forward[i][j]->length, matches_forward[i][j]->eff_read_A_match_start_, matches_forward[i][j]->eff_read_A_match_end_, matches_forward[i][j]->eff_read_B_match_start_, matches_forward[i][j]->eff_read_B_match_end_, matches_forward[i][j]->eff_read_A_read_start_, matches_forward[i][j]->eff_read_A_read_end_, matches_forward[i][j]->eff_read_B_read_start_, matches_forward[i][j]->eff_read_B_read_end_); else fprintf(out_g1, "%d %d' %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_forward[i][j]->read_A_id_, matches_forward[i][j]->read_B_id_, matches_forward[i][j]->length, matches_forward[i][j]->eff_read_A_match_start_, matches_forward[i][j]->eff_read_A_match_end_, matches_forward[i][j]->eff_read_B_match_start_, matches_forward[i][j]->eff_read_B_match_end_, matches_forward[i][j]->eff_read_A_read_start_, matches_forward[i][j]->eff_read_A_read_end_, matches_forward[i][j]->eff_read_B_read_start_, matches_forward[i][j]->eff_read_B_read_end_); if (matches_forward[i][j]->reverse_complement_match_ == 0) fprintf(out_g2, "%d' %d' %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_forward[i][j]->read_B_id_, matches_forward[i][j]->read_A_id_, matches_forward[i][j]->length, matches_forward[i][j]->eff_read_A_match_start_, matches_forward[i][j]->eff_read_A_match_end_, matches_forward[i][j]->eff_read_B_match_start_, matches_forward[i][j]->eff_read_B_match_end_, matches_forward[i][j]->eff_read_A_read_start_, matches_forward[i][j]->eff_read_A_read_end_, matches_forward[i][j]->eff_read_B_read_start_, 
matches_forward[i][j]->eff_read_B_read_end_); else fprintf(out_g2, "%d %d' %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_forward[i][j]->read_B_id_, matches_forward[i][j]->read_A_id_, matches_forward[i][j]->length, matches_forward[i][j]->eff_read_A_match_start_, matches_forward[i][j]->eff_read_A_match_end_, matches_forward[i][j]->eff_read_B_match_start_, matches_forward[i][j]->eff_read_B_match_end_, matches_forward[i][j]->eff_read_A_read_start_, matches_forward[i][j]->eff_read_A_read_end_, matches_forward[i][j]->eff_read_B_read_start_, matches_forward[i][j]->eff_read_B_read_end_); } } forward++; } } } for (int j = 0; j < matches_backward[i].size(); j++) { if (matches_backward[i][j]->active) { if ((matches_backward[i][j]->match_type_ == BACKWARD) and (reads[matches_backward[i][j]->read_B_id_]->active)) { /*if (not repeat_status_back[i])*/ { if (backward < 1) { PrintOverlapToFile(out_greedy, matches_backward[i][j]); if (matches_backward[i][j]->reverse_complement_match_ == 0) fprintf(out_g1, "%d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_backward[i][j]->read_A_id_, matches_backward[i][j]->read_B_id_, matches_backward[i][j]->length, matches_backward[i][j]->eff_read_A_match_start_, matches_backward[i][j]->eff_read_A_match_end_, matches_backward[i][j]->eff_read_B_match_start_, matches_backward[i][j]->eff_read_B_match_end_, matches_backward[i][j]->eff_read_A_read_start_, matches_backward[i][j]->eff_read_A_read_end_, matches_backward[i][j]->eff_read_B_read_start_, matches_backward[i][j]->eff_read_B_read_end_); else fprintf(out_g1, "%d %d' %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_backward[i][j]->read_A_id_, matches_backward[i][j]->read_B_id_, matches_backward[i][j]->length, matches_backward[i][j]->eff_read_A_match_start_, matches_backward[i][j]->eff_read_A_match_end_, matches_backward[i][j]->eff_read_B_match_start_, matches_backward[i][j]->eff_read_B_match_end_, matches_backward[i][j]->eff_read_A_read_start_, matches_backward[i][j]->eff_read_A_read_end_, 
matches_backward[i][j]->eff_read_B_read_start_, matches_backward[i][j]->eff_read_B_read_end_); if (matches_backward[i][j]->reverse_complement_match_ == 0) fprintf(out_g2, "%d' %d' %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_backward[i][j]->read_B_id_, matches_backward[i][j]->read_A_id_, matches_backward[i][j]->length, matches_backward[i][j]->eff_read_A_match_start_, matches_backward[i][j]->eff_read_A_match_end_, matches_backward[i][j]->eff_read_B_match_start_, matches_backward[i][j]->eff_read_B_match_end_, matches_backward[i][j]->eff_read_A_read_start_, matches_backward[i][j]->eff_read_A_read_end_, matches_backward[i][j]->eff_read_B_read_start_, matches_backward[i][j]->eff_read_B_read_end_); else fprintf(out_g2, "%d %d' %d [%d %d] [%d %d] [%d %d] [%d %d]\n", matches_backward[i][j]->read_B_id_, matches_backward[i][j]->read_A_id_, matches_backward[i][j]->length, matches_backward[i][j]->eff_read_A_match_start_, matches_backward[i][j]->eff_read_A_match_end_, matches_backward[i][j]->eff_read_B_match_start_, matches_backward[i][j]->eff_read_B_match_end_, matches_backward[i][j]->eff_read_A_read_start_, matches_backward[i][j]->eff_read_A_read_end_, matches_backward[i][j]->eff_read_B_read_start_, matches_backward[i][j]->eff_read_B_read_end_); } } backward++; } } } } } num_overlaps = 0; num_forward_overlaps=0; num_forward_internal_overlaps=0; num_reverse_overlaps=0; num_reverse_internal_overlaps=0; rev_complemented_matches=0; for (int i = 0; i < n_read; i++) { if (reads[i]->active) { for (int j = 0; j < matches_forward[i].size(); j++) { if (reads[matches_forward[i][j]->read_B_id_]->active) { num_overlaps++; if (matches_forward[i][j]->match_type_==FORWARD) num_forward_overlaps++; else if (matches_forward[i][j]->match_type_==FORWARD_INTERNAL) num_forward_internal_overlaps++; if (matches_forward[i][j]->reverse_complement_match_==1) rev_complemented_matches++; } } for (int j = 0; j < matches_backward[i].size(); j++) { if (reads[matches_backward[i][j]->read_B_id_]->active) { 
num_overlaps++; if (matches_backward[i][j]->match_type_==BACKWARD) num_reverse_overlaps++; else if (matches_backward[i][j]->match_type_==BACKWARD_INTERNAL) num_reverse_internal_overlaps++; if (matches_backward[i][j]->reverse_complement_match_==1) rev_complemented_matches++; } } } } /*std::cout<info("Starting to build assembly graph."); // int eff_b_id; int hinge_pos = -1; for (int i = 0; i < n_read; i++) { if (reads[i]->active) { int forward = 0; int forward_internal = 0; int backward = 0; int backward_internal = 0; LOverlap * chosen_match = NULL; for (int j = 0; j < matches_forward[i].size(); j++){ if (matches_forward[i][j]->active) { if ((reads[matches_forward[i][j]->read_B_id_]->active)) { // and (forward == 0)) { //printf("hinge size %d\n", hinges_vec[matches_forward[i][j]->read_B_id_].size()); if ((matches_forward[i][j]->match_type_ == FORWARD) and (forward == 0)) { // check if read j has new_killed_hinge //TODO: should this be checked for FORWARD_INTERNAL as well? bool poisoned = false; for (int k = 0; k < new_killed_hinges_vec[i].size(); k++) { if ( (matches_forward[i][j]->reverse_complement_match_ != 1) and (new_killed_hinges_vec[i][k].type == -1) and (new_killed_hinges_vec[i][k].pos > matches_forward[i][j]->eff_read_B_match_end_) ) { //TODO: do we need a tolerance in the comparison above? 
PrintOverlapToFile(out_skipped, matches_forward[i][j]); poisoned = true; } else if ( (matches_forward[i][j]->reverse_complement_match_ == 1) and (new_killed_hinges_vec[i][k].type == 1) and (new_killed_hinges_vec[i][k].pos < matches_forward[i][j]->eff_read_B_match_start_) ) { PrintOverlapToFile(out_skipped, matches_forward[i][j]); poisoned = true; } } if (not poisoned) { chosen_match = matches_forward[i][j]; hinge_pos = -1; forward = 1; //break; } } else if ((matches_forward[i][j]->match_type_ == FORWARD_INTERNAL) //and isValidHinge(matches_forward[i][j], hinges_vec[matches_forward[i][j]->read_B_id_]) and (hinges_vec[matches_forward[i][j]->read_B_id_].size() > 0) and (forward_internal == 0)){ // In the case of a forward_internal match we check whether // the hinge on read B is an in-hinge // (or an out-hinge if it's a reverse complement match) // int hinge_index = 0; int read_B_match_start = matches_forward[i][j]->read_B_match_start_; if (matches_forward[i][j]->reverse_complement_match_ == 1) { read_B_match_start = matches_forward[i][j]->read_B_match_end_; } for (int k = 0; k < hinges_vec[matches_forward[i][j]->read_B_id_].size(); k++) { if ( (read_B_match_start > hinges_vec[matches_forward[i][j]->read_B_id_][k].pos - HINGE_TOLERANCE) and (read_B_match_start < hinges_vec[matches_forward[i][j]->read_B_id_][k].pos + HINGE_TOLERANCE) and (hinges_vec[matches_forward[i][j]->read_B_id_][k].type == (1-2*matches_forward[i][j]->reverse_complement_match_)) and (hinges_vec[matches_forward[i][j]->read_B_id_][k].active) ) { if ((forward == 0) or (matches_forward[i][j]->weight > chosen_match->weight - 2*HINGE_SLACK)) { chosen_match = matches_forward[i][j]; forward = 1; forward_internal = 1; hinge_pos = hinges_vec[matches_forward[i][j]->read_B_id_][k].pos; } break; } } } } } } if (chosen_match != NULL) { PrintOverlapToFile(out_hg,chosen_match); edges_forward[i].push_back(chosen_match); PrintOverlapToFile2(out_hg2,chosen_match,hinge_pos); chosen_match = NULL; } else { // Deadend 
debugging // Forward dead-end deadend_out << i; // deadend_out << "\t Active: " << reads[i]->active << std::endl; deadend_out << "\t matches_forward size: " << matches_forward[i].size() << std::endl; } for (int j = 0; j < matches_backward[i].size(); j++){ if (matches_backward[i][j]->active) { if ((reads[matches_backward[i][j]->read_B_id_]->active)) { if ((matches_backward[i][j]->match_type_ == BACKWARD) and (backward == 0)){ // check if read j has new_killed_hinge bool poisoned = false; for (int k = 0; k < new_killed_hinges_vec[i].size(); k++) { if ( (matches_backward[i][j]->reverse_complement_match_ != 1) and (new_killed_hinges_vec[i][k].type == 1) and (new_killed_hinges_vec[i][k].pos < matches_backward[i][j]->eff_read_B_match_start_) ) { //TODO: do we need a tolerance in the comparison above? PrintOverlapToFile(out_skipped, matches_backward[i][j]); poisoned = true; } else if ( (matches_backward[i][j]->reverse_complement_match_ == 1) and (new_killed_hinges_vec[i][k].type == -1) and (new_killed_hinges_vec[i][k].pos > matches_backward[i][j]->eff_read_B_match_end_) ) { PrintOverlapToFile(out_skipped, matches_backward[i][j]); poisoned = true; } } if (not poisoned) { chosen_match = matches_backward[i][j]; backward = 1; hinge_pos = -1; } } else if ((matches_backward[i][j]->match_type_ == BACKWARD_INTERNAL) and (hinges_vec[matches_backward[i][j]->read_B_id_].size() > 0) and (backward_internal == 0)) { // In the case of a backward_internal match // we check whether the hinge on read B is an in-hinge // (or an in-hinge if it's a reverse complement match) int read_B_match_end = matches_backward[i][j]->read_B_match_end_; if (matches_backward[i][j]->reverse_complement_match_ == 1) { read_B_match_end = matches_backward[i][j]->read_B_match_start_; } for (int k = 0; k < hinges_vec[matches_backward[i][j]->read_B_id_].size(); k++) { if ( (read_B_match_end > hinges_vec[matches_backward[i][j]->read_B_id_][k].pos - HINGE_TOLERANCE) and (read_B_match_end < 
hinges_vec[matches_backward[i][j]->read_B_id_][k].pos + HINGE_TOLERANCE) and (hinges_vec[matches_backward[i][j]->read_B_id_][k].type == (-1+2*matches_backward[i][j]->reverse_complement_match_)) and (hinges_vec[matches_backward[i][j]->read_B_id_][k].active) ) { if ((backward == 0) or (matches_backward[i][j]->weight > chosen_match->weight - 2*HINGE_SLACK)) { chosen_match = matches_backward[i][j]; backward = 1; backward_internal = 1; hinge_pos = hinges_vec[matches_backward[i][j]->read_B_id_][k].pos; // int hinge_graph_id = hinge_graph_node_map[std::make_pair(matches_backward[i][j]->read_B_id_,k)]; // eff_b_id = component_sink[component[hinge_graph_id]].first; } break; } } } } } } if (chosen_match != NULL) { PrintOverlapToFile(out_hg,chosen_match); edges_backward[i].push_back(chosen_match); PrintOverlapToFile2(out_hg2,chosen_match,hinge_pos); } else { // Deadend debugging // Backward dead-end deadend_out << i; // deadend_out << "\t Active: " << reads[i]->active << std::endl; deadend_out << "\t matches_backward size: " << matches_backward[i].size() << std::endl; } } } console->info("sort and output finished"); console->info("version 0.0.3"); if (strlen(name_db) > 0) la.closeDB(); //close database return 0; } HINGE-0.5.0/src/lib/000077500000000000000000000000001314415550300137045ustar00rootroot00000000000000HINGE-0.5.0/src/lib/CMakeLists.txt000066400000000000000000000010611314415550300164420ustar00rootroot00000000000000
# Helper libraries built from this directory.  Each add_library() below
# creates one target; inter-library dependencies are declared per target so
# that include paths and link requirements propagate to whoever links them.
cmake_minimum_required(VERSION 3.2)

add_library(ini ini.c INIReader.cpp)
add_library(DB DB.c QV.c)
add_library(LA align.c)

# PAF links against zlib.  REQUIRED makes configuration fail when zlib is
# missing, so no ZLIB_FOUND guard is needed.  The imported target carries the
# zlib include directory along with the library, and PUBLIC forwards both to
# targets that link PAF.
add_library(PAF paf.c)
find_package(ZLIB REQUIRED)
target_link_libraries(PAF PUBLIC ZLIB::ZLIB)

add_library(LAInterface LAInterface.cpp)
target_link_libraries(LAInterface PUBLIC LA DB PAF)

add_library(kmer_lookup kmer_lookup.c)
add_library(DW_banded DW_banded.c)

add_library(falcon falcon.c)
target_link_libraries(falcon PUBLIC kmer_lookup DW_banded)
HINGE-0.5.0/src/lib/DB.c000077500000000000000000001357671314415550300143630ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 
122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) { FILE *f; if (name == NULL || mode == NULL) return (NULL); if ((f = 
fopen(name,mode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = '\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos -= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA 
',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = 
s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] representation to ascii representation (end with '\n') void Lower_Read(char *s) { static char letter[4] = { 'a', 'c', 'g', 't' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } void Upper_Read(char *s) { static char letter[4] = { 'A', 'C', 'G', 'T' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } // Convert read in ascii representation to [0-3] representation (end with 4) void Number_Read(char *s) { static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; for ( ; *s != '\0'; s++) *s = number[(int) *s]; *s = 4; } /******************************************************************************************* * * DB OPEN, TRIM & CLOSE ROUTINES * ********************************************************************************************/ // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, HITS_DB *db) { HITS_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(HITS_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = 1; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is 
junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->all = all; db->ufirst = ufirst; db->tfirst = tfirst; nreads = ulast-ufirst; if (part <= 0) { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); if (db->reads == NULL) goto error2; db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads); goto error2; } } else { HITS_READ *reads; int i, r, maxlen; int64 totlen; reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); if (reads == NULL) goto error2; reads += 1; fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); error: if (bptr != NULL) 
*bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; HITS_TRACK *record; HITS_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && db->all) return; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((HITS_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno8[j],NULL); } record->anno = 
Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < nreads; i++) { r = reads[i].rlen; if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) maxlen = r; reads[j++] = reads[i]; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL); db->reads += 1; } } // The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded. // Trim the track by rereading the untrimmed DB index from the file system. static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) { int i, j, r; int allflag, cutoff; int ureads; char *root; HITS_READ read; FILE *indx; if (!db->trimmed) return (0); if (db->cutoff <= 0 && db->all) return (0); cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; root = rindex(db->path,'/') + 2; indx = Fopen(Catenate(db->path,"","",".idx"),"r"); fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*db->ufirst,SEEK_SET); if (ispart) ureads = ((int *) (db->reads))[-1]; else ureads = db->ureads; if (strcmp(track->name,".@qvs") == 0) { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name); fclose(indx); EXIT(1); } { int *anno4, size; int64 *anno8; char *anno, *data; size = track->size; data = (char *) track->data; if (data == NULL) { anno = (char *) track->anno; j = r = 0; for (i = r = 0; i < ureads; i++, r += size) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } r += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (track->anno); j = anno4[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); 
fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (track->anno); j = anno8[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno8[j],NULL); } track->anno = Realloc(track->anno,track->size*(j+1),NULL); } fclose(indx); return (0); } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). 
void Close_DB(HITS_DB *db) { HITS_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" HITS_QV *Active_QV; // Becomes invalid after closing int Load_QVs(HITS_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; HITS_QV *qvtrk; QVcoding *coding, *nx; int ncodes; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); EXIT(1); } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/') + 2; istub = Fopen(Catenate(db->path,"/",root,".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (last > 
pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". rewind(istub); fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { HITS_READ read; fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating 
coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the HITS_QV record and add it to the front of the // track list qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (HITS_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(HITS_DB *db) { HITS_TRACK *track; HITS_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (HITS_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * ********************************************************************************************/ // Return 
status of track: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track int Check_Track(HITS_DB *db, char *track, int *kind) { FILE *afile; int tracklen, size, ispart; int ureads, treads; afile = NULL; if (db->part > 0) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) return (-2); if (fread(&tracklen,sizeof(int),1,afile) != 1) return (-1); if (fread(&size,sizeof(int),1,afile) != 1) return (-1); if (size == 0) *kind = MASK_TRACK; else if (size > 0) *kind = CUSTOM_TRACK; else return (-1); fclose(afile); if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (tracklen == ureads) return (0); else if (tracklen == treads) return (1); else return (-1); } // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. 
HITS_TRACK *Load_Track(HITS_DB *db, char *track)
{ FILE       *afile, *dfile;
  int         tracklen, size;
  int         nreads, ispart;
  int         treads, ureads;
  void       *anno;
  void       *data;
  char       *name;
  HITS_TRACK *record;

  if (track[0] == '.')
    { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track);
      EXIT(NULL);
    }

  //  Already loaded?  Then hand back the existing record.

  for (record = db->tracks; record != NULL; record = record->next)
    if (strcmp(record->name,track) == 0)
      return (record);

  //  Prefer the block-specific .anno file when db is a partition, else the whole-DB one

  afile  = NULL;
  ispart = 0;
  if (db->part)
    { afile  = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
      ispart = 1;
    }
  if (afile == NULL)
    { afile  = fopen(Catenate(db->path,".",track,".anno"),"r");
      ispart = 0;
    }
  if (afile == NULL)
    { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track);
      return (NULL);
    }

  dfile  = NULL;
  anno   = NULL;
  data   = NULL;
  record = NULL;

  //  The .data file is optional: dfile stays NULL when it cannot be opened

  if (ispart)
    name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data");
  else
    name = Catenate(db->path,".",track,".data");
  if (name == NULL)
    goto error;
  dfile = fopen(name,"r");

  //  Header of the .anno file: track length, then per-read record size

  if (fread(&tracklen,sizeof(int),1,afile) != 1)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (fread(&size,sizeof(int),1,afile) != 1)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (size < 0)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (size == 0)      //  a size of 0 on disk is treated as 8-byte records
    size = 8;

  if (ispart)
    { ureads = ((int *) (db->reads))[-1];
      treads = ((int *) (db->reads))[-2];
    }
  else
    { ureads = db->ureads;
      treads = db->treads;
    }

  //  Validate the track length against the DB and, when a whole-DB track is being read
  //    for a block, skip forward to the block's first read.

  if (db->trimmed)
    { if (tracklen != treads && tracklen != ureads)
        { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
          goto error;
        }
      if ( ! ispart && db->part > 0)
        { if (tracklen == treads)
            fseeko(afile,size*db->tfirst,SEEK_CUR);
          else
            fseeko(afile,size*db->ufirst,SEEK_CUR);
        }
    }
  else
    { if (tracklen != ureads)
        { if (tracklen == treads)
            EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track);
          else
            EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
          goto error;
        }
      if ( ! ispart && db->part > 0)
        fseeko(afile,size*db->ufirst,SEEK_CUR);
    }

  nreads = db->nreads;
  anno   = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
  if (anno == NULL)
    goto error;

  if (dfile != NULL)
    { int64 *anno8, off8, dlen;
      int   *anno4, off4;
      int    i;

      //  With a data file the anno vector holds nreads+1 offsets into it

      if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1))
        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
          goto error;
        }

      //  Rebase the offsets so the first read starts at 0 and position dfile there

      if (size == 4)
        { anno4 = (int *) anno;
          off4  = anno4[0];
          if (off4 != 0)
            { for (i = 0; i <= nreads; i++)
                anno4[i] -= off4;
              fseeko(dfile,off4,SEEK_SET);
            }
          dlen = anno4[nreads];
          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
        }
      else
        { anno8 = (int64 *) anno;
          off8  = anno8[0];
          if (off8 != 0)
            { for (i = 0; i <= nreads; i++)
                anno8[i] -= off8;
              fseeko(dfile,off8,SEEK_SET);
            }
          dlen = anno8[nreads];
          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
        }
      if (data == NULL)
        goto error;
      if (dlen > 0)
        { if (fread(data,dlen,1,dfile) != 1)
            { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track);
              goto error;
            }
        }
      fclose(dfile);
      dfile = NULL;
    }
  else
    { if (fread(anno,size,nreads,afile) != (size_t) nreads)
        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
          goto error;
        }
      data = NULL;
    }

  //  Mark afile as closed so the error path below cannot close it a second time

  fclose(afile);
  afile = NULL;

  record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record");
  if (record == NULL)
    goto error;
  record->name = Strdup(track,"Allocating Track Name");
  if (record->name == NULL)
    goto error;
  record->data = data;
  record->anno = anno;
  record->size = size;

  if (db->trimmed && tracklen != treads)
    { if (Late_Track_Trim(db,record,ispart))
        goto error;
    }

  //  Keep the QV pseudo-track, if present, at the very front of the list

  if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0)
    { record->next     = db->tracks->next;
      db->tracks->next = record;
    }
  else
    { record->next = db->tracks;
      db->tracks   = record;
    }

  return (record);

error:
  if (record != NULL)
    { free(record->name);     //  NULL if Strdup failed; free(NULL) is a no-op
      free(record);
    }
  if (data != NULL)
    free(data);
  if (anno != NULL)
    free(anno);
  if (dfile != NULL)
    fclose(dfile);
  if (afile != NULL)
    fclose(afile);
  EXIT (NULL);
}

// Remove the track named 'track' from db's track list, if present, and free its anno
//   vector, data vector, name, and record.  Does nothing if no such track is loaded.

void Close_Track(HITS_DB *db, char *track)
{ HITS_TRACK *record, *prev;

  prev = NULL;
  for (record = db->tracks; record != NULL; record = record->next)
    { if (strcmp(record->name,track) == 0)
        { free(record->anno);
          free(record->data);
          free(record->name);
          if (prev == NULL)
            db->tracks = record->next;
          else
            prev->next = record->next;
          free(record);
          return;
        }
      prev = record;
    }
  return;
}


/*******************************************************************************************
 *
 *  READ BUFFER ALLOCATION AND READ ACCESS
 *
 ********************************************************************************************/

// Allocate and return a buffer big enough for the largest read in 'db', leaving room
//   for an initial delimiter character.  The pointer returned is one byte past the start
//   of the allocation.

char *New_Read_Buffer(HITS_DB *db)
{ char *read;

  read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer");
  if (read == NULL)
    EXIT(NULL);
  return (read+1);
}

// Load into 'read' the i'th read in 'db'.  As an upper case ASCII string if ascii is 2, as a
//   lower-case ASCII string if ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and
//   3(T) otherwise.
//
// **NB**, the byte before read will be set to a delimiter character!
int Load_Read(HITS_DB *db, int i, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
      EXIT(1);
    }

  //  Open the .bps file on first use and cache the handle in the db record

  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(1);
      db->bases = (void *) bases;
    }

  off = r[i].boff;
  len = r[i].rlen;

  //  Seek only when the stream is not already positioned at the read

  if (ftello(bases) != off)
    fseeko(bases,off,SEEK_SET);
  clen = COMPRESSED_LEN(len);
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
          EXIT(1);
        }
    }
  Uncompress_Read(len,read);

  //  The byte before the read is the delimiter: '\0' for ASCII forms, 4 for numeric

  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;
  return (0);
}

// Load into 'read' the interval [beg,end) of the i'th read in 'db' and return a pointer
//   to the first requested base, which lies beg%4 bytes into 'read' because whole
//   compressed bytes are fetched and expanded.  'ascii' selects the encoding exactly as
//   for Load_Read, and the byte before the returned pointer is set to the delimiter.
//   beg and end are not validated against the read's length.

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE      *bases = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  int        bbeg, bend;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }

  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(NULL);
      db->bases = (void *) bases;
    }

  //  Four bases per compressed byte: [bbeg,bend) is the byte range covering [beg,end)

  bbeg = beg/4;
  bend = (end-1)/4+1;

  off = r[i].boff + bbeg;
  len = end - beg;

  if (ftello(bases) != off)
    fseeko(bases,off,SEEK_SET);
  clen = bend-bbeg;
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Subread)\n",Prog_Name);
          EXIT(NULL);
        }
    }
  Uncompress_Read(4*clen,read);

  //  Skip the leading bases of the first byte and terminate after the last wanted base

  read += beg%4;
  read[len] = 4;
  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;

  return (read);
}


/*******************************************************************************************
 *
 *  QV BUFFER ALLOCATION QV READ ACCESS
 *
 ********************************************************************************************/

// Allocate and return a buffer of 5
vectors big enough for the largest read in 'db' char **New_QV_Buffer(HITS_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { HITS_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (HITS_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. 
If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(HITS_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; HITS_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } if (strcasecmp(name,Catenate("","",root,".db")) == 0) { 
strncpy(root,name,rlen); break; } if (strcasecmp(name,Catenate("","",root,".dam")) == 0) { strncpy(root,name,rlen); isdam = 1; break; } } if (dp == NULL) { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } HINGE-0.5.0/src/lib/DW_banded.c000077500000000000000000000251201314415550300156620ustar00rootroot00000000000000 /* * ===================================================================================== * * Filename: DW_banded.c * * Description: A banded version for the O(ND) greedy sequence alignment algorithm * * Version: 0.1 * Created: 07/20/2013 17:00:00 * Revision: none * Compiler: gcc * * Author: Jason Chin, * Company: * * ===================================================================================== #################################################################################$$ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. # # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted (subject to the limitations in the # disclaimer below) provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # * Neither the name of Pacific Biosciences nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
#################################################################################$$ */ #include #include #include #include #include "common.h" int compare_d_path(const void * a, const void * b) { const d_path_data2 * arg1 = a; const d_path_data2 * arg2 = b; if (arg1->d - arg2->d == 0) { return arg1->k - arg2->k; } else { return arg1->d - arg2->d; } } void d_path_sort( d_path_data2 * base, unsigned long max_idx) { qsort(base, max_idx, sizeof(d_path_data2), compare_d_path); } d_path_data2 * get_dpath_idx( seq_coor_t d, seq_coor_t k, unsigned long max_idx, d_path_data2 * base) { d_path_data2 d_tmp; d_path_data2 *rtn; d_tmp.d = d; d_tmp.k = k; rtn = (d_path_data2 *) bsearch( &d_tmp, base, max_idx, sizeof(d_path_data2), compare_d_path); //printf("dp %ld %ld %ld %ld %ld %ld %ld\n", (rtn)->d, (rtn)->k, (rtn)->x1, (rtn)->y1, (rtn)->x2, (rtn)->y2, (rtn)->pre_k); return rtn; } void print_d_path( d_path_data2 * base, unsigned long max_idx) { unsigned long idx; for (idx = 0; idx < max_idx; idx++){ printf("dp %ld %d %d %d %d %d %d %d\n",idx, (base+idx)->d, (base+idx)->k, (base+idx)->x1, (base+idx)->y1, (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k); } } alignment *_align(char *query_seq, seq_coor_t q_len, char *target_seq, seq_coor_t t_len, seq_coor_t band_tolerance, int get_aln_str) { seq_coor_t * V; seq_coor_t * U; // array of matched bases for each "k" seq_coor_t k_offset; seq_coor_t d; seq_coor_t k, k2; seq_coor_t best_m; // the best "matches" for each d seq_coor_t min_k, new_min_k; seq_coor_t max_k, new_max_k; seq_coor_t pre_k; seq_coor_t x, y; seq_coor_t cd; seq_coor_t ck; seq_coor_t cx, cy, nx, ny; seq_coor_t max_d; seq_coor_t band_size; unsigned long d_path_idx = 0; unsigned long max_idx = 0; d_path_data2 * d_path; d_path_data2 * d_path_aux; path_point * aln_path; seq_coor_t aln_path_idx; alignment * align_rtn; seq_coor_t aln_pos; seq_coor_t i; bool aligned = false; //printf("debug: %ld %ld\n", q_len, t_len); //printf("%s\n", query_seq); max_d = (int) (0.3*(q_len + 
t_len)); band_size = band_tolerance * 2; V = calloc( max_d * 2 + 1, sizeof(seq_coor_t) ); U = calloc( max_d * 2 + 1, sizeof(seq_coor_t) ); k_offset = max_d; // We should probably use hashmap to store the backtracing information to save memory allocation time // This O(MN) block allocation scheme is convient for now but it is slower for very long sequences d_path = calloc( max_d * (band_size + 1 ) * 2 + 1, sizeof(d_path_data2) ); aln_path = calloc( q_len + t_len + 1, sizeof(path_point) ); align_rtn = calloc( 1, sizeof(alignment)); align_rtn->t_aln_str = calloc( q_len + t_len + 1, sizeof(char)); align_rtn->q_aln_str = calloc( q_len + t_len + 1, sizeof(char)); align_rtn->aln_str_size = 0; align_rtn->aln_q_s = 0; align_rtn->aln_q_e = 0; align_rtn->aln_t_s = 0; align_rtn->aln_t_e = 0; //printf("max_d: %lu, band_size: %lu\n", max_d, band_size); best_m = -1; min_k = 0; max_k = 0; d_path_idx = 0; max_idx = 0; for (d = 0; d < max_d; d ++ ) { if (max_k - min_k > band_size) { break; } for (k = min_k; k <= max_k; k += 2) { if ( (k == min_k) || (k != max_k) && (V[ k - 1 + k_offset ] < V[ k + 1 + k_offset]) ) { pre_k = k + 1; x = V[ k + 1 + k_offset]; } else { pre_k = k - 1; x = V[ k - 1 + k_offset] + 1; } y = x - k; d_path[d_path_idx].d = d; d_path[d_path_idx].k = k; d_path[d_path_idx].x1 = x; d_path[d_path_idx].y1 = y; while ( x < q_len && y < t_len && query_seq[x] == target_seq[y] ){ x++; y++; } d_path[d_path_idx].x2 = x; d_path[d_path_idx].y2 = y; d_path[d_path_idx].pre_k = pre_k; d_path_idx ++; V[ k + k_offset ] = x; U[ k + k_offset ] = x + y; if ( x + y > best_m) { best_m = x + y; } if ( x >= q_len || y >= t_len) { aligned = true; max_idx = d_path_idx; break; } } // For banding new_min_k = max_k; new_max_k = min_k; for (k2 = min_k; k2 <= max_k; k2 += 2) { if (U[ k2 + k_offset] >= best_m - band_tolerance ) { if ( k2 < new_min_k ) { new_min_k = k2; } if ( k2 > new_max_k ) { new_max_k = k2; } } } max_k = new_max_k + 1; min_k = new_min_k - 1; // For no banding // max_k ++; // 
min_k --; // For debuging // printf("min_max_k,d, %ld %ld %ld\n", min_k, max_k, d); if (aligned == true) { align_rtn->aln_q_e = x; align_rtn->aln_t_e = y; align_rtn->dist = d; align_rtn->aln_str_size = (x + y + d) / 2; align_rtn->aln_q_s = 0; align_rtn->aln_t_s = 0; d_path_sort(d_path, max_idx); //print_d_path(d_path, max_idx); if (get_aln_str > 0) { cd = d; ck = k; aln_path_idx = 0; while (cd >= 0 && aln_path_idx < q_len + t_len + 1) { d_path_aux = (d_path_data2 *) get_dpath_idx( cd, ck, max_idx, d_path); aln_path[aln_path_idx].x = d_path_aux -> x2; aln_path[aln_path_idx].y = d_path_aux -> y2; aln_path_idx ++; aln_path[aln_path_idx].x = d_path_aux -> x1; aln_path[aln_path_idx].y = d_path_aux -> y1; aln_path_idx ++; ck = d_path_aux -> pre_k; cd -= 1; } aln_path_idx --; cx = aln_path[aln_path_idx].x; cy = aln_path[aln_path_idx].y; align_rtn->aln_q_s = cx; align_rtn->aln_t_s = cy; aln_pos = 0; while ( aln_path_idx > 0 ) { aln_path_idx --; nx = aln_path[aln_path_idx].x; ny = aln_path[aln_path_idx].y; if (cx == nx && cy == ny){ continue; } if (nx == cx && ny != cy){ //advance in y for (i = 0; i < ny - cy; i++) { align_rtn->q_aln_str[aln_pos + i] = '-'; } for (i = 0; i < ny - cy; i++) { align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i]; } aln_pos += ny - cy; } else if (nx != cx && ny == cy){ //advance in x for (i = 0; i < nx - cx; i++) { align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i]; } for (i = 0; i < nx - cx; i++) { align_rtn->t_aln_str[aln_pos + i] = '-'; } aln_pos += nx - cx; } else { for (i = 0; i < nx - cx; i++) { align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i]; } for (i = 0; i < ny - cy; i++) { align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i]; } aln_pos += ny - cy; } cx = nx; cy = ny; } align_rtn->aln_str_size = aln_pos; } break; } } free(V); free(U); free(d_path); free(aln_path); return align_rtn; } void free_alignment(alignment * aln) { free(aln->q_aln_str); free(aln->t_aln_str); free(aln); } 
HINGE-0.5.0/src/lib/INIReader.cpp000066400000000000000000000045201314415550300161530ustar00rootroot00000000000000// Read an INI file into easy-to-access name/value pairs. // inih and INIReader are released under the New BSD license (see LICENSE.txt). // Go to the project home page for more info: // // https://github.com/benhoyt/inih #include #include #include #include "ini.h" #include "INIReader.h" using std::string; INIReader::INIReader(string filename) { _error = ini_parse(filename.c_str(), ValueHandler, this); } int INIReader::ParseError() { return _error; } string INIReader::Get(string section, string name, string default_value) { string key = MakeKey(section, name); return _values.count(key) ? _values[key] : default_value; } long INIReader::GetInteger(string section, string name, long default_value) { string valstr = Get(section, name, ""); const char* value = valstr.c_str(); char* end; // This parses "1234" (decimal) and also "0x4D2" (hex) long n = strtol(value, &end, 0); return end > value ? n : default_value; } double INIReader::GetReal(string section, string name, double default_value) { string valstr = Get(section, name, ""); const char* value = valstr.c_str(); char* end; double n = strtod(value, &end); return end > value ? 
n : default_value; } bool INIReader::GetBoolean(string section, string name, bool default_value) { string valstr = Get(section, name, ""); // Convert to lower case to make string comparisons case-insensitive std::transform(valstr.begin(), valstr.end(), valstr.begin(), ::tolower); if (valstr == "true" || valstr == "yes" || valstr == "on" || valstr == "1") return true; else if (valstr == "false" || valstr == "no" || valstr == "off" || valstr == "0") return false; else return default_value; } string INIReader::MakeKey(string section, string name) { string key = section + "=" + name; // Convert to lower case to make section/name lookups case-insensitive std::transform(key.begin(), key.end(), key.begin(), ::tolower); return key; } int INIReader::ValueHandler(void* user, const char* section, const char* name, const char* value) { INIReader* reader = (INIReader*)user; string key = MakeKey(section, name); if (reader->_values[key].size() > 0) reader->_values[key] += "\n"; reader->_values[key] += value; return 1; } HINGE-0.5.0/src/lib/LAInterface.cpp000066400000000000000000004201241314415550300165300ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "LAInterface.h" #include "align.h" #include "DB.h" #include "paf.h" #include "kseq.h" void Read::showRead() { std::cout << "read #" << id << std::endl; std::cout << ">" << name << std::endl; std::cout << bases << std::endl; } int LAInterface::openDB2(std::string filename, std::string filename2) { char *fn = new char[filename.length() + 1]; strcpy(fn, filename.c_str()); char *fn2 = new char[filename2.length() + 1]; strcpy(fn2, filename2.c_str()); int status = Open_DB(fn, this->db1); if (status < 0) exit(1); if (this->db1->part > 0) { fprintf(stderr, "%s: Cannot be called on a block: %s\n", "test", fn); exit(1); } status = Open_DB(fn2, this->db2); if (status < 0) exit(1); if (this->db2->part > 0) { 
fprintf(stderr, "%s: Cannot be called on a block: %s\n", "test", fn); exit(1); } Trim_DB(db1); Trim_DB(db2); char *fn_1 = new char[filename.length() + 1 + 3]; strcpy(fn_1, fn); strcat(fn_1, ".db"); FILE * dstub = Fopen(fn_1, (char *)"r"); if (dstub == NULL) exit(1); if (fscanf(dstub, DB_NFILE, &nfiles) != 1) SYSTEM_ERROR printf("%d files\n", nfiles); flist = (char **) Malloc(sizeof(char *) * nfiles, (char *)"Allocating file list"); findx = (int *) Malloc(sizeof(int *) * (nfiles + 1), (char *)"Allocating file index"); if (flist == NULL || findx == NULL) exit(1); findx += 1; findx[-1] = 0; for (int i = 0; i < nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(dstub, DB_FDATA, findx + i, fname, prolog) != 3) SYSTEM_ERROR if ((flist[i] = Strdup(prolog, (char *)"Adding to file list")) == NULL) exit(1); } fclose(dstub); char *fn_2 = new char[filename2.length() + 1 + 3]; strcpy(fn_2, fn2); strcat(fn_2, ".db"); dstub = Fopen(fn_2, (char*)"r"); if (dstub == NULL) exit(1); if (fscanf(dstub, DB_NFILE, &nfiles2) != 1) SYSTEM_ERROR printf("%d files\n", nfiles2); flist2 = (char **) Malloc(sizeof(char *) * nfiles2, (char *)"Allocating file list"); findx2 = (int *) Malloc(sizeof(int *) * (nfiles2 + 1), (char *)"Allocating file index"); if (flist2 == NULL || findx2 == NULL) exit(1); findx2 += 1; findx2[-1] = 0; for (int i = 0; i < nfiles2; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(dstub, DB_FDATA, findx2 + i, fname, prolog) != 3) SYSTEM_ERROR if ((flist2[i] = Strdup(prolog, (char *)"Adding to file list")) == NULL) exit(1); } fclose(dstub); delete [] fn; delete [] fn2; delete [] fn_1; delete [] fn_2; return 0; } int LAInterface::openDB(std::string filename) { char *fn = new char[filename.length() + 1]; strcpy(fn, filename.c_str()); int status = Open_DB(fn, this->db1); if (status < 0) exit(1); if (this->db1->part > 0) { fprintf(stderr, "%s: Cannot be called on a block: %s\n", "test", fn); exit(1); } this->db2 = this->db1; Trim_DB(db1); FILE *dstub; char 
*fn2 = new char[filename.length() + 1 + 3]; strcpy(fn2, fn); strcat(fn2, ".db"); dstub = Fopen(fn2, (char*)"r"); if (dstub == NULL) exit(1); if (fscanf(dstub, DB_NFILE, &nfiles) != 1) SYSTEM_ERROR //printf("%d files\n", nfiles); flist = (char **) Malloc(sizeof(char *) * nfiles, (char *)"Allocating file list"); findx = (int *) Malloc(sizeof(int *) * (nfiles + 1), (char *)"Allocating file index"); if (flist == NULL || findx == NULL) exit(1); findx += 1; findx[-1] = 0; for (int i = 0; i < nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(dstub, DB_FDATA, findx + i, fname, prolog) != 3) SYSTEM_ERROR if ((flist[i] = Strdup(prolog, (char *)"Adding to file list")) == NULL) exit(1); } fclose(dstub); delete[] fn; return 0; } int LAInterface::closeDB() { Close_DB(db1); return 0; } int LAInterface::closeDB2() { Close_DB(db1); Close_DB(db2); return 0; } void LAInterface::showRead(int from, int to) { if (flist == NULL || findx == NULL) exit(1); HITS_READ *reads; HITS_TRACK *first; char *read, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db1); int UPPER = 1; int WIDTH = 80; //printf("2"); { entry = NULL; first = db1->tracks; } hilight = 'A' - 'a'; iscase = islower; map = 0; reads = db1->reads; substr = 0; c = 0; b = from; e = to; for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); { while (i < findx[map - 1]) map -= 1; while (i >= findx[map]) map += 1; printf(">%s/%d/%d_%d", flist[map], r->origin, r->fpulse, r->fpulse + len); if (qv > 0) printf(" RQ=0.%3d", qv); } printf("\n"); Load_Read(db1, i, read, UPPER); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i + 1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j + 1]; for (m 
= bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:", track->name); printf(" [%d,%d]", bd, ed); } printf("\n"); } } fst = 0; lst = len; { int j; for (j = fst; j + WIDTH < lst; j += WIDTH) printf("%.*s\n", WIDTH, read + j); if (j < lst) printf("%.*s\n", lst - j, read + j); } } } void LAInterface::showRead2(int from, int to) { if (flist2 == NULL || findx2 == NULL) exit(1); HITS_READ *reads; HITS_TRACK *first; char *read, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db2); int UPPER = 1; int WIDTH = 80; //printf("2"); { entry = NULL; first = db2->tracks; } hilight = 'A' - 'a'; iscase = islower; map = 0; reads = db2->reads; substr = 0; c = 0; b = from; e = to; for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); { while (i < findx[map - 1]) map -= 1; while (i >= findx[map]) map += 1; printf(">%s/%d/%d_%d", flist[map], r->origin, r->fpulse, r->fpulse + len); if (qv > 0) printf(" RQ=0.%3d", qv); } printf("\n"); Load_Read(db2, i, read, UPPER); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i + 1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j + 1]; for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:", track->name); printf(" [%d,%d]", bd, ed); } printf("\n"); } } fst = 0; lst = len; { int j; for (j = fst; j + WIDTH < lst; j += WIDTH) printf("%.*s\n", WIDTH, read + j); if (j < lst) printf("%.*s\n", lst - j, read + j); } } } Read *LAInterface::getRead(int number) { std::stringstream ss; std::string read_name; std::string read_bases; if (flist == NULL || findx == NULL) exit(1); HITS_READ *reads; HITS_TRACK *first; char *read, 
**entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db1); int UPPER = 1; int WIDTH = 80; //printf("2"); entry = NULL; first = db1->tracks; hilight = 'A' - 'a'; map = 0; reads = db1->reads; substr = 0; c = 0; b = number; e = number + 1; for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); { while (i < findx[map - 1]) map -= 1; while (i >= findx[map]) map += 1; ss << flist[map] << '/' << r->origin << '/' << r->fpulse << '_' << r->fpulse + len; if (qv > 0) ss << "RQ=" << qv; } ss >> read_name; Load_Read(db1, i, read, UPPER); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i + 1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j + 1]; for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:", track->name); printf(" [%d,%d]", bd, ed); } printf("\n"); } } read_bases = std::string(read); fst = 0; lst = len; } Read *new_r = new Read(number, read_name, read_bases); return new_r; } Read *LAInterface::getRead2(int number) { std::stringstream ss; std::string read_name; std::string read_bases; if (flist2 == NULL || findx2 == NULL) exit(1); HITS_READ *reads; HITS_TRACK *first; char *read, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db2); int UPPER = 1; int WIDTH = 80; //printf("2"); entry = NULL; first = db2->tracks; hilight = 'A' - 'a'; map = 0; reads = db2->reads; substr = 0; c = 0; b = number; e = number + 1; for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); { while (i < findx2[map - 1]) map -= 1; while (i >= findx2[map]) 
map += 1; ss << flist2[map] << '/' << r->origin << '/' << r->fpulse << '_' << r->fpulse + len; if (qv > 0) ss << "RQ=" << qv; } ss >> read_name; Load_Read(db2, i, read, UPPER); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i + 1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j + 1]; for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:", track->name); printf(" [%d,%d]", bd, ed); } printf("\n"); } } read_bases = std::string(read); fst = 0; lst = len; } Read *new_r = new Read(number, read_name, read_bases); return new_r; } int LAInterface::openAlignmentFile(std::string filename) { char *fn = new char[filename.size() + 1]; strcpy(fn, filename.c_str()); input = Fopen(fn, (char*)"r"); if (input == NULL) exit(1); if (fread(&novl, sizeof(int64), 1, input) != 1) SYSTEM_ERROR if (fread(&tspace, sizeof(int), 1, input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } //printf("\n%s: ", fn); //Print_Number(novl, 0, stdout); //printf(" records\n"); return 0; } void LAInterface::showAlignment(int from, int to) { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; bool ALIGN = true; bool REFERENCE = false; bool CARTOON = false; bool OVERLAP = true; bool FLIP = false; bool UPPERCASE = false; bool MAP = false; int INDENT = 4; int WIDTH = 100; int BORDER = 10; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) 
Malloc(sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); in = 0; //if (pts!=NULL) free(pts); //pts = NULL; pts = new int[4]; pts[0] = from + 1; pts[1] = to; pts[2] = INT32_MAX; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen / tspace + 2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen / tspace + 2); } ar_wide += (ar_wide - 1) / 3; br_wide += (br_wide - 1) / 3; ai_wide += (ai_wide - 1) / 3; bi_wide += (bi_wide - 1) / 3; mn_wide += (mn_wide - 1) / 3; tp_wide += (tp_wide - 1) / 3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; for (j = 0; j < novl; j++) // Read it in { //printf("j:%d/%d\n",j,novl); Read_Overlap(input, ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2 * ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace, sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); } ovl->path.trace = (void *) trace; Read_Trace(input, ovl, tbytes); // Determine if it should be displayed ar = ovl->aread + 1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen / 2; if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M 
option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast + 1, br_wide + 1, stdout); printf(" %d ->%lld\n", db2->reads[blast].rlen, db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if (ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it if (ALIGN || CARTOON || REFERENCE) printf("\n"); if (FLIP) { Flip_Alignment(aln, 0); Print_Number((int64) ovl->bread + 1, ar_wide + 1, stdout); printf(" "); Print_Number((int64) ovl->aread + 1, br_wide + 1, stdout); } else { Print_Number((int64) ovl->aread , ar_wide + 1, stdout); printf(" "); Print_Number((int64) ovl->bread , br_wide + 1, stdout); } if (COMP(ovl->flags)) printf(" c"); else printf(" n"); printf(" ["); Print_Number((int64) ovl->path.abpos, ai_wide, stdout); printf(".."); Print_Number((int64) ovl->path.aepos, ai_wide, stdout); printf("]%d x [",aln->alen); Print_Number((int64) ovl->path.bbpos, bi_wide, stdout); printf(".."); Print_Number((int64) ovl->path.bepos, bi_wide, stdout); printf("]%d", aln->blen); if (ALIGN || CARTOON || REFERENCE) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; if (FLIP) Flip_Alignment(aln, 0); if (small) Decompress_TraceTo16(ovl); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen - ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen - ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; } aseq = Load_Subread(db1, ovl->aread, amin, amax, abuffer, 0); bseq = Load_Subread(db2, ovl->bread, 
bmin, bmax, bbuffer, 0); aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq, bmax - bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; computeTracePTS(aln, work, tspace); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq, amax - amin); Complement_Seq(bseq, bmax - bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln, 1); } } if (CARTOON) { printf(" ("); Print_Number(tps, tp_wide, stdout); printf(" trace pts)\n\n"); Alignment_Cartoon(stdout, aln, INDENT, mx_wide); } else { printf(" : = "); Print_Number((int64) ovl->path.diffs, mn_wide, stdout); printf(" diffs ("); Print_Number(tps, tp_wide, stdout); printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); if (ALIGN) printAlignment(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); } else { printf(" : < "); Print_Number((int64) ovl->path.diffs, mn_wide, stdout); printf(" diffs ("); Print_Number(tps, tp_wide, stdout); printf(" trace pts)\n"); } } free(trace); if (ALIGN) { free(bbuffer - 1); free(abuffer - 1); Free_Work_Data(work); } } void LAInterface::getAlignmentB(std::vector &result, int from) { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; bool ALIGN = false; bool REFERENCE = false; bool CARTOON = false; bool OVERLAP = true; bool FLIP = false; bool UPPERCASE = false; bool MAP = false; int INDENT = 4; int WIDTH = 100; int BORDER = 10; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); in = 0; //if (pts!=NULL) 
free(pts); //pts = NULL; pts = new int[4]; pts[0] = from + 1; pts[1] = from + 1; pts[2] = INT32_MAX; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen / tspace + 2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen / tspace + 2); } ar_wide += (ar_wide - 1) / 3; br_wide += (br_wide - 1) / 3; ai_wide += (ai_wide - 1) / 3; bi_wide += (bi_wide - 1) / 3; mn_wide += (mn_wide - 1) / 3; tp_wide += (tp_wide - 1) / 3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; for (j = 0; j < novl; j++) // Read it in { //printf("j:%d/%d\n",j,novl); Read_Overlap(input, ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2 * ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace, sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); } ovl->path.trace = (void *) trace; Read_Trace(input, ovl, tbytes); // Determine if it should be displayed ar = ovl->aread + 1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen / 2; if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && 
seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast + 1, br_wide + 1, stdout); printf(" %d ->%lld\n", db2->reads[blast].rlen, db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if (ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it if (ALIGN || CARTOON || REFERENCE) printf("\n"); if (FLIP) { Flip_Alignment(aln, 0); //Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); //printf(" "); //Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { //Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); //printf(" "); //Print_Number((int64) ovl->bread+1,br_wide+1,stdout); result.push_back(ovl->bread); } //if (COMP(ovl->reverse_complement_match_)) // printf(" c"); //else // printf(" n"); //printf(" ["); //Print_Number((int64) ovl->path.read_A_match_start_,ai_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_A_match_end_,ai_wide,stdout); //printf("] x ["); //Print_Number((int64) ovl->path.read_B_match_start_,bi_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_B_match_end_,bi_wide,stdout); //printf("]"); if ((ALIGN || CARTOON || REFERENCE) && (false)) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; if (FLIP) Flip_Alignment(aln, 0); if (small) Decompress_TraceTo16(ovl); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen - ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen - ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; } aseq = Load_Subread(db1, ovl->aread, amin, amax, abuffer, 0); bseq = Load_Subread(db2, ovl->bread, bmin, bmax, 
bbuffer, 0); aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq, bmax - bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; Compute_Trace_PTS(aln, work, tspace,GREEDIEST); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq, amax - amin); Complement_Seq(bseq, bmax - bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln, 1); } } if (CARTOON) { printf(" ("); Print_Number(tps, tp_wide, stdout); printf(" trace pts)\n\n"); Alignment_Cartoon(stdout, aln, INDENT, mx_wide); } else { printf(" : = "); Print_Number((int64) ovl->path.diffs, mn_wide, stdout); printf(" diffs ("); Print_Number(tps, tp_wide, stdout); printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); if (ALIGN) Print_Alignment(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); } else {// printf(" : < "); // Print_Number((int64) ovl->path.diffs,mn_wide,stdout); // printf(" diffs ("); // Print_Number(tps,tp_wide,stdout); // printf(" trace pts)\n"); } } free(trace); if (ALIGN) { free(bbuffer - 1); free(abuffer - 1); Free_Work_Data(work); } } void LAInterface::getRead(std::vector &reads_vec, int from, int to) { std::stringstream ss; std::string read_name; std::string read_bases; if (flist == NULL || findx == NULL) exit(1); HITS_READ *reads; HITS_TRACK *first; char *read, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db1); int UPPER = 1; int WIDTH = 80; entry = NULL; first = db1->tracks; hilight = 'A' - 'a'; map = 0; reads = db1->reads; substr = 0; c = 0; b = from; e = to; for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); { while (i < findx[map - 1]) map -= 1; while (i >= findx[map]) map += 1; ss << flist[map] << '/' << r->origin << '/' << r->fpulse << '_' << r->fpulse + len; if (qv > 0) 
ss << "RQ=" << qv; } ss >> read_name; Load_Read(db1, i, read, UPPER); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i + 1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j + 1]; for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:", track->name); printf(" [%d,%d]", bd, ed); } printf("\n"); } } read_bases = std::string(read); fst = 0; lst = len; Read *new_r = new Read(i, len, read_name, read_bases); reads_vec.push_back(new_r); } } void LAInterface::getRead2(std::vector &reads_vec, int from, int to) { std::stringstream ss; std::string read_name; std::string read_bases; if (flist2 == NULL || findx2 == NULL) exit(1); HITS_READ *reads; HITS_TRACK *first; char *read, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db2); int UPPER = 1; int WIDTH = 80; entry = NULL; first = db2->tracks; hilight = 'A' - 'a'; map = 0; reads = db2->reads; substr = 0; c = 0; b = from; e = to; for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); { while (i < findx2[map - 1]) map -= 1; while (i >= findx2[map]) map += 1; ss << flist2[map] << '/' << r->origin << '/' << r->fpulse << '_' << r->fpulse + len; if (qv > 0) ss << "RQ=" << qv; } ss >> read_name; Load_Read(db2, i, read, UPPER); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i + 1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j + 1]; for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:", track->name); printf(" 
[%d,%d]", bd, ed); } printf("\n"); } } read_bases = std::string(read); fst = 0; lst = len; Read *new_r = new Read(i, len, read_name, read_bases); reads_vec.push_back(new_r); } } void LAInterface::resetAlignment() { rewind(input); if (fread(&novl, sizeof(int64), 1, input) != 1) SYSTEM_ERROR if (fread(&tspace, sizeof(int), 1, input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } //printf("\n%s: ", "read again"); //Print_Number(novl, 0, stdout); //printf(" records\n"); } void LAInterface::getOverlap(std::vector &result_vec, std::vector &range) { int j; uint16 *trace; int tmax; int in, npt, idx, ar; int64 tps; aln->path = &(ovl->path); tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); in = 0; pts = new int[range.size()*2+20]; for (int k = 0; k < range.size(); k++) { pts[k*2] = range[k] + 1; pts[k*2+1] = range[k] + 1; } pts[range.size()*2] = INT32_MAX; npt = pts[0]; idx = 1; // For each record do for (j = 0; j < novl; j++) // Read it in { //if (j % (novl/100) == 0) { // printf("%d percent finished\n", j/(novl/100)); //} Read_Overlap(input, ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2 * ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace, sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); } ovl->path.trace = (void *) trace; Read_Trace(input, ovl, tbytes); // Determine if it should be displayed ar = ovl->aread + 1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen / 2; LOverlap *new_ovl = new LOverlap(); if (COMP(ovl->flags)) { new_ovl->reverse_complement_match_ = 1; } else { 
new_ovl->reverse_complement_match_ = 0; } if (small) Decompress_TraceTo16(ovl); new_ovl->trace_pts_len = ovl->path.tlen; new_ovl->trace_pts = (uint16 *)malloc(ovl->path.tlen * sizeof(uint16)); memcpy(new_ovl->trace_pts, ovl->path.trace, ovl->path.tlen * sizeof(uint16)); new_ovl->read_A_id_ = ovl->aread; new_ovl->read_B_id_ = ovl->bread; new_ovl->read_A_match_start_ = ovl->path.abpos; new_ovl->read_A_match_end_ = ovl->path.aepos; new_ovl->alen = aln->alen; new_ovl->blen = aln->blen; if (new_ovl->reverse_complement_match_ == 0) { new_ovl->read_B_match_start_ = ovl->path.bbpos; new_ovl->read_B_match_end_ = ovl->path.bepos; } else { new_ovl->read_B_match_start_ = new_ovl->blen - ovl->path.bepos; new_ovl->read_B_match_end_ = new_ovl->blen - ovl->path.bbpos; } new_ovl->diffs = ovl->path.diffs; new_ovl->tlen = ovl->path.tlen; new_ovl->tps = tps; result_vec.push_back(new_ovl); } free(trace); } void LAInterface::getOverlap(std::vector &result_vec, int from, int64 to) { int j; uint16 *trace; int tmax; int in, npt, idx, ar; int64 tps; aln->path = &(ovl->path); tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); in = 0; pts = new int[4]; pts[0] = from + 1; pts[1] = to + 0; pts[2] = INT32_MAX; //printf("from to %d %d\n",pts[0], pts[1]); npt = pts[0]; idx = 1; // For each record do for (j = 0; j < novl; j++) // Read it in { //if (j % (novl/100) == 0) { // printf("%d percent finished\n", j/(novl/100)); //} Read_Overlap(input, ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2 * ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace, sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); } ovl->path.trace = (void *) trace; Read_Trace(input, ovl, tbytes); // Determine if it should be displayed ar = ovl->aread + 1; //printf("ar %d\n", ar); if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = 
pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen / 2; LOverlap *new_ovl = new LOverlap(); if (COMP(ovl->flags)) { new_ovl->reverse_complement_match_ = 1; } else { new_ovl->reverse_complement_match_ = 0; } if (small) Decompress_TraceTo16(ovl); new_ovl->trace_pts_len = ovl->path.tlen; new_ovl->trace_pts = (uint16 *)malloc(ovl->path.tlen * sizeof(uint16)); memcpy(new_ovl->trace_pts, ovl->path.trace, ovl->path.tlen * sizeof(uint16)); new_ovl->read_A_id_ = ovl->aread; new_ovl->read_B_id_ = ovl->bread; new_ovl->read_A_match_start_ = ovl->path.abpos; new_ovl->read_A_match_end_ = ovl->path.aepos; new_ovl->alen = aln->alen; new_ovl->blen = aln->blen; if (new_ovl->reverse_complement_match_ == 0) { new_ovl->read_B_match_start_ = ovl->path.bbpos; new_ovl->read_B_match_end_ = ovl->path.bepos; } else { new_ovl->read_B_match_start_ = new_ovl->blen - ovl->path.bepos; new_ovl->read_B_match_end_ = new_ovl->blen - ovl->path.bbpos; } new_ovl->diffs = ovl->path.diffs; new_ovl->tlen = ovl->path.tlen; new_ovl->tps = tps; result_vec.push_back(new_ovl); } free(trace); } void LAInterface::getOverlapw(std::vector &result_vec, int from, int to) { int j; uint16 *trace; int tmax; int in, npt, idx, ar; int64 tps; aln->path = &(ovl->path); tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); in = 0; pts = new int[4]; pts[0] = from + 1; pts[1] = to + 0; pts[2] = INT32_MAX; npt = pts[0]; idx = 1; // For each record do for (j = 0; j < novl; j++) // Read it in { if (j % (novl/100) == 0) { printf("%d percent finished\n", j/(novl/100)); } Read_Overlap(input, ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2 * ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace, sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); } 
ovl->path.trace = (void *) trace; Read_Trace(input, ovl, tbytes); // Determine if it should be displayed ar = ovl->aread + 1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen / 2; LOverlap *new_ovl = new LOverlap(); if (COMP(ovl->flags)) { new_ovl->reverse_complement_match_ = 1; } else { new_ovl->reverse_complement_match_ = 0; } if (small) Decompress_TraceTo16(ovl); new_ovl->trace_pts_len = ovl->path.tlen; //new_ovl->trace_pts = (uint16 *)malloc(ovl->path.tlen * sizeof(uint16)); //memcpy(new_ovl->trace_pts, ovl->path.trace, ovl->path.tlen * sizeof(uint16)); new_ovl->trace_pts = 0; new_ovl->read_A_id_ = ovl->aread; new_ovl->read_B_id_ = ovl->bread; new_ovl->read_A_match_start_ = ovl->path.abpos; new_ovl->read_A_match_end_ = ovl->path.aepos; new_ovl->read_B_match_start_ = ovl->path.bbpos; new_ovl->read_B_match_end_ = ovl->path.bepos; new_ovl->alen = aln->alen; new_ovl->blen = aln->blen; new_ovl->diffs = ovl->path.diffs; new_ovl->tlen = ovl->path.tlen; new_ovl->tps = tps; result_vec.push_back(new_ovl); } free(trace); } void LAInterface::getOverlap(std::vector &result_vec, int n) { getOverlap(result_vec, n, n + 1); } void LAInterface::getAlignment(std::vector &result_vec, int from) { getAlignment(result_vec, from, from + 1); } void LAInterface::getAlignment(std::vector &result_vec, int from, int to) { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; bool ALIGN = true; bool REFERENCE = false; bool CARTOON = false; bool OVERLAP = false; bool FLIP = false; bool UPPERCASE = false; bool MAP = false; int 
INDENT = 4; int WIDTH = 100; int BORDER = 10; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); in = 0; //if (pts!=NULL) free(pts); //pts = NULL; pts = new int[4]; pts[0] = from + 1; pts[1] = to ; pts[2] = INT32_MAX; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen / tspace + 2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen / tspace + 2); } ar_wide += (ar_wide - 1) / 3; br_wide += (br_wide - 1) / 3; ai_wide += (ai_wide - 1) / 3; bi_wide += (bi_wide - 1) / 3; mn_wide += (mn_wide - 1) / 3; tp_wide += (tp_wide - 1) / 3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; for (j = 0; j < novl; j++) // Read it in { //printf("j:%d/%d\n",j,novl); Read_Overlap(input, ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2 * ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace, sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); } ovl->path.trace = (void *) trace; Read_Trace(input, ovl, tbytes); // Determine if it should be displayed ar = ovl->aread + 1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; //printf("j:%d/%d\n",j,novl); // If -o check display only 
overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen / 2; LAlignment *new_al = new LAlignment(); new_al->read_A_id_ = ovl->aread; new_al->read_B_id_ = ovl->bread; if (COMP(ovl->flags)) //printf(" c"); new_al->flags = 1; else new_al->flags = 0; //printf(" n"); //printf(" ["); //Print_Number((int64) ovl->path.read_A_match_start_,ai_wide,stdout); new_al->abpos = ovl->path.abpos; //printf(".."); //Print_Number((int64) ovl->path.read_A_match_end_,ai_wide,stdout); new_al->aepos = ovl->path.aepos; //printf("] x ["); //Print_Number((int64) ovl->path.read_B_match_start_,bi_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_B_match_end_,bi_wide,stdout); //printf("]"); new_al->bbpos = ovl->path.bbpos; new_al->bepos = ovl->path.bepos; new_al->alen = aln->alen; new_al->blen = aln->blen; new_al->diffs = ovl->path.diffs; new_al->tlen = ovl->path.tlen; new_al->tps = tps; if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast + 1, br_wide + 1, stdout); printf(" %d ->%lld\n", db2->reads[blast].rlen, db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if (ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it //if (ALIGN || CARTOON || REFERENCE) //printf("\n"); if (FLIP) { Flip_Alignment(aln, 0); //Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); //printf(" "); //Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { //Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); //printf(" "); //Print_Number((int64) 
ovl->bread+1,br_wide+1,stdout); //result.push_back(ovl->bread); } //if (COMP(ovl->reverse_complement_match_)) // printf(" c"); //else // printf(" n"); //printf(" ["); //Print_Number((int64) ovl->path.read_A_match_start_,ai_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_A_match_end_,ai_wide,stdout); //printf("] x ["); //Print_Number((int64) ovl->path.read_B_match_start_,bi_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_B_match_end_,bi_wide,stdout); //printf("]"); if ((ALIGN || CARTOON || REFERENCE) || true) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; if (FLIP) Flip_Alignment(aln, 0); //if (small) // Decompress_TraceTo16(ovl); if (small) Decompress_TraceTo16(ovl); new_al->trace_pts_len = ovl->path.tlen; new_al->trace_pts = (uint16 *)malloc(ovl->path.tlen * sizeof(uint16)); memcpy(new_al->trace_pts, ovl->path.trace, ovl->path.tlen * sizeof(uint16)); /*{ printf("\n"); uint16 *pp = (uint16 *) ovl->path.trace; for (int uu = 0; uu < ovl->path.tlen; uu++) { printf("%d ", pp[uu]); new_al->trace_pts[uu] = pp[uu]; } printf("\n"); }*/ #ifdef DOALIGN amin = ovl->path.read_A_match_start_ - BORDER; if (amin < 0) amin = 0; amax = ovl->path.read_A_match_end_ + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->reverse_complement_match_)) { bmin = (aln->blen - ovl->path.read_B_match_end_) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen - ovl->path.read_B_match_start_) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.read_B_match_start_ - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.read_B_match_end_ + BORDER; if (bmax > aln->blen) bmax = aln->blen; } aseq = Load_Subread(db1, ovl->aread, amin, amax, abuffer, 0); bseq = Load_Subread(db2, ovl->bread, bmin, bmax, bbuffer, 0); aln->aseq = aseq - amin; if (COMP(aln->reverse_complement_match_)) { Complement_Seq(bseq, bmax - bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; 
Compute_Trace_PTS(aln, work, tspace,GREEDIEST); /*new_al->aseq = (char *) malloc(new_al->alen * sizeof(char)); new_al->bseq = (char *) malloc(new_al->blen * sizeof(char)); memcpy(new_al->aseq, aln->aseq, new_al->alen* sizeof(char)); memcpy(new_al->bseq, aln->bseq, new_al->blen* sizeof(char));*/ new_al->aseq = NULL; new_al->bseq = NULL; /*{ int tlen = aln->path->tlen; int *trace = (int *) aln->path->trace; int u; printf(" "); for (u = 0; u < tlen; u++) printf("%d,", (int) trace[u]); printf("\n"); }*/ new_al->tlen = aln->path->tlen; new_al->trace = (int *) malloc(sizeof(int) * aln->path->tlen*2); //if (new_al->trace == NULL) // exit(1); //memcpy(new_al->trace, (void *) aln->path->trace, sizeof(int) * sizeof(int) * aln->path->tlen); //free(trace); //printf("after\n"); { int tlen = aln->path->tlen; int *trace = (int *) aln->path->trace; int u; //printf(" "); for (u = 0; u < tlen; u++) { //printf("%d,", (int) trace[u]); new_al->trace[u] = (int)trace[u]; } //printf("\n"); } #endif if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq, amax - amin); Complement_Seq(bseq, bmax - bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln, 1); } } if (CARTOON) { //printf(" ("); //Print_Number(tps, tp_wide, stdout); //printf(" trace pts)\n\n"); //Alignment_Cartoon(stdout, aln, INDENT, mx_wide); } else { //printf(" : = "); //Print_Number((int64) ovl->path.diffs, mn_wide, stdout); //printf(" diffs ("); //Print_Number(tps, tp_wide, stdout); //printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); //if (ALIGN) //printAlignment(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); // printAlignment_exp(stdout, new_al, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); } else {// printf(" : < "); // Print_Number((int64) ovl->path.diffs,mn_wide,stdout); // printf(" diffs ("); // Print_Number(tps,tp_wide,stdout); // printf(" trace pts)\n"); } result_vec.push_back(new_al); } 
free(trace); if (ALIGN) { free(bbuffer - 1); free(abuffer - 1); Free_Work_Data(work); } } void LAInterface::getAlignment(std::vector &result_vec, std::vector & range) { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; bool ALIGN = true; bool REFERENCE = false; bool CARTOON = false; bool OVERLAP = false; bool FLIP = false; bool UPPERCASE = false; bool MAP = false; int INDENT = 4; int WIDTH = 100; int BORDER = 10; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); in = 0; //if (pts!=NULL) free(pts); //pts = NULL; pts = new int[range.size()*2+20]; for (int k = 0; k < range.size(); k++) { pts[k*2] = range[k] + 1; pts[k*2+1] = range[k] + 1; } pts[range.size()*2] = INT32_MAX; /*for (int i = 0; i < range.size()*2+2; i++) { printf("%d\n",pts[i]); }*/ npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen / tspace + 2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen / tspace + 2); } ar_wide += (ar_wide - 1) / 3; br_wide += (br_wide - 1) / 3; ai_wide += (ai_wide - 1) / 3; bi_wide += (bi_wide - 1) / 3; mn_wide += (mn_wide - 1) / 3; tp_wide += (tp_wide - 1) / 3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = 
rhalf = 0; for (j = 0; j < novl; j++) // Read it in { //printf("j:%d/%d\n",j,novl); Read_Overlap(input, ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2 * ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace, sizeof(uint16) * tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit(1); } ovl->path.trace = (void *) trace; Read_Trace(input, ovl, tbytes); // Determine if it should be displayed ar = ovl->aread + 1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; //printf("j:%d/%d\n",j,novl); // If -o check display only overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen / 2; LAlignment *new_al = new LAlignment(); new_al->read_A_id_ = ovl->aread; new_al->read_B_id_ = ovl->bread; if (COMP(ovl->flags)) //printf(" c"); new_al->flags = 1; else new_al->flags = 0; //printf(" n"); //printf(" ["); //Print_Number((int64) ovl->path.read_A_match_start_,ai_wide,stdout); new_al->abpos = ovl->path.abpos; //printf(".."); //Print_Number((int64) ovl->path.read_A_match_end_,ai_wide,stdout); new_al->aepos = ovl->path.aepos; //printf("] x ["); //Print_Number((int64) ovl->path.read_B_match_start_,bi_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_B_match_end_,bi_wide,stdout); //printf("]"); new_al->bbpos = ovl->path.bbpos; new_al->bepos = ovl->path.bepos; new_al->alen = aln->alen; new_al->blen = aln->blen; new_al->diffs = ovl->path.diffs; new_al->tlen = ovl->path.tlen; new_al->tps = tps; if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing 
"); Print_Number((int64) blast + 1, br_wide + 1, stdout); printf(" %d ->%lld\n", db2->reads[blast].rlen, db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if (ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it //if (ALIGN || CARTOON || REFERENCE) //printf("\n"); if (FLIP) { Flip_Alignment(aln, 0); //Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); //printf(" "); //Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { //Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); //printf(" "); //Print_Number((int64) ovl->bread+1,br_wide+1,stdout); //result.push_back(ovl->bread); } //if (COMP(ovl->reverse_complement_match_)) // printf(" c"); //else // printf(" n"); //printf(" ["); //Print_Number((int64) ovl->path.read_A_match_start_,ai_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_A_match_end_,ai_wide,stdout); //printf("] x ["); //Print_Number((int64) ovl->path.read_B_match_start_,bi_wide,stdout); //printf(".."); //Print_Number((int64) ovl->path.read_B_match_end_,bi_wide,stdout); //printf("]"); if ((ALIGN || CARTOON || REFERENCE) || true) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; if (FLIP) Flip_Alignment(aln, 0); //if (small) // Decompress_TraceTo16(ovl); if (small) Decompress_TraceTo16(ovl); new_al->trace_pts_len = ovl->path.tlen; new_al->trace_pts = (uint16 *)malloc(ovl->path.tlen * sizeof(uint16)); memcpy(new_al->trace_pts, ovl->path.trace, ovl->path.tlen * sizeof(uint16)); /*{ printf("\n"); uint16 *pp = (uint16 *) ovl->path.trace; for (int uu = 0; uu < ovl->path.tlen; uu++) { printf("%d ", pp[uu]); new_al->trace_pts[uu] = pp[uu]; } printf("\n"); }*/ //#define DOALIGN #ifdef DOALIGN amin = ovl->path.read_A_match_start_ - BORDER; if (amin < 0) amin = 0; amax = ovl->path.read_A_match_end_ + BORDER; if (amax > aln->alen) amax = aln->alen; if 
(COMP(aln->reverse_complement_match_)) { bmin = (aln->blen - ovl->path.read_B_match_end_) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen - ovl->path.read_B_match_start_) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.read_B_match_start_ - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.read_B_match_end_ + BORDER; if (bmax > aln->blen) bmax = aln->blen; } aseq = Load_Subread(db1, ovl->aread, amin, amax, abuffer, 0); bseq = Load_Subread(db2, ovl->bread, bmin, bmax, bbuffer, 0); aln->aseq = aseq - amin; if (COMP(aln->reverse_complement_match_)) { Complement_Seq(bseq, bmax - bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; Compute_Trace_PTS(aln, work, tspace,GREEDIEST); #endif /*new_al->aseq = (char *) malloc(new_al->alen * sizeof(char)); new_al->bseq = (char *) malloc(new_al->blen * sizeof(char)); memcpy(new_al->aseq, aln->aseq, new_al->alen* sizeof(char)); memcpy(new_al->bseq, aln->bseq, new_al->blen* sizeof(char));*/ //new_al->aseq = NULL; //new_al->bseq = NULL; /*{ int tlen = aln->path->tlen; int *trace = (int *) aln->path->trace; int u; printf(" "); for (u = 0; u < tlen; u++) printf("%d,", (int) trace[u]); printf("\n"); }*/ #ifdef DOALIGN new_al->tlen = aln->path->tlen; new_al->trace = (int *) malloc(sizeof(int) * aln->path->tlen*2); //if (new_al->trace == NULL) // exit(1); //memcpy(new_al->trace, (void *) aln->path->trace, sizeof(int) * sizeof(int) * aln->path->tlen); //free(trace); //printf("after\n"); { int tlen = aln->path->tlen; int *trace = (int *) aln->path->trace; int u; //printf(" "); for (u = 0; u < tlen; u++) { //printf("%d,", (int) trace[u]); new_al->trace[u] = (int)trace[u]; } //printf("\n"); } #endif if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq, amax - amin); Complement_Seq(bseq, bmax - bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln, 1); } } if (CARTOON) { //printf(" ("); //Print_Number(tps, tp_wide, stdout); //printf(" trace 
pts)\n\n"); //Alignment_Cartoon(stdout, aln, INDENT, mx_wide); } else { //printf(" : = "); //Print_Number((int64) ovl->path.diffs, mn_wide, stdout); //printf(" diffs ("); //Print_Number(tps, tp_wide, stdout); //printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); //if (ALIGN) //printAlignment(stdout, aln, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); // printAlignment_exp(stdout, new_al, work, INDENT, WIDTH, BORDER, UPPERCASE, mx_wide); } else {// printf(" : < "); // Print_Number((int64) ovl->path.diffs,mn_wide,stdout); // printf(" diffs ("); // Print_Number(tps,tp_wide,stdout); // printf(" trace pts)\n"); } result_vec.push_back(new_al); } free(trace); if (ALIGN) { free(bbuffer - 1); free(abuffer - 1); Free_Work_Data(work); } } int LAInterface::getReadNumber() { return db1->nreads; } int LAInterface::getReadNumber2() { return db2->nreads; } int64 LAInterface::getAlignmentNumber() { resetAlignment(); return novl; } void LAInterface::showOverlap(int from, int to) { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; bool ALIGN = false; bool REFERENCE = false; bool CARTOON = false; bool OVERLAP = false; bool FLIP = false; bool UPPERCASE = false; bool MAP = false; int INDENT = 4; int WIDTH = 100; int BORDER = 10; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax, (char*)"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) 
db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); } ar_wide += (ar_wide-1)/3; br_wide += (br_wide-1)/3; ai_wide += (ai_wide-1)/3; bi_wide += (bi_wide-1)/3; mn_wide += (mn_wide-1)/3; tp_wide += (tp_wide-1)/3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; pts = new int[4]; pts[0] = from + 1; pts[1] = to ; pts[2] = INT32_MAX; npt = pts[0]; idx = 1; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2*ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax, (char *)"Allocating trace vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen/2; if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast+1,br_wide+1,stdout); printf(" %d ->%lld\n",db2->reads[blast].rlen,db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if 
(ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it if (ALIGN || CARTOON || REFERENCE) printf("\n"); if (FLIP) { Flip_Alignment(aln,0); Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->bread+1,br_wide+1,stdout); } if (COMP(ovl->flags)) printf(" c"); else printf(" n"); printf(" ["); Print_Number((int64) ovl->path.abpos,ai_wide,stdout); printf(".."); Print_Number((int64) ovl->path.aepos,ai_wide,stdout); printf("] x ["); Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); printf(".."); Print_Number((int64) ovl->path.bepos,bi_wide,stdout); printf("]%d",aln->blen); if (ALIGN || CARTOON || REFERENCE) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; if (FLIP) Flip_Alignment(aln,0); if (small) Decompress_TraceTo16(ovl); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen-ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen-ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; } aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq,bmax-bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; Compute_Trace_PTS(aln,work,tspace,GREEDIEST); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq,amax-amin); Complement_Seq(bseq,bmax-bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln,1); } } if (CARTOON) { printf(" ("); 
Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n\n"); Alignment_Cartoon(stdout,aln,INDENT,mx_wide); } else { printf(" : = "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); if (ALIGN) Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); } else { printf(" : < "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } } free(trace); if (ALIGN) { free(bbuffer-1); free(abuffer-1); Free_Work_Data(work); } return; } typedef struct // Hidden from the user, working space for each thread { int vecmax; void *vector; int celmax; void *cells; int pntmax; void *points; int tramax; void *trace; } _Work_Data; static int enlarge_vector(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->vector,max, (char *)"Enlarging DP vector"); if (vec == NULL) EXIT(1); work->vecmax = max; work->vector = vec; return (0); } static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' }; static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; int LAInterface::printAlignment(FILE *file, Alignment *align, Work_Data *ework, int indent, int width, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = (int *) align->path->trace; int tlen = align->path->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb; int match, diff; char *N2A; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif o = sizeof(char)*3*(width+1); if (o > work->vecmax) if (enlarge_vector(work,o)) EXIT(1); if (upper) N2A = ToU; else N2A = ToL; Abuf = (char *) work->vector; Bbuf = Abuf + (width+1); Dbuf = 
Bbuf + (width+1); aend = align->path->aepos; bend = align->path->bepos; Abuf[width] = Bbuf[width] = Dbuf[width] = '\0'; /* buffer/output next column */ #define COLUMN(x,y) \ { int u, v; \ if (o >= width) \ { fprintf(file,"\n"); \ fprintf(file,"%*s",indent,""); \ if (coord > 0) \ { if (sa <= aend) \ fprintf(file," %*d",coord,sa); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %s\n",Abuf); \ fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf); \ fprintf(file,"%*s",indent,""); \ if (sb <= bend) \ fprintf(file," %*d",coord,sb); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %s",Bbuf); \ } \ else \ { fprintf(file," %s\n",Abuf); \ fprintf(file,"%*s %s\n",indent,"",Dbuf); \ fprintf(file,"%*s %s",indent,"",Bbuf); \ } \ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ o = 0; \ sa = i; \ sb = j; \ match = diff = 0; \ } \ u = (x); \ v = (y); \ if (u == 4 || v == 4) \ Dbuf[o] = ' '; \ else if (u == v) \ Dbuf[o] = mtag; \ else \ Dbuf[o] = dtag; \ Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } sa = i; sb = j; mtag = ':'; dtag = ':'; while (prefa > prefb) { COLUMN(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { COLUMN(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { COLUMN(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) COLUMN(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(7,b[j]) j += 1; diff += 1; } else { while (j != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(a[i],7) i += 1; diff += 1; } p = 
align->path->aepos; while (i <= p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) COLUMN(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { COLUMN(a[i],b[j]) i += 1; j += 1; } else { COLUMN(a[i],4) i += 1; } else { COLUMN(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa <= aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb <= bend) fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s %.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); //fprintf(file, "Cool!\n"); fflush(file); return (0); } typedef void Work_Data; typedef struct { int *Stop; // Ongoing stack of alignment indels char *Aabs, *Babs; // Absolute base of A and B sequences int **PVF, **PHF; // List of waves for iterative np algorithms int mida, midb; // mid point division for mid-point algorithms int *VF, *VB; // Forward/Reverse waves for nd algorithms // (defunct: were used for O(nd) algorithms) } Trace_Waves; static int enlarge_trace(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->trace,max,(char *)"Enlarging trace vector"); if (vec == NULL) EXIT(1); work->tramax = max; work->trace = vec; return (0); } static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int 
*HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); printf("%*s A = ",depth,""); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf("%*s B = ",depth,""); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -INT32_MAX; posh = INT32_MAX; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); EXIT(-1); } else if (B < A) posl = (B-A)+1; else posh = (B-A)-1; } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; #define FS_MOVE(mdir,pdir) \ ac = F1[k]+1; \ if (ac < am) \ if (ap < am) \ { HF[k] = mdir; \ j = am; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ else \ if (ap < ac) \ { HF[k] = 0; \ j = ac; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ \ if (N < i) \ while (j < N && B[j] == a[j]) \ j += 1; \ else \ while (j < i && B[j] == a[j]) \ j += 1; \ F0[k] = j; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; char *a; int ap = (wave->Aabs-A)-1; int bp = (B-wave->Babs)+1; PHF[0][0] = 3; c = N; k = del; e = PHF[D][k]; PHF[D][k] = 3; while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2 { a = A + k; if (k < 0) m = -k; else m = 0; if 
(PVF[D][h] <= c) c = PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e < 1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } k = D = 0; e = PHF[D][k]; while (e != 3) { h = k-e; c = PVF[D][k]; if (e > 1) h += 3; else if (e == 0) D += 1; else D += 2; if (h > k) *wave->Stop++ = bp+c; else if (h < k) *wave->Stop++ = ap-(c+k); k = h; e = PHF[D][h]; } #ifdef DEBUG_SCRIPT k = D = 0; e = PHF[D][k]; while (e != 3) { h = k-e; c = PVF[D][k]; if (e > 1) h += 3; else if (e == 0) D += 1; else D += 2; if (h > k) printf("%*s D %d(%d)\n",depth,"",(c-k)-(ap-1),c+bp); else if (h < k) printf("%*s I %d(%d)\n",depth,"",c+(bp-1),(c+k)-ap); else printf("%*s %d S %d\n",depth,"",(c+k)-(ap+1),c+(bp-1)); k = h; e = PHF[D][h]; } #endif } return (D + abs(del)); } int LAInterface::computeTracePTS(Alignment *align, Work_Data *ework, int trace_spacing) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int dmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; if (points[d-1] > dmax) dmax = points[d-1]; s = 
(dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; ae = (ab/trace_spacing)*trace_spacing; bb = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } ae = path->aepos; be = path->bepos; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int LAInterface::showAlignmentTags(LAlignment *alignment) { //load aseq and bseq first //printf("A:%s\n",alignment->aseq); //printf("B:%s\n",alignment->bseq); int amin, amax, bmin, bmax; const int BORDER = 10; amin = alignment->abpos - BORDER; if (amin < 0) amin = 0; amax = alignment->aepos + BORDER; if (amax > alignment->alen) amax = alignment->alen; if (alignment->flags == 1) { bmin = (alignment->blen - alignment->bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (alignment->blen - alignment->bbpos) + BORDER; if (bmax > alignment->blen) bmax = alignment->blen; } else { bmin = alignment->bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = alignment->bepos + BORDER; if (bmax > alignment->blen) bmax = alignment->blen; } char * abuffer = New_Read_Buffer(db1); char * bbuffer = New_Read_Buffer(db2); char * aseq = Load_Subread(db1, alignment->read_A_id_, amin, amax, abuffer, 0); char * bseq = Load_Subread(db2, alignment->read_B_id_, bmin, bmax, bbuffer, 0); alignment->aseq = aseq - amin; if 
(alignment->flags == 1) { Complement_Seq(bseq, bmax - bmin); alignment->bseq = bseq - (alignment->blen - bmax); } else alignment->bseq = bseq - bmin; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb; int match, diff; char *N2A; int border = 10; int tlen = alignment->tlen; int * trace = alignment->trace; a = alignment->aseq - 1; b = alignment->bseq - 1; i = j = 1; prefa = alignment->abpos; prefb = alignment->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } sa = i; sb = j; mtag = ':'; dtag = ':'; #define COLUMN2(x,y) \ { \ printf(" %c-%c ",ToU[x],ToU[y]); \ } \ while (prefa > prefb) { COLUMN2(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { COLUMN2(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { COLUMN2(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) COLUMN2(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; //printf("%d\n",trace[c]); while (i != p) { COLUMN2(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN2(7,b[j]) j += 1; diff += 1; } else { while (j != p) { COLUMN2(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN2(a[i],7) i += 1; diff += 1; } p = alignment->aepos; while (i <= p) { COLUMN2(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) COLUMN2(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { COLUMN2(a[i],b[j]) i += 1; j += 1; } else { COLUMN2(a[i],4) i += 1; } else { COLUMN2(4,b[j]) j += 1; } c += 1; } } free(abuffer - 1); free(bbuffer - 1); alignment->aseq = NULL; alignment->bseq = NULL; return 
0; } std::pair LAInterface::getAlignmentTags(LAlignment *alignment) { //load aseq and bseq first //printf("A:%s\n",alignment->aseq); //printf("B:%s\n",alignment->bseq); int amin, amax, bmin, bmax; const int BORDER = 10; amin = alignment->abpos - BORDER; if (amin < 0) amin = 0; amax = alignment->aepos + BORDER; if (amax > alignment->alen) amax = alignment->alen; if (alignment->flags == 1) { bmin = (alignment->blen - alignment->bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (alignment->blen - alignment->bbpos) + BORDER; if (bmax > alignment->blen) bmax = alignment->blen; } else { bmin = alignment->bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = alignment->bepos + BORDER; if (bmax > alignment->blen) bmax = alignment->blen; } char * abuffer = New_Read_Buffer(db1); char * bbuffer = New_Read_Buffer(db2); char * aseq = Load_Subread(db1, alignment->read_A_id_, amin, amax, abuffer, 0); char * bseq = Load_Subread(db2, alignment->read_B_id_, bmin, bmax, bbuffer, 0); alignment->aseq = aseq - amin; if (alignment->flags == 1) { Complement_Seq(bseq, bmax - bmin); alignment->bseq = bseq - (alignment->blen - bmax); } else alignment->bseq = bseq - bmin; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb; int match, diff; char *N2A; int border = 10; int tlen = alignment->tlen; int * trace = alignment->trace; // get the trace from here a = alignment->aseq - 1; b = alignment->bseq - 1; i = j = 1; prefa = alignment->abpos; prefb = alignment->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } sa = i; sb = j; mtag = ':'; dtag = ':'; std::string aa = ""; std::string bb = ""; aa.reserve((alignment->aepos - alignment->abpos) * 2); bb.reserve((alignment->bepos - alignment->bbpos) * 2); #define COLUMN3(x,y) \ { \ aa.append(1,ToU[x]); \ bb.append(1,ToU[y]); \ } \ while (prefa > prefb) { //COLUMN(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { 
//COLUMN(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { //COLUMN(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) //COLUMN(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; //printf("%d\n",trace[c]); while (i != p) { COLUMN3(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN3(7,b[j]) j += 1; diff += 1; } else { while (j != p) { COLUMN3(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN3(a[i],7) i += 1; diff += 1; } p = alignment->aepos; while (i <= p) { COLUMN3(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } /* { int c; // Output remaining column including unaligned suffix mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) COLUMN(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { COLUMN(a[i],b[j]) i += 1; j += 1; } else { COLUMN(a[i],4) i += 1; } else { COLUMN(4,b[j]) j += 1; } c += 1; } }*/ //printf("%s\n%s\n", aa.c_str(), bb.c_str()); free(abuffer - 1); free(bbuffer - 1); alignment->aseq = NULL; alignment->bseq = NULL; return std::pair(aa,bb); } int LAInterface::printAlignment_exp(FILE *file, LAlignment *align, Work_Data *ework, int indent, int width, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = (int *) align->trace; int tlen = align->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb; int match, diff; char *N2A; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif o = sizeof(char)*3*(width+1); if (o > work->vecmax) if (enlarge_vector(work,o)) EXIT(1); if (upper) N2A = ToU; else N2A = ToL; Abuf = (char *) work->vector; Bbuf = Abuf + (width+1); Dbuf = Bbuf + 
(width+1); aend = align->aepos; bend = align->bepos; Abuf[width] = Bbuf[width] = Dbuf[width] = '\0'; /* buffer/output next column */ a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->abpos; prefb = align->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } sa = i; sb = j; mtag = ':'; dtag = ':'; while (prefa > prefb) { COLUMN(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { COLUMN(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { COLUMN(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) COLUMN(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(7,b[j]) j += 1; diff += 1; } else { while (j != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(a[i],7) i += 1; diff += 1; } p = align->aepos; while (i <= p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) COLUMN(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { COLUMN(a[i],b[j]) i += 1; j += 1; } else { COLUMN(a[i],4) i += 1; } else { COLUMN(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa <= aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb <= bend) fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," 
%.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s %.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); //fprintf(file, "Cool!\n"); fflush(file); return (0); } int LAInterface::generateConsensus(std::vector &alns) { int seq_count = alns.size(); //TBD return 0; } int LAInterface::recoverAlignment(LAlignment *alignment) { if (alignment->recovered) return -1; int j; uint16 *trace; Work_Data *work; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; bool ALIGN = true; bool REFERENCE = false; bool CARTOON = false; bool OVERLAP = false; bool FLIP = false; bool UPPERCASE = false; bool MAP = false; int INDENT = 4; int WIDTH = 100; int BORDER = 10; //int tmax = 3000; //trace = (uint16 *) malloc(sizeof(uint16) * tmax); //if (trace == NULL) // exit(1); int amin, amax, bmin, bmax; work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); Overlap * ovl = (Overlap *) malloc(sizeof(Overlap)); Alignment * aln = (Alignment *) malloc(sizeof (Alignment)); aln->path = &(ovl->path); Path * path = &(ovl->path); path->abpos = alignment->abpos; path->aepos = alignment->aepos; path->bbpos = alignment->bbpos; path->bepos = alignment->bepos; path->diffs = alignment->diffs; path->tlen = alignment->tlen; aln->alen = alignment->alen; aln->blen = alignment->blen; aln->flags = (uint32)alignment->flags; ovl->aread = alignment->read_A_id_; ovl->bread = alignment->read_B_id_; path->trace = (uint16 *)malloc(path->tlen * sizeof(uint16)); memcpy(path->trace, alignment->trace_pts, path->tlen * sizeof(uint16)); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen - ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen - 
ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; } char * aseq = Load_Subread(db1, ovl->aread, amin, amax, abuffer, 0); char * bseq = Load_Subread(db2, ovl->bread, bmin, bmax, bbuffer, 0); aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq, bmax - bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; computeTracePTS(aln, work, tspace); /*{ int tlen = aln->path->tlen; int *trace = (int *) aln->path->trace; int u; printf(" "); for (u = 0; u < tlen; u++) printf("%d,", (int) trace[u]); printf("\n"); }*/ alignment->tlen = aln->path->tlen; alignment->trace = (int *) malloc(sizeof(int) * aln->path->tlen*2); { int tlen = aln->path->tlen; int *trace = (int *) aln->path->trace; int u; //printf(" "); for (u = 0; u < tlen; u++) { //printf("%d,", (int) trace[u]); alignment->trace[u] = (int)trace[u]; } //printf("\n"); } free(bbuffer - 1); free(abuffer - 1); Free_Work_Data(work); //free(aln->path->trace); alignment->recovered = true; return 0; } std::vector * LAInterface::getCoverage(std::vector alns) { std::vector * res = new std::vector( alns[0]->alen, 0 ); for (int i = 0; i < alns.size(); i++) { for (int j = alns[i]->read_A_match_start_; j < alns[i]->read_A_match_end_; j++) (*res)[j] ++; } return res; } std::vector *LAInterface::getCoverage(std::vector alns) { std::vector * res = new std::vector( alns[0]->alen, 0 ); for (int i = 0; i < alns.size(); i++) { for (int j = alns[i]->abpos; j < alns[i]->aepos; j++) (*res)[j] ++; } return res; } std::vector > * LAInterface::lowCoverageRegions(std::vector &cov, int min_cov) { std::vector> * reg = new std::vector> (); int pos = 0; while (pos < cov.size()) { int start = 0; if (cov[pos] < min_cov){ start = pos; while ((cov[pos] < min_cov) and (pos < cov.size())) pos ++; reg->push_back(std::pair(start, pos) ); //low coverage region in [a,b) } 
else pos ++; } return reg; } bool compare_event(std::pair event1,std::pair event2) { return event1.first < event2.first; } void LAInterface::profileCoverage(std::vector &alignments, std::vector > & coverage,int reso, int cutoff) { //Returns coverage, which is a pair of ints std::vector > events; for (int i = 0; i < alignments.size(); i ++) { events.push_back(std::pair(alignments[i]->read_A_match_start_ + cutoff, 1)); events.push_back(std::pair(alignments[i]->read_A_match_end_ - cutoff, -1)); } std::sort(events.begin(), events.end(), compare_event); int pos = 0; int i = 0; int count = 0; while (pos < events.size()) { while ((events[pos].first < i*reso) and (pos < events.size())) { count += events[pos].second; pos++; } coverage.push_back(std::pair(i*reso, count)); i++; } return; } void LAInterface::profileCoveragefine(std::vector &alignments, std::vector > & coverage,int reso, int cutoff, int est_coverage) { std::vector > events; int sz = alignments.size(); if (sz > est_coverage) sz = est_coverage; for (int i = 0; i < sz; i ++) { events.push_back(std::pair(alignments[i]->read_A_match_start_ + cutoff, 1)); events.push_back(std::pair(alignments[i]->read_A_match_end_ - cutoff, -1)); } std::sort(events.begin(), events.end(), compare_event); int pos = 0; int i = 0; int count = 0; while (pos < events.size()) { while ((events[pos].first < i*reso) and (pos < events.size())) { count += events[pos].second; pos++; } coverage.push_back(std::pair(i*reso, count)); i++; } return; } void LAInterface::repeatDetect(std::vector > & coverage, std::vector > & repeat) { for (int i = 1; i < coverage.size(); i++) { if (coverage[i].second > 2*coverage[i-1].second) repeat.push_back(std::pair(coverage[i].first, 1)); if (coverage[i].second < 0.5*coverage[i-1].second) repeat.push_back(std::pair(coverage[i].first, -1)); } return; } static int qv_map[51] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 
'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y' }; int LAInterface::getQV(std::vector > & QV, int from, int to) { int b,e; b = from; e = to; HITS_READ * reads = db1->reads; bool UPPER = true; int64 *qv_idx; uint8 *qv_val; //if (DOIQV) { int status, kind; HITS_TRACK *track; status = Check_Track(db1, (char *)"qual",&kind); if (status == -2) { fprintf(stderr,"%s: .qual-track does not exist for this db.\n",Prog_Name); return (1); } if (status == -1) { fprintf(stderr,"%s: .qual-track not sync'd with db.\n",Prog_Name); return (1); } track = Load_Track(db1, (char *)"qual"); qv_idx = (int64 *) track->anno; qv_val = (uint8 *) track->data; } for (int i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; r = reads + i; len = r->rlen; /*if (DORED) printf("R %d\n",i+1);*/ flags = r->flags; qv = (flags & DB_QV); /*if (DOHDR) { if (DAM) { char header[MAX_NAME]; fseeko(hdrs,r->coff,SEEK_SET); fgets(header,MAX_NAME,hdrs); header[strlen(header)-1] = '\0'; printf("H %ld %s\n",strlen(header),header); printf("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len); } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; printf("H %ld %s\n",strlen(flist[map]),flist[map]); printf("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len); if (qv > 0) printf("Q: %d\n",qv); } }*/ /*if (DOQVS) Load_QVentry(db,i,entry,UPPER);*/ /*if (DOSEQ) Load_Read(db,i,read,UPPER);*/ /*for (m = 0; m < MTOP; m++) { int64 *anno; int *data; int64 s, f, j; anno = (int64 *) MTRACK[m]->anno; data = (int *) MTRACK[m]->data; s = (anno[i] >> 2); f = (anno[i+1] >> 2); printf("T%d %lld ",m,(f-s)/2); if (s < f) { for (j = s; j < f; j += 2) printf(" %d %d",data[j],data[j+1]); } printf("\n"); } if (substr) { fst = iter->beg; lst = iter->end; } else { fst = 0; lst = len; } if (DOSEQ) { printf("S %d ",lst-fst); printf("%.*s\n",lst-fst,read+fst); } */ //if (DOIQV) { int64 k, e; std::vector qv; k = qv_idx[i]; e = qv_idx[i+1]; 
//printf("I %lld ",e-k); while (k < e) { qv.push_back(qv_val[k++]); //putchar(qv_map[qv_val[k++]]); } //printf("\n"); QV.push_back(qv); } /*if (DOQVS) { int k; for (k = 0; k < 5; k++) { printf("%c %d ",qvname[k],lst-fst); printf("%.*s\n",lst-fst,entry[k]+fst); } }*/ } return 0; } int LOverlap::GetMatchingPosition(int pos_A) { /** * GetMatchingPosition: Given a position on read A inside the matched segment, * return the corresponding position on B */ if ((pos_A < this->read_A_match_start_) or (pos_A > this->read_A_match_end_)) { return -1; } int rev_sign = 1 - 2*this->reverse_complement_match_; int current_pos_read_A = this->read_A_match_start_; int next_pos_read_A = current_pos_read_A; int current_pos_read_B = this->read_B_match_start_; if (this->reverse_complement_match_ == 1) { current_pos_read_B = this->read_B_match_end_; } for (int j = 0; j < this->trace_pts_len/2-1; j++) { if (current_pos_read_A % 100 != 0) next_pos_read_A = int(ceil(current_pos_read_A / 100.0)) * 100; else next_pos_read_A = current_pos_read_A + 100; if (next_pos_read_A >= pos_A) { return current_pos_read_B + pos_A - current_pos_read_A; } current_pos_read_B = current_pos_read_B + rev_sign * this->trace_pts[2 * j + 1]; current_pos_read_A = next_pos_read_A; } // if we got here, it means the hinge is in the last trace_pt window of A if (current_pos_read_A < pos_A) { // technically, we shouldn' need to check this return current_pos_read_B + pos_A - current_pos_read_A; } return -2; // this shouldn't happen } void LOverlap::trim_overlap() { /** * Trim overlap: the reads are trimmed according to qualities and coverage, * To be consistent, the overlap needs to be trimmed. * Rather than running DAligner on trimmed reads, this function trims the overlap according to trace points. * It finds the trace point that are not trimmed in both reads. 
*/ //before trimming, the positions are read_A_match_start_, read_B_match_start_, read_A_match_end_ // and read_B_match_end_, we add a eff_ prefix to it after trimming this->eff_read_B_match_start_ = this->read_B_match_start_; this->eff_read_B_match_end_ = this->read_B_match_end_; this->eff_read_A_match_start_ = this->read_A_match_start_; this->eff_read_A_match_end_ = this->read_A_match_end_; std::vector > trace_points; if (this->reverse_complement_match_ == 0) { trace_points.push_back(std::pair(this->read_A_match_start_, this->read_B_match_start_)); } else { trace_points.push_back(std::pair(this->read_A_match_start_, this->read_B_match_end_)); } int rev_sign = 1 - 2*this->reverse_complement_match_; int current_position_read_A = this->read_A_match_start_; // this for loop change trace points stored trace_pts[] into coordinate pairs vector: tps for (int j = 0; j < this->trace_pts_len/2-1; j++) { if (current_position_read_A % 100 != 0) current_position_read_A = int(ceil(current_position_read_A / 100.0)) * 100; else current_position_read_A += 100; trace_points.push_back(std::pair(current_position_read_A, trace_points.back().second + rev_sign * this->trace_pts[2 * j + 1])); } if (this->reverse_complement_match_ == 0) { trace_points.push_back(std::pair(this->read_A_match_end_, this->read_B_match_end_)); } else { trace_points.push_back(std::pair(this->read_A_match_end_, this->read_B_match_start_)); } //printf("[%6d %6d] [%6d %6d]\n", this->eff_read_A_read_start_, this->eff_read_A_read_end_, this->eff_read_B_read_start_, this->eff_read_B_read_end_); //printf("[%6d %6d] [%6d %6d]\n", this->eff_read_A_match_start_, this->eff_read_A_match_end_, this->eff_read_B_match_start_, this->eff_read_B_match_end_); /*for (int j = 0; j < trace_points.size(); j++) { printf("a%d b%d ", trace_points[j].first, trace_points[j].second); } printf("\n"); // for debugging */ this->eff_start_trace_point_index_ = trace_points.size(); this->eff_end_trace_point_index_ = 0; if 
(this->reverse_complement_match_ == 0) { //for trace point pairs, get the first one that is in untrimmed regions for both reads for (int i = 0; i < trace_points.size(); i++) { if ( (trace_points[i].first >= this->eff_read_A_read_start_) and (trace_points[i].second >= this->eff_read_B_read_start_) ) { this->eff_read_A_match_start_ = trace_points[i].first; this->eff_read_B_match_start_ = trace_points[i].second; this->eff_start_trace_point_index_ = i; break; } } //for trace point pairs, get the last one that is in untrimmed regions for both reads for (int i = (int) trace_points.size() - 1; i >= 0; i--) { if ((trace_points[i].first <= this->eff_read_A_read_end_) and (trace_points[i].second <= this->eff_read_B_read_end_)) { this->eff_read_A_match_end_ = trace_points[i].first; this->eff_read_B_match_end_ = trace_points[i].second; this->eff_end_trace_point_index_ = i; break; } } } else { for (int i = 0; i < trace_points.size(); i++) { if ( (trace_points[i].first >= this->eff_read_A_read_start_) and (trace_points[i].second <= this->eff_read_B_read_end_) ) { this->eff_read_A_match_start_ = trace_points[i].first; this->eff_read_B_match_end_ = trace_points[i].second; this->eff_start_trace_point_index_ = i; // "start" with respect to A break; } } for (int i = (int) trace_points.size() - 1; i >= 0; i--) { if ((trace_points[i].first <= this->eff_read_A_read_end_) and (trace_points[i].second >= this->eff_read_B_read_start_)) { this->eff_read_A_match_end_ = trace_points[i].first; this->eff_read_B_match_start_ = trace_points[i].second; this->eff_end_trace_point_index_ = i; break; } } } if (this->eff_start_trace_point_index_ >= this->eff_end_trace_point_index_) { this->active = false; } /*printf("[%6d %6d] [%6d %6d]\n", this->eff_read_A_match_start_, this->eff_read_A_match_end_, this->eff_read_B_match_start_, this->eff_read_B_match_end_); int overhang_read_A_left = this->eff_read_A_match_start_ - this->eff_read_A_read_start_; int overhang_read_A_right = this->eff_read_A_read_end_ - 
this->eff_read_A_match_end_; int overhang_read_B_left = this->eff_read_B_match_start_ - this->eff_read_B_read_start_; int overhang_read_B_right = this->eff_read_B_read_end_ - this->eff_read_B_match_end_; printf("trim A_left %6d, A_right %6d, B_left %6d, B_right %6d\n", overhang_read_A_left, overhang_read_A_right, overhang_read_B_left, overhang_read_B_right); */ } void LOverlap::TrimOverlapNaive(){ this->eff_read_B_match_start_ = std::max (this->read_B_match_start_,this->eff_read_B_read_start_); this->eff_read_B_match_end_ = std::min (this->read_B_match_end_,this->eff_read_B_read_end_); this->eff_read_A_match_start_ = std::max (this->read_A_match_start_,this->eff_read_A_read_start_); this->eff_read_A_match_end_ = std::min (this->read_A_match_end_,this->eff_read_A_read_end_);; } // This function is no longer used in hinging_v1.cpp void LOverlap::addtype(int max_overhang) { /** * addtype is a function for classifying overlaps, edges are classified into forward, backward, internal match, bcovera and acoverb, it is based on effective positions, rather than positions */ int overhang = std::min(this->eff_read_A_match_start_ - this->eff_read_A_read_start_, this->eff_read_B_match_start_ - this->eff_read_B_read_start_) + std::min(this->eff_read_A_read_end_ - this->eff_read_A_match_end_, this->eff_read_B_read_end_ - this->eff_read_B_match_end_); //int tol = 0; if (overhang > max_overhang) this->match_type_ = INTERNAL; else if ((this->eff_read_A_match_start_ - this->eff_read_A_read_start_ <= this->eff_read_B_match_start_ - this->eff_read_B_read_start_) and (this->eff_read_A_read_end_ - this->eff_read_A_match_end_ <= this->eff_read_B_read_end_ - this->eff_read_B_match_end_)) this->match_type_ = BCOVERA; else if ((this->eff_read_A_match_start_ - this->eff_read_A_read_start_ >= this->eff_read_B_match_start_ - this->eff_read_B_read_start_) and (this->eff_read_A_read_end_ - this->eff_read_A_match_end_ >= this->eff_read_B_read_end_ - this->eff_read_B_match_end_)) this->match_type_ = 
ACOVERB; else if (this->eff_read_A_match_start_ - this->eff_read_A_read_start_ > this->eff_read_B_match_start_ - this->eff_read_B_read_start_) { if ((this->eff_read_B_read_end_ - this->eff_read_B_match_end_ > 0) and (this->eff_read_A_match_start_ - this->eff_read_A_read_start_ > 0)) this->match_type_ = FORWARD; } else { if ((this->eff_read_B_match_start_ - this->eff_read_B_read_start_ > 0) and (this->eff_read_A_read_end_ - this->eff_read_A_match_end_ > 0)) this->match_type_ = BACKWARD; } } void LOverlap::AddTypesAsymmetric(int max_overhang, int min_overhang) { //Getting a parameter max_overhang, which is the maximum overlap that one can attribute to bad DAligner ends //The function sets the class variable match_type_ according to the relative positions of the reads. //Possible things it can set to are: // BCOVERA, ACOVERB, INTERNAL, FORWARD, FORWARD_INTERNAL, BACKWARD, BACKWARD_INTERNAL int overhang_read_A_left = this->eff_read_A_match_start_ - this->eff_read_A_read_start_; int overhang_read_A_right = this->eff_read_A_read_end_ - this->eff_read_A_match_end_; int overhang_read_B_left = this->eff_read_B_match_start_ - this->eff_read_B_read_start_; int overhang_read_B_right = this->eff_read_B_read_end_ - this->eff_read_B_match_end_; //printf(" A_left %6d, A_right %6d, B_left %6d, B_right %6d\n", // overhang_read_A_left, overhang_read_A_right, // overhang_read_B_left, overhang_read_B_right); if (this->reverse_complement_match_ == 1) { //Exchange overhang left and right of read B if match is reverse complement overhang_read_B_left = this->eff_read_B_read_end_ - this->eff_read_B_match_end_; overhang_read_B_right = this->eff_read_B_match_start_ - this->eff_read_B_read_start_; } if ((std::max(overhang_read_A_left, overhang_read_A_right) < max_overhang) and (std::min(overhang_read_B_left, overhang_read_B_right) > min_overhang )) // and ((overhang_read_A_left <= overhang_read_B_left) // and (overhang_read_A_right <= overhang_read_B_right))) this->match_type_ = BCOVERA; else 
if ((std::max(overhang_read_B_left, overhang_read_B_right) < max_overhang) and (std::min(overhang_read_A_left, overhang_read_A_right) > min_overhang )) //and (overhang_read_A_left >= overhang_read_B_left) // and (overhang_read_A_right >= overhang_read_B_right)) // this->match_type_ = ACOVERB; else if ((std::min(overhang_read_A_left, overhang_read_A_right) > max_overhang)) this->match_type_ = INTERNAL; else if (overhang_read_A_left <= max_overhang) { //Check if read B if a left extension. As we've handled internal, //we know that this is a BACKWARD or BACKWARD_INTERNAL match if ((overhang_read_B_right <= max_overhang) and (overhang_read_B_left >= max_overhang)) { //Alignment internal in B. (It may be an overlap or a non extending overlap) this->match_type_ = BACKWARD; } else if ((overhang_read_B_right >= max_overhang) and (overhang_read_B_left >= max_overhang)) { //Alignment is a overlap on B. this->match_type_ = BACKWARD_INTERNAL; } } else if (overhang_read_A_right <= max_overhang) { //Check if read B if a right extension. As we've handled internal, //we know that this is a FORWARD or FORWARD_INTERNAL match if ((overhang_read_B_left <= max_overhang) and (overhang_read_B_right >= max_overhang)) { //Alignment internal in B. (It may be an overlap or a non extending overlap) this->match_type_ = FORWARD; } else if ((overhang_read_B_left >= max_overhang) and (overhang_read_B_right >= max_overhang)) { //Alignment is a overlap on B. 
this->match_type_ = FORWARD_INTERNAL; } else{ this->match_type_ = UNDEFINED; } } /*std::ofstream ofs ("overlapt.txt", std::ofstream::app); ofs << "===============================================\n" << "Read A id "<< std::setfill('0') << std::setw(5) <read_A_id_ << "\nRead B id " << std::setfill('0') << std::setw(5) << this->read_B_id_ << "\nRead A eff start "<< std::setfill('0') << std::setw(5) << this->eff_read_A_read_start_ << " Read A eff end "<< std::setfill('0') << std::setw(5) << this->eff_read_A_read_end_ << " Read A length " << std::setfill('0') << std::setw(5) << this->alen << " Read A match start "<< std::setfill('0') << std::setw(5) << this->read_A_match_start_ << " Read A eff match start " << std::setfill('0') << std::setw(5) << this->eff_read_A_match_start_ << " Read A match end " << std::setfill('0') << std::setw(5) << this->read_A_match_end_ << " Read A eff match end " << std::setfill('0') << std::setw(5) << this->eff_read_A_match_end_ << "\nRead B eff start " << std::setfill('0') << std::setw(5) << this->eff_read_B_read_start_ << " Read B eff end " << std::setfill('0') << std::setw(5) << this->eff_read_B_read_end_ << " Read B length " << std::setfill('0') << std::setw(5) << this->blen << " Read B match start "<< std::setfill('0') << std::setw(5) << this->read_B_match_start_ << " Read B eff match start " << std::setfill('0') << std::setw(5) << this->eff_read_B_match_start_ << " Read B match end " << std::setfill('0') << std::setw(5) << this->read_B_match_end_ << " Read B eff match end " << std::setfill('0') << std::setw(5) << this->eff_read_B_match_end_ << "\nReverse complement " << std::setfill('0') << std::setw(5) << this->reverse_complement_match_ << "\nMatch type "<match_type_ << "\n" << std::endl; ofs.close();*/ } int get_id_from_string(const char * name_str) { const char * sub0 = strchr(name_str, '/'); const char * sub1 = sub0 + 1; const char * sub2 = strchr(sub1, '/'); char substr[15]; strncpy(substr, sub1, strlen(sub1) - strlen(sub2)); 
substr[strlen(sub1) - strlen(sub2)] = 0; return atoi(substr); } int LAInterface::loadPAF(std::string filename, std::vector & alns) { paf_file_t *fp; paf_rec_t r; fp = paf_open(filename.c_str()); int num = 0; while (paf_read(fp, &r) >= 0) { num ++; LOverlap *new_ovl = new LOverlap(); new_ovl->read_A_match_start_ = r.qs; new_ovl->read_B_match_start_ = r.ts; new_ovl->read_A_match_end_ = r.qe; new_ovl->read_B_match_end_ = r.te; new_ovl->alen = r.ql; new_ovl->blen = r.tl; new_ovl->reverse_complement_match_ = r.rev; new_ovl->diffs = 0; new_ovl->read_A_id_ = get_id_from_string(r.qn) - 1; new_ovl->read_B_id_ = get_id_from_string(r.tn) - 1; //change 1 based to 0 based alns.push_back(new_ovl); } return num; } KSEQ_INIT(gzFile, gzread) int LAInterface::loadFASTA(std::string filename, std::vector & reads) { gzFile fp; kseq_t *seq; int l; int num = 0; fp = gzopen(filename.c_str(), "r"); // STEP 2: open the file handler seq = kseq_init(fp); // STEP 3: initialize seq while ((l = kseq_read(seq)) >= 0) { // STEP 4: read sequence //printf("name: %s\n", seq->name.s); //if (seq->comment.l) printf("comment: %s\n", seq->comment.s); //printf("seq: %s\n", seq->seq.s); //if (seq->qual.l) printf("qual: %s\n", seq->qual.s); Read *new_r = new Read(num, strlen(seq->seq.s), std::string(seq->name.s), std::string(seq->seq.s)); reads.push_back(new_r); num++; } //printf("return value: %d\n", l); kseq_destroy(seq); // STEP 5: destroy seq gzclose(fp); // STEP 6: close the file handler return num; } bool pairAscend(const std::pair& firstElem, const std::pair& secondElem) { return firstElem.first < secondElem.first; } bool pairDescend(const std::pair& firstElem, const std::pair& secondElem) { return firstElem.first > secondElem.first; } bool compare_overlap(LOverlap * ovl1, LOverlap * ovl2) { //Returns True if the sum of the match lengths of the two reads in ovl1 > the sum of the overlap lengths of the two reads in ovl2 //Returns False otherwise. 
return ((ovl1->read_A_match_end_ - ovl1->read_A_match_start_ + ovl1->read_B_match_end_ - ovl1->read_B_match_start_) > (ovl2->read_A_match_end_ - ovl2->read_A_match_start_ + ovl2->read_B_match_end_ - ovl2->read_B_match_start_)); } bool compare_sum_overlaps(const std::vector * ovl1, const std::vector * ovl2) { //Returns True if the sum of matches over both reads for overlaps in ovl1 > sum of matches over both reads for overlaps in ovl2 //Returns False otherwise int sum1 = 0; int sum2 = 0; for (int i = 0; i < ovl1->size(); i++) sum1 += (*ovl1)[i]->read_A_match_end_ - (*ovl1)[i]->read_A_match_start_ + (*ovl1)[i]->read_B_match_end_ - (*ovl1)[i]->read_B_match_start_; for (int i = 0; i < ovl2->size(); i++) sum2 += (*ovl2)[i]->read_A_match_end_ - (*ovl2)[i]->read_A_match_start_ + (*ovl2)[i]->read_B_match_end_ - (*ovl2)[i]->read_B_match_start_; return sum1 > sum2; } bool compare_pos(LOverlap * ovl1, LOverlap * ovl2) { //True if ovl1 starts earlier than ovl2 on read a. return (ovl1->read_A_match_start_) > (ovl2->read_A_match_start_); } bool compare_overlap_abpos(LOverlap * ovl1, LOverlap * ovl2) { //True if ovl2 starts earlier than ovl1 on read a. //flips the two argumenst in compare_pos return ovl1->read_A_match_start_ < ovl2->read_A_match_start_; } bool compare_overlap_aepos(LOverlap * ovl1, LOverlap * ovl2) { //Same as compare_pos? return ovl1->read_A_match_start_ > ovl2->read_A_match_start_; } bool compare_overlap_weight(LOverlap * ovl1, LOverlap * ovl2) { return (ovl1->weight > ovl2->weight); } bool compare_overlap_aln(LAlignment * ovl1, LAlignment * ovl2) { return ((ovl1->aepos - ovl1->abpos + ovl1->bepos - ovl1->bbpos) > (ovl2->aepos - ovl2->abpos + ovl2->bepos - ovl2->bbpos)); } HINGE-0.5.0/src/lib/QV.c000077500000000000000000001132511314415550300144040ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. 
* * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. 
The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ? // Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. 
static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. 
static HScheme *Huffman(uint64 *hist, HScheme *inscheme)
{ HScheme *scheme;
  HTree   *heap[259];
  HTree    node[512];
  int      hsize;
  HTree   *lft, *rgt;
  int      value, range;
  int      i;

  //  hist     - 256 symbol counts; only entries > 0 take part in the tree
  //  inscheme - NULL for a plain scheme, otherwise a previous scheme whose over-long
  //               codes (and symbol 255) are pooled into the single leaf node[0]
  //  Returns a newly allocated scheme, or NULL if the allocation failed.  The lookup
  //    table of the scheme is not filled in by this routine.

  scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record");
  if (scheme == NULL)
    return (NULL);

  hsize = 0;                        //  Load heap
  value = 0;
  if (inscheme != NULL)
    { node[0].count = 0;            //  Leaf 0 is the pooled "escape" class, labelled 255
      node[0].lft   = (HTree *) (uint64) 255;
      node[0].rgt   = NULL;
      heap[++hsize] = node+(value++);
    }
  for (i = 0; i < 256; i++)
    if (hist[i] > 0)
      { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255))
          node[0].count += hist[i];
        else
          { node[value].count = hist[i];
            node[value].lft   = (HTree *) (uint64) i;    //  Leaves keep the symbol in lft
            node[value].rgt   = NULL;
            heap[++hsize] = node+(value++);
          }
      }

  for (i = hsize/2; i >= 1; i--)    //  Establish heap property
    Reheap(i,heap,hsize);

  range = value;                    //  Merge pairs with smallest count until have a tree
  for (i = 1; i < value; i++)
    { lft = heap[1];
      heap[1] = heap[hsize--];
      Reheap(1,heap,hsize);
      rgt = heap[1];
      node[range].lft = lft;
      node[range].rgt = rgt;
      node[range].count = lft->count + rgt->count;
      heap[1] = node+(range++);
      Reheap(1,heap,hsize);
    }

  for (i = 0; i < 256; i++)         //  Build the code table
    { scheme->codebits[i] = 0;
      scheme->codelens[i] = 0;
    }

  //  With an all-zero histogram and no inscheme no node was ever created, so range
  //    is 0 and node+(range-1) would point in front of the array.  Leave the table
  //    all zero in that case instead of walking an invalid node.

  if (range > 0)
    Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens);

  if (inscheme != NULL)             //  Set scheme type and if truncated (2), map truncated codes
    { scheme->type = 2;             //    to code and length for 255
      for (i = 0; i < 255; i++)
        if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF)
          { scheme->codelens[i] = scheme->codelens[255];
            scheme->codebits[i] = scheme->codebits[255];
          }
    }
  else
    { scheme->type = 0;             //  Type becomes 1 as soon as any code exceeds HUFF_CUTOFF bits
      for (i = 0; i < 256; i++)
        { if (scheme->codelens[i] > HUFF_CUTOFF)
            scheme->type = 1;
        }
    }

  return (scheme);
}

#ifdef DEBUG

  //  For debug, show the coding table
  //    Prints one line per symbol with a non-zero code length: symbol, length, and the
  //    code bits most significant first.  For a type 2 scheme, symbols sharing the code
  //    of 255 are flagged with "***" and charged infosize extra bits each.  If hist is
  //    not NULL the total encoded size in bytes is printed at the end.

static void Print_Table(HScheme *scheme, uint64 *hist, int infosize)
{ uint64 total_bits;
  uint32 specval, mask, code, *bits;
  int    speclen, clen, *lens;
  int    i, k;

  total_bits = 0;
  bits = scheme->codebits;
  lens = scheme->codelens;
  if (scheme->type == 2)
    { specval = bits[255];
      speclen = lens[255];
    }
  else
    specval = speclen = 0x7fffffff;    //  Sentinel that no (code,length) pair will match

  printf("\nCode Table:\n");
  for (i = 0; i < 256; i++)
    if (lens[i] > 0)
      { clen = lens[i];
        mask = (1 << clen);
        code = bits[i];
        printf(" %3d: %2d ",i,clen);
        for (k = 0; k < clen; k++)
          { mask >>= 1;
            if (code & mask)
              printf("1");
            else
              printf("0");
          }
        if (code == specval && clen == speclen)
          { printf(" ***");
            if (hist != NULL)
              total_bits += (clen+infosize)*hist[i];
          }
        else if (hist != NULL)
          total_bits += clen*hist[i];
        printf("\n");
      }

  //  total_bits is unsigned 64-bit: print it with %llu, and round up with +7 so that
  //    a zero total prints 0 instead of wrapping around on total_bits-1.

  if (hist != NULL)
    printf("\nTotal Bytes = %llu\n",(total_bits+7)/8);
}

  //  For debug, show the histogram
  //    Prints count and percentage for every symbol between the lowest and highest
  //    symbol with a non-zero count, highest first.  Nothing is printed when every
  //    count is zero (the range low..hgh is then empty).

static void Print_Histogram(uint64 *hist)
{ int    i, low, hgh;
  uint64 count;

  for (hgh = 255; hgh >= 0; hgh--)
    if (hist[hgh] != 0)
      break;
  for (low = 0; low < 256; low++)
    if (hist[low] != 0)
      break;
  count = 0;
  for (i = low; i <= hgh; i++)
    count += hist[i];

  for (i = hgh; i >= low; i--)
    printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count);
}

#endif


/*******************************************************************************************
 *
 *  Read and Write Huffman Schemes
 *
 ********************************************************************************************/

  //  Write the code table to out.
  //    Layout: one byte for the scheme type, then for each of the 256 symbols one byte
  //    holding the code length, followed by the uint32 code bits when that length is
  //    non-zero.  The code bits are written in the byte order of the running machine.
  //    NOTE(review): the results of fwrite are not checked, so a short write goes
  //    unreported; the void return type leaves no way to signal it to the caller.

static void Write_Scheme(HScheme *scheme, FILE *out)
{ int     i;
  uint8   x;
  uint32 *bits;
  int    *lens;

  lens = scheme->codelens;
  bits = scheme->codebits;

  x = (uint8) (scheme->type);
  fwrite(&x,1,1,out);

  for (i = 0; i < 256; i++)
    { x = (uint8) (lens[i]);
      fwrite(&x,1,1,out);
      if (x > 0)
        fwrite(bits+i,sizeof(uint32),1,out);
    }
}

  //  Allocate and read a code table from in, and return a pointer to it.

static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; 
k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) { uint32 x, c, ocode; int n, h, k, olen, llen; int *nlens, *rlens; uint32 *nbits, *rbits; uint32 nspec, rspec; int nslen, rslen; nlens = neme->codelens; nbits = neme->codebits; rlens = reme->codelens; rbits = reme->codebits; if (neme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; rspec = rbits[255]; rslen = rlens[255]; llen = 0; olen = 0; ocode = 0; k = 0; while (k < rlen) { h = k; while (k < rlen && read[k] == rchar) k += 1; if (k-h >= 255) x = 255; else x = k-h; n = rlens[x]; c = rbits[x]; OCODE(n,c); if (c == rspec && n == rslen) OCODE(16,k-h); if (k < rlen) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); k += 1; } } if (olen > 0) { fwrite(&ocode,sizeof(uint32),1,out); if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Read and decode from in, the next rlen symbols into read according to scheme static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) 
+ 2; } if (scheme->type == 2) signal = 255; else signal = 256; lens = scheme->codelens; look = scheme->lookup; #define GET \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } #define GETFLIP \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ Flip_Long(ipart); \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = look[*xpart]; n = lens[c]; if (c == signal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } else for (j = 0; j < rlen; j++) { GET c = look[*xpart]; n = lens[c]; if (c == signal) { GET c = *cpart; n = 8; } read[j] = (char) c; } return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c, k; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (neme->type == 2) nsignal = 255; else nsignal = 256; nlens = neme->codelens; nlook = neme->lookup; rlens = reme->codelens; rlook = reme->lookup; n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GETFLIP c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GETFLIP c = nlook[*xpart]; n = 
nlens[c]; if (c == nsignal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } } else for (j = 0; j < rlen; j++) { GET c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GET c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GET c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GET c = *cpart; n = 8; } read[j] = (char) c; } } return (0); } /******************************************************************************************* * * Histogrammers * ********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. 
int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. 
rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read .quiva file from input, recording stats in the histograms. If zero is set then // start the stats anew with this file. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding int QVcoding_Scan(FILE *input) { char *slash; int rlen; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); { int i; for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; } totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams Nline = 0; while (1) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline); EXIT(1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv 
file\n",Nline); EXIT(1); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); } return (0); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. 
if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = 
subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) 
{ EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) 
free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. coding) next entry from input and write to output * ********************************************************************************************/ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (0); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) 
{ if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } HINGE-0.5.0/src/lib/align.c000077500000000000000000004060271314415550300151560ustar00rootroot00000000000000/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. 
* * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Fast alignment discovery and trace generation along with utilites for displaying alignments * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper. * A recent cool idea is to not record all the details of an alignment while discovering it * but simply record trace points through which the optimal alignment passes every 100bp, * allowing rapid recomputation of the alignment details between trace points. 
* * Author : Gene Myers * First : June 2013 * Current: June 1, 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #undef DEBUG_PASSES // Show forward / backward extension termini for Local_Alignment #undef DEBUG_POINTS // Show trace points #undef DEBUG_WAVE // Show waves of Local_Alignment #undef SHOW_MATCH_WAVE // For waves of Local_Alignment also show # of matches #undef SHOW_TRAIL // Show trace at the end of forward and reverse passes #undef SHOW_TPS // Show trace points as they are encountered in a wave #undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap #undef DEBUG_ALIGN // Show division points of Compute_Trace #undef DEBUG_SCRIPT // Show trace additions for Compute_Trace #undef DEBUG_AWAVE // Show F/R waves of Compute_Trace #undef SHOW_TRACE // Show full trace for Print_Alignment #undef WAVE_STATS /****************************************************************************************\ * * * Working Storage Abstraction * * * \****************************************************************************************/ typedef struct // Hidden from the user, working space for each thread { int vecmax; void *vector; int celmax; void *cells; int pntmax; void *points; int tramax; void *trace; } _Work_Data; Work_Data *New_Work_Data() { _Work_Data *work; work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block"); if (work == NULL) EXIT(NULL); work->vecmax = 0; work->vector = NULL; work->pntmax = 0; work->points = NULL; work->tramax = 0; work->trace = NULL; work->celmax = 0; work->cells = NULL; return ((Work_Data *) work); } static int enlarge_vector(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->vector,max,"Enlarging DP vector"); if (vec == NULL) EXIT(1); work->vecmax = max; work->vector = vec; return (0); } static int 
enlarge_points(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->points,max,"Enlarging point vector"); if (vec == NULL) EXIT(1); work->pntmax = max; work->points = vec; return (0); } static int enlarge_trace(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->trace,max,"Enlarging trace vector"); if (vec == NULL) EXIT(1); work->tramax = max; work->trace = vec; return (0); } void Free_Work_Data(Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; if (work->vector != NULL) free(work->vector); if (work->cells != NULL) free(work->cells); if (work->trace != NULL) free(work->trace); if (work->points != NULL) free(work->points); free(work); } /****************************************************************************************\ * * * ADAPTIVE PATH FINDING * * * \****************************************************************************************/ // Absolute/Fixed Parameters #define BVEC uint64 // Can be uint32 if PATH_LEN <= 32 #define TRIM_LEN 15 // Report as the tip, the last wave maximum for which the last // 2*TRIM_LEN edits are prefix-positive at rate ave_corr*f(bias) // (max value is 20) #define PATH_LEN 60 // Follow the last PATH_LEN columns/edges (max value is 63) // Derivative fixed parameters #define PATH_TOP 0x1000000000000000ll // Must be 1 << PATH_LEN #define PATH_INT 0x0fffffffffffffffll // Must be PATH_TOP-1 #define TRIM_MASK 0x7fff // Must be (1 << TRIM_LEN) - 1 #define TRIM_MLAG 200 // How far can last trim point be behind best point #define WAVE_LAG 30 // How far can worst point be behind the best point static double Bias_Factor[10] = { .690, .690, .690, .690, .780, .850, .900, .933, .966, 1.000 }; // Adjustable paramters typedef struct { double ave_corr; int trace_space; float freq[4]; int ave_path; int16 *score; int16 *table; } _Align_Spec; /* Fill in bit table: TABLE[x] = 1 iff the alignment modeled by x (1 = match, 0 = 
mismatch) has a non-negative score for every suffix of the alignment under the scoring scheme where match = MATCH and mismatch = -1. MATCH is set so that an alignment with TRIM_PCT matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ). */ #define FRACTION 1000 // Implicit fractional part of scores, i.e. score = x/FRACTION typedef struct { int mscore; int dscore; int16 *table; int16 *score; } Table_Bits; static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms) { if (bit >= TRIM_LEN) { parms->table[prefix] = (int16) (score-max); parms->score[prefix] = (int16) score; } else { if (score > max) max = score; set_table(bit+1,(prefix<<1),score - parms->dscore,max,parms); set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,max,parms); } } /* Create an alignment specification record including path tip tables & values */ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) { _Align_Spec *spec; Table_Bits parms; double match; int bias; spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification"); if (spec == NULL) EXIT(NULL); spec->ave_corr = ave_corr; spec->trace_space = trace_space; spec->freq[0] = freq[0]; spec->freq[1] = freq[1]; spec->freq[2] = freq[2]; spec->freq[3] = freq[3]; match = freq[0] + freq[3]; if (match > .5) match = 1.-match; bias = (int) ((match+.025)*20.-1.); if (match < .2) { EPRINTF(EPLACE,"Base bias worse than 80/20%% ! (New_Align_Spec)\n"); free(spec); EXIT(NULL); } spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr))); parms.mscore = (int) (FRACTION * Bias_Factor[bias] * (1. 
- ave_corr)); parms.dscore = FRACTION - parms.mscore; parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table"); if (parms.score == NULL) { free(spec); EXIT(NULL); } parms.table = parms.score + (TRIM_MASK+1); set_table(0,0,0,0,&parms); spec->table = parms.table; spec->score = parms.score; return ((Align_Spec *) spec); } void Free_Align_Spec(Align_Spec *espec) { _Align_Spec *spec = (_Align_Spec *) espec; free(spec->score); free(spec); } double Average_Correlation(Align_Spec *espec) { return (((_Align_Spec *) espec)->ave_corr); } int Trace_Spacing(Align_Spec *espec) { return (((_Align_Spec *) espec)->trace_space); } float *Base_Frequencies(Align_Spec *espec) { return (((_Align_Spec *) espec)->freq); } /****************************************************************************************\ * * * LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment * * * \****************************************************************************************/ #ifdef WAVE_STATS static int64 MAX, TOT, NWV; static int64 RESTARTS; void Init_Stats() { MAX = TOT = NWV = 0; RESTARTS = 0; } void Print_Stats() { printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,(1.*TOT)/NWV,NWV); printf("\nRestarts = %lld\n",RESTARTS); } #endif #ifdef DEBUG_WAVE static void print_wave(int *V, int *M, int low, int hgh, int besta) { int k, bestk; (void) M; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) { if (besta == V[k]) bestk = k; // printf(" %3d",(V[k]+k)/2); printf(" %3d",besta-V[k]); } printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2); #ifdef SHOW_MATCH_WAVE printf(" "); for (k = low; k <= hgh; k++) printf(" %3d",M[k]); printf("\n"); #endif fflush(stdout); } #endif /* At each furthest reaching point, keep a-coordinate of point (V), bitvector recording the last TRIM_LEN columns of the implied alignment (T), and the # of matches (1-bits) in the bitvector (M). 
*/ typedef struct { int ptr; int diag; int diff; int mark; } Pebble; static int VectorEl = 6*sizeof(int) + sizeof(BVEC); static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, int *mind, int maxd, int mida, int minp, int maxp) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *HB; int *_HA, *_HB; int *NA, *NB; int *_NA, *_NB; Pebble *cells; int avail, cmax, boff; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha, trimhb; int morea, morey, mored; int moreha, morehb; int more, morem, lasta; int aclip, bclip; hgh = maxd; low = *mind; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEl; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; if (COMP(align->flags)) boff = align->blen % TRACE_SPACE; else boff = 0; } /* Compute 0-wave starting from mid-line */ more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; trimhb = morehb = 1; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, hb; int na, nb; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; #ifdef 
SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } while (y >= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = 
M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) memmove(_NA+wing, ((void *) 
(NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = -1; } if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = am = -1; } else am = V[hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = hgh; k >= low; k--) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; hb = ub; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; hb = ub; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } while (y >= NB[k]) { if (cells[hb].mark < NB[k]) { if (avail >= 
cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida-k)/2; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); 
fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; atrace[atlen++] = (uint16) (d-e); atrace[atlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[atlen++] = (uint16) (trimd-e); atrace[atlen++] = (uint16) (trimy-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida+k)/2; e = 0; low = k; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; btrace[btlen++] = (uint16) (d-e); btrace[btlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b-k != trimy) { btrace[btlen++] = (uint16) (trimd-e); btrace[btlen++] = (uint16) (trimx-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b)); btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); #endif } apath->aepos = trimx; apath->bepos = trimy; apath->diffs = trimd; apath->tlen = atlen; if (COMP(align->flags)) { bpath->abpos = align->blen - apath->bepos; bpath->bbpos = align->alen - apath->aepos; } else { bpath->aepos = apath->bepos; bpath->bepos = 
apath->aepos; } bpath->diffs = trimd; bpath->tlen = btlen; } *mind = low; return (0); } /*** Reverse Wave ***/ static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, int mind, int maxd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *HB; int *_HA, *_HB; int *NA, *NB; int *_NA, *_NB; Pebble *cells; int avail, cmax, boff; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha, trimhb; int morea, morey, mored; int moreha, morehb; int more, morem, lasta; int aclip, bclip; hgh = maxd; low = mind; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEl; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; if (COMP(align->flags)) boff = align->blen % TRACE_SPACE; else boff = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; trimhb = morehb = 1; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, hb; int na, nb; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: 
-1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; nb = ((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y; hb = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } while (y <= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a += 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - 
bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (had > 
0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = ap = INT32_MAX; } else ap = V[low]; if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = INT32_MAX; } dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = low; k <= hgh; k++) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } while (y <= NB[k]) { if (cells[hb].mark > NB[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), 
"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != 0) { h = cells[h].ptr; if (h < 0) { a = trimy; d = trimd; } 
else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif if (apath->tlen == 0) { atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); } else { atrace[1] = (uint16) (atrace[1] + (b-a)); atrace[0] = (uint16) (atrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark + k; e = 0; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout); #endif if ((b-k)%TRACE_SPACE != boff) { h = cells[h].ptr; if (h < 0) { a = trimx; d = trimd; } else { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif if (bpath->tlen == 0) { btrace[--btlen] = (uint16) (b-a); btrace[--btlen] = (uint16) (b-a); } else { btrace[1] = (uint16) (btrace[1] + (b-a)); btrace[0] = (uint16) (btrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; btrace[--btlen] = (uint16) (b-a); d = cells[h].diff; btrace[--btlen] = (uint16) 
(d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b-k != trimy) { btrace[--btlen] = (uint16) (b-trimx); btrace[--btlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx)); btrace[btlen] = (uint16) (btrace[btlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = apath->diffs + trimd; apath->tlen = apath->tlen - atlen; apath->trace = atrace + atlen; if (COMP(align->flags)) { bpath->aepos = align->blen - apath->bbpos; bpath->bepos = align->alen - apath->abpos; } else { bpath->abpos = apath->bbpos; bpath->bbpos = apath->abpos; } bpath->diffs = bpath->diffs + trimd; bpath->tlen = bpath->tlen - btlen; bpath->trace = btrace + btlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. 
*/ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, int low, int hgh, int anti, int lbord, int hbord) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath, *bpath; int minp, maxp; int selfie; { int alen, blen; int maxtp, wsize; alen = align->alen; blen = align->blen; if (hgh-low >= 7500) wsize = VectorEl*(hgh-low+1); else wsize = VectorEl*10000; if (wsize >= work->vecmax) if (enlarge_vector(work,wsize)) EXIT(NULL); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); else maxtp = 2*(alen/spec->trace_space+2); wsize = 4*maxtp*sizeof(uint16) + sizeof(Path); if (wsize > work->pntmax) if (enlarge_points(work,wsize)) EXIT(NULL); apath = align->path; bpath = (Path *) work->points; apath->trace = ((uint16 *) (bpath+1)) + maxtp; bpath->trace = ((uint16 *) apath->trace) + 2*maxtp; } #ifdef DEBUG_PASSES printf("\n"); #endif selfie = (align->aseq == align->bseq); if (lbord < 0) { if (selfie && low >= 0) minp = 1; else minp = -INT32_MAX; } else minp = low-lbord; if (hbord < 0) { if (selfie && hgh <= 0) maxp = -1; else maxp = INT32_MAX; } else maxp = hgh+hbord; if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp)) EXIT(NULL); #ifdef DEBUG_PASSES printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n", (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low, apath->read_A_match_end_,apath->read_B_match_end_,apath->diffs); #endif if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp)) EXIT(NULL); #ifdef DEBUG_PASSES printf("R1 (%d,%d) => (%d,%d) %d\n", (anti+low)/2,(anti-low)/2,apath->read_A_match_start_,apath->read_B_match_start_,apath->diffs); #endif if (COMP(align->flags)) { uint16 *trace = (uint16 *) bpath->trace; uint16 p; int i, j; i = bpath->tlen-2; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; p = trace[i+1]; trace[i+1] = trace[j+1]; trace[j+1] = p; i -= 2; j += 2; } } #ifdef DEBUG_POINTS { uint16 *trace = (uint16 *) apath->trace; int a, h; printf("\nA-path 
(%d,%d)->(%d,%d)",apath->read_A_match_start_,apath->read_B_match_start_,apath->read_A_match_end_,apath->read_B_match_end_); printf(" %c\n",(COMP(_align->reverse_complement_match_) ? 'c' : 'n')); a = apath->read_B_match_start_; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } { uint16 *trace = (uint16 *) bpath->trace; int a, h; printf("\nB-path (%d,%d)->(%d,%d)",bpath->read_A_match_start_,bpath->read_B_match_start_,bpath->read_A_match_end_,bpath->read_B_match_end_); printf(" %c [%d,%d]\n",(COMP(align->reverse_complement_match_) ? 'c' : 'n'),align->blen,align->alen); a = bpath->read_B_match_start_; for (h = 1; h < bpath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (bpath); } /****************************************************************************************\ * * * EXTENSION VERSION OF LOCAL ALIGNMENT * * * \****************************************************************************************/ static int VectorEn = 4*sizeof(int) + sizeof(BVEC); static int forward_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int *_HA, *_NA; Pebble *cells; int avail, cmax, boff; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; int moreha; int more, morem, lasta; int aclip, bclip; hgh = midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) 
(_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; if (COMP(align->flags)) boff = align->blen % TRACE_SPACE; else boff = 0; } /* Compute 0-wave starting from mid-line */ more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; 
moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = 
_HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = -1; } if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = am = -1; } else am = V[hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; t = PATH_INT; n = PATH_LEN; ua = -1; for (k = hgh; k >= low; k--) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) 
{ morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida-k)/2; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; atrace[atlen++] = (uint16) (d-e); atrace[atlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[atlen++] = (uint16) (trimd-e); atrace[atlen++] = (uint16) (trimy-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } apath->aepos = trimx; apath->bepos = trimy; apath->diffs = trimd; apath->tlen = atlen; } return (0); } static int reverse_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int 
*_HA, *_NA; Pebble *cells; int avail, cmax, boff; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; int moreha; int more, morem, lasta; int aclip, bclip; hgh = midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; if (COMP(align->flags)) boff = align->blen % TRACE_SPACE; else boff = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A 
%d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a += 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if 
(td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = ap = INT32_MAX; } else ap = V[low]; if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = INT32_MAX; } dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = -1; for (k = low; k <= hgh; k++) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & 
TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != 0) { h = cells[h].ptr; if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = 
(uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = trimd; apath->tlen = - atlen; apath->trace = atrace + atlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. */ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec, int diag, int anti, int lbord, int hbord, int prefix) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath; int minp, maxp; { int alen, blen; int maxtp, wsize; alen = align->alen; blen = align->blen; wsize = VectorEn*10000; if (wsize >= work->vecmax) if (enlarge_vector(work,wsize)) EXIT(1); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); else maxtp = 2*(alen/spec->trace_space+2); wsize = 2*maxtp*sizeof(uint16); if (wsize > work->pntmax) if (enlarge_points(work,wsize)) EXIT(1); apath = align->path; apath->trace = ((uint16 *) work->points) + maxtp; } #ifdef DEBUG_PASSES printf("\n"); #endif if (lbord < 0) minp = -INT32_MAX; else minp = diag-lbord; if (hbord < 0) maxp = INT32_MAX; else maxp = diag+hbord; if (prefix) { if (reverse_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->aepos = (anti-diag)/2; apath->bepos = (anti+diag)/2; #ifdef DEBUG_PASSES printf("E1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->read_A_match_start_,apath->read_B_match_start_,apath->diffs); #endif } else { if (forward_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->abpos = (anti-diag)/2; apath->bbpos = (anti+diag)/2; #ifdef DEBUG_PASSES printf("F1 
(%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->read_A_match_end_,apath->read_B_match_end_,apath->diffs); #endif } #ifdef DEBUG_POINTS { uint16 *trace = (uint16 *) apath->trace; int a, h; printf("\nA-path (%d,%d)->(%d,%d)",apath->read_A_match_start_,apath->read_B_match_start_,apath->read_A_match_end_,apath->read_B_match_end_); printf(" %c\n",(COMP(_align->reverse_complement_match_) ? 'c' : 'n')); a = apath->read_B_match_start_; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (0); } /****************************************************************************************\ * * * OVERLAP MANIPULATION * * * \****************************************************************************************/ static int64 PtrSize = sizeof(void *); static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *); int Read_Overlap(FILE *input, Overlap *ovl) { if (fread( ((char *) ovl) + PtrSize, OvlIOSize, 1, input) != 1) return (1); return (0); } int Read_Trace(FILE *input, Overlap *ovl, int tbytes) { if (tbytes > 0 && ovl->path.tlen > 0) { if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1) return (1); } return (0); } void Write_Overlap(FILE *output, Overlap *ovl, int tbytes) { fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output); if (ovl->path.trace != NULL) fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output); } void Compress_TraceTo8(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = 0; j < ovl->path.tlen; j++) t8[j] = (uint8) (t16[j]); } void Decompress_TraceTo16(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = ovl->path.tlen-1; j >= 0; j--) t16[j] = t8[j]; } void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent) { int i; fprintf(output,"%*s%d vs. 
",indent,"",ovl->aread); if (COMP(ovl->flags)) fprintf(output,"c(%d)\n",ovl->bread); else fprintf(output,"%d\n",ovl->bread); fprintf(output,"%*s [%d,%d] vs [%d,%d] w. %d diffs\n",indent,"", ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs); if (tbytes == 1) { uint8 *trace = (uint8 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } else { uint16 *trace = (uint16 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } } int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) { int i, p; if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) { if (verbose) EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); return (1); } p = ovl->path.bbpos; if (tspace <= TRACE_XOVR) { uint8 *trace8 = (uint8 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace8[i]; } else { uint16 *trace16 = (uint16 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace16[i]; } if (p != ovl->path.bepos) { if (verbose) EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } return (0); } void Flip_Alignment(Alignment *align, int full) { char *aseq = align->aseq; char *bseq = align->bseq; int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp = COMP(align->flags); int *trace = (int *) path->trace; int tlen = path->tlen; int i, j, p; if (comp) { p = path->abpos; path->abpos = blen - path->bepos; path->bepos = alen - p; p 
= path->aepos; path->aepos = blen - path->bbpos; path->bbpos = alen - p; if (full) { alen += 2; blen += 2; for (i = 0; i < tlen; i++) if ((p = trace[i]) < 0) trace[i] = alen + p; else trace[i] = p - blen; i = tlen-1; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; i -= 1; j += 1; } alen -= 2; blen -= 2; } } else { p = path->abpos; path->abpos = path->bbpos; path->bbpos = p; p = path->aepos; path->aepos = path->bepos; path->bepos = p; if (full) for (i = 0; i < tlen; i++) trace[i] = - (trace[i]); } align->aseq = bseq; align->bseq = aseq; align->alen = blen; align->blen = alen; } /****************************************************************************************\ * * * ALIGNMENT PRINTING * * * \****************************************************************************************/ /* Complement the sequence in fragment aseq. The operation does the complementation/reversal in place. Calling it a second time on a given fragment restores it to its original state. */ void Complement_Seq(char *aseq, int len) { char *s, *t; int c; s = aseq; t = aseq + (len-1); while (s < t) { c = 3 - *s; *s++ = (char) (3 - *t); *t-- = (char) c; } if (s == t) *s = (char) (3 - *s); } /* Print an alignment to file between a and b given in trace (unpacked). Prefix gives the length of the initial prefix of a that is unaligned. 
*/ static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' }; static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, int indent, int width, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb; int match, diff; char *N2A; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif o = sizeof(char)*3*(width+1); if (o > work->vecmax) if (enlarge_vector(work,o)) EXIT(1); if (upper) N2A = ToU; else N2A = ToL; Abuf = (char *) work->vector; Bbuf = Abuf + (width+1); Dbuf = Bbuf + (width+1); aend = align->path->aepos; bend = align->path->bepos; Abuf[width] = Bbuf[width] = Dbuf[width] = '\0'; /* buffer/output next column */ #define COLUMN(x,y) \ { int u, v; \ if (o >= width) \ { fprintf(file,"\n"); \ fprintf(file,"%*s",indent,""); \ if (coord > 0) \ { if (sa <= aend) \ fprintf(file," %*d",coord,sa); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %s\n",Abuf); \ fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf); \ fprintf(file,"%*s",indent,""); \ if (sb <= bend) \ fprintf(file," %*d",coord,sb); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %s",Bbuf); \ } \ else \ { fprintf(file," %s\n",Abuf); \ fprintf(file,"%*s %s\n",indent,"",Dbuf); \ fprintf(file,"%*s %s",indent,"",Bbuf); \ } \ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ o = 0; \ sa = i; \ sb = j; \ match = diff = 0; \ } \ u = (x); \ v = (y); \ if (u == 4 || v == 4) \ Dbuf[o] = ' '; \ else if (u == v) \ Dbuf[o] = mtag; \ else \ Dbuf[o] = dtag; \ Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > 
border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } sa = i; sb = j; mtag = ':'; dtag = ':'; while (prefa > prefb) { COLUMN(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { COLUMN(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { COLUMN(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) COLUMN(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(7,b[j]) j += 1; diff += 1; } else { while (j != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(a[i],7) i += 1; diff += 1; } p = align->path->aepos; while (i <= p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) COLUMN(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { COLUMN(a[i],b[j]) i += 1; j += 1; } else { COLUMN(a[i],4) i += 1; } else { COLUMN(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa <= aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb <= bend) fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s %.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); fflush(file); return (0); } int 
Print_Reference(FILE *file, Alignment *align, Work_Data *ework, int indent, int block, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb, s0; int match, diff; char *N2A; int vmax; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif vmax = work->vecmax/3; o = sizeof(char)*6*(block+1); if (o > vmax) { if (enlarge_vector(work,3*o)) EXIT(1); vmax = work->vecmax/3; } Abuf = (char *) work->vector; Bbuf = Abuf + vmax; Dbuf = Bbuf + vmax; if (upper) N2A = ToU; else N2A = ToL; aend = align->path->aepos; bend = align->path->bepos; #define BLOCK(x,y) \ { int u, v; \ if (i%block == 1 && i != s0 && x < 4 && o > 0) \ { fprintf(file,"\n"); \ fprintf(file,"%*s",indent,""); \ if (coord > 0) \ { if (sa <= aend) \ fprintf(file," %*d",coord,sa); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \ fprintf(file,"%*s",indent,""); \ if (sb <= bend) \ fprintf(file," %*d",coord,sb); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s",o,Bbuf); \ } \ else \ { fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); \ fprintf(file,"%*s %.*s",indent,"",o,Bbuf); \ } \ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ o = 0; \ sa = i; \ sb = j; \ match = diff = 0; \ } \ u = (x); \ v = (y); \ if (u == 4 || v == 4) \ Dbuf[o] = ' '; \ else if (u == v) \ Dbuf[o] = mtag; \ else \ Dbuf[o] = dtag; \ Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ if (o >= vmax) \ { if (enlarge_vector(work,3*o)) \ EXIT(1); \ vmax = work->vecmax/3; \ memmove(work->vector+2*vmax,Dbuf,o); \ memmove(work->vector+vmax,Bbuf,o); \ memmove(work->vector,Abuf,o); \ Abuf = (char *) work->vector; \ Bbuf = Abuf + vmax; \ 
Dbuf = Bbuf + vmax; \ } \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } s0 = i; sa = i; sb = j; mtag = ':'; dtag = ':'; while (prefa > prefb) { BLOCK(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { BLOCK(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { BLOCK(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) BLOCK(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(7,b[j]) j += 1; diff += 1; } else { while (j != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(a[i],7) i += 1; diff += 1; } p = align->path->aepos; while (i <= p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) BLOCK(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { BLOCK(a[i],b[j]) i += 1; j += 1; } else { BLOCK(a[i],4) i += 1; } else { BLOCK(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa <= aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb <= bend) fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s 
%.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); fflush(file); return (0); } /* Print an ASCII representation of the overlap in _align between fragments a and b to given file. */ static inline void repchar(FILE *file, int symbol, int rep) { while (rep-- > 0) fputc(symbol,file); } void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord) { int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp = COMP(align->flags); int w; fprintf(file,"%*s",indent,""); if (path->abpos > 0) fprintf(file," %*d ",coord,path->abpos); else fprintf(file,"%*s",coord+5,""); if (path->aepos < alen) fprintf(file,"%*s%d",coord+8,"",alen-path->aepos); fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (path->abpos > 0) { fprintf(file,"A "); w = Number_Digits((int64) path->abpos); repchar(file,' ',coord-w); repchar(file,'=',w+3); fputc('+',file); repchar(file,'-',coord+5); } else { fprintf(file,"A %*s",coord+4,""); repchar(file,'-',coord+5); } if (path->aepos < alen) { fputc('+',file); w = Number_Digits((int64) (alen-path->aepos)); repchar(file,'=',w+2); fputc('>',file); repchar(file,' ',w); } else { fputc('>',file); repchar(file,' ',coord+3); } { int asub, bsub; asub = path->aepos - path->abpos; bsub = path->bepos - path->bbpos; fprintf(file," dif/(len1+len2) = %d/(%d+%d) = %5.2f%%\n", path->diffs,asub,bsub,(200.*path->diffs)/(asub+bsub)); } { int sym1e, sym2e; int sym1p, sym2p; if (comp > 0) { sym1p = '<'; sym2p = '-'; sym1e = '<'; sym2e = '='; } else { sym1p = '-'; sym2p = '>'; sym1e = '='; sym2e = '>'; } fprintf(file,"%*s",indent,""); if (path->bbpos > 0) { fprintf(file,"B "); w = Number_Digits((int64) path->bbpos); repchar(file,' ',coord-w); fputc(sym1e,file); repchar(file,'=',w+2); fputc('+',file); repchar(file,'-',coord+5); } else { fprintf(file,"B "); repchar(file,' ',coord+3); fputc(sym1p,file); repchar(file,'-',coord+5); } if (path->bepos < blen) { 
fprintf(file,"+"); w = Number_Digits((int64) (blen-path->bepos)); repchar(file,'=',w+2); fprintf(file,"%c\n",sym2e); } else fprintf(file,"%c\n",sym2p); } fprintf(file,"%*s",indent,""); if (path->bbpos > 0) fprintf(file," %*d ",coord,path->bbpos); else fprintf(file,"%*s",coord+5,""); if (path->bepos < blen) fprintf(file,"%*s%d",coord+8,"",blen-path->bepos); fprintf(file,"\n"); fflush(file); } /****************************************************************************************\ * * * O(ND) trace algorithm * * * \****************************************************************************************/ #ifdef DEBUG_AWAVE static void print_awave(int *V, int low, int hgh) { int k; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) printf(" %3d",V[k]); printf("\n"); fflush(stdout); } #endif #ifdef DEBUG_ALIGN static int depth = 0; #endif typedef struct { int *Stop; // Ongoing stack of alignment indels char *Aabs, *Babs; // Absolute base of A and B sequences int **PVF, **PHF; // List of waves for iterative np algorithms int mida, midb; // mid point division for mid-point algorithms int *VF, *VB; // Forward/Reverse waves for nd algorithms // (defunct: were used for O(nd) algorithms) } Trace_Waves; static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave) { int x, y; int D; #ifdef DEBUG_ALIGN printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); #endif if (M <= 0) { x = (wave->Aabs-A)-1; for (y = 1; y <= N; y++) { *wave->Stop++ = x; #ifdef DEBUG_SCRIPT printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1); #endif } return (N); } if (N <= 0) { y = (B-wave->Babs)+1; for (x = 1; x <= M; x++) { *wave->Stop++ = y; #ifdef DEBUG_SCRIPT printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1); #endif } return (M); } { int *VF = wave->VF; int *VB = wave->VB; int flow; // fhgh == D ! 
int blow, bhgh; char *a; y = 0; if (N < M) while (y < N && B[y] == A[y]) y += 1; else { while (y < M && B[y] == A[y]) y += 1; if (y >= M && N == M) return (0); } flow = 0; VF[0] = y; VF[-1] = -2; x = N-M; a = A-x; y = N-1; if (N > M) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; blow = bhgh = -x; VB += x; VB[blow] = y; VB[blow-1] = N+1; for (D = 1; 1; D += 1) { int k, r; int am, ac, ap; // Forward wave flow -= 1; am = ac = VF[flow-1] = -2; a = A + D; x = M - D; for (k = D; k >= flow; k--) { ap = ac; ac = am+1; am = VF[k-1]; if (ac < am) if (ap < am) y = am; else y = ap; else if (ap < ac) y = ac; else y = ap; if (blow <= k && k <= bhgh) { r = VB[k]; if (y > r) { D = (D<<1)-1; if (ap > r) y = ap; else if (ac > r) y = ac; else y = r+1; x = k+y; goto OVERLAP2; } } if (N < x) while (y < N && B[y] == a[y]) y += 1; else while (y < x && B[y] == a[y]) y += 1; VF[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VF,flow,D); #endif // Reverse Wave bhgh += 1; blow -= 1; am = ac = VB[blow-1] = N+1; a = A + bhgh; x = -bhgh; for (k = bhgh; k >= blow; k--) { ap = ac+1; ac = am; am = VB[k-1]; if (ac > am) if (ap > am) y = am; else y = ap; else if (ap > ac) y = ac; else y = ap; if (flow <= k && k <= D) { r = VF[k]; if (y <= r) { D = (D << 1); if (ap <= r) y = ap; else if (ac <= r) y = ac; else y = r; x = k+y; goto OVERLAP2; } } y -= 1; if (x > 0) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; VB[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VB,blow,bhgh); #endif } } OVERLAP2: #ifdef DEBUG_ALIGN printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D); fflush(stdout); #endif if (D > 1) { #ifdef DEBUG_ALIGN depth += 2; #endif dandc_nd(A,x,B,y,wave); dandc_nd(A+x,M-x,B+y,N-y,wave); #ifdef DEBUG_ALIGN depth -= 2; #endif } else if (D == 1) { if (M > N) { *wave->Stop++ = (B-wave->Babs)+y+1; #ifdef DEBUG_SCRIPT printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1); #endif } else if (M < N) { 
*wave->Stop++ = (wave->Aabs-A)-x-1; #ifdef DEBUG_SCRIPT printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1); #endif } #ifdef DEBUG_SCRIPT else printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y); #endif } return (D); } static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; int L, D; int asub, bsub; Path *path; int *trace; path = align->path; asub = path->aepos-path->abpos; bsub = path->bepos-path->bbpos; if (asub < bsub) L = bsub; else L = asub; L *= sizeof(int); if (L > work->tramax) if (enlarge_trace(work,L)) EXIT(1); trace = wave.Stop = ((int *) work->trace); D = 2*(path->diffs + 4)*sizeof(int); if (D > work->vecmax) if (enlarge_vector(work,D)) EXIT(1); D = (path->diffs+3)/2; wave.VF = ((int *) work->vector) + (D+1); wave.VB = wave.VF + (2*D+1); wave.Aabs = align->aseq; wave.Babs = align->bseq; path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos, align->bseq+path->bbpos,path->bepos-path->bbpos,&wave); path->trace = trace; path->tlen = wave.Stop - trace; return (0); } /****************************************************************************************\ * * * O(NP) tracing algorithms * * * \****************************************************************************************/ /* Iterative O(np) algorithm for finding the alignment between two substrings (specified by a Path record). The variation includes handling substitutions and guarantees to find left-most alignments so that low complexity runs are always aligned in the same way. 
*/ #ifdef DEBUG_ALIGN static int ToA[4] = { 'a', 'c', 'g', 't' }; #endif static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int *HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n BASE %ld,%ld: %d vs %d\n",A-wave->Aabs,B-wave->Babs,M,N); printf(" A = "); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf(" B = "); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -INT32_MAX; posh = INT32_MAX; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); EXIT(-1); } else if (B < A) posl = (B-A)+1; else posh = (B-A)-1; } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; #define FS_MOVE(mdir,pdir) \ ac = F1[k]+1; \ if (ac < am) \ if (ap < am) \ { HF[k] = mdir; \ j = am; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ else \ if (ap < ac) \ { HF[k] = 0; \ j = ac; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ \ if (N < i) \ while (j < N && B[j] == a[j]) \ j += 1; \ else \ while (j < i && B[j] == a[j]) \ j += 1; \ F0[k] = j; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int ap = (wave->Aabs-A)-1; int bp = (B-wave->Babs)+1; PHF[0][0] = 3; c = 
N; k = del; e = PHF[D][k]; PHF[D][k] = 3; if (mode == UPPERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else if (mode == LOWERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else // mode == GREEDIEST while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } k = D = 0; e = PHF[D][k]; while (e != 3) { h = k-e; c = PVF[D][k]; if (e > 1) h += 3; else if (e == 0) D += 1; else D += 2; #ifdef DEBUG_SCRIPT if (h > k) printf(" D %d(%d)\n",(c-k)-(ap-1),c+bp); else if (h < k) printf(" I %d(%d)\n",c+(bp-1),(c+k)-ap); else printf(" %d 
S %d\n",(c+k)-(ap+1),c+(bp-1)); #endif if (h > k) *wave->Stop++ = bp+c; else if (h < k) *wave->Stop++ = ap-(c+k); k = h; e = PHF[D][h]; } } return (D + abs(del)); } static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int *HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); printf("%*s A = ",depth,""); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf("%*s B = ",depth,""); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -INT32_MAX; posh = INT32_MAX; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); EXIT(1); } else if (B < A) posl = (B-A)+1; else posh = (B-A)-1; } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int d, f; d = D + abs(del); c = N; k = del; if (mode == UPPERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = 
PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } k = h; } else if (mode == LOWERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } k = h; } else // mode == GREEDIEST for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; k = h; } wave->midb = (B-wave->Babs) + PVF[D][k]; wave->mida = (A-wave->Aabs) + k + PVF[D][k]; } return (0); } /****************************************************************************************\ * * * COMPUTE_TRACE FLAVORS * * * \****************************************************************************************/ int Compute_Trace_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; int M, N, D; path = align->path; aseq = align->aseq; bseq = align->bseq; M = path->aepos-path->abpos; N = 
path->bepos-path->bbpos; { int64 s; int d; int dmax; int **PVF, **PHF; if (M < N) s = N; else s = M; s *= sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); dmax = path->diffs - abs(M-N); s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *)); if (s > 256000000) return (Compute_Trace_ND_ALL(align,ework)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = M+N+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST); if (D < 0) EXIT(1); path->diffs = D; path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); return (0); } int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int dmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + 
s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; ae = (ab/trace_spacing)*trace_spacing; bb = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } ae = path->aepos; be = path->bepos; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int dmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; int as, bs; int af, bf; diffs = 0; ab = as = af = path->abpos; ae = 
(ab/trace_spacing)*trace_spacing; bb = bs = bf = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; as = af; bs = bf; } ae = path->aepos; be = path->bepos; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; as = af; bs = bf; d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int mmax, nmax, dmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = mmax = 0; for (d = 0; d < tlen; d += 2) { if (points[d] > mmax) mmax = points[d]; if (points[d+1] > nmax) nmax = points[d+1]; } if (tlen <= 1) { mmax = M; nmax = N; } if (mmax > nmax) dmax = nmax; else dmax = mmax; s = (dmax+3)*2*((mmax+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = mmax+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } 
wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; bb = path->bbpos; for (i = 0; i < tlen; i += 2) { ae = ab + points[i]; be = bb + points[i+1]; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } HINGE-0.5.0/src/lib/falcon.c000077500000000000000000000662061314415550300153270ustar00rootroot00000000000000/* * ===================================================================================== * * Filename: fastcon.c * * Description: * * Version: 0.1 * Created: 07/20/2013 17:00:00 * Revision: none * Compiler: gcc * * Author: Jason Chin, * Company: * * ===================================================================================== #################################################################################$$ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted (subject to the limitations in the # disclaimer below) provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # * Neither the name of Pacific Biosciences nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE # GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. #################################################################################$$ */ #include #include #include #include #include #include #include "common.h" align_tags_t * get_align_tags( char * aln_q_seq, char * aln_t_seq, seq_coor_t aln_seq_len, aln_range * range, unsigned q_id, seq_coor_t t_offset) { char p_q_base; align_tags_t * tags; seq_coor_t i, j, jj, k, p_j, p_jj; tags = calloc( 1, sizeof(align_tags_t) ); tags->len = aln_seq_len; tags->align_tags = calloc( aln_seq_len + 1, sizeof(align_tag_t) ); i = range->s1 - 1; j = range->s2 - 1; jj = 0; p_j = -1; p_jj = 0; p_q_base = '.'; for (k = 0; k < aln_seq_len; k++) { if (aln_q_seq[k] != '-') { i ++; jj ++; } if (aln_t_seq[k] != '-') { j ++; jj = 0; } //printf("t %d %d %d %c %c\n", q_id, j, jj, aln_t_seq[k], aln_q_seq[k]); if ( j + t_offset >= 0 && jj < UINT8_MAX && p_jj < UINT8_MAX) { (tags->align_tags[k]).t_pos = j + t_offset; (tags->align_tags[k]).delta = jj; (tags->align_tags[k]).p_t_pos = p_j + t_offset; (tags->align_tags[k]).p_delta = p_jj; (tags->align_tags[k]).p_q_base = p_q_base; (tags->align_tags[k]).q_base = aln_q_seq[k]; (tags->align_tags[k]).q_id = q_id; p_j = j; p_jj = jj; p_q_base = aln_q_seq[k]; } } // sentinal at the end 
//k = aln_seq_len; tags->len = k; (tags->align_tags[k]).t_pos = UINT_MAX; (tags->align_tags[k]).delta = UINT8_MAX; (tags->align_tags[k]).q_base = '.'; (tags->align_tags[k]).q_id = UINT_MAX; return tags; } void free_align_tags( align_tags_t * tags) { free( tags->align_tags ); free( tags ); } void allocate_aln_col( align_tag_col_t * col) { col->p_t_pos = ( seq_coor_t * ) calloc(col->size, sizeof( seq_coor_t )); col->p_delta = ( uint8_t * ) calloc(col->size, sizeof( uint8_t )); col->p_q_base = ( char * )calloc(col->size, sizeof( char )); col->link_count = ( uint16_t * ) calloc(col->size, sizeof( uint16_t )); } void realloc_aln_col( align_tag_col_t * col ) { col->p_t_pos = (seq_coor_t *) realloc( col->p_t_pos, (col->size) * sizeof( seq_coor_t )); col->p_delta = ( uint8_t *) realloc( col->p_delta, (col->size) * sizeof( uint8_t )); col->p_q_base = (char *) realloc( col->p_q_base, (col->size) * sizeof( char )); col->link_count = ( uint16_t *) realloc( col->link_count, (col->size) * sizeof( uint16_t )); } void free_aln_col( align_tag_col_t * col) { free(col->p_t_pos); free(col->p_delta); free(col->p_q_base); free(col->link_count); } void allocate_delta_group( msa_delta_group_t * g) { int i,j; g->max_delta = 0; g->delta = (msa_base_group_t *) calloc( g->size, sizeof(msa_base_group_t)); for (i = 0; i< g->size; i++) { g->delta[i].base = ( align_tag_col_t * ) calloc( 5, sizeof(align_tag_col_t ) ); for (j = 0; j < 5; j++ ) { g->delta[i].base[j].size = 8; allocate_aln_col(&(g->delta[i].base[j])); } } } void realloc_delta_group( msa_delta_group_t * g, uint16_t new_size ) { int i, j, bs, es; bs = g->size; es = new_size; g->delta = (msa_base_group_t *) realloc(g->delta, new_size * sizeof(msa_base_group_t)); for (i=bs; i < es; i++) { g->delta[i].base = ( align_tag_col_t *) calloc( 5, sizeof(align_tag_col_t ) ); for (j = 0; j < 5; j++ ) { g->delta[i].base[j].size = 8; allocate_aln_col(&(g->delta[i].base[j])); } } g->size = new_size; } void free_delta_group( msa_delta_group_t * g) { 
//manything to do here int i, j; for (i = 0; i < g->size; i++) { for (j = 0; j < 5; j++) { free_aln_col( &(g->delta[i].base[j]) ); } free(g->delta[i].base); } free(g->delta); } void update_col( align_tag_col_t * col, seq_coor_t p_t_pos, uint8_t p_delta, char p_q_base) { int updated = 0; int kk; col->count += 1; for (kk = 0; kk < col->n_link; kk++) { if ( p_t_pos == col->p_t_pos[kk] && p_delta == col->p_delta[kk] && p_q_base == col->p_q_base[kk] ) { col->link_count[kk] ++; updated = 1; break; } } if (updated == 0) { if (col->n_link + 1 > col->size) { if (col->size < (UINT16_MAX > 1)-1) { col->size *= 2; } else { col->size += 256; } assert( col->size < UINT16_MAX-1 ); realloc_aln_col(col); } kk = col->n_link; col->p_t_pos[kk] = p_t_pos; col->p_delta[kk] = p_delta; col->p_q_base[kk] = p_q_base; col->link_count[kk] = 1; col->n_link++; } } msa_pos_t * get_msa_working_sapce(unsigned int max_t_len) { msa_pos_t * msa_array; unsigned int i; msa_array = calloc(max_t_len, sizeof(msa_pos_t *)); for (i = 0; i < max_t_len; i++) { msa_array[i] = calloc(1, sizeof(msa_delta_group_t)); msa_array[i]->size = 8; allocate_delta_group(msa_array[i]); } return msa_array; } void clean_msa_working_space( msa_pos_t * msa_array, unsigned int max_t_len) { unsigned int i,j,k; align_tag_col_t * col; for (i = 0; i < max_t_len; i++) { for (j =0; j < msa_array[i]->max_delta + 1; j++) { for (k = 0; k < 5; k++ ) { col = msa_array[i]->delta[j].base + k; /* for (c =0; c < col->size; c++) { col->p_t_pos[c] = 0; col->p_delta[c] = 0; col->p_q_base[c] = 0; col->link_count[c] =0; } */ col->n_link = 0; col->count = 0; col->best_p_t_pos = 0; col->best_p_delta = 0; col->best_p_q_base = 0; col->score = 0; } } msa_array[i]->max_delta = 0; } } #define STATIC_ALLOCATE //#undef STATIC_ALLOCATE consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs, unsigned n_tag_seqs, unsigned t_len, unsigned min_cov ) { seq_coor_t i, j; seq_coor_t t_pos = 0; unsigned int * coverage; unsigned int * local_nbase; 
consensus_data * consensus; //char * consensus; align_tag_t * c_tag; static msa_pos_t * msa_array = NULL; coverage = calloc( t_len, sizeof(unsigned int) ); local_nbase = calloc( t_len, sizeof(unsigned int) ); #ifndef STATIC_ALLOCATE msa_array = calloc(t_len, sizeof(msa_pos_t *)); for (i = 0; i < t_len; i++) { msa_array[i] = calloc(1, sizeof(msa_delta_group_t)); msa_array[i]->size = 8; allocate_delta_group(msa_array[i]); } #endif #ifdef STATIC_ALLOCATE if ( msa_array == NULL) { msa_array = get_msa_working_sapce( 100000 ); } assert(t_len < 100000); #endif // loop through every alignment //printf("XX %d\n", n_tag_seqs); for (i = 0; i < n_tag_seqs; i++) { // for each alignment position, insert the alignment tag to msa_array for (j = 0; j < tag_seqs[i]->len; j++) { c_tag = tag_seqs[i]->align_tags + j; unsigned int delta; delta = c_tag->delta; if (delta == 0) { t_pos = c_tag->t_pos; coverage[ t_pos ] ++; } // Assume t_pos was set on earlier iteration. // (Otherwise, use its initial value, which might be an error. ~cd) if (delta > msa_array[t_pos]->max_delta) { msa_array[t_pos]->max_delta = delta; if (msa_array[t_pos]->max_delta + 4 > msa_array[t_pos]->size ) { realloc_delta_group(msa_array[t_pos], msa_array[t_pos]->max_delta + 8); } } unsigned int base = -1; switch (c_tag->q_base) { case 'A': base = 0; break; case 'C': base = 1; break; case 'G': base = 2; break; case 'T': base = 3; break; case '-': base = 4; break; } // Note: On bad input, base may be -1. 
update_col( &(msa_array[t_pos]->delta[delta].base[base]), c_tag->p_t_pos, c_tag->p_delta, c_tag->p_q_base); local_nbase[ t_pos ] ++; } } // propogate score throught the alignment links, setup backtracking information align_tag_col_t * g_best_aln_col = 0; unsigned int g_best_ck = 0; seq_coor_t g_best_t_pos = 0; { int kk; int ck; // char base; int best_i; int best_j; int best_b; int best_ck = -1; double score; double best_score; double g_best_score; // char best_mark; align_tag_col_t * aln_col; g_best_score = -1; for (i = 0; i < t_len; i++) { //loop through every template base //printf("max delta: %d %d\n", i, msa_array[i]->max_delta); for (j = 0; j <= msa_array[i]->max_delta; j++) { // loop through every delta position for (kk = 0; kk < 5; kk++) { // loop through diff bases of the same delta posiiton /* switch (kk) { case 0: base = 'A'; break; case 1: base = 'C'; break; case 2: base = 'G'; break; case 3: base = 'T'; break; case 4: base = '-'; break; } */ aln_col = msa_array[i]->delta[j].base + kk; if (aln_col->count >= 0) { best_score = -1; best_i = -1; best_j = -1; best_b = -1; for (ck = 0; ck < aln_col->n_link; ck++) { // loop through differnt link to previous column int pi; int pj; int pkk; pi = aln_col->p_t_pos[ck]; pj = aln_col->p_delta[ck]; switch (aln_col->p_q_base[ck]) { case 'A': pkk = 0; break; case 'C': pkk = 1; break; case 'G': pkk = 2; break; case 'T': pkk = 3; break; case '-': pkk = 4; break; default: pkk = 4; } if (aln_col->p_t_pos[ck] == -1) { score = (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5; } else { score = msa_array[pi]->delta[pj].base[pkk].score + (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5; } // best_mark = ' '; if (score > best_score) { best_score = score; aln_col->best_p_t_pos = best_i = pi; aln_col->best_p_delta = best_j = pj; aln_col->best_p_q_base = best_b = pkk; best_ck = ck; // best_mark = '*'; } /* printf("X %d %d %d %c %d %d %d %c %d %lf %c\n", coverage[i], i, j, base, aln_col->count, 
aln_col->p_t_pos[ck], aln_col->p_delta[ck], aln_col->p_q_base[ck], aln_col->link_count[ck], score, best_mark); */ } aln_col->score = best_score; if (best_score > g_best_score) { g_best_score = best_score; g_best_aln_col = aln_col; g_best_ck = best_ck; g_best_t_pos = i; //printf("GB %d %d %d %d\n", i, j, ck, g_best_aln_col); } } } } } assert(g_best_score != -1); } // reconstruct the sequences unsigned int index; char bb = '$'; int ck; char * cns_str; int * eqv; double score0; consensus = calloc( 1, sizeof(consensus_data) ); consensus->sequence = calloc( t_len * 2 + 1, sizeof(char) ); consensus->eqv = calloc( t_len * 2 + 1, sizeof(unsigned int) ); cns_str = consensus->sequence; eqv = consensus->eqv; index = 0; ck = g_best_ck; i = g_best_t_pos; while (1) { if (coverage[i] > min_cov) { switch (ck) { case 0: bb = 'A'; break; case 1: bb = 'C'; break; case 2: bb = 'G'; break; case 3: bb = 'T'; break; case 4: bb = '-'; break; } } else { switch (ck) { case 0: bb = 'a'; break; case 1: bb = 'c'; break; case 2: bb = 'g'; break; case 3: bb = 't'; break; case 4: bb = '-'; break; } } // Note: On bad input, bb will keep previous value, possibly '$'. 
score0 = g_best_aln_col->score; i = g_best_aln_col->best_p_t_pos; if (i == -1 || index >= t_len * 2) break; j = g_best_aln_col->best_p_delta; ck = g_best_aln_col->best_p_q_base; g_best_aln_col = msa_array[i]->delta[j].base + ck; if (bb != '-') { cns_str[index] = bb; eqv[index] = (int) score0 - (int) g_best_aln_col->score; //printf("C %d %d %c %lf %d %d\n", i, index, bb, g_best_aln_col->score, coverage[i], eqv[index] ); index ++; } } // reverse the sequence for (i = 0; i < index/2; i++) { cns_str[i] = cns_str[i] ^ cns_str[index-i-1]; cns_str[index-i-1] = cns_str[i] ^ cns_str[index-i-1]; cns_str[i] = cns_str[i] ^ cns_str[index-i-1]; eqv[i] = eqv[i] ^ eqv[index-i-1]; eqv[index-i-1] = eqv[i] ^ eqv[index-i-1]; eqv[i] = eqv[i] ^ eqv[index-i-1]; } cns_str[index] = 0; //printf("%s\n", cns_str); #ifndef STATIC_ALLOCATE for (i = 0; i < t_len; i++) { free_delta_group(msa_array[i]); free(msa_array[i]); } free(msa_array); #endif #ifdef STATIC_ALLOCATE clean_msa_working_space(msa_array, t_len+1); #endif free(coverage); free(local_nbase); return consensus; } //const unsigned int K = 8; consensus_data * generate_consensus( char ** input_seq, unsigned int n_seq, unsigned min_cov, unsigned K, double min_idt) { unsigned int j; unsigned int seq_count; unsigned int aligned_seq_count; kmer_lookup * lk_ptr; seq_array sa_ptr; seq_addr_array sda_ptr; kmer_match * kmer_match_ptr; aln_range * arange; alignment * aln; align_tags_t ** tags_list; //char * consensus; consensus_data * consensus; double max_diff; max_diff = 1.0 - min_idt; seq_count = n_seq; //printf("XX n_seq %d\n", n_seq); //for (j=0; j < seq_count; j++) { // printf("seq_len: %u %u\n", j, strlen(input_seq[j])); //}; fflush(stdout); tags_list = calloc( seq_count, sizeof(align_tags_t *) ); lk_ptr = allocate_kmer_lookup( 1 << (K * 2) ); sa_ptr = allocate_seq( (seq_coor_t) strlen( input_seq[0]) ); sda_ptr = allocate_seq_addr( (seq_coor_t) strlen( input_seq[0]) ); add_sequence( 0, K, input_seq[0], strlen(input_seq[0]), sda_ptr, 
sa_ptr, lk_ptr); //mask_k_mer(1 << (K * 2), lk_ptr, 16); aligned_seq_count = 0; for (j=1; j < seq_count; j++) { //printf("seq_len: %ld %u\n", j, strlen(input_seq[j])); kmer_match_ptr = find_kmer_pos_for_seq(input_seq[j], strlen(input_seq[j]), K, sda_ptr, lk_ptr); #define INDEL_ALLOWENCE_0 6 arange = find_best_aln_range(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5); // narrow band to avoid aligning through big indels //printf("1:%ld %ld %ld %ld\n", arange_->s1, arange_->e1, arange_->s2, arange_->e2); //arange = find_best_aln_range2(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5); // narrow band to avoid aligning through big indels //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2); #define INDEL_ALLOWENCE_1 0.10 if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 || abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) > (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) { free_kmer_match( kmer_match_ptr); free_aln_range(arange); continue; } //printf("%ld %s\n", strlen(input_seq[j]), input_seq[j]); //printf("%ld %s\n\n", strlen(input_seq[0]), input_seq[0]); #define INDEL_ALLOWENCE_2 150 aln = _align(input_seq[j]+arange->s1, arange->e1 - arange->s1 , input_seq[0]+arange->s2, arange->e2 - arange->s2 , INDEL_ALLOWENCE_2, 1); if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) { tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str, aln->aln_str_size, arange, j, 0); aligned_seq_count ++; } /*** for (k = 0; k < tags_list[j]->len; k++) { printf("%ld %d %c\n", tags_list[j]->align_tags[k].t_pos, tags_list[j]->align_tags[k].delta, tags_list[j]->align_tags[k].q_base); } ***/ free_aln_range(arange); free_alignment(aln); free_kmer_match( kmer_match_ptr); } if (aligned_seq_count > 0) { consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov ); } else { // allocate an empty consensus 
sequence consensus = calloc( 1, sizeof(consensus_data) ); consensus->sequence = calloc( 1, sizeof(char) ); consensus->eqv = calloc( 1, sizeof(unsigned int) ); } //free(consensus); free_seq_addr_array(sda_ptr); free_seq_array(sa_ptr); free_kmer_lookup(lk_ptr); for (j=0; j < aligned_seq_count; j++) { free_align_tags(tags_list[j]); } free(tags_list); return consensus; } consensus_data * generate_utg_consensus( char ** input_seq, seq_coor_t *offset, unsigned int n_seq, unsigned min_cov, unsigned K, double min_idt) { unsigned int j; unsigned int seq_count; unsigned int aligned_seq_count; aln_range * arange; alignment * aln; align_tags_t ** tags_list; //char * consensus; consensus_data * consensus; double max_diff; seq_coor_t utg_len; seq_coor_t r_len; max_diff = 1.0 - min_idt; seq_count = n_seq; /*** for (j=0; j < seq_count; j++) { printf("seq_len: %u %u\n", j, strlen(input_seq[j])); }; fflush(stdout); ***/ tags_list = calloc( seq_count+1, sizeof(align_tags_t *) ); utg_len = strlen(input_seq[0]); aligned_seq_count = 0; arange = calloc( 1, sizeof(aln_range) ); arange->s1 = 0; arange->e1 = strlen(input_seq[0]); arange->s2 = 0; arange->e2 = strlen(input_seq[0]); tags_list[aligned_seq_count] = get_align_tags( input_seq[0], input_seq[0], strlen(input_seq[0]), arange, 0, 0); aligned_seq_count += 1; for (j=1; j < seq_count; j++) { arange->s1 = 0; arange->e1 = strlen(input_seq[j])-1; arange->s2 = 0; arange->e2 = strlen(input_seq[j])-1; r_len = strlen(input_seq[j]); //printf("seq_len: %u %u\n", j, r_len); if ( offset[j] < 0) { if ((r_len + offset[j]) < 128) { continue; } if ( r_len + offset[j] < utg_len ) { //printf("1: %ld %u %u\n", offset[j], r_len, utg_len); aln = _align(input_seq[j] - offset[j], r_len + offset[j] , input_seq[0], r_len + offset[j] , 500, 1); } else { //printf("2: %ld %u %u\n", offset[j], r_len, utg_len); aln = _align(input_seq[j] - offset[j], utg_len , input_seq[0], utg_len , 500, 1); } offset[j] = 0; } else { if ( offset[j] > utg_len - 128) { continue; } if 
( offset[j] + r_len > utg_len ) { //printf("3: %ld %u %u\n", offset[j], r_len, utg_len); aln = _align(input_seq[j], utg_len - offset[j] , input_seq[0]+offset[j], utg_len - offset[j], 500, 1); } else { //printf("4: %ld %u %u\n", offset[j], r_len, utg_len); aln = _align(input_seq[j], r_len , input_seq[0]+offset[j], r_len , 500, 1); } } if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) { tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str, aln->aln_str_size, arange, j, offset[j]); aligned_seq_count ++; } free_alignment(aln); } free_aln_range(arange); if (aligned_seq_count > 0) { consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, utg_len, 0 ); } else { // allocate an empty consensus sequence consensus = calloc( 1, sizeof(consensus_data) ); consensus->sequence = calloc( 1, sizeof(char) ); consensus->eqv = calloc( 1, sizeof(unsigned int) ); } //free(consensus); for (j=0; j < aligned_seq_count; j++) { free_align_tags(tags_list[j]); } free(tags_list); return consensus; } void free_consensus_data( consensus_data * consensus ){ free(consensus->sequence); free(consensus->eqv); free(consensus); } /*** void main() { unsigned int j; char small_buffer[1024]; char big_buffer[65536]; char ** input_seq; char ** seq_id; int seq_count; char * consensus; input_seq = calloc( 501, sizeof(char *)); seq_id = calloc( 501, sizeof(char *)); while(1) { seq_count = 0; while (1) { scanf("%s", small_buffer); seq_id[seq_count] = calloc( strlen(small_buffer) + 1, sizeof(char)); strcpy(seq_id[seq_count], small_buffer); scanf("%s", big_buffer); input_seq[seq_count] = calloc( strlen(big_buffer) + 1 , sizeof(char)); strcpy(input_seq[seq_count], big_buffer); if (strcmp(seq_id[seq_count], "+") == 0) { break; } if (strcmp(seq_id[seq_count], "-") == 0) { break; } //printf("%s\n", seq_id[seq_count]); seq_count += 1; if (seq_count > 500) break; } //printf("sc: %d\n", seq_count); if (seq_count < 10 && 
strcmp(seq_id[seq_count], "-") != 0 ) continue; if (seq_count < 10 && strcmp(seq_id[seq_count], "-") == 0 ) break; consensus = generate_consensus(input_seq, seq_count, 8, 8); if (strlen(consensus) > 500) { printf(">%s\n%s\n", seq_id[0], consensus); } fflush(stdout); free(consensus); for (j=0; j < seq_count; j++) { free(seq_id[j]); free(input_seq[j]); }; } for (j=0; j < seq_count; j++) { free(seq_id[j]); free(input_seq[j]); }; free(seq_id); free(input_seq); } ***/ HINGE-0.5.0/src/lib/ini.c000066400000000000000000000116001314415550300146250ustar00rootroot00000000000000/* inih -- simple .INI file parser inih is released under the New BSD license (see LICENSE.txt). Go to the project home page for more info: https://github.com/benhoyt/inih */
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif

/* NOTE(review): the header names of the bare #include directives below are
   missing in this copy of the file; they need to be restored from the
   upstream sources before this will compile. */
#include
#include
#include "ini.h"

#if !INI_USE_STACK
#include
#endif

/* Sizes of the fixed buffers that hold the current section name and the
   most recent key name in ini_parse_stream(); longer names are truncated
   by strncpy0(). */
#define MAX_SECTION 50
#define MAX_NAME 50

/* Strip whitespace chars off end of given string, in place. Return s.
   Works backwards from the terminating NUL and overwrites every trailing
   isspace() character with '\0', stopping at the first non-whitespace
   character or at the start of the string. */
static char* rstrip(char* s)
{
    char* p = s + strlen(s);
    /* The pre-decrement happens only after "p > s" has been checked, so p
       never moves in front of the start of the string. */
    while (p > s && isspace((unsigned char)(*--p)))
        *p = '\0';
    return s;
}

/* Return pointer to first non-whitespace char in given string.
   The string itself is not modified; if it is empty or all whitespace the
   returned pointer addresses the terminating NUL. */
static char* lskip(const char* s)
{
    while (*s && isspace((unsigned char)(*s)))
        s++;
    /* const is cast away so callers can edit the line in place. */
    return (char*)s;
}

/* Return pointer to first char c or ';' comment in given string, or pointer to
   null at end of string if neither found. ';' must be prefixed by a whitespace
   character to register as a comment. */
static char* find_char_or_comment(const char* s, char c)
{
    /* Non-zero when the previously examined character was whitespace; it
       starts at 0, so a ';' in the very first position is not treated as a
       comment by this function. */
    int was_whitespace = 0;
    while (*s && *s != c &&
           !(was_whitespace && *s == ';')) {
        was_whitespace = isspace((unsigned char)(*s));
        s++;
    }
    return (char*)s;
}

/* Version of strncpy that ensures dest (size bytes) is null-terminated.
   At most size bytes are written; the last byte of dest is always set to
   '\0', so a src of size or more characters is truncated to size - 1.
   Returns dest. */
static char* strncpy0(char* dest, const char* src, size_t size)
{
    strncpy(dest, src, size);
    dest[size - 1] = '\0';
    return dest;
}

/* See documentation in header file.
*/
/* Parse INI-formatted text obtained one line at a time from a reader
   callback.

   reader  - called as reader(line, INI_MAX_LINE, stream); parsing ends
             when it returns NULL.
   stream  - opaque value handed to reader unchanged.
   handler - called as handler(user, section, name, value) for every
             name/value pair (and, with INI_ALLOW_MULTILINE, for every
             continuation line); a zero return is recorded as an error on
             the current line.
   user    - opaque value handed to handler unchanged.

   Returns 0 when no error was recorded, the 1-based number of the first
   line on which an error was recorded, or -2 when the line buffer cannot
   be allocated (heap buffer is used only when INI_USE_STACK is 0). */
int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler,
                     void* user)
{
    /* Uses a fair bit of stack (use heap instead if you need to) */
#if INI_USE_STACK
    char line[INI_MAX_LINE];
#else
    char* line;
#endif
    /* Current "[section]" name and the last key seen; both start empty. */
    char section[MAX_SECTION] = "";
    char prev_name[MAX_NAME] = "";

    char* start;
    char* end;
    char* name;
    char* value;
    int lineno = 0;
    /* Line number of the first error, 0 while none has occurred. */
    int error = 0;

#if !INI_USE_STACK
    line = (char*)malloc(INI_MAX_LINE);
    if (!line) {
        return -2;
    }
#endif

    /* Scan through stream line by line */
    while (reader(line, INI_MAX_LINE, stream) != NULL) {
        lineno++;

        start = line;
#if INI_ALLOW_BOM
        /* Skip a UTF-8 byte-order mark (EF BB BF) on the first line. */
        if (lineno == 1 && (unsigned char)start[0] == 0xEF &&
                           (unsigned char)start[1] == 0xBB &&
                           (unsigned char)start[2] == 0xBF) {
            start += 3;
        }
#endif
        /* Trim trailing whitespace in place, then skip leading whitespace;
           "start > line" below therefore means the line was indented. */
        start = lskip(rstrip(start));

        if (*start == ';' || *start == '#') {
            /* Per Python ConfigParser, allow '#' comments at start of line */
        }
#if INI_ALLOW_MULTILINE
        else if (*prev_name && *start && start > line) {
            /* Non-blank line with leading whitespace, treat as continuation
               of previous name's value (as per Python ConfigParser). */
            if (!handler(user, section, prev_name, start) && !error)
                error = lineno;
        }
#endif
        else if (*start == '[') {
            /* A "[section]" line */
            end = find_char_or_comment(start + 1, ']');
            if (*end == ']') {
                *end = '\0';
                strncpy0(section, start + 1, sizeof(section));
                /* A new section ends any pending multi-line value. */
                *prev_name = '\0';
            }
            else if (!error) {
                /* No ']' found on section line */
                error = lineno;
            }
        }
        else if (*start && *start != ';') {
            /* Not a comment, must be a name[=:]value pair */
            end = find_char_or_comment(start, '=');
            if (*end != '=') {
                /* No '=' before the end/comment: retry with ':'. */
                end = find_char_or_comment(start, ':');
            }
            if (*end == '=' || *end == ':') {
                /* Split the line at the separator into name and value. */
                *end = '\0';
                name = rstrip(start);
                value = lskip(end + 1);
                /* Searching for '\0' finds either the end of the value or
                   a whitespace-prefixed ';' comment, which is cut off. */
                end = find_char_or_comment(value, '\0');
                if (*end == ';')
                    *end = '\0';
                rstrip(value);

                /* Valid name[=:]value pair found, call handler */
                strncpy0(prev_name, name, sizeof(prev_name));
                if (!handler(user, section, name, value) && !error)
                    error = lineno;
            }
            else if (!error) {
                /* No '=' or ':' found on name[=:]value line */
                error = lineno;
            }
        }

#if INI_STOP_ON_FIRST_ERROR
        if (error)
            break;
#endif
    }

#if !INI_USE_STACK
    free(line);
#endif

    return error;
}

/* See documentation in header file. */
/* Parses an already opened FILE* by passing fgets (cast to ini_reader) as
   the line reader of ini_parse_stream(); the file is not closed here.
   Returns whatever ini_parse_stream() returns. */
int ini_parse_file(FILE* file, ini_handler handler, void* user)
{
    return ini_parse_stream((ini_reader)fgets, file, handler, user);
}

/* See documentation in header file.
*/
/* Opens `filename` for reading in text mode, parses it with
   ini_parse_file() and closes it again.
   Returns -1 when the file cannot be opened, otherwise the value returned
   by ini_parse_file(). */
int ini_parse(const char* filename, ini_handler handler, void* user)
{
    FILE* file;
    int error;

    file = fopen(filename, "r");
    if (!file)
        return -1;
    error = ini_parse_file(file, handler, user);
    /* The file is closed whether or not parsing reported an error. */
    fclose(file);
    return error;
}
HINGE-0.5.0/src/lib/kmer_lookup.c000077500000000000000000000434661314415550300164170ustar00rootroot00000000000000/* * ===================================================================================== * * Filename: kmer_count.c * * Description: * * Version: 0.1 * Created: 07/20/2013 17:00:00 * Revision: none * Compiler: gcc * * Author: Jason Chin, * Company: * * ===================================================================================== #################################################################################$$ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted (subject to the limitations in the # disclaimer below) provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # * Neither the name of Pacific Biosciences nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED.
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. #################################################################################$$ */ #include #include #include #include "common.h" const unsigned int KMERMATCHINC = 10000; int compare_seq_coor(const void * a, const void * b) { const seq_coor_t * arg1 = a; const seq_coor_t * arg2 = b; return (* arg1) - (* arg2); } kmer_lookup * allocate_kmer_lookup ( seq_coor_t size ) { kmer_lookup * kl; //printf("%lu is allocated for kmer lookup\n", size); kl = (kmer_lookup *) malloc( size * sizeof(kmer_lookup) ); init_kmer_lookup( kl, size); return kl; } void init_kmer_lookup ( kmer_lookup * kl, seq_coor_t size ) { seq_coor_t i; //printf("%lu is allocated for kmer lookup\n", size); for (i=0; i threshold) { kl[i].start = INT_MAX; kl[i].last = INT_MAX; //kl[i].count = 0; } } } kmer_match * find_kmer_pos_for_seq( char * seq, seq_coor_t seq_len, unsigned int K, seq_addr_array sda, kmer_lookup * lk) { seq_coor_t i; seq_coor_t kmer_bv; seq_coor_t kmer_mask; seq_coor_t kmer_pos; seq_coor_t next_kmer_pos; unsigned int half_K; seq_coor_t kmer_match_rtn_allocation_size = KMERMATCHINC; kmer_match * kmer_match_rtn; base * sa; kmer_match_rtn = (kmer_match *) malloc( sizeof(kmer_match) ); kmer_match_rtn->count = 0; kmer_match_rtn->query_pos = (seq_coor_t *) calloc( kmer_match_rtn_allocation_size, sizeof( seq_coor_t ) ); kmer_match_rtn->target_pos = (seq_coor_t *) calloc( kmer_match_rtn_allocation_size, sizeof( seq_coor_t ) ); sa = calloc( seq_len, sizeof(base) ); 
kmer_mask = 0; for (i = 0; i < K; i++) { kmer_mask <<= 2; kmer_mask |= 0x00000003; } for (i = 0; i < seq_len; i++) { switch ( seq[i] ) { case 'A': sa[ i ] = 0; break; case 'C': sa[ i ] = 1; break; case 'G': sa[ i ] = 2; break; case 'T': sa[ i ] = 3; } } kmer_bv = get_kmer_bitvector(sa, K); half_K = K >> 1; for (i = 0; i < seq_len - K; i += half_K) { kmer_bv = get_kmer_bitvector(sa + i, K); if (lk[kmer_bv].start == INT_MAX) { //for high count k-mers continue; } kmer_pos = lk[ kmer_bv ].start; next_kmer_pos = sda[ kmer_pos ]; kmer_match_rtn->query_pos[ kmer_match_rtn->count ] = i; kmer_match_rtn->target_pos[ kmer_match_rtn->count ] = kmer_pos; kmer_match_rtn->count += 1; if (kmer_match_rtn->count > kmer_match_rtn_allocation_size - 1000) { kmer_match_rtn_allocation_size += KMERMATCHINC; kmer_match_rtn->query_pos = (seq_coor_t *) realloc( kmer_match_rtn->query_pos, kmer_match_rtn_allocation_size * sizeof(seq_coor_t) ); kmer_match_rtn->target_pos = (seq_coor_t *) realloc( kmer_match_rtn->target_pos, kmer_match_rtn_allocation_size * sizeof(seq_coor_t) ); } while ( next_kmer_pos > kmer_pos ){ kmer_pos = next_kmer_pos; next_kmer_pos = sda[ kmer_pos ]; kmer_match_rtn->query_pos[ kmer_match_rtn->count ] = i; kmer_match_rtn->target_pos[ kmer_match_rtn->count ] = kmer_pos; kmer_match_rtn->count += 1; if (kmer_match_rtn->count > kmer_match_rtn_allocation_size - 1000) { kmer_match_rtn_allocation_size += KMERMATCHINC; kmer_match_rtn->query_pos = (seq_coor_t *) realloc( kmer_match_rtn->query_pos, kmer_match_rtn_allocation_size * sizeof(seq_coor_t) ); kmer_match_rtn->target_pos = (seq_coor_t *) realloc( kmer_match_rtn->target_pos, kmer_match_rtn_allocation_size * sizeof(seq_coor_t) ); } } } free(sa); return kmer_match_rtn; } void free_kmer_match( kmer_match * ptr) { free(ptr->query_pos); free(ptr->target_pos); free(ptr); } aln_range* find_best_aln_range(kmer_match * km_ptr, seq_coor_t K, seq_coor_t bin_size, seq_coor_t count_th) { seq_coor_t i; seq_coor_t j; seq_coor_t q_min, 
q_max, t_min, t_max; seq_coor_t * d_count; seq_coor_t * q_coor; seq_coor_t * t_coor; aln_range * arange; long int d, d_min, d_max; long int cur_score; long int max_score; long int max_k_mer_count; long int max_k_mer_bin; seq_coor_t cur_start; arange = calloc(1 , sizeof(aln_range)); q_min = INT_MAX; q_max = 0; t_min = INT_MAX; t_max = 0; d_min = INT_MAX; d_max = LONG_MIN; for (i = 0; i < km_ptr->count; i++ ) { if ( km_ptr -> query_pos[i] < q_min) { q_min = km_ptr->query_pos[i]; } if ( km_ptr -> query_pos[i] > q_max) { q_max = km_ptr->query_pos[i]; } if ( km_ptr -> target_pos[i] < t_min) { t_min = km_ptr->target_pos[i]; } if ( km_ptr -> query_pos[i] > t_max) { t_max = km_ptr->target_pos[i]; } d = (long int) km_ptr->query_pos[i] - (long int) km_ptr->target_pos[i]; if ( d < d_min ) { d_min = d; } if ( d > d_max ) { d_max = d; } } //printf("%lu %ld %ld\n" , km_ptr->count, d_min, d_max); d_count = calloc( (d_max - d_min)/bin_size + 1, sizeof(seq_coor_t) ); q_coor = calloc( km_ptr->count, sizeof(seq_coor_t) ); t_coor = calloc( km_ptr->count, sizeof(seq_coor_t) ); for (i = 0; i < km_ptr->count; i++ ) { d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]); d_count[ (d - d_min)/ (long int) bin_size ] += 1; q_coor[i] = INT_MAX; t_coor[i] = INT_MAX; } j = 0; max_k_mer_count = 0; max_k_mer_bin = INT_MAX; for (i = 0; i < km_ptr->count; i++ ) { d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]); if ( d_count[ (d - d_min)/ (long int) bin_size ] > max_k_mer_count) { max_k_mer_count = d_count[ (d - d_min)/ (long int) bin_size ]; max_k_mer_bin = (d - d_min)/ (long int) bin_size; } } //printf("k_mer: %lu %lu\n" , max_k_mer_count, max_k_mer_bin); if ( max_k_mer_bin != INT_MAX && max_k_mer_count > count_th ) { for (i = 0; i < km_ptr->count; i++ ) { d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]); if ( abs( ( (d - d_min)/ (long int) bin_size ) - max_k_mer_bin ) > 5 ) { continue; } if (d_count[ (d - d_min)/ (long 
int) bin_size ] > count_th) { q_coor[j] = km_ptr->query_pos[i]; t_coor[j] = km_ptr->target_pos[i]; //printf("d_count: %lu %lu\n" ,i, d_count[(d - d_min)/ (long int) bin_size]); //printf("coor: %lu %lu\n" , q_coor[j], t_coor[j]); j ++; } } } if (j > 1) { arange->s1 = q_coor[0]; arange->e1 = q_coor[0]; arange->s2 = t_coor[0]; arange->e2 = t_coor[0]; arange->score = 0; max_score = 0; cur_score = 0; cur_start = 0; for (i = 1; i < j; i++) { cur_score += 32 - (q_coor[i] - q_coor[i-1]); //printf("deltaD, %lu %ld\n", q_coor[i] - q_coor[i-1], cur_score); if (cur_score < 0) { cur_score = 0; cur_start = i; } else if (cur_score > max_score) { arange->s1 = q_coor[cur_start]; arange->s2 = t_coor[cur_start]; arange->e1 = q_coor[i]; arange->e2 = t_coor[i]; max_score = cur_score; arange->score = max_score; //printf("%lu %lu %lu %lu\n", arange.s1, arange.e1, arange.s2, arange.e2); } } } else { arange->s1 = 0; arange->e1 = 0; arange->s2 = 0; arange->e2 = 0; arange->score = 0; } // printf("free\n"); free(d_count); free(q_coor); free(t_coor); return arange; } aln_range* find_best_aln_range2(kmer_match * km_ptr, seq_coor_t K, seq_coor_t bin_width, seq_coor_t count_th) { seq_coor_t * d_coor; seq_coor_t * hit_score; seq_coor_t * hit_count; seq_coor_t * last_hit; seq_coor_t max_q, max_t; seq_coor_t s, e, max_s, max_e, max_span, d_s, d_e, delta, d_len; seq_coor_t px, py, cx, cy; seq_coor_t max_hit_idx; seq_coor_t max_hit_score, max_hit_count; seq_coor_t i, j; seq_coor_t candidate_idx, max_d, d; aln_range * arange; arange = calloc(1 , sizeof(aln_range)); d_coor = calloc( km_ptr->count, sizeof(seq_coor_t) ); max_q = -1; max_t = -1; for (i = 0; i < km_ptr->count; i++ ) { d_coor[i] = km_ptr->query_pos[i] - km_ptr->target_pos[i]; max_q = max_q > km_ptr->query_pos[i] ? max_q : km_ptr->query_pos[i]; max_t = max_t > km_ptr->target_pos[i] ? 
max_q : km_ptr->target_pos[i]; } qsort(d_coor, km_ptr->count, sizeof(seq_coor_t), compare_seq_coor); s = 0; e = 0; max_s = -1; max_e = -1; max_span = -1; delta = (long int) ( 0.05 * ( max_q + max_t ) ); d_len = km_ptr->count; d_s = -1; d_e = -1; while (1) { d_s = d_coor[s]; d_e = d_coor[e]; while (d_e < d_s + delta && e < d_len-1) { e += 1; d_e = d_coor[e]; } if ( max_span == -1 || e - s > max_span ) { max_span = e - s; max_s = s; max_e = e; } s += 1; if (s == d_len || e == d_len) { break; } } if (max_s == -1 || max_e == -1 || max_e - max_s < 32) { arange->s1 = 0; arange->e1 = 0; arange->s2 = 0; arange->e2 = 0; arange->score = 0; free(d_coor); return arange; } last_hit = calloc( km_ptr->count, sizeof(seq_coor_t) ); hit_score = calloc( km_ptr->count, sizeof(seq_coor_t) ); hit_count = calloc( km_ptr->count, sizeof(seq_coor_t) ); for (i = 0; i < km_ptr->count; i++ ) { last_hit[i] = -1; hit_score[i] = 0; hit_count[i] = 0; } max_hit_idx = -1; max_hit_score = 0; for (i = 0; i < km_ptr->count; i ++) { cx = km_ptr->query_pos[i]; cy = km_ptr->target_pos[i]; d = cx - cy; if ( d < d_coor[max_s] || d > d_coor[max_e] ) continue; j = i - 1; candidate_idx = -1; max_d = 65535; while (1) { if ( j < 0 ) break; px = km_ptr->query_pos[j]; py = km_ptr->target_pos[j]; d = px - py; if ( d < d_coor[max_s] || d > d_coor[max_e] ) { j--; continue; } if (cx - px > 320) break; //the number here controling how big alignment gap to be considered if (cy > py && cx - px + cy - py < max_d && cy - py <= 320 ) { max_d = cx - px + cy - py; candidate_idx = j; } j--; } if (candidate_idx != -1) { last_hit[i] = candidate_idx; hit_score[i] = hit_score[candidate_idx] + (64 - max_d); hit_count[i] = hit_count[candidate_idx] + 1; if (hit_score[i] < 0) { hit_score[i] = 0; hit_count[i] = 0; } } else { hit_score[i] = 0; hit_count[i] = 0; } if (hit_score[i] > max_hit_score) { max_hit_score = hit_score[i]; max_hit_count = hit_count[i]; max_hit_idx = i; } } if (max_hit_idx == -1) { arange->s1 = 0; arange->e1 = 0; 
arange->s2 = 0; arange->e2 = 0; arange->score = 0; free(d_coor); free(last_hit); free(hit_score); free(hit_count); return arange; } arange->score = max_hit_count + 1; arange->e1 = km_ptr->query_pos[max_hit_idx]; arange->e2 = km_ptr->target_pos[max_hit_idx]; i = max_hit_idx; while (last_hit[i] != -1) { i = last_hit[i]; } arange->s1 = km_ptr->query_pos[i]; arange->s2 = km_ptr->target_pos[i]; free(d_coor); free(last_hit); free(hit_score); free(hit_count); return arange; } void free_aln_range( aln_range * arange) { free(arange); } HINGE-0.5.0/src/lib/paf.c000066400000000000000000000054141314415550300146220ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include "paf.h" #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 0x10000) paf_file_t *paf_open(const char *fn) { kstream_t *ks; gzFile fp; paf_file_t *pf; fp = fn && strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) return 0; ks = ks_init(fp); pf = (paf_file_t*)calloc(1, sizeof(paf_file_t)); pf->fp = ks; return pf; } int paf_close(paf_file_t *pf) { kstream_t *ks; if (pf == 0) return 0; free(pf->buf.s); ks = (kstream_t*)pf->fp; gzclose(ks->f); ks_destroy(ks); free(pf); return 0; } int paf_parse(int l, char *s, paf_rec_t *pr) // s must be NULL terminated { // on return: <0 for failure; 0 for success; >0 for filtered char *q, *r; int i, t; for (i = t = 0, q = s; i <= l; ++i) { if (i < l && s[i] != '\t') continue; s[i] = 0; if (t == 0) pr->qn = q; else if (t == 1) pr->ql = strtol(q, &r, 10); else if (t == 2) pr->qs = strtol(q, &r, 10); else if (t == 3) pr->qe = strtol(q, &r, 10); else if (t == 4) pr->rev = (*q == '-'); else if (t == 5) pr->tn = q; else if (t == 6) pr->tl = strtol(q, &r, 10); else if (t == 7) pr->ts = strtol(q, &r, 10); else if (t == 8) pr->te = strtol(q, &r, 10); else if (t == 9) pr->ml = strtol(q, &r, 10); else if (t == 10) pr->bl = strtol(q, &r, 10); ++t, q = i < l? 
&s[i+1] : 0; } if (t < 10) return -1; return 0; } int paf_read(paf_file_t *pf, paf_rec_t *r) { int ret, dret; file_read_more: ret = ks_getuntil((kstream_t*)pf->fp, KS_SEP_LINE, &pf->buf, &dret); if (ret < 0) return ret; ret = paf_parse(pf->buf.l, pf->buf.s, r); if (ret < 0) goto file_read_more; return ret; } HINGE-0.5.0/src/maximal/000077500000000000000000000000001314415550300145665ustar00rootroot00000000000000HINGE-0.5.0/src/maximal/CMakeLists.txt000066400000000000000000000003121314415550300173220ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.2) add_executable(get_maximal_reads maximal) target_link_libraries(get_maximal_reads LAInterface ini spdlog) install(TARGETS get_maximal_reads DESTINATION ${libexec}) HINGE-0.5.0/src/maximal/maximal.cpp000066400000000000000000001047761314415550300167410ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "INIReader.h" #include "spdlog/spdlog.h" #include "DB.h" #include "align.h" #include "LAInterface.h" #include "cmdline.h" std::string lastN(std::string input, int n) { return input.substr(input.size() - n); } inline std::vector glob(const std::string& pat){ using namespace std; glob_t glob_result; int i = 1; std::string search_name; search_name = pat + "."+std::to_string(i)+".las"; std::cout << search_name << endl; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); // std::cout << "Number of files " << glob_result.gl_pathc << std::endl; vector ret; while (glob_result.gl_pathc != 0){ ret.push_back(string(glob_result.gl_pathv[0])); i ++; search_name = pat + "."+std::to_string(i)+".las"; glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result); // std::cout << "Number of files " << glob_result.gl_pathc << std::endl; } std::cout << "-------------------------"<< std::endl; std::cout << "Number of files " << i-1 << std::endl; std::cout << "Input string " << pat.c_str() 
<< std::endl; std::cout << "-------------------------"<< std::endl; globfree(&glob_result); return ret; } bool ProcessAlignment(LOverlap * match, Read * read_A, Read * read_B, int ALN_THRESHOLD, int THETA, int THETA2, bool trim){ //Function takes as input pointers to a match, and the read_A and read_B of that match, set constants //ALN_THRESHOLD and THETA //It inputs the effective read start and end into the match class object //Next it trims match //Finally it figures out the type of match we have here by calling AddTypesAsymmetric() on the //class object //std::cout<<" In ProcessAlignment"<eff_read_A_read_start_ = read_A->effective_start; match->eff_read_A_read_end_ = read_A->effective_end; // removed the following if, so that things agree with the convention for reverse complement matches match->eff_read_B_read_start_ = read_B->effective_start; match->eff_read_B_read_end_ = read_B->effective_end; // if (match->reverse_complement_match_ == 0) { // match->eff_read_B_read_start_ = read_B->effective_start; // match->eff_read_B_read_end_ = read_B->effective_end; // } else { // match->eff_read_B_read_start_ = read_B->len - read_B->effective_end; // match->eff_read_B_read_end_ = read_B->len - read_B->effective_start; // } /*printf("bef %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_A_id_, match->read_B_id_, * match->reverse_complement_match_, match->read_A_match_start_, match->read_A_match_end_, match->read_B_match_start_, match->read_B_match_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_ );*/ if (trim) match->trim_overlap(); else { match->eff_read_B_match_start_ = match->read_B_match_start_; match->eff_read_B_match_end_ = match->read_B_match_end_; match->eff_read_A_match_start_ = match->read_A_match_start_; match->eff_read_A_match_end_ = match->read_A_match_end_; } /*printf("aft %d %d %d [%d %d] [%d %d] [%d %d] [%d %d]\n", match->read_A_id_, match->read_B_id_, * 
match->reverse_complement_match_, match->eff_read_A_match_start_, match->eff_read_A_match_end_, match->eff_read_B_match_start_, match->eff_read_B_match_end_, match->eff_read_A_read_start_, match->eff_read_A_read_end_, match->eff_read_B_read_start_, match->eff_read_B_read_end_ );*/ //std::cout<< contained<eff_read_B_match_end_ - match->eff_read_B_match_start_) < ALN_THRESHOLD) or ((match->eff_read_A_match_end_ - match->eff_read_A_match_start_) < ALN_THRESHOLD) or (!match->active)) { match->active = false; match->match_type_ = NOT_ACTIVE; } else { match->AddTypesAsymmetric(THETA,THETA2); if (match->match_type_ == BCOVERA) { contained = true; } //std::cout<< contained<< std::endl; } match->weight = match->eff_read_A_match_end_ - match->eff_read_A_match_start_ + match->eff_read_B_match_end_ - match->eff_read_B_match_start_; match->length = match->read_A_match_end_ - match->read_A_match_start_ + match->read_B_match_end_ - match->read_B_match_start_; return contained; } std::vector> Merge(std::vector & intervals, int cutoff) //Returns sections of read a which are covered by overlaps. Each overlap is considered as // . { //std::cout<<"Merge"< > ret; int n = intervals.size(); // Length of the vector intervals if (n == 0) return ret; if(n == 1) { ret.push_back(std::pair(intervals[0]->read_A_match_start_, intervals[0]->read_A_match_end_)); return ret; } //Where is sort defined ? Is this std::sort? sort(intervals.begin(),intervals.end(),compare_overlap_abpos); //sort according to left (start position of // overlap beginning on a) int left= intervals[0]->read_A_match_start_ + cutoff, right = intervals[0]->read_A_match_end_ - cutoff; //left, right means maximal possible interval now for(int i = 1; i < n; i++) { //Ovl1 ~ Ovl2 if Ovl1 and Ovl2 have a nonzero intersection. (that is both the b read maps // to the same position on the a read) //This defines a chain of connected overlaps. 
This for loop returns a a vector ret which // is a pair of if(intervals[i]->read_A_match_start_ + cutoff <= right) { right=std::max(right, intervals[i]->read_A_match_end_ - cutoff); } else { ret.push_back(std::pair(left,right)); left = intervals[i]->read_A_match_start_ + cutoff; right = intervals[i]->read_A_match_end_ - cutoff; } } ret.push_back(std::pair(left,right)); return ret; } //Interval = pair. Defined in LAInterface.h Interval Effective_length(std::vector & intervals, int min_cov) { //Returns //start_pos : the first position at which Read a of the overlaps have at least min_cov matches on it. //end_pos : the last position that the (#overlaps- min_cov)th read (in order of start positions ends). //Should compare_overlap_aepos actually compare read_A_match_end_? If that is done, then the end_pos // will be the last position // on the a read so that all positions beyond have less than min_cov matches on them Interval ret; sort(intervals.begin(),intervals.end(),compare_overlap_abpos); //sort according to left if (intervals.size() > min_cov) { ret.first = intervals[min_cov]->read_A_match_start_; } else ret.first = 0; sort(intervals.begin(),intervals.end(),compare_overlap_aepos); //sort according to left if (intervals.size() > min_cov) { ret.second = intervals[min_cov]->read_A_match_end_; } else ret.second = 0; return ret; } bool bridge(LOverlap* ovl, int s, int e){ //Returns True if [s e] on read a is bridged by ovl. False else. //Put 500 in a typedef perhaps? 
return ((ovl->read_A_match_start_ < s - 500) and (ovl->read_A_match_end_ > e + 500)); } float number_of_bridging_reads(std::vector ovl_reads, int hinge_location, int hinge_type,int threshold){ int num_bridging_reads=0; //int threshold=100; std::vector read_ends; if (hinge_type==1){ for (int i=0; i < ovl_reads.size(); i++){ if ((ovl_reads[i]->read_A_match_start_ > hinge_location-threshold ) and (ovl_reads[i]->read_A_match_start_ < hinge_location+threshold )) read_ends.push_back(ovl_reads[i]->read_A_match_end_); } } else if (hinge_type==-1){ for (int i=0; i < ovl_reads.size(); i++){ if ((ovl_reads[i]->read_A_match_end_ > hinge_location-threshold ) and (ovl_reads[i]->read_A_match_end_ < hinge_location+threshold )) read_ends.push_back(ovl_reads[i]->read_A_match_start_); } } std::sort(read_ends.begin(),read_ends.end(), std::greater()); int start_point=0; int num_bins=0; for (int i=0; i 2 * threshold) { num_bins++; start_point = i; } } return num_bins/((float)1); } int main(int argc, char *argv[]) { mkdir("log",S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); cmdline::parser cmdp; cmdp.add("db", 'b', "db file name", false, ""); cmdp.add("las", 'l', "las file name", false, ""); cmdp.add("paf", 'p', "paf file name", false, ""); cmdp.add("config", 'c', "configuration file name", false, ""); cmdp.add("fasta", 'f', "fasta file name", false, ""); cmdp.add("prefix", 'x', "prefix of (intermediate) output", false, "out"); cmdp.add("restrictreads",'r',"restrict to reads in the file",false,""); cmdp.add("log", 'g', "log folder name", false, "log"); cmdp.add("mlas", '\0', "multiple las files"); cmdp.add("debug", '\0', "debug mode"); cmdp.parse_check(argc, argv); LAInterface la; const char * name_db = cmdp.get("db").c_str(); //.db file of reads to load const char * name_las_base = cmdp.get("las").c_str();//.las file of alignments const char * name_paf = cmdp.get("paf").c_str(); const char * name_fasta = cmdp.get("fasta").c_str(); const char * name_config = cmdp.get("config").c_str();//name of 
the configuration file, in INI format std::string out = cmdp.get("prefix"); bool has_qv = true; const char * name_restrict = cmdp.get("restrictreads").c_str(); std::string name_mask = out + ".mas"; namespace spd = spdlog; //auto console = spd::stdout_logger_mt("console",true); std::vector sinks; sinks.push_back(std::make_shared()); sinks.push_back(std::make_shared(cmdp.get("log") + "/log", "txt", 23, 59)); auto console = std::make_shared("log", begin(sinks), end(sinks)); spdlog::register_logger(console); //auto console = std::make_shared("name", begin(sinks), end(sinks)); console->info("Getting maximal reads"); bool db_and_las, db_or_las, fa_and_paf, fa_or_paf; db_and_las = (strlen(name_db) > 0) and (strlen(name_las_base) > 0); db_or_las = (strlen(name_db) > 0) or (strlen(name_las_base) > 0); fa_and_paf = (strlen(name_fasta) > 0) and (strlen(name_paf) > 0); fa_or_paf = (strlen(name_fasta) > 0) or (strlen(name_paf) > 0); if (db_or_las and fa_or_paf){ console->error("Pass in either a db and a las or a fasta and a paf"); return 1; } if (( not fa_and_paf) and (not db_and_las)){ console->error("Pass in at least one of the following two combinations: a db and a las or a fasta and a paf"); return 1; } std::string name_las_string; if (cmdp.exist("mlas")) { if (not db_and_las){ console->error("--mlas works only with db and las"); return 1; } name_las_string = std::string(name_las_base); } else if (strlen(name_las_base) > 0) { if (lastN(std::string(name_las_base), 4) == ".las") name_las_string = std::string(name_las_base); else name_las_string = std::string(name_las_base) + ".las"; } const char * name_las = name_las_string.c_str(); /** * There are two sets of input, the first is db+las, which corresponds to daligner as an overlapper, * the other is fasta + paf, which corresponds to minimap as an overlapper. 
*/ console->info("name of db: {}, name of .las file {}", name_db, name_las); console->info("name of fasta: {}, name of .paf file {}", name_fasta, name_paf); std::ifstream ini_file(name_config); std::string str((std::istreambuf_iterator(ini_file)), std::istreambuf_iterator()); console->info("Parameters passed in \n{}", str); if (strlen(name_db) > 0) la.openDB(name_db); std::vector name_las_list; std::string name_las_str(name_las); console->info("Las files: {}", name_las_str); if (cmdp.exist("mlas")) { console->info("Calling glob."); name_las_list = glob(name_las_str); } else name_las_list.push_back(name_las_str); int n_read; if (strlen(name_db) > 0) n_read = la.getReadNumber(); std::vector reads; //Vector of pointers to all reads if (strlen(name_fasta) > 0) { n_read = la.loadFASTA(name_fasta,reads); has_qv = false; } console->info("# Reads: {}", n_read); // output some statistics std::vector> QV; if (strlen(name_db) > 0) { la.getRead(reads,0,n_read); if (la.getQV(QV,0,n_read) != 0) // load QV track from .db file has_qv = false; } if (has_qv) for (int i = 0; i < n_read; i++) { for (int j = 0; j < QV[i].size(); j++) QV[i][j] = int(QV[i][j] < 40); } //Binarize QV vector, 40 is the threshold std::set reads_to_keep, reads_to_keep_initial; char * line = NULL; size_t len = 0; if (strlen(name_restrict) > 0){ FILE * restrict_reads; restrict_reads = fopen(name_restrict, "r"); while (getline(&line, &len, restrict_reads) != -1){ std::stringstream ss; ss.clear(); ss << line; int num; ss >> num; reads_to_keep.insert(num); } fclose(restrict_reads); console->info("Reads to debug loaded from: {}", name_restrict); console->info("Number of reads to debug loaded: {}", reads_to_keep.size()); } else console->info("No debug restrictions."); if (strlen(name_las_list[0].c_str()) > 0) la.openAlignmentFile(name_las_list[0]); // get tspace std::vector > QV_mask(n_read); // QV_mask is the mask based on QV for reads, for each read, it has one pair [start, end] if (has_qv) { for (int i = 0; i < 
n_read; i++) { int s = 0, e = 0; int max = 0, maxs = s, maxe = e; for (int j = 0; j < QV[i].size(); j++) { if ((QV[i][j] == 1) and (j max) { maxe = e ; maxs = s; max = e - s; } s = j+1; e = j+1; } } // get the longest consecutive region that has good QV //printf("maxs %d maxe %d size%d\n",maxs, maxe,QV[i].size()); QV_mask[i] = (std::pair(maxs*la.tspace, maxe*la.tspace)); // tspace the the interval of trace points // create mask by QV } } INIReader reader(name_config); if (reader.ParseError() < 0) { console->warn("Can't load {}", name_config); return 1; } int LENGTH_THRESHOLD = reader.GetInteger("filter", "length_threshold", -1); double QUALITY_THRESHOLD = reader.GetReal("filter", "quality_threshold", 0.0); int N_ITER = reader.GetInteger("filter", "n_iter", -1); int ALN_THRESHOLD = reader.GetInteger("filter", "aln_threshold", -1); int MIN_COV = reader.GetInteger("filter", "min_cov", -1); int CUT_OFF = reader.GetInteger("filter", "cut_off", -1); int THETA = reader.GetInteger("filter", "theta", -1); int THETA2 = (int) reader.GetInteger("filter", "theta2", 0); int N_PROC = reader.GetInteger("running", "n_proc", 4); int EST_COV = reader.GetInteger("filter", "ec", 0); // load the estimated coverage (probably from other programs) from ini file, if it is zero, then estimate it int reso = 40; // resolution of masks, repeat annotation, coverage, etc = 40 basepairs bool use_qv_mask = reader.GetBoolean("filter", "use_qv", true); bool use_coverage_mask = reader.GetBoolean("filter", "coverage", true); int COVERAGE_FRACTION = (int) reader.GetInteger("filter", "coverage_frac_repeat_annotation", 3); const int MIN_REPEAT_ANNOTATION_THRESHOLD = (int) reader.GetInteger("filter", "min_repeat_annotation_threshold", 10); const int MAX_REPEAT_ANNOTATION_THRESHOLD = (int) reader.GetInteger("filter", "max_repeat_annotation_threshold", 20); const int REPEAT_ANNOTATION_GAP_THRESHOLD = (int) reader.GetInteger("filter", "repeat_annotation_gap_threshold",300); //How far two hinges of the same 
type can be const int NO_HINGE_REGION = (int) reader.GetInteger("filter", "no_hinge_region",500); const int HINGE_MIN_SUPPORT = (int) reader.GetInteger("filter", "hinge_min_support", 7); //Minimum number of reads that have to start in a reso length interval to be considered in hinge calling const int HINGE_BIN_PILEUP_THRESHOLD = (int) reader.GetInteger("filter", "hinge_min_pileup", 7); //Minimum number of reads to have in a pileup to consider a hinge bridged const int HINGE_READ_UNBRIDGED_THRESHOLD = (int) reader.GetInteger("filter", "hinge_unbridged", 6); //Number of reads that one has to see before a pileup to declare a potential hinge unbridged int HINGE_BIN_LENGTH = (int) reader.GetInteger("filter", "hinge_bin", 100); //Physical length of the bins considered const int HINGE_TOLERANCE_LENGTH = (int) reader.GetInteger("filter", "hinge_tolerance_length", 100); bool USE_TWO_MATCHES = (int) reader.GetInteger("layout", "use_two_matches", 1); //Reads starting at +/- HINGE_TOLERANCE_LENGTH are considered reads starting at hinges HINGE_BIN_LENGTH=2*HINGE_TOLERANCE_LENGTH; console->info("use_qv_mask set to {}",use_qv_mask); use_qv_mask = use_qv_mask and has_qv; console->info("use_qv_mask set to {}",use_qv_mask); omp_set_num_threads(N_PROC); console->info("number processes set to {}", N_PROC); console->info("LENGTH_THRESHOLD = {}",LENGTH_THRESHOLD); console->info("QUALITY_THRESHOLD = {}",QUALITY_THRESHOLD); console->info("N_ITER = {}",N_ITER); console->info("ALN_THRESHOLD = {}",ALN_THRESHOLD); console->info("MIN_COV = {}",MIN_COV); console->info("CUT_OFF = {}",CUT_OFF); console->info("THETA = {}",THETA); console->info("EST_COV = {}",EST_COV); console->info("reso = {}",reso); console->info("use_coverage_mask = {}",use_coverage_mask); console->info("COVERAGE_FRACTION = {}",COVERAGE_FRACTION); console->info("MIN_REPEAT_ANNOTATION_THRESHOLD = {}",MIN_REPEAT_ANNOTATION_THRESHOLD); console->info("MAX_REPEAT_ANNOTATION_THRESHOLD = {}",MAX_REPEAT_ANNOTATION_THRESHOLD); 
console->info("REPEAT_ANNOTATION_GAP_THRESHOLD = {}",REPEAT_ANNOTATION_GAP_THRESHOLD); console->info("NO_HINGE_REGION = {}",NO_HINGE_REGION); console->info("HINGE_MIN_SUPPORT = {}",HINGE_MIN_SUPPORT); console->info("HINGE_BIN_PILEUP_THRESHOLD = {}",HINGE_BIN_PILEUP_THRESHOLD); console->info("HINGE_READ_UNBRIDGED_THRESHOLD = {}",HINGE_READ_UNBRIDGED_THRESHOLD); console->info("HINGE_BIN_LENGTH = {}",HINGE_BIN_LENGTH); console->info("HINGE_TOLERANCE_LENGTH = {}",HINGE_TOLERANCE_LENGTH); std::vector aln;//Vector of pointers to all alignments std::vector< std::vector > > coverages(n_read); std::vector< std::vector > > cutoff_coverages(n_read); std::vector< std::vector > > cgs(n_read); //coverage gradient; std::vector> maskvec; std::vector > > repeat_annotation; std::unordered_map> > hinges; std::ofstream cov(out + ".coverage.txt"); std::ofstream homo(out + ".homologous.txt"); std::ofstream filtered(out + ".filtered.fasta"); std::ofstream contained_out(out + ".contained.txt"); std::ofstream maximal_reads(out + ".max"); FILE *mask_file; mask_file = fopen(name_mask.c_str(), "r"); int read, rs, re; while (fscanf(mask_file, "%d %d %d", &read, &rs, &re) != EOF) { reads[read]->effective_start = rs; reads[read]->effective_end = re; } console->info("read mask finished"); int num_active_read = 0; for (int i = 0; i < n_read; i++) { if (reads[i]->active) num_active_read++; } console->info("active reads at start: {}", num_active_read); num_active_read = 0; for (int i = 0; i < n_read; i++) { if (reads[i]->effective_end - reads[i]->effective_start < LENGTH_THRESHOLD) { reads[i]->active = false; } else num_active_read++; } console->info("active reads after correcting for read lengths: {}", num_active_read); int number_of_parts; if (strlen(name_las) > 0) number_of_parts = name_las_list.size(); else if(strlen(name_paf) > 0) number_of_parts = 1; else { console->error("Need to provide either las and db or paf and fasta"); return 1; } console->info("number of las files: {}", 
number_of_parts); for (int part = 0; part < name_las_list.size(); part++) { console->info("name of las: {}", name_las_list[part]); int64 n_aln = 0; if(strlen(name_las_base)> 0) { if (strlen(name_las_list[part].c_str()) > 0) la.openAlignmentFile(name_las_list[part]); if (strlen(name_las_list[part].c_str()) > 0) { n_aln = la.getAlignmentNumber(); console->info("Load alignments from {}", name_las_list[part]); console->info("# Alignments: {}", n_aln); } if (strlen(name_las_list[part].c_str()) > 0) { la.resetAlignment(); la.getOverlap(aln, 0, n_read); } } if (strlen(name_paf) > 0) { n_aln = la.loadPAF(std::string(name_paf), aln); console->info("Load alignments from {}", name_paf); console->info("# Alignments: {}", n_aln); } if (n_aln == 0) { console->error("No alignments!"); return 1; } console->info("Input data finished, part {}/{}", part + 1, number_of_parts); int r_begin = aln.front()->read_A_id_; int r_end = aln.back()->read_A_id_; std::vector > idx_pileup; // this is the pileup std::vector > idx_pileup_dedup; // this is the deduplicated pileup std::vector > > idx_ab; //unordered_map from (aid, bid) to alignments in a vector for (int i = 0; i< n_read; i++) { idx_pileup.push_back(std::vector()); idx_pileup_dedup.push_back(std::vector()); idx_ab.push_back(std::unordered_map> ()); repeat_annotation.push_back(std::vector >()); maskvec.push_back(std::pair()); } for (int i = 0; i < aln.size(); i++) { if (aln[i]->read_A_id_ == aln[i]->read_B_id_) { aln[i]->active = false; } if (aln[i]->active) { idx_pileup[aln[i]->read_A_id_].push_back(aln[i]); } } for (int i = 0; i < n_read; i++) {// sort overlaps of a reads std::sort(idx_pileup[i].begin(), idx_pileup[i].end(), compare_overlap); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_] = std::vector(); } for (int i = 0; i < aln.size(); i++) { idx_ab[aln[i]->read_A_id_][aln[i]->read_B_id_].push_back(aln[i]); } for (int i = 0; i < n_read; i++) { for (std::unordered_map >::iterator it = 
idx_ab[i].begin(); it!= idx_ab[i].end(); it++) { std::sort(it->second.begin(), it->second.end(), compare_overlap); if (it->second.size() > 0) idx_pileup_dedup[i].push_back(it->second[0]); } } console->info("profile coverage (with and without CUT_OFF)"); //std::vector< std::vector > > his; for (int i = r_begin; i <= r_end; i ++) { std::vector > coverage; std::vector > cutoff_coverage; //TODO : Implement set based gradient std::vector > cg; //profileCoverage: get the coverage based on pile-o-gram la.profileCoverage(idx_pileup[i], cutoff_coverage, reso, CUT_OFF); la.profileCoverage(idx_pileup[i], coverage, reso, 0); cov << "read " << i <<" "; for (int j = 0; j < coverage.size(); j++) cov << coverage[j].first << "," << coverage[j].second << " "; cov << std::endl; //Computes coverage gradients. if (coverage.size() >= 2) for (int j = 0; j < coverage.size() - 1; j++) { cg.push_back(std::pair(coverage[j].first, coverage[j+1].second - coverage[j].second)); } else cg.push_back(std::pair (0,0)); coverages[i] = (coverage); cutoff_coverages[i] = (cutoff_coverage); cgs[i] = (cg); } console->info("profile coverage done part {}/{}", part + 1, number_of_parts); std::set rand_reads; srand(time(NULL)); rand_reads.insert(0); int temp_index(0); while (rand_reads.size() < (r_end - r_begin)/500){ temp_index ++; int rd_id=rand()%(r_end - r_begin) + r_begin; if (reads[rd_id]->len > 5000) rand_reads.insert(rd_id); if (temp_index > 20000) break; } int num_slot = 0; long int total_cov = 0; std::vector read_coverage; long int read_cov=0; int read_slot =0; //Finding the average coverage, probing a small proportion of reads // for (std::set::iterator it=rand_reads.begin();it!=rand_reads.end(); ++it) { for (int i =r_begin; i <= r_end; i++){ if (reads[i]->len < 5000) continue; read_cov=0; read_slot=0; for (int j = 0; j < coverages[i].size(); j++) { //printf("%d\n", coverages[i][j].second); read_cov+=coverages[i][j].second; read_slot++; } total_cov += read_cov; num_slot += read_slot; int 
mean_read_cov=read_cov / std::max(1,read_slot); read_coverage.push_back(mean_read_cov); } size_t median_id = read_coverage.size() / 2; if (median_id > 0) std::nth_element(read_coverage.begin(), read_coverage.begin()+median_id, read_coverage.end()); int cov_est= read_coverage[median_id]; int mean_cov_est = total_cov / num_slot; //get estimated coverage if (EST_COV != 0) cov_est = EST_COV; console->info("Estimated mean coverage: {}", mean_cov_est); //if the coverage is specified by ini file, cover the estimated one console->info("Estimated median coverage: {}", cov_est); // mask vector, same format as mask_QV if (MIN_COV < cov_est/3) MIN_COV = cov_est/3; if (reads_to_keep.size()>0) { reads_to_keep_initial = reads_to_keep; for (std::set::iterator iter = reads_to_keep_initial.begin(); iter != reads_to_keep_initial.end(); ++iter) { int i = *iter; for (std::unordered_map >::iterator it = idx_ab[i].begin(); it != idx_ab[i].end(); it++) { if (it->second.size() > 0) { LOverlap *ovl = it->second[0]; reads_to_keep.insert(ovl->read_B_id_); } } } console->info("After accounting for neighbours of reads selected, have {} reads", reads_to_keep.size()); } std::unordered_map > matches_forward, matches_backward; for (int i = r_begin; i <= r_end; i ++) { //An initialisation for loop //TODO Preallocate memory. Much more efficient. 
//idx2.push_back(std::vector()); matches_forward[i] = std::vector(); matches_backward[i] = std::vector(); } for (int i = r_begin; i <= r_end; i ++) { bool contained = false; //std::cout<< "Testing opt " << i << std::endl; if (reads[i]->active == false) { continue; } int containing_read; for (std::unordered_map >::iterator it = idx_ab[i].begin(); it != idx_ab[i].end(); it++) { std::sort(it->second.begin(), it->second.end(), compare_overlap);//Sort overlaps by lengths //std::cout<<"Giving input to ProcessAlignment "<second.size() <second.size() > 0) { //Figure out if read is contained LOverlap *ovl = it->second[0]; bool contained_alignment; if (strlen(name_db) > 0) contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, true); else contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, false); if (contained_alignment == true) { containing_read = ovl->read_B_id_; } if (reads[ovl->read_B_id_]->active == true) contained = contained or contained_alignment; //Filter matches that matter. 
//TODO Figure out a way to do this more efficiently if ((ovl->match_type_ == FORWARD) or (ovl->match_type_ == FORWARD_INTERNAL)) matches_forward[i].push_back(it->second[0]); else if ((ovl->match_type_ == BACKWARD) or (ovl->match_type_ == BACKWARD_INTERNAL)) matches_backward[i].push_back(it->second[0]); } if ((it->second.size() > 1) and (USE_TWO_MATCHES)) { //Figure out if read is contained LOverlap *ovl = it->second[1]; bool contained_alignment; if (strlen(name_db) > 0) contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, true); else contained_alignment = ProcessAlignment(ovl, reads[ovl->read_A_id_], reads[ovl->read_B_id_], ALN_THRESHOLD, THETA, THETA2, false); if (contained_alignment == true) { containing_read = ovl->read_B_id_; } if (reads[ovl->read_B_id_]->active == true) contained = contained or contained_alignment; //Filter matches that matter. //TODO Figure out a way to do this more efficiently if ((ovl->match_type_ == FORWARD) or (ovl->match_type_ == FORWARD_INTERNAL)) matches_forward[i].push_back(it->second[1]); else if ((ovl->match_type_ == BACKWARD) or (ovl->match_type_ == BACKWARD_INTERNAL)) matches_backward[i].push_back(it->second[1]); } } if (contained) { reads[i]->active = false; contained_out << i << "\t" << containing_read << std::endl; } } int num_overlaps = 0; int num_forward_overlaps(0), num_forward_internal_overlaps(0), num_reverse_overlaps(0), num_reverse_internal_overlaps(0), rev_complemented_matches(0); for (int i = 0; i < n_read; i++) {//Isn't this just 0 or 1? 
num_overlaps += matches_forward[i].size() + matches_backward[i].size(); for (int j = 0; j < matches_forward[i].size(); j++) rev_complemented_matches += matches_forward[i][j]->reverse_complement_match_; for (int j = 0; j < matches_backward[i].size(); j++) rev_complemented_matches += matches_backward[i][j]->reverse_complement_match_; } console->info("{} overlaps", num_overlaps); console->info("{} rev overlaps", rev_complemented_matches); num_active_read = 0; for (int i = r_begin; i <= r_end; i ++) { if (reads[i]->active) { num_active_read++; maximal_reads << i << std::endl; } } console->info("removed contained reads, active reads: {}", num_active_read); num_active_read = 0; for (int i = r_begin; i <= r_end; i ++) { if (reads[i]->active) num_active_read++; } console->info("active reads: {}", num_active_read); console->info("total reads: {}", r_end-r_begin+1); if (strlen(name_las) > 0) { for (int i = 0; i < aln.size(); i++) { delete aln[i]; } aln.clear(); } } if (strlen(name_db)>0) la.closeDB(); //close database return 0; } HINGE-0.5.0/src/spdlog/000077500000000000000000000000001314415550300144265ustar00rootroot00000000000000HINGE-0.5.0/src/test/000077500000000000000000000000001314415550300141155ustar00rootroot00000000000000HINGE-0.5.0/src/test/CMakeLists.txt000066400000000000000000000000451314415550300166540ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.2) HINGE-0.5.0/src/test/LAInterface_consensus_test.cpp000066400000000000000000000101641314415550300220770ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #include "LAInterface.h" #include #include #include #include extern "C" { #include "common.h" } #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int32 *) l); int y = *((int32 *) r); return (x - y); } int main(int argc, char *argv[]) { LAInterface la; std::cout << "hello" << std::endl; Read *test_read; la.openDB("G"); 
std::cout<<"# Reads:" << la.getReadNumber() << std::endl; la.showRead(1, 3); //show read [1,3) test_read = la.getRead(0); //get read 0 test_read->showRead(); // show read 0 la.openAlignmentFile("G.1.las"); la.showAlignment(0, 2); // show alignments of read [0,2) std::cout<<"# Alignments:" << la.getAlignmentNumber() << std::endl; la.resetAlignment(); std::vector res; la.getAlignmentB(res, 1); //get alignment for read 1 for (auto i:res) printf("%d ", i); printf("\n"); std::vector res1; la.resetAlignment(); la.getOverlap(res1, 3, 5); // get alignment(overlap) for reads [3,5) for (auto i:res1) i->show(); printf("\n"); std::vector res2; la.resetAlignment(); //la.getAlignment(res2, 0, 3);// get alignment for reads [0,3) la.getAlignment(res2, 0, 1); int seq_count = res2.size(); align_tags_t ** tags_list; tags_list = (align_tags_t **) calloc( seq_count+1, sizeof(align_tags_t *) ); test_read = la.getRead(0); //get read 0 std::string base_structure = test_read->bases; std::transform(base_structure.begin(), base_structure.end(),base_structure.begin(), ::toupper); aln_range * arange; arange = (aln_range*) calloc(1 , sizeof(aln_range)); arange->s1 = 0; arange->s2 = 0; arange->e1 = base_structure.size(); arange->e2 = base_structure.size(); char * seq = (char *) malloc(base_structure.size()* sizeof(char)); strcpy(seq, base_structure.c_str()); tags_list[0] = get_align_tags( seq, seq, strlen(seq), arange, 0, 0); for (int i = 0; i < seq_count; i ++) { res2[i]->show(); la.recoverAlignment(res2[i]); std::pair alignment = la.getAlignmentTags(res2[i]); //alignment.first.erase (std::remove(alignment.first.begin(), alignment.first.end(), '-'), alignment.first.end()); //alignment.second.erase (std::remove(alignment.second.begin(), alignment.second.end(), '-'), alignment.second.end()); //std::cout << alignment.first.size() <s1 = res2[i]->bbpos; arange->e1 = res2[i]->bepos; arange->s2 = res2[i]->abpos; arange->e2 = res2[i]->aepos; tags_list[i+1] = get_align_tags( q_aln_str, t_aln_str, 
aln_str_size, arange, (unsigned int)i + 1, 0); free(q_aln_str); free(t_aln_str); free_aln_range(arange); } //print consensus consensus_data * consensus; consensus = get_cns_from_align_tags( tags_list, seq_count+1, strlen(seq), 6 ); printf("Consensus:%s\n", consensus->sequence); free_consensus_data(consensus); for (int i = 0; i #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #include "LAInterface.h" #include "gtest/gtest.h" #include #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int32 *) l); int y = *((int32 *) r); return (x - y); } int main(int argc, char *argv[]) { LAInterface la; std::cout << "hello" << std::endl; Read *test_read; la.openDB("G"); std::cout<<"# Reads:" << la.getReadNumber() << std::endl; la.showRead(1, 3); //show read [1,3) test_read = la.getRead(0); //get read 0 test_read->showRead(); // show read 0 la.openAlignmentFile("G.1.las"); la.showAlignment(0, 2); // show alignments of read [0,2) std::cout<<"# Alignments:" << la.getAlignmentNumber() << std::endl; la.resetAlignment(); std::vector res; la.getAlignmentB(res, 1); //get alignment for read 1 for (auto i:res) printf("%d ", i); printf("\n"); std::vector res1; la.resetAlignment(); la.getOverlap(res1, 3, 5); // get alignment(overlap) for reads [3,5) for (auto i:res1) i->show(); printf("\n"); std::vector res2; la.resetAlignment(); //la.getAlignment(res2, 0, 3);// get alignment for reads [0,3) la.getAlignment(res2, 0, 2); for (int i = 0; i < 5; i ++) { res2[i]->show(); /*int tlen = res2[i]->tlen; int *trace = (int *) res2[i]->trace; int u; printf(" "); for (u = 0; u < tlen; u++) { printf("%d,", (int) trace[u]); } printf("\n"); la.showAlignmentTags(res2[i]);*/ /*printf("\n"); for (int j = 0; j < res2[i]->trace_pts_len; j++) printf(" %d", res2[i]->trace_pts[j]); printf("\n");*/ } la.closeDB(); //close database return 0; 
}HINGE-0.5.0/src/test/LAInterface_test1.cpp000066400000000000000000000004761314415550300200650ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #include "LAInterface.h" #include "gtest/gtest.h" #include TEST(SimpleTest, Plus) { ASSERT_EQ(1+1, 2); }HINGE-0.5.0/src/test/LAInterface_test_2DB.cpp000066400000000000000000000041151314415550300204250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #include "LAInterface.h" #include #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int32 *) l); int y = *((int32 *) r); return (x - y); } int main(int argc, char *argv[]) { LAInterface la; std::cout << "hello" << std::endl; Read *test_read; la.openDB2("G", "G"); std::cout<<"# Reads:" << la.getReadNumber() << std::endl; std::cout<<"# Reads:" << la.getReadNumber2() << std::endl; la.showRead(1, 3); //show read [1,3) test_read = la.getRead(0); //get read 0 test_read->showRead(); // show read 0 test_read = la.getRead2(1); //get read 0 test_read->showRead(); // show read 0 la.openAlignmentFile("G.1.las"); la.showAlignment(0, 2); // show alignments of read [0,2) std::cout<<"# Alignments:" << la.getAlignmentNumber() << std::endl; la.resetAlignment(); std::vector res; la.getAlignmentB(res, 1); //get alignment for read 1 for (auto i:res) printf("%d ", i); printf("\n"); std::vector res1; la.resetAlignment(); la.getOverlap(res1, 3, 5); // get alignment(overlap) for reads [3,5) for (auto i:res1) i->show(); printf("\n"); std::vector res2; la.resetAlignment(); //la.getAlignment(res2, 0, 3);// get alignment for reads [0,3) la.getAlignment(res2, 0, 2); for (int i = 0; i < 5; i ++) { res2[i]->show(); /*int tlen = res2[i]->tlen; int *trace = (int *) res2[i]->trace; int u; printf(" "); for (u = 0; u < tlen; u++) { printf("%d,", (int) trace[u]); } printf("\n"); 
la.showAlignmentTags(res2[i]);*/ /*printf("\n"); for (int j = 0; j < res2[i]->trace_pts_len; j++) printf(" %d", res2[i]->trace_pts[j]); printf("\n");*/ } la.closeDB(); //close database return 0; }HINGE-0.5.0/src/test/omp_test.c000066400000000000000000000012221314415550300161100ustar00rootroot00000000000000// // Created by Fei Xia on 10/25/15. // #include #include #include int main (int argc, char *argv[]) { int nthreads, tid; /* Fork a team of threads giving them their own copies of variables */ #pragma omp parallel private(nthreads, tid) { /* Obtain thread number */ tid = omp_get_thread_num(); printf("Hello World from thread = %d\n", tid); /* Only master thread does this */ if (tid == 0) { nthreads = omp_get_num_threads(); printf("Number of threads = %d\n", nthreads); } } /* All threads join master thread and disband */ } HINGE-0.5.0/thirdparty/000077500000000000000000000000001314415550300145415ustar00rootroot00000000000000HINGE-0.5.0/thirdparty/DALIGNER/000077500000000000000000000000001314415550300157265ustar00rootroot00000000000000HINGE-0.5.0/thirdparty/DASCRUBBER/000077500000000000000000000000001314415550300161555ustar00rootroot00000000000000HINGE-0.5.0/thirdparty/DAZZ_DB/000077500000000000000000000000001314415550300156565ustar00rootroot00000000000000HINGE-0.5.0/thirdparty/DEXTRACTOR/000077500000000000000000000000001314415550300162205ustar00rootroot00000000000000HINGE-0.5.0/utils/000077500000000000000000000000001314415550300135075ustar00rootroot00000000000000HINGE-0.5.0/utils/build.sh000077500000000000000000000005301314415550300151430ustar00rootroot00000000000000#!/bin/bash pwd=$PWD cd $pwd/thirdparty/DAZZ_DB make -j 8 cd $pwd/thirdparty//DALIGNER make -j 8 cd $pwd/thirdparty/DASCRUBBER make -j 8 cd $pwd/thirdparty/DEXTRACTOR make -j 8 cd $pwd mkdir build cd $pwd/build cmake .. -DCMAKE_INSTALL_PREFIX=../inst -DCMAKE_C_COMPILER=gcc-4.8 -DCMAKE_CXX_COMPILER=g++-4.8 make -j 8 make install exit $? 
HINGE-0.5.0/utils/clean.sh000077500000000000000000000004601314415550300151300ustar00rootroot00000000000000#!/bin/bash pwd=$PWD cd $pwd/thirdparty/DAZZ_DB make clean cd $pwd/thirdparty//DALIGNER make clean cd $pwd/thirdparty/DASCRUBBER make clean cd $pwd/thirdparty/DEXTRACTOR make clean cd $pwd mkdir build cd $pwd/build cmake .. -DCMAKE_C_COMPILER=gcc-4.9 -DCMAKE_CXX_COMPILER=g++-4.9 make clean exit $?HINGE-0.5.0/utils/compile.sh000077500000000000000000000002261314415550300154760ustar00rootroot00000000000000#!/bin/bash pwd=/io cd $pwd rm -rf build mkdir build cd $pwd/build cmake .. -DCMAKE_C_COMPILER=gcc-4.8 -DCMAKE_CXX_COMPILER=g++-4.8 make exit $? HINGE-0.5.0/utils/nominal.ini000066400000000000000000000006451314415550300156520ustar00rootroot00000000000000 [filter] length_threshold = 1000; quality_threshold = 0.23; n_iter = 3; // filter iteration aln_threshold = 1000; min_cov = 5; cut_off = 300; theta = 300; use_qv = true; [running] n_proc = 12; [draft] min_cov = 10; trim = 200; edge_safe = 100; tspace = 900; step = 50; [consensus] min_length = 4000; trim_end = 200; best_n = 1; quality_threshold = 0.23; [layout] hinge_slack = 1000 min_connected_component_size = 8 HINGE-0.5.0/utils/run.sh000077500000000000000000000013251314415550300146530ustar00rootroot00000000000000#!/bin/bash echo "Setting stuff up" cur_fol=$PWD cd ~/AwesomeAssembler && source utils/setup.sh cd $cur_fol echo "Running filter" Reads_filter --las $1.las --db $1 --config ~/AwesomeAssembler/utils/nominal.ini -x $1 echo "Running hinging" hinging --las $1.las --db $1 --config ~/AwesomeAssembler/utils/nominal.ini -o $1.$USER -x $1 echo "Running Visualise" python ~/AwesomeAssembler/scripts/Visualise_graph.py $1.edges.hinges hinge_list.txt echo "Running Condense" python ~/AwesomeAssembler/scripts/condense_graph.py $1.edges.hinges echo "Putting ground truth and condensing" if [ -e "$1.mapping.1.json" ] then python ~/AwesomeAssembler/scripts/condense_graph_with_aln_json.py $1.edges.hinges $1.mapping.1.json 
fiHINGE-0.5.0/utils/setup.sh000077500000000000000000000005531314415550300152110ustar00rootroot00000000000000#DIR=`dirname ${0}` PPWD=$PWD #echo $PWD #echo $DIR export PATH="$PATH:$PPWD/thirdparty:$PPWD/thirdparty/DALIGNER:$PPWD/thirdparty/DAZZ_DB:$PPWD/thirdparty/DEXTRACTOR/:$PPWD/thirdparty/DASCRUBBER" export PATH="$PATH:$PPWD/thirdparty/racon/bin:$PPWD/thirdparty/racon/tools/minimap" export PATH="$PATH:$PPWD/inst/bin" export MANPATH="$MANPATH:$PPWD/inst/share/man" HINGE-0.5.0/utils/test.sh000077500000000000000000000007031314415550300150250ustar00rootroot00000000000000#!/bin/bash pwd=$PWD cd $pwd/DAZZ_DB make clean && make -j 8 cd $pwd/DALIGNER make clean && make -j 8 cd $pwd/DASCRUBBER make clean && make -j 8 cd $pwd source setup.sh mkdir $pwd/data cd $pwd/data #rm -rf * rm G.* simulator 1.0 -c50. >G.fasta fasta2DB G G.fasta DBsplit -s20 G HPCdaligner G | csh -v rm G.*.G.*.las LAmerge G G.*.las DASqv -c50 G G.las touch log.txt LAInterface_test>log.txt Consensus_test>log.txt LAInterface_test_2DB>log.txt HINGE-0.5.0/utils/update.sh000077500000000000000000000017301314415550300153310ustar00rootroot00000000000000#!/bin/bash if [ "$2" == "scripts" ]; then rsync -rizP --delete --exclude '.*' --exclude '*.pyc' --exclude 'figures' scripts/ $1@shannon.stanford.edu:/home/$1/AwesomeAssembler/scripts fi if [ "$2" == "utils" ]; then rsync -rizP --delete --exclude '.*' --exclude '*.pyc' --exclude 'figures' utils/ $1@shannon.stanford.edu:/home/$1/AwesomeAssembler/utils fi if [ "$2" == "push" ]; then rsync -rizP --delete --exclude '.*' --exclude 'build' src/ $1@shannon.stanford.edu:/home/$1/AwesomeAssembler/src fi if [ "$2" == "pull" ]; then rsync -rizP --delete --exclude '.*' --exclude 'build' $1@shannon.stanford.edu:/home/$1/AwesomeAssembler/src/ src fi if [ "$2" == "update" ]; then ssh -t $1@shannon.stanford.edu "export TEMP=/home/$1/tmp && cd /home/$1/AwesomeAssembler && ./utils/build.sh" fi if [ "$2" == "all" ]; then rsync -rizP --delete --exclude '.*' --exclude 'data' 
--exclude '*.pyc' --exclude 'figures' --exclude 'build' . $1@shannon.stanford.edu:/home/$1/AwesomeAssembler fi